"""Term Frequency * Inverse Document Frequency (TF-IDF) over the nouns of a text.

Every function takes the language pipeline as ``nlp``.  It is called as
``nlp(text)`` and the result is iterated for tokens (``.text``, ``.pos_``,
``.lemma_``) and for sentences (``.sents``); stop words are read from
``nlp.Defaults.stop_words``.  This is presumably a spaCy ``Language`` object.

A "term" is the lemma of a token tagged ``NOUN`` or ``PROPN`` whose surface
text is not in the stop-word set.  The "documents" of the IDF part are the
sentences of the text.
"""
import json
import math
from collections import Counter

import spacy


def _iter_term_lemmas(doc, stop_words):
    """Yield the lemma of every NOUN/PROPN token of *doc* that is not a stop word."""
    for token in doc:
        # The stop-word test is on the exact surface text (case-sensitive).
        if token.text in stop_words:
            continue
        if token.pos_ in ("NOUN", "PROPN"):
            yield token.lemma_


def list_of_all_terms(text, nlp) -> list:
    """Return the distinct terms of *text* in order of first appearance."""
    doc = nlp(text)
    # dict.fromkeys de-duplicates in O(n) while keeping first-seen order.
    return list(dict.fromkeys(_iter_term_lemmas(doc, nlp.Defaults.stop_words)))


def term_frequency(text, nlp) -> dict:
    """Return ``{term: number of occurrences}`` for the whole of *text*."""
    doc = nlp(text)
    # Converted back to a plain dict so callers get the same type as before.
    return dict(Counter(_iter_term_lemmas(doc, nlp.Defaults.stop_words)))


def create_term_frequency_sentence(text, nlp) -> dict:
    """Return ``{sentence text: {term: count}}`` for each sentence of *text*.

    Each sentence is run through ``nlp`` again on its own.  The result is keyed
    by sentence text, so sentences with identical text share a single entry.
    """
    doc = nlp(text)
    return {
        sentence.text: term_frequency(sentence.text, nlp)
        for sentence in doc.sents
    }


def count_sentences_in_doc(text, nlp) -> int:
    """Return the number of sentences ``nlp`` finds in *text*."""
    return sum(1 for _ in nlp(text).sents)


# Inverse document frequency:
#   idf(t, text) = log10(|text| / |{sentences containing t}|)
#   |text| := number of sentences in the text
def calc_idf(list_of_terms, term_frequency_sentence, nlp, text) -> dict:
    """Return ``{term: idf}`` for every term in *list_of_terms*.

    Args:
        list_of_terms: terms to compute the IDF for.
        term_frequency_sentence: mapping ``sentence text -> {term: count}`` as
            built by ``create_term_frequency_sentence``.
        nlp: pipeline used to count the sentences of *text*.
        text: the full text.

    Returns:
        A dict mapping each term to ``log10(sentences / sentences with term)``.
        The value is 0 when the term occurs in no sentence or the text has no
        sentences, which avoids a division by zero.
    """
    number_of_sentences = count_sentences_in_doc(text, nlp)
    result = dict()
    for term in list_of_terms:
        # Keys of each per-sentence dict are unique, so a membership test
        # counts exactly the sentences that contain the term.
        counter = sum(
            1 for terms in term_frequency_sentence.values() if term in terms
        )
        if counter == 0 or number_of_sentences == 0:
            result[term] = 0
        else:
            result[term] = math.log10(number_of_sentences / counter)
    return result


def termfrequency_multiply_idf(term_frequency_list, idf_list) -> dict:
    """Return ``{term: tf * idf}`` for every term present in both mappings.

    Terms without an IDF entry are left out; the order follows
    *term_frequency_list*.
    """
    return {
        term: frequency * idf_list[term]
        for term, frequency in term_frequency_list.items()
        if term in idf_list
    }


def calc_average_tfidf(tfidf_list):
    """Return the mean of the values of *tfidf_list*, or 0 if it is empty."""
    if not tfidf_list:
        return 0
    return sum(tfidf_list.values()) / len(tfidf_list)


def run(text, nlp):
    """Compute, print and return the average TF-IDF value of the terms of *text*.

    Prints the number of TF-IDF entries followed by the average.
    """
    term_frequency_list = term_frequency(text, nlp)
    # The keys of the term-frequency dict are the distinct terms in
    # first-appearance order, i.e. what list_of_all_terms(text, nlp) returns,
    # so the text does not have to be parsed an extra time.
    list_of_terms = list(term_frequency_list)
    term_frequency_sentence = create_term_frequency_sentence(text, nlp)
    idf_list = calc_idf(list_of_terms, term_frequency_sentence, nlp, text)
    tfidf_list = termfrequency_multiply_idf(term_frequency_list, idf_list)
    print(len(tfidf_list))
    average = calc_average_tfidf(tfidf_list)
    print(str(average))
    return average