You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
132 lines
3.3 KiB
132 lines
3.3 KiB
4 years ago
|
# Term Frequency * Inverse Document Frequency
|
||
|
|
||
|
import spacy
|
||
|
import json
|
||
|
import math
|
||
|
|
||
|
|
||
|
# This function creates a list of all terms in a document
|
||
|
def list_of_all_terms(text, nlp):
    """Return the unique noun/proper-noun lemmas of *text*, in first-seen order.

    Tokens whose surface text is a stop word (per the pipeline's defaults)
    are skipped.

    Args:
        text: Raw document text.
        nlp: A loaded spaCy language pipeline.

    Returns:
        list[str]: Unique lemmas of NOUN/PROPN tokens.
    """
    doc = nlp(text)
    seen = set()  # O(1) membership test instead of the original O(n) list scan
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.pos_ in ("NOUN", "PROPN") and token.lemma_ not in seen:
            seen.add(token.lemma_)
            result.append(token.lemma_)
    return result
|
||
|
|
||
|
#calculates the term frequency in all sentences of a text
|
||
|
def term_frequency(text, nlp):
    """Count how often each noun/proper-noun lemma occurs in *text*.

    Tokens whose surface text is a stop word are ignored.

    Args:
        text: Raw text to analyse.
        nlp: A loaded spaCy language pipeline.

    Returns:
        dict: lemma -> number of occurrences.
    """
    counts = dict()
    for token in nlp(text):
        # Guard clauses: skip stop words and anything that is not a noun.
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.pos_ != "NOUN" and token.pos_ != "PROPN":
            continue
        counts[token.lemma_] = counts.get(token.lemma_, 0) + 1
    return counts
|
||
|
|
||
|
#this function creates the tf value for a sentence
|
||
|
def create_term_frequency_sentence(text, nlp):
    """Build a per-sentence term-frequency table for *text*.

    Each sentence (as segmented by the spaCy pipeline) is mapped to the
    term-frequency dict that term_frequency() produces for it.

    Args:
        text: Raw document text.
        nlp: A loaded spaCy language pipeline.

    Returns:
        dict: sentence text -> {lemma: count}.
    """
    doc = nlp(text)
    return {
        sentence.text: term_frequency(sentence.text, nlp)
        for sentence in doc.sents
    }
|
||
|
|
||
|
|
||
|
|
||
|
def count_sentences_in_doc(text, nlp):
    """Return the number of sentences the spaCy pipeline segments *text* into."""
    doc = nlp(text)
    # doc.sents is a generator, so count by exhausting it.
    return sum(1 for _ in doc.sents)
|
||
|
|
||
|
# Inverse Document Frequency: idf(t, text) = log10(|text| / df(t))
|
||
|
# |text| := number of sentences
|
||
|
#
|
||
|
#
|
||
|
def calc_idf(list_of_terms, term_frequency_sentence, nlp, text):
    """Compute the inverse document frequency of each term.

    idf(t) = log10(N / df(t)), where N is the number of sentences in *text*
    and df(t) is the number of sentences whose term-frequency dict contains t.
    Terms found in no sentence (or an empty document) get an idf of 0.

    Args:
        list_of_terms: Terms to score (e.g. from list_of_all_terms()).
        term_frequency_sentence: sentence -> {term: count} mapping
            (from create_term_frequency_sentence()).
        nlp: A loaded spaCy language pipeline (used to segment *text*).
        text: The full document text.

    Returns:
        dict: term -> idf value.
    """
    number_of_sentences = count_sentences_in_doc(text, nlp)
    result = dict()

    for term in list_of_terms:
        # Document frequency: dict-membership test per sentence instead of
        # the original scan over every term of every sentence.
        counter = sum(
            1 for terms in term_frequency_sentence.values() if term in terms
        )

        if counter == 0 or number_of_sentences == 0:
            result[term] = 0
        else:
            result[term] = math.log10(number_of_sentences / float(counter))

    return result
|
||
|
|
||
|
# this function multiplies the tf*idf
|
||
|
def termfrequency_multiply_idf(term_frequency_list, idf_list):
    """Combine term frequencies with idf values: tfidf(t) = tf(t) * idf(t).

    Terms present in only one of the two inputs are dropped.

    Args:
        term_frequency_list: term -> frequency.
        idf_list: term -> idf value.

    Returns:
        dict: term -> tf * idf, for terms present in both inputs.
    """
    result = dict()
    for term, tf in term_frequency_list.items():
        # Direct dict lookup replaces the original O(n*m) nested scan.
        if term in idf_list:
            result[term] = tf * idf_list[term]
    return result
|
||
|
|
||
|
#this function calculates the average tfidf value
|
||
|
def calc_average_tfidf(tfidf_list):
    """Return the mean of the tf-idf values, or 0 for an empty mapping."""
    if not tfidf_list:
        return 0
    return sum(tfidf_list.values()) / len(tfidf_list)
|
||
|
|
||
|
def run(text, nlp):
    """Compute and return the average tf-idf score of *text*.

    Pipeline: term frequencies over the whole document, the document's
    term list, per-sentence term frequencies, idf per term, then tf*idf
    and its average. Prints the number of scored terms and the average
    (side effects kept from the original implementation).
    """
    doc_term_freq = term_frequency(text, nlp)
    terms = list_of_all_terms(text, nlp)
    sentence_term_freq = create_term_frequency_sentence(text, nlp)

    idf_per_term = calc_idf(terms, sentence_term_freq, nlp, text)

    tfidf_list = termfrequency_multiply_idf(doc_term_freq, idf_per_term)
    print(len(tfidf_list))

    average = calc_average_tfidf(tfidf_list)
    print(str(average))

    return average