# Term Frequency * Inverse Document Frequency

import spacy
import json
import math

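# Overview of what this module computes (sentences are treated as the "documents"):
#   tf(t)    = number of occurrences of the noun / proper-noun lemma t in the text
#   idf(t)   = log10(N / df(t)), N = number of sentences, df(t) = sentences containing t
#   tfidf(t) = tf(t) * idf(t)
# run() returns the average tf-idf value over all extracted terms.
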
# This function creates a list of all terms (noun and proper-noun lemmas) in a document
def list_of_all_terms(text, nlp):
    doc = nlp(text)
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue

        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            if token.lemma_ not in result:
                result.append(token.lemma_)

    return result

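# For example (exact lemmas depend on the loaded spaCy model):
#   list_of_all_terms("Cats chase mice. Cats sleep.", nlp)  ->  ["cat", "mouse"]
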
# Counts how often each noun / proper-noun lemma occurs in the given text
def term_frequency(text, nlp):
    result = dict()

    doc = nlp(text)

    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue

        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            if token.lemma_ not in result:
                result[token.lemma_] = 1
            else:
                result[token.lemma_] += 1

    return result

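# For example (model-dependent):
#   term_frequency("Cats chase mice. Cats sleep.", nlp)  ->  {"cat": 2, "mouse": 1}
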
# This function computes the term frequencies for each individual sentence of a text
def create_term_frequency_sentence(text, nlp):
    result = dict()

    doc = nlp(text)

    for sentence in doc.sents:
        list_token_sentence = term_frequency(sentence.text, nlp)
        result[sentence.text] = list_token_sentence

    return result

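# For example (model-dependent): create_term_frequency_sentence("Cats chase mice. Cats sleep.", nlp)
#   ->  {"Cats chase mice.": {"cat": 1, "mouse": 1}, "Cats sleep.": {"cat": 1}}
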
# Counts the number of sentences in a text
def count_sentences_in_doc(text, nlp):
    count = 0
    doc = nlp(text)

    for sentence in doc.sents:
        count += 1

    return count

# Inverse document frequency: idf(t, text) = log10(|text| / df(t))
# |text| := number of sentences in the text
# df(t)  := number of sentences that contain the term t
def calc_idf(list_of_terms, term_frequency_sentence, nlp, text):
    number_of_sentences = count_sentences_in_doc(text, nlp)
    result = dict()

    for term in list_of_terms:
        # count the sentences in which the term occurs
        counter = 0
        for sentence in term_frequency_sentence:
            if term in term_frequency_sentence[sentence]:
                counter += 1

        # print(str(counter) + " | " + str(number_of_sentences))

        if counter == 0 or number_of_sentences == 0:
            result[term] = 0
        else:
            idf = math.log10(number_of_sentences / float(counter))
            result[term] = idf

    return result

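# For example, with 10 sentences in the text and a term that occurs in 2 of them:
#   idf = log10(10 / 2) ≈ 0.699
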
# This function multiplies tf * idf for every term that appears in both dicts
def termfrequency_multiply_idf(term_frequency_list, idf_list):
    result = dict()
    for term in term_frequency_list:
        if term in idf_list:
            result[term] = term_frequency_list[term] * idf_list[term]

    return result

# This function calculates the average tf-idf value over all terms
def calc_average_tfidf(tfidf_list):
    sum_of_tfidf = 0
    if len(tfidf_list) == 0:
        return 0

    for term in tfidf_list:
        sum_of_tfidf += tfidf_list[term]

    return sum_of_tfidf / len(tfidf_list)

def run(text, nlp):
    # print(text)
    term_frequency_list = term_frequency(text, nlp)
    list_of_terms = list_of_all_terms(text, nlp)
    term_frequency_sentence = create_term_frequency_sentence(text, nlp)

    idf_list = calc_idf(list_of_terms, term_frequency_sentence, nlp, text)

    tfidf_list = termfrequency_multiply_idf(term_frequency_list, idf_list)
    print(len(tfidf_list))
    average = calc_average_tfidf(tfidf_list)
    print(str(average))

    return average
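

# Minimal usage sketch: assumes the spaCy English model "en_core_web_sm" is installed
# (e.g. via `python -m spacy download en_core_web_sm`); any other model works the same way.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    sample_text = "Cats chase mice. Cats sleep all day. Dogs chase cats."
    run(sample_text, nlp)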