# Term Frequency * Inverse Document Frequency

import spacy
import json
import math

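# Overview of what this module computes (sentences are treated as the "documents"):
#   tf(t)    = number of occurrences of the noun / proper-noun lemma t in the text
#   idf(t)   = log10(N / df(t)), N = number of sentences, df(t) = sentences containing t
#   tfidf(t) = tf(t) * idf(t)
# run() returns the average tf-idf value over all extracted terms.
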
# This function creates a list of all terms (noun and proper-noun lemmas) in a document
def list_of_all_terms(text, nlp):
    doc = nlp(text)
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue

        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            if token.lemma_ not in result:
                result.append(token.lemma_)

    return result

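# For example (exact lemmas depend on the loaded spaCy model):
#   list_of_all_terms("Cats chase mice. Cats sleep.", nlp)  ->  ["cat", "mouse"]
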
# Counts how often each noun / proper-noun lemma occurs in the given text
def term_frequency(text, nlp):
    result = dict()

    doc = nlp(text)

    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue

        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            if token.lemma_ not in result:
                result[token.lemma_] = 1
            else:
                result[token.lemma_] += 1

    return result

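# For example (model-dependent):
#   term_frequency("Cats chase mice. Cats sleep.", nlp)  ->  {"cat": 2, "mouse": 1}
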
# This function computes the term frequencies for each individual sentence of a text
def create_term_frequency_sentence(text, nlp):
    result = dict()

    doc = nlp(text)

    for sentence in doc.sents:
        list_token_sentence = term_frequency(sentence.text, nlp)
        result[sentence.text] = list_token_sentence

    return result

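# For example (model-dependent): create_term_frequency_sentence("Cats chase mice. Cats sleep.", nlp)
#   ->  {"Cats chase mice.": {"cat": 1, "mouse": 1}, "Cats sleep.": {"cat": 1}}
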
# Counts the number of sentences in a text
def count_sentences_in_doc(text, nlp):
    count = 0
    doc = nlp(text)

    for sentence in doc.sents:
        count += 1

    return count

# Inverse document frequency: idf(t, text) = log10(|text| / df(t))
# |text| := number of sentences in the text
# df(t)  := number of sentences that contain the term t
def calc_idf(list_of_terms, term_frequency_sentence, nlp, text):
    number_of_sentences = count_sentences_in_doc(text, nlp)
    result = dict()

    for term in list_of_terms:
        # count the sentences in which the term occurs
        counter = 0
        for sentence in term_frequency_sentence:
            if term in term_frequency_sentence[sentence]:
                counter += 1

        # print(str(counter) + " | " + str(number_of_sentences))

        if counter == 0 or number_of_sentences == 0:
            result[term] = 0
        else:
            idf = math.log10(number_of_sentences / float(counter))
            result[term] = idf

    return result

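# For example, with 10 sentences in the text and a term that occurs in 2 of them:
#   idf = log10(10 / 2) ≈ 0.699
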
# This function multiplies tf * idf for every term that appears in both dicts
def termfrequency_multiply_idf(term_frequency_list, idf_list):
    result = dict()
    for term in term_frequency_list:
        if term in idf_list:
            result[term] = term_frequency_list[term] * idf_list[term]

    return result

# This function calculates the average tf-idf value over all terms
def calc_average_tfidf(tfidf_list):
    sum_of_tfidf = 0
    if len(tfidf_list) == 0:
        return 0

    for term in tfidf_list:
        sum_of_tfidf += tfidf_list[term]

    return sum_of_tfidf / len(tfidf_list)

def run(text, nlp):
    # print(text)
    term_frequency_list = term_frequency(text, nlp)
    list_of_terms = list_of_all_terms(text, nlp)
    term_frequency_sentence = create_term_frequency_sentence(text, nlp)

    idf_list = calc_idf(list_of_terms, term_frequency_sentence, nlp, text)

    tfidf_list = termfrequency_multiply_idf(term_frequency_list, idf_list)
    print(len(tfidf_list))
    average = calc_average_tfidf(tfidf_list)
    print(str(average))

    return average
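

# Minimal usage sketch: assumes the spaCy English model "en_core_web_sm" is installed
# (e.g. via `python -m spacy download en_core_web_sm`); any other model works the same way.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    sample_text = "Cats chase mice. Cats sleep all day. Dogs chase cats."
    run(sample_text, nlp)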