You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
132 lines
3.3 KiB
132 lines
3.3 KiB
4 years ago
|
# Term Frequency * Inverse Document Frequency
|
||
|
|
||
|
import spacy
|
||
|
import json
|
||
|
import math
|
||
|
|
||
|
|
||
|
# This function creates a list of all terms in a document
|
||
|
def list_of_all_terms(text, nlp):
    """Return the unique noun/proper-noun lemmas of *text*, in first-seen order.

    Tokens whose surface text is a stop word (per the pipeline's defaults)
    are skipped.

    Args:
        text: Raw document text.
        nlp: A loaded spaCy language pipeline.

    Returns:
        list[str]: Unique lemmas of NOUN/PROPN tokens.
    """
    doc = nlp(text)
    seen = set()  # O(1) membership test instead of the original O(n) list scan
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.pos_ in ("NOUN", "PROPN") and token.lemma_ not in seen:
            seen.add(token.lemma_)
            result.append(token.lemma_)
    return result
|
||
|
|
||
|
#calculates the term frequency in all sentences of a text
|
||
|
def term_frequency(text, nlp):
    """Count how often each noun/proper-noun lemma occurs in *text*.

    Tokens whose surface text is a stop word are ignored.

    Args:
        text: Raw text to analyse.
        nlp: A loaded spaCy language pipeline.

    Returns:
        dict: lemma -> number of occurrences.
    """
    counts = dict()
    for token in nlp(text):
        # Guard clauses: skip stop words and anything that is not a noun.
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.pos_ != "NOUN" and token.pos_ != "PROPN":
            continue
        counts[token.lemma_] = counts.get(token.lemma_, 0) + 1
    return counts
|
||
|
|
||
|
#this function creates the tf value for a sentence
|
||
|
def create_term_frequency_sentence(text, nlp):
    """Build a per-sentence term-frequency table for *text*.

    Each sentence (as segmented by the spaCy pipeline) is mapped to the
    term-frequency dict that term_frequency() produces for it.

    Args:
        text: Raw document text.
        nlp: A loaded spaCy language pipeline.

    Returns:
        dict: sentence text -> {lemma: count}.
    """
    doc = nlp(text)
    return {
        sentence.text: term_frequency(sentence.text, nlp)
        for sentence in doc.sents
    }
|
||
|
|
||
|
|
||
|
|
||
|
def count_sentences_in_doc(text, nlp):
    """Return the number of sentences the spaCy pipeline segments *text* into."""
    doc = nlp(text)
    # doc.sents is a generator, so count by exhausting it.
    return sum(1 for _ in doc.sents)
|
||
|
|
||
|
# Inverse Document Frequency: idf(t, text) = log10(|text| / df(t))
|
||
|
# |text| := number of sentences
|
||
|
#
|
||
|
#
|
||
|
def calc_idf(list_of_terms, term_frequency_sentence, nlp, text):
    """Compute the inverse document frequency of each term.

    idf(t) = log10(N / df(t)), where N is the number of sentences in *text*
    and df(t) is the number of sentences whose term-frequency dict contains t.
    Terms found in no sentence (or an empty document) get an idf of 0.

    Args:
        list_of_terms: Terms to score (e.g. from list_of_all_terms()).
        term_frequency_sentence: sentence -> {term: count} mapping
            (from create_term_frequency_sentence()).
        nlp: A loaded spaCy language pipeline (used to segment *text*).
        text: The full document text.

    Returns:
        dict: term -> idf value.
    """
    number_of_sentences = count_sentences_in_doc(text, nlp)
    result = dict()

    for term in list_of_terms:
        # Document frequency: dict-membership test per sentence instead of
        # the original scan over every term of every sentence.
        counter = sum(
            1 for terms in term_frequency_sentence.values() if term in terms
        )

        if counter == 0 or number_of_sentences == 0:
            result[term] = 0
        else:
            result[term] = math.log10(number_of_sentences / float(counter))

    return result
|
||
|
|
||
|
# this function multiplies the tf*idf
|
||
|
def termfrequency_multiply_idf(term_frequency_list, idf_list):
    """Combine term frequencies with idf values: tfidf(t) = tf(t) * idf(t).

    Terms present in only one of the two inputs are dropped.

    Args:
        term_frequency_list: term -> frequency.
        idf_list: term -> idf value.

    Returns:
        dict: term -> tf * idf, for terms present in both inputs.
    """
    result = dict()
    for term, tf in term_frequency_list.items():
        # Direct dict lookup replaces the original O(n*m) nested scan.
        if term in idf_list:
            result[term] = tf * idf_list[term]
    return result
|
||
|
|
||
|
#this function calculates the average tfidf value
|
||
|
def calc_average_tfidf(tfidf_list):
    """Return the mean of the tf-idf values, or 0 for an empty mapping."""
    if not tfidf_list:
        return 0
    return sum(tfidf_list.values()) / len(tfidf_list)
|
||
|
|
||
|
def run(text, nlp):
    """Compute and return the average tf-idf score of *text*.

    Pipeline: term frequencies over the whole document, the document's
    term list, per-sentence term frequencies, idf per term, then tf*idf
    and its average. Prints the number of scored terms and the average
    (side effects kept from the original implementation).
    """
    doc_term_freq = term_frequency(text, nlp)
    terms = list_of_all_terms(text, nlp)
    sentence_term_freq = create_term_frequency_sentence(text, nlp)

    idf_per_term = calc_idf(terms, sentence_term_freq, nlp, text)

    tfidf_list = termfrequency_multiply_idf(doc_term_freq, idf_per_term)
    print(len(tfidf_list))

    average = calc_average_tfidf(tfidf_list)
    print(str(average))

    return average