Bachelorarbeit: "Ein Tool zur Erklärung von Datenschutzrichtlinien"
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

131 lines
3.3 KiB

# Term Frequency * Inverse Document Frequency
import spacy
import json
import math
# This function creates a list of all unique terms (noun / proper-noun lemmas) in a document
def list_of_all_terms(text, nlp):
    """Return the distinct noun / proper-noun lemmas of *text*.

    Args:
        text: The raw text to analyse.
        nlp: A callable that turns ``text`` into an iterable of tokens
            exposing ``text``, ``pos_`` and ``lemma_``, and that provides
            ``Defaults.stop_words`` (e.g. a loaded spaCy pipeline).

    Returns:
        A list of lemmas in order of first appearance, without duplicates.
        Tokens whose surface text is a stop word are skipped, as is every
        token that is not tagged ``NOUN`` or ``PROPN``.
    """
    doc = nlp(text)
    stop_words = nlp.Defaults.stop_words
    # A dict keeps insertion order and detects duplicates in O(1); the
    # previous list-membership test was quadratic in the number of terms.
    unique_lemmas = dict.fromkeys(
        token.lemma_
        for token in doc
        if token.text not in stop_words and token.pos_ in ("NOUN", "PROPN")
    )
    return list(unique_lemmas)
# Calculates the term frequency (number of occurrences) of every term in the given text
def term_frequency(text, nlp):
    """Count how often each noun / proper-noun lemma occurs in *text*.

    Args:
        text: The raw text to analyse.
        nlp: A callable that turns ``text`` into an iterable of tokens
            exposing ``text``, ``pos_`` and ``lemma_``, and that provides
            ``Defaults.stop_words`` (e.g. a loaded spaCy pipeline).

    Returns:
        A dict mapping each lemma to its number of occurrences. Tokens whose
        surface text is a stop word are not counted.
    """
    counts = {}
    for tok in nlp(text):
        is_stop_word = tok.text in nlp.Defaults.stop_words
        if is_stop_word or tok.pos_ not in ("NOUN", "PROPN"):
            continue
        lemma = tok.lemma_
        counts[lemma] = counts.get(lemma, 0) + 1
    return counts
# This function creates the term-frequency table for every sentence of the text
def create_term_frequency_sentence(text, nlp):
    """Build a term-frequency table for each sentence of *text*.

    Args:
        text: The raw text to analyse.
        nlp: A callable that turns ``text`` into a document exposing
            ``sents`` (e.g. a loaded spaCy pipeline); it is also passed on
            to ``term_frequency`` for every sentence.

    Returns:
        A dict keyed by sentence text whose values are the results of
        ``term_frequency`` for that sentence. Sentences with identical text
        share a single entry.
    """
    return {
        sent.text: term_frequency(sent.text, nlp)
        for sent in nlp(text).sents
    }
def count_sentences_in_doc(text, nlp):
    """Return the number of sentences that *nlp* finds in *text*.

    Args:
        text: The raw text to analyse.
        nlp: A callable that turns ``text`` into a document exposing an
            iterable ``sents`` (e.g. a loaded spaCy pipeline).

    Returns:
        The number of items yielded by the document's ``sents``.
    """
    # ``sents`` may be a one-shot iterator, so count by consuming it.
    return sum(1 for _ in nlp(text).sents)
# Inverse document frequency: idf(t, text) = log10(|text| / |{sentences containing t}|)
# |text| := number of sentences
#
#
def calc_idf(list_of_terms, term_frequency_sentence, nlp, text):
    """Compute the inverse document frequency of every term.

    Each sentence is treated as one "document":
    ``idf(t) = log10(number_of_sentences / sentences_containing_t)``.

    Args:
        list_of_terms: Iterable of terms to score.
        term_frequency_sentence: Dict mapping sentence text to that
            sentence's term-frequency dict.
        nlp: Callable passed to ``count_sentences_in_doc`` to split ``text``
            into sentences.
        text: The raw text the sentences were taken from.

    Returns:
        A dict mapping each term to its idf value. A term that occurs in no
        sentence, or any term when the text has no sentences, maps to ``0``.
    """
    number_of_sentences = count_sentences_in_doc(text, nlp)
    result = {}
    for term in list_of_terms:
        # Keys of a per-sentence table are unique, so a membership test
        # counts each sentence at most once -- the same result as scanning
        # every key and comparing, without the inner loop.
        counter = sum(
            1
            for sentence_terms in term_frequency_sentence.values()
            if term in sentence_terms
        )
        if counter == 0 or number_of_sentences == 0:
            result[term] = 0
        else:
            result[term] = math.log10(number_of_sentences / float(counter))
    return result
# This function multiplies tf by idf for every term
def termfrequency_multiply_idf(term_frequency_list, idf_list):
    """Multiply each term's frequency by its inverse document frequency.

    Args:
        term_frequency_list: Dict mapping term to term frequency.
        idf_list: Dict mapping term to idf value.

    Returns:
        A dict mapping every term present in both inputs to ``tf * idf``.
        Terms missing from ``idf_list`` are left out.
    """
    # Direct key lookup replaces the nested scan over both dicts.
    return {
        term: frequency * idf_list[term]
        for term, frequency in term_frequency_list.items()
        if term in idf_list
    }
#this function calculates the average tfidf value
def calc_average_tfidf(tfidf_list):
    """Return the arithmetic mean of all tf-idf values.

    Args:
        tfidf_list: Dict mapping term to tf-idf value.

    Returns:
        The sum of the values divided by the number of entries, or ``0``
        when the dict is empty.
    """
    if not tfidf_list:
        return 0
    total = sum(tfidf_list.values())
    return total / len(tfidf_list)
def run(text, nlp):
    """Compute, print and return the average tf-idf score of *text*.

    Prints the number of scored terms and then the average to stdout.

    Args:
        text: The raw text to analyse.
        nlp: The language pipeline passed on to every helper
            (e.g. a loaded spaCy pipeline).

    Returns:
        The average tf-idf value as produced by ``calc_average_tfidf``.
    """
    tf_by_term = term_frequency(text, nlp)
    terms = list_of_all_terms(text, nlp)
    tf_by_sentence = create_term_frequency_sentence(text, nlp)
    idf_by_term = calc_idf(terms, tf_by_sentence, nlp, text)
    tfidf_by_term = termfrequency_multiply_idf(tf_by_term, idf_by_term)
    print(len(tfidf_by_term))
    mean_score = calc_average_tfidf(tfidf_by_term)
    print(str(mean_score))
    return mean_score