Bachelorarbeit (bachelor's thesis): "A Tool for Explaining Privacy Policies"

# Summary creator: builds the summary data for a privacy policy from
# pre-clustered page elements.
import json

import spacy

import collectpersonaldata
import collectthirdparty
import tfidf
# This function creates a complete paragraph out of the cluster pieces.
# Joining the pieces is preferable because more sentences give the NLP
# model more context to work with.
def create_paragraph(cluster):
    result = ""
    for entry in cluster:
        # Only paragraph and list-item elements carry body text.
        if entry["tag"] == "P" or entry["tag"] == "LI":
            result += entry["text"]
    return result
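# Example (hypothetical input, inferred only from the field accesses above):
#   cluster = [{"tag": "H2", "text": "Data we collect", "id": 7},
#              {"tag": "P",  "text": "We store your email address. ", "id": 7},
#              {"tag": "LI", "text": "IP addresses are logged. ", "id": 7}]
#   create_paragraph(cluster)
#   -> "We store your email address. IP addresses are logged. "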
# Entry point into this module: takes the clustered page elements and a
# loaded spaCy model and returns the collected summary data.
def run(liste, nlp):
    # Contains the result of every step.
    result_list = []
    # indexdict holds one tf*idf measure per cluster. The measure is needed
    # because spaCy does not always classify correctly, so it is used to
    # drop the paragraphs that have no value for us (a minimal sketch of
    # such a score follows after run() below).
    indexdict = []
    for cluster in liste:
        paragraph = create_paragraph(cluster)
        indexdict.append({"average": tfidf.run(paragraph, nlp),
                          "text": paragraph,
                          "id": cluster[0]["id"],
                          "cluster": cluster})
    # Debug output: the tf*idf averages and the number of scored clusters.
    # for tf in indexdict:
    #     print(tf["average"])
    print(len(indexdict))
    # indexdict_save holds the original tf*idf list back; a shallow copy is
    # taken so that later steps cannot mutate the original.
    indexdict_save = list(indexdict)
    # Collect the statements about personal data and about third parties
    # from the scored clusters.
    collected_data = collectpersonaldata.run(indexdict_save, nlp)
    collected_third = collectthirdparty.run(indexdict_save, nlp)
    result_list.append(collected_data)
    result_list.append(collected_third)
    return result_list
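
# The tf*idf scoring lives in the imported tfidf module, which is not part
# of this file. The helper below is only a minimal sketch of what tfidf.run
# could compute, assuming "average" means the mean tf*idf weight of the
# paragraph's distinct content terms against a small reference corpus; the
# name _tfidf_average_sketch and the corpus parameter are illustrative, not
# the thesis code.
import math

def _tfidf_average_sketch(paragraph, corpus, nlp):
    # Lemmatize and keep only content tokens.
    tokens = [t.lemma_.lower() for t in nlp(paragraph)
              if not (t.is_stop or t.is_punct or t.is_space)]
    if not tokens:
        return 0.0
    # Each corpus document becomes a set of lemmas for membership tests.
    docs = [{t.lemma_.lower() for t in nlp(text)} for text in corpus]
    scores = []
    for term in set(tokens):
        tf = tokens.count(term) / len(tokens)            # term frequency in the paragraph
        df = sum(1 for doc in docs if term in doc)       # document frequency in the corpus
        idf = math.log((1 + len(docs)) / (1 + df)) + 1   # smoothed inverse document frequency
        scores.append(tf * idf)
    return sum(scores) / len(scores)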
# Standalone usage: load the clustered elements from test.txt (expected to
# be JSON: a list of clusters, each a list of objects with at least "tag",
# "text" and "id" fields) and run the pipeline.
if __name__ == "__main__":
    with open('test.txt') as json_file:
        data = json.load(json_file)
    nlp = spacy.load("en_core_web_sm")
    run(data, nlp)
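# Consuming the result (hedged: the shape of the individual entries depends
# on collectpersonaldata and collectthirdparty, which are separate modules):
#   personal, third_party = run(data, nlp)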