Bachelor's thesis: "Ein Tool zur Erklärung von Datenschutzrichtlinien" (A Tool for Explaining Privacy Policies)
import spacy
import json
from spacy.matcher import Matcher
import listofpersonaldata
import summaryhelper
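
# NOTE: the two project modules imported above are not shown in this file.
# From the way they are used below, their assumed shape is roughly the
# following (hypothetical example values, not the real project data):
#
#   listofpersonaldata.data = {          # match phrase -> display label
#       "email address": "E-mail address",
#       "ip address": "IP address",
#   }
#
#   summaryhelper.get_average_threshold(clusters) -> float
#   summaryhelper.get_clusters_above_average_threshold(clusters, threshold) -> list of clusters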
##
#
# This function evaluates a new tf-idf value for each cluster: the cluster's
# average score is raised by 0.2 for every personal-data phrase found in it.
#
##
def evaluate_new_tfidf(liste, nlp):
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hit = 0
        for key in listofpersonaldata.data:
            # build a case-insensitive token pattern for the key phrase,
            # e.g. "email address" -> [{"LOWER": "email"}, {"LOWER": "address"}]
            pattern = [{"LOWER": word.lower()} for word in key.split()]
            matcher.add("id", [pattern])
            matches = matcher(doc)
            # accumulate hits over all key phrases instead of keeping
            # only the count of the last one
            hit += len(matches)
            if len(matches) > 0:
                print(json.dumps(pattern) + " " + str(len(matches)))
            matcher.remove("id")
        cluster["average"] = cluster["average"] + (hit * 0.2)
        result.append(cluster)
    return result
##
# liste contains dicts of the form {"id": "position in original policy", "text": "paragraph"}
##
def collect_personal_data(liste, nlp):
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)
    # this list will contain the data that will be stored
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for key in listofpersonaldata.data:
            pattern = [{"LOWER": word.lower()} for word in key.split()]
            matcher.add("id", [pattern])
            match = matcher(doc)
            if len(match) > 0:
                # only report each personal-data category once
                is_not_in_list = True
                for result_cluster in result:
                    if listofpersonaldata.data[key] == result_cluster["text"]:
                        is_not_in_list = False
                if is_not_in_list:
                    result.append({"id": cluster["id"], "text": listofpersonaldata.data[key]})
            matcher.remove("id")
    return result
##
# Collects short list items (LI elements) from the clusters; the possessive
# "our" is replaced with "company" so the items read as neutral statements.
##
def collect_information_from_list(liste, nlp):
    result = []
    for cluster in liste:
        for entry in cluster["cluster"]:
            if entry["tag"] == "LI":
                doc = nlp(entry["text"])
                # only keep short list items
                if len(doc) < 15:
                    result_string = ""
                    for token in doc:
                        if token.lemma_ == "our":
                            result_string += "company" + " "
                        else:
                            result_string += token.text + " "
                    result.append({"id": cluster["id"], "text": result_string})
    return result
##
# This function returns a list of the data that may be collected by a policy
##
def run(liste, nlp):
    # list of clusters with their new tf-idf values
    liste_new_tfidf = evaluate_new_tfidf(liste, nlp)
    # calculate the average threshold
    average_threshold = summaryhelper.get_average_threshold(liste_new_tfidf)
    # list of clusters above the threshold
    liste = summaryhelper.get_clusters_above_average_threshold(liste_new_tfidf, average_threshold)
    # list of data collected from HTML list items
    list_collected_html = collect_information_from_list(liste, nlp)
    # collect the personal data mentioned in the remaining clusters
    result = collect_personal_data(liste, nlp)
    result = result + list_collected_html
    return result
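
# Minimal usage sketch (not part of the original module): assumes an English
# spaCy model such as "en_core_web_sm" is installed and that each cluster dict
# carries the keys read above ("id", "text", "average", and "cluster" holding
# tagged entries). The sample values are hypothetical.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    clusters = [{
        "id": 0,
        "text": "We collect your email address and your ip address.",
        "average": 0.5,
        "cluster": [{"tag": "LI", "text": "our services store your email address"}],
    }]
    for item in run(clusters, nlp):
        print(item)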