Bachelor's thesis: "Ein Tool zur Erklärung von Datenschutzrichtlinien" (A Tool for Explaining Privacy Policies)

import spacy
import json
from spacy.matcher import Matcher
import listofthirdparties
import summaryhelper
import falsepositive
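# Note (editor's assumption, not verified against the helper modules): listofthirdparties.data
# and falsepositive.data are presumably plain lists of company names, and summaryhelper
# provides the threshold helpers used in run() below. Hypothetical example shapes:
#   listofthirdparties.data = ["Google Analytics", "Facebook Pixel", "Adobe"]
#   falsepositive.data = ["Inc", "GmbH"]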
##
#
# This function returns a list of organisations found by spaCy inside a cluster.
#
##
def collect_third(cluster, nlp):
    doc = nlp(cluster["text"])
    result = []
    for entity in doc.ents:
        if entity.label_ == "ORG":
            result.append({"id": cluster["id"], "text": entity.text})
    return result
##
#
# This function returns a list of companies matched against the list of third parties.
#
##
def get_organisation_from_lists(liste, nlp):
    matcher = Matcher(nlp.vocab)
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for key in listofthirdparties.data:
            # Build a lower-cased token pattern for the third-party name.
            pattern = []
            for word in key.split():
                pattern.append({"LOWER": word.lower()})
            matcher.add("id", [pattern])
            match = matcher(doc)
            if len(match) > 0:
                # Debug output: the third-party name that was matched.
                print(json.dumps(key))
                result.append({"id": cluster["id"], "text": key})
            matcher.remove("id")
    return result
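##
#
# Design note (editor's sketch, not part of the original pipeline): the loop above adds and
# removes the single pattern key "id" for every third party and every cluster. An equivalent
# approach is to register every third-party name once under its own match id and run the
# matcher a single time per document; the match id can be mapped back to the name via
# nlp.vocab.strings. Duplicate hits are filtered later in run().
#
##
def get_organisation_from_lists_batched(liste, nlp):
    matcher = Matcher(nlp.vocab)
    for key in listofthirdparties.data:
        pattern = [{"LOWER": word.lower()} for word in key.split()]
        matcher.add(key, [pattern])
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for match_id, start, end in matcher(doc):
            result.append({"id": cluster["id"], "text": nlp.vocab.strings[match_id]})
    return result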
##
#
# This function returns a list of organisations collected from the clusters.
#
##
def collect_third_parties(liste, nlp):
    # Placeholder element so the duplicate check below always has a list to compare against.
    result = [{"id": "none", "text": "none"}]
    for cluster in liste:
        org_hits = collect_third(cluster, nlp)
        for hits in org_hits:
            # Check whether this organisation is already inside the result list (skip duplicates).
            is_not_in_list = True
            for result_cluster in result:
                if hits["text"] == result_cluster["text"]:
                    is_not_in_list = False
            if is_not_in_list:
                result.append(hits)
    # Remove the placeholder element again.
    del result[0]
    return result
# This function is the entry point of the summary creator.
def run(liste, nlp):
    # Get the average threshold.
    average_threshold = summaryhelper.get_average_threshold(liste)
    # Get the clusters above the average threshold.
    liste_org = summaryhelper.get_clusters_above_average_threshold(liste, average_threshold)
    # Get the organisations that match entries of the third-party list.
    liste_analytics = get_organisation_from_lists(liste, nlp)
    # Get the organisations collected from the clusters above the threshold.
    result = collect_third_parties(liste_org, nlp)
    # Concatenate both lists of organisations.
    result_complete = liste_analytics + result
    result_without_duplicate = []
    for org in result_complete:
        # Skip known false positives.
        if org["text"] in falsepositive.data:
            continue
        # Check whether this organisation is already in the deduplicated list.
        is_org_not_inside = True
        for org_maybe_duplicate in result_without_duplicate:
            if org["text"] == org_maybe_duplicate["text"]:
                is_org_not_inside = False
        if is_org_not_inside:
            result_without_duplicate.append(org)
    return result_without_duplicate
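##
#
# Usage sketch (editor's assumption, not part of the original module): run() expects a loaded
# spaCy language object and a list of cluster dicts that contain at least "id" and "text";
# any additional fields needed by summaryhelper.get_average_threshold() are not visible from
# this file. The model name "en_core_web_sm" and the sample clusters are only examples.
#
##
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    clusters = [
        {"id": "c1", "text": "We share usage data with Google Analytics and Facebook."},
        {"id": "c2", "text": "Payment processing is handled by an external provider."},
    ]
    print(json.dumps(run(clusters, nlp), indent=2))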