You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
131 lines
3.8 KiB
131 lines
3.8 KiB
4 years ago
|
import spacy
|
||
|
import json
|
||
|
from spacy.matcher import Matcher
|
||
|
import listofpersonaldata
|
||
|
import summaryhelper
|
||
|
|
||
|
|
||
|
##
|
||
|
#
|
||
|
# This function evaluates a new tf*idf value for each cluster
|
||
|
#
|
||
|
#
|
||
|
##
|
||
|
def evaluate_new_tfidf(liste, nlp):
    """Raise each cluster's "average" score by the personal-data terms it mentions.

    Every key of ``listofpersonaldata.data`` is split on whitespace and turned
    into a case-insensitive (``LOWER``) token pattern, which is matched against
    the cluster text. Each match found, over all keys, adds 0.2 to the
    cluster's ``"average"`` value.

    Args:
        liste: Iterable of cluster dicts. Each one must provide ``"text"`` and
            ``"average"``. The dicts are updated in place.
        nlp: Callable that turns a text into a document the matcher can
            process; must expose ``.vocab``.

    Returns:
        A list holding the same (mutated) cluster dicts in input order.
    """
    matcher = Matcher(nlp.vocab)

    # The patterns depend only on the keyword list, so build them once
    # instead of once per cluster.
    patterns = [
        [{"LOWER": word.lower()} for word in key.split()]
        for key in listofpersonaldata.data
    ]

    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hits = 0
        for pattern in patterns:
            # Register one pattern at a time so matches can be counted
            # (and reported) per keyword.
            matcher.add("id", [pattern])
            count = len(matcher(doc))
            matcher.remove("id")

            if count > 0:
                print(json.dumps(pattern) + " " + str(count))

            # Accumulate over all keywords; overwriting here would let only
            # the last keyword influence the score.
            hits += count

        cluster["average"] = cluster["average"] + (hits * 0.2)
        result.append(cluster)

    return result
|
||
|
|
||
|
|
||
|
|
||
|
##
|
||
|
# liste contains many dicts := {"id": "position in original policy", "text": "paragraph"}
|
||
|
##
|
||
|
def collect_personal_data(liste, nlp):
    """Collect the personal-data labels mentioned in the given clusters.

    Every key of ``listofpersonaldata.data`` is split on whitespace and turned
    into a case-insensitive (``LOWER``) token pattern. When a pattern matches a
    cluster's text, the value stored under that key is reported together with
    the id of the cluster. Each label is reported at most once, attributed to
    the first cluster in which it was found.

    Args:
        liste: Iterable of cluster dicts, each providing ``"id"`` and
            ``"text"``.
        nlp: Callable that turns a text into a document the matcher can
            process; must expose ``.vocab``.

    Returns:
        A list of ``{"id": <cluster id>, "text": <label>}`` dicts in order of
        first occurrence. Empty when nothing matched.
    """
    matcher = Matcher(nlp.vocab)

    # The patterns depend only on the keyword list, so build them once
    # instead of once per cluster.
    keyword_patterns = [
        (key, [{"LOWER": word.lower()} for word in key.split()])
        for key in listofpersonaldata.data
    ]

    # Start from a genuinely empty list: a placeholder entry would make a
    # real label with the same text look like a duplicate.
    result = []

    for cluster in liste:
        doc = nlp(cluster["text"])
        for key, pattern in keyword_patterns:
            matcher.add("id", [pattern])
            found = len(matcher(doc)) > 0
            matcher.remove("id")
            if not found:
                continue

            label = listofpersonaldata.data[key]
            already_reported = any(label == entry["text"] for entry in result)
            if not already_reported:
                result.append({"id": cluster["id"], "text": label})

    return result
|
||
|
|
||
|
|
||
|
def collect_information_from_list(liste, nlp):
    """Collect short HTML list items ("LI" entries) from the clusters.

    For every entry tagged ``"LI"`` the text is run through ``nlp`` and its
    token count is printed. Items with fewer than 15 tokens are kept: their
    tokens are re-joined with a single space after each token (so the text
    ends with a space), and any token whose lemma is ``"our"`` is replaced
    by ``"company"``.

    Args:
        liste: Iterable of cluster dicts, each providing ``"id"`` and
            ``"cluster"`` (an iterable of entries with ``"tag"`` and
            ``"text"``).
        nlp: Callable that turns a text into a sized, iterable sequence of
            tokens exposing ``.text`` and ``.lemma_``.

    Returns:
        A list of ``{"id": <cluster id>, "text": <rewritten item>}`` dicts.
    """
    collected = []
    for cluster in liste:
        for entry in cluster["cluster"]:
            # Only HTML list items are of interest here.
            if entry["tag"] != "LI":
                continue

            doc = nlp(entry["text"])
            token_count = len(doc)
            print(token_count)

            # Long items are skipped; only items under 15 tokens are kept.
            if token_count >= 15:
                continue

            pieces = [
                ("company" if token.lemma_ == "our" else token.text) + " "
                for token in doc
            ]
            collected.append({"id": cluster["id"], "text": "".join(pieces)})

    return collected
|
||
|
|
||
|
|
||
|
##
|
||
|
# This function returns a list of the data that may be collected by a policy
|
||
|
##
|
||
|
def run(liste, nlp):
    """Return the data items that a policy may collect.

    The clusters are re-scored with ``evaluate_new_tfidf``, then filtered
    through ``summaryhelper`` (presumably keeping the clusters whose score
    lies above the average threshold; verify against ``summaryhelper``).
    The remaining clusters are searched twice: once for short HTML list
    items and once for known personal-data terms.

    Args:
        liste: Cluster dicts as expected by ``evaluate_new_tfidf``.
        nlp: The language pipeline handed on to the helper functions.

    Returns:
        The personal-data matches followed by the collected HTML list items.
    """
    # Clusters with their new tf*idf values.
    rescored = evaluate_new_tfidf(liste, nlp)

    # Threshold computed over the re-scored clusters, then the filter step.
    threshold = summaryhelper.get_average_threshold(rescored)
    relevant = summaryhelper.get_clusters_above_average_threshold(rescored, threshold)

    # Data gathered from HTML list entries.
    from_html_lists = collect_information_from_list(relevant, nlp)
    print("---------------------------------------------------")
    print(len(from_html_lists))

    # Keyword matches come first, list items are appended after them.
    return collect_personal_data(relevant, nlp) + from_html_lists
|