You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
130 lines
3.8 KiB
130 lines
3.8 KiB
import spacy |
|
import json |
|
from spacy.matcher import Matcher |
|
import listofpersonaldata |
|
import summaryhelper |
|
|
|
|
|
## |
|
# |
|
# This function evaluate a new tf*idf value for each cluster |
|
# |
|
# |
|
## |
|
def evaluate_new_tfidf(liste, nlp):
    """Boost each cluster's tf*idf score by 0.2 per personal-data keyword hit.

    Bug fix: the original assigned ``hit = len(match)`` on every keyword
    iteration, so after the loop only the *last* keyword's match count was
    applied to ``cluster["average"]``. Hits are now accumulated across all
    keywords in ``listofpersonaldata.data``.  The matcher pattern is also
    removed unconditionally after each keyword, so each keyword is matched
    independently of the previous ones.

    Args:
        liste: list of cluster dicts, each with at least a "text" (paragraph)
               and an "average" (current tf*idf score) key.
        nlp: loaded spaCy language pipeline.

    Returns:
        The same cluster dicts with "average" increased by 0.2 per match.
    """
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)

    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hits = 0

        for key in listofpersonaldata.data:
            # One token-level, case-insensitive pattern entry per word.
            pattern = [{"LOWER": word.lower()} for word in key.split()]

            matcher.add("id", [pattern])
            matches = matcher(doc)
            if matches:
                print(json.dumps(pattern) + " " + str(len(matches)))
            hits += len(matches)
            # Remove the pattern so the next keyword starts from a clean matcher.
            matcher.remove("id")

        cluster["average"] = cluster["average"] + (hits * 0.2)
        result.append(cluster)

    return result
|
|
|
|
|
|
|
## |
|
# liste contains many dict := {id: "position in orginal policy, "text" : "paragraph"} |
|
## |
|
def collect_personal_data(liste, nlp):
    """Return the personal-data categories mentioned in the given clusters.

    ``liste`` contains dicts ``{"id": <position in original policy>,
    "text": <paragraph>}``.  For every keyword in ``listofpersonaldata.data``
    that occurs in a cluster's text, the mapped category is recorded once
    (first matching cluster wins); duplicate categories are skipped.

    Fixes over the original: drops the ``{"id": "none", "text": "none"}``
    placeholder + ``del result[0]`` hack (which also made a category literally
    named "none" undetectable) and replaces the linear duplicate scan with a
    set lookup.

    Args:
        liste: list of cluster dicts with "id" and "text" keys.
        nlp: loaded spaCy language pipeline.

    Returns:
        list of ``{"id": ..., "text": <category>}`` dicts without duplicates.
    """
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)

    result = []
    seen_categories = set()  # categories already appended to result

    for cluster in liste:
        doc = nlp(cluster["text"])

        for key in listofpersonaldata.data:
            # One token-level, case-insensitive pattern entry per word.
            pattern = [{"LOWER": word.lower()} for word in key.split()]

            matcher.add("id", [pattern])
            match = matcher(doc)
            matcher.remove("id")

            if match:
                category = listofpersonaldata.data[key]
                if category not in seen_categories:
                    seen_categories.add(category)
                    result.append({"id": cluster["id"], "text": category})

    return result
|
|
|
|
|
def collect_information_from_list(liste, nlp):
    """Extract short HTML list items (tag "LI") as collected-data entries.

    Only list entries shorter than 15 tokens are kept.  Tokens whose lemma is
    "our" are replaced by "company" so the entries read as third-person
    statements about the company.

    Fix over the original: removed a leftover debugging ``print(len(doc))``.
    The produced strings are otherwise byte-identical (token order preserved,
    one space after every token including the last).

    Args:
        liste: list of cluster dicts, each with an "id" key and a "cluster"
               list of ``{"tag": ..., "text": ...}`` entries.
        nlp: loaded spaCy language pipeline.

    Returns:
        list of ``{"id": ..., "text": ...}`` dicts.
    """
    result = []
    for cluster in liste:
        for entry in cluster["cluster"]:
            if entry["tag"] != "LI":
                continue
            doc = nlp(entry["text"])
            if len(doc) < 15:
                # Rebuild the sentence token by token, swapping "our" -> "company";
                # every token is followed by a single space (matches original output).
                words = ("company" if token.lemma_ == "our" else token.text
                         for token in doc)
                result_string = "".join(word + " " for word in words)
                result.append({"id": cluster["id"], "text": result_string})

    return result
|
|
|
|
|
## |
|
# This function returns a list of data may collected by a policy |
|
## |
|
def run(liste, nlp):
    """Return a list of the data that may be collected by a policy.

    Pipeline:
      1. Re-score clusters with personal-data keyword hits
         (``evaluate_new_tfidf``).
      2. Keep only clusters scoring above the average threshold
         (``summaryhelper``).
      3. Merge keyword-based category matches with short HTML list items.

    Fix over the original: removed leftover debugging ``print`` statements
    (separator line and list-length dump).

    Args:
        liste: list of cluster dicts (see ``collect_personal_data`` /
               ``collect_information_from_list`` for the expected shapes).
        nlp: loaded spaCy language pipeline.

    Returns:
        Combined list of ``{"id": ..., "text": ...}`` dicts.
    """
    # Clusters with their re-evaluated tf*idf values.
    liste_new_tfidf = evaluate_new_tfidf(liste, nlp)

    # Keep only the clusters scoring above the average threshold.
    average_threshold = summaryhelper.get_average_threshold(liste_new_tfidf)
    liste = summaryhelper.get_clusters_above_average_threshold(
        liste_new_tfidf, average_threshold)

    # Data collected from HTML lists plus keyword matches in the text.
    list_collected_html = collect_information_from_list(liste, nlp)
    result = collect_personal_data(liste, nlp)

    return result + list_collected_html
|
|
|