import spacy
import json
from spacy.matcher import Matcher

import listofpersonaldata
import summaryhelper


##
#
# This function evaluates a new tf*idf value for each cluster by boosting the
# cluster's average score for every personal-data term it mentions.
#
##
def evaluate_new_tfidf(liste, nlp):
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hits = 0
        for key in listofpersonaldata.data:
            # build a case-insensitive token pattern for the personal-data term
            pattern = [{"LOWER": word.lower()} for word in key.split()]
            matcher.add("id", [pattern])
            matches = matcher(doc)
            if matches:
                print(json.dumps(pattern) + " " + str(len(matches)))
            # accumulate the matches over all personal-data terms
            hits += len(matches)
            matcher.remove("id")
        # boost the cluster's tf*idf score by 0.2 per match
        cluster["average"] = cluster["average"] + (hits * 0.2)
        result.append(cluster)
    return result


##
# liste contains many dicts := {"id": "position in original policy", "text": "paragraph"}
##
def collect_personal_data(liste, nlp):
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)
    # this list will contain the data that will be stored
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for key in listofpersonaldata.data:
            pattern = [{"LOWER": word.lower()} for word in key.split()]
            matcher.add("id", [pattern])
            matches = matcher(doc)
            if matches:
                data_label = listofpersonaldata.data[key]
                # store each kind of personal data only once
                if all(entry["text"] != data_label for entry in result):
                    result.append({"id": cluster["id"], "text": data_label})
            matcher.remove("id")
    return result


##
# Collects short HTML list items (LI) from the clusters and rewrites
# "our" to "company" so the extracted items read in the third person.
##
def collect_information_from_list(liste, nlp):
    result = []
    for cluster in liste:
        for entry in cluster["cluster"]:
            if entry["tag"] == "LI":
                doc = nlp(entry["text"])
                print(len(doc))
                # only keep short list items
                if len(doc) < 15:
                    result_string = ""
                    for token in doc:
                        if token.lemma_ == "our":
                            result_string += "company" + " "
                        else:
                            result_string += token.text + " "
                    result.append({"id": cluster["id"], "text": result_string})
    return result


##
# This function returns a list of data that may be collected according to a policy.
##
def run(liste, nlp):
    # list of clusters with their new tf*idf values
    liste_new_tfidf = evaluate_new_tfidf(liste, nlp)
    # calculate the average threshold
    average_threshold = summaryhelper.get_average_threshold(liste_new_tfidf)
    # list of clusters above the threshold
    liste = summaryhelper.get_clusters_above_average_threshold(liste_new_tfidf, average_threshold)
    # list of data collected from HTML lists
    list_collected_html = collect_information_from_list(liste, nlp)
    print("---------------------------------------------------")
    print(len(list_collected_html))
    # collect the personal data mentioned in the remaining clusters
    result = collect_personal_data(liste, nlp)
    result = result + list_collected_html
    return result
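

# --- Usage sketch (not part of the original module) ---
# Illustrative only: the model name "en_core_web_sm" and the sample cluster
# below are assumptions. Real clusters come from the policy-segmentation step
# and already carry an "average" tf*idf score, the paragraph "text", and the
# tagged "cluster" entries extracted from the HTML.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")  # assumed English model
    sample_clusters = [
        {
            "id": 0,
            "text": "We collect your email address and phone number when you register.",
            "average": 0.4,
            "cluster": [
                {"tag": "LI", "text": "We may share your email address with our partners."},
            ],
        },
    ]
    for item in run(sample_clusters, nlp):
        print(item)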