Bachelorarbeit/server-final/collectpersonaldata.py

import spacy
import json
from spacy.matcher import Matcher
import listofpersonaldata
import summaryhelper


##
#
# This function evaluates a new tf*idf value for each cluster
#
#
##
def evaluate_new_tfidf(liste, nlp):
    """Raise each cluster's "average" score by its personal-data keyword hits.

    Every key of ``listofpersonaldata.data`` is split on whitespace and turned
    into a case-insensitive token pattern (one ``LOWER`` entry per word). The
    matches of all keys found in a cluster's text are summed, and every match
    adds 0.2 to ``cluster["average"]``.

    Args:
        liste: Iterable of cluster dicts; each must provide the keys "text"
            and "average".
        nlp: spaCy pipeline object; it is called on the cluster text and its
            ``vocab`` is used to create the Matcher.

    Returns:
        A new list containing the same cluster dicts. Note that the dicts are
        updated in place, so the caller's clusters carry the new "average" too.
    """
    matcher = Matcher(nlp.vocab)

    # The patterns depend only on the keyword table, so build them once
    # instead of rebuilding them for every cluster.
    patterns = [
        [{"LOWER": word.lower()} for word in key.split()]
        for key in listofpersonaldata.data
    ]

    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hits = 0
        for pattern in patterns:
            # Register one pattern at a time so the count can be reported
            # per keyword.
            matcher.add("id", [pattern])
            count = len(matcher(doc))
            matcher.remove("id")

            if count > 0:
                print(json.dumps(pattern) + " " + str(count))

            # Accumulate over all keywords. Previously the counter was
            # overwritten on each iteration, so only the last keyword of the
            # table influenced the score.
            hits += count

        cluster["average"] = cluster["average"] + (hits * 0.2)
        result.append(cluster)

    return result


##
# liste contains many dicts := {"id": "position in original policy", "text": "paragraph"}
##
def collect_personal_data(liste, nlp):
    """Collect the personal-data entries whose keywords occur in the clusters.

    Each key of ``listofpersonaldata.data`` is matched case-insensitively,
    word by word, against every cluster text. When a key matches, the value
    stored under that key is recorded together with the id of the cluster it
    was found in. A value is recorded only once; the first cluster in which
    it is found wins.

    Args:
        liste: Iterable of cluster dicts providing the keys "id" and "text".
        nlp: spaCy pipeline object; it is called on the cluster text and its
            ``vocab`` is used to create the Matcher.

    Returns:
        A list of dicts of the form {"id": <cluster id>, "text": <value>}.
    """
    matcher = Matcher(nlp.vocab)

    # Index 0 holds a placeholder entry while collecting; it is stripped off
    # again before the list is returned.
    found = [{"id": "none", "text": "none"}]

    for cluster in liste:
        doc = nlp(cluster["text"])
        for keyword in listofpersonaldata.data:
            token_pattern = [{"LOWER": part.lower()} for part in keyword.split()]

            matcher.add("id", [token_pattern])
            if len(matcher(doc)) > 0:
                value = listofpersonaldata.data[keyword]
                already_listed = any(value == entry["text"] for entry in found)
                if not already_listed:
                    found.append({"id": cluster["id"], "text": value})
            matcher.remove("id")

    return found[1:]


def collect_information_from_list(liste, nlp):
    """Extract short list items ("LI" entries) from the given clusters.

    For every entry tagged "LI" the text is run through ``nlp`` and its token
    count is printed. Items with fewer than 15 tokens are kept: their tokens
    are re-joined with a single space after each token (so the text ends with
    a space), and any token whose lemma is "our" is replaced by "company".

    Args:
        liste: Iterable of cluster dicts providing "id" and "cluster", where
            "cluster" is an iterable of dicts with the keys "tag" and "text".
        nlp: Callable turning a text into a sized, iterable sequence of tokens
            that expose ``text`` and ``lemma_``.

    Returns:
        A list of dicts of the form {"id": <cluster id>, "text": <item text>}.
    """
    collected = []
    for cluster in liste:
        list_items = (item for item in cluster["cluster"] if item["tag"] == "LI")
        for item in list_items:
            doc = nlp(item["text"])
            token_count = len(doc)
            print(token_count)

            # Long items are skipped; only items below 15 tokens are kept.
            if token_count >= 15:
                continue

            words = (
                "company" if token.lemma_ == "our" else token.text
                for token in doc
            )
            text = "".join(word + " " for word in words)
            collected.append({"id": cluster["id"], "text": text})

    return collected
                     

##
# This function returns a list of the data that may be collected by a policy
##
def run(liste, nlp):
    """Return the data entries collected from the given policy clusters.

    Steps:
      1. Re-score the clusters with ``evaluate_new_tfidf``.
      2. Obtain a threshold from ``summaryhelper.get_average_threshold`` and
         reduce the clusters with
         ``summaryhelper.get_clusters_above_average_threshold``.
      3. Collect entries from the remaining clusters' list items and from the
         keyword matches in their texts.

    Args:
        liste: Cluster dicts as expected by ``evaluate_new_tfidf``.
        nlp: spaCy pipeline object handed on to the collecting functions.

    Returns:
        The keyword-based entries followed by the list-item entries.
    """
    # Clusters with their adjusted tf*idf values.
    rescored = evaluate_new_tfidf(liste, nlp)
    threshold = summaryhelper.get_average_threshold(rescored)
    # Only the clusters selected by the threshold helper are examined further.
    relevant = summaryhelper.get_clusters_above_average_threshold(rescored, threshold)

    # Entries taken from HTML list items.
    html_list_entries = collect_information_from_list(relevant, nlp)
    print("---------------------------------------------------")
    print(len(html_list_entries))

    # Keyword-based entries first, list-item entries appended after them.
    return collect_personal_data(relevant, nlp) + html_list_entries
Umzug auf Gitea 4 years ago			`import spacy`
			`import json`
			`from spacy.matcher import Matcher`
			`import listofpersonaldata`
			`import summaryhelper`


			`##`
			`#`
			`# This function evaluate a new tf*idf value for each cluster`
			`#`
			`#`
			`##`
			`def evaluate_new_tfidf(liste, nlp):`
			`# inits the spacy matcher`
			`matcher = Matcher(nlp.vocab)`

			`result = []`
			`for cluster in liste:`
			`doc = nlp(cluster["text"])`
			`hit = 0`
			`for key in listofpersonaldata.data:`
			`pattern = []`

			`for word in key.split():`
			`pattern.append({"LOWER": word.lower()})`

			`#print(json.dumps(pattern))`
			`matcher.add("id", [pattern])`
			`match = matcher(doc)`
			`hit = len(match)`

			`if hit > 0:`
			`print(json.dumps(pattern)+" "+str(hit))`
			`matcher.remove("id")`


			`cluster["average"] = cluster["average"] + (hit * 0.2)`
			`result.append(cluster)`

			`#print("---- New tfidf value ----")`
			`#for cluster in result:`
			`#print(cluster["average"])`

			`return result`



			`##`
			`# liste contains many dict := {id: "position in orginal policy, "text" : "paragraph"}`
			`##`
			`def collect_personal_data(liste, nlp):`
			`# inits the spacy matcher`
			`matcher = Matcher(nlp.vocab)`

			`# this list will contain the data that will be stored`
			`result = [{"id": "none", "text": "none"}]`


			`for cluster in liste:`
			`doc = nlp(cluster["text"])`
			`#print(cluster["text"])`
			`for key in listofpersonaldata.data:`
			`pattern = []`

			`for word in key.split():`
			`pattern.append({"LOWER": word.lower()})`

			`matcher.add("id", [pattern])`
			`match = matcher(doc)`
			`if len(match) > 0:`
			`is_not_in_list = True`
			`for result_cluster in result:`
			`if listofpersonaldata.data[key] == result_cluster["text"]:`
			`is_not_in_list = False`

			`if is_not_in_list:`
			`result.append({"id": cluster["id"],"text": listofpersonaldata.data[key]})`

			`matcher.remove("id")`

			`del result[0]`
			`#for re in result:`
			`#print(re)`

			`return result`


			`def collect_information_from_list(liste, nlp):`
			`result = []`
			`for cluster in liste:`
			`for entry in cluster["cluster"]:`
			`if entry["tag"] == "LI":`
			`doc = nlp(entry["text"])`
			`print(len(doc))`
			`if len(doc) < 15:`
			`result_string = ""`
			`for token in doc:`
			`if token.lemma_ == "our":`
			`result_string += "company" + " "`
			`else:`
			`result_string += token.text + " "`

			`result.append({"id": cluster["id"], "text": result_string})`

			`return result`


			`##`
			`# This function returns a list of data may collected by a policy`
			`##`
			`def run(liste, nlp):`
			`#print("----- output collected data -----")`
			`#print(len(liste))`

			`#list of clusters with theire new tfidf values`
			`liste_new_tfidf = evaluate_new_tfidf(liste,nlp)`
			`#calculating average threshold`
			`average_threshold = summaryhelper.get_average_threshold(liste_new_tfidf)`
			`# list of cluster above the threhold`
			`liste = summaryhelper.get_clusters_above_average_threshold(liste_new_tfidf, average_threshold)`

			`# list of collected data by a html list`
			`list_collected_html = collect_information_from_list(liste,nlp)`
			`print("---------------------------------------------------")`
			`print(len(list_collected_html))`
			`# result the collected informations`
			`result = collect_personal_data(liste, nlp)`
			`result = result + list_collected_html`
			`return result`