You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
130 lines
3.8 KiB
130 lines
3.8 KiB
import spacy |
|
import json |
|
from spacy.matcher import Matcher |
|
import listofpersonaldata |
|
import summaryhelper |
|
|
|
|
|
## |
|
# |
|
# This function evaluate a new tf*idf value for each cluster |
|
# |
|
# |
|
## |
|
def evaluate_new_tfidf(liste, nlp):
    """Boost each cluster's tf*idf score by 0.2 per personal-data keyword hit.

    Bug fix: the original assigned ``hit = len(match)`` on every keyword
    iteration, so after the loop only the *last* keyword's match count was
    applied to ``cluster["average"]``. Hits are now accumulated across all
    keywords in ``listofpersonaldata.data``.  The matcher pattern is also
    removed unconditionally after each keyword, so each keyword is matched
    independently of the previous ones.

    Args:
        liste: list of cluster dicts, each with at least a "text" (paragraph)
               and an "average" (current tf*idf score) key.
        nlp: loaded spaCy language pipeline.

    Returns:
        The same cluster dicts with "average" increased by 0.2 per match.
    """
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)

    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hits = 0

        for key in listofpersonaldata.data:
            # One token-level, case-insensitive pattern entry per word.
            pattern = [{"LOWER": word.lower()} for word in key.split()]

            matcher.add("id", [pattern])
            matches = matcher(doc)
            if matches:
                print(json.dumps(pattern) + " " + str(len(matches)))
            hits += len(matches)
            # Remove the pattern so the next keyword starts from a clean matcher.
            matcher.remove("id")

        cluster["average"] = cluster["average"] + (hits * 0.2)
        result.append(cluster)

    return result
|
|
|
|
|
|
|
## |
|
# liste contains many dict := {id: "position in orginal policy, "text" : "paragraph"} |
|
## |
|
def collect_personal_data(liste, nlp):
    """Return the personal-data categories mentioned in the given clusters.

    ``liste`` contains dicts ``{"id": <position in original policy>,
    "text": <paragraph>}``.  For every keyword in ``listofpersonaldata.data``
    that occurs in a cluster's text, the mapped category is recorded once
    (first matching cluster wins); duplicate categories are skipped.

    Fixes over the original: drops the ``{"id": "none", "text": "none"}``
    placeholder + ``del result[0]`` hack (which also made a category literally
    named "none" undetectable) and replaces the linear duplicate scan with a
    set lookup.

    Args:
        liste: list of cluster dicts with "id" and "text" keys.
        nlp: loaded spaCy language pipeline.

    Returns:
        list of ``{"id": ..., "text": <category>}`` dicts without duplicates.
    """
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)

    result = []
    seen_categories = set()  # categories already appended to result

    for cluster in liste:
        doc = nlp(cluster["text"])

        for key in listofpersonaldata.data:
            # One token-level, case-insensitive pattern entry per word.
            pattern = [{"LOWER": word.lower()} for word in key.split()]

            matcher.add("id", [pattern])
            match = matcher(doc)
            matcher.remove("id")

            if match:
                category = listofpersonaldata.data[key]
                if category not in seen_categories:
                    seen_categories.add(category)
                    result.append({"id": cluster["id"], "text": category})

    return result
|
|
|
|
|
def collect_information_from_list(liste, nlp):
    """Extract short HTML list items (tag "LI") as collected-data entries.

    Only list entries shorter than 15 tokens are kept.  Tokens whose lemma is
    "our" are replaced by "company" so the entries read as third-person
    statements about the company.

    Fix over the original: removed a leftover debugging ``print(len(doc))``.
    The produced strings are otherwise byte-identical (token order preserved,
    one space after every token including the last).

    Args:
        liste: list of cluster dicts, each with an "id" key and a "cluster"
               list of ``{"tag": ..., "text": ...}`` entries.
        nlp: loaded spaCy language pipeline.

    Returns:
        list of ``{"id": ..., "text": ...}`` dicts.
    """
    result = []
    for cluster in liste:
        for entry in cluster["cluster"]:
            if entry["tag"] != "LI":
                continue
            doc = nlp(entry["text"])
            if len(doc) < 15:
                # Rebuild the sentence token by token, swapping "our" -> "company";
                # every token is followed by a single space (matches original output).
                words = ("company" if token.lemma_ == "our" else token.text
                         for token in doc)
                result_string = "".join(word + " " for word in words)
                result.append({"id": cluster["id"], "text": result_string})

    return result
|
|
|
|
|
## |
|
# This function returns a list of data may collected by a policy |
|
## |
|
def run(liste, nlp):
    """Return a list of the data that may be collected by a policy.

    Pipeline:
      1. Re-score clusters with personal-data keyword hits
         (``evaluate_new_tfidf``).
      2. Keep only clusters scoring above the average threshold
         (``summaryhelper``).
      3. Merge keyword-based category matches with short HTML list items.

    Fix over the original: removed leftover debugging ``print`` statements
    (separator line and list-length dump).

    Args:
        liste: list of cluster dicts (see ``collect_personal_data`` /
               ``collect_information_from_list`` for the expected shapes).
        nlp: loaded spaCy language pipeline.

    Returns:
        Combined list of ``{"id": ..., "text": ...}`` dicts.
    """
    # Clusters with their re-evaluated tf*idf values.
    liste_new_tfidf = evaluate_new_tfidf(liste, nlp)

    # Keep only the clusters scoring above the average threshold.
    average_threshold = summaryhelper.get_average_threshold(liste_new_tfidf)
    liste = summaryhelper.get_clusters_above_average_threshold(
        liste_new_tfidf, average_threshold)

    # Data collected from HTML lists plus keyword matches in the text.
    list_collected_html = collect_information_from_list(liste, nlp)
    result = collect_personal_data(liste, nlp)

    return result + list_collected_html
|
|
|