import spacy
import json
from spacy.matcher import Matcher

import listofpersonaldata
import summaryhelper


##
#
# This function evaluates a new tf*idf value for each cluster by boosting the
# cluster's average score for every personal-data term it mentions.
#
##
def evaluate_new_tfidf(liste, nlp):
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        hits = 0
        for key in listofpersonaldata.data:
            # build a case-insensitive token pattern for the personal-data term
            pattern = [{"LOWER": word.lower()} for word in key.split()]
            matcher.add("id", [pattern])
            matches = matcher(doc)
            if matches:
                print(json.dumps(pattern) + " " + str(len(matches)))
            # accumulate the matches over all personal-data terms
            hits += len(matches)
            matcher.remove("id")
        # boost the cluster's tf*idf score by 0.2 per match
        cluster["average"] = cluster["average"] + (hits * 0.2)
        result.append(cluster)
    return result


##
# liste contains many dicts := {"id": "position in original policy", "text": "paragraph"}
##
def collect_personal_data(liste, nlp):
    # init the spaCy matcher
    matcher = Matcher(nlp.vocab)
    # this list will contain the data that will be stored
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for key in listofpersonaldata.data:
            pattern = [{"LOWER": word.lower()} for word in key.split()]
            matcher.add("id", [pattern])
            matches = matcher(doc)
            if matches:
                data_label = listofpersonaldata.data[key]
                # store each kind of personal data only once
                if all(entry["text"] != data_label for entry in result):
                    result.append({"id": cluster["id"], "text": data_label})
            matcher.remove("id")
    return result


##
# Collects short HTML list items (LI) from the clusters and rewrites
# "our" to "company" so the extracted items read in the third person.
##
def collect_information_from_list(liste, nlp):
    result = []
    for cluster in liste:
        for entry in cluster["cluster"]:
            if entry["tag"] == "LI":
                doc = nlp(entry["text"])
                print(len(doc))
                # only keep short list items
                if len(doc) < 15:
                    result_string = ""
                    for token in doc:
                        if token.lemma_ == "our":
                            result_string += "company" + " "
                        else:
                            result_string += token.text + " "
                    result.append({"id": cluster["id"], "text": result_string})
    return result


##
# This function returns a list of data that may be collected according to a policy.
##
def run(liste, nlp):
    # list of clusters with their new tf*idf values
    liste_new_tfidf = evaluate_new_tfidf(liste, nlp)
    # calculate the average threshold
    average_threshold = summaryhelper.get_average_threshold(liste_new_tfidf)
    # list of clusters above the threshold
    liste = summaryhelper.get_clusters_above_average_threshold(liste_new_tfidf, average_threshold)
    # list of data collected from HTML lists
    list_collected_html = collect_information_from_list(liste, nlp)
    print("---------------------------------------------------")
    print(len(list_collected_html))
    # collect the personal data mentioned in the remaining clusters
    result = collect_personal_data(liste, nlp)
    result = result + list_collected_html
    return result
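

# --- Usage sketch (not part of the original module) ---
# Illustrative only: the model name "en_core_web_sm" and the sample cluster
# below are assumptions. Real clusters come from the policy-segmentation step
# and already carry an "average" tf*idf score, the paragraph "text", and the
# tagged "cluster" entries extracted from the HTML.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")  # assumed English model
    sample_clusters = [
        {
            "id": 0,
            "text": "We collect your email address and phone number when you register.",
            "average": 0.4,
            "cluster": [
                {"tag": "LI", "text": "We may share your email address with our partners."},
            ],
        },
    ]
    for item in run(sample_clusters, nlp):
        print(item)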