Bachelor's thesis: "Ein Tool zur Erklärung von Datenschutzrichtlinien" (A Tool for Explaining Privacy Policies)

import spacy
import json
from spacy.matcher import Matcher
import listofthirdparties
import summaryhelper
import falsepositive
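# Note (editor's assumption, not verified against the helper modules): listofthirdparties.data
# and falsepositive.data are presumably plain lists of company names, and summaryhelper
# provides the threshold helpers used in run() below. Hypothetical example shapes:
#   listofthirdparties.data = ["Google Analytics", "Facebook Pixel", "Adobe"]
#   falsepositive.data = ["Inc", "GmbH"]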
##
#
# This function returns a list of organisations found by spaCy inside a cluster.
#
##
def collect_third(cluster, nlp):
    doc = nlp(cluster["text"])
    result = []
    for entity in doc.ents:
        if entity.label_ == "ORG":
            result.append({"id": cluster["id"], "text": entity.text})
    return result
##
#
# This function returns a list of companies matched against the list of third parties.
#
##
def get_organisation_from_lists(liste, nlp):
    matcher = Matcher(nlp.vocab)
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for key in listofthirdparties.data:
            # Build a lower-cased token pattern for the third-party name.
            pattern = []
            for word in key.split():
                pattern.append({"LOWER": word.lower()})
            matcher.add("id", [pattern])
            match = matcher(doc)
            if len(match) > 0:
                # Debug output: the third-party name that was matched.
                print(json.dumps(key))
                result.append({"id": cluster["id"], "text": key})
            matcher.remove("id")
    return result
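##
#
# Design note (editor's sketch, not part of the original pipeline): the loop above adds and
# removes the single pattern key "id" for every third party and every cluster. An equivalent
# approach is to register every third-party name once under its own match id and run the
# matcher a single time per document; the match id can be mapped back to the name via
# nlp.vocab.strings. Duplicate hits are filtered later in run().
#
##
def get_organisation_from_lists_batched(liste, nlp):
    matcher = Matcher(nlp.vocab)
    for key in listofthirdparties.data:
        pattern = [{"LOWER": word.lower()} for word in key.split()]
        matcher.add(key, [pattern])
    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        for match_id, start, end in matcher(doc):
            result.append({"id": cluster["id"], "text": nlp.vocab.strings[match_id]})
    return result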
##
#
# This function returns a list of organisations collected from the clusters.
#
##
def collect_third_parties(liste, nlp):
    # Placeholder element so the duplicate check below always has a list to compare against.
    result = [{"id": "none", "text": "none"}]
    for cluster in liste:
        org_hits = collect_third(cluster, nlp)
        for hits in org_hits:
            # Check whether this organisation is already inside the result list (skip duplicates).
            is_not_in_list = True
            for result_cluster in result:
                if hits["text"] == result_cluster["text"]:
                    is_not_in_list = False
            if is_not_in_list:
                result.append(hits)
    # Remove the placeholder element again.
    del result[0]
    return result
# This function is the entry point of the summary creator.
def run(liste, nlp):
    # Get the average threshold.
    average_threshold = summaryhelper.get_average_threshold(liste)
    # Get the clusters above the average threshold.
    liste_org = summaryhelper.get_clusters_above_average_threshold(liste, average_threshold)
    # Get the organisations that match entries of the third-party list.
    liste_analytics = get_organisation_from_lists(liste, nlp)
    # Get the organisations collected from the clusters above the threshold.
    result = collect_third_parties(liste_org, nlp)
    # Concatenate both lists of organisations.
    result_complete = liste_analytics + result
    result_without_duplicate = []
    for org in result_complete:
        # Skip known false positives.
        if org["text"] in falsepositive.data:
            continue
        # Check whether this organisation is already in the deduplicated list.
        is_org_not_inside = True
        for org_maybe_duplicate in result_without_duplicate:
            if org["text"] == org_maybe_duplicate["text"]:
                is_org_not_inside = False
        if is_org_not_inside:
            result_without_duplicate.append(org)
    return result_without_duplicate
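##
#
# Usage sketch (editor's assumption, not part of the original module): run() expects a loaded
# spaCy language object and a list of cluster dicts that contain at least "id" and "text";
# any additional fields needed by summaryhelper.get_average_threshold() are not visible from
# this file. The model name "en_core_web_sm" and the sample clusters are only examples.
#
##
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    clusters = [
        {"id": "c1", "text": "We share usage data with Google Analytics and Facebook."},
        {"id": "c2", "text": "Payment processing is handled by an external provider."},
    ]
    print(json.dumps(run(clusters, nlp), indent=2))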