# Bachelorarbeit/server-final/collectthirdparty.py

import spacy
import json
from spacy.matcher import Matcher
import listofthirdparties
import summaryhelper
import falsepositive


##
#
# This function returns a list of the organisations (ORG entities) that spaCy finds inside a cluster
#
##
def collect_third(cluster,nlp):
    """Return the organisations spaCy recognises in one cluster.

    Args:
        cluster: Mapping that is read with the keys ``"id"`` and ``"text"``.
        nlp: Callable (spaCy pipeline) applied to the cluster text; the
            returned document must expose its entities as ``ents``.

    Returns:
        A list with one ``{"id": <cluster id>, "text": <entity text>}`` dict
        per entity labelled ``"ORG"``, in document order.
    """
    parsed = nlp(cluster["text"])
    return [
        {"id": cluster["id"], "text": ent.text}
        for ent in parsed.ents
        if ent.label_ == "ORG"
    ]

##
#
# This function returns a list of companies matched by the list of third parties.
#
##
def get_organisation_from_lists(liste,nlp):
    """Return the known third parties whose names occur in the clusters.

    Every key of ``listofthirdparties.data`` is turned into a case-insensitive
    token pattern (one ``LOWER`` token per whitespace-separated word) and
    matched against the text of each cluster.

    Args:
        liste: Iterable of clusters; each is read with the keys ``"id"`` and
            ``"text"``.
        nlp: spaCy pipeline; supplies the vocabulary for the matcher and
            tokenises the cluster text.

    Returns:
        A list of ``{"id": <cluster id>, "text": <key>}`` dicts, one per
        (cluster, key) pair that matched, ordered by cluster and then by the
        iteration order of ``listofthirdparties.data``.
    """
    matcher = Matcher(nlp.vocab)

    # Register every third-party name once, using the name itself as match id,
    # instead of adding and removing a pattern for every cluster/key pair.
    keys = []
    for key in listofthirdparties.data:
        pattern = [{"LOWER": word.lower()} for word in key.split()]
        if not pattern:
            # An empty or whitespace-only key has no tokens; the matcher
            # rejects zero-token patterns, so such keys are skipped.
            continue
        matcher.add(key, [pattern])
        keys.append(key)

    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])
        # The matcher reports hashed ids; map them back to the key strings.
        matched = {
            nlp.vocab.strings[match_id]
            for match_id, _start, _end in matcher(doc)
        }

        # Walk the keys (not the match set) to keep a deterministic order and
        # exactly one entry per key, however often it occurs in the text.
        for key in keys:
            if key in matched:
                result.append({"id": cluster["id"], "text": key})

    return result


##
#
# This function returns a list of the organisations collected from the clusters (duplicates removed)
#
##
def collect_third_parties(liste,nlp):
    """Collect the organisations found in the clusters, without duplicates.

    Args:
        liste: Iterable of clusters, each passed on to ``collect_third``.
        nlp: spaCy pipeline handed through to ``collect_third``.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts as produced by
        ``collect_third``. Only the first hit for each distinct ``"text"``
        is kept; order of first appearance is preserved.
    """
    result = []
    seen_texts = set()

    for cluster in liste:
        for hit in collect_third(cluster, nlp):
            # Decide per hit: a duplicate must not suppress the remaining,
            # still unseen organisations of the same cluster.
            if hit["text"] in seen_texts:
                continue

            seen_texts.add(hit["text"])
            result.append(hit)

    return result


# this function is the entrypoint to the summarycreator
def run(liste,nlp):
    """Entry point: gather the third-party organisations for the clusters.

    Combines the organisations matched from the third-party list (searched in
    all clusters) with the organisations collected from the clusters that
    ``summaryhelper`` selects for the average threshold, then drops false
    positives and duplicates.

    Args:
        liste: The clusters to analyse.
        nlp: spaCy pipeline passed on to the collecting functions.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts with unique ``"text"``
        values, none of which is contained in ``falsepositive.data``.
        List matches come first, followed by the collected organisations.
    """
    # Average threshold and the clusters selected with it
    threshold = summaryhelper.get_average_threshold(liste)
    clusters_above = summaryhelper.get_clusters_above_average_threshold(liste, threshold)

    # Organisations matching the entries of the third-party list
    listed_orgs = get_organisation_from_lists(liste, nlp)
    # Organisations collected from the clusters above the threshold
    detected_orgs = collect_third_parties(clusters_above, nlp)

    unique_orgs = []
    for candidate in listed_orgs + detected_orgs:
        if candidate["text"] in falsepositive.data:
            continue

        already_kept = any(
            candidate["text"] == kept["text"] for kept in unique_orgs
        )
        if not already_kept:
            unique_orgs.append(candidate)

    return unique_orgs