You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
110 lines
2.9 KiB
110 lines
2.9 KiB
4 years ago
|
import spacy
|
||
|
import json
|
||
|
from spacy.matcher import Matcher
|
||
|
import listofthirdparties
|
||
|
import summaryhelper
|
||
|
import falsepositive
|
||
|
|
||
|
|
||
|
##
|
||
|
#
|
||
|
# This function returns the list of organisations that spaCy finds inside a cluster.
|
||
|
#
|
||
|
##
|
||
|
def collect_third(cluster, nlp):
    """Return the organisations that spaCy recognises in one cluster.

    Runs ``nlp`` on ``cluster["text"]`` and keeps every entity whose label
    is ``"ORG"``.

    Args:
        cluster: Mapping providing the keys ``"id"`` and ``"text"``.
        nlp: Callable that turns a text into a document exposing ``ents``.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts, one per ORG entity in
        document order; ``"id"`` is the cluster's id and ``"text"`` is the
        entity text.
    """
    parsed = nlp(cluster["text"])
    return [
        {"id": cluster["id"], "text": ent.text}
        for ent in parsed.ents
        if ent.label_ == "ORG"
    ]
|
||
|
|
||
|
##
|
||
|
#
|
||
|
# This function returns a list of companies matched by the list of third parties.
|
||
|
#
|
||
|
##
|
||
|
def get_organisation_from_lists(liste, nlp):
    """Return the known third parties whose names occur in the clusters.

    Every entry of ``listofthirdparties.data`` is split on whitespace and
    turned into a case-insensitive token pattern (``LOWER``). Each pattern is
    then matched against the parsed text of every cluster.

    Args:
        liste: Iterable of clusters, each providing ``"id"`` and ``"text"``.
        nlp: spaCy pipeline; supplies the vocab for the matcher and parses
            the cluster texts.

    Returns:
        A list of ``{"id": <cluster id>, "text": <third-party name>}`` dicts,
        one per (cluster, name) pair with at least one match, ordered by
        cluster and then by the iteration order of the third-party list.
    """
    # The token patterns depend only on the third-party names, so they are
    # built once here instead of once per cluster.
    candidates = []
    for key in listofthirdparties.data:
        tokens = key.split()
        pattern = [{"LOWER": token.lower()} for token in tokens]
        candidates.append((key, tokens, pattern))

    matcher = Matcher(nlp.vocab)
    result = []

    for cluster in liste:
        doc = nlp(cluster["text"])

        for key, tokens, pattern in candidates:
            # Only the current name is registered while matching, so any hit
            # is known to belong to ``key``.
            matcher.add("id", [pattern])
            matches = matcher(doc)
            matcher.remove("id")

            if matches:
                # NOTE(review): prints the last token of the matched name;
                # looks like leftover debug output, kept because something
                # may read stdout -- confirm and remove if unused.
                print(json.dumps(tokens[-1]))
                result.append({"id": cluster["id"], "text": key})

    return result
|
||
|
|
||
|
|
||
|
##
|
||
|
#
|
||
|
# This function returns a list of organisations collected from all clusters.
|
||
|
#
|
||
|
##
|
||
|
def collect_third_parties(liste, nlp):
    """Collect the distinct organisations found across all clusters.

    Calls ``collect_third`` for every cluster and keeps the first hit for
    each distinct organisation text.

    Args:
        liste: Iterable of clusters, each providing ``"id"`` and ``"text"``.
        nlp: Callable passed through to ``collect_third`` to parse the text.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts with unique ``"text"``
        values, in order of first appearance.
    """
    result = []
    seen_texts = set()

    for cluster in liste:
        for hit in collect_third(cluster, nlp):
            # Duplicates are decided per hit: one repeated organisation must
            # not suppress other, new organisations from the same cluster.
            if hit["text"] in seen_texts:
                continue
            seen_texts.add(hit["text"])
            result.append(hit)

    return result
|
||
|
|
||
|
|
||
|
# this function is the entrypoint to the summarycreator
|
||
|
def run(liste, nlp):
    """Entry point of the summary creator: list the third parties of a text.

    Combines the organisations matched from the third-party list (over all
    clusters) with the organisations spaCy detects in the clusters that
    ``summaryhelper`` returns for the average threshold. Entries whose text
    is in ``falsepositive.data`` are discarded, and only the first entry per
    organisation text is kept.

    Args:
        liste: List of clusters, each providing ``"id"`` and ``"text"``.
        nlp: spaCy pipeline used for parsing and matching.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts with unique ``"text"``
        values; list-matched organisations come before detected ones.
    """
    threshold = summaryhelper.get_average_threshold(liste)
    clusters_above = summaryhelper.get_clusters_above_average_threshold(liste, threshold)

    # Organisations known from the third-party list, searched in every cluster.
    listed_orgs = get_organisation_from_lists(liste, nlp)
    # Organisations detected by spaCy, restricted to the selected clusters.
    detected_orgs = collect_third_parties(clusters_above, nlp)

    unique_orgs = []
    for candidate in listed_orgs + detected_orgs:
        name = candidate["text"]
        if name in falsepositive.data:
            continue
        if any(name == kept["text"] for kept in unique_orgs):
            continue
        unique_orgs.append(candidate)

    return unique_orgs
|