You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
109 lines
2.9 KiB
109 lines
2.9 KiB
import spacy |
|
import json |
|
from spacy.matcher import Matcher |
|
import listofthirdparties |
|
import summaryhelper |
|
import falsepositive |
|
|
|
|
|
## |
|
# |
|
# This function returns a list of organisations found by spaCy inside a cluster.
|
# |
|
## |
|
def collect_third(cluster, nlp):
    """Return the organisations spaCy recognises in a single cluster.

    Args:
        cluster: Mapping providing at least the keys ``"id"`` and ``"text"``.
        nlp: Callable pipeline; ``nlp(text)`` must return an object exposing
            ``ents``, whose items carry ``label_`` and ``text``.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts, one per entity labelled
        ``"ORG"``, in the order the entities appear in ``doc.ents``. The
        ``"id"`` is always the id of ``cluster``.
    """
    doc = nlp(cluster["text"])
    return [
        {"id": cluster["id"], "text": entity.text}
        for entity in doc.ents
        if entity.label_ == "ORG"
    ]
|
|
|
## |
|
# |
|
# This function returns a list of companies matched against the list of third parties.
|
# |
|
## |
|
def get_organisation_from_lists(liste, nlp):
    """Return the known third parties whose name occurs in each cluster.

    Every entry of ``listofthirdparties.data`` is turned into a
    case-insensitive token pattern (one ``LOWER`` token per
    whitespace-separated word) and matched against the text of every cluster.

    Args:
        liste: Iterable of cluster mappings with the keys ``"id"`` and
            ``"text"``.
        nlp: spaCy pipeline; it tokenises the cluster text and supplies the
            vocabulary the matcher is built on.

    Returns:
        A list of ``{"id": <cluster id>, "text": <matched entry>}`` dicts, one
        per (cluster, entry) pair with at least one match, ordered by cluster
        and then by the iteration order of ``listofthirdparties.data``.
    """
    matcher = Matcher(nlp.vocab)

    # The patterns depend only on the third-party names, so build them once
    # instead of once per cluster. Entries without any token are skipped:
    # spaCy refuses to register a zero-token pattern.
    patterns = []
    for key in listofthirdparties.data:
        pattern = [{"LOWER": word.lower()} for word in key.split()]
        if pattern:
            patterns.append((key, pattern))

    result = []
    for cluster in liste:
        doc = nlp(cluster["text"])

        for key, pattern in patterns:
            # Register one name at a time so a match can be attributed to it.
            matcher.add("id", [pattern])
            if matcher(doc):
                result.append({"id": cluster["id"], "text": key})
            matcher.remove("id")

    return result
|
|
|
|
|
## |
|
# |
|
# This function returns a list of organisations collected from the clusters.
|
# |
|
## |
|
def collect_third_parties(liste, nlp):
    """Collect the distinct organisations found across all clusters.

    Each cluster is passed to ``collect_third``; a hit is kept only when no
    earlier hit carried the same ``"text"``, so the first occurrence (and its
    cluster id) wins.

    Args:
        liste: Iterable of cluster mappings with the keys ``"id"`` and
            ``"text"``.
        nlp: Pipeline forwarded unchanged to ``collect_third``.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts with unique ``"text"``
        values, in order of first appearance. Empty when nothing was found.
    """
    result = []
    seen_texts = set()

    for cluster in liste:
        for hit in collect_third(cluster, nlp):
            # Decide per hit: one duplicate must not suppress the remaining
            # organisations of the same cluster.
            if hit["text"] in seen_texts:
                continue
            seen_texts.add(hit["text"])
            result.append(hit)

    return result
|
|
|
|
|
# this function is the entrypoint to the summarycreator |
|
def run(liste, nlp):
    """Entry point of the summary creator: list the third parties in ``liste``.

    Combines two sources of organisation names, drops known false positives
    and removes duplicates:

    1. entries of the third-party list matched against *every* cluster
       (``get_organisation_from_lists``), followed by
    2. ``ORG`` entities found in the clusters that
       ``summaryhelper.get_clusters_above_average_threshold`` selects
       (``collect_third_parties``).

    Args:
        liste: Collection of cluster mappings with the keys ``"id"`` and
            ``"text"``. It is consumed more than once, so it must be
            re-iterable.
        nlp: spaCy pipeline forwarded to the helper functions.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts with unique ``"text"``
        values, none of which is contained in ``falsepositive.data``. For
        duplicates the first occurrence wins, so list matches take precedence
        over entity hits.
    """
    # Restrict the entity search to the clusters above the average threshold.
    average_threshold = summaryhelper.get_average_threshold(liste)
    liste_org = summaryhelper.get_clusters_above_average_threshold(
        liste, average_threshold
    )

    # The known third-party list is matched against all clusters.
    liste_analytics = get_organisation_from_lists(liste, nlp)
    # Organisations recognised in the clusters above the threshold.
    result = collect_third_parties(liste_org, nlp)

    result_without_duplicate = []
    seen_texts = set()
    for org in liste_analytics + result:
        text = org["text"]
        if text in falsepositive.data or text in seen_texts:
            continue
        seen_texts.add(text)
        result_without_duplicate.append(org)

    return result_without_duplicate
|
|
|