"""Collect third-party organisations mentioned in a list of text clusters."""

import json
import logging

import spacy
from spacy.matcher import Matcher

import listofthirdparties
import summaryhelper
import falsepositive

logger = logging.getLogger(__name__)


def collect_third(cluster, nlp):
    """Return the organisations spaCy recognises inside one cluster.

    Args:
        cluster: Mapping that provides at least the keys ``"id"`` and
            ``"text"``.
        nlp: spaCy pipeline; ``nlp(text)`` must return a document exposing
            ``ents``.

    Returns:
        A list with one ``{"id": ..., "text": ...}`` dict per entity whose
        label is ``"ORG"``, in the order of ``doc.ents``.
    """
    doc = nlp(cluster["text"])
    return [
        {"id": cluster["id"], "text": entity.text}
        for entity in doc.ents
        if entity.label_ == "ORG"
    ]


def get_organisation_from_lists(liste, nlp):
    """Return the companies from ``listofthirdparties.data`` found in clusters.

    Every key of ``listofthirdparties.data`` is split on whitespace and
    turned into a case-insensitive (``LOWER``) token pattern. A key counts as
    found in a cluster when its pattern matches anywhere in the cluster text.

    Args:
        liste: Iterable of clusters (mappings with ``"id"`` and ``"text"``).
        nlp: spaCy pipeline used to tokenise the cluster texts.

    Returns:
        A list of ``{"id": cluster_id, "text": key}`` dicts, grouped by
        cluster and, within a cluster, ordered like
        ``listofthirdparties.data``.
    """
    # Register every pattern once instead of rebuilding, adding and removing
    # it again for each (cluster, key) combination.
    matcher = Matcher(nlp.vocab)
    keys = []
    for key in listofthirdparties.data:
        pattern = [{"LOWER": word.lower()} for word in key.split()]
        if not pattern:
            # A blank key yields a zero-token pattern, which Matcher.add
            # rejects; such a key cannot match anything, so skip it.
            continue
        matcher.add(key, [pattern])
        keys.append(key)

    result = []
    if not keys:
        return result

    for cluster in liste:
        doc = nlp(cluster["text"])
        # Match ids are string hashes; resolve them back to the key names.
        matched = {
            nlp.vocab.strings[match_id] for match_id, _start, _end in matcher(doc)
        }
        # Walk the keys (not the matches) to keep the list order of the keys.
        for key in keys:
            if key in matched:
                logger.debug("third party matched: %s", key)
                result.append({"id": cluster["id"], "text": key})
    return result


def collect_third_parties(liste, nlp):
    """Return the distinct organisations spaCy finds across all clusters.

    Args:
        liste: Iterable of clusters (mappings with ``"id"`` and ``"text"``).
        nlp: spaCy pipeline passed on to ``collect_third``.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts in which every
        organisation text appears only once; the first occurrence is kept.
    """
    seen_texts = set()
    result = []
    for cluster in liste:
        for hit in collect_third(cluster, nlp):
            # The duplicate check is made per hit, so one repeated name does
            # not hide the remaining new organisations of the same cluster.
            if hit["text"] in seen_texts:
                continue
            seen_texts.add(hit["text"])
            result.append(hit)
    return result


def run(liste, nlp):
    """Entry point used by the summary creator.

    Combines the organisations matched via ``listofthirdparties`` (searched
    in all clusters) with the organisations spaCy recognises in the clusters
    returned by ``summaryhelper.get_clusters_above_average_threshold``, drops
    every text listed in ``falsepositive.data`` and removes duplicates.

    Args:
        liste: List of clusters (mappings with ``"id"`` and ``"text"``).
        nlp: spaCy pipeline.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts with unique ``"text"``
        values; list-based matches come first and the first occurrence of a
        text wins.
    """
    # Average threshold over all clusters.
    average_threshold = summaryhelper.get_average_threshold(liste)
    # Clusters that lie above the average threshold.
    liste_org = summaryhelper.get_clusters_above_average_threshold(
        liste, average_threshold
    )
    # Organisations that match an element of the third-parties list.
    liste_analytics = get_organisation_from_lists(liste, nlp)
    # Organisations collected by spaCy from the clusters above the threshold.
    result = collect_third_parties(liste_org, nlp)

    seen_texts = set()
    result_without_duplicate = []
    for org in liste_analytics + result:
        text = org["text"]
        if text in falsepositive.data or text in seen_texts:
            continue
        seen_texts.add(text)
        result_without_duplicate.append(org)
    return result_without_duplicate