You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
2.1 KiB
80 lines
2.1 KiB
4 years ago
|
# Summary creator
|
||
|
|
||
|
import json
|
||
|
import spacy
|
||
|
import summaryhelper
|
||
|
import tfidf
|
||
|
import key_words_technical
|
||
|
import killwords
|
||
|
import collectpersonaldata
|
||
|
import collectthirdparty
|
||
|
|
||
|
# This function creates a complete paragraph out of the pieces.
# This approach is better because more sentences give better context for NLP.
def create_paragraph(cluster):
    """Concatenate the text of all paragraph and list-item entries.

    Parameters
    ----------
    cluster : iterable of dicts, each with at least "tag" and "text" keys.

    Returns
    -------
    str
        The concatenated text of every entry tagged "P" or "LI";
        an empty string when no entry matches.
    """
    # "".join avoids the quadratic cost of repeated += string concatenation.
    return "".join(
        entry["text"] for entry in cluster if entry["tag"] in ("P", "LI")
    )
|
||
|
|
||
|
|
||
|
|
||
|
# This function is the entry point into this module.
def run(liste, nlp):
    """Score each cluster with tf*idf and collect privacy-relevant data.

    Parameters
    ----------
    liste : list
        List of clusters; each cluster is a list of dicts with at least
        "tag", "text" and "id" keys (see create_paragraph).
    nlp : spacy.language.Language
        A loaded spaCy pipeline, passed through to the helper modules.

    Returns
    -------
    list
        Two entries: the personal-data results from
        ``collectpersonaldata.run`` and the third-party results from
        ``collectthirdparty.run`` — whatever those helpers return.
    """
    # indexdict contains one entry per cluster together with its tf*idf
    # measure.  The measure is used because spaCy does not always classify
    # correctly, so paragraphs that have no value for us can be dropped
    # based on this score.
    indexdict = []
    for cluster in liste:
        paragraph = create_paragraph(cluster)
        indexdict.append({
            "average": tfidf.run(paragraph, nlp),
            "text": paragraph,
            "id": cluster[0]["id"],
            "cluster": cluster,
        })

    # Keep a genuine copy of the original tf*idf list so the original is
    # preserved even if a downstream step mutates the list it receives.
    # (The previous code only aliased the list, contradicting its comment.)
    indexdict_save = list(indexdict)

    # Each step's result is collected, in order, into the returned list.
    collected_data = collectpersonaldata.run(indexdict_save, nlp)
    collected_third = collectthirdparty.run(indexdict_save, nlp)

    return [collected_data, collected_third]
|
||
|
|
||
|
#with open('test.txt') as json_file:
|
||
|
# data = json.load(json_file)
|
||
|
|
||
|
#nlp = spacy.load("en_core_web_sm")
|
||
|
#run(data, nlp)
|