# Summary creator

import json
import spacy
import summaryhelper
import tfidf
import key_words_technical
import killwords
import collectpersonaldata
import collectthirdparty


# This function creates a complete paragraph out of the pieces.
# This approach works better because more sentences give the NLP model more context.
def create_paragraph(cluster):
    result = ""
    for entry in cluster:
        if entry["tag"] == "P" or entry["tag"] == "LI":
            result += entry["text"]
    return result
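
# A minimal usage sketch, inferred from the keys this function reads; the
# concrete values below are illustrative assumptions, not real pipeline data:
#
#   cluster = [
#       {"tag": "H1", "text": "Privacy Policy ", "id": 7},
#       {"tag": "P", "text": "We collect your email address. ", "id": 7},
#       {"tag": "LI", "text": "We share data with partners. ", "id": 7},
#   ]
#   create_paragraph(cluster)
#   # -> "We collect your email address. We share data with partners. "
# Note that the "H1" entry is skipped: only "P" and "LI" blocks are joined.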


# This function is the entry point into this module.
def run(liste, nlp):
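    # Hedged summary of the interface, inferred from how run() is used below
    # and at the bottom of this file (not an authoritative spec):
    #   liste -- a list of clusters, each cluster a list of tagged text blocks
    #   nlp   -- a loaded spaCy language pipeline
    # Returns a list holding the collected personal-data and third-party results.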

    # Debug scaffolding kept from development:
    # with open('test.txt') as json_file:
    #     data = json.load(json_file)
    # liste = data
    # print(liste)

    # contains the result of every step
    result_list = []

    # indexdict contains one tf*idf measure per paragraph.
    # This measure is used because spaCy does not always classify correctly,
    # so it lets us drop the paragraphs that have no value for us.
    indexdict = []
    for cluster in liste:
        paragraph = create_paragraph(cluster)
        indexdict.append({"average": tfidf.run(paragraph, nlp), "text": paragraph, "id": cluster[0]["id"], "cluster": cluster})
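
    # A hedged sketch of the dropping step the comment above refers to; the
    # threshold is a made-up assumption, and the actual filtering is done by
    # the downstream collect* modules rather than here:
    #
    #   MIN_AVERAGE = 0.1  # hypothetical cutoff, not taken from this project
    #   indexdict = [e for e in indexdict if e["average"] >= MIN_AVERAGE]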

    # Print function for debugging:
    # for tf in indexdict:
    #     print(tf["average"])

    print(len(indexdict))

    # indexdict_save is used to keep the original tf*idf list around.
    # A plain assignment would only alias the list, so take a shallow copy
    # (the dict entries themselves are still shared).
    indexdict_save = list(indexdict)

    # this step creates the list with the data that will be saved
    # list_save_saved = create_saved_list(indexdict_save, nlp)

    # this step updates the tf*idf list in combination with third parties
    # list_third_party = create_third(indexdict_save, nlp)
    collected_data = collectpersonaldata.run(indexdict_save, nlp)
    collected_third = collectthirdparty.run(indexdict_save, nlp)
    # for col in collected_data:
    #     print(col["text"])
    result_list.append(collected_data)
    result_list.append(collected_third)
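
    # result_list is now [collected_data, collected_third]; the exact shape of
    # each element is defined by the collectpersonaldata and collectthirdparty
    # modules, which are not part of this file.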

    return result_list


# Stand-alone entry point for manual testing: load the clustered input from a
# JSON file and run the whole pipeline (requires the spaCy model, e.g. via
# `python -m spacy download en_core_web_sm`).
if __name__ == "__main__":
    with open('test.txt') as json_file:
        data = json.load(json_file)
    nlp = spacy.load("en_core_web_sm")
    print(run(data, nlp))