An AI that can find appointments in PDF documents
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
print("--- Starte Training ---")
# Beispieltext mit Terminen, die wir filtern wollen
lines = open('traingsdaten0.txt', 'r', encoding='utf-8').readlines()
text = ''.join(lines)
print(text)
# Filtern der Termine mit dem Regulären Ausdruck
meetings = lines
# Build the training set: each appointment line is a positive example,
# the full text with that line removed serves as a negative example
data = []
labels = []
for meeting in meetings:
    data.append(meeting)
    labels.append(1)
    other_text = text.replace(meeting, "")
    data.append(other_text)
    labels.append(0)
# Tokenize the text and turn it into padded integer sequences
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 200000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
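# Note: to_categorical maps label 0 to [1, 0] and label 1 to [0, 1],
# so column 1 of a prediction is the probability of the "appointment" class.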
VALIDATION_SPLIT = 0.2
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
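# Caution: with only a handful of training lines, int(VALIDATION_SPLIT * n) can be 0;
# data[:-0] would then leave x_train empty, so the training file needs enough lines for a meaningful split.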
EMBEDDING_DIM = 100
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM)(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)
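# Optional sanity check: print the layer structure and parameter counts
model.summary()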
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.fit(x_train, y_train,
          batch_size=128,
          epochs=1000,
          validation_data=(x_val, y_val))
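# Optional (assumption): persist the trained model so the evaluation can be rerun
# without retraining; the file name is only an example.
model.save('termin_model.h5')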
print("--- Starte Auswertung ---")
# Textdaten in lowercase konvertieren
text = open('output.txt', 'r', encoding='utf-8').readlines()
print(text)
predictions = []
for line in text:
    line_vectors = tokenizer.texts_to_sequences([line])
    line_vectors = pad_sequences(line_vectors, maxlen=MAX_SEQUENCE_LENGTH)
    prediction = model.predict(line_vectors)
    predictions.append(prediction)
print("--- Predictions: "+str(len(predictions)))
for result in predictions:
    print(result.tolist())
print("--- Folgende Termine wurden gefunden: ")
for i in range(len(text)):
if(predictions[i][0][0] >= 0.85):
print(text[i])