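# Trains a small 1D-CNN text classifier that decides whether a line of text is
# an appointment ("Termin"), then applies the trained model line by line to
# the contents of output.txt.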
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
|
|
print("--- Starting training ---")

# Example text containing the appointments we want to filter out
with open('traingsdaten0.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
text = ''.join(lines)

print(text)
|
|
# Every line of the training file is treated as one appointment entry
# (no regular-expression filtering is applied at this point)
meetings = lines
|
|
# Build the training data set: each appointment line is a positive example,
# the remaining text with that line removed is a negative example
data = []
labels = []
for meeting in meetings:
    data.append(meeting)
    labels.append(1)
    other_text = text.replace(meeting, "")
    data.append(other_text)
    labels.append(0)
|
|
# Tokenize the texts and build padded integer sequences
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 200000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
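# labels is now one-hot encoded: column 0 = no appointment, column 1 = appointment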
|
|
|
VALIDATION_SPLIT = 0.2
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
|
|
|
# Model: word embedding followed by three Conv1D/MaxPooling1D blocks,
# global max pooling, a dense layer and a 2-way softmax output
EMBEDDING_DIM = 100
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM)(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
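# categorical_crossentropy expects one-hot targets, matching the to_categorical() labels above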
|
|
|
model.fit(x_train, y_train,
          batch_size=128,
          epochs=1000,
          validation_data=(x_val, y_val))
|
|
print("--- Starting evaluation ---")

# Read the lines that should be classified (the Tokenizer lowercases them itself)
with open('output.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

print(text)
|
|
predictions = []

# Classify every line with the same tokenizer and sequence length used for training
for line in text:
    line_vectors = tokenizer.texts_to_sequences([line])
    line_vectors = pad_sequences(line_vectors, maxlen=MAX_SEQUENCE_LENGTH)
    prediction = model.predict(line_vectors)
    predictions.append(prediction)
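# Each prediction has shape (1, 2): [P(no appointment), P(appointment)]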
|
|
|
|
|
print("--- Predictions: " + str(len(predictions)))
for result in predictions:
    print(result.tolist())
|
|
print("--- The following appointments were found: ")
for i in range(len(text)):
    # index 1 of the softmax output is the probability of the appointment class
    if predictions[i][0][1] >= 0.85:
        print(text[i])