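# Trains a small 1D-CNN text classifier that decides whether a line of text is
# an appointment ("Termin"), then applies the trained model line by line to
# the contents of output.txt.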
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
|
|
print("--- Starting training ---")

# Example text containing the appointments we want to filter out
with open('traingsdaten0.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
text = ''.join(lines)

print(text)
|
|
# Every line of the training file is treated as one appointment entry
# (no regular-expression filtering is applied at this point)
meetings = lines
|
|
# Build the training data set: each appointment line is a positive example,
# the remaining text with that line removed is a negative example
data = []
labels = []
for meeting in meetings:
    data.append(meeting)
    labels.append(1)
    other_text = text.replace(meeting, "")
    data.append(other_text)
    labels.append(0)
|
|
# Tokenize the texts and build padded integer sequences
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 200000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
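# labels is now one-hot encoded: column 0 = no appointment, column 1 = appointment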
|
|
|
VALIDATION_SPLIT = 0.2
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
|
|
|
# Model: word embedding followed by three Conv1D/MaxPooling1D blocks,
# global max pooling, a dense layer and a 2-way softmax output
EMBEDDING_DIM = 100
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM)(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
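# categorical_crossentropy expects one-hot targets, matching the to_categorical() labels above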
|
|
|
model.fit(x_train, y_train,
          batch_size=128,
          epochs=1000,
          validation_data=(x_val, y_val))
|
|
print("--- Starting evaluation ---")

# Read the lines that should be classified (the Tokenizer lowercases them itself)
with open('output.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

print(text)
|
|
predictions = []

# Classify every line with the same tokenizer and sequence length used for training
for line in text:
    line_vectors = tokenizer.texts_to_sequences([line])
    line_vectors = pad_sequences(line_vectors, maxlen=MAX_SEQUENCE_LENGTH)
    prediction = model.predict(line_vectors)
    predictions.append(prediction)
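# Each prediction has shape (1, 2): [P(no appointment), P(appointment)]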
|
|
|
|
|
print("--- Predictions: " + str(len(predictions)))
for result in predictions:
    print(result.tolist())
|
|
print("--- The following appointments were found: ")
for i in range(len(text)):
    # index 1 of the softmax output is the probability of the appointment class
    if predictions[i][0][1] >= 0.85:
        print(text[i])