import re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

print("--- Starting training ---")

# Sample text with meetings that we want to filter
with open('traingsdaten0.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
text = ''.join(lines)
print(text)

# Every line of the sample text is treated as a meeting here; a
# regex-based filter could be plugged in instead (see the sketch at
# the end of the file).
meetings = lines

# Build the training set: each meeting line is a positive sample, the
# surrounding text with that line removed is a negative sample.
data = []
labels = []
for meeting in meetings:
    data.append(meeting)
    labels.append(1)
    other_text = text.replace(meeting, "")
    data.append(other_text)
    labels.append(0)

# Tokenize the text and turn it into padded integer sequences
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 200000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

# Shuffle and split into training and validation sets
VALIDATION_SPLIT = 0.2
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

# 1D convolutional text classifier with a trainable embedding layer
EMBEDDING_DIM = 100
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM)(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=1000,
          validation_data=(x_val, y_val))

print("--- Starting evaluation ---")

# Read the lines to classify
with open('output.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()
print(text)

# Classify each line individually
predictions = []
for line in text:
    line_vectors = tokenizer.texts_to_sequences([line])
    line_vectors = pad_sequences(line_vectors, maxlen=MAX_SEQUENCE_LENGTH)
    prediction = model.predict(line_vectors)
    predictions.append(prediction)

print("--- Predictions: " + str(len(predictions)))
for result in predictions:
    print(result.tolist())

print("--- The following meetings were found: ")
for i in range(len(text)):
    # Column 1 of the softmax output is the probability of class 1 ("meeting")
    if predictions[i][0][1] >= 0.85:
        print(text[i])
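
# --- Optional regex pre-filter (illustrative sketch) ---
# The training section above treats every input line as a meeting. If only
# date-like lines should become positive samples, a regular-expression
# filter could replace `meetings = lines`. The pattern and the helper name
# below are assumptions for illustration, not part of the original script;
# they assume meetings carry a date in DD.MM.YYYY form.
DATE_PATTERN = re.compile(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b")

def looks_like_meeting(line):
    """Return True if the line contains a DD.MM.YYYY-style date (assumed format)."""
    return DATE_PATTERN.search(line) is not None

# Possible usage in place of `meetings = lines`:
# meetings = [line for line in lines if looks_like_meeting(line)]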