first commit

Urs Kehrli 4 years ago
commit 7e0bb43880
4 changed files with 273 additions and 0 deletions
  1. .gitignore (+10 -0)
  2. README.md (+16 -0)
  3. embedding_seq.py (+231 -0)
  4. main.py (+16 -0)

+ 10 - 0
.gitignore

@@ -0,0 +1,10 @@
+data
+models
+venv
+.DS_Store
+.idea
+*.pyc
+.idea/misc.xml
+.idea/predicty-3.0.0.iml
+__pycache__
+__MACOSX/

+ 16 - 0
README.md

@@ -0,0 +1,16 @@
+Author: Urs Kehrli
+Date: Friday, 7 August 2020
+Source:
+Python for NLP: Word Embeddings for Deep Learning in Keras   
+https://stackabuse.com/python-for-nlp-word-embeddings-for-deep-learning-in-keras/
+
+Learn from:
+https://developers.google.com/machine-learning/crash-course/embeddings/motivation-from-collaborative-filtering
+https://cloud.google.com/solutions/machine-learning/overview-extracting-and-serving-feature-embeddings-for-machine-learning
+https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d
+
+Implementation:
+Using the pretrained word embeddings from glove.6B.zip often raises an exception.
+The cause has not been found; it may be a TensorFlow version conflict.
+Variants implemented: fit with the pretrained GloVe embedding layer (Sequential model),
+and fit with the pretrained GloVe embedding layer via the Keras functional API.
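+
+A minimal sketch for debugging, assuming a TensorFlow-backed Keras install
+(hypothetical, not part of the scripts): print the library versions in use before
+loading the GloVe file, to check the suspected version conflict.
+
+    # hypothetical version check, not from the tutorial source
+    import tensorflow as tf
+    import keras
+    print("TensorFlow:", tf.__version__)
+    print("Keras:", keras.__version__)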

+ 231 - 0
embedding_seq.py

@@ -0,0 +1,231 @@
+from typing import Any, IO
+
+import keras
+from matplotlib import colors  # type: ignore
+from numpy import array, asarray, zeros, ndarray, random  # type: ignore
+from keras.preprocessing.text import one_hot, Tokenizer  # type: ignore
+from keras.preprocessing.sequence import pad_sequences  # type: ignore
+from keras.models import Sequential, Model  # type: ignore
+from keras.layers import Dense, Input  # type: ignore
+from keras.layers import Flatten  # type: ignore
+from keras.layers.embeddings import Embedding  # type: ignore
+from nltk.tokenize import word_tokenize  # type: ignore
+import matplotlib.pyplot as plt  # type: ignore
+import matplotlib as mpl  # type: ignore
+
+corpus: list = [
+    # Positive Reviews
+
+    'This is an excellent movie',
+    'The move was fantastic I like it',
+    'You should watch it is brilliant',
+    'Exceptionally good',
+    'Wonderfully directed and executed I like it',
+    'Its a fantastic series',
+    'Never watched such a brillent movie',
+    'It is a Wonderful movie',
+
+    # Negative Reviews
+
+    "horrible acting",
+    'waste of money',
+    'pathetic picture',
+    'It was very boring',
+    'I did not like the movie',
+    'The movie was horrible',
+    'I will not recommend',
+    'The acting is pathetic'
+]
+
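+# Metrics tracked during training; plot_metrics() below plots the loss together with auc, precision and recall.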
+_METRICS: list = [
+    keras.metrics.TruePositives(name='tp'),
+    keras.metrics.FalsePositives(name='fp'),
+    keras.metrics.TrueNegatives(name='tn'),
+    keras.metrics.FalseNegatives(name='fn'),
+    keras.metrics.BinaryAccuracy(name='accuracy'),
+    keras.metrics.Precision(name='precision'),
+    keras.metrics.Recall(name='recall'),
+    keras.metrics.AUC(name='auc'),
+]
+
+
+def set_seed(seed):
+    random.seed(seed)
+    # tf.random.set_seed(seed)
+
+
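+# Fit a Keras Tokenizer on the corpus and build its word index.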
+def do_tokenize_text_keras(corpus_: list) -> Tokenizer:
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(corpus_)
+    return tokenizer
+
+
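+# Flatten the corpus into a single list of NLTK word tokens (duplicates included).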
+def do_tokenized_txt_nltk(corpus_: list) -> list:
+    words: list = []
+    for sent in corpus_:
+        tokenize_word = word_tokenize(sent)
+        for word in tokenize_word:
+            words.append(word)
+    return words
+
+
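+# Encode each sentence as a list of integer indices using the Keras one_hot hashing trick.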
+def do_sentence_hotencoding(corpus_: list, vocab_length_: int) -> list:
+    embedded_sentences = [one_hot(sent, vocab_length_) for sent in corpus_]
+    return embedded_sentences
+
+
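+# Return the number of NLTK tokens in the longest sentence of the corpus.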
+def get_max_sentence_lenght(corpus_: list) -> int:
+    word_count: Any = lambda sentence: len(word_tokenize(sentence))
+    longest_sentence = max(corpus_, key=word_count)
+    return len(word_tokenize(longest_sentence))
+
+
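+# Post-pad the encoded sentences with zeros up to the maximum sentence length.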
+def do_padding_sentence(embedded_sentences_: list, len_max_: int) -> list:
+    return pad_sequences(embedded_sentences_, len_max_, padding='post')
+
+
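+# Read the 100-dimensional GloVe vectors (glove.6B.100d.txt) into a word -> vector dictionary.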
+def load_embedding_model() -> dict:
+    embeddings_dictionary = dict()
+    glove_file: IO = open('data/glove.6B/glove.6B.100d.txt', encoding="utf8")
+    for line in glove_file:
+        records = line.split()
+        word = records[0]
+        vector_dimensions = asarray(records[1:], dtype='float32')
+        embeddings_dictionary[word] = vector_dimensions
+
+    glove_file.close()
+    return embeddings_dictionary
+
+
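+# Build the embedding matrix: one row per tokenizer word index, filled with its GloVe vector (zeros if the word is not in GloVe).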
+def do_embedding_glove(vocab_length_: int, word_tokenizer_: Tokenizer, embeddings_dictionary: dict) -> ndarray:
+    embedding_matrix: ndarray = zeros((vocab_length_, 100))
+    for word, index in word_tokenizer_.word_index.items():
+        embedding_vector = embeddings_dictionary.get(word)
+        if embedding_vector is not None:
+            embedding_matrix[index] = embedding_vector
+    return embedding_matrix
+
+
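+# Baseline Sequential model with a trainable, randomly initialised embedding layer.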
+def get_model_sq(vocab_size_: int, word_vector_size_: int, max_sentence_length_: int) -> keras.models.Sequential:
+    model_sq: Sequential = Sequential()
+    model_sq.add(Embedding(vocab_size_, word_vector_size_, input_length=max_sentence_length_))
+    model_sq.add(Flatten())
+    model_sq.add(Dense(1, activation='sigmoid'))
+    return model_sq
+
+
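+# Sequential model whose embedding layer is initialised with the given pretrained matrix and frozen (trainable=False).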
+def get_model_sq_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
+                                     max_sentence_length_: int) -> keras.models.Sequential:
+    model_sq: Sequential = Sequential()
+    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
+                                input_length=max_sentence_length_,
+                                trainable=False)
+    model_sq.add(embedding_layer)
+    model_sq.add(Flatten())
+    model_sq.add(Dense(1, activation='sigmoid'))
+    return model_sq
+
+
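+# The same architecture built with the Keras functional API, again with a frozen pretrained embedding layer.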
+def get_model_fc_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
+                                     max_sentence_length_: int) -> keras.models.Model:
+    deep_inputs = Input(shape=(max_sentence_length_,))
+    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
+                                input_length=max_sentence_length_,
+                                trainable=False)(deep_inputs)
+    flatten = Flatten()(embedding_layer)
+    hidden = Dense(1, activation='sigmoid')(flatten)
+    model = Model(inputs=deep_inputs, outputs=hidden)
+    return model
+
+
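+# Plot training loss, AUC, precision and recall over the epochs.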
+def plot_metrics(history_: keras.callbacks.History, plt_=plt, mpl_=mpl) -> None:
+    mpl_.rcParams['figure.figsize'] = (12, 10)
+    metrics: list = ['loss', 'auc', 'precision', 'recall']
+    colors_plt: list = plt_.rcParams['axes.prop_cycle'].by_key()['color']
+    for n, metric in enumerate(metrics):
+        name = metric.replace("_", " ").capitalize()
+        plt_.subplot(2, 2, n + 1)
+        plt_.plot(history_.epoch, history_.history[metric], color=colors_plt[0], label='Train')
+        # plt.plot(history.epoch, history.history['val_' + metric],
+        #          color=colors[0], linestyle="--", label='Val')
+        plt_.xlabel('Epoch')
+        plt_.ylabel(name)
+        if metric == 'loss':
+            plt_.ylim([0, plt_.ylim()[1]])
+        elif metric == 'auc':
+            plt_.ylim([0.8, 1])
+        else:
+            plt_.ylim([0, 1])
+        plt_.legend()
+    plt_.show()
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    version: int = 0
+    set_seed(16)
+
+    print('Tokenized words in corpus (Keras tokenizer) -------')
+    keras_word_tokenizer: Tokenizer = do_tokenize_text_keras(corpus)
+    keras_unique_words = keras_word_tokenizer.word_index
+    unique_keras_words_length: int = len(keras_word_tokenizer.word_index) + 1
+    print(keras_word_tokenizer.word_index)
+    print(unique_keras_words_length)
+    print('Words in corpus (nltk tokenizer) -------')
+    unique_words: set = set(do_tokenized_txt_nltk(corpus))
+    unique_words_length: int = len(unique_words)
+    print(unique_words)
+    print(unique_words_length)
+    print('Sentence_hotencoding -------')
+    sentence_hotencoding: list = do_sentence_hotencoding(corpus, unique_words_length)
+    print(sentence_hotencoding)
+    print('Max sentence length -------')
+    max_sentence_lenght: int = get_max_sentence_lenght(corpus)
+    print(max_sentence_lenght)
+    print('Sentence_hotencoding_and_padding -------')
+    padding_sentence: list = do_padding_sentence(sentence_hotencoding, max_sentence_lenght)
+    print(padding_sentence)
+
+    # dataset
+    data_set: list = do_padding_sentence(do_sentence_hotencoding(corpus, len(unique_words)),
+                                         get_max_sentence_lenght(corpus))
+    label_set = array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])
+
+    if version == 1:  # fit the Keras functional API model with the pretrained GloVe embedding layer
+        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
+        model_fc: keras.models.Model = get_model_fc_with_embedded_layer(unique_keras_words_length, 100, weights,
+                                                                        max_sentence_lenght)
+        model_fc.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
+        print(model_fc.summary())
+        history_dict = model_fc.fit(data_set, label_set, epochs=25, verbose=1)
+        for layer in model_fc.layers: print(layer.get_config(), layer.get_weights())
+    elif version == 2:  # fit the Sequential model with the pretrained GloVe embedding layer
+        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
+        model = get_model_sq_with_embedded_layer(unique_keras_words_length, 100, weights, max_sentence_lenght)
+        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
+        print(model.summary())
+        history_dict = model.fit(data_set, label_set, epochs=25, verbose=1)
+        for layer in model.layers: print(layer.get_config(), layer.get_weights())
+    else:     # fit a self-trained embedding layer
+        # model
+        # embedding parameters: vocab_size * word_vector_size
+        # Flatten output shape: max_sentence_length * word_vector_size
+        model: keras.models.Sequential = get_model_sq(unique_words_length, 128, max_sentence_lenght)
+        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
+        print(model.summary())
+        history_dict = model.fit(data_set, label_set, epochs=200, verbose=1)
+        for layer in model.layers: print(layer.get_config(), layer.get_weights())
+
+    plot_metrics(history_dict)

+ 16 - 0
main.py

@@ -0,0 +1,16 @@
+# This is a sample Python script.
+
+# Press ⌃R to execute it or replace it with your code.
+# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
+
+
+def print_hi(name):
+    # Use a breakpoint in the code line below to debug your script.
+    print(f'Hi, {name}')  # Press ⌘F8 to toggle the breakpoint.
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    print_hi('PyCharm')
+
+# See PyCharm help at https://www.jetbrains.com/help/pycharm/