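"""Toy sentiment-classification demo with Keras word embeddings.

Trains a binary classifier on a small 16-review corpus using either a
trainable Embedding layer learned from scratch or frozen pre-trained
GloVe vectors (Sequential and functional-API variants), then plots the
training metrics.
"""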
from typing import Any, IO

import keras
from numpy import array, asarray, zeros, ndarray, random  # type: ignore
from keras.preprocessing.text import one_hot, Tokenizer  # type: ignore
from keras.preprocessing.sequence import pad_sequences  # type: ignore
from keras.models import Sequential, Model  # type: ignore
from keras.layers import Dense, Input, Flatten  # type: ignore
from keras.layers.embeddings import Embedding  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import matplotlib as mpl  # type: ignore

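# 8 positive reviews followed by 8 negative reviews; label_set in __main__
# assigns 1/0 labels in the same order.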
corpus: list = [
    # Positive Reviews
    'This is an excellent movie',
    'The move was fantastic I like it',
    'You should watch it is brilliant',
    'Exceptionally good',
    'Wonderfully directed and executed I like it',
    'Its a fantastic series',
    'Never watched such a brillent movie',
    'It is a Wonderful movie',

    # Negative Reviews
    'horrible acting',
    'waste of money',
    'pathetic picture',
    'It was very boring',
    'I did not like the movie',
    'The movie was horrible',
    'I will not recommend',
    'The acting is pathetic',
]

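# Metrics passed to model.compile() in every variant; plot_metrics() below
# draws loss, auc, precision and recall from the resulting training History.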
_METRICS: list = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

def set_seed(seed: int) -> None:
    random.seed(seed)
    # tf.random.set_seed(seed)

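# Fit a Keras Tokenizer on the corpus; its word_index maps each distinct
# (lower-cased) word to an integer id starting at 1, which is why __main__
# adds 1 to the vocabulary size (index 0 is reserved for padding).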
def do_tokenize_text_keras(corpus_: list) -> Tokenizer:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_)
    return tokenizer

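# Collect every NLTK word token in the corpus (duplicates included);
# the caller builds a set() from the result to get the unique vocabulary.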
def do_tokenized_txt_nltk(corpus_: list) -> list:
    words: list = []
    for sent in corpus_:
        tokenize_word = word_tokenize(sent)
        for word in tokenize_word:
            words.append(word)
    return words

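# Encode each sentence as a list of integer ids with keras one_hot().
# Note: one_hot() is a hashing trick rather than a word_index lookup, so
# two different words can occasionally collide on the same id.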
def do_sentence_hotencoding(corpus_: list, vocab_length_: int) -> list:
    embedded_sentences = [one_hot(sent, vocab_length_) for sent in corpus_]
    return embedded_sentences

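# Length (in NLTK tokens) of the longest sentence in the corpus; used as the
# common padded length for every input sequence.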
def get_max_sentence_length(corpus_: list) -> int:
    word_count: Any = lambda sentence: len(word_tokenize(sentence))
    longest_sentence = max(corpus_, key=word_count)
    return len(word_tokenize(longest_sentence))

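# Zero-pad each encoded sentence at the end ('post') up to len_max_ tokens.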
def do_padding_sentence(embedded_sentences_: list, len_max_: int) -> list:
    return pad_sequences(embedded_sentences_, maxlen=len_max_, padding='post')

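# Load pre-trained 100-dimensional GloVe vectors into a word -> ndarray dict.
# Assumes the GloVe 6B 100-d file has been downloaded and unpacked to
# data/glove.6B/glove.6B.100d.txt.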
def load_embedding_model() -> dict:
    embeddings_dictionary = dict()
    glove_file: IO = open('data/glove.6B/glove.6B.100d.txt', encoding="utf8")
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

    glove_file.close()
    return embeddings_dictionary

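# Build a (vocab_length x 100) embedding matrix whose rows are indexed by the
# Keras tokenizer's word ids; words missing from GloVe keep all-zero rows.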
def do_embedding_glove(vocab_length_: int, word_tokenizer_: Tokenizer, embeddings_dictionary: dict) -> ndarray:
    embedding_matrix: ndarray = zeros((vocab_length_, 100))
    for word, index in word_tokenizer_.word_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix

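# Baseline model: a trainable Embedding layer learned from scratch, flattened
# and fed into a single sigmoid unit for binary classification.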
def get_model_sq(vocab_size_: int, word_vector_size_: int, max_sentence_length_: int) -> keras.models.Sequential:
    model_sq: Sequential = Sequential()
    model_sq.add(Embedding(vocab_size_, word_vector_size_, input_length=max_sentence_length_))
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq

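# Same architecture as get_model_sq, but the Embedding layer is initialised
# with the pre-trained GloVe matrix and frozen (trainable=False).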
def get_model_sq_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Sequential:
    model_sq: Sequential = Sequential()
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_,
                                trainable=False)
    model_sq.add(embedding_layer)
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq

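# Functional-API equivalent of the frozen-GloVe Sequential model above:
# explicit Input, Embedding, Flatten and a single sigmoid output wired into a Model.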
def get_model_fc_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Model:
    deep_inputs = Input(shape=(max_sentence_length_,))
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_,
                                trainable=False)(deep_inputs)
    flatten = Flatten()(embedding_layer)
    output = Dense(1, activation='sigmoid')(flatten)
    model = Model(inputs=deep_inputs, outputs=output)
    return model

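# Plot the training curves (loss, AUC, precision, recall) from a Keras History
# object in a 2x2 grid; validation curves are left commented out because the
# demo trains without a validation split.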
def plot_metrics(history_: keras.callbacks.History, plt_=plt, mpl_=mpl) -> None:
    mpl_.rcParams['figure.figsize'] = (12, 10)
    metrics: list = ['loss', 'auc', 'precision', 'recall']
    colors_plt: list = plt_.rcParams['axes.prop_cycle'].by_key()['color']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt_.subplot(2, 2, n + 1)
        plt_.plot(history_.epoch, history_.history[metric], color=colors_plt[0], label='Train')
        # plt_.plot(history_.epoch, history_.history['val_' + metric],
        #           color=colors_plt[0], linestyle="--", label='Val')
        plt_.xlabel('Epoch')
        plt_.ylabel(name)
        if metric == 'loss':
            plt_.ylim([0, plt_.ylim()[1]])
        elif metric == 'auc':
            plt_.ylim([0.8, 1])
        else:
            plt_.ylim([0, 1])
        plt_.legend()
    plt_.show()

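# The `version` flag below selects which model is trained:
#   0 (default) - trainable embedding learned from scratch (get_model_sq)
#   1           - frozen GloVe embedding, functional API (get_model_fc_with_embedded_layer)
#   2           - frozen GloVe embedding, Sequential (get_model_sq_with_embedded_layer)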
if __name__ == '__main__':
    version: int = 0
    set_seed(16)

    print('Tokenized words in corpus (Keras tokenizer) -------')
    keras_word_tokenizer: Tokenizer = do_tokenize_text_keras(corpus)
    keras_unique_words = keras_word_tokenizer.word_index
    unique_keras_words_length: int = len(keras_unique_words) + 1
    print(keras_unique_words)
    print(unique_keras_words_length)
    print('Words in corpus (nltk tokenizer) -------')
    unique_words: set = set(do_tokenized_txt_nltk(corpus))
    unique_words_length: int = len(unique_words)
    print(unique_words)
    print(unique_words_length)
    print('Sentence_hotencoding -------')
    sentence_hotencoding: list = do_sentence_hotencoding(corpus, unique_words_length)
    print(sentence_hotencoding)
    print('Max sentence length -------')
    max_sentence_length: int = get_max_sentence_length(corpus)
    print(max_sentence_length)
    print('Sentence_hotencoding_and_padding -------')
    padding_sentence: list = do_padding_sentence(sentence_hotencoding, max_sentence_length)
    print(padding_sentence)

    # dataset: the padded, one-hot-encoded sentences computed above and one
    # label per review (1 = positive, 0 = negative, in corpus order)
    data_set: list = padding_sentence
    label_set = array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

    if version == 1:  # Keras functional API with pre-trained GloVe embedding layer
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
        # The GloVe matrix is indexed by the Keras tokenizer's word ids, so the
        # input sequences must come from that tokenizer rather than one_hot().
        keras_data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus), max_sentence_length)
        model_fc: keras.models.Model = get_model_fc_with_embedded_layer(unique_keras_words_length, 100, weights,
                                                                        max_sentence_length)
        model_fc.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model_fc.summary())
        history_dict = model_fc.fit(keras_data_set, label_set, epochs=25, verbose=1)
        for layer in model_fc.layers:
            print(layer.get_config(), layer.get_weights())
    elif version == 2:  # Sequential model with pre-trained GloVe embedding layer
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
        # Same note as in the version 1 branch: use the Keras tokenizer's ids.
        keras_data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus), max_sentence_length)
        model = get_model_sq_with_embedded_layer(unique_keras_words_length, 100, weights, max_sentence_length)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model.summary())
        history_dict = model.fit(keras_data_set, label_set, epochs=25, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())
    else:  # version 0: train an embedding layer from scratch
        # Embedding parameters: vocab_size * word_vector_size = 45 * 128 = 5760
        # Flattened output shape: max_sentence_length * word_vector_size = 7 * 128 = 896
        model: keras.models.Sequential = get_model_sq(unique_words_length, 128, max_sentence_length)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model.summary())
        history_dict = model.fit(data_set, label_set, epochs=200, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())

    plot_metrics(history_dict)