from typing import Any, IO

import keras
from numpy import array, asarray, zeros, ndarray, random  # type: ignore
from keras.preprocessing.text import one_hot, Tokenizer  # type: ignore
from keras.preprocessing.sequence import pad_sequences  # type: ignore
from keras.models import Sequential, Model  # type: ignore
from keras.layers import Dense, Input, Flatten  # type: ignore
from keras.layers.embeddings import Embedding  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import matplotlib as mpl  # type: ignore

corpus: list = [
    # Positive Reviews
    'This is an excellent movie',
    'The movie was fantastic I like it',
    'You should watch it is brilliant',
    'Exceptionally good',
    'Wonderfully directed and executed I like it',
    'Its a fantastic series',
    'Never watched such a brilliant movie',
    'It is a Wonderful movie',
    # Negative Reviews
    'horrible acting',
    'waste of money',
    'pathetic picture',
    'It was very boring',
    'I did not like the movie',
    'The movie was horrible',
    'I will not recommend',
    'The acting is pathetic'
]

_METRICS: list = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]


def set_seed(seed):
    """Seed numpy's random number generator for reproducibility."""
    random.seed(seed)
    # tf.random.set_seed(seed)


def do_tokenize_text_keras(corpus_: list) -> Tokenizer:
    """Fit a Keras Tokenizer on the corpus and return it."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_)
    return tokenizer


def do_tokenized_txt_nltk(corpus_: list) -> list:
    """Return the flat list of word tokens in the corpus (NLTK tokenizer)."""
    words: list = []
    for sent in corpus_:
        for word in word_tokenize(sent):
            words.append(word)
    return words


def do_sentence_hotencoding(corpus_: list, vocab_length_: int) -> list:
    """Encode each sentence as a list of hashed word indices via keras one_hot."""
    return [one_hot(sent, vocab_length_) for sent in corpus_]


def get_max_sentence_length(corpus_: list) -> int:
    """Length (in tokens) of the longest sentence in the corpus."""
    word_count: Any = lambda sentence: len(word_tokenize(sentence))
    longest_sentence = max(corpus_, key=word_count)
    return len(word_tokenize(longest_sentence))


def do_padding_sentence(embedded_sentences_: list, len_max_: int) -> ndarray:
    """Pad every encoded sentence with trailing zeros up to len_max_."""
    return pad_sequences(embedded_sentences_, len_max_, padding='post')


def load_embedding_model() -> dict:
    """Load the 100-dimensional GloVe vectors into a word -> vector dictionary."""
    embeddings_dictionary = dict()
    glove_file: IO = open('data/glove.6B/glove.6B.100d.txt', encoding="utf8")
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
    glove_file.close()
    return embeddings_dictionary


def do_embedding_glove(vocab_length_: int, word_tokenizer_: Tokenizer,
                       embeddings_dictionary: dict) -> ndarray:
    """Build the (vocab_length_ x 100) embedding matrix from the GloVe dictionary."""
    embedding_matrix: ndarray = zeros((vocab_length_, 100))
    for word, index in word_tokenizer_.word_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix
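
# Illustrative sketch (not called by the script below): how a single new review could be
# encoded with the same one_hot + pad_sequences pipeline before being fed to a trained
# model. The helper name `encode_review` is introduced here purely for illustration.
def encode_review(text: str, vocab_length_: int, len_max_: int) -> ndarray:
    """Hash-encode one review and pad it to the training sequence length."""
    encoded = one_hot(text, vocab_length_)
    return pad_sequences([encoded], maxlen=len_max_, padding='post')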

def get_model_sq(vocab_size_: int, word_vector_size_: int,
                 max_sentence_length_: int) -> keras.models.Sequential:
    """Sequential model with a trainable embedding layer learned from scratch."""
    model_sq: Sequential = Sequential()
    model_sq.add(Embedding(vocab_size_, word_vector_size_, input_length=max_sentence_length_))
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq


def get_model_sq_with_embedded_layer(vocab_size_: int, word_vector_size_: int,
                                     embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Sequential:
    """Sequential model with a frozen embedding layer initialised from GloVe weights."""
    model_sq: Sequential = Sequential()
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_, trainable=False)
    model_sq.add(embedding_layer)
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq


def get_model_fc_with_embedded_layer(vocab_size_: int, word_vector_size_: int,
                                     embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Model:
    """Functional-API model with a frozen embedding layer initialised from GloVe weights."""
    deep_inputs = Input(shape=(max_sentence_length_,))
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_, trainable=False)(deep_inputs)
    flatten = Flatten()(embedding_layer)
    hidden = Dense(1, activation='sigmoid')(flatten)
    model = Model(inputs=deep_inputs, outputs=hidden)
    return model


def plot_metrics(history_: keras.callbacks.History, plt_=plt, mpl_=mpl) -> None:
    """Plot loss, AUC, precision and recall over the training epochs."""
    mpl_.rcParams['figure.figsize'] = (12, 10)
    metrics: list = ['loss', 'auc', 'precision', 'recall']
    colors_plt: list = plt_.rcParams['axes.prop_cycle'].by_key()['color']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt_.subplot(2, 2, n + 1)
        plt_.plot(history_.epoch, history_.history[metric], color=colors_plt[0], label='Train')
        # plt_.plot(history_.epoch, history_.history['val_' + metric],
        #           color=colors_plt[0], linestyle="--", label='Val')
        plt_.xlabel('Epoch')
        plt_.ylabel(name)
        if metric == 'loss':
            plt_.ylim([0, plt_.ylim()[1]])
        elif metric == 'auc':
            plt_.ylim([0.8, 1])
        else:
            plt_.ylim([0, 1])
        plt_.legend()
    plt_.show()
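
# Illustrative sketch (not called by the script below): scoring unseen reviews with a trained
# model, assuming the model was fit on sequences produced by do_sentence_hotencoding and
# do_padding_sentence with the same vocab_length_ and len_max_ (the self-embedding path).
# The helper name `score_reviews` is introduced here purely for illustration.
def score_reviews(model_: keras.models.Model, reviews_: list,
                  vocab_length_: int, len_max_: int) -> ndarray:
    """Return the sigmoid probability of each review being positive."""
    encoded = do_padding_sentence(do_sentence_hotencoding(reviews_, vocab_length_), len_max_)
    return model_.predict(encoded)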

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # 0: self-learned embedding, 1: GloVe + functional API, 2: GloVe + Sequential
    version: int = 0
    set_seed(16)

    print('Tokenized words in corpus (Keras tokenizer) -------')
    keras_word_tokenizer: Tokenizer = do_tokenize_text_keras(corpus)
    keras_unique_words = keras_word_tokenizer.word_index
    unique_keras_words_length: int = len(keras_word_tokenizer.word_index) + 1
    print(keras_unique_words)
    print(unique_keras_words_length)

    print('Words in corpus (nltk tokenizer) -------')
    unique_words: set = set(do_tokenized_txt_nltk(corpus))
    unique_words_length: int = len(unique_words)
    print(unique_words)
    print(unique_words_length)

    print('Sentence_hotencoding -------')
    sentence_hotencoding: list = do_sentence_hotencoding(corpus, unique_words_length)
    print(sentence_hotencoding)

    print('Max sentence length -------')
    max_sentence_length: int = get_max_sentence_length(corpus)
    print(max_sentence_length)

    print('Sentence_hotencoding_and_padding -------')
    padding_sentence: ndarray = do_padding_sentence(sentence_hotencoding, max_sentence_length)
    print(padding_sentence)

    # dataset
    data_set: ndarray = do_padding_sentence(do_sentence_hotencoding(corpus, unique_words_length),
                                            max_sentence_length)
    label_set = array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

    if version == 1:
        # fit Keras functional-API model with a pretrained GloVe embedding layer.
        # The GloVe embedding matrix is indexed by the Keras tokenizer's word_index, so the
        # input sequences must come from the same tokenizer rather than from one_hot hashing.
        glove_data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus),
                                             max_sentence_length)
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer,
                                     load_embedding_model())
        model_fc: keras.models.Model = get_model_fc_with_embedded_layer(
            unique_keras_words_length, 100, weights, max_sentence_length)
        model_fc.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        model_fc.summary()
        history_dict = model_fc.fit(glove_data_set, label_set, epochs=25, verbose=1)
        for layer in model_fc.layers:
            print(layer.get_config(), layer.get_weights())
    elif version == 2:
        # fit Sequential model with a pretrained GloVe embedding layer (same encoding caveat)
        glove_data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus),
                                             max_sentence_length)
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer,
                                     load_embedding_model())
        model = get_model_sq_with_embedded_layer(unique_keras_words_length, 100, weights,
                                                 max_sentence_length)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        model.summary()
        history_dict = model.fit(glove_data_set, label_set, epochs=25, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())
    else:
        # fit Sequential model with a self-learned embedding
        # embedding weights: vocab_size * word_vector_size
        # Flatten output shape: max_sentence_length * word_vector_size
        model: keras.models.Sequential = get_model_sq(unique_words_length, 128,
                                                      max_sentence_length)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        model.summary()
        history_dict = model.fit(data_set, label_set, epochs=200, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())

    plot_metrics(history_dict)
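
# Note: the commented-out validation curves in plot_metrics are only drawn if the fit call
# above is given validation data, e.g. (a sketch, not part of the run as written):
# history_dict = model.fit(data_set, label_set, epochs=200, verbose=1, validation_split=0.2)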