from typing import Any, IO

import keras
from matplotlib import colors  # type: ignore
from numpy import array, asarray, zeros, ndarray, random  # type: ignore
from keras.preprocessing.text import one_hot, Tokenizer  # type: ignore
from keras.preprocessing.sequence import pad_sequences  # type: ignore
from keras.models import Sequential, Model  # type: ignore
from keras.layers import Dense, Input  # type: ignore
from keras.layers import Flatten  # type: ignore
from keras.layers.embeddings import Embedding  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import matplotlib as mpl  # type: ignore

corpus: list = [
    # Positive Reviews
    'This is an excellent movie',
    'The movie was fantastic I like it',
    'You should watch it is brilliant',
    'Exceptionally good',
    'Wonderfully directed and executed I like it',
    'Its a fantastic series',
    'Never watched such a brilliant movie',
    'It is a Wonderful movie',
    # Negative Reviews
    'horrible acting',
    'waste of money',
    'pathetic picture',
    'It was very boring',
    'I did not like the movie',
    'The movie was horrible',
    'I will not recommend',
    'The acting is pathetic',
]

_METRICS: list = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]


def set_seed(seed):
    # Seed NumPy's global RNG so runs are reproducible.
    random.seed(seed)
    # tf.random.set_seed(seed)


def do_tokenize_text_keras(corpus_: list) -> Tokenizer:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_)
    return tokenizer
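
# fit_on_texts assigns every unique word a 1-based integer index, ordered by
# frequency (the most frequent word gets index 1). For this corpus word_index
# looks roughly like {'it': 1, 'movie': 2, 'the': 3, ...}; the exact mapping
# depends on word frequencies, so treat these values as illustrative only.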


def do_tokenized_txt_nltk(corpus_: list) -> list:
    words: list = []
    for sent in corpus_:
        tokenize_word = word_tokenize(sent)
        for word in tokenize_word:
            words.append(word)
    return words


def do_sentence_hotencoding(corpus_: list, vocab_length_: int) -> list:
    embedded_sentences = [one_hot(sent, vocab_length_) for sent in corpus_]
    return embedded_sentences
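
# Note: keras' one_hot is a hashing trick rather than a true one-hot encoding:
# every word is hashed to an integer in [1, vocab_length_), so distinct words
# can collide on the same index. Illustrative output (the actual values depend
# on the hash): do_sentence_hotencoding(['Exceptionally good'], 45) -> [[12, 7]]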


def get_max_sentence_length(corpus_: list) -> int:
    word_count: Any = lambda sentence: len(word_tokenize(sentence))
    longest_sentence = max(corpus_, key=word_count)
    return len(word_tokenize(longest_sentence))


def do_padding_sentence(embedded_sentences_: list, len_max_: int) -> list:
    return pad_sequences(embedded_sentences_, len_max_, padding='post')
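
# pad_sequences brings every sequence to the same length; with padding='post'
# the zeros are appended at the end, e.g.:
#   pad_sequences([[5, 3], [1, 2, 3]], 4, padding='post')
#   -> array([[5, 3, 0, 0],
#             [1, 2, 3, 0]], dtype=int32)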


def load_embedding_model() -> dict:
    embeddings_dictionary = dict()
    glove_file: IO = open('data/glove.6B/glove.6B.100d.txt', encoding="utf8")
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
    glove_file.close()
    return embeddings_dictionary
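
# Each line of the GloVe file is a word followed by its embedding components,
# space separated (100 floats for the 6B.100d variant), e.g. (values illustrative):
#   the 0.123 -0.456 0.789 ... <100 values>
# so embeddings_dictionary maps word -> ndarray of shape (100,).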


def do_embedding_glove(vocab_length_: int, word_tokenizer_: Tokenizer, embeddings_dictionary: dict) -> ndarray:
    embedding_matrix: ndarray = zeros((vocab_length_, 100))
    for word, index in word_tokenizer_.word_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix
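
# The matrix has one 100-dimensional row per tokenizer index. Row 0 stays all
# zeros because Tokenizer indices start at 1 (hence vocab_length_ is
# len(word_index) + 1), and words missing from GloVe also keep an all-zero row.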


def get_model_sq(vocab_size_: int, word_vector_size_: int, max_sentence_length_: int) -> keras.models.Sequential:
    model_sq: Sequential = Sequential()
    model_sq.add(Embedding(vocab_size_, word_vector_size_, input_length=max_sentence_length_))
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq
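
# Shape walk-through (illustrative numbers, assuming vocab_size_=45,
# word_vector_size_=128 and max_sentence_length_=7):
#   Embedding: (None, 7) -> (None, 7, 128), 45 * 128 = 5760 trainable weights
#   Flatten:   (None, 7, 128) -> (None, 896)
#   Dense:     (None, 896) -> (None, 1), 896 + 1 = 897 weights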


def get_model_sq_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Sequential:
    model_sq: Sequential = Sequential()
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_,
                                trainable=False)
    model_sq.add(embedding_layer)
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq


def get_model_fc_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Model:
    deep_inputs = Input(shape=(max_sentence_length_,))
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_,
                                trainable=False)(deep_inputs)
    flatten = Flatten()(embedding_layer)
    hidden = Dense(1, activation='sigmoid')(flatten)
    model = Model(inputs=deep_inputs, outputs=hidden)
    return model
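
# Functionally equivalent to get_model_sq_with_embedded_layer, but built with
# the Keras functional API: each layer is called on the previous layer's output
# tensor and Model ties the Input placeholder to the final prediction.
# trainable=False keeps the pretrained GloVe weights frozen during fit().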


def plot_metrics(history_: keras.callbacks.History, plt_=plt, mpl_=mpl) -> None:
    mpl_.rcParams['figure.figsize'] = (12, 10)
    metrics: list = ['loss', 'auc', 'precision', 'recall']
    colors_plt: colors = plt_.rcParams['axes.prop_cycle'].by_key()['color']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt_.subplot(2, 2, n + 1)
        plt_.plot(history_.epoch, history_.history[metric], color=colors_plt[0], label='Train')
        # plt_.plot(history_.epoch, history_.history['val_' + metric],
        #           color=colors_plt[0], linestyle="--", label='Val')
        plt_.xlabel('Epoch')
        plt_.ylabel(name)
        if metric == 'loss':
            plt_.ylim([0, plt_.ylim()[1]])
        elif metric == 'auc':
            plt_.ylim([0.8, 1])
        else:
            plt_.ylim([0, 1])
        plt_.legend()
    plt_.show()


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    version: int = 0
    set_seed(16)

    print('Tokenized words in corpus (Keras tokenizer) -------')
    keras_word_tokenizer: Tokenizer = do_tokenize_text_keras(corpus)
    keras_unique_words = keras_word_tokenizer.word_index
    unique_keras_words_length: int = len(keras_word_tokenizer.word_index) + 1
    print(keras_word_tokenizer.word_index)
    print(unique_keras_words_length)

    print('Words in corpus (nltk tokenizer) -------')
    unique_words: set = set(do_tokenized_txt_nltk(corpus))
    unique_words_length: int = len(unique_words)
    print(unique_words)
    print(unique_words_length)

    print('Sentence hot encoding -------')
    sentence_hotencoding: list = do_sentence_hotencoding(corpus, unique_words_length)
    print(sentence_hotencoding)

    print('Max sentence length -------')
    max_sentence_length: int = get_max_sentence_length(corpus)
    print(max_sentence_length)

    print('Sentence hot encoding and padding -------')
    padding_sentence: list = do_padding_sentence(sentence_hotencoding, max_sentence_length)
    print(padding_sentence)

    # dataset: padded one_hot sequences plus labels (8 positive, then 8 negative,
    # matching the order of `corpus`)
    data_set: list = padding_sentence
    label_set = array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])
    if version == 1:  # fit Keras functional API model with a pretrained GloVe embedding layer
        # The pretrained embedding matrix is indexed by the Keras tokenizer's
        # word_index, so the input sequences must use those same indices
        # (texts_to_sequences), not the one_hot hashes used above.
        keras_data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus), max_sentence_length)
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
        model_fc: keras.models.Model = get_model_fc_with_embedded_layer(unique_keras_words_length, 100, weights,
                                                                        max_sentence_length)
        model_fc.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model_fc.summary())
        history_dict = model_fc.fit(keras_data_set, label_set, epochs=25, verbose=1)
        for layer in model_fc.layers:
            print(layer.get_config(), layer.get_weights())
    elif version == 2:  # fit Sequential model with a pretrained GloVe embedding layer
        keras_data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus), max_sentence_length)
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
        model = get_model_sq_with_embedded_layer(unique_keras_words_length, 100, weights, max_sentence_length)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model.summary())
        history_dict = model.fit(keras_data_set, label_set, epochs=25, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())
    else:  # fit an embedding layer trained from scratch
        # embedding weights: vocab_size * word_vector_size values
        # flatten output:    max_sentence_length * word_vector_size values
        model: keras.models.Sequential = get_model_sq(unique_words_length, 128, max_sentence_length)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model.summary())
        history_dict = model.fit(data_set, label_set, epochs=200, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())

    plot_metrics(history_dict)