# embedding_seq.py

from typing import Any, IO

import keras
from numpy import array, asarray, zeros, ndarray, random  # type: ignore
from keras.preprocessing.text import one_hot, Tokenizer  # type: ignore
from keras.preprocessing.sequence import pad_sequences  # type: ignore
from keras.models import Sequential, Model  # type: ignore
from keras.layers import Dense, Input, Flatten  # type: ignore
from keras.layers.embeddings import Embedding  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import matplotlib as mpl  # type: ignore

corpus: list = [
    # Positive reviews
    'This is an excellent movie',
    'The move was fantastic I like it',
    'You should watch it is brilliant',
    'Exceptionally good',
    'Wonderfully directed and executed I like it',
    'Its a fantastic series',
    'Never watched such a brillent movie',
    'It is a Wonderful movie',
    # Negative reviews
    "horrible acting",
    'waste of money',
    'pathetic picture',
    'It was very boring',
    'I did not like the movie',
    'The movie was horrible',
    'I will not recommend',
    'The acting is pathetic'
]

_METRICS: list = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]


def set_seed(seed):
    random.seed(seed)
    # tf.random.set_seed(seed)


def do_tokenize_text_keras(corpus_: list) -> Tokenizer:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_)
    return tokenizer
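
# Usage sketch (the exact indices depend on word frequency in the corpus):
#   tok = do_tokenize_text_keras(corpus)
#   tok.word_index                        -> e.g. {'it': 1, 'movie': 2, 'the': 3, ...}
# Indices start at 1; 0 is conventionally reserved for padding, which is why the
# vocabulary sizes below are computed as len(word_index) + 1.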


def do_tokenized_txt_nltk(corpus_: list) -> list:
    words: list = []
    for sent in corpus_:
        tokenize_word = word_tokenize(sent)
        for word in tokenize_word:
            words.append(word)
    return words


def do_sentence_hotencoding(corpus_: list, vocab_length_: int) -> list:
    embedded_sentences = [one_hot(sent, vocab_length_) for sent in corpus_]
    return embedded_sentences
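
# Note: keras' one_hot is a hashing trick, not a lookup into a fitted vocabulary, so two
# different words can collide on the same index when vocab_length_ is small.
# Example sketch:
#   do_sentence_hotencoding(['Exceptionally good'], 45)  -> e.g. [[12, 31]]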


def get_max_sentence_lenght(corpus_: list) -> int:
    word_count: Any = lambda sentence: len(word_tokenize(sentence))
    longest_sentence = max(corpus_, key=word_count)
    return len(word_tokenize(longest_sentence))


def do_padding_sentence(embedded_sentences_: list, len_max_: int) -> list:
    return pad_sequences(embedded_sentences_, len_max_, padding='post')
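
# Example sketch (assuming len_max_ = 7): a sequence [12, 31] becomes
#   [12, 31, 0, 0, 0, 0, 0]
# because padding='post' appends zeros after the real tokens.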


def load_embedding_model() -> dict:
    embeddings_dictionary = dict()
    glove_file: IO = open('data/glove.6B/glove.6B.100d.txt', encoding="utf8")
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
    glove_file.close()
    return embeddings_dictionary
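
# The GloVe text format is one word per line followed by its vector components, roughly:
#   the 0.418 0.24968 -0.41242 ... (100 floats for the 100d file, values abridged here)
# so records[0] is the word and records[1:] are its vector components.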


def do_embedding_glove(vocab_length_: int, word_tokenizer_: Tokenizer, embeddings_dictionary: dict) -> ndarray:
    embedding_matrix: ndarray = zeros((vocab_length_, 100))
    for word, index in word_tokenizer_.word_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix
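
# Note: word_index starts at 1, so row 0 of the matrix stays all-zero (the padding row), and
# words missing from GloVe also keep a zero row. This is why vocab_length_ must be
# len(word_index) + 1, i.e. unique_keras_words_length in the main block below.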


def get_model_sq(vocab_size_: int, word_vector_size_: int, max_sentence_length_: int) -> keras.models.Sequential:
    model_sq: Sequential = Sequential()
    model_sq.add(Embedding(vocab_size_, word_vector_size_, input_length=max_sentence_length_))
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq
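
# Shape sketch (e.g. with max_sentence_length_ = 7 and word_vector_size_ = 128): the
# Embedding output is (batch, 7, 128), Flatten gives (batch, 896), and the single sigmoid
# unit produces the binary positive/negative score.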


def get_model_sq_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Sequential:
    model_sq: Sequential = Sequential()
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_,
                                trainable=False)
    model_sq.add(embedding_layer)
    model_sq.add(Flatten())
    model_sq.add(Dense(1, activation='sigmoid'))
    return model_sq
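
# Note: weights=[embedding_matrix_] seeds the layer with the pretrained GloVe vectors and
# trainable=False freezes them, so only the Dense layer learns during fit(). Passing
# trainable=True instead would fine-tune the vectors on this tiny corpus.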


def get_model_fc_with_embedded_layer(vocab_size_: int, word_vector_size_: int, embedding_matrix_: ndarray,
                                     max_sentence_length_: int) -> keras.models.Model:
    deep_inputs = Input(shape=(max_sentence_length_,))
    embedding_layer = Embedding(vocab_size_, word_vector_size_, weights=[embedding_matrix_],
                                input_length=max_sentence_length_,
                                trainable=False)(deep_inputs)
    flatten = Flatten()(embedding_layer)
    output = Dense(1, activation='sigmoid')(flatten)
    model = Model(inputs=deep_inputs, outputs=output)
    return model
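
# This functional-API model is architecturally the same as the Sequential version above;
# it is kept to illustrate the Model(inputs=..., outputs=...) style.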


def plot_metrics(history_: keras.callbacks.History, plt_=plt, mpl_=mpl) -> None:
    mpl_.rcParams['figure.figsize'] = (12, 10)
    metrics: list = ['loss', 'auc', 'precision', 'recall']
    colors_plt: list = plt_.rcParams['axes.prop_cycle'].by_key()['color']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt_.subplot(2, 2, n + 1)
        plt_.plot(history_.epoch, history_.history[metric], color=colors_plt[0], label='Train')
        # plt_.plot(history_.epoch, history_.history['val_' + metric],
        #           color=colors_plt[0], linestyle="--", label='Val')
        plt_.xlabel('Epoch')
        plt_.ylabel(name)
        if metric == 'loss':
            plt_.ylim([0, plt_.ylim()[1]])
        elif metric == 'auc':
            plt_.ylim([0.8, 1])
        else:
            plt_.ylim([0, 1])
        plt_.legend()
    plt_.show()


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    version: int = 0
    set_seed(16)

    print('Tokenized words in corpus (Keras tokenizer) -------')
    keras_word_tokenizer: Tokenizer = do_tokenize_text_keras(corpus)
    keras_unique_words = keras_word_tokenizer.word_index
    unique_keras_words_length: int = len(keras_word_tokenizer.word_index) + 1
    print(keras_word_tokenizer.word_index)
    print(unique_keras_words_length)

    print('Words in corpus (nltk tokenizer) -------')
    unique_words: set = set(do_tokenized_txt_nltk(corpus))
    unique_words_length: int = len(unique_words)
    print(unique_words)
    print(unique_words_length)

    print('Sentence hot encoding -------')
    sentence_hotencoding: list = do_sentence_hotencoding(corpus, unique_words_length)
    print(sentence_hotencoding)

    print('Max sentence length -------')
    max_sentence_lenght: int = get_max_sentence_lenght(corpus)
    print(max_sentence_lenght)

    print('Sentence hot encoding and padding -------')
    padding_sentence: list = do_padding_sentence(sentence_hotencoding, max_sentence_lenght)
    print(padding_sentence)

    # dataset: hashed one_hot encoding, used by the self-trained embedding (version 0)
    data_set: list = do_padding_sentence(do_sentence_hotencoding(corpus, len(unique_words)),
                                         get_max_sentence_lenght(corpus))
    label_set = array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

    if version == 1:  # fit Keras functional API model with pretrained GloVe embedding layer
        # The GloVe embedding matrix is indexed by keras_word_tokenizer.word_index, so the
        # inputs must be encoded with that tokenizer rather than with the hashed one_hot codes.
        data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus), max_sentence_lenght)
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
        model_fc: keras.models.Model = get_model_fc_with_embedded_layer(unique_keras_words_length, 100, weights,
                                                                        max_sentence_lenght)
        model_fc.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model_fc.summary())
        history_dict = model_fc.fit(data_set, label_set, epochs=25, verbose=1)
        for layer in model_fc.layers:
            print(layer.get_config(), layer.get_weights())
    elif version == 2:  # fit Sequential model with pretrained GloVe embedding layer
        data_set = do_padding_sentence(keras_word_tokenizer.texts_to_sequences(corpus), max_sentence_lenght)
        weights = do_embedding_glove(unique_keras_words_length, keras_word_tokenizer, load_embedding_model())
        model = get_model_sq_with_embedded_layer(unique_keras_words_length, 100, weights, max_sentence_lenght)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model.summary())
        history_dict = model.fit(data_set, label_set, epochs=25, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())
    else:  # fit a self-trained embedding
        # embedding weights: vocab_size * word_vector_size
        # flattened output shape: max_sentence_length * word_vector_size
        model: keras.models.Sequential = get_model_sq(unique_words_length, 128, max_sentence_lenght)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=_METRICS)
        print(model.summary())
        history_dict = model.fit(data_set, label_set, epochs=200, verbose=1)
        for layer in model.layers:
            print(layer.get_config(), layer.get_weights())
    plot_metrics(history_dict)