tensorflow_datasets に用意されている imdb_reviews データセットを取得し、Keras の Tokenizer で前処理を行い、モデルを定義して学習させる。
これをテンプレートとして使えば、Embedding を BERT に変えたり、imdb_reviews を自作のデータセットに差し替えたりして、拡張していけそう。
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf


def _collect_examples(dataset):
    """Materialize a supervised dataset into Python text and a label array.

    Args:
        dataset: Iterable of ``(text, label)`` pairs whose elements expose
            ``.numpy()``; the text element is expected to yield ``bytes``.

    Returns:
        A ``(sentences, labels)`` tuple: a list of decoded ``str`` values and
        a NumPy array built from the collected labels.
    """
    sentences = []
    labels = []
    for text, label in dataset:
        # Decode the raw bytes into text. Calling str() on a bytes object
        # returns its repr ("b'...'" including escape sequences), which
        # would leak the b-prefix and backslash escapes into the vocabulary.
        sentences.append(text.numpy().decode("utf-8"))
        labels.append(label.numpy())
    return sentences, np.array(labels)


def main():
    """Train a small embedding-based binary classifier on imdb_reviews.

    Loads the dataset, tokenizes and pads the reviews, builds a
    Embedding -> Flatten -> Dense -> Dense model, prints its summary and
    fits it while validating on the test split.
    """
    # 1. Load imdb dataset
    imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

    # 2. Split imdb dataset into train / test data
    train_data, test_data = imdb['train'], imdb['test']

    # 3. Prepare sentences and labels
    training_sentences, training_labels = _collect_examples(train_data)
    testing_sentences, testing_labels = _collect_examples(test_data)

    # 4. Hyperparameters
    vocab_size = 10000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    oov_token = '<OOV>'

    # 5. Tokenize
    # The vocabulary is fitted on the training sentences only; the test
    # sentences are encoded with that same tokenizer.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(training_sentences)

    sequences = tokenizer.texts_to_sequences(training_sentences)
    padded = pad_sequences(sequences, maxlen=max_length,
                           truncating=trunc_type)

    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                                   truncating=trunc_type)

    # 6. Defining model
    # NOTE(review): newer Keras releases deprecate `input_length`; confirm
    # against the installed version before changing it.
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  input_length=max_length),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # 7. Compile and training
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    num_epochs = 10
    model.fit(padded, training_labels, epochs=num_epochs,
              validation_data=(testing_padded, testing_labels))


if __name__ == "__main__":
    main()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 120, 16)           160000
_________________________________________________________________
flatten_1 (Flatten)          (None, 1920)              0
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 11526
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7
=================================================================
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
782/782 [==============================] - 4s 5ms/step - loss: 0.5723 - accuracy: 0.7062 - val_loss: 0.4707 - val_accuracy: 0.8163
Epoch 2/10
782/782 [==============================] - 4s 5ms/step - loss: 0.3454 - accuracy: 0.8918 - val_loss: 0.4481 - val_accuracy: 0.8147
Epoch 3/10
782/782 [==============================] - 4s 5ms/step - loss: 0.1946 - accuracy: 0.9558 - val_loss: 0.5217 - val_accuracy: 0.8035
Epoch 4/10
782/782 [==============================] - 4s 5ms/step - loss: 0.1172 - accuracy: 0.9798 - val_loss: 0.5584 - val_accuracy: 0.8052
Epoch 5/10
782/782 [==============================] - 4s 5ms/step - loss: 0.0851 - accuracy: 0.9852 - val_loss: 0.6381 - val_accuracy: 0.8047
Epoch 6/10
782/782 [==============================] - 4s 5ms/step - loss: 0.0707 - accuracy: 0.9869 - val_loss: 0.6734 - val_accuracy: 0.8055
Epoch 7/10
782/782 [==============================] - 4s 5ms/step - loss: 0.0643 - accuracy: 0.9872 - val_loss: 0.7113 - val_accuracy: 0.8036
Epoch 8/10
782/782 [==============================] - 4s 5ms/step - loss: 0.0598 - accuracy: 0.9879 - val_loss: 0.7544 - val_accuracy: 0.8028
Epoch 9/10
782/782 [==============================] - 4s 5ms/step - loss: 0.0568 - accuracy: 0.9883 - val_loss: 0.7722 - val_accuracy: 0.8052
Epoch 10/10
782/782 [==============================] - 4s 5ms/step - loss: 0.0558 - accuracy: 0.9884 - val_loss: 0.8033 - val_accuracy: 0.8050