import tensorflow as tf
import numpy as np

# Load the data: IMDB movie review sentiment classification
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
train/
...pos/
......text_1.txt
......text_2.txt
...neg/
......text_1.txt
......text_2.txt
# Load the IMDB reviews from disk as batched datasets of raw strings.
# The training directory is divided into a training and a validation subset;
# the same `validation_split` and `seed` are passed to both calls so the two
# subsets are drawn from one consistent split.
batch_size = 32

_split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../input/aclImdb/train", subset="training", **_split_options
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../input/aclImdb/train", subset="validation", **_split_options
)
# The test directory is loaded whole, with no split.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../input/aclImdb/test",
    batch_size=batch_size,
)

# Report how many batches each dataset contains.
train_batches = tf.data.experimental.cardinality(raw_train_ds)
val_batches = tf.data.experimental.cardinality(raw_val_ds)
test_batches = tf.data.experimental.cardinality(raw_test_ds)
print("Number of batches in raw_train_ds: %d" % train_batches)
print("Number of batches in raw_val_ds: %d" % val_batches)
print("Number of batches in raw_test_ds: %d" % test_batches)
# Inspect a couple of raw training examples before settling on normalization
# and tokenization, to check that they will behave as expected on real data.
# `.numpy()` turns the batch tensors into arrays that can be printed directly,
# with no Session/Graph context involved.
for text_batch, label_batch in raw_train_ds.take(1):
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    # Show the first two reviews of the batch together with their labels.
    for idx in (0, 1):
        print(texts[idx])
        print(labels[idx])
數據預處理:特別要去除原始文本中的 HTML 換行標籤 `<br />`。
# Text data preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re


def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' tags with a space, drop punctuation.

    The raw reviews contain HTML break tags of the form '<br />', so a custom
    standardizer is used to strip them before tokenization.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor with the three normalization steps applied in order.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = "[%s]" % re.escape(string.punctuation)
    without_breaks = tf.strings.regex_replace(
        tf.strings.lower(input_data), "<br />", " "
    )
    return tf.strings.regex_replace(without_breaks, punctuation_class, "")


# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

# The vectorization layer normalizes, splits, and maps strings to integers,
# hence output_mode="int". It uses the default split function together with
# the custom standardization defined above. An explicit output length is set
# because the CNNs used later in the model do not support ragged sequences.
vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    standardize=custom_standardization,
)

# `adapt` builds the vocabulary from a text-only dataset, so the labels are
# dropped first and only the text feature of each (text, label) pair is kept.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

# Two options to vectorize the data
# Fragment showing one way to vectorize: `vectorize_layer` is applied to a
# string input inside the model, and its output feeds an Embedding layer.
# NOTE(review): `layers` is only imported further down in this file, and the
# trailing `...` leaves the snippet unfinished — presumably this is an
# illustrative excerpt rather than code meant to run as-is; confirm.
text_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='text')
x = vectorize_layer(text_input)
x = layers.Embedding(max_features + 1, embedding_dim)(x)
...
def vectorize_text(text, label):
    """Vectorize `text` with `vectorize_layer`, passing `label` through.

    Args:
        text: String tensor of raw text; a trailing axis is added before it
            is handed to the vectorization layer.
        label: Label tensor, returned unchanged.

    Returns:
        A `(vectorized_text, label)` tuple.
    """
    column = tf.expand_dims(text, -1)
    return vectorize_layer(column), label


# Vectorize each split, then cache the result and prefetch asynchronously so
# upcoming batches are buffered for best performance on GPU.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
# Build a model
from tensorflow.keras import layers

# Integer input carrying vocabulary indices; the sequence length is left open.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map the vocabulary indices into a space of dimensionality `embedding_dim`,
# then apply dropout.
x = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
x = layers.Dropout(rate=0.5)(x)

# Two identical strided Conv1D layers, followed by global max pooling.
for _ in range(2):
    x = layers.Conv1D(
        filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
    )(x)
x = layers.GlobalMaxPooling1D()(x)

# A plain fully connected hidden layer with dropout.
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)

# Single-unit output layer squashed with a sigmoid.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Binary cross-entropy loss with the Adam optimizer, tracking accuracy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

# Train the model
epochs = 3

# Fit the model on the training dataset, with `val_ds` passed as the
# validation data.
model.fit(x=train_ds, epochs=epochs, validation_data=val_ds)

# Evaluate the model on the test set
model.evaluate(x=test_ds)

# Make an end-to-end model
# End-to-end model: raw strings go in, predictions come out.
# The string input is first turned into vocabulary indices by
# `vectorize_layer`, and those indices are then fed to the trained `model`.
inputs = tf.keras.Input(shape=(1,), dtype="string")
indices = vectorize_layer(inputs)
outputs = model(indices)

end_to_end_model = tf.keras.Model(inputs=inputs, outputs=outputs)
end_to_end_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# `raw_test_ds` yields raw strings, so it can be evaluated directly.
end_to_end_model.evaluate(raw_test_ds)

# Summary
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

"""數據讀取"""  # Section marker: data loading.
batch_size = 32

# The same `validation_split` and `seed` are passed to both training-directory
# calls so the two subsets are drawn from one consistent split.
_split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../input/aclImdb/train", subset="training", **_split_options
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../input/aclImdb/train", subset="validation", **_split_options
)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../input/aclImdb/test",
    batch_size=batch_size,
)

"""數據預處理"""  # Section marker: data preprocessing.


def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' tags with a space, drop punctuation.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor with the three normalization steps applied in order.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = "[%s]" % re.escape(string.punctuation)
    without_breaks = tf.strings.regex_replace(
        tf.strings.lower(input_data), "<br />", " "
    )
    return tf.strings.regex_replace(without_breaks, punctuation_class, "")


# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

"""文本向量化"""  # Section marker: text vectorization.
vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    standardize=custom_standardization,
)

"""!!!必定要adapt"""  # Section marker: `adapt` must be called.
# Build a text-only dataset (labels dropped) and fit the vocabulary on it.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)


def vectorize_text(text, label):
    """Vectorize `text` with `vectorize_layer`, passing `label` through.

    Args:
        text: String tensor of raw text; a trailing axis is added before it
            is handed to the vectorization layer.
        label: Label tensor, returned unchanged.

    Returns:
        A `(vectorized_text, label)` tuple.
    """
    column = tf.expand_dims(text, -1)
    return vectorize_layer(column), label


# Vectorize each split, then cache the result and prefetch asynchronously so
# upcoming batches are buffered for best performance on GPU.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

"""定義模型"""  # Section marker: model definition.
# An integer input for vocab indices.
# Integer input carrying vocabulary indices; the sequence length is left open.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map the vocabulary indices into a space of dimensionality `embedding_dim`,
# then apply dropout.
x = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
x = layers.Dropout(rate=0.5)(x)

# Two identical strided Conv1D layers, followed by global max pooling.
for _ in range(2):
    x = layers.Conv1D(
        filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
    )(x)
x = layers.GlobalMaxPooling1D()(x)

# A plain fully connected hidden layer with dropout.
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)

# Single-unit output layer squashed with a sigmoid.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Binary cross-entropy loss with the Adam optimizer, tracking accuracy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# model.summary()

"""模型訓練"""  # Section marker: model training.
epochs = 3

# Fit the model on the training dataset, with `val_ds` passed as the
# validation data.
model.fit(x=train_ds, epochs=epochs, validation_data=val_ds)

"""模型評估"""  # Section marker: model evaluation.
print('evalutate:')
test_metrics = model.evaluate(test_ds)
print(test_metrics)
# Build a text-only view of the training data (labels dropped) and fit the
# vectorization layer's vocabulary on it.
# NOTE(review): the same two statements already appear earlier in this file;
# this looks like a leftover repeat of the adapt step — confirm it is needed.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)