(word-encodings=)
Encoding Text#
Download Movie Reviews Dataset#
# uncomment the following lines to download and extract the IMDB dataset
#!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#!tar -xf aclImdb_v1.tar.gz
#!rm -r aclImdb/train/unsup
import os, pathlib, shutil, random
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
os.makedirs(val_dir / category)
files = os.listdir(train_dir / category)
random.Random(1337).shuffle(files)
num_val_samples = int(0.2 * len(files))
val_files = files[-num_val_samples:]
for fname in val_files:
shutil.move(train_dir / category / fname, val_dir / category / fname)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[2], line 9
7 for category in ("neg", "pos"):
8 os.makedirs(val_dir / category)
----> 9 files = os.listdir(train_dir / category)
10 random.Random(1337).shuffle(files)
11 num_val_samples = int(0.2 * len(files))
FileNotFoundError: [Errno 2] No such file or directory: 'aclImdb/train/neg'
from tensorflow import keras

# Reviews per batch for every split.
batch_size = 32

# Build tf.data pipelines from the on-disk layout; the "neg"/"pos"
# subdirectory names become integer class labels (0/1).
train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size)
Found 22500 files belonging to 2 classes.
Found 2500 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
# Peek at a single raw batch: inputs are variable-length byte strings,
# targets are integer labels.
inputs, targets = next(iter(train_ds))
print("inputs.shape:", inputs.shape)
print("inputs.dtype:", inputs.dtype)
print("targets.shape:", targets.shape)
print("targets.dtype:", targets.dtype)
print("inputs[0]:", inputs[0])
print("targets[0]:", targets[0])
inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'While "Santa Claus Conquers the Martians" is usually cited as one of the worse films ever made, this Mexican-made film from 1959 is so bad it makes "SCCM" look like "It\'s a Wonderful Life." You have to wonder what the people who made this film were thinking; perhaps they meant it as a third-world allegory about capitalist greed and conspicuous consumption. Nah . . . They just weren\'t very good. The same production company made an even more disturbing version of "Little Red Riding Hood" in which the wolf\'s obsession with our heroine has unmistakable hints of pedophilia. (Perhaps this was the inspiration for "Freeway.") Back to "Santa Claus": instead of the North Pole, Jolly Old Saint Nicholas resides in a satellite in geosynchronous earth orbit (shades of "MST3K"); instead of elves his toys are made by children chosen from around the world; and he had sophisticated spy equipment to check just which kids are naughty and nice. The result is like an Orwellian outer space sweat shop. It\'s enough to turn you off Christmas forever. This and other low-rent Mexican children\'s\' films were dubbed in English and widely distributed in the U.S. in the early 1960s; no wonder the sixties became such a turbulent period in American history. The baby boomers who were forced to endure these "family" films as children would be all too eager to turn revolutionary.', shape=(), dtype=string)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)
Encoding Text - Bag of Words#
from tensorflow.keras.layers import TextVectorization

# Multi-hot bag-of-words over the 20,000 most frequent unigrams.
text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",
)

# The vocabulary is learned from raw text only — strip the labels first.
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

def _vectorize_1gram(x, y):
    # Map each raw-text batch to its multi-hot encoding; labels pass through.
    return text_vectorization(x), y

binary_1gram_train_ds = train_ds.map(_vectorize_1gram, num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(_vectorize_1gram, num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(_vectorize_1gram, num_parallel_calls=4)
# Same inspection as above, now on the vectorized pipeline: each review is a
# fixed-size float32 multi-hot vector of length 20,000.
inputs, targets = next(iter(binary_1gram_train_ds))
print("inputs.shape:", inputs.shape)
print("inputs.dtype:", inputs.dtype)
print("targets.shape:", targets.shape)
print("targets.dtype:", targets.dtype)
print("inputs[0]:", inputs[0])
print("targets[0]:", targets[0])
inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a small MLP binary classifier.

    Input is a `max_tokens`-dimensional vector (multi-hot or tf-idf);
    output is a single sigmoid probability. Compiled with rmsprop and
    binary cross-entropy.
    """
    inputs = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(inputs)
    regularized = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(regularized)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model
model = get_model()
model.summary()

# Checkpoint only the best epoch (lowest val_loss), weights only.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "binary_1gram.weights.h5",
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
]

# --------------------------------------------------
# 3. Fit
# --------------------------------------------------
# .cache() runs the (expensive) vectorization once, then reuses the
# in-memory batches for the remaining epochs.
model.fit(
    binary_1gram_train_ds.cache(),
    validation_data=binary_1gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# --------------------------------------------------
# 4. Reload the best weights into an *identical* model
# --------------------------------------------------
best_model = keras.models.clone_model(model)  # copies architecture only, not weights
best_model.compile(
    # Use "rmsprop" to stay consistent with get_model()'s training config
    # (was "adam"; the optimizer is irrelevant to evaluate(), but the
    # mismatch was misleading).
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
best_model.load_weights("binary_1gram.weights.h5")  # restore best-epoch weights

# Evaluate the restored best model on the held-out test split.
test_loss, test_acc = best_model.evaluate(binary_1gram_test_ds)
print(f"Test acc: {test_acc:.3f}")
Model: "model_8"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_9 (InputLayer) [(None, 20000)] 0
dense_15 (Dense) (None, 16) 320016
dropout_8 (Dropout) (None, 16) 0
dense_16 (Dense) (None, 1) 17
=================================================================
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
678/704 [===========================>..] - ETA: 0s - loss: 0.3946 - accuracy: 0.8347
Epoch 1: val_loss improved from inf to 0.34251, saving model to binary_1gram.weights.h5
704/704 [==============================] - 2s 2ms/step - loss: 0.3926 - accuracy: 0.8362 - val_loss: 0.3425 - val_accuracy: 0.8616
Epoch 2/10
678/704 [===========================>..] - ETA: 0s - loss: 0.2724 - accuracy: 0.8993
Epoch 2: val_loss improved from 0.34251 to 0.31518, saving model to binary_1gram.weights.h5
704/704 [==============================] - 1s 2ms/step - loss: 0.2727 - accuracy: 0.8995 - val_loss: 0.3152 - val_accuracy: 0.8752
Epoch 3/10
701/704 [============================>.] - ETA: 0s - loss: 0.2479 - accuracy: 0.9125
Epoch 3: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 2ms/step - loss: 0.2479 - accuracy: 0.9123 - val_loss: 0.3394 - val_accuracy: 0.8748
Epoch 4/10
681/704 [============================>.] - ETA: 0s - loss: 0.2393 - accuracy: 0.9184
Epoch 4: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 1ms/step - loss: 0.2392 - accuracy: 0.9186 - val_loss: 0.3386 - val_accuracy: 0.8736
Epoch 5/10
675/704 [===========================>..] - ETA: 0s - loss: 0.2339 - accuracy: 0.9240
Epoch 5: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 1ms/step - loss: 0.2342 - accuracy: 0.9238 - val_loss: 0.3817 - val_accuracy: 0.8508
Epoch 6/10
702/704 [============================>.] - ETA: 0s - loss: 0.2284 - accuracy: 0.9250
Epoch 6: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 1ms/step - loss: 0.2284 - accuracy: 0.9251 - val_loss: 0.3867 - val_accuracy: 0.8540
Epoch 7/10
689/704 [============================>.] - ETA: 0s - loss: 0.2271 - accuracy: 0.9281
Epoch 7: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 1ms/step - loss: 0.2276 - accuracy: 0.9279 - val_loss: 0.3691 - val_accuracy: 0.8580
Epoch 8/10
673/704 [===========================>..] - ETA: 0s - loss: 0.2321 - accuracy: 0.9272
Epoch 8: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 1ms/step - loss: 0.2318 - accuracy: 0.9276 - val_loss: 0.3486 - val_accuracy: 0.8752
Epoch 9/10
674/704 [===========================>..] - ETA: 0s - loss: 0.2208 - accuracy: 0.9325
Epoch 9: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 1ms/step - loss: 0.2218 - accuracy: 0.9325 - val_loss: 0.3940 - val_accuracy: 0.8488
Epoch 10/10
693/704 [============================>.] - ETA: 0s - loss: 0.2232 - accuracy: 0.9332
Epoch 10: val_loss did not improve from 0.31518
704/704 [==============================] - 1s 2ms/step - loss: 0.2233 - accuracy: 0.9329 - val_loss: 0.4039 - val_accuracy: 0.8384
782/782 [==============================] - 1s 1ms/step - loss: 0.2962 - accuracy: 0.8849
Test acc: 0.885
Bigram Encoding#
# Same bag-of-words encoding as before, but the vocabulary now also
# includes bigrams (ngrams=2), capturing local word order.
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)
text_vectorization.adapt(text_only_train_ds)

def _vectorize_2gram(x, y):
    # Encode each raw-text batch as a multi-hot bigram vector; labels unchanged.
    return text_vectorization(x), y

binary_2gram_train_ds = train_ds.map(_vectorize_2gram, num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(_vectorize_2gram, num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(_vectorize_2gram, num_parallel_calls=4)
model = get_model()
model.summary()

# Checkpoint only the best epoch (lowest val_loss), weights only.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "binary_2gram.weights.h5",
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
]

# --------------------------------------------------
# 3. Fit
# --------------------------------------------------
# .cache() vectorizes once, then serves in-memory batches every epoch.
model.fit(
    binary_2gram_train_ds.cache(),
    validation_data=binary_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# --------------------------------------------------
# 4. Reload the best weights into an *identical* model
# --------------------------------------------------
best_model = keras.models.clone_model(model)  # copies architecture only, not weights
best_model.compile(
    # "rmsprop" for consistency with get_model()'s training config
    # (was "adam"; harmless for evaluate(), but misleading).
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
best_model.load_weights("binary_2gram.weights.h5")  # restore best-epoch weights

# Evaluate the restored best model on the held-out test split.
test_loss, test_acc = best_model.evaluate(binary_2gram_test_ds)
print(f"Test acc: {test_acc:.3f}")
Model: "model_9"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_10 (InputLayer) [(None, 20000)] 0
dense_17 (Dense) (None, 16) 320016
dropout_9 (Dropout) (None, 16) 0
dense_18 (Dense) (None, 1) 17
=================================================================
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
676/704 [===========================>..] - ETA: 0s - loss: 0.3785 - accuracy: 0.8450
Epoch 1: val_loss improved from inf to 0.27960, saving model to binary_2gram.weights.h5
704/704 [==============================] - 2s 2ms/step - loss: 0.3765 - accuracy: 0.8460 - val_loss: 0.2796 - val_accuracy: 0.8964
Epoch 2/10
703/704 [============================>.] - ETA: 0s - loss: 0.2458 - accuracy: 0.9152
Epoch 2: val_loss improved from 0.27960 to 0.27515, saving model to binary_2gram.weights.h5
704/704 [==============================] - 1s 2ms/step - loss: 0.2458 - accuracy: 0.9152 - val_loss: 0.2751 - val_accuracy: 0.8940
Epoch 3/10
687/704 [============================>.] - ETA: 0s - loss: 0.2149 - accuracy: 0.9312
Epoch 3: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.2141 - accuracy: 0.9310 - val_loss: 0.2782 - val_accuracy: 0.8984
Epoch 4/10
687/704 [============================>.] - ETA: 0s - loss: 0.2002 - accuracy: 0.9368
Epoch 4: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1994 - accuracy: 0.9369 - val_loss: 0.3293 - val_accuracy: 0.8780
Epoch 5/10
686/704 [============================>.] - ETA: 0s - loss: 0.1919 - accuracy: 0.9427
Epoch 5: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1912 - accuracy: 0.9428 - val_loss: 0.2993 - val_accuracy: 0.8912
Epoch 6/10
689/704 [============================>.] - ETA: 0s - loss: 0.1911 - accuracy: 0.9443
Epoch 6: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1906 - accuracy: 0.9442 - val_loss: 0.3911 - val_accuracy: 0.8548
Epoch 7/10
699/704 [============================>.] - ETA: 0s - loss: 0.1905 - accuracy: 0.9447
Epoch 7: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1908 - accuracy: 0.9444 - val_loss: 0.3444 - val_accuracy: 0.8704
Epoch 8/10
672/704 [===========================>..] - ETA: 0s - loss: 0.1870 - accuracy: 0.9464
Epoch 8: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1865 - accuracy: 0.9460 - val_loss: 0.3417 - val_accuracy: 0.8692
Epoch 9/10
676/704 [===========================>..] - ETA: 0s - loss: 0.1831 - accuracy: 0.9481
Epoch 9: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1816 - accuracy: 0.9478 - val_loss: 0.3404 - val_accuracy: 0.8648
Epoch 10/10
688/704 [============================>.] - ETA: 0s - loss: 0.1892 - accuracy: 0.9481
Epoch 10: val_loss did not improve from 0.27515
704/704 [==============================] - 1s 2ms/step - loss: 0.1881 - accuracy: 0.9481 - val_loss: 0.3599 - val_accuracy: 0.8524
782/782 [==============================] - 1s 1ms/step - loss: 0.2694 - accuracy: 0.9012
Test acc: 0.901
# Bigram vocabulary again, but entries are TF-IDF weighted counts
# instead of binary presence flags.
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf",
)
text_vectorization.adapt(text_only_train_ds)

def _vectorize_tfidf(x, y):
    # Encode each raw-text batch as a TF-IDF bigram vector; labels unchanged.
    return text_vectorization(x), y

tfidf_2gram_train_ds = train_ds.map(_vectorize_tfidf, num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(_vectorize_tfidf, num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(_vectorize_tfidf, num_parallel_calls=4)
model = get_model()
model.summary()

# Checkpoint only the best epoch (lowest val_loss), weights only.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "tfidf_2gram.weights.h5",
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
]

# --------------------------------------------------
# 3. Fit
# --------------------------------------------------
# .cache() vectorizes once, then serves in-memory batches every epoch.
model.fit(
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# --------------------------------------------------
# 4. Reload the best weights into an *identical* model
# --------------------------------------------------
best_model = keras.models.clone_model(model)  # copies architecture only, not weights
best_model.compile(
    # "rmsprop" for consistency with get_model()'s training config
    # (was "adam"; harmless for evaluate(), but misleading).
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
best_model.load_weights("tfidf_2gram.weights.h5")  # restore best-epoch weights

# Evaluate the restored best model on the held-out test split.
test_loss, test_acc = best_model.evaluate(tfidf_2gram_test_ds)
print(f"Test acc: {test_acc:.3f}")
Model: "model_10"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_11 (InputLayer) [(None, 20000)] 0
dense_19 (Dense) (None, 16) 320016
dropout_10 (Dropout) (None, 16) 0
dense_20 (Dense) (None, 1) 17
=================================================================
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
703/704 [============================>.] - ETA: 0s - loss: 0.4872 - accuracy: 0.7948
Epoch 1: val_loss improved from inf to 0.25396, saving model to tfidf_2gram.weights.h5
704/704 [==============================] - 2s 2ms/step - loss: 0.4872 - accuracy: 0.7948 - val_loss: 0.2540 - val_accuracy: 0.9292
Epoch 2/10
696/704 [============================>.] - ETA: 0s - loss: 0.3318 - accuracy: 0.8618
Epoch 2: val_loss improved from 0.25396 to 0.23088, saving model to tfidf_2gram.weights.h5
704/704 [==============================] - 1s 2ms/step - loss: 0.3311 - accuracy: 0.8623 - val_loss: 0.2309 - val_accuracy: 0.9356
Epoch 3/10
678/704 [===========================>..] - ETA: 0s - loss: 0.3021 - accuracy: 0.8740
Epoch 3: val_loss improved from 0.23088 to 0.22145, saving model to tfidf_2gram.weights.h5
704/704 [==============================] - 1s 2ms/step - loss: 0.3038 - accuracy: 0.8731 - val_loss: 0.2214 - val_accuracy: 0.9580
Epoch 4/10
685/704 [============================>.] - ETA: 0s - loss: 0.2775 - accuracy: 0.8860
Epoch 4: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2775 - accuracy: 0.8854 - val_loss: 0.2587 - val_accuracy: 0.8944
Epoch 5/10
681/704 [============================>.] - ETA: 0s - loss: 0.2542 - accuracy: 0.8970
Epoch 5: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2537 - accuracy: 0.8963 - val_loss: 0.3549 - val_accuracy: 0.8544
Epoch 6/10
677/704 [===========================>..] - ETA: 0s - loss: 0.2467 - accuracy: 0.9000
Epoch 6: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2463 - accuracy: 0.8994 - val_loss: 0.3698 - val_accuracy: 0.8564
Epoch 7/10
688/704 [============================>.] - ETA: 0s - loss: 0.2355 - accuracy: 0.9038
Epoch 7: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2352 - accuracy: 0.9035 - val_loss: 0.3385 - val_accuracy: 0.8724
Epoch 8/10
680/704 [===========================>..] - ETA: 0s - loss: 0.2304 - accuracy: 0.9051
Epoch 8: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2308 - accuracy: 0.9042 - val_loss: 0.4323 - val_accuracy: 0.8120
Epoch 9/10
687/704 [============================>.] - ETA: 0s - loss: 0.2299 - accuracy: 0.9062
Epoch 9: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2294 - accuracy: 0.9060 - val_loss: 0.3392 - val_accuracy: 0.8636
Epoch 10/10
703/704 [============================>.] - ETA: 0s - loss: 0.2314 - accuracy: 0.9046
Epoch 10: val_loss did not improve from 0.22145
704/704 [==============================] - 1s 2ms/step - loss: 0.2314 - accuracy: 0.9046 - val_loss: 0.3469 - val_accuracy: 0.8492
782/782 [==============================] - 1s 1ms/step - loss: 0.3344 - accuracy: 0.8503
Test acc: 0.850
from tensorflow.keras import layers

max_length = 600    # every review is padded/truncated to 600 tokens
max_tokens = 20000  # vocabulary size

# Integer-sequence encoding: each review becomes a fixed-length vector of
# token indices (order-preserving), ready for embedding/sequence models.
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
text_vectorization.adapt(text_only_train_ds)

def _to_int_sequences(x, y):
    # Map raw text to padded integer token sequences; labels pass through.
    return text_vectorization(x), y

int_train_ds = train_ds.map(_to_int_sequences, num_parallel_calls=4)
int_val_ds = val_ds.map(_to_int_sequences, num_parallel_calls=4)
int_test_ds = test_ds.map(_to_int_sequences, num_parallel_calls=4)