Encoding Text - Using Predefined Word Embeddings

Contents

Encoding Text - Using Predefined Word Embeddings#

Download Word Embeddings#

In this notebook, we will download the GloVe word embeddings and use them to encode text.

# uncomment to download GloVe embeddings
#!curl -L -o glove.6B.zip https://nlp.stanford.edu/data/glove.6B.zip 
#!unzip -q glove.6B.zip

# assuming you have the GloVe embeddings in the same directory
# reading GloVe embeddings
import numpy as np 

# Plain-text GloVe file: each line is a token followed by its
# space-separated vector components.
path_to_glove_file = "glove.6B.100d.txt"

# token -> 1-D float32 NumPy array holding that token's vector
embeddings_index = {}

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's default encoding, which can raise UnicodeDecodeError (or
# mis-decode tokens) on systems whose default is not UTF-8.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # split off the token; the remainder of the line is the vector
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 9
      5 path_to_glove_file = "glove.6B.100d.txt"
      7 embeddings_index = {} 
----> 9 with open(path_to_glove_file) as f:
     10     for line in f:
     11         word, coefs = line.split(maxsplit=1)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/IPython/core/interactiveshell.py:326, in _modified_open(file, *args, **kwargs)
    319 if file in {0, 1, 2}:
    320     raise ValueError(
    321         f"IPython won't let you open fd={file} by default "
    322         "as it is likely to crash IPython. If you know what you are doing, "
    323         "you can use builtins' open."
    324     )
--> 326 return io_open(file, *args, **kwargs)

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.100d.txt'

# this code will not work unless you have the IMDB movie reviews in
# the directories imdb_raw_reviews/train, imdb_raw_reviews/val,
# and imdb_raw_reviews/test

from tensorflow.keras import layers
from tensorflow import keras

# ------------------------------------------------------------
# settings shared by the dataset readers and the vectoriser
# ------------------------------------------------------------
batch_size = 32
max_length = 600      # used as output_sequence_length below
max_tokens = 20_000   # used as the vectoriser's max_tokens below

text_vectorization = layers.TextVectorization(
    output_mode="int",
    output_sequence_length=max_length,
    max_tokens=max_tokens,
)


def _load_split(directory):
    """Read one labelled raw-text dataset from `directory`, batched by batch_size."""
    return keras.utils.text_dataset_from_directory(
        directory, batch_size=batch_size
    )


def _drop_label(text, label):
    """Keep only the text component of a (text, label) element."""
    return text


def _to_int_sequences(text, label):
    """Run the text through the vectoriser; pass the label through unchanged."""
    return text_vectorization(text), label


# ------------------------------------------------------------
# labelled raw-text datasets, one per split
# ------------------------------------------------------------
train_ds = _load_split("imdb_raw_reviews/train")
val_ds = _load_split("imdb_raw_reviews/val")
test_ds = _load_split("imdb_raw_reviews/test")

# ------------------------------------------------------------
# fit the vectoriser's vocabulary on the training text (labels removed)
# ------------------------------------------------------------
text_only_ds = train_ds.map(_drop_label)
text_vectorization.adapt(text_only_ds)

# optional: peek at the first vocabulary entries
vocabulary = text_vectorization.get_vocabulary()
print("Top 10 tokens:", vocabulary[:10])

# ------------------------------------------------------------
# convert every split to (integer sequence, label) pairs
# ------------------------------------------------------------
int_train_ds = train_ds.map(_to_int_sequences, num_parallel_calls=4)
int_val_ds = val_ds.map(_to_int_sequences, num_parallel_calls=4)
int_test_ds = test_ds.map(_to_int_sequences, num_parallel_calls=4)

Found 22500 files belonging to 2 classes.
Found 2500 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
Top 10 tokens: ['', '[UNK]', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'it']

# build the matrix whose row i holds the GloVe vector of vocabulary token i
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
# token -> its position in the vocabulary list
word_index = {token: position for position, token in enumerate(vocabulary)}

# every row starts as zeros; a row stays zero when embeddings_index
# has no vector for the corresponding token
embedding_matrix = np.zeros((max_tokens, embedding_dim))

for token, row in word_index.items():
    if row >= max_tokens:
        continue
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

# use the GloVe embeddings to learn sentiment analysis for the IMDB dataset
# 

# the embedding layer will be initialized with the GloVe embeddings
# the embedding layer will not be trainable (i.e. frozen)
import tensorflow as tf 

# Embedding layer whose weights start as embedding_matrix and are excluded
# from training (trainable=False); mask_zero=True is kept from the original.
glove_initializer = keras.initializers.Constant(embedding_matrix)
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=glove_initializer,
    trainable=False,
    mask_zero=True,
)


# ----- model definition -------------------------------------------------
# integer token ids -> frozen GloVe embedding -> bidirectional LSTM
# -> dropout -> single sigmoid unit
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
recurrent_layer = layers.LSTM(32)
x = layers.Bidirectional(recurrent_layer)(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)
model.summary()

# ----- weights-only checkpoint ------------------------------------------
# NOTE(review): recent Keras releases appear to require the `.weights.h5`
# suffix when save_weights_only=True -- confirm for the installed version.
best_weights_path = "glove_seq_best.weights.h5"
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=best_weights_path,
    monitor="val_accuracy",     # keep the epoch with the highest val accuracy
    save_best_only=True,
    save_weights_only=True,     # original note: avoids the native .keras saver bug
)

# ----- training ---------------------------------------------------------
model.fit(
    int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
    callbacks=[checkpoint],
)

# ----- reload the best checkpoint & evaluate ----------------------------
model.load_weights(best_weights_path)
# evaluate() yields [loss, accuracy] for the metrics compiled above
test_loss, test_acc = model.evaluate(int_test_ds, verbose=0)
print(f"Test acc: {test_acc:.3f}")

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 64)                34048     
 onal)                                                           
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
=================================================================
Total params: 2034113 (7.76 MB)
Trainable params: 34113 (133.25 KB)
Non-trainable params: 2000000 (7.63 MB)
_________________________________________________________________
Epoch 1/10
704/704 [==============================] - 87s 120ms/step - loss: 0.5733 - accuracy: 0.6987 - val_loss: 0.9806 - val_accuracy: 0.4596
Epoch 2/10
704/704 [==============================] - 80s 114ms/step - loss: 0.4490 - accuracy: 0.7931 - val_loss: 0.5843 - val_accuracy: 0.7004
Epoch 3/10
704/704 [==============================] - 73s 104ms/step - loss: 0.3947 - accuracy: 0.8269 - val_loss: 0.6973 - val_accuracy: 0.6260
Epoch 4/10
704/704 [==============================] - 82s 117ms/step - loss: 0.3658 - accuracy: 0.8416 - val_loss: 0.0864 - val_accuracy: 0.9760
Epoch 5/10
704/704 [==============================] - 74s 106ms/step - loss: 0.3430 - accuracy: 0.8524 - val_loss: 0.3594 - val_accuracy: 0.8452
Epoch 6/10
704/704 [==============================] - 74s 105ms/step - loss: 0.3216 - accuracy: 0.8607 - val_loss: 0.2895 - val_accuracy: 0.8860
Epoch 7/10
704/704 [==============================] - 74s 106ms/step - loss: 0.3050 - accuracy: 0.8723 - val_loss: 0.3155 - val_accuracy: 0.8688
Epoch 8/10
704/704 [==============================] - 73s 104ms/step - loss: 0.2896 - accuracy: 0.8815 - val_loss: 0.2923 - val_accuracy: 0.8812
Epoch 9/10
704/704 [==============================] - 77s 110ms/step - loss: 0.2721 - accuracy: 0.8875 - val_loss: 0.3105 - val_accuracy: 0.8708
Epoch 10/10
704/704 [==============================] - 79s 112ms/step - loss: 0.2634 - accuracy: 0.8935 - val_loss: 0.7338 - val_accuracy: 0.6680
Test acc: 0.754