# Fine-Tuning GPT

# ── 0.  imports & environment ──────────────────────────────────────
import math
import os
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow.keras import mixed_precision
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel, create_optimizer

# float16 compute / float32 variables: faster on tensor-core GPUs.
# NOTE(review): the log below shows no GPU was found, so this policy
# gives no benefit on this machine — confirm before keeping it.
mixed_precision.set_global_policy("mixed_float16")
print(tf.config.list_physical_devices("GPU"))   # sanity-check GPU visibility
2025-05-08 15:04:27.527837: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-08 15:04:27.531050: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-08 15:04:27.539624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1746716667.553369   54080 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746716667.557476   54080 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746716667.568828   54080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746716667.568838   54080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746716667.568840   54080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746716667.568841   54080 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
2025-05-08 15:04:27.572822: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 3
      1 import tensorflow as tf, math, numpy as np
      2 from tensorflow.keras import mixed_precision
----> 3 from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel, create_optimizer
      4 from pathlib import Path
      6 mixed_precision.set_global_policy("mixed_float16")   # GPU speed-up

ModuleNotFoundError: No module named 'transformers'
# ── 1.  tokenizer with course-specific special tokens ──────────────
SPECIAL = ["<|question|>", "<|answer|>", "<|statement|>", "<|end|>"]
tok = GPT2TokenizerFast.from_pretrained("gpt2")
tok.pad_token = tok.eos_token                     # GPT-2 ships without a pad token
tok.add_special_tokens({"additional_special_tokens": SPECIAL})

# ── 2.  read the corpus and pre-tokenise in one call ───────────────
BLOCK = 512                                       # truncation length (GPT-2 max context is 1024)
txt_path = "data/clean_corpus.txt"
# explicit encoding: read_text() otherwise uses the platform default
# (e.g. cp1252 on Windows), which can corrupt the corpus silently
lines = Path(txt_path).read_text(encoding="utf-8").splitlines()   # ≈ 1 line = 1 sample

enc = tok(lines,
          truncation=True,
          max_length=BLOCK,
          padding="max_length",                   # right-pad every sample to BLOCK tokens
          return_tensors="np")                    # NumPy arrays, fed to tf.data below

input_ids      = enc["input_ids"]                 # shape (N, BLOCK)
attention_mask = enc["attention_mask"]            # 1 = real token, 0 = padding

# ── 3.  wrap the arrays in tf.data  ────────────────────────────────
def as_ds(arr):
    """Slice a 2-D NumPy array into a per-row tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices(arr)

ds_ids  = as_ds(input_ids)
ds_mask = as_ds(attention_mask)

def to_features(ids, mask):
    """Build the model-input dict for one sample.

    Pad positions get label -100 so the Hugging Face LM loss ignores
    them. Because pad_token == eos_token here, using the raw ids as
    labels would make the model spend most of its loss budget on
    predicting padding.
    """
    ignore = tf.fill(tf.shape(ids), tf.constant(-100, dtype=ids.dtype))
    labels = tf.where(tf.equal(mask, 1), ids, ignore)
    return {"input_ids": ids, "attention_mask": mask, "labels": labels}

dataset = tf.data.Dataset.zip((ds_ids, ds_mask)).map(
    to_features, num_parallel_calls=tf.data.AUTOTUNE)

# ── 4.  train / valid split, shuffle, batch ────────────────────────
SIZE   = int(dataset.cardinality().numpy())   # Dataset.cardinality() replaces the deprecated experimental API
split  = int(0.95 * SIZE)                     # 95 % train / 5 % validation

train_ds = (dataset.take(split)
                   .shuffle(10_000)
                   .batch(8, drop_remainder=True)
                   .prefetch(tf.data.AUTOTUNE))

valid_ds = (dataset.skip(split)
                   .batch(8, drop_remainder=True)
                   .prefetch(tf.data.AUTOTUNE))

# ── 5.  build & compile the model ──────────────────────────────────
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
# **critical**: the tokenizer gained 4 special tokens above, so the
# embedding matrix must grow to match or those ids index out of range
model.resize_token_embeddings(len(tok))

EPOCHS  = 3
# Dataset.cardinality() replaces the deprecated tf.data.experimental API
STEPS   = int(train_ds.cardinality().numpy()) * EPOCHS   # total optimizer steps
WARMUP  = int(0.1 * STEPS)                               # 10 % linear warm-up

# AdamW with warm-up + linear decay — the standard HF fine-tuning schedule
opt, lr_schedule = create_optimizer(
        init_lr=5e-5,
        num_train_steps=STEPS,
        num_warmup_steps=WARMUP,
        weight_decay_rate=0.01)

model.compile(optimizer=opt)   # no loss arg: HF TF models compute their own LM loss

# ── 6.  train ──────────────────────────────────────────────────────
# NOTE: an EarlyStopping(monitor="val_loss", patience=2,
# restore_best_weights=True) callback was considered but is
# intentionally left off for a fixed 3-epoch run.
history = model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=EPOCHS,
)

final_val_loss = history.history["val_loss"][-1]
print("final validation perplexity:", round(math.exp(final_val_loss), 2))

# ── 7.  save checkpoint ────────────────────────────────────────────
SAVE_DIR = "phpe400_finetuned"
model.save_pretrained(SAVE_DIR)
tok.save_pretrained(SAVE_DIR)
print("✓ saved to", SAVE_DIR)
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Epoch 1/3
2025-05-06 21:44:18.381807: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2798/2798 [==============================] - 2408s 856ms/step - loss: 9.9474 - val_loss: 6.8138
Epoch 2/3
2798/2798 [==============================] - 2520s 901ms/step - loss: 5.3886 - val_loss: 3.8185
Epoch 3/3
2798/2798 [==============================] - 2501s 894ms/step - loss: 2.4547 - val_loss: 1.4297
final validation perplexity: 4.18
✓ saved to phpe400_finetuned
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel

# reload the fine-tuned tokenizer + weights from the saved checkpoint
CKPT_DIR = "phpe400_finetuned"
tok   = GPT2TokenizerFast.from_pretrained(CKPT_DIR)
model = TFGPT2LMHeadModel.from_pretrained(CKPT_DIR)
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at phpe400_finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
# ── inference: sample an answer from the fine-tuned model ──────────
prompt = "<|question|> What is a rational preference? <|answer|> "
inputs = tok(prompt, return_tensors="tf")

# stop at the custom end-marker added during fine-tuning; reuse it as
# the pad id so generate() does not warn about a missing pad token
eos_id = tok.convert_tokens_to_ids("<|end|>")
generated = model.generate(
    **inputs,
    max_new_tokens=240,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,          # nucleus sampling rather than greedy decoding
    eos_token_id=eos_id,
    pad_token_id=eos_id,
)

print(tok.decode(generated[0], skip_special_tokens=True))
 What is a rational preference?  ��� in���� the�U����anceteness���et�