Fine-Tuning: Pre-Processing the Data#
Convert Slides (LaTeX) to Raw Text#
from pathlib import Path
import re

import pypandoc

input_dir = 'phpe400_corpus/slides'
output_file = 'extracted/raw_slides.txt'

def tex_to_plain(tex_path):
    txt = pypandoc.convert_file(
        str(tex_path),
        to="plain",
        format="latex",
        extra_args=["--wrap=none", "--quiet"])
    return re.sub(r'\n\s*\n+', '\n', txt)   # collapse blank lines

Path(output_file).parent.mkdir(parents=True, exist_ok=True)   # make sure extracted/ exists
with Path(output_file).open("a", encoding="utf-8") as out:
    for tex in Path(input_dir).rglob("*.tex"):
        print(f"Processing {tex}")
        out.write(tex_to_plain(tex) + "\n")

print(f"\nOutput written to {output_file}")
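pypandoc wraps the external pandoc executable, so both the Python package and the binary must be available before this cell will run. A minimal setup sketch (an assumption, not part of the original notebook; pypandoc_binary is the PyPI wheel that bundles a pandoc binary):

import importlib.util, subprocess, sys

# Hypothetical setup cell: install pypandoc together with a bundled pandoc binary
if importlib.util.find_spec("pypandoc") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pypandoc_binary"])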
Convert Syllabus (PDF) to Raw Text#
from pathlib import Path
import re

from pypdf import PdfReader

SRC = "phpe400_corpus/syllabus/syl-methods-ppe-v4.pdf"
DEST = "extracted/raw_syllabus.txt"   # append or create

def pdf_to_plain(path):
    reader = PdfReader(path)
    lines = []
    for page in reader.pages:
        txt = page.extract_text() or ""
        txt = txt.replace("\u200b", "")   # zero-width spaces
        # strip page footers like “5 / 8”
        txt = re.sub(r'\b\d+\s*/\s*\d+\s*$', '', txt, flags=re.M)
        lines.extend(txt.splitlines())
    # collapse runs of blank lines down to single newlines
    cleaned = "\n".join(line.rstrip() for line in lines)
    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned)
    return cleaned.strip()

print(f"Processing {SRC}")
plain = pdf_to_plain(SRC)

# stream into the master corpus file
with Path(DEST).open("a", encoding="utf-8") as out:
    for para in plain.split("\n"):
        if para.strip():
            out.write(para.strip() + "\n")

print(f"\nOutput written to {DEST}")
Processing phpe400_corpus/syllabus/syl-methods-ppe-v4.pdf
Output written to extracted/raw_syllabus.txt
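A quick sanity check on the extracted syllabus (a sketch, assuming the cell above ran and wrote the file):

from pathlib import Path

preview = Path("extracted/raw_syllabus.txt").read_text(encoding="utf-8").splitlines()
print("\n".join(preview[:5]))   # eyeball the first five extracted lines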
Convert Review Sheets (LaTeX) to Raw Text#
from pathlib import Path
import re

import pypandoc

input_dir = 'phpe400_corpus/review-sheets'
output_file = 'extracted/raw_review-sheets.txt'

def tex_to_plain(tex_path):
    txt = pypandoc.convert_file(
        str(tex_path),
        to="plain",
        format="latex",
        extra_args=["--wrap=none", "--quiet"])
    return re.sub(r'\n\s*\n+', '\n', txt)   # collapse blank lines

with Path(output_file).open("a", encoding="utf-8") as out:
    for tex in Path(input_dir).rglob("*.tex"):
        print(f"Processing {tex}")
        out.write(tex_to_plain(tex) + "\n")

print(f"\nOutput written to {output_file}")
Processing phpe400_corpus/review-sheets/exam1-review-answers.tex
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing phpe400_corpus/review-sheets/final-exam-review.tex
Processing phpe400_corpus/review-sheets/exam1-review.tex
Processing phpe400_corpus/review-sheets/final-exam-review-answers.tex
Output written to extracted/raw_review-sheets.txt
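The interleaved huggingface/tokenizers warning (shown once above; it repeats for every conversion) is emitted whenever the process forks, as pypandoc does when it shells out to pandoc, after a Hugging Face tokenizer has already been used in the session. It is harmless for this extraction; setting the advertised environment variable before any forking, e.g. at the top of the notebook, silences it:

import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"   # silence HF tokenizers fork warnings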
Convert Chapters (PDF) to Raw Text#
from pathlib import Path
import re

from pypdf import PdfReader

input_dir = 'phpe400_corpus/text'          # folder with the three textbook PDFs
output_file = 'extracted/raw_text.txt'

# --------------------------- 1. header & noise patterns --------------------------
HEADER_RE = re.compile(
    r'^[ \t]*(?:Social Choice Theory|Game Theory|Rationality and Utility Theory)'
    r'(?:[ \t]+\d+)?[ \t]*$',   # optional page number
    flags=re.M
)
# Lone page numbers like “55”
LONE_PAGE_RE = re.compile(r'^[ \t]*\d{1,3}[ \t]*$', flags=re.M)
# All-caps chapter banners such as “FOUR” or “PART II”
CAPS_BANNER_RE = re.compile(r'^[ \t]*[A-Z]{2,}[ \t]*$', flags=re.M)
# Numbered section headings “4.1 …” or “2.3.5 …”
SECNUM_RE = re.compile(r'^\d+(?:\.\d+)+\s+.*$', flags=re.M)

# --------------------------- 2. tidy helper --------------------------------------
def tidy(text: str) -> str:
    text = HEADER_RE.sub('', text)
    text = LONE_PAGE_RE.sub('', text)
    text = CAPS_BANNER_RE.sub('', text)
    text = SECNUM_RE.sub('', text)
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)     # rejoin hard hyphen breaks
    text = re.sub(r'(\w+)\u00ad(\w+)', r'\1\2', text)  # remove soft hyphens
    text = re.sub(r'\n\s*\n+', '\n', text)             # collapse blank lines
    return text.strip()

# --------------------------- 3. PDF → plain text ----------------------------------
def pdf_to_plain(path: Path) -> str:
    reader = PdfReader(path)
    pages = []
    for pg in reader.pages:
        raw = pg.extract_text() or ""
        raw = raw.replace('\u200b', '')                            # zero-width chars
        raw = re.sub(r'\b\d+\s*/\s*\d+\s*$', '', raw, flags=re.M)  # “5 / 15” footers
        pages.append(tidy(raw))
    return "\n".join(pages)

# --------------------------- 4. batch over all PDFs ------------------------------
PDF_DIR = Path(input_dir)
DEST = Path(output_file)   # master output file

with DEST.open("a", encoding="utf-8") as out:
    for pdf in sorted(PDF_DIR.glob("*.pdf")):
        print(f"Processing {pdf.name}")
        for para in pdf_to_plain(pdf).split("\n"):
            if para.strip():
                out.write(para + "\n")

print("\nOutput written to", DEST.name)
Processing econ-analysis-moral-phil-public-policy-ch4.pdf
Processing econ-analysis-moral-philosophy-public-policy-ch13.pdf
Processing econ-analysis-moral-philosophy-public-policy-ch14.pdf
Output written to raw_text.txt
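The dehyphenation rule in tidy() can be spot-checked on a synthetic line break (made-up input):

import re

print(re.sub(r'(\w+)-\n(\w+)', r'\1\2', "social wel-\nfare"))   # -> social welfare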
Convert Piazza Comments/Answers (JSON) to Raw Text#
from pathlib import Path
import json, html, re
import warnings

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

PIAZZA = Path("phpe400_corpus/piazza/class_content_flat.json")   # flat export of the Piazza feed
OUT = Path("extracted/raw_piazza_qa.txt")

def clean(html_snippet: str) -> str:
    """Remove <tags>, decode entities, and collapse whitespace."""
    # 1. HTML → plain text
    text = BeautifulSoup(html_snippet, "html.parser").get_text(" ", strip=True)
    # 2. Decode entities such as &amp;
    text = html.unescape(text)
    # 3. Tighten spaces/newlines
    return re.sub(r'\s+', ' ', text).strip()

with PIAZZA.open(encoding="utf-8") as f:
    posts = json.load(f)

# Index every post by its Piazza ID so answers can be matched to questions
by_id = {p["id"]: p for p in posts}

qapairs = []
for p in posts:
    if p["type"] == "question":
        qtxt = f"Q: {clean(p['subject'])}\n{clean(p['content'])}"
        # grab the instructor answer (type == 'i_answer') in the same thread
        ans = next((child for child in by_id.values()
                    if child.get("parent_id") == p["id"]
                    and child["type"] == "i_answer"),
                   None)
        if ans:
            atxt = clean(ans["content"])
            qapairs.append(f"{qtxt}\nA: {atxt}")

OUT.write_text("\n\n".join(qapairs), encoding="utf-8")
print(f"Wrote {len(qapairs)} Q/A pairs to {OUT}")
Wrote 56 Q/A pairs to extracted/raw_piazza_qa.txt
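A quick spot-check of the pair format (a sketch, assuming the cell above ran and wrote at least one pair):

pairs = OUT.read_text(encoding="utf-8").split("\n\n")
print(pairs[0][:300])   # first Q/A pair, truncated for display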
Convert Online Notes (HTML) to Raw Text#
# === 1. one-time imports & (auto-)installs =========================
import sys, subprocess
import html, re
from pathlib import Path

# install missing packages via the running interpreter's pip
for pkg in ("beautifulsoup4", "html2text", "lxml"):
    if subprocess.call([sys.executable, "-m", "pip", "show", pkg],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL):
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

from bs4 import BeautifulSoup
import html2text

# === 2. configure the directory to scan ============================
HTML_DIR = Path("phpe400_corpus/notes")   # <── change if your html lives elsewhere
DEST = Path("extracted/raw_html_notes.txt")

# === 3. regex helpers ==============================================
SPAN_MATH  = re.compile(r'<span[^>]*class="math[^"]*"(?:[^>]*>)(.*?)</span>', re.S)
SCR_MATH   = re.compile(r'<script[^>]*type="math/tex[^"]*"(?:[^>]*>)(.*?)</script>', re.S)
IMG_TAG    = re.compile(r'<img[^>]*>', re.S)
LINE_JUNK  = re.compile(r'^(?:__+|\s*[*\-]\s*$|\s*\d+\.\s+\S.*)$')
SIDEBAR_KW = ("sidebar", "toc", "nav-page", "menu")

# === 4. utility functions ==========================================
def strip_outer(expr: str) -> str:
    """Drop \( … \) or \[ … \] delimiters and unescape doubled braces."""
    expr = expr.strip()
    if expr.startswith(r'\(') and expr.endswith(r'\)'):
        expr = expr[2:-2].strip()
    elif expr.startswith(r'\[') and expr.endswith(r'\]'):
        expr = expr[2:-2].strip()
    return re.sub(r'\\\\([{}])', r'\\\1', expr)   # \\{ -> \{ , \\} -> \}

def looks_sidebar(tag) -> bool:
    if tag.name in ("nav", "aside"):
        return True
    blob = " ".join([tag.get("id", ""), *tag.get("class", [])]).lower()
    return any(k in blob for k in SIDEBAR_KW)

h2t = html2text.HTML2Text()
h2t.ignore_links = True
h2t.body_width = 0

def html_to_plain(html_file: Path) -> str:
    soup = BeautifulSoup(html_file.read_text(errors="ignore"), "lxml")
    main = soup.find("main", id="quarto-content") or soup.find("main") or soup.body or soup
    soup = BeautifulSoup(str(main), "lxml")   # re-parse the main node so .decompose() is safe
    for tag in soup.find_all(looks_sidebar):
        tag.decompose()
    raw = IMG_TAG.sub('', str(soup))
    raw = SPAN_MATH.sub(lambda m: f"${strip_outer(m.group(1))}$", raw)
    raw = SCR_MATH.sub(lambda m: f"${strip_outer(m.group(1))}$", raw)
    text = h2t.handle(raw)
    text = html.unescape(text)
    text = re.sub(r'\\\\([{}])', r'\\\1', text)   # collapse any \\{ left over
    lines = [ln.strip() for ln in text.splitlines()
             if ln.strip() and not LINE_JUNK.match(ln)]
    return "\n".join(lines)

# === 5. process every html file ====================================
count = 0
with DEST.open("w", encoding="utf-8") as out:   # overwrite on each run
    for html_f in sorted(HTML_DIR.rglob("*.html")):
        print(f"Processing {html_f.name}")
        out.write(html_to_plain(html_f) + "\n")
        count += 1

print(f"\nOutput {count} html files to {DEST.name}")
Processing evaluative-voting.html
Processing grading-vs-ranking.html
Processing interpersonal-comparison-utilities.html
Processing objections-utilitarianism.html
Processing social-welfare-functionals.html
Processing decision-problems.html
Processing newcomb.html
Processing rational-decisions.html
Processing allais.html
Processing ellsberg.html
Processing evaluating-axioms.html
Processing expected-utility.html
Processing independence.html
Processing functions.html
Processing lotteries.html
Processing preferences-over-lotteries.html
Processing utility-functions.html
Processing index.html
Processing completeness.html
Processing relations.html
Processing sets.html
Processing preference-and-choice.html
Processing preference-relations.html
Processing rational-preferences.html
Processing transitivity.html
Processing references.html
Processing arrows-theorem.html
Processing beyond-two-alternatives.html
Processing condorcet-jury-theorem.html
Processing justifying-majority-rule.html
Processing mays-theorem.html
Processing social-welfare-functions.html
Processing voting-axioms.html
Processing comparing-voting-methods.html
Processing condorcet-consistent-methods.html
Processing condorcet-paradox.html
Processing elections.html
Processing iterative-methods.html
Processing majority-preference.html
Processing scoring-rules.html
Processing voting-methods.html
Output 41 html files to raw_html_notes.txt
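strip_outer can be spot-checked on a typical MathJax span payload (made-up input):

print(strip_outer(r'\(u(x) \geq u(y)\)'))   # -> u(x) \geq u(y)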
Combine All Text Files#
from pathlib import Path

EXTRACT_DIR = Path("extracted")   # the raw files all live here
raw_text_files = [
    "raw_slides.txt",
    "raw_syllabus.txt",
    "raw_review-sheets.txt",
    "raw_text.txt",
    "raw_piazza_qa.txt",
    "raw_html_notes.txt",
]

MASTER = EXTRACT_DIR / "raw_phpe400_corpus.txt"
MASTER.parent.mkdir(parents=True, exist_ok=True)

with MASTER.open("w", encoding="utf-8") as master:
    for raw in raw_text_files:
        p = EXTRACT_DIR / raw             # resolve against extracted/
        if not p.exists():
            print(f"⚠ {raw} not found — skipping")
            continue
        print(f"✓ adding {raw}")
        for line in p.read_text(encoding="utf-8").splitlines():
            if line.strip():              # skip truly empty lines
                master.write(line.strip() + "\n")
        master.write("<|eod|>\n")         # boundary token between source files

print(f"\nwrote combined corpus → {MASTER} ({MASTER.stat().st_size/1024:.1f} KB)")
✓ adding raw_slides.txt
✓ adding raw_syllabus.txt
✓ adding raw_review-sheets.txt
✓ adding raw_text.txt
✓ adding raw_piazza_qa.txt
✓ adding raw_html_notes.txt
wrote combined corpus → extracted/raw_phpe400_corpus.txt (1328.9 KB)
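A quick check that one <|eod|> boundary landed after each source file (a sketch; six are expected here, one per raw file):

n_bounds = MASTER.read_text(encoding="utf-8").count("<|eod|>")
print(n_bounds, "file boundaries")   # expect len(raw_text_files) == 6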
Clean, Tag, and Shuffle the Corpus#
import re, textwrap, random
from pathlib import Path

random.seed(42)

# ────────────────────────────────────────────────────────────────
# 0. directories
# ────────────────────────────────────────────────────────────────
EXTRACT_DIR = Path("extracted")   # all *.txt live here
OUT_DIR = Path("data")
OUT_DIR.mkdir(exist_ok=True)

# files that must not be re-ingested: the hand-written Q/A file is handled
# separately below, and the combined master would double-count every source
SKIP = {"qa_pairs.txt", "raw_phpe400_corpus.txt"}

# ────────────────────────────────────────────────────────────────
# 1. helper: Q: ... A: ... → <|question|> ... <|answer|> ... <|end|>
# ────────────────────────────────────────────────────────────────
def convert_QA_blocks(text: str) -> str:
    pat = re.compile(
        r"^Q:\s*(?P<q>.*?)\nA:\s*(?P<a>.*?)(?=^\s*\n|\Z)",
        flags=re.S | re.M)
    def repl(m):
        q = textwrap.dedent(m.group("q")).strip()
        a = textwrap.dedent(m.group("a")).strip()
        return f"<|question|>\n{q}\n<|answer|>\n{a}\n<|end|>"
    return pat.sub(repl, text)

# ────────────────────────────────────────────────────────────────
# 2. helper: general text cleaner
# ────────────────────────────────────────────────────────────────
# note: the dashes are listed individually (\- – —); an unescaped "-" inside
# the class would create an accidental character range
H_RULE = re.compile(r"^[\s\-–—_=]{3,}$", flags=re.M)             # horizontal rules

def clean_text_block(text: str) -> str:
    text = re.sub(r"^[\u2022\-\*]\s*", "", text, flags=re.M)     # leading bullets
    text = re.sub(r"\n{3,}", "\n\n", text)                       # squeeze blank lines
    text = re.sub(r"[ \t]{2,}", " ", text)                       # runs of spaces/tabs
    text = H_RULE.sub("", text)
    seen, out = set(), []
    for line in text.splitlines():                               # dedupe ALL-CAPS titles
        if re.fullmatch(r"[A-Z][A-Z ]{2,40}", line.strip()):
            if line in seen:
                continue
            seen.add(line)
        out.append(line)
    return "\n".join(out).strip()

# ────────────────────────────────────────────────────────────────
# 3. helper: split long prose into <|statement|> blocks
#    (but keep already-tagged blocks as-is)
# ────────────────────────────────────────────────────────────────
def paragraph_blocks(text: str, max_chars=3500):
    if text.startswith("<|question|>"):     # already tagged, keep as-is
        return [text]
    buf, out = [], []
    for line in text.splitlines():
        buf.append(line)
        if len(" ".join(buf)) > max_chars:
            out.append("\n".join(buf))
            buf = []
    if buf:
        out.append("\n".join(buf))
    return [f"<|statement|>\n{b.strip()}\n<|end|>" for b in out]

# ────────────────────────────────────────────────────────────────
# 4. load every extracted *.txt (except the skipped files)
# ────────────────────────────────────────────────────────────────
plain_blocks = []
for txt_file in EXTRACT_DIR.glob("*.txt"):
    if txt_file.name in SKIP:
        continue
    raw = txt_file.read_text(encoding="utf-8")
    raw = convert_QA_blocks(raw)            # convert embedded Q/A
    clean = clean_text_block(raw)           # general cleanup
    plain_blocks.extend(paragraph_blocks(clean))
    print("✓ processed", txt_file.name)

# ────────────────────────────────────────────────────────────────
# 5. load hand-crafted Q-A file, dedupe, oversample
# ────────────────────────────────────────────────────────────────
qa_raw = (EXTRACT_DIR / "qa_pairs.txt").read_text(encoding="utf-8")

def dedupe_qas(raw):
    seen, out = set(), []
    for m in re.finditer(r"<\|question\|>.*?<\|end\|>", raw, re.S):
        blk = textwrap.dedent(m.group(0)).strip()
        if blk not in seen:
            out.append(blk)
            seen.add(blk)
    return out

qa_blocks = dedupe_qas(qa_raw)
qa_blocks *= 2                              # oversample factor (2 = duplicate once)

# ────────────────────────────────────────────────────────────────
# 6. shuffle & write corpus
# ────────────────────────────────────────────────────────────────
all_blocks = qa_blocks + plain_blocks
random.shuffle(all_blocks)

out_file = OUT_DIR / "clean_corpus.txt"
out_file.write_text("\n\n".join(all_blocks), encoding="utf-8")
print(f"✓ wrote {out_file} — {len(all_blocks)} tagged blocks")
✓ processed raw_html_notes.txt
✓ processed raw_syllabus.txt
✓ processed raw_text.txt
✓ processed raw_piazza_qa.txt
✓ processed raw_slides.txt
✓ processed raw_review-sheets.txt
✓ wrote data/clean_corpus.txt — 580 tagged blocks
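A quick composition check on the written corpus (a sketch, assuming the cell above ran):

corpus_txt = out_file.read_text(encoding="utf-8")
print(corpus_txt.count("<|question|>"), "Q/A blocks;",
      corpus_txt.count("<|statement|>"), "statement blocks")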
Count Tokens#
from pathlib import Path
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
corpus = Path("data/clean_corpus.txt").read_text(encoding="utf-8")

n_tokens = len(tokenizer(corpus).input_ids)
print(f"Total tokens: {n_tokens:,}")
Token indices sequence length is longer than the specified maximum sequence length for this model (552851 > 1024). Running this sequence through the model will result in indexing errors
Total tokens: 552,851
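The length warning above is expected and harmless: the corpus is only tokenized to count tokens, never passed through GPT-2, whose context window is 1,024 tokens. Counting line by line avoids the warning (a sketch; the total differs slightly because newline tokens are dropped):

n = sum(len(tokenizer(line).input_ids) for line in corpus.splitlines())
print(f"Total tokens (per-line count): {n:,}")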