From 4f3b96622f0cb86470f1e90ede21a0ad64bb2252 Mon Sep 17 00:00:00 2001 From: Surendra Raika Date: Mon, 23 Feb 2026 18:00:18 +0530 Subject: [PATCH] Add GPT course: 12-chapter guide to understanding microGPT and minGPT A comprehensive course for early CS students covering: - Ch01-06: Fundamentals (language models, tokenization, autograd, NN blocks, attention, training) - Ch07: microGPT full annotated walkthrough (pure Python, zero dependencies) - Ch08-10: minGPT with PyTorch (model architecture, trainer, inference) - Ch11: Side-by-side comparison of both implementations - Ch12: Exercises and next steps All chapters include runnable Python examples. Ch01-07 require no dependencies; Ch08-10 require PyTorch. --- gpt/local/course/README.md | 64 +++ .../language_model_idea-checkpoint.py | 104 +++++ .../ch01_What_is_a_Language_Model/README.md | 102 +++++ .../language_model_idea.py | 104 +++++ gpt/local/course/ch02_Tokenization/README.md | 83 ++++ .../course/ch02_Tokenization/bpe_intuition.py | 112 +++++ .../ch02_Tokenization/char_tokenizer.py | 92 +++++ gpt/local/course/ch03_Autograd/README.md | 59 +++ .../course/ch03_Autograd/computation_graph.py | 138 +++++++ .../course/ch03_Autograd/gradient_descent.py | 122 ++++++ .../course/ch03_Autograd/value_basics.py | 164 ++++++++ .../README.md | 69 ++++ .../building_blocks.py | 174 ++++++++ .../mlp.py | 167 ++++++++ .../ch05_Attention_and_Transformers/README.md | 95 +++++ .../attention_basics.py | 184 +++++++++ .../multi_head_attention.py | 151 +++++++ .../transformer_block.py | 207 ++++++++++ .../README.md | 67 +++ .../adam_optimizer.py | 124 ++++++ .../cross_entropy.py | 121 ++++++ .../training_loop.py | 171 ++++++++ .../ch07_microGPT_Full_Walkthrough/README.md | 70 ++++ .../microgpt_annotated.py | 382 ++++++++++++++++++ .../ch08_Scaling_Up_with_PyTorch/README.md | 85 ++++ .../pytorch_basics.py | 200 +++++++++ .../pytorch_vs_manual.py | 202 +++++++++ .../ch09_minGPT_Model_Deep_Dive/README.md | 79 ++++ .../model_sizes.py | 146 
+++++++ .../model_walkthrough.py | 263 ++++++++++++ .../ch10_Training_and_Inference/README.md | 80 ++++ .../generate_text.py | 206 ++++++++++ .../trainer_explained.py | 272 +++++++++++++ .../ch11_Side_by_Side_Comparison/README.md | 46 +++ .../comparison.py | 290 +++++++++++++ .../ch12_Exercises_and_Next_Steps/README.md | 102 +++++ .../exercise_solutions.py | 250 ++++++++++++ 37 files changed, 5347 insertions(+) create mode 100644 gpt/local/course/README.md create mode 100644 gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py create mode 100644 gpt/local/course/ch01_What_is_a_Language_Model/README.md create mode 100644 gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py create mode 100644 gpt/local/course/ch02_Tokenization/README.md create mode 100644 gpt/local/course/ch02_Tokenization/bpe_intuition.py create mode 100644 gpt/local/course/ch02_Tokenization/char_tokenizer.py create mode 100644 gpt/local/course/ch03_Autograd/README.md create mode 100644 gpt/local/course/ch03_Autograd/computation_graph.py create mode 100644 gpt/local/course/ch03_Autograd/gradient_descent.py create mode 100644 gpt/local/course/ch03_Autograd/value_basics.py create mode 100644 gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md create mode 100644 gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py create mode 100644 gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/README.md create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py create mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/README.md create mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py create 
mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py create mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py create mode 100644 gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md create mode 100644 gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py create mode 100644 gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md create mode 100644 gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py create mode 100644 gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py create mode 100644 gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md create mode 100644 gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py create mode 100644 gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py create mode 100644 gpt/local/course/ch10_Training_and_Inference/README.md create mode 100644 gpt/local/course/ch10_Training_and_Inference/generate_text.py create mode 100644 gpt/local/course/ch10_Training_and_Inference/trainer_explained.py create mode 100644 gpt/local/course/ch11_Side_by_Side_Comparison/README.md create mode 100644 gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py create mode 100644 gpt/local/course/ch12_Exercises_and_Next_Steps/README.md create mode 100644 gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py diff --git a/gpt/local/course/README.md b/gpt/local/course/README.md new file mode 100644 index 0000000..7415525 --- /dev/null +++ b/gpt/local/course/README.md @@ -0,0 +1,64 @@ +# Understanding GPT From Scratch + +## A Course Based on Andrej Karpathy's Implementations + +**Target Audience:** Early CS students who understand basic programming (Python, loops, functions, classes) but have no background in AI/ML or GPT. 
+ +--- + +## What This Course Covers + +This course walks you through **two real GPT implementations** by Andrej Karpathy, starting from zero AI knowledge: + +| Implementation | Location | What It Is | +|---|---|---| +| **microGPT** | `../../8627fe009c40f57531cb18360106ce95/microgpt.py` | ~200 lines of pure Python. No libraries. Builds everything from scratch — autograd, neural networks, the full transformer. | +| **minGPT** | `../../minGPT/` | A clean PyTorch implementation. Production-style code with proper modules, training infrastructure, and real GPT-2 compatibility. | + +--- + +## Course Structure + +| Folder | Title | Key Idea | +|---|---|---| +| `ch01_What_is_a_Language_Model/` | What is a Language Model? | The big picture — predicting the next word | +| `ch02_Tokenization/` | Tokenization | Turning text into numbers a computer can process | +| `ch03_Autograd/` | Autograd | Teaching computers to do calculus automatically | +| `ch04_Neural_Network_Building_Blocks/` | Neural Network Building Blocks | Linear layers, activation functions, softmax | +| `ch05_Attention_and_Transformers/` | Attention & Transformers | The core innovation behind GPT | +| `ch06_Training_Loop_and_Optimization/` | Training: How Models Learn | Loss functions, backprop, optimizers | +| `ch07_microGPT_Full_Walkthrough/` | microGPT: Full Walkthrough | Line-by-line through the 200-line pure-Python GPT | +| `ch08_Scaling_Up_with_PyTorch/` | Scaling Up with PyTorch | Why we need frameworks, intro to PyTorch | +| `ch09_minGPT_Model_Deep_Dive/` | minGPT: Model Deep Dive | The production-quality GPT architecture | +| `ch10_Training_and_Inference/` | minGPT: Training & Inference | Training loops, text generation, real demos | +| `ch11_Side_by_Side_Comparison/` | Side-by-Side Comparison | microGPT vs minGPT — same ideas, different scales | +| `ch12_Exercises_and_Next_Steps/` | Exercises & Next Steps | Hands-on challenges and further reading | + +--- + +## How to Use This Course + +1. 
**Read chapters in order** — each builds on the previous one +2. **Run the code examples** — every chapter has runnable `.py` files in its folder +3. **Chapters 01-06** teach the concepts with small, isolated examples +4. **Chapters 07-10** apply those concepts to the real Karpathy code +5. **Chapter 11** ties everything together +6. **Chapter 12** gives you challenges to test your understanding + +## Prerequisites + +- Python basics: variables, loops, functions, classes, lists, dictionaries +- Basic math: addition, multiplication, exponents (no calculus needed — we teach it!) +- A terminal / command line +- Python 3.8+ installed + +## Running Examples + +```bash +# For chapters 01-07 (pure Python, no dependencies): +python ch01_What_is_a_Language_Model/language_model_idea.py + +# For chapters 08-10 (needs PyTorch): +pip install torch +python ch08_Scaling_Up_with_PyTorch/pytorch_basics.py +``` diff --git a/gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py b/gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py new file mode 100644 index 0000000..cabf6f1 --- /dev/null +++ b/gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py @@ -0,0 +1,104 @@ +""" +Chapter 01: The Simplest Possible "Language Model" + +This is NOT a real language model — it's a toy to show the core idea: + 1. Learn patterns from data (training) + 2. Generate new text using those patterns (inference) + +We simply count how often each character follows another character. 
+""" + +import random + +# ============================================================ +# STEP 1: Our "training data" — a few names +# ============================================================ +training_data = [ + "emma", "olivia", "ava", "sophia", "isabella", + "mia", "charlotte", "amelia", "harper", "evelyn", +] + +print("=== Training Data ===") +for name in training_data: + print(f" {name}") + +# ============================================================ +# STEP 2: TRAINING — Count character transitions +# ============================================================ +# We'll count: given character X, how often does character Y come next? +# We use a special character '.' to mean "start" or "end" of a name. + +# Build a dictionary of dictionaries: +# counts['a']['b'] = number of times 'b' follows 'a' in the training data +counts = {} + +for name in training_data: + # Add start/end markers: ".emma." + chars = ['.'] + list(name) + ['.'] + for i in range(len(chars) - 1): + current = chars[i] + next_char = chars[i + 1] + if current not in counts: + counts[current] = {} + counts[current][next_char] = counts[current].get(next_char, 0) + 1 + +# Let's look at what follows 'a': +print("\n=== What follows 'a' in training data? ===") +if 'a' in counts: + total = sum(counts['a'].values()) + for char, count in sorted(counts['a'].items(), key=lambda x: -x[1]): + prob = count / total + display = "END" if char == '.' else char + print(f" '{display}' : {count} times ({prob:.0%})") + +# ============================================================ +# STEP 3: Convert counts to probabilities +# ============================================================ +probs = {} +for current_char, next_chars in counts.items(): + total = sum(next_chars.values()) + probs[current_char] = {} + for next_char, count in next_chars.items(): + probs[current_char][next_char] = count / total + +# ============================================================ +# STEP 4: INFERENCE — Generate new names! 
+# ============================================================ +print("\n=== Generated Names (sampling from our 'model') ===") +random.seed(42) + +for i in range(10): + name = [] + current = '.' # start token + + for _ in range(20): # max length safety + if current not in probs: + break + # Get possible next characters and their probabilities + next_chars = list(probs[current].keys()) + weights = [probs[current][c] for c in next_chars] + + # Sample randomly, weighted by probability + chosen = random.choices(next_chars, weights=weights, k=1)[0] + + if chosen == '.': # end token + break + name.append(chosen) + current = chosen + + print(f" {i+1:2d}. {''.join(name)}") + +# ============================================================ +# KEY TAKEAWAYS +# ============================================================ +print(""" +=== Key Takeaways === +1. We LEARNED patterns from data (counted character transitions) +2. We GENERATED new text by sampling from those patterns +3. The generated names "sound like" the training data but are new + +This is exactly what GPT does — just with WAY more sophisticated +pattern detection (neural networks instead of simple counting). + +Next chapter: How do we turn text into numbers? → Tokenization +""") diff --git a/gpt/local/course/ch01_What_is_a_Language_Model/README.md b/gpt/local/course/ch01_What_is_a_Language_Model/README.md new file mode 100644 index 0000000..6f23bd5 --- /dev/null +++ b/gpt/local/course/ch01_What_is_a_Language_Model/README.md @@ -0,0 +1,102 @@ +# Chapter 01: What is a Language Model? + +## The One-Sentence Summary + +A language model is a program that **predicts the next word** (or character) given some previous words. + +--- + +## Think of It Like Autocomplete + +You've used autocomplete on your phone: + +``` +You type: "How are ___" +Phone suggests: "you" +``` + +That's a language model! It looked at "How are" and predicted "you" is the most likely next word. + +GPT is the same idea — just way more powerful. 
+ +--- + +## The Core Loop + +Every language model follows this pattern: + +``` +1. Look at some text → "The cat sat on the" +2. Predict the next word → "mat" (70%), "floor" (20%), "dog" (10%) +3. Pick one (sample or take best) → "mat" +4. Append it and repeat → "The cat sat on the mat" +``` + +That's it. GPT generates entire essays by repeating steps 1-4 over and over. + +--- + +## But How Does It Know? + +The model **learns patterns from data**. If you show it millions of sentences, it notices: +- "The cat sat on the ___" → usually "mat", "floor", "chair" +- "Once upon a ___" → usually "time" +- "def __init__(self, ___" → usually a parameter name + +It doesn't "understand" language. It's really good at **pattern matching**. + +--- + +## The Two Phases + +### Phase 1: Training (Learning) +- Feed the model tons of text +- For each position, it tries to predict the next character/word +- When it's wrong, adjust its internal numbers to be less wrong next time +- Repeat millions of times + +### Phase 2: Inference (Generating) +- Give it a starting text (prompt) +- Let it predict the next token, one at a time +- It generates new text that "sounds like" its training data + +--- + +## Characters vs Words vs Tokens + +Language models can predict at different levels: + +| Level | Example Input | Predicts | +|---|---|---| +| **Character-level** | `['H', 'e', 'l', 'l']` | `'o'` | +| **Word-level** | `['The', 'cat']` | `'sat'` | +| **Subword (BPE)** | `['The', ' cat', ' s']` | `'at'` | + +- **microGPT** uses character-level (simplest) +- **minGPT** can use any level, but demos use character-level and BPE + +--- + +## What's Inside a Language Model? 
+ +At its core, a language model is just a **function with adjustable numbers** (parameters): + +``` +f(input_tokens, parameters) → probability of each possible next token +``` + +- The **parameters** are millions of numbers that encode patterns +- **Training** = finding good values for those numbers +- **Architecture** = how those numbers are organized and combined + +GPT uses an architecture called a **Transformer**, which we'll learn about in Chapter 05. + +--- + +## Run the Example + +The file `language_model_idea.py` in this folder shows the simplest possible "language model" — just counting letter frequencies. It's silly and bad, but it captures the core idea. + +```bash +python language_model_idea.py +``` diff --git a/gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py b/gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py new file mode 100644 index 0000000..cabf6f1 --- /dev/null +++ b/gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py @@ -0,0 +1,104 @@ +""" +Chapter 01: The Simplest Possible "Language Model" + +This is NOT a real language model — it's a toy to show the core idea: + 1. Learn patterns from data (training) + 2. Generate new text using those patterns (inference) + +We simply count how often each character follows another character. +""" + +import random + +# ============================================================ +# STEP 1: Our "training data" — a few names +# ============================================================ +training_data = [ + "emma", "olivia", "ava", "sophia", "isabella", + "mia", "charlotte", "amelia", "harper", "evelyn", +] + +print("=== Training Data ===") +for name in training_data: + print(f" {name}") + +# ============================================================ +# STEP 2: TRAINING — Count character transitions +# ============================================================ +# We'll count: given character X, how often does character Y come next? 
+# We use a special character '.' to mean "start" or "end" of a name. + +# Build a dictionary of dictionaries: +# counts['a']['b'] = number of times 'b' follows 'a' in the training data +counts = {} + +for name in training_data: + # Add start/end markers: ".emma." + chars = ['.'] + list(name) + ['.'] + for i in range(len(chars) - 1): + current = chars[i] + next_char = chars[i + 1] + if current not in counts: + counts[current] = {} + counts[current][next_char] = counts[current].get(next_char, 0) + 1 + +# Let's look at what follows 'a': +print("\n=== What follows 'a' in training data? ===") +if 'a' in counts: + total = sum(counts['a'].values()) + for char, count in sorted(counts['a'].items(), key=lambda x: -x[1]): + prob = count / total + display = "END" if char == '.' else char + print(f" '{display}' : {count} times ({prob:.0%})") + +# ============================================================ +# STEP 3: Convert counts to probabilities +# ============================================================ +probs = {} +for current_char, next_chars in counts.items(): + total = sum(next_chars.values()) + probs[current_char] = {} + for next_char, count in next_chars.items(): + probs[current_char][next_char] = count / total + +# ============================================================ +# STEP 4: INFERENCE — Generate new names! +# ============================================================ +print("\n=== Generated Names (sampling from our 'model') ===") +random.seed(42) + +for i in range(10): + name = [] + current = '.' # start token + + for _ in range(20): # max length safety + if current not in probs: + break + # Get possible next characters and their probabilities + next_chars = list(probs[current].keys()) + weights = [probs[current][c] for c in next_chars] + + # Sample randomly, weighted by probability + chosen = random.choices(next_chars, weights=weights, k=1)[0] + + if chosen == '.': # end token + break + name.append(chosen) + current = chosen + + print(f" {i+1:2d}. 
{''.join(name)}") + +# ============================================================ +# KEY TAKEAWAYS +# ============================================================ +print(""" +=== Key Takeaways === +1. We LEARNED patterns from data (counted character transitions) +2. We GENERATED new text by sampling from those patterns +3. The generated names "sound like" the training data but are new + +This is exactly what GPT does — just with WAY more sophisticated +pattern detection (neural networks instead of simple counting). + +Next chapter: How do we turn text into numbers? → Tokenization +""") diff --git a/gpt/local/course/ch02_Tokenization/README.md b/gpt/local/course/ch02_Tokenization/README.md new file mode 100644 index 0000000..d1c531a --- /dev/null +++ b/gpt/local/course/ch02_Tokenization/README.md @@ -0,0 +1,83 @@ +# Chapter 02: Tokenization — Turning Text into Numbers + +## Why Tokenization? + +Computers don't understand letters or words — they work with **numbers**. Before we can feed text into any model, we need to convert it to a sequence of integers. This conversion is called **tokenization**. + +``` +"hello" → [7, 4, 11, 11, 14] +``` + +--- + +## Three Levels of Tokenization + +### 1. Character-Level (used by microGPT) + +Each unique character gets a number: + +``` +Vocabulary: {a:0, b:1, c:2, d:3, e:4, ...} +"cab" → [2, 0, 1] +``` + +**Pros:** Tiny vocabulary, simple to implement +**Cons:** Sequences are long, model must learn to spell words + +### 2. Word-Level + +Each unique word gets a number: + +``` +Vocabulary: {"the":0, "cat":1, "sat":2, ...} +"the cat" → [0, 1] +``` + +**Pros:** Short sequences +**Cons:** Huge vocabulary, can't handle new/misspelled words + +### 3. Subword / BPE (used by GPT-2, minGPT) + +Byte Pair Encoding — a clever middle ground. 
Frequent words stay whole, rare words get split: + +``` +"unhappiness" → ["un", "happiness"] (common prefix + common word) +"Karpathy" → ["K", "arp", "athy"] (rare name, split into pieces) +``` + +**Pros:** Handles any text, balanced vocabulary size +**Cons:** More complex to implement + +--- + +## The Vocabulary + +The set of all possible tokens is the **vocabulary**. Its size matters: + +| Tokenizer | Vocab Size | Example | +|---|---|---| +| microGPT (names dataset) | ~27 | a-z + BOS token | +| GPT-2 (BPE) | 50,257 | All common English subwords | + +The model's final layer must output a probability for **every token in the vocabulary**. Bigger vocab = bigger model. + +--- + +## Special Tokens + +Most tokenizers have special tokens with reserved meanings: + +- **BOS** (Beginning of Sequence): Marks the start of a document +- **EOS** (End of Sequence): Marks the end +- **PAD**: Fills empty space when batching sequences of different lengths + +In microGPT, there's one special token: `BOS` (used for both start and end). + +--- + +## Run the Examples + +```bash +python char_tokenizer.py # Character-level tokenization (like microGPT) +python bpe_intuition.py # How BPE works, step by step +``` diff --git a/gpt/local/course/ch02_Tokenization/bpe_intuition.py b/gpt/local/course/ch02_Tokenization/bpe_intuition.py new file mode 100644 index 0000000..db83616 --- /dev/null +++ b/gpt/local/course/ch02_Tokenization/bpe_intuition.py @@ -0,0 +1,112 @@ +""" +Chapter 02: BPE (Byte Pair Encoding) Intuition + +BPE is the tokenizer used by GPT-2 and minGPT. +This simplified example shows the IDEA behind BPE. +(The real implementation is in minGPT/mingpt/bpe.py) +""" + +# ============================================================ +# THE IDEA: Repeatedly merge the most common pair of tokens +# ============================================================ + +def simple_bpe_train(corpus, num_merges): + """ + Train a simple BPE tokenizer. + + Start with individual characters as tokens. 
+ Then repeatedly find the most common adjacent pair + and merge them into a single new token. + """ + # Start: each word is split into individual characters + # We add a special end-of-word marker '_' + words = {} + for word in corpus.split(): + chars = ' '.join(list(word)) + ' _' + words[chars] = words.get(chars, 0) + 1 + + print("=== Starting tokens (individual characters) ===") + print(f" Words: {words}\n") + + merges = [] + + for step in range(num_merges): + # Count all adjacent pairs + pairs = {} + for word, freq in words.items(): + symbols = word.split() + for i in range(len(symbols) - 1): + pair = (symbols[i], symbols[i + 1]) + pairs[pair] = pairs.get(pair, 0) + freq + + if not pairs: + break + + # Find the most common pair + best_pair = max(pairs, key=pairs.get) + merges.append(best_pair) + + print(f"Step {step + 1}: Merge '{best_pair[0]}' + '{best_pair[1]}' " + f"→ '{best_pair[0]}{best_pair[1]}' " + f"(appeared {pairs[best_pair]} times)") + + # Apply the merge: replace all occurrences of this pair + new_words = {} + for word, freq in words.items(): + new_word = word.replace( + f"{best_pair[0]} {best_pair[1]}", + f"{best_pair[0]}{best_pair[1]}" + ) + new_words[new_word] = freq + words = new_words + + # Show current state of words + print(f" Words now: {words}\n") + + return merges, words + + +# ============================================================ +# DEMO: Train BPE on a tiny corpus +# ============================================================ +corpus = "low low low low low lowest lowest newer newer newer wider" + +print("=" * 60) +print("BPE Training Demo") +print(f"Corpus: \"{corpus}\"") +print("=" * 60 + "\n") + +merges, final_words = simple_bpe_train(corpus, num_merges=10) + +print("=" * 60) +print("Final merge rules learned:") +for i, (a, b) in enumerate(merges): + print(f" Rule {i+1}: '{a}' + '{b}' → '{a}{b}'") + +# ============================================================ +# HOW BPE TOKENIZES NEW TEXT +# 
============================================================ +print(""" +=== How BPE Tokenizes New Text === + +After training, to tokenize a new word like "lowest": + 1. Start with characters: ['l', 'o', 'w', 'e', 's', 't'] + 2. Apply merge rules in order: + - Rule 'l'+'o' → 'lo': ['lo', 'w', 'e', 's', 't'] + - Rule 'lo'+'w' → 'low': ['low', 'e', 's', 't'] + - Rule 'e'+'s' → 'es': ['low', 'es', 't'] + - Rule 'es'+'t' → 'est': ['low', 'est'] + 3. Final tokens: ['low', 'est'] + +For a RARE word like "xylophone": + Characters are mostly left as-is because no merge rules apply. + → ['x', 'y', 'l', 'o', 'p', 'h', 'o', 'n', 'e'] + +This is the beauty of BPE: + - COMMON words/subwords become single tokens → efficient + - RARE words get broken into characters → still works + - Vocabulary size is controllable (= number of merges + base chars) + +GPT-2 uses 50,000 merges → vocab size of ~50,257 +microGPT uses 0 merges → vocab is just individual characters (~27) +""") diff --git a/gpt/local/course/ch02_Tokenization/char_tokenizer.py b/gpt/local/course/ch02_Tokenization/char_tokenizer.py new file mode 100644 index 0000000..5e6cdb8 --- /dev/null +++ b/gpt/local/course/ch02_Tokenization/char_tokenizer.py @@ -0,0 +1,92 @@ +""" +Chapter 02: Character-Level Tokenizer + +This is exactly how microGPT tokenizes text. +Each unique character becomes a token ID (integer). 
+""" + +# ============================================================ +# STEP 1: Build vocabulary from data +# ============================================================ +# Imagine our dataset is a list of names +docs = ["emma", "olivia", "ava", "sophia", "isabella", "mia"] + +# Find all unique characters, sorted +all_text = ''.join(docs) +uchars = sorted(set(all_text)) + +print("=== Vocabulary ===") +print(f"Unique characters: {uchars}") +print(f"Number of unique characters: {len(uchars)}") + +# Each character maps to an integer index +char_to_id = {ch: i for i, ch in enumerate(uchars)} +id_to_char = {i: ch for i, ch in enumerate(uchars)} + +print("\nCharacter → ID mapping:") +for ch, idx in char_to_id.items(): + print(f" '{ch}' → {idx}") + +# ============================================================ +# STEP 2: Add special tokens +# ============================================================ +# BOS = Beginning of Sequence. A special token that marks boundaries. +BOS = len(uchars) # Gets the next available ID +vocab_size = len(uchars) + 1 + +print(f"\nBOS token ID: {BOS}") +print(f"Total vocab size: {vocab_size}") + +# ============================================================ +# STEP 3: Encode — turn text into numbers +# ============================================================ +def encode(text): + """Convert a string to a list of token IDs""" + return [char_to_id[ch] for ch in text] + +def decode(token_ids): + """Convert a list of token IDs back to a string""" + return ''.join(id_to_char[i] for i in token_ids if i != BOS) + +# Try encoding some names +print("\n=== Encoding Examples ===") +for name in ["emma", "ava", "mia"]: + tokens = encode(name) + print(f" '{name}' → {tokens}") + + # Add BOS tokens on both sides (like microGPT does) + tokens_with_bos = [BOS] + tokens + [BOS] + print(f" with BOS: {tokens_with_bos}") + +# ============================================================ +# STEP 4: Decode — turn numbers back to text +# 
============================================================ +print("\n=== Decoding Examples ===") +sample_tokens = [2, 6, 6, 0] # e, m, m, a +decoded = decode(sample_tokens) +print(f" {sample_tokens} → '{decoded}'") + +# ============================================================ +# STEP 5: How this connects to the model +# ============================================================ +print(""" +=== How This Connects to GPT === + +During training, the model sees sequences like: + [BOS, e, m, m, a, BOS] → [{bos}, {e}, {m}, {m2}, {a}, {bos2}] + +At each position, it tries to predict the NEXT token: + Given [BOS] → predict 'e' (token {e}) + Given [BOS, e] → predict 'm' (token {m}) + Given [BOS, e, m] → predict 'm' (token {m2}) + Given [BOS, e, m, m] → predict 'a' (token {a}) + Given [BOS, e, m, m, a] → predict BOS (end of name) + +The model outputs {vocab_size} probabilities — one for each possible +next token. Training adjusts the model so the correct next token +gets the highest probability. +""".format( + bos=BOS, e=char_to_id['e'], m=char_to_id['m'], + m2=char_to_id['m'], a=char_to_id['a'], + bos2=BOS, vocab_size=vocab_size +)) diff --git a/gpt/local/course/ch03_Autograd/README.md b/gpt/local/course/ch03_Autograd/README.md new file mode 100644 index 0000000..1adb7d3 --- /dev/null +++ b/gpt/local/course/ch03_Autograd/README.md @@ -0,0 +1,59 @@ +# Chapter 03: Autograd — Teaching Computers Calculus + +## Why Do We Need Calculus? + +To train a model, we need to answer: **"If I nudge this parameter a tiny bit, does the output get better or worse?"** That's what a derivative tells us. + +But a GPT has millions of parameters and hundreds of operations chained together. Computing derivatives by hand is impossible. So we use **automatic differentiation** (autograd).
+ +--- + +## The Chain Rule — The One Rule That Powers All of AI + +If you compute `y = f(g(x))`, the derivative of y with respect to x is: + +``` +dy/dx = dy/dg * dg/dx +``` + +"Multiply the local derivatives along the chain." That's it. Autograd just automates this. + +--- + +## microGPT's `Value` Class + +microGPT implements autograd from scratch in ~40 lines. The `Value` class: +1. Wraps a number +2. Records what operations created it (the computation graph) +3. Can compute gradients automatically via `backward()` + +--- + +## The Computation Graph + +When you write `c = a + b`, autograd secretly builds a graph: + +``` +a ──┐ + ├──(+)──→ c +b ──┘ +``` + +Each node knows its children and the local derivative of the operation. + +--- + +## Forward Pass vs Backward Pass + +- **Forward pass**: Compute the output value (follow the arrows forward) +- **Backward pass**: Compute gradients by walking the graph backward, multiplying local derivatives (chain rule) + +--- + +## Run the Examples + +```bash +python value_basics.py # Understand the Value class +python computation_graph.py # See the chain rule in action +python gradient_descent.py # Use gradients to optimize a simple function +``` diff --git a/gpt/local/course/ch03_Autograd/computation_graph.py b/gpt/local/course/ch03_Autograd/computation_graph.py new file mode 100644 index 0000000..5e1dfdd --- /dev/null +++ b/gpt/local/course/ch03_Autograd/computation_graph.py @@ -0,0 +1,138 @@ +""" +Chapter 03: Visualizing the Computation Graph + +This script shows how operations build a graph, +and how backward() walks it in reverse to compute gradients. 
+""" + +import math + +class Value: + def __init__(self, data, children=(), local_grads=(), label=''): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + self.label = label + + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1), + f"({self.label}+{other.label})") + + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data), + f"({self.label}*{other.label})") + + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),), + f"({self.label}^{n})") + + def __neg__(self): return self * Value(-1, label='-1') + def __sub__(self, other): return self + (-other) + def __rmul__(self, other): return Value(other, label=str(other)) * self + + def log(self): + return Value(math.log(self.data), (self,), (1/self.data,), + f"log({self.label})") + + def exp(self): + return Value(math.exp(self.data), (self,), (math.exp(self.data),), + f"exp({self.label})") + + def backward(self): + topo = [] + visited = set() + def build_topo(v): + if v not in visited: + visited.add(v) + for child in v._children: + build_topo(child) + topo.append(v) + build_topo(self) + self.grad = 1 + for v in reversed(topo): + for child, local_grad in zip(v._children, v._local_grads): + child.grad += local_grad * v.grad + + +# ============================================================ +# BUILD A COMPUTATION: loss = -log(softmax(score)[target]) +# This is EXACTLY what happens in GPT at each prediction step! +# ============================================================ + +print("=" * 60) +print("Computation Graph: A Mini Cross-Entropy Loss") +print("=" * 60) +print() +print("Scenario: Model outputs scores [2.0, 1.0, 0.5] for 3 tokens.") +print("The correct token is index 0. 
What's the loss?\n") + +# These are our "model outputs" (logits) +s0 = Value(2.0, label='s0') # score for token 0 (correct answer) +s1 = Value(1.0, label='s1') # score for token 1 +s2 = Value(0.5, label='s2') # score for token 2 + +# Softmax: convert scores to probabilities +# softmax(si) = exp(si) / sum(exp(sj)) +e0 = s0.exp() +e1 = s1.exp() +e2 = s2.exp() +total = e0 + e1 + e2 + +p0 = e0 * (total ** -1) # probability of token 0 + +# Cross-entropy loss for the correct token (index 0) +loss = -(p0.log()) + +print("Forward Pass (computing the loss):") +print(f" Scores: s0={s0.data:.2f}, s1={s1.data:.2f}, s2={s2.data:.2f}") +print(f" Exponentials: exp(s0)={e0.data:.4f}, exp(s1)={e1.data:.4f}, exp(s2)={e2.data:.4f}") +print(f" Sum of exps: {total.data:.4f}") +print(f" Prob of s0: {p0.data:.4f} ({p0.data*100:.1f}%)") +print(f" Loss: -log({p0.data:.4f}) = {loss.data:.4f}") + +# Now compute gradients! +loss.backward() + +print(f"\nBackward Pass (computing gradients):") +print(f" d(loss)/d(s0) = {s0.grad:.4f}") +print(f" d(loss)/d(s1) = {s1.grad:.4f}") +print(f" d(loss)/d(s2) = {s2.grad:.4f}") + +print(f""" +Interpretation: + s0.grad = {s0.grad:.4f} (NEGATIVE → increasing s0 DECREASES loss → good!) + s1.grad = {s1.grad:.4f} (POSITIVE → increasing s1 INCREASES loss → bad!) + s2.grad = {s2.grad:.4f} (POSITIVE → increasing s2 INCREASES loss → bad!) + +This makes sense! The correct answer is token 0, so: + - We WANT s0 to be higher (gradient pushes it up) + - We WANT s1, s2 to be lower (gradient pushes them down) + +This is exactly how a neural network learns: + 1. Forward pass → compute loss (how wrong are we?) + 2. Backward pass → compute gradients (which direction to adjust?) + 3. 
Update parameters in the direction that reduces loss +""") + +# ============================================================ +# VISUALIZE THE GRAPH STRUCTURE +# ============================================================ +print("=" * 60) +print("The Computation Graph (text visualization)") +print("=" * 60) +print(""" + s0 ─→ exp() ─→ e0 ──┐ + ├─→ e0/total ─→ p0 ─→ log() ─→ neg ─→ LOSS + s1 ─→ exp() ─→ e1 ──┤ ↑ + ├─→ total ───────┘ + s2 ─→ exp() ─→ e2 ──┘ + +Forward: Left to right (compute values) +Backward: Right to left (compute gradients using chain rule) + +Each arrow multiplies by the local derivative. +The chain rule = multiply all local derivatives along the path. +""") diff --git a/gpt/local/course/ch03_Autograd/gradient_descent.py b/gpt/local/course/ch03_Autograd/gradient_descent.py new file mode 100644 index 0000000..7492fe0 --- /dev/null +++ b/gpt/local/course/ch03_Autograd/gradient_descent.py @@ -0,0 +1,122 @@ +""" +Chapter 03: Gradient Descent — Using Gradients to Learn + +This is the fundamental learning algorithm: + 1. Start with random parameter values + 2. Compute how wrong the output is (loss) + 3. Compute gradients (which direction to adjust) + 4. Nudge parameters to reduce the loss + 5. Repeat + +We'll train a tiny "model" (just 2 parameters) to fit a target. 
+""" + +import math +import random + +random.seed(42) + +class Value: + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),)) + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + + +# ============================================================ +# PROBLEM: Learn y = 3x + 7 +# Our model: y_pred = w*x + b (we must find w=3, b=7) +# ============================================================ + +# Initialize parameters randomly +w = Value(random.uniform(-1, 1)) # will learn to be ~3.0 +b = Value(random.uniform(-1, 1)) # will learn to be ~7.0 + +# Training data: (x, y) pairs from the true function y = 3x + 7 +data = [(1, 10), (2, 13), (3, 16), (4, 19), (5, 22)] + +learning_rate = 0.01 + +print("=" * 50) +print("Goal: Learn y = 3x + 7") +print(f"Starting: w = {w.data:.4f}, b = {b.data:.4f}") +print("=" * 50) + +for step in range(100): + # ---- Forward pass: compute predictions and loss ---- + total_loss = Value(0) + for x_val, y_true in data: + y_pred = w * x_val + b # our model's prediction + diff = y_pred - y_true # error + loss = diff ** 2 # 
squared error (always positive) + total_loss = total_loss + loss + + # Average loss + avg_loss = total_loss * (1.0 / len(data)) + + # ---- Backward pass: compute gradients ---- + avg_loss.backward() + + # ---- Update parameters: move in the opposite direction of gradient ---- + w.data -= learning_rate * w.grad + b.data -= learning_rate * b.grad + + # ---- Reset gradients for next step ---- + w.grad = 0 + b.grad = 0 + + # Print progress + if step % 10 == 0 or step == 99: + print(f" Step {step:3d} | loss = {avg_loss.data:.4f} | w = {w.data:.4f} | b = {b.data:.4f}") + + # Recreate Value objects to reset computation graph + w = Value(w.data) + b = Value(b.data) + +print(f"\n Final: w = {w.data:.4f} (target: 3.0), b = {b.data:.4f} (target: 7.0)") + +print(""" +=== What Just Happened === + +1. We started with random w and b +2. Each step: + a. Forward pass: computed predictions and loss + b. Backward pass: autograd computed d(loss)/dw and d(loss)/db + c. Update: moved w and b in the direction that reduces loss +3. After 100 steps, w ≈ 3.0 and b ≈ 7.0 + +This is EXACTLY what happens in GPT training: + - Instead of 2 parameters (w, b), GPT has millions + - Instead of y = wx + b, GPT is a transformer neural network + - Instead of squared error, GPT uses cross-entropy loss + - But the loop is identical: forward → backward → update + +Next chapter: What are the building blocks of that neural network? +""") diff --git a/gpt/local/course/ch03_Autograd/value_basics.py b/gpt/local/course/ch03_Autograd/value_basics.py new file mode 100644 index 0000000..7beebc6 --- /dev/null +++ b/gpt/local/course/ch03_Autograd/value_basics.py @@ -0,0 +1,164 @@ +""" +Chapter 03: Understanding the Value Class + +The Value class is microGPT's autograd engine. +It wraps a number and tracks the computation graph +so gradients can be computed automatically. 
+ +This is a simplified version of the Value class from microgpt.py +""" + +import math + +# ============================================================ +# THE VALUE CLASS (simplified from microGPT) +# ============================================================ +class Value: + """ + A single scalar value that tracks its computation history. + + Think of it as a "smart number" that remembers: + - Its current value (self.data) + - How it was created (self._children, self._local_grads) + - Its gradient, computed later (self.grad) + """ + + def __init__(self, data, children=(), local_grads=()): + self.data = data # the actual number + self.grad = 0 # derivative of loss w.r.t. this value + self._children = children # what values were used to compute this + self._local_grads = local_grads # derivative of this op w.r.t. each child + + def __add__(self, other): + """Addition: c = a + b""" + other = other if isinstance(other, Value) else Value(other) + # d(a+b)/da = 1, d(a+b)/db = 1 + return Value(self.data + other.data, (self, other), (1, 1)) + + def __mul__(self, other): + """Multiplication: c = a * b""" + other = other if isinstance(other, Value) else Value(other) + # d(a*b)/da = b, d(a*b)/db = a + return Value(self.data * other.data, (self, other), (other.data, self.data)) + + def __pow__(self, power): + """Power: c = a ** n""" + # d(a^n)/da = n * a^(n-1) + return Value(self.data ** power, (self,), (power * self.data ** (power - 1),)) + + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __truediv__(self, other): return self * other ** -1 + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + + def backward(self): + """Compute gradients for all values in the computation graph.""" + # Step 1: Build topological ordering (children before parents) + topo = [] + visited = set() + def build_topo(v): + if v not in visited: + visited.add(v) + for child in v._children: + build_topo(child) + 
topo.append(v) + build_topo(self) + + # Step 2: Backpropagate gradients + self.grad = 1 # d(self)/d(self) = 1 + for v in reversed(topo): + for child, local_grad in zip(v._children, v._local_grads): + child.grad += local_grad * v.grad # Chain rule! + + def __repr__(self): + return f"Value(data={self.data:.4f}, grad={self.grad:.4f})" + + +# ============================================================ +# EXAMPLE 1: Simple addition +# ============================================================ +print("=== Example 1: Addition (c = a + b) ===") +a = Value(3.0) +b = Value(5.0) +c = a + b # c = 8.0 + +c.backward() + +print(f" a = {a}") +print(f" b = {b}") +print(f" c = a + b = {c}") +print(f" dc/da = {a.grad} (if we increase a by 1, c increases by 1)") +print(f" dc/db = {b.grad} (if we increase b by 1, c increases by 1)") + +# ============================================================ +# EXAMPLE 2: Multiplication +# ============================================================ +print("\n=== Example 2: Multiplication (c = a * b) ===") +a = Value(3.0) +b = Value(5.0) +c = a * b # c = 15.0 + +c.backward() + +print(f" a = {a}") +print(f" b = {b}") +print(f" c = a * b = {c}") +print(f" dc/da = {a.grad} (= b's value, because d(a*b)/da = b)") +print(f" dc/db = {b.grad} (= a's value, because d(a*b)/db = a)") + +# ============================================================ +# EXAMPLE 3: A chain of operations (chain rule!) 
+# ============================================================ +print("\n=== Example 3: Chain Rule (d = (a + b) * c) ===") +a = Value(2.0) +b = Value(3.0) +c = Value(4.0) + +# Forward pass: +ab = a + b # ab = 5.0 +d = ab * c # d = 20.0 + +# Backward pass: +d.backward() + +print(f" a = {a}") +print(f" b = {b}") +print(f" c = {c}") +print(f" d = (a + b) * c = {d}") +print(f" dd/da = {a.grad} (= c, because d = (a+b)*c, dd/da = c = 4)") +print(f" dd/db = {b.grad} (= c, same reasoning)") +print(f" dd/dc = {c.grad} (= a+b = 5)") + +# ============================================================ +# EXAMPLE 4: More complex expression +# ============================================================ +print("\n=== Example 4: Complex Expression (y = (2x + 3)^2) ===") +x = Value(1.0) + +# Forward pass: +y = (2 * x + 3) ** 2 # y = (2*1+3)^2 = 25 + +# Backward pass: +y.backward() + +print(f" x = {x}") +print(f" y = (2x + 3)^2 = {y}") +print(f" dy/dx = {x.grad}") +print(f" By hand: dy/dx = 2 * 2 * (2x+3) = 4 * (2*1+3) = 20 ✓") + +# ============================================================ +# KEY TAKEAWAY +# ============================================================ +print(""" +=== Key Takeaway === +The Value class lets us: + 1. Write normal math expressions (forward pass) + 2. Automatically compute ALL derivatives (backward pass) + +In GPT, the expression is HUGE (millions of operations), +but the same principle applies. The chain rule just propagates +gradients through the entire computation graph. + +Next: See computation_graph.py for a visual explanation. +""") diff --git a/gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md b/gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md new file mode 100644 index 0000000..03b8da7 --- /dev/null +++ b/gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md @@ -0,0 +1,69 @@ +# Chapter 04: Neural Network Building Blocks + +## Overview + +A neural network is built from a few simple, reusable pieces. 
Understanding these pieces is the key to understanding GPT. + +--- + +## The Building Blocks + +### 1. Linear Layer (Matrix Multiply) +The workhorse of neural networks. Takes an input vector, multiplies by a weight matrix, producing an output vector. + +``` +input: [x1, x2, x3] (3 numbers) +weights: 2x3 matrix (6 numbers, learned) +output: [y1, y2] (2 numbers) + +y1 = w11*x1 + w12*x2 + w13*x3 +y2 = w21*x1 + w22*x2 + w23*x3 +``` + +This is just a weighted sum — each output is a mix of all inputs. + +### 2. Activation Functions (ReLU, GELU) +Without activations, stacking linear layers would just be one big linear layer. Activations add non-linearity so the network can learn complex patterns. + +``` +ReLU(x) = max(0, x) # Simple: zero out negatives (used in microGPT) +GELU(x) ≈ x * sigmoid(1.7*x) # Smoother version (used in minGPT) +``` + +### 3. Softmax +Converts a list of arbitrary numbers into probabilities (positive, sum to 1): + +``` +scores: [2.0, 1.0, 0.5] +softmax: [0.63, 0.23, 0.14] (sum = 1.0) +``` + +Used at the end of GPT to get "probability of each next token." + +### 4. Layer Normalization / RMSNorm +Keeps numbers in a stable range so the network doesn't explode or vanish during training. + +``` +RMSNorm: scale each vector so its average squared value is ~1 +LayerNorm: shift and scale so mean=0, variance=1 +``` + +microGPT uses RMSNorm, minGPT uses LayerNorm. + +### 5. Embedding +A lookup table: token ID → vector of numbers. + +``` +Token 5 → [0.12, -0.34, 0.56, ...] (a vector of n_embd numbers) +``` + +This is how tokens enter the neural network — each token gets mapped to a dense vector. 
+ +--- + +## Run the Examples + +```bash +python building_blocks.py # Interactive demo of each building block +python mlp.py # A complete Multi-Layer Perceptron (MLP) +``` diff --git a/gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py b/gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py new file mode 100644 index 0000000..f963d91 --- /dev/null +++ b/gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py @@ -0,0 +1,174 @@ +""" +Chapter 04: Neural Network Building Blocks — Hands-On + +Every building block used in GPT, implemented from scratch. +These are the SAME functions used in microGPT (microgpt.py). +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# BLOCK 1: LINEAR LAYER +# ============================================================ +# A linear layer is just matrix-vector multiplication. +# Each output = weighted sum of all inputs. + +def linear(x, w): + """ + x: input vector of length n_in + w: weight matrix of shape [n_out][n_in] + returns: output vector of length n_out + """ + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +print("=" * 50) +print("BLOCK 1: Linear Layer") +print("=" * 50) + +# Input: a vector of 3 numbers +x = [1.0, 2.0, 3.0] + +# Weights: a 2x3 matrix (maps 3 inputs → 2 outputs) +w = [ + [0.1, 0.2, 0.3], # weights for output 0 + [0.4, 0.5, 0.6], # weights for output 1 +] + +y = linear(x, w) +print(f"Input: {x} (3 values)") +print(f"Weights: {w}") +print(f"Output: {y} (2 values)") +print(f" y[0] = 0.1*1 + 0.2*2 + 0.3*3 = {0.1*1 + 0.2*2 + 0.3*3}") +print(f" y[1] = 0.4*1 + 0.5*2 + 0.6*3 = {0.4*1 + 0.5*2 + 0.6*3}") + +# ============================================================ +# BLOCK 2: ACTIVATION FUNCTIONS +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 2: Activation Functions") +print("=" * 50) + +def relu(x): + """ReLU: zero out negative 
values. Used in microGPT.""" + return max(0, x) + +print("\nReLU (used in microGPT):") +for val in [-2, -1, 0, 1, 2, 3]: + print(f" relu({val:2d}) = {relu(val)}") + +print("\nWhy ReLU? Without it, stacking linear layers = one big linear layer.") +print("ReLU adds 'kinks' so the network can model non-linear patterns.") + +# GELU (used in minGPT/GPT-2) — smoother version of ReLU +def gelu(x): + """GELU: Gaussian Error Linear Unit. Used in GPT-2/minGPT.""" + return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3))) + +print("\nGELU (used in minGPT/GPT-2):") +for val in [-2, -1, 0, 1, 2, 3]: + print(f" gelu({val:2d}) = {gelu(val):.4f}") + +print("\nGELU is smoother than ReLU — slightly negative inputs get small negative outputs") +print("instead of being hard-zeroed. This helps training in practice.") + +# ============================================================ +# BLOCK 3: SOFTMAX +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 3: Softmax") +print("=" * 50) + +def softmax(logits): + """Convert raw scores to probabilities.""" + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] # subtract max for numerical stability + total = sum(exps) + return [e / total for e in exps] + +scores = [2.0, 1.0, 0.5] +probs = softmax(scores) +print(f"\nRaw scores (logits): {scores}") +print(f"After softmax: [{', '.join(f'{p:.4f}' for p in probs)}]") +print(f"Sum of probabilities: {sum(probs):.4f}") +print(f"\nHigher score → higher probability. 
All positive, sum to 1.") + +# Show the effect of "temperature" +print("\nTemperature effect (used during text generation):") +for temp in [0.5, 1.0, 2.0]: + scaled = [s / temp for s in scores] + p = softmax(scaled) + print(f" temp={temp}: [{', '.join(f'{x:.3f}' for x in p)}] ", end="") + if temp < 1: + print("← more confident (peaked)") + elif temp > 1: + print("← more random (flat)") + else: + print("← normal") + +# ============================================================ +# BLOCK 4: RMSNORM / LAYERNORM +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 4: Normalization") +print("=" * 50) + +def rmsnorm(x): + """RMSNorm: scale so root-mean-square = 1. Used in microGPT.""" + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] + +def layernorm(x): + """LayerNorm: shift to mean=0, scale to std=1. Used in minGPT.""" + mean = sum(x) / len(x) + variance = sum((xi - mean) ** 2 for xi in x) / len(x) + scale = (variance + 1e-5) ** -0.5 + return [(xi - mean) * scale for xi in x] + +x = [10.0, 20.0, 30.0, 40.0] +print(f"\nOriginal: {x}") +print(f"RMSNorm: [{', '.join(f'{v:.4f}' for v in rmsnorm(x))}]") +print(f"LayerNorm: [{', '.join(f'{v:.4f}' for v in layernorm(x))}]") + +print("\nWhy normalize? 
Keeps values in a reasonable range.") +print("Without it, values can grow huge or tiny, making training unstable.") + +# ============================================================ +# BLOCK 5: EMBEDDING +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 5: Embedding (Lookup Table)") +print("=" * 50) + +# An embedding is just a table of vectors, one per token +n_embd = 4 # each token becomes a vector of 4 numbers +vocab_size = 5 # we have 5 possible tokens + +# Initialize randomly (these get learned during training) +embedding_table = [[random.gauss(0, 0.1) for _ in range(n_embd)] + for _ in range(vocab_size)] + +print(f"\nEmbedding table ({vocab_size} tokens × {n_embd} dimensions):") +for i, row in enumerate(embedding_table): + print(f" Token {i} → [{', '.join(f'{v:+.3f}' for v in row)}]") + +token_id = 3 +print(f"\nLooking up token {token_id}:") +print(f" → [{', '.join(f'{v:+.3f}' for v in embedding_table[token_id])}]") +print("\nEmbeddings are LEARNED — after training, similar tokens") +print("end up with similar vectors (e.g., 'cat' close to 'dog').") + +print(""" +=== Summary === +These 5 blocks are ALL you need to build GPT: + 1. Linear → combine information (weighted sums) + 2. Activation → add non-linearity (ReLU or GELU) + 3. Softmax → get probabilities for next token + 4. Norm → keep numbers stable (RMSNorm or LayerNorm) + 5. Embedding → turn token IDs into vectors + +Next chapter: Attention — the special sauce of Transformers! +""") diff --git a/gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py b/gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py new file mode 100644 index 0000000..e78c51b --- /dev/null +++ b/gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py @@ -0,0 +1,167 @@ +""" +Chapter 04: The MLP (Multi-Layer Perceptron) + +The MLP is a key component inside every Transformer block. +It processes each token position independently, adding +"thinking capacity" to the model. 
+ +Structure: Linear → ReLU → Linear +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Value class (from Chapter 03) for automatic gradients +# ============================================================ +class Value: + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),)) + def relu(self): + return Value(max(0, self.data), (self,), (float(self.data > 0),)) + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other ** -1 + + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + +# ============================================================ +# MLP: Two linear layers with ReLU in between +# ============================================================ +# In GPT, the MLP has this structure: +# input (n_embd) → Linear (4*n_embd) → ReLU → Linear (n_embd) → output +# It "expands" the representation, processes it, then "compresses" back. 
+ +n_in = 3 # input dimension +n_hidden = 8 # hidden dimension (expanded) +n_out = 3 # output dimension (same as input in GPT) + +# Initialize weights randomly +def make_matrix(rows, cols): + return [[Value(random.gauss(0, 0.3)) for _ in range(cols)] for _ in range(rows)] + +W1 = make_matrix(n_hidden, n_in) # first linear layer +W2 = make_matrix(n_out, n_hidden) # second linear layer + +def linear(x, w): + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +def mlp_forward(x): + """The MLP forward pass: Linear → ReLU → Linear""" + # Step 1: Expand (3 → 8) + h = linear(x, W1) + # Step 2: Activate (apply ReLU) + h = [hi.relu() for hi in h] + # Step 3: Compress (8 → 3) + out = linear(h, W2) + return out + +# ============================================================ +# Demo: Run a vector through the MLP +# ============================================================ +print("=" * 50) +print("MLP (Multi-Layer Perceptron) Demo") +print("=" * 50) + +x = [Value(1.0), Value(0.5), Value(-0.3)] + +print(f"\nInput ({n_in} values): [{', '.join(f'{v.data:.2f}' for v in x)}]") + +h = linear(x, W1) +print(f"After Linear1 ({n_hidden} values): [{', '.join(f'{v.data:.2f}' for v in h)}]") + +h_relu = [hi.relu() for hi in h] +print(f"After ReLU ({n_hidden} values): [{', '.join(f'{v.data:.2f}' for v in h_relu)}]") + +out = linear(h_relu, W2) +print(f"After Linear2 ({n_out} values): [{', '.join(f'{v.data:.2f}' for v in out)}]") + +# ============================================================ +# Train the MLP to learn a simple function +# ============================================================ +print("\n" + "=" * 50) +print("Training an MLP to learn: [a, b, c] → [a+b, b+c, a+c]") +print("=" * 50) + +# Training data +data = [ + ([1.0, 2.0, 3.0], [3.0, 5.0, 4.0]), + ([0.5, 1.0, 0.5], [1.5, 1.5, 1.0]), + ([2.0, 0.0, 1.0], [2.0, 1.0, 3.0]), + ([1.0, 1.0, 1.0], [2.0, 2.0, 2.0]), + ([0.0, 3.0, 2.0], [3.0, 5.0, 2.0]), +] + +# Collect all parameters for gradient descent 
+params = [p for row in W1 for p in row] + [p for row in W2 for p in row] +print(f"Number of parameters: {len(params)}") + +lr = 0.005 +for step in range(200): + # Forward pass on all data points + total_loss = Value(0) + for x_data, y_target in data: + x = [Value(v) for v in x_data] + y_pred = mlp_forward(x) + for yp, yt in zip(y_pred, y_target): + total_loss = total_loss + (yp - yt) ** 2 + + total_loss = total_loss * (1.0 / (len(data) * n_out)) + + # Backward pass + total_loss.backward() + + # Update parameters + for p in params: + p.data -= lr * p.grad + p.grad = 0 + + if step % 20 == 0: + print(f" Step {step:3d} | Loss: {total_loss.data:.4f}") + +# Test +print("\nAfter training:") +for x_data, y_target in data[:3]: + x = [Value(v) for v in x_data] + y_pred = mlp_forward(x) + pred = [f"{v.data:.2f}" for v in y_pred] + target = [f"{v:.2f}" for v in y_target] + print(f" Input: {x_data} → Predicted: [{', '.join(pred)}] | Target: [{', '.join(target)}]") + +print(""" +=== In GPT === +Every transformer block contains an MLP just like this. + - Input dimension: n_embd (e.g., 768 for GPT-2) + - Hidden dimension: 4 * n_embd (e.g., 3072) + - The MLP processes each token position independently + - It adds "thinking" — the ability to transform representations + +The MLP is the "feedforward" part. Next: Attention is the "communication" part. +""") diff --git a/gpt/local/course/ch05_Attention_and_Transformers/README.md b/gpt/local/course/ch05_Attention_and_Transformers/README.md new file mode 100644 index 0000000..050e01d --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/README.md @@ -0,0 +1,95 @@ +# Chapter 05: Attention & The Transformer Architecture + +## The Key Insight + +The MLP from Chapter 04 processes each token **independently**. But language requires **context** — the meaning of "bank" depends on whether we're talking about rivers or money. + +**Attention** lets each token look at all previous tokens and decide which ones are relevant. 
+ +--- + +## Attention in Plain English + +Imagine you're reading: "The cat sat on the ___" + +To predict the next word, you need to look back at the whole sentence. But not all words matter equally: +- "cat" is very relevant (what's sitting?) +- "The" is less relevant +- "sat on" tells you it's about a location + +Attention is a mechanism that **automatically learns** which previous tokens to focus on. + +--- + +## How Attention Works: Query, Key, Value + +Each token produces three vectors: +- **Query (Q)**: "What am I looking for?" +- **Key (K)**: "What do I contain?" +- **Value (V)**: "What information do I provide?" + +The process: +1. Each token's Query is compared against all previous tokens' Keys (dot product) +2. The dot products become **attention weights** (via softmax) +3. The weighted sum of Values is the output + +``` +Token "___" asks: "Who should I pay attention to?" + - Q("___") · K("cat") = 0.7 (high match!) + - Q("___") · K("The") = 0.1 (low match) + - Q("___") · K("sat") = 0.5 (medium match) + - Q("___") · K("on") = 0.6 (medium-high match) + - Q("___") · K("the") = 0.1 (low match) + +After softmax: weights = [0.26, 0.14, 0.21, 0.24, 0.14] (sums to 1, up to rounding) +Output = weighted sum of all Value vectors +``` + +--- + +## Multi-Head Attention + +Instead of one set of Q/K/V, GPT uses **multiple heads** (e.g., 4 or 12). Each head can learn to pay attention to different things: +- Head 1 might focus on syntax (subject-verb agreement) +- Head 2 might focus on nearby context +- Head 3 might focus on semantic similarity + +The outputs of all heads are concatenated. + +--- + +## Causal (Masked) Attention + +GPT is **autoregressive** — it generates text left to right. So token at position 5 can only attend to positions 0-4, never to future positions. This is enforced by masking. 
+ +--- + +## The Transformer Block + +One transformer block = Attention + MLP, with residual connections and normalization: + +``` +Input + │ + ├──→ Norm → Multi-Head Attention ──┐ + │ │ + └───────────────────── (+) ←────────┘ ← Residual connection + │ + ├──→ Norm → MLP (Linear → ReLU → Linear) ──┐ + │ │ + └─────────────────────────────── (+) ←────────┘ ← Residual connection + │ +Output +``` + +GPT stacks multiple blocks: microGPT uses 1, GPT-2 uses 12-48. + +--- + +## Run the Examples + +```bash +python attention_basics.py # Step-by-step attention computation +python multi_head_attention.py # Multi-head attention from scratch +python transformer_block.py # A complete transformer block +``` diff --git a/gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py b/gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py new file mode 100644 index 0000000..dffecfe --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py @@ -0,0 +1,184 @@ +""" +Chapter 05: Attention — Step by Step + +This builds single-head attention from scratch, showing every +intermediate value so you can see exactly what's happening. +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# SETUP: A tiny sequence of 3 tokens, each with 4-dimensional embeddings +# ============================================================ +# Imagine we're processing: "the cat sat" +# After embedding, each token is a vector of numbers. 
+ +seq_len = 3 +d_model = 4 # embedding dimension + +# Pretend these are embeddings (normally learned, here random for demo) +tokens = { + 0: "the", + 1: "cat", + 2: "sat", +} + +# Token embeddings (what the model "sees") +X = [ + [1.0, 0.5, -0.3, 0.8], # "the" + [0.2, 1.2, 0.7, -0.1], # "cat" + [-0.5, 0.3, 1.0, 0.6], # "sat" +] + +print("=" * 60) +print("Single-Head Self-Attention — Step by Step") +print("=" * 60) +print(f"\nSequence: {[tokens[i] for i in range(seq_len)]}") +print(f"Embedding dimension: {d_model}") +print(f"\nToken embeddings (X):") +for i in range(seq_len): + print(f" {tokens[i]:>5}: {X[i]}") + +# ============================================================ +# STEP 1: Project to Q, K, V using weight matrices +# ============================================================ +# In practice, these are learned weight matrices. +# Here we use simple random values for illustration. + +head_dim = d_model # In multi-head, this would be d_model // n_heads + +# Weight matrices (would be learned during training) +Wq = [[random.gauss(0, 0.5) for _ in range(d_model)] for _ in range(head_dim)] +Wk = [[random.gauss(0, 0.5) for _ in range(d_model)] for _ in range(head_dim)] +Wv = [[random.gauss(0, 0.5) for _ in range(d_model)] for _ in range(head_dim)] + +def matmul_vec(w, x): + """Multiply weight matrix by input vector.""" + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +# Compute Q, K, V for each token +Q = [matmul_vec(Wq, X[i]) for i in range(seq_len)] +K = [matmul_vec(Wk, X[i]) for i in range(seq_len)] +V = [matmul_vec(Wv, X[i]) for i in range(seq_len)] + +print("\n--- Step 1: Compute Q (Query), K (Key), V (Value) ---") +for i in range(seq_len): + print(f" {tokens[i]:>5} Q: [{', '.join(f'{v:+.2f}' for v in Q[i])}]") + print(f" {tokens[i]:>5} K: [{', '.join(f'{v:+.2f}' for v in K[i])}]") + print(f" {tokens[i]:>5} V: [{', '.join(f'{v:+.2f}' for v in V[i])}]") + +# ============================================================ +# STEP 2: Compute 
attention scores (Q dot K) +# ============================================================ +print("\n--- Step 2: Attention Scores = Q · K^T / sqrt(d) ---") +print(" Each score measures how much token i 'wants to attend to' token j.\n") + +scale = math.sqrt(head_dim) +scores = [[0.0] * seq_len for _ in range(seq_len)] + +for i in range(seq_len): + for j in range(seq_len): + # Dot product of Q[i] and K[j] + dot = sum(Q[i][d] * K[j][d] for d in range(head_dim)) + scores[i][j] = dot / scale + +# Print score matrix +print(f" {'':>10}", end="") +for j in range(seq_len): + print(f"{tokens[j]:>10}", end="") +print() +for i in range(seq_len): + print(f" {tokens[i]:>10}", end="") + for j in range(seq_len): + print(f"{scores[i][j]:>10.3f}", end="") + print() + +# ============================================================ +# STEP 3: Apply CAUSAL MASK (can't look at future tokens) +# ============================================================ +print("\n--- Step 3: Apply Causal Mask (hide future) ---") +print(" GPT generates left-to-right, so each token can only see itself and earlier tokens.\n") + +masked_scores = [[0.0] * seq_len for _ in range(seq_len)] +for i in range(seq_len): + for j in range(seq_len): + if j > i: # future token — mask it! + masked_scores[i][j] = float('-inf') + else: + masked_scores[i][j] = scores[i][j] + +print(f" {'':>10}", end="") +for j in range(seq_len): + print(f"{tokens[j]:>10}", end="") +print() +for i in range(seq_len): + print(f" {tokens[i]:>10}", end="") + for j in range(seq_len): + if masked_scores[i][j] == float('-inf'): + print(f"{' -inf':>10}", end="") + else: + print(f"{masked_scores[i][j]:>10.3f}", end="") + print() + +# ============================================================ +# STEP 4: Softmax → attention weights (probabilities) +# ============================================================ +print("\n--- Step 4: Softmax → Attention Weights ---") +print(" Convert scores to probabilities. 
-inf becomes 0 (masked out).\n") + +def softmax(logits): + max_val = max(v for v in logits if v != float('-inf')) + exps = [math.exp(v - max_val) if v != float('-inf') else 0.0 for v in logits] + total = sum(exps) + return [e / total if total > 0 else 0.0 for e in exps] + +attn_weights = [softmax(masked_scores[i]) for i in range(seq_len)] + +print(f" {'':>10}", end="") +for j in range(seq_len): + print(f"{tokens[j]:>10}", end="") +print() +for i in range(seq_len): + print(f" {tokens[i]:>10}", end="") + for j in range(seq_len): + print(f"{attn_weights[i][j]:>10.3f}", end="") + print(f" (sum={sum(attn_weights[i]):.3f})") + +print("\n Interpretation:") +print(f" '{tokens[0]}' attends to: only itself (first token, nothing before it)") +print(f" '{tokens[1]}' attends to: '{tokens[0]}' and itself") +print(f" '{tokens[2]}' attends to: all three tokens") + +# ============================================================ +# STEP 5: Weighted sum of Values → output +# ============================================================ +print("\n--- Step 5: Output = Attention Weights × Values ---") +print(" Each token's output is a weighted blend of all attended Value vectors.\n") + +output = [] +for i in range(seq_len): + out_i = [0.0] * head_dim + for j in range(seq_len): + for d in range(head_dim): + out_i[d] += attn_weights[i][j] * V[j][d] + output.append(out_i) + +for i in range(seq_len): + print(f" {tokens[i]:>5} output: [{', '.join(f'{v:+.3f}' for v in output[i])}]") + +print(""" +=== Summary === +Attention in 5 steps: + 1. Project each token to Q, K, V vectors + 2. Score: how much does each Q match each K? (dot product) + 3. Mask: hide future tokens (causal / autoregressive) + 4. Softmax: convert scores to weights (probabilities) + 5. Output: weighted sum of V vectors + +Key insight: The model LEARNS the W_q, W_k, W_v matrices during training. +After training, the attention patterns emerge automatically — the model +discovers which tokens are relevant to which other tokens. 
+""") diff --git a/gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py b/gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py new file mode 100644 index 0000000..dec5b70 --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py @@ -0,0 +1,151 @@ +""" +Chapter 05: Multi-Head Attention + +Instead of one attention mechanism, GPT uses MULTIPLE "heads" +that run in parallel. Each head can learn to focus on different +types of relationships. + +This code follows the exact pattern from microGPT (microgpt.py). +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Configuration (matches microGPT's tiny config) +# ============================================================ +n_embd = 8 # embedding dimension +n_head = 2 # number of attention heads +head_dim = n_embd // n_head # = 4 dimensions per head +seq_len = 4 # sequence length + +print("=" * 60) +print("Multi-Head Attention") +print("=" * 60) +print(f" Embedding dim (n_embd): {n_embd}") +print(f" Number of heads: {n_head}") +print(f" Dimension per head: {head_dim}") +print(f" Sequence length: {seq_len}") + +# ============================================================ +# Initialize weight matrices +# ============================================================ +def make_matrix(rows, cols): + return [[random.gauss(0, 0.3) for _ in range(cols)] for _ in range(rows)] + +Wq = make_matrix(n_embd, n_embd) # Query projection +Wk = make_matrix(n_embd, n_embd) # Key projection +Wv = make_matrix(n_embd, n_embd) # Value projection +Wo = make_matrix(n_embd, n_embd) # Output projection + +def linear(x, w): + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +# ============================================================ +# 
Generate token embeddings (pretend these came from earlier layers) +# ============================================================ +X = [[random.gauss(0, 0.5) for _ in range(n_embd)] for _ in range(seq_len)] + +# ============================================================ +# Multi-Head Attention (KV-cache style, like microGPT) +# ============================================================ +# microGPT processes one token at a time, accumulating keys/values +# This is more memory efficient and matches how inference works. + +print("\n--- Processing tokens one at a time (KV-cache style) ---\n") + +all_keys = [] # accumulated keys +all_values = [] # accumulated values +outputs = [] + +for pos in range(seq_len): + x = X[pos] + + # Project current token to Q, K, V (full n_embd dimension) + q = linear(x, Wq) + k = linear(x, Wk) + v = linear(x, Wv) + + # Store K and V for future tokens to attend to + all_keys.append(k) + all_values.append(v) + + # Now run each attention head + x_attn = [] # will collect outputs from all heads + + for h in range(n_head): + # Each head gets a SLICE of the Q, K, V vectors + hs = h * head_dim # start index for this head + + # Slice out this head's portion + q_h = q[hs:hs + head_dim] + k_h = [ki[hs:hs + head_dim] for ki in all_keys] # all stored keys + v_h = [vi[hs:hs + head_dim] for vi in all_values] # all stored values + + # Compute attention scores: Q dot K for all positions up to current + attn_logits = [] + for t in range(len(k_h)): + dot = sum(q_h[j] * k_h[t][j] for j in range(head_dim)) + attn_logits.append(dot / head_dim ** 0.5) + + # Softmax to get weights + attn_weights = softmax(attn_logits) + + # Weighted sum of values + head_out = [] + for j in range(head_dim): + val = sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) + head_out.append(val) + + x_attn.extend(head_out) # Concatenate head outputs + + if pos == seq_len - 1: # Print details for last token + print(f" Token {pos}, Head {h}:") + print(f" Attention weights: [{', 
'.join(f'{w:.3f}' for w in attn_weights)}]") + print(f" Head output: [{', '.join(f'{v:.3f}' for v in head_out)}]") + + # Final output projection: combine all heads + out = linear(x_attn, Wo) + outputs.append(out) + +print(f"\n--- Outputs ---") +for pos in range(seq_len): + print(f" Token {pos}: [{', '.join(f'{v:+.3f}' for v in outputs[pos][:4])}...]") + +# ============================================================ +# Explain Multi-Head +# ============================================================ +print(f""" +=== Why Multiple Heads? === + +With {n_head} heads, each getting {head_dim} dimensions: + - Head 0 uses dimensions [0:{head_dim}] of Q, K, V + - Head 1 uses dimensions [{head_dim}:{n_embd}] of Q, K, V + +Each head learns DIFFERENT attention patterns: + - Head 0 might learn to attend to the previous token + - Head 1 might learn to attend to the subject of the sentence + +After computing, head outputs are CONCATENATED: + [{head_dim} dims from head 0] + [{head_dim} dims from head 1] = [{n_embd} dims total] + +Then a final linear layer (Wo) mixes the head outputs. + +=== KV-Cache === + +Notice how we process ONE token at a time and store its K, V: + - Token 0: compute K0, V0, store them + - Token 1: compute K1, V1, attend to [K0, K1], [V0, V1] + - Token 2: compute K2, V2, attend to [K0, K1, K2], [V0, V1, V2] + +This is exactly how microGPT works (and how real GPT inference works). +The "KV-cache" avoids recomputing keys and values for earlier tokens. 
+""") diff --git a/gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py b/gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py new file mode 100644 index 0000000..454b163 --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py @@ -0,0 +1,207 @@ +""" +Chapter 05: A Complete Transformer Block + +This combines everything from Chapters 04 and 05: + - RMSNorm + - Multi-Head Attention + - MLP (Linear → ReLU → Linear) + - Residual Connections + +This is the EXACT structure of one layer in microGPT. +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Configuration +# ============================================================ +n_embd = 8 +n_head = 2 +head_dim = n_embd // n_head + +# ============================================================ +# Helper functions (from earlier chapters) +# ============================================================ +def linear(x, w): + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +def rmsnorm(x): + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] + +def relu(x): + return max(0, x) + +# ============================================================ +# Initialize all weights for one transformer block +# ============================================================ +def make_matrix(rows, cols): + return [[random.gauss(0, 0.3) for _ in range(cols)] for _ in range(rows)] + +weights = { + 'attn_wq': make_matrix(n_embd, n_embd), + 'attn_wk': make_matrix(n_embd, n_embd), + 'attn_wv': make_matrix(n_embd, n_embd), + 'attn_wo': make_matrix(n_embd, n_embd), + 'mlp_fc1': make_matrix(4 * n_embd, n_embd), # expand + 'mlp_fc2': make_matrix(n_embd, 4 * n_embd), # compress +} + +# 
============================================================ +# THE TRANSFORMER BLOCK +# ============================================================ +def transformer_block(x, keys_cache, values_cache, pos): + """ + Process one token through one transformer block. + + Args: + x: input embedding vector [n_embd] + keys_cache: list of previous key vectors + values_cache: list of previous value vectors + pos: position index (for printing) + + Returns: + output vector [n_embd] + """ + # ---- Part 1: Multi-Head Attention ---- + x_residual = x[:] # save for residual connection + + # Pre-norm + x_normed = rmsnorm(x) + + # Compute Q, K, V + q = linear(x_normed, weights['attn_wq']) + k = linear(x_normed, weights['attn_wk']) + v = linear(x_normed, weights['attn_wv']) + + # Add current K, V to cache + keys_cache.append(k) + values_cache.append(v) + + # Multi-head attention + x_attn = [] + for h in range(n_head): + hs = h * head_dim + q_h = q[hs:hs + head_dim] + k_h = [ki[hs:hs + head_dim] for ki in keys_cache] + v_h = [vi[hs:hs + head_dim] for vi in values_cache] + + # Attention scores + attn_logits = [ + sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim ** 0.5 + for t in range(len(k_h)) + ] + attn_weights = softmax(attn_logits) + + # Weighted sum of values + head_out = [ + sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) + for j in range(head_dim) + ] + x_attn.extend(head_out) + + # Output projection + x = linear(x_attn, weights['attn_wo']) + + # Residual connection: ADD the input back + x = [a + b for a, b in zip(x, x_residual)] + + # ---- Part 2: MLP ---- + x_residual = x[:] # save for residual connection + + # Pre-norm + x_normed = rmsnorm(x) + + # MLP: expand → activate → compress + h = linear(x_normed, weights['mlp_fc1']) # n_embd → 4*n_embd + h = [relu(hi) for hi in h] # ReLU activation + x_mlp = linear(h, weights['mlp_fc2']) # 4*n_embd → n_embd + + # Residual connection: ADD the input back + x = [a + b for a, b in zip(x_mlp, x_residual)] + + return x 
+ +# ============================================================ +# DEMO: Process a sequence through the transformer block +# ============================================================ +print("=" * 60) +print("Complete Transformer Block Demo") +print("=" * 60) + +# Fake token embeddings +tokens = ["The", "cat", "sat", "on"] +X = [[random.gauss(0, 0.5) for _ in range(n_embd)] for _ in range(len(tokens))] + +keys_cache = [] +values_cache = [] + +print(f"\nProcessing {len(tokens)} tokens through one transformer block:\n") + +for pos, token in enumerate(tokens): + x_in = X[pos] + x_out = transformer_block(x_in, keys_cache, values_cache, pos) + + # Compute how much the representation changed + change = sum((a - b) ** 2 for a, b in zip(x_in, x_out)) ** 0.5 + + print(f" Token {pos} ('{token}'):") + print(f" Input: [{', '.join(f'{v:+.3f}' for v in x_in[:4])}...]") + print(f" Output: [{', '.join(f'{v:+.3f}' for v in x_out[:4])}...]") + print(f" Change magnitude: {change:.4f}") + print(f" Attended to {pos + 1} token(s)") + +print(""" +=== Anatomy of a Transformer Block === + + Input x + │ + ├─── save as residual + │ + ▼ + RMSNorm(x) + │ + ▼ + Multi-Head Attention (look at previous tokens) + │ + ▼ + + residual ← Information highway! Original info preserved + │ + ├─── save as residual + │ + ▼ + RMSNorm(x) + │ + ▼ + MLP: Linear(4x) → ReLU → Linear(1x) (per-token processing) + │ + ▼ + + residual ← Information highway again + │ + ▼ + Output x + +=== Why Residual Connections? === +Without them, information from early tokens would get lost. +The residual connection ensures the original signal always +has a direct path through the network. 
Think of it as: + "Output = Original + What I Learned" + +=== GPT = Stack of These Blocks === + - microGPT: 1 block (tiny, for learning) + - GPT-2 small: 12 blocks + - GPT-2 XL: 48 blocks + - GPT-3: 96 blocks + +More blocks = more processing steps = more "thinking" +""") diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/README.md b/gpt/local/course/ch06_Training_Loop_and_Optimization/README.md new file mode 100644 index 0000000..6823f40 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/README.md @@ -0,0 +1,67 @@ +# Chapter 06: Training — How Models Learn + +## The Training Loop + +Every neural network trains with the same loop: + +``` +for each training step: + 1. Pick a training example + 2. Forward pass: run the model, compute the loss + 3. Backward pass: compute gradients (autograd) + 4. Update: adjust parameters to reduce the loss +``` + +--- + +## Loss Function: Cross-Entropy + +For language models, the loss measures: **"How surprised was the model by the correct next token?"** + +``` +Model predicts: P("the")=0.6, P("a")=0.3, P("cat")=0.1 +Correct answer: "the" + +Loss = -log(0.6) = 0.51 (low loss — model was fairly confident) + +Model predicts: P("the")=0.1, P("a")=0.1, P("cat")=0.8 +Correct answer: "the" + +Loss = -log(0.1) = 2.30 (high loss — model was wrong!) +``` + +Lower loss = better predictions. Training minimizes the average loss. + +--- + +## The Optimizer: Adam + +Simple gradient descent updates parameters as: `p -= learning_rate * gradient` + +**Adam** is smarter — it maintains: +- **Momentum (m)**: Running average of gradients (direction smoothing) +- **Velocity (v)**: Running average of squared gradients (per-parameter learning rate) + +This makes training faster and more stable. 
+ +--- + +## Learning Rate + +The learning rate controls step size: +- **Too high**: Parameters overshoot, loss explodes +- **Too low**: Training is painfully slow +- **Just right**: Loss decreases steadily + +Common practice: **start with a reasonable LR and decay it** over training. +microGPT uses linear decay: `lr = lr_start * (1 - step/total_steps)` + +--- + +## Run the Examples + +```bash +python cross_entropy.py # Understanding the loss function +python adam_optimizer.py # How Adam works, with visualization +python training_loop.py # A complete mini training loop +``` diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py b/gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py new file mode 100644 index 0000000..75d7ee1 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py @@ -0,0 +1,124 @@ +""" +Chapter 06: The Adam Optimizer + +Adam is the optimizer used in both microGPT and minGPT. +It's smarter than plain gradient descent because it adapts +the learning rate for each parameter individually. + +This demo compares plain SGD vs Adam on a simple problem. +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Problem: Minimize f(x, y) = (x - 3)^2 + 10*(y - 7)^2 +# The minimum is at (3, 7). Can Adam find it? +# ============================================================ + +def f(x, y): + """Function to minimize. 
Minimum at (3, 7).""" + return (x - 3) ** 2 + 10 * (y - 7) ** 2 + +def grad_f(x, y): + """Gradient of f.""" + return 2 * (x - 3), 20 * (y - 7) + +# ============================================================ +# Method 1: Plain Gradient Descent (SGD) +# ============================================================ +print("=" * 60) +print("Method 1: Plain Gradient Descent (SGD)") +print("=" * 60) + +x, y = 0.0, 0.0 # start far from the minimum +lr = 0.01 + +for step in range(50): + gx, gy = grad_f(x, y) + x -= lr * gx + y -= lr * gy + if step % 5 == 0: + print(f" Step {step:3d}: x={x:.4f}, y={y:.4f}, f={f(x, y):.4f}") + +print(f" Final: x={x:.4f} (target: 3.0), y={y:.4f} (target: 7.0)") + +# ============================================================ +# Method 2: Adam Optimizer +# ============================================================ +print("\n" + "=" * 60) +print("Method 2: Adam Optimizer") +print("=" * 60) + +x, y = 0.0, 0.0 # same starting point +lr = 0.1 +beta1, beta2, eps = 0.9, 0.99, 1e-8 + +# Adam's extra state: momentum and velocity for each parameter +mx, my = 0.0, 0.0 # first moment (momentum) +vx, vy = 0.0, 0.0 # second moment (velocity) + +for step in range(50): + gx, gy = grad_f(x, y) + + # Update momentum (exponential moving average of gradients) + mx = beta1 * mx + (1 - beta1) * gx + my = beta1 * my + (1 - beta1) * gy + + # Update velocity (exponential moving average of squared gradients) + vx = beta2 * vx + (1 - beta2) * gx ** 2 + vy = beta2 * vy + (1 - beta2) * gy ** 2 + + # Bias correction (important for early steps when m and v are near 0) + mx_hat = mx / (1 - beta1 ** (step + 1)) + my_hat = my / (1 - beta1 ** (step + 1)) + vx_hat = vx / (1 - beta2 ** (step + 1)) + vy_hat = vy / (1 - beta2 ** (step + 1)) + + # Update parameters + x -= lr * mx_hat / (vx_hat ** 0.5 + eps) + y -= lr * my_hat / (vy_hat ** 0.5 + eps) + + if step % 5 == 0: + print(f" Step {step:3d}: x={x:.4f}, y={y:.4f}, f={f(x, y):.4f}") + +print(f" Final: x={x:.4f} (target: 
3.0), y={y:.4f} (target: 7.0)") + +# ============================================================ +# Explain what Adam does +# ============================================================ +print(f""" +=== Why Adam is Better === + +Plain SGD: every parameter uses the SAME learning rate. + - The y direction has steep gradients (10x) → it overshoots or oscillates + - The x direction has gentle gradients → it moves too slowly + - You can't make both happy with one learning rate + +Adam: ADAPTS the learning rate per parameter. + - Momentum (m): smooths out gradient noise, like a rolling ball + - Velocity (v): tracks gradient magnitude per parameter + - Parameters with large gradients get SMALLER steps + - Parameters with small gradients get LARGER steps + +=== Adam in microGPT (line 176-182) === + + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1 - beta1) * p.grad # momentum + v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 # velocity + m_hat = m[i] / (1 - beta1 ** (step + 1)) # bias correction + v_hat = v[i] / (1 - beta2 ** (step + 1)) # bias correction + p.data -= lr * m_hat / (v_hat ** 0.5 + eps) # update! + p.grad = 0 # reset for next step + +Same algorithm, just applied to thousands of parameters at once. + +=== Learning Rate Decay === + +microGPT also decays the learning rate linearly: + lr_t = lr * (1 - step / total_steps) + +This means: take big steps early (explore), small steps late (fine-tune). +""") diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py b/gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py new file mode 100644 index 0000000..bcf6925 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py @@ -0,0 +1,121 @@ +""" +Chapter 06: Cross-Entropy Loss — The GPT Loss Function + +The cross-entropy loss measures how "surprised" the model is +by the correct answer. It's the standard loss function for +classification and language modeling. 
+ +Formula: loss = -log(probability_of_correct_token) +""" + +import math + +# ============================================================ +# The Intuition +# ============================================================ +print("=" * 60) +print("Cross-Entropy Loss: How 'Surprised' Is the Model?") +print("=" * 60) + +print(""" +If the model assigns HIGH probability to the correct answer: + loss = -log(0.9) = {:.3f} ← Low loss (good!) + +If the model assigns LOW probability to the correct answer: + loss = -log(0.1) = {:.3f} ← High loss (bad!) + +If the model assigns VERY LOW probability: + loss = -log(0.01) = {:.3f} ← Very high loss (terrible!) +""".format(-math.log(0.9), -math.log(0.1), -math.log(0.01))) + +# ============================================================ +# Step by step: from logits to loss +# ============================================================ +print("=" * 60) +print("From Model Output to Loss (Step by Step)") +print("=" * 60) + +# Pretend our vocabulary is: [a, b, c, d, e] (5 tokens) +vocab = ['a', 'b', 'c', 'd', 'e'] +correct_token = 2 # the correct next token is 'c' + +# Step 1: Model outputs raw scores (logits) +logits = [1.0, 2.0, 4.0, 1.5, 0.5] +print(f"\nStep 1 - Model outputs logits (raw scores):") +for i, (tok, score) in enumerate(zip(vocab, logits)): + marker = " ← correct" if i == correct_token else "" + print(f" '{tok}': {score:.1f}{marker}") + +# Step 2: Softmax converts to probabilities +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +probs = softmax(logits) +print(f"\nStep 2 - Softmax → probabilities:") +for i, (tok, p) in enumerate(zip(vocab, probs)): + bar = "█" * int(p * 40) + marker = " ← correct" if i == correct_token else "" + print(f" '{tok}': {p:.4f} {bar}{marker}") +print(f" Sum: {sum(probs):.4f}") + +# Step 3: Cross-entropy loss +loss = -math.log(probs[correct_token]) +print(f"\nStep 3 - Loss = -log(P(correct)) = 
-log({probs[correct_token]:.4f}) = {loss:.4f}")
+
+# ============================================================
+# What the gradients look like
+# ============================================================
+print("\n" + "=" * 60)
+print("What Gradients Tell the Model")
+print("=" * 60)
+print(f"""
+After computing loss, backpropagation gives us gradients.
+For softmax + cross-entropy, the gradient for each logit is simple:
+
+    gradient[i] = probability[i] - (1 if i is correct else 0)
+""")
+
+for i, (tok, p) in enumerate(zip(vocab, probs)):
+    target = 1.0 if i == correct_token else 0.0
+    grad = p - target
+    direction = "↓ decrease" if grad > 0 else "↑ INCREASE"
+    print(f"  '{tok}': grad = {p:.4f} - {target:.0f} = {grad:+.4f}  → {direction} this score")
+
+print("""
+The gradients push the model to:
+  - INCREASE the score of the correct token ('c')
+  - DECREASE the scores of all incorrect tokens
+  - The adjustment is proportional to how wrong the model was
+""")
+
+# ============================================================
+# Average loss over a sequence
+# ============================================================
+print("=" * 60)
+print("Average Loss Over a Sequence")
+print("=" * 60)
+
+# Simulating predictions for "hello" where model gets better over positions
+# NOTE: the loop below reads probs_for_correct[0] as P(correct), so the
+# correct-token probability is always listed FIRST in each list.
+sequence = [
+    ("h", [0.1, 0.8, 0.05, 0.05]),  # model correctly predicts 'h' with 10%
+    ("e", [0.8, 0.05, 0.05, 0.1]),  # model correctly predicts 'e' with 80%
+    ("l", [0.6, 0.2, 0.1, 0.1]),    # model correctly predicts 'l' with 60%
+    ("l", [0.7, 0.1, 0.1, 0.1]),    # model correctly predicts second 'l' with 70%
+    ("o", [0.7, 0.1, 0.1, 0.1]),    # model correctly predicts 'o' with 70%
+]
+
+print(f"\nPredicting each character in a sequence:")
+losses = []
+for char, probs_for_correct in sequence:
+    p = probs_for_correct[0]  # simplified: first prob is for correct token
+    l = -math.log(max(p, 1e-10))
+    losses.append(l)
+    print(f"  '{char}': P(correct) = {p:.2f}, loss = {l:.4f}")
+
+avg_loss = sum(losses) / 
len(losses) +print(f"\n Average loss = {avg_loss:.4f}") +print(f" This is what microGPT computes: (1/n) * sum(losses)") +print(f"\n Training goal: make this number as small as possible!") diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py b/gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py new file mode 100644 index 0000000..7e88988 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py @@ -0,0 +1,171 @@ +""" +Chapter 06: A Complete Mini Training Loop + +This script trains a tiny neural network on a simple pattern, +using the EXACT same training loop structure as microGPT. + +The pattern: learn to predict the next character in "abcabc..." +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Autograd engine (from Chapter 03) +# ============================================================ +class Value: + __slots__ = ('data', 'grad', '_children', '_local_grads') + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): return Value(self.data**n, (self,), (n * self.data**(n-1),)) + def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),)) + def log(self): return Value(math.log(self.data), (self,), (1/self.data,)) + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other**-1 + def 
backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + +# ============================================================ +# Simple model: a single linear layer (logits = W @ one_hot_input) +# This is basically just a lookup table — the simplest "model" +# ============================================================ +vocab = ['a', 'b', 'c'] +vocab_size = len(vocab) +char_to_id = {ch: i for i, ch in enumerate(vocab)} + +# The training data: repeating "abc" +data = "abcabcabcabcabc" + +# Our "model": a weight matrix that maps each token to logits for next token +# W[i][j] = score for predicting token j when input is token i +matrix = lambda rows, cols: [[Value(random.gauss(0, 0.5)) for _ in range(cols)] + for _ in range(rows)] +W = matrix(vocab_size, vocab_size) +params = [p for row in W for p in row] + +def softmax(logits): + max_val = max(val.data for val in logits) + exps = [(val - max_val).exp() for val in logits] + total = sum(exps) + return [e / total for e in exps] + +# ============================================================ +# Training loop — same structure as microGPT! +# ============================================================ +print("=" * 50) +print("Training a tiny model to predict: a→b, b→c, c→a") +print(f"Parameters: {len(params)}") +print("=" * 50) + +learning_rate = 0.5 +beta1, beta2, eps = 0.85, 0.99, 1e-8 +m = [0.0] * len(params) +v = [0.0] * len(params) + +num_steps = 100 + +for step in range(num_steps): + # 1. Pick a training example (input token → target token) + pos = step % (len(data) - 1) + input_id = char_to_id[data[pos]] + target_id = char_to_id[data[pos + 1]] + + # 2. 
Forward pass: look up the row, get logits, compute loss + logits = W[input_id] # simple lookup: logits for this input + probs = softmax(logits) + loss = -probs[target_id].log() + + # 3. Backward pass + loss.backward() + + # 4. Adam optimizer update + lr_t = learning_rate * (1 - step / num_steps) # linear decay + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1 - beta1) * p.grad + v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 + m_hat = m[i] / (1 - beta1 ** (step + 1)) + v_hat = v[i] / (1 - beta2 ** (step + 1)) + p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps) + p.grad = 0 # reset gradient + + if step % 10 == 0: + print(f" Step {step:3d} | loss {loss.data:.4f} | " + f"input='{data[pos]}' target='{data[pos+1]}' " + f"P(correct)={probs[target_id].data:.3f}") + +# ============================================================ +# Inference: what did the model learn? +# ============================================================ +print("\n" + "=" * 50) +print("After Training: What the Model Learned") +print("=" * 50) + +print("\nPrediction probabilities:") +for input_ch in vocab: + input_id = char_to_id[input_ch] + logits = W[input_id] + probs = softmax(logits) + print(f"\n Given '{input_ch}', predict:") + for j, ch in enumerate(vocab): + bar = "█" * int(probs[j].data * 30) + print(f" '{ch}': {probs[j].data:.3f} {bar}") + +# Generate a sequence! +print("\n" + "=" * 50) +print("Generating text (starting from 'a'):") +print("=" * 50) + +current = char_to_id['a'] +generated = ['a'] +for _ in range(20): + logits = W[current] + probs = softmax(logits) + # Greedy: pick the highest probability token + next_token = max(range(vocab_size), key=lambda i: probs[i].data) + generated.append(vocab[next_token]) + current = next_token + +print(f" {''.join(generated)}") +print(f" (Should be 'abcabcabc...' if training worked!)") + +print(""" +=== This Is microGPT's Training Loop === + +What we just did: + 1. Pick a document, tokenize it ← line 156-158 + 2. 
For each position, predict next token ← line 163-168 + 3. Compute cross-entropy loss ← line 167-169 + 4. Backward pass to get gradients ← line 172 + 5. Adam optimizer updates all parameters ← line 176-182 + +The only difference in real microGPT: + - The model is a full transformer (not a lookup table) + - It processes entire sequences, not single tokens + - It has ~5000 parameters instead of 9 + +But the training loop structure is IDENTICAL. +""") diff --git a/gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md b/gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md new file mode 100644 index 0000000..840c88d --- /dev/null +++ b/gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md @@ -0,0 +1,70 @@ +# Chapter 07: microGPT — Full Walkthrough + +## What is microGPT? + +microGPT is a **complete GPT implementation in 200 lines of pure Python**. No PyTorch, no TensorFlow, no libraries (except `math`, `random`, `os`). It builds everything from scratch: + +- Autograd engine (Chapter 03) +- Neural network layers (Chapter 04) +- Multi-head attention (Chapter 05) +- Training loop with Adam optimizer (Chapter 06) + +It trains on a dataset of baby names and learns to generate new, plausible-sounding names. 
+ +**Source:** `../../8627fe009c40f57531cb18360106ce95/microgpt.py` + +--- + +## The 7 Sections of microGPT + +| Lines | Section | What It Does | +|---|---|---| +| 1-21 | Dataset | Downloads names, shuffles them | +| 23-27 | Tokenizer | Character-level: each letter → integer | +| 29-73 | Autograd | The `Value` class with forward/backward | +| 74-90 | Parameters | Initialize all weight matrices | +| 92-144 | Model | The GPT architecture (attention + MLP) | +| 146-184 | Training | Forward → loss → backward → Adam update | +| 186-200 | Inference | Generate new names from the trained model | + +--- + +## Architecture Summary + +``` +microGPT Config: + n_layer = 1 (1 transformer block) + n_embd = 16 (16-dimensional embeddings) + block_size = 16 (max sequence length) + n_head = 4 (4 attention heads) + head_dim = 4 (16 / 4 = 4 dims per head) + vocab_size = 27 (26 letters + BOS token) +``` + +Total parameters: ~5,000 (vs GPT-2's 124 million!) + +--- + +## Run the Annotated Version + +```bash +python microgpt_annotated.py # The full microGPT with detailed comments +``` + +**Warning:** This takes several minutes to run because it's pure Python (no GPU acceleration). That's the tradeoff for zero dependencies! 
+ +--- + +## Key Differences from "Real" GPT-2 + +| Feature | microGPT | GPT-2 | +|---|---|---| +| Dependencies | None | PyTorch | +| Autograd | Custom `Value` class | PyTorch autograd | +| Normalization | RMSNorm | LayerNorm | +| Activation | ReLU | GELU | +| Biases | None | Yes | +| Layers | 1 | 12-48 | +| Parameters | ~5,000 | 124M-1.5B | +| Training | Single document at a time | Batched | +| Tokenizer | Character-level | BPE (50K vocab) | diff --git a/gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py b/gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py new file mode 100644 index 0000000..7150177 --- /dev/null +++ b/gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py @@ -0,0 +1,382 @@ +""" +Chapter 07: microGPT — Fully Annotated + +This is the COMPLETE microgpt.py by @karpathy, with extensive +annotations for CS students. Every section maps back to the +concepts from Chapters 01-06. + +Original: ../../8627fe009c40f57531cb18360106ce95/microgpt.py + +To run: python microgpt_annotated.py +(Takes several minutes — it's pure Python, no GPU!) +You can reduce num_steps below to speed it up. +""" + +# ===================================================================== +# SECTION 1: IMPORTS & SETUP (no external dependencies!) +# ===================================================================== +# This entire GPT uses ONLY Python standard library. +# "Everything else is just efficiency." — @karpathy + +import os # os.path.exists +import math # math.log, math.exp +import random # random.seed, random.choices, random.gauss, random.shuffle +random.seed(42) # Fixed seed for reproducibility + +# ===================================================================== +# SECTION 2: DATASET (Chapter 01 — The Data) +# ===================================================================== +# Downloads a list of ~32,000 baby names. +# Each name becomes a "document" that the model learns to generate. 
+ +if not os.path.exists('input.txt'): + import urllib.request + names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt' + urllib.request.urlretrieve(names_url, 'input.txt') +docs = [line.strip() for line in open('input.txt') if line.strip()] +random.shuffle(docs) +print(f"num docs: {len(docs)}") +print(f"first 5 names: {docs[:5]}") + +# ===================================================================== +# SECTION 3: TOKENIZER (Chapter 02 — Tokenization) +# ===================================================================== +# Character-level tokenizer: each unique letter → integer +# Plus one special BOS (Beginning of Sequence) token + +uchars = sorted(set(''.join(docs))) # unique characters: ['a', 'b', ..., 'z'] +BOS = len(uchars) # BOS gets the next available ID (26) +vocab_size = len(uchars) + 1 # 27 total tokens +print(f"vocab size: {vocab_size}") +print(f"characters: {uchars}") +print(f"BOS token id: {BOS}") + +# Example: tokenizing "emma" +example = "emma" +example_tokens = [BOS] + [uchars.index(ch) for ch in example] + [BOS] +print(f"'{example}' tokenized: {example_tokens}") + +# ===================================================================== +# SECTION 4: AUTOGRAD ENGINE (Chapter 03 — The Value Class) +# ===================================================================== +# Every number in the model is a Value object that tracks: +# - Its numeric value (.data) +# - Its gradient (.grad) — filled in during backward pass +# - How it was computed (_children, _local_grads) — for chain rule + +class Value: + __slots__ = ('data', 'grad', '_children', '_local_grads') + + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + + # --- ARITHMETIC OPERATIONS --- + # Each operation records the local derivative for the chain rule. 
+ + def __add__(self, other): + # d(a+b)/da = 1, d(a+b)/db = 1 + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + + def __mul__(self, other): + # d(a*b)/da = b, d(a*b)/db = a + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + + def __pow__(self, other): + # d(a^n)/da = n * a^(n-1) + return Value(self.data**other, (self,), (other * self.data**(other-1),)) + + def log(self): + # d(log(a))/da = 1/a + return Value(math.log(self.data), (self,), (1/self.data,)) + + def exp(self): + # d(exp(a))/da = exp(a) + return Value(math.exp(self.data), (self,), (math.exp(self.data),)) + + def relu(self): + # d(relu(a))/da = 1 if a > 0, else 0 + return Value(max(0, self.data), (self,), (float(self.data > 0),)) + + # Convenience methods for operator overloading + def __neg__(self): return self * -1 + def __radd__(self, other): return self + other + def __sub__(self, other): return self + (-other) + def __rsub__(self, other): return other + (-self) + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other**-1 + def __rtruediv__(self, other): return other * self**-1 + + def backward(self): + """Backpropagation: compute gradients via reverse-mode autodiff.""" + # Step 1: Topological sort (ensure children processed before parents) + topo = [] + visited = set() + def build_topo(v): + if v not in visited: + visited.add(v) + for child in v._children: + build_topo(child) + topo.append(v) + build_topo(self) + + # Step 2: Propagate gradients backward through the graph + self.grad = 1 # d(loss)/d(loss) = 1 + for v in reversed(topo): + for child, local_grad in zip(v._children, v._local_grads): + # Chain rule: child.grad += local_derivative * parent.grad + child.grad += local_grad * v.grad + +# ===================================================================== +# SECTION 5: PARAMETER 
INITIALIZATION (Chapter 04 — Weight Matrices) +# ===================================================================== +# The "knowledge" of the model lives in these weight matrices. +# Before training, they're random. After training, they encode patterns. + +n_layer = 1 # 1 transformer block (GPT-2 uses 12-48) +n_embd = 16 # embedding dimension (GPT-2 uses 768-1600) +block_size = 16 # max context length (longest name is 15 chars) +n_head = 4 # attention heads (GPT-2 uses 12-25) +head_dim = n_embd // n_head # = 4 dims per head + +# Helper: create a matrix of Value objects initialized with small random values +matrix = lambda nout, nin, std=0.08: [ + [Value(random.gauss(0, std)) for _ in range(nin)] + for _ in range(nout) +] + +# The state_dict: all learnable parameters organized by name +state_dict = { + 'wte': matrix(vocab_size, n_embd), # Token embeddings: 27 tokens × 16 dims + 'wpe': matrix(block_size, n_embd), # Position embeddings: 16 positions × 16 dims + 'lm_head': matrix(vocab_size, n_embd) # Output layer: maps embeddings → token scores +} + +# Each transformer layer has attention and MLP weights +for i in range(n_layer): + state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd) # Query projection + state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd) # Key projection + state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd) # Value projection + state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd) # Output projection + state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd) # MLP expand (16→64) + state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd) # MLP compress (64→16) + +# Flatten all parameters into one list (for the optimizer) +params = [p for mat in state_dict.values() for row in mat for p in row] +print(f"num params: {len(params)}") + +# ===================================================================== +# SECTION 6: MODEL ARCHITECTURE (Chapter 05 — Attention + MLP) +# 
===================================================================== +# The GPT model: token → embedding → transformer blocks → logits +# Follows GPT-2 with minor changes: RMSNorm instead of LayerNorm, +# no biases, ReLU instead of GELU. + +def linear(x, w): + """Linear layer: matrix-vector multiply (Chapter 04)""" + return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w] + +def softmax(logits): + """Softmax: scores → probabilities (Chapter 04)""" + max_val = max(val.data for val in logits) + exps = [(val - max_val).exp() for val in logits] + total = sum(exps) + return [e / total for e in exps] + +def rmsnorm(x): + """RMSNorm: stabilize values (Chapter 04)""" + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] + +def gpt(token_id, pos_id, keys, values): + """ + Process one token through the GPT model. + + This is called once per token position during both training and inference. + Uses KV-cache: stores keys/values so previous positions aren't recomputed. 
+ + Args: + token_id: integer, which token (0-26) + pos_id: integer, position in sequence (0-15) + keys: list of lists, accumulated key vectors per layer + values: list of lists, accumulated value vectors per layer + + Returns: + logits: list of 27 Values, scores for each possible next token + """ + # STEP 1: Embedding lookup (Chapter 02) + # Each token gets a learned vector + a position vector + tok_emb = state_dict['wte'][token_id] # [n_embd] = [16] + pos_emb = state_dict['wpe'][pos_id] # [n_embd] = [16] + x = [t + p for t, p in zip(tok_emb, pos_emb)] # combine + x = rmsnorm(x) # normalize + + # STEP 2: Transformer blocks (Chapter 05) + for li in range(n_layer): + + # --- ATTENTION BLOCK --- + x_residual = x + x = rmsnorm(x) + + # Project to Q, K, V + q = linear(x, state_dict[f'layer{li}.attn_wq']) + k = linear(x, state_dict[f'layer{li}.attn_wk']) + v = linear(x, state_dict[f'layer{li}.attn_wv']) + + # Store K, V in cache for future tokens + keys[li].append(k) + values[li].append(v) + + # Multi-head attention + x_attn = [] + for h in range(n_head): + hs = h * head_dim # start index for this head's slice + + # Get this head's Q, K, V + q_h = q[hs:hs+head_dim] + k_h = [ki[hs:hs+head_dim] for ki in keys[li]] # all cached keys + v_h = [vi[hs:hs+head_dim] for vi in values[li]] # all cached values + + # Attention scores: Q · K / sqrt(d) + attn_logits = [ + sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 + for t in range(len(k_h)) + ] + + # Attention weights (softmax) + attn_weights = softmax(attn_logits) + + # Weighted sum of values + head_out = [ + sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) + for j in range(head_dim) + ] + x_attn.extend(head_out) # concatenate heads + + # Output projection + residual connection + x = linear(x_attn, state_dict[f'layer{li}.attn_wo']) + x = [a + b for a, b in zip(x, x_residual)] + + # --- MLP BLOCK --- + x_residual = x + x = rmsnorm(x) + x = linear(x, state_dict[f'layer{li}.mlp_fc1']) # expand: 16 → 64 + x 
= [xi.relu() for xi in x] # activate + x = linear(x, state_dict[f'layer{li}.mlp_fc2']) # compress: 64 → 16 + x = [a + b for a, b in zip(x, x_residual)] # residual + + # STEP 3: Output head (map embedding → token scores) + logits = linear(x, state_dict['lm_head']) # [vocab_size] = [27] + return logits + +# ===================================================================== +# SECTION 7: ADAM OPTIMIZER SETUP (Chapter 06) +# ===================================================================== +learning_rate, beta1, beta2, eps_adam = 0.01, 0.85, 0.99, 1e-8 +m = [0.0] * len(params) # momentum buffer +v = [0.0] * len(params) # velocity buffer + +# ===================================================================== +# SECTION 8: TRAINING LOOP (Chapter 06) +# ===================================================================== +# For each step: +# 1. Take a name, tokenize it +# 2. Forward pass: predict each next character +# 3. Compute cross-entropy loss +# 4. Backward pass: compute gradients +# 5. Adam update: adjust parameters + +num_steps = 200 # Reduced from 1000 for faster demo. Increase for better results! +print(f"\nTraining for {num_steps} steps...") +print("(Reduce num_steps in the code if this is too slow)\n") + +for step in range(num_steps): + + # 1. Pick a document (name) and tokenize it + doc = docs[step % len(docs)] + tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] + n = min(block_size, len(tokens) - 1) + + # 2-3. 
Forward pass: predict next token at each position, accumulate loss + keys_cache = [[] for _ in range(n_layer)] + values_cache = [[] for _ in range(n_layer)] + losses = [] + + for pos_id in range(n): + token_id = tokens[pos_id] + target_id = tokens[pos_id + 1] + + # Forward: get logits for this position + logits = gpt(token_id, pos_id, keys_cache, values_cache) + + # Softmax → probabilities + probs = softmax(logits) + + # Cross-entropy loss for this position + loss_t = -probs[target_id].log() + losses.append(loss_t) + + # Average loss over all positions in this name + loss = (1 / n) * sum(losses) + + # 4. Backward pass: compute all gradients + loss.backward() + + # 5. Adam optimizer: update all parameters + lr_t = learning_rate * (1 - step / num_steps) # linear LR decay + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1 - beta1) * p.grad + v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 + m_hat = m[i] / (1 - beta1 ** (step + 1)) + v_hat = v[i] / (1 - beta2 ** (step + 1)) + p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam) + p.grad = 0 # reset for next step + + if step % 20 == 0 or step == num_steps - 1: + print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}") + +# ===================================================================== +# SECTION 9: INFERENCE (Chapter 01 — Generating Text) +# ===================================================================== +# Now the model generates new names that "sound like" the training data. +# Process: start with BOS, predict next token, repeat until BOS again. 
+ +temperature = 0.5 # lower = more conservative, higher = more creative + +print("\n--- inference (new, hallucinated names) ---") +for sample_idx in range(20): + keys_cache = [[] for _ in range(n_layer)] + values_cache = [[] for _ in range(n_layer)] + token_id = BOS + sample = [] + + for pos_id in range(block_size): + logits = gpt(token_id, pos_id, keys_cache, values_cache) + # Apply temperature: divide logits before softmax + probs = softmax([l / temperature for l in logits]) + # Sample from the probability distribution + token_id = random.choices(range(vocab_size), + weights=[p.data for p in probs])[0] + if token_id == BOS: + break # end of name + sample.append(uchars[token_id]) + + print(f"sample {sample_idx+1:2d}: {''.join(sample)}") + +print(""" +=== That's the entire microGPT! === + +200 lines that implement: + ✓ Dataset loading and tokenization + ✓ Automatic differentiation (autograd) + ✓ Transformer architecture (attention + MLP) + ✓ Training with Adam optimizer + ✓ Text generation (inference) + +Next: Chapter 08 shows how PyTorch makes this 100x faster and cleaner. +""") diff --git a/gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md new file mode 100644 index 0000000..91101e4 --- /dev/null +++ b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md @@ -0,0 +1,85 @@ +# Chapter 08: Scaling Up with PyTorch — Introduction to minGPT + +## Why PyTorch? + +microGPT is beautiful for learning, but it's **painfully slow**. Processing one name takes seconds because every single multiplication is a Python function call. + +PyTorch solves this by: +1. **Tensors**: Multi-dimensional arrays that run on GPU (1000x faster math) +2. **Autograd**: Automatic differentiation built-in (no need for our `Value` class) +3. **nn.Module**: Clean way to organize model layers +4. **Optimizers**: Adam, SGD, etc. 
already implemented and optimized + +--- + +## The Same Ideas, Better Tools + +| Concept | microGPT (Pure Python) | minGPT (PyTorch) | +|---|---|---| +| Numbers | `Value(3.14)` | `torch.tensor(3.14)` | +| Gradients | `value.backward()` | `tensor.backward()` | +| Linear layer | `linear(x, w)` (manual loop) | `nn.Linear(in, out)` | +| Matrix multiply | Nested for-loops | `@` operator (GPU-accelerated) | +| Softmax | Manual exp/sum | `F.softmax(x)` | +| Optimizer | Manual Adam code | `torch.optim.AdamW(...)` | + +--- + +## minGPT Project Structure + +``` +minGPT/ +├── mingpt/ +│ ├── model.py ← The GPT model (311 lines) +│ ├── trainer.py ← Training loop (110 lines) +│ ├── bpe.py ← BPE tokenizer (320 lines) +│ └── utils.py ← Config helpers (104 lines) +├── projects/ +│ ├── chargpt/ ← Character-level text generation +│ └── adder/ ← Teaching GPT to add numbers +├── demo.ipynb ← Sorting demo +└── generate.ipynb ← GPT-2 text generation +``` + +--- + +## Key PyTorch Concepts for minGPT + +### Tensors +```python +# A tensor is like a multi-dimensional array +x = torch.tensor([1.0, 2.0, 3.0]) # 1D +W = torch.randn(4, 3) # 2D (matrix) +batch = torch.randn(32, 10, 768) # 3D (batch of sequences) +``` + +### nn.Module +```python +# Every layer inherits from nn.Module +class MyLayer(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(768, 768) # weight matrix inside + + def forward(self, x): + return self.linear(x) +``` + +### Autograd +```python +x = torch.tensor(3.0, requires_grad=True) +y = x ** 2 + 2 * x +y.backward() # computes dy/dx +print(x.grad) # tensor(8.) (= 2*3 + 2) +``` + +--- + +## Run the Examples + +```bash +python pytorch_basics.py # PyTorch fundamentals +python pytorch_vs_manual.py # Side-by-side: manual vs PyTorch +``` + +**Note:** These examples require PyTorch. 
Install with: `pip install torch` diff --git a/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py new file mode 100644 index 0000000..7f03a1a --- /dev/null +++ b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py @@ -0,0 +1,200 @@ +""" +Chapter 08: PyTorch Basics for Understanding minGPT + +This introduces the PyTorch concepts you need to read minGPT's code. +Each section maps to something we built manually in earlier chapters. + +Requires: pip install torch +""" + +try: + import torch + import torch.nn as nn + from torch.nn import functional as F +except ImportError: + print("PyTorch not installed. Run: pip install torch") + print("This chapter requires PyTorch.") + exit(1) + +# ============================================================ +# 1. TENSORS — PyTorch's version of our Value class +# ============================================================ +print("=" * 60) +print("1. Tensors — Multi-dimensional arrays with autograd") +print("=" * 60) + +# Scalars, vectors, matrices, and higher +scalar = torch.tensor(3.14) +vector = torch.tensor([1.0, 2.0, 3.0]) +matrix = torch.randn(3, 4) # 3x4 matrix of random numbers + +print(f" Scalar: {scalar} (shape: {scalar.shape})") +print(f" Vector: {vector} (shape: {vector.shape})") +print(f" Matrix shape: {matrix.shape}") + +# Matrix multiplication — what took nested loops in microGPT +A = torch.randn(2, 3) +B = torch.randn(3, 4) +C = A @ B # matrix multiply — runs on GPU if available! +print(f"\n Matrix multiply: ({list(A.shape)}) @ ({list(B.shape)}) → ({list(C.shape)})") +print(f" This replaces our manual linear() function with nested loops!") + +# ============================================================ +# 2. AUTOGRAD — Automatic gradients (replaces our Value class) +# ============================================================ +print("\n" + "=" * 60) +print("2. 
Autograd — Automatic differentiation") +print("=" * 60) + +# Compare with Chapter 03's Value class +x = torch.tensor(2.0, requires_grad=True) +y = (2 * x + 3) ** 2 # y = (2*2+3)^2 = 49 + +y.backward() # compute dy/dx + +print(f" x = {x.item():.1f}") +print(f" y = (2x + 3)^2 = {y.item():.1f}") +print(f" dy/dx = {x.grad.item():.1f}") +print(f" Manual: 4*(2x+3) = 4*(2*2+3) = 28.0 ✓") + +# ============================================================ +# 3. nn.Linear — Replaces our manual linear() function +# ============================================================ +print("\n" + "=" * 60) +print("3. nn.Linear — Built-in linear layer") +print("=" * 60) + +# In microGPT: linear(x, w) with manual loops +# In minGPT: nn.Linear does the same thing, but GPU-accelerated +layer = nn.Linear(3, 2) # 3 inputs → 2 outputs + +x = torch.tensor([1.0, 2.0, 3.0]) +y = layer(x) # applies y = Wx + b + +print(f" Input: {x.tolist()} (3 values)") +print(f" Output: [{', '.join(f'{v:.4f}' for v in y.tolist())}] (2 values)") +print(f" Weight shape: {list(layer.weight.shape)}") +print(f" Bias shape: {list(layer.bias.shape)}") +print(f" Total params: {layer.weight.numel() + layer.bias.numel()}") + +# ============================================================ +# 4. nn.Embedding — Replaces our embedding lookup table +# ============================================================ +print("\n" + "=" * 60) +print("4. 
nn.Embedding — Token/position lookup table") +print("=" * 60) + +# In microGPT: state_dict['wte'][token_id] +# In minGPT: nn.Embedding(vocab_size, n_embd) +emb = nn.Embedding(10, 4) # 10 tokens, each maps to 4 numbers + +token_ids = torch.tensor([2, 5, 7]) # look up 3 tokens at once +vectors = emb(token_ids) + +print(f" Token IDs: {token_ids.tolist()}") +print(f" Output shape: {list(vectors.shape)} (3 tokens × 4 dims)") +print(f" Token 2 → [{', '.join(f'{v:.3f}' for v in vectors[0].tolist())}]") +print(f" Token 5 → [{', '.join(f'{v:.3f}' for v in vectors[1].tolist())}]") +print(f" Token 7 → [{', '.join(f'{v:.3f}' for v in vectors[2].tolist())}]") + +# ============================================================ +# 5. SOFTMAX & CROSS-ENTROPY +# ============================================================ +print("\n" + "=" * 60) +print("5. Softmax & Cross-Entropy — Built-in and optimized") +print("=" * 60) + +logits = torch.tensor([2.0, 1.0, 0.5]) +probs = F.softmax(logits, dim=-1) +print(f" Logits: {logits.tolist()}") +print(f" Softmax: [{', '.join(f'{p:.4f}' for p in probs.tolist())}]") + +# Cross-entropy loss (combines softmax + negative log likelihood) +logits_batch = torch.tensor([[2.0, 1.0, 0.5]]) # batch of 1 +targets = torch.tensor([0]) # correct answer is token 0 +loss = F.cross_entropy(logits_batch, targets) +print(f" Cross-entropy loss: {loss.item():.4f}") +print(f" Same as: -log(softmax[0]) = -log({probs[0].item():.4f}) = {-probs[0].log().item():.4f}") + +# ============================================================ +# 6. BATCHING — Processing multiple examples at once +# ============================================================ +print("\n" + "=" * 60) +print("6. Batching — The big speed advantage") +print("=" * 60) + +print(""" + microGPT processes ONE token at a time (sequential Python loops). 
+ minGPT processes MANY tokens at once using batch dimensions: + + microGPT: for each token in sequence: logits = gpt(token, pos) + minGPT: logits = model(all_tokens_at_once) # one call! + + Shape conventions in minGPT: + B = batch size (e.g., 32 documents at once) + T = sequence length (e.g., 128 tokens) + C = embedding dim (e.g., 768) + + Input: (B, T) — batch of token ID sequences + After embedding: (B, T, C) — batch of embedding sequences + Logits: (B, T, vocab_size) — predictions at every position +""") + +# Demo: batch processing +B, T, C = 2, 4, 8 # 2 sequences, 4 tokens each, 8-dim embeddings +x = torch.randn(B, T, C) +linear_layer = nn.Linear(C, C) +y = linear_layer(x) # applies to ALL positions in ALL sequences at once! +print(f" Input shape: {list(x.shape)} (B={B}, T={T}, C={C})") +print(f" Output shape: {list(y.shape)} (same shape — processed all at once!)") + +# ============================================================ +# 7. nn.Module — Organizing models +# ============================================================ +print("\n" + "=" * 60) +print("7. 
nn.Module — How minGPT organizes its model") +print("=" * 60) + +class TinyModel(nn.Module): + def __init__(self, n_in, n_hidden, n_out): + super().__init__() + self.layer1 = nn.Linear(n_in, n_hidden) + self.layer2 = nn.Linear(n_hidden, n_out) + + def forward(self, x): + x = F.relu(self.layer1(x)) + x = self.layer2(x) + return x + +model = TinyModel(3, 8, 2) +n_params = sum(p.numel() for p in model.parameters()) +print(f" Model: 3 → 8 → 2") +print(f" Total parameters: {n_params}") +print(f" Parameters are automatically tracked by nn.Module!") + +# Forward and backward in one go +x = torch.randn(5, 3) # batch of 5 inputs +y = model(x) +loss = y.sum() +loss.backward() + +print(f"\n Input shape: {list(x.shape)}") +print(f" Output shape: {list(y.shape)}") +print(f" Gradients computed for all {n_params} parameters!") + +print(""" +=== Key Takeaway === + +PyTorch gives us the same building blocks as microGPT: + Value class → torch.Tensor with autograd + manual linear() → nn.Linear + manual softmax() → F.softmax + manual Adam → torch.optim.AdamW + +But MUCH faster because: + - Operations run on GPU (or optimized CPU) + - Batch processing: handle many sequences at once + - No Python-level loops for math + +Next: Chapter 09 — How minGPT uses all of this to build a full GPT model. +""") diff --git a/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py new file mode 100644 index 0000000..ee31e64 --- /dev/null +++ b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py @@ -0,0 +1,202 @@ +""" +Chapter 08: Side-by-Side — Manual Python vs PyTorch + +Shows the SAME operation implemented both ways, +so you can see exactly what PyTorch is doing for you. 
+ +Requires: pip install torch +""" + +import math +import random + +random.seed(42) + +try: + import torch + import torch.nn as nn + from torch.nn import functional as F + torch.manual_seed(42) +except ImportError: + print("PyTorch not installed. Run: pip install torch") + exit(1) + +# ============================================================ +# COMPARISON 1: Softmax +# ============================================================ +print("=" * 60) +print("Comparison 1: Softmax") +print("=" * 60) + +scores = [2.0, 1.0, 0.5] + +# MANUAL (like microGPT) +max_val = max(scores) +exps = [math.exp(s - max_val) for s in scores] +total = sum(exps) +manual_probs = [e / total for e in exps] + +# PYTORCH (like minGPT) +torch_probs = F.softmax(torch.tensor(scores), dim=-1) + +print(f" Input scores: {scores}") +print(f" Manual: [{', '.join(f'{p:.4f}' for p in manual_probs)}]") +print(f" PyTorch: {torch_probs.tolist()}") +print(f" Match: {all(abs(a - b) < 1e-6 for a, b in zip(manual_probs, torch_probs.tolist()))}") + +# ============================================================ +# COMPARISON 2: Linear Layer +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 2: Linear Layer (Matrix Multiply)") +print("=" * 60) + +x = [1.0, 2.0, 3.0] +W = [[0.1, 0.2, 0.3], + [0.4, 0.5, 0.6]] + +# MANUAL (like microGPT) +manual_out = [sum(wi * xi for wi, xi in zip(row, x)) for row in W] + +# PYTORCH (like minGPT) +x_t = torch.tensor(x) +W_t = torch.tensor(W) +torch_out = (W_t @ x_t).tolist() + +print(f" Input: {x}") +print(f" Manual: {manual_out}") +print(f" PyTorch: {torch_out}") +print(f" Match: {all(abs(a - b) < 1e-6 for a, b in zip(manual_out, torch_out))}") + +# ============================================================ +# COMPARISON 3: Autograd (Gradient Computation) +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 3: Autograd") +print("=" * 60) + +# MANUAL Value class (like 
microGPT) +class Value: + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),)) + def __rmul__(self, other): return self * other + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + +# Manual autograd +a_manual = Value(2.0) +b_manual = Value(3.0) +c_manual = (a_manual * b_manual + 5) ** 2 # (2*3+5)^2 = 121 +c_manual.backward() + +# PyTorch autograd +a_torch = torch.tensor(2.0, requires_grad=True) +b_torch = torch.tensor(3.0, requires_grad=True) +c_torch = (a_torch * b_torch + 5) ** 2 +c_torch.backward() + +print(f" Expression: (a*b + 5)^2 where a=2, b=3") +print(f" Result: manual={c_manual.data:.1f}, pytorch={c_torch.item():.1f}") +print(f" da: manual={a_manual.grad:.1f}, pytorch={a_torch.grad.item():.1f}") +print(f" db: manual={b_manual.grad:.1f}, pytorch={b_torch.grad.item():.1f}") + +# ============================================================ +# COMPARISON 4: Cross-Entropy Loss +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 4: Cross-Entropy Loss") +print("=" * 60) + +logits = [2.0, 1.0, 0.5] +target = 0 # correct token is index 0 + +# MANUAL (like microGPT) +max_val = max(logits) +exps = [math.exp(v - max_val) for v in logits] +total = sum(exps) +probs = [e / 
total for e in exps] +manual_loss = -math.log(probs[target]) + +# PYTORCH (like minGPT) +logits_t = torch.tensor([logits]) # add batch dim +target_t = torch.tensor([target]) +torch_loss = F.cross_entropy(logits_t, target_t) + +print(f" Logits: {logits}, Target: {target}") +print(f" Manual loss: {manual_loss:.6f}") +print(f" PyTorch loss: {torch_loss.item():.6f}") +print(f" Match: {abs(manual_loss - torch_loss.item()) < 1e-5}") + +# ============================================================ +# COMPARISON 5: Speed +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 5: Speed (rough estimate)") +print("=" * 60) + +import time + +size = 100 + +# Manual matrix multiply +W_manual = [[random.random() for _ in range(size)] for _ in range(size)] +x_manual = [random.random() for _ in range(size)] + +start = time.time() +for _ in range(100): + result = [sum(wi * xi for wi, xi in zip(row, x_manual)) for row in W_manual] +manual_time = time.time() - start + +# PyTorch matrix multiply +W_torch = torch.randn(size, size) +x_torch = torch.randn(size) + +start = time.time() +for _ in range(100): + result = W_torch @ x_torch +pytorch_time = time.time() - start + +speedup = manual_time / max(pytorch_time, 1e-10) +print(f" {size}x{size} matrix-vector multiply × 100 iterations:") +print(f" Manual Python: {manual_time*1000:.1f}ms") +print(f" PyTorch: {pytorch_time*1000:.1f}ms") +print(f" Speedup: ~{speedup:.0f}x") +print(f" (On GPU, the speedup would be 100-1000x more!)") + +print(""" +=== Summary === + +microGPT and minGPT implement the SAME algorithm. +The difference is purely in efficiency: + + microGPT: every operation is a Python function call + minGPT: operations are batched into fast tensor ops + +This means: + - microGPT trains for minutes on tiny data + - minGPT can train on Shakespeare, add numbers, sort lists + - Real GPT-2 trains on the entire internet + +The math is identical. The speed is not. 
+""") diff --git a/gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md new file mode 100644 index 0000000..1abf1e2 --- /dev/null +++ b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md @@ -0,0 +1,79 @@ +# Chapter 09: minGPT Model — Deep Dive + +## Overview + +minGPT's model is defined in `mingpt/model.py` (~311 lines). It implements the **exact same architecture** as microGPT but using PyTorch, making it fast enough to train on real tasks. + +**Source:** `../../minGPT/mingpt/model.py` + +--- + +## The Four Classes + +### 1. `NewGELU` (line 21-27) +Activation function — smoother alternative to ReLU. +- microGPT uses `ReLU(x) = max(0, x)` +- minGPT uses `GELU(x) ≈ 0.5 * x * (1 + tanh(...))` + +### 2. `CausalSelfAttention` (line 29-71) +Multi-head attention with causal masking. +- Same math as microGPT's attention, but batched +- Uses a pre-computed triangular mask for causal attention +- All Q/K/V computed in one matrix multiply (efficiency trick) + +### 3. `Block` (line 73-93) +One transformer block = Attention + MLP + residual connections. +- Identical structure to microGPT's transformer block +- Uses LayerNorm instead of RMSNorm + +### 4. `GPT` (line 95-311) +The full model: embeddings + stacked blocks + output head. 
+- Supports multiple model sizes (gpt-nano to gpt2-xl) +- Can load pretrained GPT-2 weights from HuggingFace +- Includes generation (inference) logic + +--- + +## Model Size Configurations + +```python +'gpt-nano': n_layer=3, n_head=3, n_embd=48 # Tiny, for demos +'gpt-micro': n_layer=4, n_head=4, n_embd=128 # Small +'gpt-mini': n_layer=6, n_head=6, n_embd=192 # Medium-small +'gpt2': n_layer=12, n_head=12, n_embd=768 # 124M params (real GPT-2) +'gpt2-xl': n_layer=48, n_head=25, n_embd=1600 # 1.5B params +``` + +--- + +## The Forward Pass (simplified) + +``` +Input token IDs: (batch_size, seq_len) + ↓ +Token Embedding + Position Embedding + ↓ +Dropout + ↓ +Block 0: LayerNorm → Attention → (+) → LayerNorm → MLP → (+) +Block 1: LayerNorm → Attention → (+) → LayerNorm → MLP → (+) + ... +Block N: LayerNorm → Attention → (+) → LayerNorm → MLP → (+) + ↓ +Final LayerNorm + ↓ +Linear head → logits (batch_size, seq_len, vocab_size) + ↓ +Cross-entropy loss (if targets provided) +``` + +--- + +## Run the Examples + +```bash +python model_walkthrough.py # Annotated walkthrough of minGPT's model.py +python model_sizes.py # Compare different GPT configurations +``` + +**Requires:** `pip install torch` diff --git a/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py new file mode 100644 index 0000000..9eed563 --- /dev/null +++ b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py @@ -0,0 +1,146 @@ +""" +Chapter 09: GPT Model Sizes — From Nano to GPT-2 + +Shows how the same architecture scales from tiny to huge +by changing just three numbers: n_layer, n_head, n_embd. + +Requires: pip install torch +""" + +try: + import torch + import torch.nn as nn + import math +except ImportError: + print("PyTorch not installed. 
def count_gpt_params(vocab_size, block_size, n_layer, n_head, n_embd):
    """Analytically count the parameters of a GPT-2-style model.

    Mirrors minGPT's architecture: learned token + position embeddings,
    n_layer transformer blocks (fused-QKV attention, a 4x-expand MLP and
    two LayerNorms each), a final LayerNorm, and an output head counted
    separately (even though real GPT-2 ties it to the token embedding).

    Note: n_head never appears in the arithmetic — splitting the same
    embedding width across more heads does not add weights.

    Returns:
        (total, params, per_layer, layer_total) where `params` maps the
        top-level components to counts and `per_layer` breaks down one block.
    """
    per_layer = {
        # Attention: one fused Q/K/V projection plus the output projection.
        'c_attn_weight': n_embd * (3 * n_embd),
        'c_attn_bias': 3 * n_embd,
        'c_proj_weight': n_embd * n_embd,
        'c_proj_bias': n_embd,
        # MLP: expand to 4*n_embd, activate, compress back.
        'mlp_fc_weight': n_embd * (4 * n_embd),
        'mlp_fc_bias': 4 * n_embd,
        'mlp_proj_weight': (4 * n_embd) * n_embd,
        'mlp_proj_bias': n_embd,
        # Two LayerNorms per block, each a weight + bias vector.
        'ln1': 2 * n_embd,
        'ln2': 2 * n_embd,
    }
    layer_total = sum(per_layer.values())

    params = {
        'wte': vocab_size * n_embd,          # token embedding table
        'wpe': block_size * n_embd,          # position embedding table
        'all_layers': layer_total * n_layer, # all transformer blocks
        'ln_f': 2 * n_embd,                  # final LayerNorm
        'lm_head': n_embd * vocab_size,      # un-tied output projection
    }
    return sum(params.values()), params, per_layer, layer_total
1024, 36, 20, 1280), + ("GPT-2 XL", 50257, 1024, 48, 25, 1600), +] + +print(f"\n{'Model':<16} {'Layers':>6} {'Heads':>6} {'Embd':>6} {'Params':>14} {'Relative':>10}") +print("-" * 70) + +base_params = None +for name, vocab, block, n_layer, n_head, n_embd in configs: + total, _, _, _ = count_gpt_params(vocab, block, n_layer, n_head, n_embd) + if base_params is None: + base_params = total + relative = total / base_params + + if total < 1_000_000: + param_str = f"{total:,}" + else: + param_str = f"{total/1e6:.1f}M" + + print(f"{name:<16} {n_layer:>6} {n_head:>6} {n_embd:>6} {param_str:>14} {relative:>9.0f}x") + +# ============================================================ +# Detailed breakdown of one model +# ============================================================ +print("\n" + "=" * 70) +print("Detailed Breakdown: gpt-mini (n_layer=6, n_head=6, n_embd=192)") +print("=" * 70) + +total, params, per_layer, layer_total = count_gpt_params(50257, 1024, 6, 6, 192) + +print(f"\n Token embeddings (wte): {params['wte']:>10,} ({params['wte']/total*100:.1f}%)") +print(f" Position embeddings (wpe): {params['wpe']:>10,} ({params['wpe']/total*100:.1f}%)") +print(f" All transformer layers: {params['all_layers']:>10,} ({params['all_layers']/total*100:.1f}%)") +print(f" Final LayerNorm: {params['ln_f']:>10,} ({params['ln_f']/total*100:.1f}%)") +print(f" Output head (lm_head): {params['lm_head']:>10,} ({params['lm_head']/total*100:.1f}%)") +print(f" {'─'*40}") +print(f" TOTAL: {total:>10,}") + +print(f"\n Per-layer breakdown ({layer_total:,} params per layer × 6 layers):") +for name, count in per_layer.items(): + print(f" {name:<20} {count:>8,} ({count/layer_total*100:.1f}%)") + +# ============================================================ +# What each parameter "does" +# ============================================================ +print(""" +=== What Each Component Stores === + +Token Embeddings (wte): + A lookup table: token ID → vector. Captures "meaning" of each token. 
+ Similar tokens (e.g., "cat", "dog") end up with similar vectors. + +Position Embeddings (wpe): + A lookup table: position → vector. Captures "where in the sequence". + Without this, the model can't tell position 1 from position 100. + +Attention Weights (c_attn, c_proj): + Control HOW tokens communicate. Q/K weights determine what each + token "looks for" and "advertises". V/output weights control what + information flows between tokens. + +MLP Weights (mlp_fc, mlp_proj): + Per-token processing. Each token's representation gets "refined" + through an expand→activate→compress pipeline. This is where a lot + of the model's "knowledge" is stored (facts, patterns, etc). + +=== The Scaling Insight === + +Notice: most parameters are in the transformer layers (not embeddings). +Doubling n_embd roughly QUADRUPLES the parameter count (because weight +matrices are n_embd × n_embd). This is why bigger models are so expensive. +""") diff --git a/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py new file mode 100644 index 0000000..c21462e --- /dev/null +++ b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py @@ -0,0 +1,263 @@ +""" +Chapter 09: minGPT Model Walkthrough + +This script builds a minGPT model step-by-step, explaining +every component and how it maps to the microGPT concepts +from Chapter 07. + +Source: ../../minGPT/mingpt/model.py + +Requires: pip install torch +""" + +try: + import math + import torch + import torch.nn as nn + from torch.nn import functional as F +except ImportError: + print("PyTorch not installed. 
torch.manual_seed(42)

# ============================================================
# COMPONENT 1: NewGELU Activation
# ============================================================
# microGPT uses ReLU(x) = max(0, x); GPT-2 swaps in the smoother GELU.

print("=" * 60)
print("Component 1: NewGELU Activation")
print("=" * 60)

class NewGELU(nn.Module):
    """GELU (Gaussian Error Linear Unit), tanh approximation.

    The activation GPT-2 uses in place of ReLU: nearly identical for
    positive inputs, but it lets small negative values leak through,
    giving smoother gradients during training.
    """
    def forward(self, x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

gelu = NewGELU()
test_vals = torch.tensor([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
print(f" Input: {test_vals.tolist()}")
print(f" GELU: [{', '.join(f'{v:.3f}' for v in gelu(test_vals).tolist())}]")
print(f" ReLU: [{', '.join(f'{v:.3f}' for v in F.relu(test_vals).tolist())}]")
print(" Notice: GELU is smoother — small negatives get small negative outputs")

# ============================================================
# COMPONENT 2: CausalSelfAttention
# ============================================================
print("\n" + "=" * 60)
print("Component 2: CausalSelfAttention")
print("=" * 60)
+ + microGPT equivalent: the attention section in gpt() function (lines 114-133) + + Key differences from microGPT: + - Processes ALL positions at once (batched), not one-by-one + - Q, K, V computed in ONE matrix multiply (c_attn), not three + - Uses a triangular mask instead of KV-cache for causal masking + - Includes dropout for regularization + """ + def __init__(self, n_embd, n_head, block_size): + super().__init__() + assert n_embd % n_head == 0 + # Combined Q, K, V projection (efficiency: one matmul instead of three) + self.c_attn = nn.Linear(n_embd, 3 * n_embd) + # Output projection + self.c_proj = nn.Linear(n_embd, n_embd) + # Causal mask: lower triangular matrix + self.register_buffer("bias", + torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)) + self.n_head = n_head + self.n_embd = n_embd + + def forward(self, x): + B, T, C = x.size() # batch, sequence length, embedding dim + + # Compute Q, K, V all at once, then split + q, k, v = self.c_attn(x).split(self.n_embd, dim=2) + + # Reshape for multi-head: (B, T, C) → (B, n_head, T, head_dim) + k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + + # Attention scores: (B, nh, T, hs) × (B, nh, hs, T) → (B, nh, T, T) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + + # Causal mask: set future positions to -inf + att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf')) + + # Softmax → attention weights + att = F.softmax(att, dim=-1) + + # Weighted sum of values + y = att @ v # (B, nh, T, T) × (B, nh, T, hs) → (B, nh, T, hs) + + # Concatenate heads: (B, nh, T, hs) → (B, T, C) + y = y.transpose(1, 2).contiguous().view(B, T, C) + + # Output projection + y = self.c_proj(y) + return y + +# Demo +n_embd, n_head, block_size = 16, 4, 8 +attn = CausalSelfAttention(n_embd, n_head, block_size) +x = torch.randn(2, 5, 
n_embd) # batch=2, seq_len=5 +y = attn(x) +print(f" Config: n_embd={n_embd}, n_head={n_head}") +print(f" Input: {list(x.shape)} (batch=2, tokens=5, dim={n_embd})") +print(f" Output: {list(y.shape)} (same shape — attention transforms, doesn't resize)") +print(f" Params: {sum(p.numel() for p in attn.parameters())}") + +print(f"\n Causal mask (first 5×5):") +mask = attn.bias[0, 0, :5, :5] +for i in range(5): + row = ['✓' if mask[i, j] == 1 else '✗' for j in range(5)] + print(f" pos {i}: {' '.join(row)} (can attend to {int(mask[i].sum())} positions)") + +# ============================================================ +# COMPONENT 3: Block (Transformer Block) +# ============================================================ +print("\n" + "=" * 60) +print("Component 3: Transformer Block") +print("=" * 60) + +class Block(nn.Module): + """ + One transformer block: Attention + MLP with residual connections. + + microGPT equivalent: one iteration of the `for li in range(n_layer)` loop + + Structure: + x → LayerNorm → Attention → (+x) → LayerNorm → MLP → (+x) → output + """ + def __init__(self, n_embd, n_head, block_size): + super().__init__() + self.ln_1 = nn.LayerNorm(n_embd) # Pre-attention norm + self.attn = CausalSelfAttention(n_embd, n_head, block_size) + self.ln_2 = nn.LayerNorm(n_embd) # Pre-MLP norm + self.mlp = nn.ModuleDict(dict( + c_fc = nn.Linear(n_embd, 4 * n_embd), # Expand + c_proj = nn.Linear(4 * n_embd, n_embd), # Compress + act = NewGELU(), + )) + + def forward(self, x): + # Attention with residual connection + x = x + self.attn(self.ln_1(x)) + # MLP with residual connection + m = self.mlp + x = x + m.c_proj(m.act(m.c_fc(self.ln_2(x)))) + return x + +block = Block(n_embd, n_head, block_size) +x = torch.randn(2, 5, n_embd) +y = block(x) +n_params = sum(p.numel() for p in block.parameters()) +print(f" Input: {list(x.shape)}") +print(f" Output: {list(y.shape)}") +print(f" Block params: {n_params}") +print(f" Components: LayerNorm + Attention + LayerNorm + 
MLP(expand→GELU→compress)") + +# ============================================================ +# COMPONENT 4: Full GPT Model +# ============================================================ +print("\n" + "=" * 60) +print("Component 4: Full GPT Model") +print("=" * 60) + +class MiniGPT(nn.Module): + """ + Simplified version of minGPT's GPT class for illustration. + Same architecture, just without the config system and pretrained loading. + """ + def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd): + super().__init__() + self.block_size = block_size + + self.transformer = nn.ModuleDict(dict( + wte = nn.Embedding(vocab_size, n_embd), # Token embeddings + wpe = nn.Embedding(block_size, n_embd), # Position embeddings + h = nn.ModuleList([Block(n_embd, n_head, block_size) + for _ in range(n_layer)]), # Transformer blocks + ln_f = nn.LayerNorm(n_embd), # Final norm + )) + self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # Output projection + + def forward(self, idx, targets=None): + B, T = idx.size() + pos = torch.arange(0, T, dtype=torch.long, device=idx.device) + + # Embeddings + tok_emb = self.transformer.wte(idx) # (B, T, n_embd) + pos_emb = self.transformer.wpe(pos) # (T, n_embd) → broadcasts to (B, T, n_embd) + x = tok_emb + pos_emb + + # Transformer blocks + for block in self.transformer.h: + x = block(x) + + # Final norm + output projection + x = self.transformer.ln_f(x) + logits = self.lm_head(x) # (B, T, vocab_size) + + # Compute loss if targets given + loss = None + if targets is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) + + return logits, loss + +# Create a tiny GPT +model = MiniGPT( + vocab_size=27, # 26 letters + BOS (like microGPT) + block_size=16, # max sequence length + n_layer=3, # 3 transformer blocks + n_head=3, # 3 attention heads + n_embd=48, # 48-dim embeddings +) + +n_params = sum(p.numel() for p in model.parameters()) +print(f" Model: vocab=27, block_size=16, layers=3, heads=3, 
embd=48") +print(f" Total parameters: {n_params:,}") + +# Test forward pass +idx = torch.randint(0, 27, (2, 10)) # batch of 2 sequences, 10 tokens each +targets = torch.randint(0, 27, (2, 10)) # target tokens + +logits, loss = model(idx, targets) +print(f"\n Forward pass:") +print(f" Input shape: {list(idx.shape)} (2 sequences × 10 tokens)") +print(f" Logits shape: {list(logits.shape)} (2 × 10 × 27 vocab)") +print(f" Loss: {loss.item():.4f}") +print(f" (Random init → loss ≈ -log(1/27) = {-math.log(1/27):.4f})") + +print(""" +=== How This Maps to microGPT === + +minGPT Component → microGPT Equivalent +───────────────────────────────────────────────── +nn.Embedding(wte) → state_dict['wte'][token_id] +nn.Embedding(wpe) → state_dict['wpe'][pos_id] +CausalSelfAttention → The attention loop in gpt() +Block → One iteration of for li in range(n_layer) +nn.LayerNorm → rmsnorm() +NewGELU → .relu() +nn.Linear(lm_head) → linear(x, state_dict['lm_head']) +F.cross_entropy → -probs[target_id].log() + +Same architecture. Same math. Different scale and speed. +""") diff --git a/gpt/local/course/ch10_Training_and_Inference/README.md b/gpt/local/course/ch10_Training_and_Inference/README.md new file mode 100644 index 0000000..d4f6719 --- /dev/null +++ b/gpt/local/course/ch10_Training_and_Inference/README.md @@ -0,0 +1,80 @@ +# Chapter 10: Training & Inference with minGPT + +## Overview + +This chapter covers the two remaining pieces of minGPT: +1. **Trainer** (`mingpt/trainer.py`) — The training loop +2. **Generation** (`GPT.generate()`) — How the model creates new text + +Plus two real demo projects: +- **Sorting demo** — Teaching GPT to sort numbers +- **CharGPT** — Training on Shakespeare to generate new plays + +--- + +## The Trainer (mingpt/trainer.py) + +minGPT's Trainer is ~110 lines of clean PyTorch boilerplate: + +```python +while True: + x, y = next(data_iter) # 1. Get batch of data + logits, loss = model(x, y) # 2. Forward pass + model.zero_grad() # 3. 
Reset gradients + loss.backward() # 4. Backward pass (autograd) + clip_grad_norm_(model.parameters()) # 5. Prevent gradient explosion + optimizer.step() # 6. Adam update +``` + +Compare to microGPT's training loop — identical structure! + +--- + +## The Optimizer Setup + +minGPT is careful about **weight decay** (regularization): +- **Decay** weights in Linear layers (prevents overfitting) +- **Don't decay** biases, LayerNorm weights, embeddings + +This is handled in `GPT.configure_optimizers()`. + +--- + +## Text Generation (Inference) + +`GPT.generate()` works like microGPT's inference section: + +```python +for each new token to generate: + 1. Forward pass → get logits for last position + 2. Apply temperature (divide logits by temperature) + 3. Optionally apply top-k filtering + 4. Softmax → probabilities + 5. Sample from distribution (or take argmax) + 6. Append new token to sequence +``` + +--- + +## Demo Projects + +### 1. Sorting (demo.ipynb) +Teaches GPT to sort lists of numbers: `[2, 0, 1] → [0, 1, 2]` +- Shows that GPT can learn algorithmic tasks +- Uses gpt-nano (~90K params), trains in ~1 minute + +### 2. CharGPT (projects/chargpt/) +Character-level language model on any text file (e.g., Shakespeare) +- Uses gpt-mini (~1M params) +- Generates new text that mimics the training data style + +--- + +## Run the Examples + +```bash +python trainer_explained.py # Annotated training loop +python generate_text.py # Text generation from scratch +``` + +**Requires:** `pip install torch` diff --git a/gpt/local/course/ch10_Training_and_Inference/generate_text.py b/gpt/local/course/ch10_Training_and_Inference/generate_text.py new file mode 100644 index 0000000..ba35aa1 --- /dev/null +++ b/gpt/local/course/ch10_Training_and_Inference/generate_text.py @@ -0,0 +1,206 @@ +""" +Chapter 10: Text Generation — How GPT Creates Text + +This script explains and demonstrates the text generation process, +mapping to both microGPT's and minGPT's generation code. 
+ +Requires: pip install torch +""" + +try: + import torch + import torch.nn as nn + from torch.nn import functional as F + import math +except ImportError: + print("PyTorch not installed. Run: pip install torch") + exit(1) + +torch.manual_seed(42) + +# ============================================================ +# TEXT GENERATION EXPLAINED +# ============================================================ +print("=" * 60) +print("How GPT Generates Text") +print("=" * 60) +print(""" +Generation is an iterative process: + + 1. Start with a prompt (or just a start token) + 2. Feed it through the model → get logits for ALL vocab tokens + 3. Take only the LAST position's logits (prediction for next token) + 4. Apply temperature and optional top-k filtering + 5. Convert to probabilities (softmax) + 6. Sample a token from the distribution + 7. Append it to the sequence + 8. Repeat from step 2 +""") + +# ============================================================ +# STEP-BY-STEP GENERATION DEMO (no model, just the mechanics) +# ============================================================ +print("=" * 60) +print("Step-by-Step: Temperature & Sampling") +print("=" * 60) + +vocab = ['a', 'b', 'c', 'd', 'e'] +logits = torch.tensor([2.0, 5.0, 1.0, 0.5, 0.1]) + +print(f"\nRaw logits: {logits.tolist()}") +print(f"Vocab: {vocab}") + +# Temperature effect +print("\n--- Temperature Effect ---") +for temp in [0.1, 0.5, 1.0, 1.5, 3.0]: + scaled = logits / temp + probs = F.softmax(scaled, dim=-1) + print(f" temp={temp:.1f}: [{', '.join(f'{p:.3f}' for p in probs.tolist())}]", end="") + if temp < 0.5: + print(" ← very peaked (almost deterministic)") + elif temp < 1.0: + print(" ← confident") + elif temp == 1.0: + print(" ← normal") + else: + print(" ← more random") + +# Top-k filtering +print("\n--- Top-k Filtering ---") +print(" Only consider the k most likely tokens, set rest to -inf") +for k in [1, 2, 3, 5]: + filtered = logits.clone() + if k < len(logits): + v, _ = torch.topk(logits, 
k) + filtered[filtered < v[-1]] = float('-inf') + probs = F.softmax(filtered, dim=-1) + top_tokens = [vocab[i] for i in range(len(vocab)) if probs[i] > 0] + print(f" top_k={k}: [{', '.join(f'{p:.3f}' for p in probs.tolist())}] " + f"candidates: {top_tokens}") + +# Sampling vs Greedy +print("\n--- Sampling vs Greedy ---") +probs = F.softmax(logits, dim=-1) +print(f" Probabilities: [{', '.join(f'{p:.3f}' for p in probs.tolist())}]") + +# Greedy: always pick the most likely +_, greedy_idx = torch.topk(probs, k=1) +print(f" Greedy (argmax): always picks '{vocab[greedy_idx.item()]}' (highest prob)") + +# Sampling: randomly pick, weighted by probability +print(f" Sampling (10 tries): ", end="") +samples = [] +for _ in range(10): + idx = torch.multinomial(probs, num_samples=1) + samples.append(vocab[idx.item()]) +print(' '.join(samples)) +print(f" ('{vocab[1]}' appears most often because it has highest probability)") + +# ============================================================ +# THE GENERATE FUNCTION (from minGPT) +# ============================================================ +print("\n" + "=" * 60) +print("The Generate Function — Code Comparison") +print("=" * 60) + +print(""" +microGPT (lines 189-200): minGPT (model.py lines 283-310): +───────────────────────── ────────────────────────────────── +token_id = BOS idx = starting_tokens +for pos_id in range(block_size): for _ in range(max_new_tokens): + logits = gpt(token_id, pos_id, ...) idx_cond = idx[:, -block_size:] + probs = softmax([l/temp for l in ..]) logits, _ = self(idx_cond) + token_id = random.choices(...) logits = logits[:, -1, :] / temp + if token_id == BOS: break probs = F.softmax(logits, dim=-1) + sample.append(uchars[token_id]) idx_next = torch.multinomial(...) + idx = torch.cat((idx, idx_next), 1) + +Same algorithm: + 1. Get logits from model + 2. Scale by temperature + 3. Softmax → probabilities + 4. Sample next token + 5. 
Append and repeat + +Key difference: minGPT processes in batches and handles +sequences longer than block_size by cropping. +""") + +# ============================================================ +# HOW CHARGPT GENERATES SHAKESPEARE +# ============================================================ +print("=" * 60) +print("Real Application: CharGPT on Shakespeare") +print("=" * 60) + +print(""" +The minGPT project 'chargpt' trains on Shakespeare's text: + + Source: ../../minGPT/projects/chargpt/chargpt.py + + 1. Load text file (e.g., all of Shakespeare) + 2. Build character vocabulary ({'a':0, 'b':1, ..., 'z':25, ' ':26, ...}) + 3. Create a CharDataset: + - Each training example is a random 128-character chunk + - Input: characters [0:127], Target: characters [1:128] + 4. Train gpt-mini (6 layers, 192 dim) for thousands of iterations + 5. Every 500 steps, generate a sample starting from "O God, O God!" + + After training, it produces text like: + "O God, O God! what shall I say to thee? + That I have lost my father, and my friend, + And all my mother's sons, and all my kin..." + + It learned: + - English spelling and grammar + - Shakespeare's vocabulary and style + - Poetic meter (roughly) + - Character names and dialogue patterns + + All from raw character sequences — no rules, no grammar engine, + just pattern matching at scale. 
+""") + +# ============================================================ +# THE COMPLETE GENERATION PIPELINE +# ============================================================ +print("=" * 60) +print("Summary: The Complete Generation Pipeline") +print("=" * 60) + +print(""" + ┌─────────────────────────────────────────────────┐ + │ GENERATION │ + │ │ + │ Prompt: "Hello" │ + │ ↓ │ + │ Tokenize: [H, e, l, l, o] → [7, 4, 11, 11, 14]│ + │ ↓ │ + │ ┌─── LOOP (repeat for each new token) ─────┐ │ + │ │ │ │ + │ │ Feed tokens through GPT model │ │ + │ │ ↓ │ │ + │ │ Get logits at last position │ │ + │ │ ↓ │ │ + │ │ Apply temperature (divide by T) │ │ + │ │ ↓ │ │ + │ │ Optional: top-k filtering │ │ + │ │ ↓ │ │ + │ │ Softmax → probabilities │ │ + │ │ ↓ │ │ + │ │ Sample next token │ │ + │ │ ↓ │ │ + │ │ Append to sequence │ │ + │ │ │ │ + │ └───────────────────────────────────────────┘ │ + │ ↓ │ + │ Decode tokens → text: "Hello world, I am..." │ + │ │ + └─────────────────────────────────────────────────┘ + + Controls: + temperature < 1.0 → more focused, repetitive + temperature > 1.0 → more creative, random + top_k = small → only consider top few options + do_sample = False → always pick best (greedy/deterministic) +""") diff --git a/gpt/local/course/ch10_Training_and_Inference/trainer_explained.py b/gpt/local/course/ch10_Training_and_Inference/trainer_explained.py new file mode 100644 index 0000000..1dbead4 --- /dev/null +++ b/gpt/local/course/ch10_Training_and_Inference/trainer_explained.py @@ -0,0 +1,272 @@ +""" +Chapter 10: minGPT Trainer — Annotated + +This builds a simplified version of minGPT's Trainer and trains +a tiny GPT on a sorting task (from demo.ipynb). 
torch.manual_seed(3407)

# ============================================================
# MODEL: Simplified GPT (from Chapter 09)
# ============================================================

class NewGELU(nn.Module):
    """Tanh-approximated GELU — the activation GPT-2 uses instead of ReLU."""
    def forward(self, x):
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask."""
    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        # One fused projection produces Q, K and V in a single matmul.
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        self.c_proj = nn.Linear(n_embd, n_embd)
        # Lower-triangular mask, shaped (1, 1, T, T) to broadcast over batch/heads.
        mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("bias", mask.view(1, 1, block_size, block_size))
        self.n_head = n_head
        self.n_embd = n_embd

    def forward(self, x):
        bsz, seqlen, dim = x.size()
        head_dim = dim // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(bsz, seqlen, self.n_head, head_dim).transpose(1, 2)
        k = k.view(bsz, seqlen, self.n_head, head_dim).transpose(1, 2)
        v = v.view(bsz, seqlen, self.n_head, head_dim).transpose(1, 2)
        # Scaled dot-product scores; future positions are masked to -inf
        # so the softmax assigns them zero weight.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        scores = scores.masked_fill(self.bias[:, :, :seqlen, :seqlen] == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        out = weights @ v
        # Re-merge the heads: (B, nh, T, hd) -> (B, T, C).
        out = out.transpose(1, 2).contiguous().view(bsz, seqlen, dim)
        return self.c_proj(out)

class Block(nn.Module):
    """One transformer block: pre-norm attention + residual, pre-norm MLP + residual."""
    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        self.ln_1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size)
        self.ln_2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            NewGELU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))

class GPT(nn.Module):
    """Minimal GPT: embeddings -> n_layer Blocks -> final LayerNorm -> linear head."""
    def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd):
        super().__init__()
        self.block_size = block_size
        self.wte = nn.Embedding(vocab_size, n_embd)
        self.wpe = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head, block_size) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # GPT-2-style init: N(0, 0.02) weights, zero biases.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx: (B, T) integer token ids, T <= block_size.
        _, seqlen = idx.size()
        positions = torch.arange(seqlen, dtype=torch.long, device=idx.device)
        x = self.wte(idx) + self.wpe(positions)
        for block in self.blocks:
            x = block(x)
        logits = self.head(self.ln_f(x))
        if targets is None:
            return logits, None
        # ignore_index=-1 lets the dataset mask out positions it does not score.
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Greedy decoding: repeatedly append the argmax next token,
        cropping the context to the last block_size tokens if needed."""
        for _ in range(max_new_tokens):
            context = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            logits, _ = self(context)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            _, nxt = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, nxt), dim=1)
        return idx
+ """ + def __init__(self, split, length=6, num_digits=3): + self.split = split + self.length = length + self.num_digits = num_digits + + def __len__(self): + return 10000 + + def get_vocab_size(self): + return self.num_digits + + def get_block_size(self): + return self.length * 2 - 1 + + def __getitem__(self, idx): + import pickle + while True: + inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long) + h = hash(pickle.dumps(inp.tolist())) + if (h % 4 == 0) == (self.split == 'test'): + break + sol = torch.sort(inp)[0] + cat = torch.cat((inp, sol), dim=0) + x = cat[:-1].clone() + y = cat[1:].clone() + y[:self.length - 1] = -1 # mask: don't predict at input positions + return x, y + +# ============================================================ +# TRAINING LOOP (annotated version of mingpt/trainer.py) +# ============================================================ +print("=" * 60) +print("Training GPT to Sort Numbers") +print("=" * 60) + +# Create dataset and model +train_dataset = SortDataset('train') +test_dataset = SortDataset('test') + +model = GPT( + vocab_size=train_dataset.get_vocab_size(), # 3 digits: 0, 1, 2 + block_size=train_dataset.get_block_size(), # 11 positions + n_layer=3, n_head=3, n_embd=48 # gpt-nano +) + +n_params = sum(p.numel() for p in model.parameters()) +print(f"\nModel: gpt-nano, {n_params:,} parameters") +print(f"Task: sort lists of {train_dataset.length} digits ({train_dataset.num_digits} possible values)") +print(f"Example: [2, 0, 1] → [0, 1, 2]\n") + +# Setup (mirrors mingpt/trainer.py) +device = 'cuda' if torch.cuda.is_available() else 'cpu' +model = model.to(device) +optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1) + +# Training loop +train_loader = DataLoader(train_dataset, batch_size=64, num_workers=0, + sampler=torch.utils.data.RandomSampler(train_dataset, replacement=True, num_samples=int(1e10))) +data_iter = iter(train_loader) + +max_iters = 1000 +print(f"Training for {max_iters} 
model.train()
t0 = time.time()

for iter_num in range(max_iters):
    # ---- Step 1: Get batch ----
    batch = next(data_iter)
    x, y = [t.to(device) for t in batch]

    # ---- Step 2: Forward pass ----
    logits, loss = model(x, y)

    # ---- Step 3-4: Backward pass ----
    model.zero_grad(set_to_none=True)
    loss.backward()

    # ---- Step 5: Gradient clipping (prevent explosions) ----
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # ---- Step 6: Optimizer step ----
    optimizer.step()

    if iter_num % 100 == 0:
        elapsed = time.time() - t0
        print(f" iter {iter_num:4d} | loss {loss.item():.5f} | time {elapsed:.1f}s")

elapsed = time.time() - t0
print(f"\nTraining complete in {elapsed:.1f}s")

# ============================================================
# EVALUATION: Test if it actually learned to sort!
# ============================================================
print("\n" + "=" * 60)
print("Evaluation: Can it sort?")
print("=" * 60)

model.eval()
n = train_dataset.length

def eval_split(split, max_batches=50):
    """Measure exact-match sorting accuracy on one split.

    For each batch: feed the unsorted prefix into model.generate(),
    take the generated suffix as the predicted sorted list, and count
    sequences where every position matches torch.sort of the input.

    Args:
        split: 'train' or 'test' — selects which dataset to evaluate.
        max_batches: cap on batches (100 examples each) to keep it fast.

    Returns:
        (correct, total) counts over the evaluated examples.
    """
    dataset = train_dataset if split == 'train' else test_dataset
    loader = DataLoader(dataset, batch_size=100, num_workers=0)
    correct = 0
    total = 0
    for b, (x, _y) in enumerate(loader):
        x = x.to(device)
        inp = x[:, :n]
        with torch.no_grad():
            cat = model.generate(inp, n)
        sol_pred = cat[:, n:]
        sol_true = torch.sort(inp)[0]
        # BUG FIX: sol_true lives on `device`; comparing a CPU tensor with a
        # CUDA tensor raises a RuntimeError, so move BOTH sides to CPU.
        correct += (sol_pred.cpu() == sol_true.cpu()).all(1).sum().item()
        total += x.size(0)
        if b + 1 >= max_batches:
            break
    return correct, total

train_correct, train_total = eval_split('train')
test_correct, test_total = eval_split('test')

print(f"\n Train: {train_correct}/{train_total} = {100*train_correct/train_total:.1f}% correct")
print(f" Test: {test_correct}/{test_total} = {100*test_correct/test_total:.1f}% correct")

# Show some examples
print("\n Sample predictions:")
loader = DataLoader(test_dataset, batch_size=5, num_workers=0)
next(iter(loader)) +inp = x[:, :n].to(device) +with torch.no_grad(): + cat = model.generate(inp, n) +for i in range(5): + input_list = inp[i].tolist() + pred_list = cat[i, n:].tolist() + true_list = sorted(input_list) + status = "✓" if pred_list == true_list else "✗" + print(f" {input_list} → {pred_list} (expected {true_list}) {status}") + +print(""" +=== What This Demonstrates === + +GPT learned to SORT numbers — a task it was never explicitly programmed for! +It learned the algorithm purely from examples: + - See thousands of (unsorted → sorted) pairs + - Learn the pattern through gradient descent + - Generalize to new, unseen inputs + +This is the power of the transformer architecture: given enough data +and parameters, it can learn surprisingly complex functions. + +The same architecture, scaled up with more data and parameters, +learns to write code, answer questions, translate languages... +""") diff --git a/gpt/local/course/ch11_Side_by_Side_Comparison/README.md b/gpt/local/course/ch11_Side_by_Side_Comparison/README.md new file mode 100644 index 0000000..b801b2a --- /dev/null +++ b/gpt/local/course/ch11_Side_by_Side_Comparison/README.md @@ -0,0 +1,46 @@ +# Chapter 11: Side-by-Side — microGPT vs minGPT + +## The Same Algorithm, Two Implementations + +Both implementations by Andrej Karpathy implement the **exact same GPT architecture**. The difference is in tooling and scale, not in ideas. + +--- + +## Philosophy Comparison + +| Aspect | microGPT | minGPT | +|---|---|---| +| **Goal** | "This is the complete algorithm. Everything else is just efficiency." 
| "Small, clean, interpretable and educational" | +| **Dependencies** | Zero (pure Python) | PyTorch | +| **Lines of code** | ~200 (single file) | ~850 (across 4 files) | +| **Speed** | Minutes per 200 steps | Seconds per 2000 steps | +| **GPU support** | No | Yes | +| **Batching** | 1 document at a time | Configurable batch size | +| **Best for** | Understanding the math | Building real projects | + +--- + +## Architecture Mapping + +``` +microGPT minGPT +──────── ────── +Value class → torch.Tensor + autograd +state_dict['wte'][id] → nn.Embedding(vocab, n_embd) +state_dict['wpe'][id] → nn.Embedding(block_size, n_embd) +rmsnorm(x) → nn.LayerNorm(n_embd) +linear(x, w) [manual loop] → nn.Linear(in, out) [GPU matmul] +xi.relu() → NewGELU() +softmax(logits) → F.softmax(logits, dim=-1) +-probs[target].log() → F.cross_entropy(logits, targets) +Manual Adam loop → torch.optim.AdamW(...) +KV-cache (sequential) → Triangular mask (parallel) +``` + +--- + +## Run the Comparison + +```bash +python comparison.py # Detailed side-by-side code comparison +``` diff --git a/gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py b/gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py new file mode 100644 index 0000000..eb93d8e --- /dev/null +++ b/gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py @@ -0,0 +1,290 @@ +""" +Chapter 11: microGPT vs minGPT — Detailed Comparison + +This script walks through each section of both implementations, +showing the equivalent code side-by-side with explanations. + +No dependencies required — this is a reading/reference script. 
+""" + +# ============================================================ +# Helper to format code snippets +# ============================================================ +def compare(title, micro_code, min_code, explanation): + print("=" * 70) + print(f" {title}") + print("=" * 70) + print(f"\n microGPT (pure Python):") + for line in micro_code.strip().split('\n'): + print(f" {line}") + print(f"\n minGPT (PyTorch):") + for line in min_code.strip().split('\n'): + print(f" {line}") + print(f"\n → {explanation}\n") + + +# ============================================================ +# COMPARISON 1: Tokenization +# ============================================================ +compare( + "1. TOKENIZATION", + """ +uchars = sorted(set(''.join(docs))) +BOS = len(uchars) +vocab_size = len(uchars) + 1 +tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] +""", + """ +# CharDataset (projects/chargpt/chargpt.py) +chars = sorted(list(set(data))) +self.stoi = {ch:i for i,ch in enumerate(chars)} +self.itos = {i:ch for i,ch in enumerate(chars)} +dix = [self.stoi[s] for s in chunk] +x = torch.tensor(dix[:-1], dtype=torch.long) +""", + "Same idea: map characters to integers. minGPT wraps it in a Dataset class\n" + " and returns PyTorch tensors instead of plain lists." +) + +# ============================================================ +# COMPARISON 2: Embeddings +# ============================================================ +compare( + "2. EMBEDDINGS", + """ +# Token + Position embedding (inline in gpt function) +tok_emb = state_dict['wte'][token_id] # list lookup +pos_emb = state_dict['wpe'][pos_id] # list lookup +x = [t + p for t, p in zip(tok_emb, pos_emb)] +""", + """ +# GPT.__init__: +self.transformer.wte = nn.Embedding(vocab_size, n_embd) +self.transformer.wpe = nn.Embedding(block_size, n_embd) + +# GPT.forward: +tok_emb = self.transformer.wte(idx) # (B, T, n_embd) +pos_emb = self.transformer.wpe(pos) # (1, T, n_embd) +x = tok_emb + pos_emb # broadcasts! 
+""", + "microGPT looks up one embedding at a time (one token, one position).\n" + " minGPT looks up ALL embeddings for ALL tokens in the batch at once." +) + +# ============================================================ +# COMPARISON 3: Attention +# ============================================================ +compare( + "3. MULTI-HEAD ATTENTION", + """ +# Process one token at a time, accumulate KV cache +q = linear(x, state_dict[f'layer{li}.attn_wq']) +k = linear(x, state_dict[f'layer{li}.attn_wk']) +v = linear(x, state_dict[f'layer{li}.attn_wv']) +keys[li].append(k) +values[li].append(v) + +for h in range(n_head): + hs = h * head_dim + q_h = q[hs:hs+head_dim] + k_h = [ki[hs:hs+head_dim] for ki in keys[li]] + v_h = [vi[hs:hs+head_dim] for vi in values[li]] + attn_logits = [sum(...) / head_dim**0.5 for t in ...] + attn_weights = softmax(attn_logits) + head_out = [sum(...) for j in range(head_dim)] + x_attn.extend(head_out) +""", + """ +# Process ALL tokens at once with masked attention +q, k, v = self.c_attn(x).split(self.n_embd, dim=2) +k = k.view(B, T, nh, hs).transpose(1, 2) # (B, nh, T, hs) +q = q.view(B, T, nh, hs).transpose(1, 2) +v = v.view(B, T, nh, hs).transpose(1, 2) + +att = (q @ k.transpose(-2, -1)) * (1.0 / sqrt(hs)) +att = att.masked_fill(self.bias[:,:,:T,:T] == 0, -inf) +att = F.softmax(att, dim=-1) +y = att @ v +y = y.transpose(1, 2).contiguous().view(B, T, C) +""", + "This is the BIGGEST difference between the two implementations:\n" + " - microGPT: sequential (one token at a time, Python loops over heads)\n" + " - minGPT: parallel (all tokens, all heads, all batches in one matmul)\n" + " Same math, but minGPT is orders of magnitude faster." +) + +# ============================================================ +# COMPARISON 4: MLP Block +# ============================================================ +compare( + "4. 
MLP (FEEDFORWARD) BLOCK", + """ +x = linear(x, state_dict[f'layer{li}.mlp_fc1']) # expand +x = [xi.relu() for xi in x] # activate +x = linear(x, state_dict[f'layer{li}.mlp_fc2']) # compress +""", + """ +# Block.__init__: +self.mlp = nn.ModuleDict(dict( + c_fc = nn.Linear(n_embd, 4 * n_embd), + c_proj = nn.Linear(4 * n_embd, n_embd), + act = NewGELU(), +)) + +# Block.forward: +self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x)))) +""", + "Identical structure: expand → activate → compress.\n" + " microGPT uses ReLU, minGPT uses GELU (smoother).\n" + " minGPT adds dropout for regularization." +) + +# ============================================================ +# COMPARISON 5: Normalization +# ============================================================ +compare( + "5. NORMALIZATION", + """ +def rmsnorm(x): + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] +""", + """ +# Uses PyTorch's built-in LayerNorm +self.ln_1 = nn.LayerNorm(n_embd) +self.ln_2 = nn.LayerNorm(n_embd) + +# In forward: +x = x + self.attn(self.ln_1(x)) +""", + "RMSNorm (microGPT) only scales by magnitude.\n" + " LayerNorm (minGPT) also shifts by mean and has learnable parameters.\n" + " Both serve the same purpose: keep values in a stable range." +) + +# ============================================================ +# COMPARISON 6: Residual Connections +# ============================================================ +compare( + "6. RESIDUAL CONNECTIONS", + """ +x_residual = x +x = rmsnorm(x) +# ... attention ... +x = [a + b for a, b in zip(x, x_residual)] # ADD residual + +x_residual = x +x = rmsnorm(x) +# ... MLP ... 
+x = [a + b for a, b in zip(x, x_residual)] # ADD residual +""", + """ +def forward(self, x): + x = x + self.attn(self.ln_1(x)) # residual around attention + x = x + self.mlpf(self.ln_2(x)) # residual around MLP + return x +""", + "Identical concept: output = input + transformation(input).\n" + " minGPT is more concise because PyTorch handles the element-wise add." +) + +# ============================================================ +# COMPARISON 7: Training Loop +# ============================================================ +compare( + "7. TRAINING LOOP", + """ +for step in range(num_steps): + doc = docs[step % len(docs)] + tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] + # ... forward each token through gpt() ... + loss = (1/n) * sum(losses) + loss.backward() + # Manual Adam: + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1-beta1) * p.grad + v[i] = beta2 * v[i] + (1-beta2) * p.grad**2 + # ... bias correction, update ... + p.data -= lr_t * m_hat / (v_hat**0.5 + eps) + p.grad = 0 +""", + """ +while True: + batch = next(data_iter) + x, y = [t.to(self.device) for t in batch] + logits, self.loss = model(x, y) + model.zero_grad(set_to_none=True) + self.loss.backward() + clip_grad_norm_(model.parameters(), config.grad_norm_clip) + self.optimizer.step() +""", + "Same loop: forward → loss → backward → update.\n" + " microGPT: 1 document at a time, manual Adam, ~15 lines.\n" + " minGPT: batched, built-in optimizer, gradient clipping, ~10 lines." +) + +# ============================================================ +# COMPARISON 8: Inference +# ============================================================ +compare( + "8. 
TEXT GENERATION (INFERENCE)", + """ +token_id = BOS +for pos_id in range(block_size): + logits = gpt(token_id, pos_id, keys, values) + probs = softmax([l / temperature for l in logits]) + token_id = random.choices(range(vocab_size), + weights=[p.data for p in probs])[0] + if token_id == BOS: break + sample.append(uchars[token_id]) +""", + """ +for _ in range(max_new_tokens): + idx_cond = idx if idx.size(1) <= self.block_size \\ + else idx[:, -self.block_size:] + logits, _ = self(idx_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + v, _ = torch.topk(logits, top_k) + logits[logits < v[:, [-1]]] = -float('Inf') + probs = F.softmax(logits, dim=-1) + if do_sample: + idx_next = torch.multinomial(probs, num_samples=1) + else: + _, idx_next = torch.topk(probs, k=1, dim=-1) + idx = torch.cat((idx, idx_next), dim=1) +""", + "microGPT: sequential, one token at a time, KV-cache.\n" + " minGPT: can process batches, supports top-k filtering, greedy mode.\n" + " Both: predict one token, append, repeat." +) + +# ============================================================ +# SUMMARY TABLE +# ============================================================ +print("=" * 70) +print(" SUMMARY: When to Use Which") +print("=" * 70) +print(""" + ┌──────────────────────────────────────────────────────────────────┐ + │ USE microGPT WHEN YOU WANT TO: │ + │ • Understand exactly how every piece of GPT works │ + │ • See autograd, attention, training loop without any magic │ + │ • Learn the algorithm, not the framework │ + │ • Have a single-file reference with zero dependencies │ + │ │ + │ USE minGPT WHEN YOU WANT TO: │ + │ • Actually train on real data (Shakespeare, code, etc.) │ + │ • Experiment with different model sizes │ + │ • Use GPU acceleration │ + │ • Load pretrained GPT-2 weights │ + │ • Build projects on top of a clean GPT implementation │ + │ │ + │ THE KEY INSIGHT: │ + │ microGPT IS minGPT, just without the efficiency tricks. 
│ + │ If you understand microGPT, you understand minGPT. │ + │ If you understand minGPT, you understand GPT-2. │ + │ The architecture is the same at every scale. │ + └──────────────────────────────────────────────────────────────────┘ +""") diff --git a/gpt/local/course/ch12_Exercises_and_Next_Steps/README.md b/gpt/local/course/ch12_Exercises_and_Next_Steps/README.md new file mode 100644 index 0000000..9087d05 --- /dev/null +++ b/gpt/local/course/ch12_Exercises_and_Next_Steps/README.md @@ -0,0 +1,102 @@ +# Chapter 12: Exercises & Next Steps + +## Congratulations! + +You now understand the complete GPT architecture — from raw text to generated output. Here are exercises to solidify your understanding, organized by difficulty. + +--- + +## Beginner Exercises (Chapters 01-04) + +### Exercise 1: Bigram Language Model +Build a character-level bigram model (like Ch01) but train it using gradient descent and the Value class (Ch03) instead of counting. + +### Exercise 2: Custom Tokenizer +Modify the character tokenizer from Ch02 to handle uppercase letters, spaces, and punctuation. Test it on a paragraph of text. + +### Exercise 3: Autograd Extensions +Add `tanh()` and `sigmoid()` operations to the Value class. Verify their gradients by comparing with numerical differentiation. + +### Exercise 4: Softmax Temperature Explorer +Write a script that visualizes how different temperatures (0.1 to 5.0) affect the output distribution for a fixed set of logits. + +--- + +## Intermediate Exercises (Chapters 05-07) + +### Exercise 5: Attention Visualization +Modify the attention code from Ch05 to print attention weights for each head. Feed in a known pattern (e.g., "abab") and see which positions attend to which. 
+ +### Exercise 6: microGPT Hyperparameter Tuning +Modify microGPT's hyperparameters and measure the effect on loss: +- Try `n_head = 1, 2, 4, 8` +- Try `n_embd = 8, 16, 32` +- Try `n_layer = 1, 2, 3` + +### Exercise 7: Different Dataset +Modify microGPT to train on a different character-level dataset (e.g., country names, short words, or DNA sequences like "ATCGATCG"). + +--- + +## Advanced Exercises (Chapters 08-11) + +### Exercise 8: minGPT Character Generator +Use minGPT's `chargpt` project to train on a text file of your choice. Try song lyrics, code, or recipes. + +### Exercise 9: Add Dropout to microGPT +Implement dropout in pure Python and add it to microGPT's attention and MLP blocks. Does it help with overfitting? + +### Exercise 10: Implement Beam Search +Instead of sampling one token at a time, implement beam search for text generation: maintain the top-k most likely sequences at each step. + +### Exercise 11: Weight Sharing +In real GPT-2, the token embedding matrix (`wte`) and the output head (`lm_head`) share weights. Implement this in either microGPT or the simplified minGPT from Ch09. + +--- + +## Challenge Exercises + +### Exercise 12: Build nanoGPT +Combine the best of both implementations: write a GPT that uses PyTorch but is contained in a single file under 300 lines. (This is essentially what Karpathy did with nanoGPT!) + +### Exercise 13: Positional Encoding Variants +Replace learned positional embeddings with sinusoidal positional encodings (from the original "Attention is All You Need" paper). Compare results. 
+ +--- + +## Next Steps & Further Reading + +### Karpathy's Other Projects +- **nanoGPT**: The successor to minGPT — more practical, reproduces benchmarks + - https://github.com/karpathy/nanoGPT +- **makemore**: Character-level name generation (the dataset microGPT uses) + - https://github.com/karpathy/makemore +- **Karpathy's YouTube**: Full video lectures building these from scratch + - https://www.youtube.com/c/AndrejKarpathy + +### Papers to Read +1. **"Attention Is All You Need"** (2017) — The original transformer paper +2. **"Improving Language Understanding by Generative Pre-Training"** (GPT-1, 2018) +3. **"Language Models are Few-Shot Learners"** (GPT-3, 2020) + +### Topics to Explore Next +- **Tokenization deep-dive**: SentencePiece, tiktoken +- **Training at scale**: Distributed training, mixed precision +- **Fine-tuning**: LoRA, RLHF, instruction tuning +- **Inference optimization**: KV-cache, quantization, speculative decoding +- **Other architectures**: BERT (encoder), T5 (encoder-decoder) + +--- + +## Quick Reference Card + +``` +GPT in one paragraph: + Tokenize text into integers. Embed each token into a vector. + Add position embeddings. Pass through N transformer blocks, + each doing: normalize → multi-head attention → residual → + normalize → MLP → residual. Final linear layer maps to + vocab-sized logits. Softmax gives next-token probabilities. + Train with cross-entropy loss and Adam optimizer. + Generate by sampling one token at a time. +``` diff --git a/gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py b/gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py new file mode 100644 index 0000000..234c6f7 --- /dev/null +++ b/gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py @@ -0,0 +1,250 @@ +""" +Chapter 12: Exercise Solution Starters + +This file contains STARTER CODE for some of the exercises. +Complete the TODO sections to solve each exercise. 
+""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Value class (from Chapter 03) — needed for exercises +# ============================================================ +class Value: + __slots__ = ('data', 'grad', '_children', '_local_grads') + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): return Value(self.data**n, (self,), (n * self.data**(n-1),)) + def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),)) + def log(self): return Value(math.log(self.data), (self,), (1/self.data,)) + def relu(self): return Value(max(0, self.data), (self,), (float(self.data > 0),)) + def __neg__(self): return self * -1 + def __radd__(self, other): return self + other + def __sub__(self, other): return self + (-other) + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other**-1 + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + + +# ============================================================ +# EXERCISE 3: Add tanh() and sigmoid() to Value class +# ============================================================ +print("=" * 50) +print("Exercise 3: Extending the Value Class") +print("=" * 50) + +def tanh(self): + """ + TODO: Implement tanh for the Value class. 
+ + tanh(x) = (exp(2x) - 1) / (exp(2x) + 1) + d(tanh(x))/dx = 1 - tanh(x)^2 + + Hint: You can compute the forward value using math.tanh() + and the local gradient using the derivative formula above. + """ + t = math.tanh(self.data) + return Value(t, (self,), (1 - t**2,)) + +# Attach to Value class +Value.tanh = tanh + +# Test it +x = Value(0.5) +y = x.tanh() +y.backward() +print(f" tanh({x.data}) = {y.data:.6f}") +print(f" d(tanh)/dx = {x.grad:.6f}") +print(f" Expected: tanh(0.5) = {math.tanh(0.5):.6f}") +print(f" Expected grad: 1 - tanh(0.5)^2 = {1 - math.tanh(0.5)**2:.6f}") + + +def sigmoid(self): + """ + TODO: Implement sigmoid for the Value class. + + sigmoid(x) = 1 / (1 + exp(-x)) + d(sigmoid(x))/dx = sigmoid(x) * (1 - sigmoid(x)) + """ + s = 1.0 / (1.0 + math.exp(-self.data)) + return Value(s, (self,), (s * (1 - s),)) + +Value.sigmoid = sigmoid + +x = Value(1.0) +y = x.sigmoid() +y.backward() +s = 1.0 / (1.0 + math.exp(-1.0)) +print(f"\n sigmoid({x.data}) = {y.data:.6f}") +print(f" d(sigmoid)/dx = {x.grad:.6f}") +print(f" Expected: sigmoid(1.0) = {s:.6f}") +print(f" Expected grad: {s * (1 - s):.6f}") + + +# ============================================================ +# EXERCISE 4: Temperature Explorer +# ============================================================ +print("\n" + "=" * 50) +print("Exercise 4: Temperature Explorer") +print("=" * 50) + +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +logits = [3.0, 1.5, 0.5, 0.1, -0.5] +tokens = ['A', 'B', 'C', 'D', 'E'] + +print(f"\nLogits: {logits}") +print(f"Tokens: {tokens}\n") + +temperatures = [0.1, 0.25, 0.5, 1.0, 2.0, 5.0] + +print(f"{'Temp':>6} | ", end="") +for tok in tokens: + print(f" {tok:>5}", end="") +print(f" | {'Entropy':>8} | Description") +print("-" * 70) + +for temp in temperatures: + scaled = [l / temp for l in logits] + probs = softmax(scaled) + + # Compute entropy: -sum(p * log(p)) + 
entropy = -sum(p * math.log(p + 1e-10) for p in probs) + + desc = "" + if temp < 0.3: + desc = "Nearly deterministic" + elif temp < 0.8: + desc = "Confident" + elif temp <= 1.2: + desc = "Balanced" + elif temp < 3.0: + desc = "Creative / random" + else: + desc = "Nearly uniform" + + print(f"{temp:>6.2f} | ", end="") + for p in probs: + print(f" {p:>5.3f}", end="") + print(f" | {entropy:>8.4f} | {desc}") + +print(""" + Low temperature → low entropy → model picks the "best" token + High temperature → high entropy → model picks more randomly + + Temperature = 1.0 is the "natural" distribution + Temperature → 0 approaches argmax (greedy decoding) + Temperature → ∞ approaches uniform random +""") + + +# ============================================================ +# EXERCISE 1 STARTER: Bigram Model with Gradient Descent +# ============================================================ +print("=" * 50) +print("Exercise 1 Starter: Bigram Model with Gradient Descent") +print("=" * 50) + +# Dataset +names = ["emma", "olivia", "ava", "sophia", "mia", "luna", "ella", "aria"] +chars = sorted(set(''.join(names))) +BOS = len(chars) +vocab_size = len(chars) + 1 + +print(f"\nVocab: {chars + ['']}") +print(f"Vocab size: {vocab_size}") + +# The model: a weight matrix W where W[i][j] = score for token j following token i +W = [[Value(random.gauss(0, 0.5)) for _ in range(vocab_size)] + for _ in range(vocab_size)] +params = [p for row in W for p in row] + +def softmax_value(logits): + max_val = max(v.data for v in logits) + exps = [(v - max_val).exp() for v in logits] + total = sum(exps) + return [e / total for e in exps] + +# TODO: Complete the training loop +# Hint: For each name, create pairs (current_char, next_char) +# and minimize cross-entropy loss + +lr = 0.1 +for step in range(200): + total_loss = Value(0) + count = 0 + + for name in names: + tokens = [BOS] + [chars.index(ch) for ch in name] + [BOS] + for i in range(len(tokens) - 1): + current = tokens[i] + target = tokens[i + 
1] + logits = W[current] + probs = softmax_value(logits) + total_loss = total_loss + (-probs[target].log()) + count += 1 + + avg_loss = total_loss * (1.0 / count) + avg_loss.backward() + + for p in params: + p.data -= lr * p.grad + p.grad = 0 + + if step % 20 == 0: + print(f" Step {step:3d} | Loss: {avg_loss.data:.4f}") + +# Generate! +print("\n Generated names:") +for i in range(10): + name = [] + current = BOS + for _ in range(20): + logits = W[current] + probs = softmax_value(logits) + weights = [p.data for p in probs] + current = random.choices(range(vocab_size), weights=weights, k=1)[0] + if current == BOS: + break + name.append(chars[current]) + print(f" {i+1:2d}. {''.join(name)}") + +print(""" +=== Next Steps === + +1. Try all the exercises above +2. Read the original source code with your new understanding +3. Watch Karpathy's YouTube videos for visual walkthroughs +4. Build something with minGPT — train on YOUR data! +5. Graduate to nanoGPT for real-world experiments + +You now understand how GPT works, from the ground up. +The rest is just scale. Good luck! +""")