From 4f3b96622f0cb86470f1e90ede21a0ad64bb2252 Mon Sep 17 00:00:00 2001 From: Surendra Raika Date: Mon, 23 Feb 2026 18:00:18 +0530 Subject: [PATCH] Add GPT course: 12-chapter guide to understanding microGPT and minGPT A comprehensive course for early CS students covering: - Ch01-06: Fundamentals (language models, tokenization, autograd, NN blocks, attention, training) - Ch07: microGPT full annotated walkthrough (pure Python, zero dependencies) - Ch08-10: minGPT with PyTorch (model architecture, trainer, inference) - Ch11: Side-by-side comparison of both implementations - Ch12: Exercises and next steps All chapters include runnable Python examples. Ch01-07 require no dependencies; Ch08-10 require PyTorch. --- gpt/local/course/README.md | 64 +++ .../language_model_idea-checkpoint.py | 104 +++++ .../ch01_What_is_a_Language_Model/README.md | 102 +++++ .../language_model_idea.py | 104 +++++ gpt/local/course/ch02_Tokenization/README.md | 83 ++++ .../course/ch02_Tokenization/bpe_intuition.py | 112 +++++ .../ch02_Tokenization/char_tokenizer.py | 92 +++++ gpt/local/course/ch03_Autograd/README.md | 59 +++ .../course/ch03_Autograd/computation_graph.py | 138 +++++++ .../course/ch03_Autograd/gradient_descent.py | 122 ++++++ .../course/ch03_Autograd/value_basics.py | 164 ++++++++ .../README.md | 69 ++++ .../building_blocks.py | 174 ++++++++ .../mlp.py | 167 ++++++++ .../ch05_Attention_and_Transformers/README.md | 95 +++++ .../attention_basics.py | 184 +++++++++ .../multi_head_attention.py | 151 +++++++ .../transformer_block.py | 207 ++++++++++ .../README.md | 67 +++ .../adam_optimizer.py | 124 ++++++ .../cross_entropy.py | 121 ++++++ .../training_loop.py | 171 ++++++++ .../ch07_microGPT_Full_Walkthrough/README.md | 70 ++++ .../microgpt_annotated.py | 382 ++++++++++++++++++ .../ch08_Scaling_Up_with_PyTorch/README.md | 85 ++++ .../pytorch_basics.py | 200 +++++++++ .../pytorch_vs_manual.py | 202 +++++++++ .../ch09_minGPT_Model_Deep_Dive/README.md | 79 ++++ .../model_sizes.py | 146 
+++++++ .../model_walkthrough.py | 263 ++++++++++++ .../ch10_Training_and_Inference/README.md | 80 ++++ .../generate_text.py | 206 ++++++++++ .../trainer_explained.py | 272 +++++++++++++ .../ch11_Side_by_Side_Comparison/README.md | 46 +++ .../comparison.py | 290 +++++++++++++ .../ch12_Exercises_and_Next_Steps/README.md | 102 +++++ .../exercise_solutions.py | 250 ++++++++++++ 37 files changed, 5347 insertions(+) create mode 100644 gpt/local/course/README.md create mode 100644 gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py create mode 100644 gpt/local/course/ch01_What_is_a_Language_Model/README.md create mode 100644 gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py create mode 100644 gpt/local/course/ch02_Tokenization/README.md create mode 100644 gpt/local/course/ch02_Tokenization/bpe_intuition.py create mode 100644 gpt/local/course/ch02_Tokenization/char_tokenizer.py create mode 100644 gpt/local/course/ch03_Autograd/README.md create mode 100644 gpt/local/course/ch03_Autograd/computation_graph.py create mode 100644 gpt/local/course/ch03_Autograd/gradient_descent.py create mode 100644 gpt/local/course/ch03_Autograd/value_basics.py create mode 100644 gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md create mode 100644 gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py create mode 100644 gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/README.md create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py create mode 100644 gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py create mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/README.md create mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py create 
mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py create mode 100644 gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py create mode 100644 gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md create mode 100644 gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py create mode 100644 gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md create mode 100644 gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py create mode 100644 gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py create mode 100644 gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md create mode 100644 gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py create mode 100644 gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py create mode 100644 gpt/local/course/ch10_Training_and_Inference/README.md create mode 100644 gpt/local/course/ch10_Training_and_Inference/generate_text.py create mode 100644 gpt/local/course/ch10_Training_and_Inference/trainer_explained.py create mode 100644 gpt/local/course/ch11_Side_by_Side_Comparison/README.md create mode 100644 gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py create mode 100644 gpt/local/course/ch12_Exercises_and_Next_Steps/README.md create mode 100644 gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py diff --git a/gpt/local/course/README.md b/gpt/local/course/README.md new file mode 100644 index 0000000..7415525 --- /dev/null +++ b/gpt/local/course/README.md @@ -0,0 +1,64 @@ +# Understanding GPT From Scratch + +## A Course Based on Andrej Karpathy's Implementations + +**Target Audience:** Early CS students who understand basic programming (Python, loops, functions, classes) but have no background in AI/ML or GPT. 
+ +--- + +## What This Course Covers + +This course walks you through **two real GPT implementations** by Andrej Karpathy, starting from zero AI knowledge: + +| Implementation | Location | What It Is | +|---|---|---| +| **microGPT** | `../../8627fe009c40f57531cb18360106ce95/microgpt.py` | ~200 lines of pure Python. No libraries. Builds everything from scratch — autograd, neural networks, the full transformer. | +| **minGPT** | `../../minGPT/` | A clean PyTorch implementation. Production-style code with proper modules, training infrastructure, and real GPT-2 compatibility. | + +--- + +## Course Structure + +| Folder | Title | Key Idea | +|---|---|---| +| `ch01_What_is_a_Language_Model/` | What is a Language Model? | The big picture — predicting the next word | +| `ch02_Tokenization/` | Tokenization | Turning text into numbers a computer can process | +| `ch03_Autograd/` | Autograd | Teaching computers to do calculus automatically | +| `ch04_Neural_Network_Building_Blocks/` | Neural Network Building Blocks | Linear layers, activation functions, softmax | +| `ch05_Attention_and_Transformers/` | Attention & Transformers | The core innovation behind GPT | +| `ch06_Training_Loop_and_Optimization/` | Training: How Models Learn | Loss functions, backprop, optimizers | +| `ch07_microGPT_Full_Walkthrough/` | microGPT: Full Walkthrough | Line-by-line through the 200-line pure-Python GPT | +| `ch08_Scaling_Up_with_PyTorch/` | Scaling Up with PyTorch | Why we need frameworks, intro to PyTorch | +| `ch09_minGPT_Model_Deep_Dive/` | minGPT: Model Deep Dive | The production-quality GPT architecture | +| `ch10_Training_and_Inference/` | minGPT: Training & Inference | Training loops, text generation, real demos | +| `ch11_Side_by_Side_Comparison/` | Side-by-Side Comparison | microGPT vs minGPT — same ideas, different scales | +| `ch12_Exercises_and_Next_Steps/` | Exercises & Next Steps | Hands-on challenges and further reading | + +--- + +## How to Use This Course + +1. 
**Read chapters in order** — each builds on the previous one +2. **Run the code examples** — every chapter has runnable `.py` files in its folder +3. **Chapters 01-06** teach the concepts with small, isolated examples +4. **Chapters 07-10** apply those concepts to the real Karpathy code +5. **Chapter 11** ties everything together +6. **Chapter 12** gives you challenges to test your understanding + +## Prerequisites + +- Python basics: variables, loops, functions, classes, lists, dictionaries +- Basic math: addition, multiplication, exponents (no calculus needed — we teach it!) +- A terminal / command line +- Python 3.8+ installed + +## Running Examples + +```bash +# For chapters 01-07 (pure Python, no dependencies): +python ch01_What_is_a_Language_Model/language_model_idea.py + +# For chapters 08-10 (needs PyTorch): +pip install torch +python ch08_Scaling_Up_with_PyTorch/pytorch_basics.py +``` diff --git a/gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py b/gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py new file mode 100644 index 0000000..cabf6f1 --- /dev/null +++ b/gpt/local/course/ch01_What_is_a_Language_Model/.ipynb_checkpoints/language_model_idea-checkpoint.py @@ -0,0 +1,104 @@ +""" +Chapter 01: The Simplest Possible "Language Model" + +This is NOT a real language model — it's a toy to show the core idea: + 1. Learn patterns from data (training) + 2. Generate new text using those patterns (inference) + +We simply count how often each character follows another character. 
+""" + +import random + +# ============================================================ +# STEP 1: Our "training data" — a few names +# ============================================================ +training_data = [ + "emma", "olivia", "ava", "sophia", "isabella", + "mia", "charlotte", "amelia", "harper", "evelyn", +] + +print("=== Training Data ===") +for name in training_data: + print(f" {name}") + +# ============================================================ +# STEP 2: TRAINING — Count character transitions +# ============================================================ +# We'll count: given character X, how often does character Y come next? +# We use a special character '.' to mean "start" or "end" of a name. + +# Build a dictionary of dictionaries: +# counts['a']['b'] = number of times 'b' follows 'a' in the training data +counts = {} + +for name in training_data: + # Add start/end markers: ".emma." + chars = ['.'] + list(name) + ['.'] + for i in range(len(chars) - 1): + current = chars[i] + next_char = chars[i + 1] + if current not in counts: + counts[current] = {} + counts[current][next_char] = counts[current].get(next_char, 0) + 1 + +# Let's look at what follows 'a': +print("\n=== What follows 'a' in training data? ===") +if 'a' in counts: + total = sum(counts['a'].values()) + for char, count in sorted(counts['a'].items(), key=lambda x: -x[1]): + prob = count / total + display = "END" if char == '.' else char + print(f" '{display}' : {count} times ({prob:.0%})") + +# ============================================================ +# STEP 3: Convert counts to probabilities +# ============================================================ +probs = {} +for current_char, next_chars in counts.items(): + total = sum(next_chars.values()) + probs[current_char] = {} + for next_char, count in next_chars.items(): + probs[current_char][next_char] = count / total + +# ============================================================ +# STEP 4: INFERENCE — Generate new names! 
+# ============================================================ +print("\n=== Generated Names (sampling from our 'model') ===") +random.seed(42) + +for i in range(10): + name = [] + current = '.' # start token + + for _ in range(20): # max length safety + if current not in probs: + break + # Get possible next characters and their probabilities + next_chars = list(probs[current].keys()) + weights = [probs[current][c] for c in next_chars] + + # Sample randomly, weighted by probability + chosen = random.choices(next_chars, weights=weights, k=1)[0] + + if chosen == '.': # end token + break + name.append(chosen) + current = chosen + + print(f" {i+1:2d}. {''.join(name)}") + +# ============================================================ +# KEY TAKEAWAYS +# ============================================================ +print(""" +=== Key Takeaways === +1. We LEARNED patterns from data (counted character transitions) +2. We GENERATED new text by sampling from those patterns +3. The generated names "sound like" the training data but are new + +This is exactly what GPT does — just with WAY more sophisticated +pattern detection (neural networks instead of simple counting). + +Next chapter: How do we turn text into numbers? → Tokenization +""") diff --git a/gpt/local/course/ch01_What_is_a_Language_Model/README.md b/gpt/local/course/ch01_What_is_a_Language_Model/README.md new file mode 100644 index 0000000..6f23bd5 --- /dev/null +++ b/gpt/local/course/ch01_What_is_a_Language_Model/README.md @@ -0,0 +1,102 @@ +# Chapter 01: What is a Language Model? + +## The One-Sentence Summary + +A language model is a program that **predicts the next word** (or character) given some previous words. + +--- + +## Think of It Like Autocomplete + +You've used autocomplete on your phone: + +``` +You type: "How are ___" +Phone suggests: "you" +``` + +That's a language model! It looked at "How are" and predicted "you" is the most likely next word. + +GPT is the same idea — just way more powerful. 
+ +--- + +## The Core Loop + +Every language model follows this pattern: + +``` +1. Look at some text → "The cat sat on the" +2. Predict the next word → "mat" (70%), "floor" (20%), "dog" (10%) +3. Pick one (sample or take best) → "mat" +4. Append it and repeat → "The cat sat on the mat" +``` + +That's it. GPT generates entire essays by repeating steps 1-4 over and over. + +--- + +## But How Does It Know? + +The model **learns patterns from data**. If you show it millions of sentences, it notices: +- "The cat sat on the ___" → usually "mat", "floor", "chair" +- "Once upon a ___" → usually "time" +- "def __init__(self, ___" → usually a parameter name + +It doesn't "understand" language. It's really good at **pattern matching**. + +--- + +## The Two Phases + +### Phase 1: Training (Learning) +- Feed the model tons of text +- For each position, it tries to predict the next character/word +- When it's wrong, adjust its internal numbers to be less wrong next time +- Repeat millions of times + +### Phase 2: Inference (Generating) +- Give it a starting text (prompt) +- Let it predict the next token, one at a time +- It generates new text that "sounds like" its training data + +--- + +## Characters vs Words vs Tokens + +Language models can predict at different levels: + +| Level | Example Input | Predicts | +|---|---|---| +| **Character-level** | `['H', 'e', 'l', 'l']` | `'o'` | +| **Word-level** | `['The', 'cat']` | `'sat'` | +| **Subword (BPE)** | `['The', ' cat', ' s']` | `'at'` | + +- **microGPT** uses character-level (simplest) +- **minGPT** can use any level, but demos use character-level and BPE + +--- + +## What's Inside a Language Model? 
+ +At its core, a language model is just a **function with adjustable numbers** (parameters): + +``` +f(input_tokens, parameters) → probability of each possible next token +``` + +- The **parameters** are millions of numbers that encode patterns +- **Training** = finding good values for those numbers +- **Architecture** = how those numbers are organized and combined + +GPT uses an architecture called a **Transformer**, which we'll learn about in Chapter 05. + +--- + +## Run the Example + +The file `language_model_idea.py` in this folder shows the simplest possible "language model" — just counting letter frequencies. It's silly and bad, but it captures the core idea. + +```bash +python language_model_idea.py +``` diff --git a/gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py b/gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py new file mode 100644 index 0000000..cabf6f1 --- /dev/null +++ b/gpt/local/course/ch01_What_is_a_Language_Model/language_model_idea.py @@ -0,0 +1,104 @@ +""" +Chapter 01: The Simplest Possible "Language Model" + +This is NOT a real language model — it's a toy to show the core idea: + 1. Learn patterns from data (training) + 2. Generate new text using those patterns (inference) + +We simply count how often each character follows another character. +""" + +import random + +# ============================================================ +# STEP 1: Our "training data" — a few names +# ============================================================ +training_data = [ + "emma", "olivia", "ava", "sophia", "isabella", + "mia", "charlotte", "amelia", "harper", "evelyn", +] + +print("=== Training Data ===") +for name in training_data: + print(f" {name}") + +# ============================================================ +# STEP 2: TRAINING — Count character transitions +# ============================================================ +# We'll count: given character X, how often does character Y come next? 
+# We use a special character '.' to mean "start" or "end" of a name. + +# Build a dictionary of dictionaries: +# counts['a']['b'] = number of times 'b' follows 'a' in the training data +counts = {} + +for name in training_data: + # Add start/end markers: ".emma." + chars = ['.'] + list(name) + ['.'] + for i in range(len(chars) - 1): + current = chars[i] + next_char = chars[i + 1] + if current not in counts: + counts[current] = {} + counts[current][next_char] = counts[current].get(next_char, 0) + 1 + +# Let's look at what follows 'a': +print("\n=== What follows 'a' in training data? ===") +if 'a' in counts: + total = sum(counts['a'].values()) + for char, count in sorted(counts['a'].items(), key=lambda x: -x[1]): + prob = count / total + display = "END" if char == '.' else char + print(f" '{display}' : {count} times ({prob:.0%})") + +# ============================================================ +# STEP 3: Convert counts to probabilities +# ============================================================ +probs = {} +for current_char, next_chars in counts.items(): + total = sum(next_chars.values()) + probs[current_char] = {} + for next_char, count in next_chars.items(): + probs[current_char][next_char] = count / total + +# ============================================================ +# STEP 4: INFERENCE — Generate new names! +# ============================================================ +print("\n=== Generated Names (sampling from our 'model') ===") +random.seed(42) + +for i in range(10): + name = [] + current = '.' # start token + + for _ in range(20): # max length safety + if current not in probs: + break + # Get possible next characters and their probabilities + next_chars = list(probs[current].keys()) + weights = [probs[current][c] for c in next_chars] + + # Sample randomly, weighted by probability + chosen = random.choices(next_chars, weights=weights, k=1)[0] + + if chosen == '.': # end token + break + name.append(chosen) + current = chosen + + print(f" {i+1:2d}. 
{''.join(name)}") + +# ============================================================ +# KEY TAKEAWAYS +# ============================================================ +print(""" +=== Key Takeaways === +1. We LEARNED patterns from data (counted character transitions) +2. We GENERATED new text by sampling from those patterns +3. The generated names "sound like" the training data but are new + +This is exactly what GPT does — just with WAY more sophisticated +pattern detection (neural networks instead of simple counting). + +Next chapter: How do we turn text into numbers? → Tokenization +""") diff --git a/gpt/local/course/ch02_Tokenization/README.md b/gpt/local/course/ch02_Tokenization/README.md new file mode 100644 index 0000000..d1c531a --- /dev/null +++ b/gpt/local/course/ch02_Tokenization/README.md @@ -0,0 +1,83 @@ +# Chapter 02: Tokenization — Turning Text into Numbers + +## Why Tokenization? + +Computers don't understand letters or words — they work with **numbers**. Before we can feed text into any model, we need to convert it to a sequence of integers. This conversion is called **tokenization**. + +``` +"hello" → [7, 4, 11, 11, 14] +``` + +--- + +## Three Levels of Tokenization + +### 1. Character-Level (used by microGPT) + +Each unique character gets a number: + +``` +Vocabulary: {a:0, b:1, c:2, d:3, e:4, ...} +"cab" → [2, 0, 1] +``` + +**Pros:** Tiny vocabulary, simple to implement +**Cons:** Sequences are long, model must learn to spell words + +### 2. Word-Level + +Each unique word gets a number: + +``` +Vocabulary: {"the":0, "cat":1, "sat":2, ...} +"the cat" → [0, 1] +``` + +**Pros:** Short sequences +**Cons:** Huge vocabulary, can't handle new/misspelled words + +### 3. Subword / BPE (used by GPT-2, minGPT) + +Byte Pair Encoding — a clever middle ground. 
Frequent words stay whole, rare words get split: + +``` +"unhappiness" → ["un", "happiness"] (common prefix + common word) +"Karpathy" → ["K", "arp", "athy"] (rare name, split into pieces) +``` + +**Pros:** Handles any text, balanced vocabulary size +**Cons:** More complex to implement + +--- + +## The Vocabulary + +The set of all possible tokens is the **vocabulary**. Its size matters: + +| Tokenizer | Vocab Size | Example | +|---|---|---| +| microGPT (names dataset) | ~27 | a-z + BOS token | +| GPT-2 (BPE) | 50,257 | All common English subwords | + +The model's final layer must output a probability for **every token in the vocabulary**. Bigger vocab = bigger model. + +--- + +## Special Tokens + +Most tokenizers have special tokens with reserved meanings: + +- **BOS** (Beginning of Sequence): Marks the start of a document +- **EOS** (End of Sequence): Marks the end +- **PAD**: Fills empty space when batching sequences of different lengths + +In microGPT, there's one special token: `BOS` (used for both start and end). + +--- + +## Run the Examples + +```bash +python char_tokenizer.py # Character-level tokenization (like microGPT) +python bpe_intuition.py # How BPE works, step by step +``` diff --git a/gpt/local/course/ch02_Tokenization/bpe_intuition.py b/gpt/local/course/ch02_Tokenization/bpe_intuition.py new file mode 100644 index 0000000..db83616 --- /dev/null +++ b/gpt/local/course/ch02_Tokenization/bpe_intuition.py @@ -0,0 +1,112 @@ +""" +Chapter 02: BPE (Byte Pair Encoding) Intuition + +BPE is the tokenizer used by GPT-2 and minGPT. +This simplified example shows the IDEA behind BPE. +(The real implementation is in minGPT/mingpt/bpe.py) +""" + +# ============================================================ +# THE IDEA: Repeatedly merge the most common pair of tokens +# ============================================================ + +def simple_bpe_train(corpus, num_merges): + """ + Train a simple BPE tokenizer. + + Start with individual characters as tokens. 
+ Then repeatedly find the most common adjacent pair + and merge them into a single new token. + """ + # Start: each word is split into individual characters + # We add a special end-of-word marker '_' + words = {} + for word in corpus.split(): + chars = ' '.join(list(word)) + ' _' + words[chars] = words.get(chars, 0) + 1 + + print("=== Starting tokens (individual characters) ===") + print(f" Words: {words}\n") + + merges = [] + + for step in range(num_merges): + # Count all adjacent pairs + pairs = {} + for word, freq in words.items(): + symbols = word.split() + for i in range(len(symbols) - 1): + pair = (symbols[i], symbols[i + 1]) + pairs[pair] = pairs.get(pair, 0) + freq + + if not pairs: + break + + # Find the most common pair + best_pair = max(pairs, key=pairs.get) + merges.append(best_pair) + + print(f"Step {step + 1}: Merge '{best_pair[0]}' + '{best_pair[1]}' " + f"→ '{best_pair[0]}{best_pair[1]}' " + f"(appeared {pairs[best_pair]} times)") + + # Apply the merge: replace all occurrences of this pair + new_words = {} + for word, freq in words.items(): + new_word = word.replace( + f"{best_pair[0]} {best_pair[1]}", + f"{best_pair[0]}{best_pair[1]}" + ) + new_words[new_word] = freq + words = new_words + + # Show current state of words + print(f" Words now: {words}\n") + + return merges, words + + +# ============================================================ +# DEMO: Train BPE on a tiny corpus +# ============================================================ +corpus = "low low low low low lowest lowest newer newer newer wider" + +print("=" * 60) +print("BPE Training Demo") +print(f"Corpus: \"{corpus}\"") +print("=" * 60 + "\n") + +merges, final_words = simple_bpe_train(corpus, num_merges=10) + +print("=" * 60) +print("Final merge rules learned:") +for i, (a, b) in enumerate(merges): + print(f" Rule {i+1}: '{a}' + '{b}' → '{a}{b}'") + +# ============================================================ +# HOW BPE TOKENIZES NEW TEXT +# 
============================================================ +print(""" +=== How BPE Tokenizes New Text === + +After training, to tokenize a new word like "lowest": + 1. Start with characters: ['l', 'o', 'w', 'e', 's', 't'] + 2. Apply merge rules in order: + - Rule 'l'+'o' → 'lo': ['lo', 'w', 'e', 's', 't'] + - Rule 'lo'+'w' → 'low': ['low', 'e', 's', 't'] + - Rule 'e'+'s' → 'es': ['low', 'es', 't'] + - Rule 'es'+'t' → 'est': ['low', 'est'] + 3. Final tokens: ['low', 'est'] + +For a RARE word like "xylophone": + Characters are mostly left as-is because no merge rules apply. + → ['x', 'y', 'l', 'o', 'p', 'h', 'o', 'n', 'e'] + +This is the beauty of BPE: + - COMMON words/subwords become single tokens → efficient + - RARE words get broken into characters → still works + - Vocabulary size is controllable (= number of merges + base chars) + +GPT-2 uses 50,000 merges → vocab size of ~50,257 +microGPT uses 0 merges → vocab is just individual characters (~27) +""") diff --git a/gpt/local/course/ch02_Tokenization/char_tokenizer.py b/gpt/local/course/ch02_Tokenization/char_tokenizer.py new file mode 100644 index 0000000..5e6cdb8 --- /dev/null +++ b/gpt/local/course/ch02_Tokenization/char_tokenizer.py @@ -0,0 +1,92 @@ +""" +Chapter 02: Character-Level Tokenizer + +This is exactly how microGPT tokenizes text. +Each unique character becomes a token ID (integer). 
+""" + +# ============================================================ +# STEP 1: Build vocabulary from data +# ============================================================ +# Imagine our dataset is a list of names +docs = ["emma", "olivia", "ava", "sophia", "isabella", "mia"] + +# Find all unique characters, sorted +all_text = ''.join(docs) +uchars = sorted(set(all_text)) + +print("=== Vocabulary ===") +print(f"Unique characters: {uchars}") +print(f"Number of unique characters: {len(uchars)}") + +# Each character maps to an integer index +char_to_id = {ch: i for i, ch in enumerate(uchars)} +id_to_char = {i: ch for i, ch in enumerate(uchars)} + +print("\nCharacter → ID mapping:") +for ch, idx in char_to_id.items(): + print(f" '{ch}' → {idx}") + +# ============================================================ +# STEP 2: Add special tokens +# ============================================================ +# BOS = Beginning of Sequence. A special token that marks boundaries. +BOS = len(uchars) # Gets the next available ID +vocab_size = len(uchars) + 1 + +print(f"\nBOS token ID: {BOS}") +print(f"Total vocab size: {vocab_size}") + +# ============================================================ +# STEP 3: Encode — turn text into numbers +# ============================================================ +def encode(text): + """Convert a string to a list of token IDs""" + return [char_to_id[ch] for ch in text] + +def decode(token_ids): + """Convert a list of token IDs back to a string""" + return ''.join(id_to_char[i] for i in token_ids if i != BOS) + +# Try encoding some names +print("\n=== Encoding Examples ===") +for name in ["emma", "ava", "mia"]: + tokens = encode(name) + print(f" '{name}' → {tokens}") + + # Add BOS tokens on both sides (like microGPT does) + tokens_with_bos = [BOS] + tokens + [BOS] + print(f" with BOS: {tokens_with_bos}") + +# ============================================================ +# STEP 4: Decode — turn numbers back to text +# 
============================================================ +print("\n=== Decoding Examples ===") +sample_tokens = [2, 6, 6, 0] # e, m, m, a +decoded = decode(sample_tokens) +print(f" {sample_tokens} → '{decoded}'") + +# ============================================================ +# STEP 5: How this connects to the model +# ============================================================ +print(""" +=== How This Connects to GPT === + +During training, the model sees sequences like: + [BOS, e, m, m, a, BOS] → [{bos}, {e}, {m}, {m2}, {a}, {bos2}] + +At each position, it tries to predict the NEXT token: + Given [BOS] → predict 'e' (token {e}) + Given [BOS, e] → predict 'm' (token {m}) + Given [BOS, e, m] → predict 'm' (token {m2}) + Given [BOS, e, m, m] → predict 'a' (token {a}) + Given [BOS, e, m, m, a] → predict BOS (end of name) + +The model outputs {vocab_size} probabilities — one for each possible +next token. Training adjusts the model so the correct next token +gets the highest probability. +""".format( + bos=BOS, e=char_to_id['e'], m=char_to_id['m'], + m2=char_to_id['m'], a=char_to_id['a'], + bos2=BOS, vocab_size=vocab_size +)) diff --git a/gpt/local/course/ch03_Autograd/README.md b/gpt/local/course/ch03_Autograd/README.md new file mode 100644 index 0000000..1adb7d3 --- /dev/null +++ b/gpt/local/course/ch03_Autograd/README.md @@ -0,0 +1,59 @@ +# Chapter 03: Autograd — Teaching Computers Calculus + +## Why Do We Need Calculus? + +To train a model, we need to answer: **"If I nudge this parameter a tiny bit, does the output get better or worse?"** That's what a derivative tells us. + +But a GPT has millions of parameters and hundreds of operations chained together. Computing derivatives by hand is impossible. So we use **automatic differentiation** (autograd).
+ +--- + +## The Chain Rule — The One Rule That Powers All of AI + +If you compute `y = f(g(x))`, the derivative of y with respect to x is: + +``` +dy/dx = dy/dg * dg/dx +``` + +"Multiply the local derivatives along the chain." That's it. Autograd just automates this. + +--- + +## microGPT's `Value` Class + +microGPT implements autograd from scratch in ~40 lines. The `Value` class: +1. Wraps a number +2. Records what operations created it (the computation graph) +3. Can compute gradients automatically via `backward()` + +--- + +## The Computation Graph + +When you write `c = a + b`, autograd secretly builds a graph: + +``` +a ──┐ + ├──(+)──→ c +b ──┘ +``` + +Each node knows its children and the local derivative of the operation. + +--- + +## Forward Pass vs Backward Pass + +- **Forward pass**: Compute the output value (follow the arrows forward) +- **Backward pass**: Compute gradients by walking the graph backward, multiplying local derivatives (chain rule) + +--- + +## Run the Examples + +```bash +python value_basics.py # Understand the Value class +python computation_graph.py # See the chain rule in action +python gradient_descent.py # Use gradients to optimize a simple function +``` diff --git a/gpt/local/course/ch03_Autograd/computation_graph.py b/gpt/local/course/ch03_Autograd/computation_graph.py new file mode 100644 index 0000000..5e1dfdd --- /dev/null +++ b/gpt/local/course/ch03_Autograd/computation_graph.py @@ -0,0 +1,138 @@ +""" +Chapter 03: Visualizing the Computation Graph + +This script shows how operations build a graph, +and how backward() walks it in reverse to compute gradients. 
+""" + +import math + +class Value: + def __init__(self, data, children=(), local_grads=(), label=''): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + self.label = label + + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1), + f"({self.label}+{other.label})") + + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data), + f"({self.label}*{other.label})") + + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),), + f"({self.label}^{n})") + + def __neg__(self): return self * Value(-1, label='-1') + def __sub__(self, other): return self + (-other) + def __rmul__(self, other): return Value(other, label=str(other)) * self + + def log(self): + return Value(math.log(self.data), (self,), (1/self.data,), + f"log({self.label})") + + def exp(self): + return Value(math.exp(self.data), (self,), (math.exp(self.data),), + f"exp({self.label})") + + def backward(self): + topo = [] + visited = set() + def build_topo(v): + if v not in visited: + visited.add(v) + for child in v._children: + build_topo(child) + topo.append(v) + build_topo(self) + self.grad = 1 + for v in reversed(topo): + for child, local_grad in zip(v._children, v._local_grads): + child.grad += local_grad * v.grad + + +# ============================================================ +# BUILD A COMPUTATION: loss = -log(softmax(score)[target]) +# This is EXACTLY what happens in GPT at each prediction step! +# ============================================================ + +print("=" * 60) +print("Computation Graph: A Mini Cross-Entropy Loss") +print("=" * 60) +print() +print("Scenario: Model outputs scores [2.0, 1.0, 0.5] for 3 tokens.") +print("The correct token is index 0. 
What's the loss?\n") + +# These are our "model outputs" (logits) +s0 = Value(2.0, label='s0') # score for token 0 (correct answer) +s1 = Value(1.0, label='s1') # score for token 1 +s2 = Value(0.5, label='s2') # score for token 2 + +# Softmax: convert scores to probabilities +# softmax(si) = exp(si) / sum(exp(sj)) +e0 = s0.exp() +e1 = s1.exp() +e2 = s2.exp() +total = e0 + e1 + e2 + +p0 = e0 * (total ** -1) # probability of token 0 + +# Cross-entropy loss for the correct token (index 0) +loss = -(p0.log()) + +print("Forward Pass (computing the loss):") +print(f" Scores: s0={s0.data:.2f}, s1={s1.data:.2f}, s2={s2.data:.2f}") +print(f" Exponentials: exp(s0)={e0.data:.4f}, exp(s1)={e1.data:.4f}, exp(s2)={e2.data:.4f}") +print(f" Sum of exps: {total.data:.4f}") +print(f" Prob of s0: {p0.data:.4f} ({p0.data*100:.1f}%)") +print(f" Loss: -log({p0.data:.4f}) = {loss.data:.4f}") + +# Now compute gradients! +loss.backward() + +print(f"\nBackward Pass (computing gradients):") +print(f" d(loss)/d(s0) = {s0.grad:.4f}") +print(f" d(loss)/d(s1) = {s1.grad:.4f}") +print(f" d(loss)/d(s2) = {s2.grad:.4f}") + +print(f""" +Interpretation: + s0.grad = {s0.grad:.4f} (NEGATIVE → increasing s0 DECREASES loss → good!) + s1.grad = {s1.grad:.4f} (POSITIVE → increasing s1 INCREASES loss → bad!) + s2.grad = {s2.grad:.4f} (POSITIVE → increasing s2 INCREASES loss → bad!) + +This makes sense! The correct answer is token 0, so: + - We WANT s0 to be higher (gradient pushes it up) + - We WANT s1, s2 to be lower (gradient pushes them down) + +This is exactly how a neural network learns: + 1. Forward pass → compute loss (how wrong are we?) + 2. Backward pass → compute gradients (which direction to adjust?) + 3. 
Update parameters in the direction that reduces loss +""") + +# ============================================================ +# VISUALIZE THE GRAPH STRUCTURE +# ============================================================ +print("=" * 60) +print("The Computation Graph (text visualization)") +print("=" * 60) +print(""" + s0 ─→ exp() ─→ e0 ──┐ + ├─→ e0/total ─→ p0 ─→ log() ─→ neg ─→ LOSS + s1 ─→ exp() ─→ e1 ──┤ ↑ + ├─→ total ───────┘ + s2 ─→ exp() ─→ e2 ──┘ + +Forward: Left to right (compute values) +Backward: Right to left (compute gradients using chain rule) + +Each arrow multiplies by the local derivative. +The chain rule = multiply all local derivatives along the path. +""") diff --git a/gpt/local/course/ch03_Autograd/gradient_descent.py b/gpt/local/course/ch03_Autograd/gradient_descent.py new file mode 100644 index 0000000..7492fe0 --- /dev/null +++ b/gpt/local/course/ch03_Autograd/gradient_descent.py @@ -0,0 +1,122 @@ +""" +Chapter 03: Gradient Descent — Using Gradients to Learn + +This is the fundamental learning algorithm: + 1. Start with random parameter values + 2. Compute how wrong the output is (loss) + 3. Compute gradients (which direction to adjust) + 4. Nudge parameters to reduce the loss + 5. Repeat + +We'll train a tiny "model" (just 2 parameters) to fit a target. 
+""" + +import math +import random + +random.seed(42) + +class Value: + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),)) + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + + +# ============================================================ +# PROBLEM: Learn y = 3x + 7 +# Our model: y_pred = w*x + b (we must find w=3, b=7) +# ============================================================ + +# Initialize parameters randomly +w = Value(random.uniform(-1, 1)) # will learn to be ~3.0 +b = Value(random.uniform(-1, 1)) # will learn to be ~7.0 + +# Training data: (x, y) pairs from the true function y = 3x + 7 +data = [(1, 10), (2, 13), (3, 16), (4, 19), (5, 22)] + +learning_rate = 0.01 + +print("=" * 50) +print("Goal: Learn y = 3x + 7") +print(f"Starting: w = {w.data:.4f}, b = {b.data:.4f}") +print("=" * 50) + +for step in range(100): + # ---- Forward pass: compute predictions and loss ---- + total_loss = Value(0) + for x_val, y_true in data: + y_pred = w * x_val + b # our model's prediction + diff = y_pred - y_true # error + loss = diff ** 2 # 
squared error (always positive) + total_loss = total_loss + loss + + # Average loss + avg_loss = total_loss * (1.0 / len(data)) + + # ---- Backward pass: compute gradients ---- + avg_loss.backward() + + # ---- Update parameters: move in the opposite direction of gradient ---- + w.data -= learning_rate * w.grad + b.data -= learning_rate * b.grad + + # ---- Reset gradients for next step ---- + w.grad = 0 + b.grad = 0 + + # Print progress + if step % 10 == 0 or step == 99: + print(f" Step {step:3d} | loss = {avg_loss.data:.4f} | w = {w.data:.4f} | b = {b.data:.4f}") + + # Recreate Value objects to reset computation graph + w = Value(w.data) + b = Value(b.data) + +print(f"\n Final: w = {w.data:.4f} (target: 3.0), b = {b.data:.4f} (target: 7.0)") + +print(""" +=== What Just Happened === + +1. We started with random w and b +2. Each step: + a. Forward pass: computed predictions and loss + b. Backward pass: autograd computed d(loss)/dw and d(loss)/db + c. Update: moved w and b in the direction that reduces loss +3. After 100 steps, w ≈ 3.0 and b ≈ 7.0 + +This is EXACTLY what happens in GPT training: + - Instead of 2 parameters (w, b), GPT has millions + - Instead of y = wx + b, GPT is a transformer neural network + - Instead of squared error, GPT uses cross-entropy loss + - But the loop is identical: forward → backward → update + +Next chapter: What are the building blocks of that neural network? +""") diff --git a/gpt/local/course/ch03_Autograd/value_basics.py b/gpt/local/course/ch03_Autograd/value_basics.py new file mode 100644 index 0000000..7beebc6 --- /dev/null +++ b/gpt/local/course/ch03_Autograd/value_basics.py @@ -0,0 +1,164 @@ +""" +Chapter 03: Understanding the Value Class + +The Value class is microGPT's autograd engine. +It wraps a number and tracks the computation graph +so gradients can be computed automatically. 
+ +This is a simplified version of the Value class from microgpt.py +""" + +import math + +# ============================================================ +# THE VALUE CLASS (simplified from microGPT) +# ============================================================ +class Value: + """ + A single scalar value that tracks its computation history. + + Think of it as a "smart number" that remembers: + - Its current value (self.data) + - How it was created (self._children, self._local_grads) + - Its gradient, computed later (self.grad) + """ + + def __init__(self, data, children=(), local_grads=()): + self.data = data # the actual number + self.grad = 0 # derivative of loss w.r.t. this value + self._children = children # what values were used to compute this + self._local_grads = local_grads # derivative of this op w.r.t. each child + + def __add__(self, other): + """Addition: c = a + b""" + other = other if isinstance(other, Value) else Value(other) + # d(a+b)/da = 1, d(a+b)/db = 1 + return Value(self.data + other.data, (self, other), (1, 1)) + + def __mul__(self, other): + """Multiplication: c = a * b""" + other = other if isinstance(other, Value) else Value(other) + # d(a*b)/da = b, d(a*b)/db = a + return Value(self.data * other.data, (self, other), (other.data, self.data)) + + def __pow__(self, power): + """Power: c = a ** n""" + # d(a^n)/da = n * a^(n-1) + return Value(self.data ** power, (self,), (power * self.data ** (power - 1),)) + + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __truediv__(self, other): return self * other ** -1 + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + + def backward(self): + """Compute gradients for all values in the computation graph.""" + # Step 1: Build topological ordering (children before parents) + topo = [] + visited = set() + def build_topo(v): + if v not in visited: + visited.add(v) + for child in v._children: + build_topo(child) + 
topo.append(v) + build_topo(self) + + # Step 2: Backpropagate gradients + self.grad = 1 # d(self)/d(self) = 1 + for v in reversed(topo): + for child, local_grad in zip(v._children, v._local_grads): + child.grad += local_grad * v.grad # Chain rule! + + def __repr__(self): + return f"Value(data={self.data:.4f}, grad={self.grad:.4f})" + + +# ============================================================ +# EXAMPLE 1: Simple addition +# ============================================================ +print("=== Example 1: Addition (c = a + b) ===") +a = Value(3.0) +b = Value(5.0) +c = a + b # c = 8.0 + +c.backward() + +print(f" a = {a}") +print(f" b = {b}") +print(f" c = a + b = {c}") +print(f" dc/da = {a.grad} (if we increase a by 1, c increases by 1)") +print(f" dc/db = {b.grad} (if we increase b by 1, c increases by 1)") + +# ============================================================ +# EXAMPLE 2: Multiplication +# ============================================================ +print("\n=== Example 2: Multiplication (c = a * b) ===") +a = Value(3.0) +b = Value(5.0) +c = a * b # c = 15.0 + +c.backward() + +print(f" a = {a}") +print(f" b = {b}") +print(f" c = a * b = {c}") +print(f" dc/da = {a.grad} (= b's value, because d(a*b)/da = b)") +print(f" dc/db = {b.grad} (= a's value, because d(a*b)/db = a)") + +# ============================================================ +# EXAMPLE 3: A chain of operations (chain rule!) 
+# ============================================================ +print("\n=== Example 3: Chain Rule (d = (a + b) * c) ===") +a = Value(2.0) +b = Value(3.0) +c = Value(4.0) + +# Forward pass: +ab = a + b # ab = 5.0 +d = ab * c # d = 20.0 + +# Backward pass: +d.backward() + +print(f" a = {a}") +print(f" b = {b}") +print(f" c = {c}") +print(f" d = (a + b) * c = {d}") +print(f" dd/da = {a.grad} (= c, because d = (a+b)*c, dd/da = c = 4)") +print(f" dd/db = {b.grad} (= c, same reasoning)") +print(f" dd/dc = {c.grad} (= a+b = 5)") + +# ============================================================ +# EXAMPLE 4: More complex expression +# ============================================================ +print("\n=== Example 4: Complex Expression (y = (2x + 3)^2) ===") +x = Value(1.0) + +# Forward pass: +y = (2 * x + 3) ** 2 # y = (2*1+3)^2 = 25 + +# Backward pass: +y.backward() + +print(f" x = {x}") +print(f" y = (2x + 3)^2 = {y}") +print(f" dy/dx = {x.grad}") +print(f" By hand: dy/dx = 2 * 2 * (2x+3) = 4 * (2*1+3) = 20 ✓") + +# ============================================================ +# KEY TAKEAWAY +# ============================================================ +print(""" +=== Key Takeaway === +The Value class lets us: + 1. Write normal math expressions (forward pass) + 2. Automatically compute ALL derivatives (backward pass) + +In GPT, the expression is HUGE (millions of operations), +but the same principle applies. The chain rule just propagates +gradients through the entire computation graph. + +Next: See computation_graph.py for a visual explanation. +""") diff --git a/gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md b/gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md new file mode 100644 index 0000000..03b8da7 --- /dev/null +++ b/gpt/local/course/ch04_Neural_Network_Building_Blocks/README.md @@ -0,0 +1,69 @@ +# Chapter 04: Neural Network Building Blocks + +## Overview + +A neural network is built from a few simple, reusable pieces. 
Understanding these pieces is the key to understanding GPT. + +--- + +## The Building Blocks + +### 1. Linear Layer (Matrix Multiply) +The workhorse of neural networks. Takes an input vector, multiplies by a weight matrix, producing an output vector. + +``` +input: [x1, x2, x3] (3 numbers) +weights: 2x3 matrix (6 numbers, learned) +output: [y1, y2] (2 numbers) + +y1 = w11*x1 + w12*x2 + w13*x3 +y2 = w21*x1 + w22*x2 + w23*x3 +``` + +This is just a weighted sum — each output is a mix of all inputs. + +### 2. Activation Functions (ReLU, GELU) +Without activations, stacking linear layers would just be one big linear layer. Activations add non-linearity so the network can learn complex patterns. + +``` +ReLU(x) = max(0, x) # Simple: zero out negatives (used in microGPT) +GELU(x) ≈ x * sigmoid(1.7*x) # Smoother version (used in minGPT) +``` + +### 3. Softmax +Converts a list of arbitrary numbers into probabilities (positive, sum to 1): + +``` +scores: [2.0, 1.0, 0.5] +softmax: [0.63, 0.23, 0.14] (sum = 1.0) +``` + +Used at the end of GPT to get "probability of each next token." + +### 4. Layer Normalization / RMSNorm +Keeps numbers in a stable range so the network doesn't explode or vanish during training. + +``` +RMSNorm: scale each vector so its average squared value is ~1 +LayerNorm: shift and scale so mean=0, variance=1 +``` + +microGPT uses RMSNorm, minGPT uses LayerNorm. + +### 5. Embedding +A lookup table: token ID → vector of numbers. + +``` +Token 5 → [0.12, -0.34, 0.56, ...] (a vector of n_embd numbers) +``` + +This is how tokens enter the neural network — each token gets mapped to a dense vector. 
+ +--- + +## Run the Examples + +```bash +python building_blocks.py # Interactive demo of each building block +python mlp.py # A complete Multi-Layer Perceptron (MLP) +``` diff --git a/gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py b/gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py new file mode 100644 index 0000000..f963d91 --- /dev/null +++ b/gpt/local/course/ch04_Neural_Network_Building_Blocks/building_blocks.py @@ -0,0 +1,174 @@ +""" +Chapter 04: Neural Network Building Blocks — Hands-On + +Every building block used in GPT, implemented from scratch. +These are the SAME functions used in microGPT (microgpt.py). +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# BLOCK 1: LINEAR LAYER +# ============================================================ +# A linear layer is just matrix-vector multiplication. +# Each output = weighted sum of all inputs. + +def linear(x, w): + """ + x: input vector of length n_in + w: weight matrix of shape [n_out][n_in] + returns: output vector of length n_out + """ + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +print("=" * 50) +print("BLOCK 1: Linear Layer") +print("=" * 50) + +# Input: a vector of 3 numbers +x = [1.0, 2.0, 3.0] + +# Weights: a 2x3 matrix (maps 3 inputs → 2 outputs) +w = [ + [0.1, 0.2, 0.3], # weights for output 0 + [0.4, 0.5, 0.6], # weights for output 1 +] + +y = linear(x, w) +print(f"Input: {x} (3 values)") +print(f"Weights: {w}") +print(f"Output: {y} (2 values)") +print(f" y[0] = 0.1*1 + 0.2*2 + 0.3*3 = {0.1*1 + 0.2*2 + 0.3*3}") +print(f" y[1] = 0.4*1 + 0.5*2 + 0.6*3 = {0.4*1 + 0.5*2 + 0.6*3}") + +# ============================================================ +# BLOCK 2: ACTIVATION FUNCTIONS +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 2: Activation Functions") +print("=" * 50) + +def relu(x): + """ReLU: zero out negative 
values. Used in microGPT.""" + return max(0, x) + +print("\nReLU (used in microGPT):") +for val in [-2, -1, 0, 1, 2, 3]: + print(f" relu({val:2d}) = {relu(val)}") + +print("\nWhy ReLU? Without it, stacking linear layers = one big linear layer.") +print("ReLU adds 'kinks' so the network can model non-linear patterns.") + +# GELU (used in minGPT/GPT-2) — smoother version of ReLU +def gelu(x): + """GELU: Gaussian Error Linear Unit. Used in GPT-2/minGPT.""" + return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3))) + +print("\nGELU (used in minGPT/GPT-2):") +for val in [-2, -1, 0, 1, 2, 3]: + print(f" gelu({val:2d}) = {gelu(val):.4f}") + +print("\nGELU is smoother than ReLU — slightly negative inputs get small negative outputs") +print("instead of being hard-zeroed. This helps training in practice.") + +# ============================================================ +# BLOCK 3: SOFTMAX +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 3: Softmax") +print("=" * 50) + +def softmax(logits): + """Convert raw scores to probabilities.""" + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] # subtract max for numerical stability + total = sum(exps) + return [e / total for e in exps] + +scores = [2.0, 1.0, 0.5] +probs = softmax(scores) +print(f"\nRaw scores (logits): {scores}") +print(f"After softmax: [{', '.join(f'{p:.4f}' for p in probs)}]") +print(f"Sum of probabilities: {sum(probs):.4f}") +print(f"\nHigher score → higher probability. 
All positive, sum to 1.") + +# Show the effect of "temperature" +print("\nTemperature effect (used during text generation):") +for temp in [0.5, 1.0, 2.0]: + scaled = [s / temp for s in scores] + p = softmax(scaled) + print(f" temp={temp}: [{', '.join(f'{x:.3f}' for x in p)}] ", end="") + if temp < 1: + print("← more confident (peaked)") + elif temp > 1: + print("← more random (flat)") + else: + print("← normal") + +# ============================================================ +# BLOCK 4: RMSNORM / LAYERNORM +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 4: Normalization") +print("=" * 50) + +def rmsnorm(x): + """RMSNorm: scale so root-mean-square = 1. Used in microGPT.""" + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] + +def layernorm(x): + """LayerNorm: shift to mean=0, scale to std=1. Used in minGPT.""" + mean = sum(x) / len(x) + variance = sum((xi - mean) ** 2 for xi in x) / len(x) + scale = (variance + 1e-5) ** -0.5 + return [(xi - mean) * scale for xi in x] + +x = [10.0, 20.0, 30.0, 40.0] +print(f"\nOriginal: {x}") +print(f"RMSNorm: [{', '.join(f'{v:.4f}' for v in rmsnorm(x))}]") +print(f"LayerNorm: [{', '.join(f'{v:.4f}' for v in layernorm(x))}]") + +print("\nWhy normalize? 
Keeps values in a reasonable range.") +print("Without it, values can grow huge or tiny, making training unstable.") + +# ============================================================ +# BLOCK 5: EMBEDDING +# ============================================================ +print("\n" + "=" * 50) +print("BLOCK 5: Embedding (Lookup Table)") +print("=" * 50) + +# An embedding is just a table of vectors, one per token +n_embd = 4 # each token becomes a vector of 4 numbers +vocab_size = 5 # we have 5 possible tokens + +# Initialize randomly (these get learned during training) +embedding_table = [[random.gauss(0, 0.1) for _ in range(n_embd)] + for _ in range(vocab_size)] + +print(f"\nEmbedding table ({vocab_size} tokens × {n_embd} dimensions):") +for i, row in enumerate(embedding_table): + print(f" Token {i} → [{', '.join(f'{v:+.3f}' for v in row)}]") + +token_id = 3 +print(f"\nLooking up token {token_id}:") +print(f" → [{', '.join(f'{v:+.3f}' for v in embedding_table[token_id])}]") +print("\nEmbeddings are LEARNED — after training, similar tokens") +print("end up with similar vectors (e.g., 'cat' close to 'dog').") + +print(""" +=== Summary === +These 5 blocks are ALL you need to build GPT: + 1. Linear → combine information (weighted sums) + 2. Activation → add non-linearity (ReLU or GELU) + 3. Softmax → get probabilities for next token + 4. Norm → keep numbers stable (RMSNorm or LayerNorm) + 5. Embedding → turn token IDs into vectors + +Next chapter: Attention — the special sauce of Transformers! +""") diff --git a/gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py b/gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py new file mode 100644 index 0000000..e78c51b --- /dev/null +++ b/gpt/local/course/ch04_Neural_Network_Building_Blocks/mlp.py @@ -0,0 +1,167 @@ +""" +Chapter 04: The MLP (Multi-Layer Perceptron) + +The MLP is a key component inside every Transformer block. +It processes each token position independently, adding +"thinking capacity" to the model. 
+ +Structure: Linear → ReLU → Linear +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Value class (from Chapter 03) for automatic gradients +# ============================================================ +class Value: + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),)) + def relu(self): + return Value(max(0, self.data), (self,), (float(self.data > 0),)) + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other ** -1 + + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + +# ============================================================ +# MLP: Two linear layers with ReLU in between +# ============================================================ +# In GPT, the MLP has this structure: +# input (n_embd) → Linear (4*n_embd) → ReLU → Linear (n_embd) → output +# It "expands" the representation, processes it, then "compresses" back. 
+ +n_in = 3 # input dimension +n_hidden = 8 # hidden dimension (expanded) +n_out = 3 # output dimension (same as input in GPT) + +# Initialize weights randomly +def make_matrix(rows, cols): + return [[Value(random.gauss(0, 0.3)) for _ in range(cols)] for _ in range(rows)] + +W1 = make_matrix(n_hidden, n_in) # first linear layer +W2 = make_matrix(n_out, n_hidden) # second linear layer + +def linear(x, w): + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +def mlp_forward(x): + """The MLP forward pass: Linear → ReLU → Linear""" + # Step 1: Expand (3 → 8) + h = linear(x, W1) + # Step 2: Activate (apply ReLU) + h = [hi.relu() for hi in h] + # Step 3: Compress (8 → 3) + out = linear(h, W2) + return out + +# ============================================================ +# Demo: Run a vector through the MLP +# ============================================================ +print("=" * 50) +print("MLP (Multi-Layer Perceptron) Demo") +print("=" * 50) + +x = [Value(1.0), Value(0.5), Value(-0.3)] + +print(f"\nInput ({n_in} values): [{', '.join(f'{v.data:.2f}' for v in x)}]") + +h = linear(x, W1) +print(f"After Linear1 ({n_hidden} values): [{', '.join(f'{v.data:.2f}' for v in h)}]") + +h_relu = [hi.relu() for hi in h] +print(f"After ReLU ({n_hidden} values): [{', '.join(f'{v.data:.2f}' for v in h_relu)}]") + +out = linear(h_relu, W2) +print(f"After Linear2 ({n_out} values): [{', '.join(f'{v.data:.2f}' for v in out)}]") + +# ============================================================ +# Train the MLP to learn a simple function +# ============================================================ +print("\n" + "=" * 50) +print("Training an MLP to learn: [a, b, c] → [a+b, b+c, a+c]") +print("=" * 50) + +# Training data +data = [ + ([1.0, 2.0, 3.0], [3.0, 5.0, 4.0]), + ([0.5, 1.0, 0.5], [1.5, 1.5, 1.0]), + ([2.0, 0.0, 1.0], [2.0, 1.0, 3.0]), + ([1.0, 1.0, 1.0], [2.0, 2.0, 2.0]), + ([0.0, 3.0, 2.0], [3.0, 5.0, 2.0]), +] + +# Collect all parameters for gradient descent 
+params = [p for row in W1 for p in row] + [p for row in W2 for p in row] +print(f"Number of parameters: {len(params)}") + +lr = 0.005 +for step in range(200): + # Forward pass on all data points + total_loss = Value(0) + for x_data, y_target in data: + x = [Value(v) for v in x_data] + y_pred = mlp_forward(x) + for yp, yt in zip(y_pred, y_target): + total_loss = total_loss + (yp - yt) ** 2 + + total_loss = total_loss * (1.0 / (len(data) * n_out)) + + # Backward pass + total_loss.backward() + + # Update parameters + for p in params: + p.data -= lr * p.grad + p.grad = 0 + + if step % 20 == 0: + print(f" Step {step:3d} | Loss: {total_loss.data:.4f}") + +# Test +print("\nAfter training:") +for x_data, y_target in data[:3]: + x = [Value(v) for v in x_data] + y_pred = mlp_forward(x) + pred = [f"{v.data:.2f}" for v in y_pred] + target = [f"{v:.2f}" for v in y_target] + print(f" Input: {x_data} → Predicted: [{', '.join(pred)}] | Target: [{', '.join(target)}]") + +print(""" +=== In GPT === +Every transformer block contains an MLP just like this. + - Input dimension: n_embd (e.g., 768 for GPT-2) + - Hidden dimension: 4 * n_embd (e.g., 3072) + - The MLP processes each token position independently + - It adds "thinking" — the ability to transform representations + +The MLP is the "feedforward" part. Next: Attention is the "communication" part. +""") diff --git a/gpt/local/course/ch05_Attention_and_Transformers/README.md b/gpt/local/course/ch05_Attention_and_Transformers/README.md new file mode 100644 index 0000000..050e01d --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/README.md @@ -0,0 +1,95 @@ +# Chapter 05: Attention & The Transformer Architecture + +## The Key Insight + +The MLP from Chapter 04 processes each token **independently**. But language requires **context** — the meaning of "bank" depends on whether we're talking about rivers or money. + +**Attention** lets each token look at all previous tokens and decide which ones are relevant. 
+ +--- + +## Attention in Plain English + +Imagine you're reading: "The cat sat on the ___" + +To predict the next word, you need to look back at the whole sentence. But not all words matter equally: +- "cat" is very relevant (what's sitting?) +- "The" is less relevant +- "sat on" tells you it's about a location + +Attention is a mechanism that **automatically learns** which previous tokens to focus on. + +--- + +## How Attention Works: Query, Key, Value + +Each token produces three vectors: +- **Query (Q)**: "What am I looking for?" +- **Key (K)**: "What do I contain?" +- **Value (V)**: "What information do I provide?" + +The process: +1. Each token's Query is compared against all previous tokens' Keys (dot product) +2. The dot products become **attention weights** (via softmax) +3. The weighted sum of Values is the output + +``` +Token "___" asks: "Who should I pay attention to?" + - Q("___") · K("cat") = 0.7 (high match!) + - Q("___") · K("The") = 0.1 (low match) + - Q("___") · K("sat") = 0.5 (medium match) + - Q("___") · K("on") = 0.6 (medium-high match) + - Q("___") · K("the") = 0.1 (low match) + +After softmax: weights = [0.26, 0.14, 0.21, 0.24, 0.14] (sums to 1, up to rounding) +Output = weighted sum of all Value vectors +``` + +--- + +## Multi-Head Attention + +Instead of one set of Q/K/V, GPT uses **multiple heads** (e.g., 4 or 12). Each head can learn to pay attention to different things: +- Head 1 might focus on syntax (subject-verb agreement) +- Head 2 might focus on nearby context +- Head 3 might focus on semantic similarity + +The outputs of all heads are concatenated. + +--- + +## Causal (Masked) Attention + +GPT is **autoregressive** — it generates text left to right. So token at position 5 can only attend to positions 0-4, never to future positions. This is enforced by masking. 
+ +--- + +## The Transformer Block + +One transformer block = Attention + MLP, with residual connections and normalization: + +``` +Input + │ + ├──→ Norm → Multi-Head Attention ──┐ + │ │ + └───────────────────── (+) ←────────┘ ← Residual connection + │ + ├──→ Norm → MLP (Linear → ReLU → Linear) ──┐ + │ │ + └─────────────────────────────── (+) ←────────┘ ← Residual connection + │ +Output +``` + +GPT stacks multiple blocks: microGPT uses 1, GPT-2 uses 12-48. + +--- + +## Run the Examples + +```bash +python attention_basics.py # Step-by-step attention computation +python multi_head_attention.py # Multi-head attention from scratch +python transformer_block.py # A complete transformer block +``` diff --git a/gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py b/gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py new file mode 100644 index 0000000..dffecfe --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/attention_basics.py @@ -0,0 +1,184 @@ +""" +Chapter 05: Attention — Step by Step + +This builds single-head attention from scratch, showing every +intermediate value so you can see exactly what's happening. +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# SETUP: A tiny sequence of 3 tokens, each with 4-dimensional embeddings +# ============================================================ +# Imagine we're processing: "the cat sat" +# After embedding, each token is a vector of numbers. 
+ +seq_len = 3 +d_model = 4 # embedding dimension + +# Pretend these are embeddings (normally learned, here random for demo) +tokens = { + 0: "the", + 1: "cat", + 2: "sat", +} + +# Token embeddings (what the model "sees") +X = [ + [1.0, 0.5, -0.3, 0.8], # "the" + [0.2, 1.2, 0.7, -0.1], # "cat" + [-0.5, 0.3, 1.0, 0.6], # "sat" +] + +print("=" * 60) +print("Single-Head Self-Attention — Step by Step") +print("=" * 60) +print(f"\nSequence: {[tokens[i] for i in range(seq_len)]}") +print(f"Embedding dimension: {d_model}") +print(f"\nToken embeddings (X):") +for i in range(seq_len): + print(f" {tokens[i]:>5}: {X[i]}") + +# ============================================================ +# STEP 1: Project to Q, K, V using weight matrices +# ============================================================ +# In practice, these are learned weight matrices. +# Here we use simple random values for illustration. + +head_dim = d_model # In multi-head, this would be d_model // n_heads + +# Weight matrices (would be learned during training) +Wq = [[random.gauss(0, 0.5) for _ in range(d_model)] for _ in range(head_dim)] +Wk = [[random.gauss(0, 0.5) for _ in range(d_model)] for _ in range(head_dim)] +Wv = [[random.gauss(0, 0.5) for _ in range(d_model)] for _ in range(head_dim)] + +def matmul_vec(w, x): + """Multiply weight matrix by input vector.""" + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +# Compute Q, K, V for each token +Q = [matmul_vec(Wq, X[i]) for i in range(seq_len)] +K = [matmul_vec(Wk, X[i]) for i in range(seq_len)] +V = [matmul_vec(Wv, X[i]) for i in range(seq_len)] + +print("\n--- Step 1: Compute Q (Query), K (Key), V (Value) ---") +for i in range(seq_len): + print(f" {tokens[i]:>5} Q: [{', '.join(f'{v:+.2f}' for v in Q[i])}]") + print(f" {tokens[i]:>5} K: [{', '.join(f'{v:+.2f}' for v in K[i])}]") + print(f" {tokens[i]:>5} V: [{', '.join(f'{v:+.2f}' for v in V[i])}]") + +# ============================================================ +# STEP 2: Compute 
attention scores (Q dot K) +# ============================================================ +print("\n--- Step 2: Attention Scores = Q · K^T / sqrt(d) ---") +print(" Each score measures how much token i 'wants to attend to' token j.\n") + +scale = math.sqrt(head_dim) +scores = [[0.0] * seq_len for _ in range(seq_len)] + +for i in range(seq_len): + for j in range(seq_len): + # Dot product of Q[i] and K[j] + dot = sum(Q[i][d] * K[j][d] for d in range(head_dim)) + scores[i][j] = dot / scale + +# Print score matrix +print(f" {'':>10}", end="") +for j in range(seq_len): + print(f"{tokens[j]:>10}", end="") +print() +for i in range(seq_len): + print(f" {tokens[i]:>10}", end="") + for j in range(seq_len): + print(f"{scores[i][j]:>10.3f}", end="") + print() + +# ============================================================ +# STEP 3: Apply CAUSAL MASK (can't look at future tokens) +# ============================================================ +print("\n--- Step 3: Apply Causal Mask (hide future) ---") +print(" GPT generates left-to-right, so each token can only see itself and earlier tokens.\n") + +masked_scores = [[0.0] * seq_len for _ in range(seq_len)] +for i in range(seq_len): + for j in range(seq_len): + if j > i: # future token — mask it! + masked_scores[i][j] = float('-inf') + else: + masked_scores[i][j] = scores[i][j] + +print(f" {'':>10}", end="") +for j in range(seq_len): + print(f"{tokens[j]:>10}", end="") +print() +for i in range(seq_len): + print(f" {tokens[i]:>10}", end="") + for j in range(seq_len): + if masked_scores[i][j] == float('-inf'): + print(f"{' -inf':>10}", end="") + else: + print(f"{masked_scores[i][j]:>10.3f}", end="") + print() + +# ============================================================ +# STEP 4: Softmax → attention weights (probabilities) +# ============================================================ +print("\n--- Step 4: Softmax → Attention Weights ---") +print(" Convert scores to probabilities. 
-inf becomes 0 (masked out).\n") + +def softmax(logits): + max_val = max(v for v in logits if v != float('-inf')) + exps = [math.exp(v - max_val) if v != float('-inf') else 0.0 for v in logits] + total = sum(exps) + return [e / total if total > 0 else 0.0 for e in exps] + +attn_weights = [softmax(masked_scores[i]) for i in range(seq_len)] + +print(f" {'':>10}", end="") +for j in range(seq_len): + print(f"{tokens[j]:>10}", end="") +print() +for i in range(seq_len): + print(f" {tokens[i]:>10}", end="") + for j in range(seq_len): + print(f"{attn_weights[i][j]:>10.3f}", end="") + print(f" (sum={sum(attn_weights[i]):.3f})") + +print("\n Interpretation:") +print(f" '{tokens[0]}' attends to: only itself (first token, nothing before it)") +print(f" '{tokens[1]}' attends to: '{tokens[0]}' and itself") +print(f" '{tokens[2]}' attends to: all three tokens") + +# ============================================================ +# STEP 5: Weighted sum of Values → output +# ============================================================ +print("\n--- Step 5: Output = Attention Weights × Values ---") +print(" Each token's output is a weighted blend of all attended Value vectors.\n") + +output = [] +for i in range(seq_len): + out_i = [0.0] * head_dim + for j in range(seq_len): + for d in range(head_dim): + out_i[d] += attn_weights[i][j] * V[j][d] + output.append(out_i) + +for i in range(seq_len): + print(f" {tokens[i]:>5} output: [{', '.join(f'{v:+.3f}' for v in output[i])}]") + +print(""" +=== Summary === +Attention in 5 steps: + 1. Project each token to Q, K, V vectors + 2. Score: how much does each Q match each K? (dot product) + 3. Mask: hide future tokens (causal / autoregressive) + 4. Softmax: convert scores to weights (probabilities) + 5. Output: weighted sum of V vectors + +Key insight: The model LEARNS the W_q, W_k, W_v matrices during training. +After training, the attention patterns emerge automatically — the model +discovers which tokens are relevant to which other tokens. 
+""") diff --git a/gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py b/gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py new file mode 100644 index 0000000..dec5b70 --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/multi_head_attention.py @@ -0,0 +1,151 @@ +""" +Chapter 05: Multi-Head Attention + +Instead of one attention mechanism, GPT uses MULTIPLE "heads" +that run in parallel. Each head can learn to focus on different +types of relationships. + +This code follows the exact pattern from microGPT (microgpt.py). +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Configuration (matches microGPT's tiny config) +# ============================================================ +n_embd = 8 # embedding dimension +n_head = 2 # number of attention heads +head_dim = n_embd // n_head # = 4 dimensions per head +seq_len = 4 # sequence length + +print("=" * 60) +print("Multi-Head Attention") +print("=" * 60) +print(f" Embedding dim (n_embd): {n_embd}") +print(f" Number of heads: {n_head}") +print(f" Dimension per head: {head_dim}") +print(f" Sequence length: {seq_len}") + +# ============================================================ +# Initialize weight matrices +# ============================================================ +def make_matrix(rows, cols): + return [[random.gauss(0, 0.3) for _ in range(cols)] for _ in range(rows)] + +Wq = make_matrix(n_embd, n_embd) # Query projection +Wk = make_matrix(n_embd, n_embd) # Key projection +Wv = make_matrix(n_embd, n_embd) # Value projection +Wo = make_matrix(n_embd, n_embd) # Output projection + +def linear(x, w): + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +# ============================================================ +# 
Generate token embeddings (pretend these came from earlier layers) +# ============================================================ +X = [[random.gauss(0, 0.5) for _ in range(n_embd)] for _ in range(seq_len)] + +# ============================================================ +# Multi-Head Attention (KV-cache style, like microGPT) +# ============================================================ +# microGPT processes one token at a time, accumulating keys/values +# This is more memory efficient and matches how inference works. + +print("\n--- Processing tokens one at a time (KV-cache style) ---\n") + +all_keys = [] # accumulated keys +all_values = [] # accumulated values +outputs = [] + +for pos in range(seq_len): + x = X[pos] + + # Project current token to Q, K, V (full n_embd dimension) + q = linear(x, Wq) + k = linear(x, Wk) + v = linear(x, Wv) + + # Store K and V for future tokens to attend to + all_keys.append(k) + all_values.append(v) + + # Now run each attention head + x_attn = [] # will collect outputs from all heads + + for h in range(n_head): + # Each head gets a SLICE of the Q, K, V vectors + hs = h * head_dim # start index for this head + + # Slice out this head's portion + q_h = q[hs:hs + head_dim] + k_h = [ki[hs:hs + head_dim] for ki in all_keys] # all stored keys + v_h = [vi[hs:hs + head_dim] for vi in all_values] # all stored values + + # Compute attention scores: Q dot K for all positions up to current + attn_logits = [] + for t in range(len(k_h)): + dot = sum(q_h[j] * k_h[t][j] for j in range(head_dim)) + attn_logits.append(dot / head_dim ** 0.5) + + # Softmax to get weights + attn_weights = softmax(attn_logits) + + # Weighted sum of values + head_out = [] + for j in range(head_dim): + val = sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) + head_out.append(val) + + x_attn.extend(head_out) # Concatenate head outputs + + if pos == seq_len - 1: # Print details for last token + print(f" Token {pos}, Head {h}:") + print(f" Attention weights: [{', 
'.join(f'{w:.3f}' for w in attn_weights)}]") + print(f" Head output: [{', '.join(f'{v:.3f}' for v in head_out)}]") + + # Final output projection: combine all heads + out = linear(x_attn, Wo) + outputs.append(out) + +print(f"\n--- Outputs ---") +for pos in range(seq_len): + print(f" Token {pos}: [{', '.join(f'{v:+.3f}' for v in outputs[pos][:4])}...]") + +# ============================================================ +# Explain Multi-Head +# ============================================================ +print(f""" +=== Why Multiple Heads? === + +With {n_head} heads, each getting {head_dim} dimensions: + - Head 0 uses dimensions [0:{head_dim}] of Q, K, V + - Head 1 uses dimensions [{head_dim}:{n_embd}] of Q, K, V + +Each head learns DIFFERENT attention patterns: + - Head 0 might learn to attend to the previous token + - Head 1 might learn to attend to the subject of the sentence + +After computing, head outputs are CONCATENATED: + [{head_dim} dims from head 0] + [{head_dim} dims from head 1] = [{n_embd} dims total] + +Then a final linear layer (Wo) mixes the head outputs. + +=== KV-Cache === + +Notice how we process ONE token at a time and store its K, V: + - Token 0: compute K0, V0, store them + - Token 1: compute K1, V1, attend to [K0, K1], [V0, V1] + - Token 2: compute K2, V2, attend to [K0, K1, K2], [V0, V1, V2] + +This is exactly how microGPT works (and how real GPT inference works). +The "KV-cache" avoids recomputing keys and values for earlier tokens. 
+""") diff --git a/gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py b/gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py new file mode 100644 index 0000000..454b163 --- /dev/null +++ b/gpt/local/course/ch05_Attention_and_Transformers/transformer_block.py @@ -0,0 +1,207 @@ +""" +Chapter 05: A Complete Transformer Block + +This combines everything from Chapters 04 and 05: + - RMSNorm + - Multi-Head Attention + - MLP (Linear → ReLU → Linear) + - Residual Connections + +This is the EXACT structure of one layer in microGPT. +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Configuration +# ============================================================ +n_embd = 8 +n_head = 2 +head_dim = n_embd // n_head + +# ============================================================ +# Helper functions (from earlier chapters) +# ============================================================ +def linear(x, w): + return [sum(wi * xi for wi, xi in zip(row, x)) for row in w] + +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +def rmsnorm(x): + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] + +def relu(x): + return max(0, x) + +# ============================================================ +# Initialize all weights for one transformer block +# ============================================================ +def make_matrix(rows, cols): + return [[random.gauss(0, 0.3) for _ in range(cols)] for _ in range(rows)] + +weights = { + 'attn_wq': make_matrix(n_embd, n_embd), + 'attn_wk': make_matrix(n_embd, n_embd), + 'attn_wv': make_matrix(n_embd, n_embd), + 'attn_wo': make_matrix(n_embd, n_embd), + 'mlp_fc1': make_matrix(4 * n_embd, n_embd), # expand + 'mlp_fc2': make_matrix(n_embd, 4 * n_embd), # compress +} + +# 
============================================================ +# THE TRANSFORMER BLOCK +# ============================================================ +def transformer_block(x, keys_cache, values_cache, pos): + """ + Process one token through one transformer block. + + Args: + x: input embedding vector [n_embd] + keys_cache: list of previous key vectors + values_cache: list of previous value vectors + pos: position index (for printing) + + Returns: + output vector [n_embd] + """ + # ---- Part 1: Multi-Head Attention ---- + x_residual = x[:] # save for residual connection + + # Pre-norm + x_normed = rmsnorm(x) + + # Compute Q, K, V + q = linear(x_normed, weights['attn_wq']) + k = linear(x_normed, weights['attn_wk']) + v = linear(x_normed, weights['attn_wv']) + + # Add current K, V to cache + keys_cache.append(k) + values_cache.append(v) + + # Multi-head attention + x_attn = [] + for h in range(n_head): + hs = h * head_dim + q_h = q[hs:hs + head_dim] + k_h = [ki[hs:hs + head_dim] for ki in keys_cache] + v_h = [vi[hs:hs + head_dim] for vi in values_cache] + + # Attention scores + attn_logits = [ + sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim ** 0.5 + for t in range(len(k_h)) + ] + attn_weights = softmax(attn_logits) + + # Weighted sum of values + head_out = [ + sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) + for j in range(head_dim) + ] + x_attn.extend(head_out) + + # Output projection + x = linear(x_attn, weights['attn_wo']) + + # Residual connection: ADD the input back + x = [a + b for a, b in zip(x, x_residual)] + + # ---- Part 2: MLP ---- + x_residual = x[:] # save for residual connection + + # Pre-norm + x_normed = rmsnorm(x) + + # MLP: expand → activate → compress + h = linear(x_normed, weights['mlp_fc1']) # n_embd → 4*n_embd + h = [relu(hi) for hi in h] # ReLU activation + x_mlp = linear(h, weights['mlp_fc2']) # 4*n_embd → n_embd + + # Residual connection: ADD the input back + x = [a + b for a, b in zip(x_mlp, x_residual)] + + return x 
+ +# ============================================================ +# DEMO: Process a sequence through the transformer block +# ============================================================ +print("=" * 60) +print("Complete Transformer Block Demo") +print("=" * 60) + +# Fake token embeddings +tokens = ["The", "cat", "sat", "on"] +X = [[random.gauss(0, 0.5) for _ in range(n_embd)] for _ in range(len(tokens))] + +keys_cache = [] +values_cache = [] + +print(f"\nProcessing {len(tokens)} tokens through one transformer block:\n") + +for pos, token in enumerate(tokens): + x_in = X[pos] + x_out = transformer_block(x_in, keys_cache, values_cache, pos) + + # Compute how much the representation changed + change = sum((a - b) ** 2 for a, b in zip(x_in, x_out)) ** 0.5 + + print(f" Token {pos} ('{token}'):") + print(f" Input: [{', '.join(f'{v:+.3f}' for v in x_in[:4])}...]") + print(f" Output: [{', '.join(f'{v:+.3f}' for v in x_out[:4])}...]") + print(f" Change magnitude: {change:.4f}") + print(f" Attended to {pos + 1} token(s)") + +print(""" +=== Anatomy of a Transformer Block === + + Input x + │ + ├─── save as residual + │ + ▼ + RMSNorm(x) + │ + ▼ + Multi-Head Attention (look at previous tokens) + │ + ▼ + + residual ← Information highway! Original info preserved + │ + ├─── save as residual + │ + ▼ + RMSNorm(x) + │ + ▼ + MLP: Linear(4x) → ReLU → Linear(1x) (per-token processing) + │ + ▼ + + residual ← Information highway again + │ + ▼ + Output x + +=== Why Residual Connections? === +Without them, information from early tokens would get lost. +The residual connection ensures the original signal always +has a direct path through the network. 
Think of it as: + "Output = Original + What I Learned" + +=== GPT = Stack of These Blocks === + - microGPT: 1 block (tiny, for learning) + - GPT-2 small: 12 blocks + - GPT-2 XL: 48 blocks + - GPT-3: 96 blocks + +More blocks = more processing steps = more "thinking" +""") diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/README.md b/gpt/local/course/ch06_Training_Loop_and_Optimization/README.md new file mode 100644 index 0000000..6823f40 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/README.md @@ -0,0 +1,67 @@ +# Chapter 06: Training — How Models Learn + +## The Training Loop + +Every neural network trains with the same loop: + +``` +for each training step: + 1. Pick a training example + 2. Forward pass: run the model, compute the loss + 3. Backward pass: compute gradients (autograd) + 4. Update: adjust parameters to reduce the loss +``` + +--- + +## Loss Function: Cross-Entropy + +For language models, the loss measures: **"How surprised was the model by the correct next token?"** + +``` +Model predicts: P("the")=0.6, P("a")=0.3, P("cat")=0.1 +Correct answer: "the" + +Loss = -log(0.6) = 0.51 (low loss — model was fairly confident) + +Model predicts: P("the")=0.1, P("a")=0.1, P("cat")=0.8 +Correct answer: "the" + +Loss = -log(0.1) = 2.30 (high loss — model was wrong!) +``` + +Lower loss = better predictions. Training minimizes the average loss. + +--- + +## The Optimizer: Adam + +Simple gradient descent updates parameters as: `p -= learning_rate * gradient` + +**Adam** is smarter — it maintains: +- **Momentum (m)**: Running average of gradients (direction smoothing) +- **Velocity (v)**: Running average of squared gradients (per-parameter learning rate) + +This makes training faster and more stable. 
+ +--- + +## Learning Rate + +The learning rate controls step size: +- **Too high**: Parameters overshoot, loss explodes +- **Too low**: Training is painfully slow +- **Just right**: Loss decreases steadily + +Common practice: **start with a reasonable LR and decay it** over training. +microGPT uses linear decay: `lr = lr_start * (1 - step/total_steps)` + +--- + +## Run the Examples + +```bash +python cross_entropy.py # Understanding the loss function +python adam_optimizer.py # How Adam works, with visualization +python training_loop.py # A complete mini training loop +``` diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py b/gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py new file mode 100644 index 0000000..75d7ee1 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/adam_optimizer.py @@ -0,0 +1,124 @@ +""" +Chapter 06: The Adam Optimizer + +Adam is the optimizer used in both microGPT and minGPT. +It's smarter than plain gradient descent because it adapts +the learning rate for each parameter individually. + +This demo compares plain SGD vs Adam on a simple problem. +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Problem: Minimize f(x, y) = (x - 3)^2 + 10*(y - 7)^2 +# The minimum is at (3, 7). Can Adam find it? +# ============================================================ + +def f(x, y): + """Function to minimize. 
Minimum at (3, 7).""" + return (x - 3) ** 2 + 10 * (y - 7) ** 2 + +def grad_f(x, y): + """Gradient of f.""" + return 2 * (x - 3), 20 * (y - 7) + +# ============================================================ +# Method 1: Plain Gradient Descent (SGD) +# ============================================================ +print("=" * 60) +print("Method 1: Plain Gradient Descent (SGD)") +print("=" * 60) + +x, y = 0.0, 0.0 # start far from the minimum +lr = 0.01 + +for step in range(50): + gx, gy = grad_f(x, y) + x -= lr * gx + y -= lr * gy + if step % 5 == 0: + print(f" Step {step:3d}: x={x:.4f}, y={y:.4f}, f={f(x, y):.4f}") + +print(f" Final: x={x:.4f} (target: 3.0), y={y:.4f} (target: 7.0)") + +# ============================================================ +# Method 2: Adam Optimizer +# ============================================================ +print("\n" + "=" * 60) +print("Method 2: Adam Optimizer") +print("=" * 60) + +x, y = 0.0, 0.0 # same starting point +lr = 0.1 +beta1, beta2, eps = 0.9, 0.99, 1e-8 + +# Adam's extra state: momentum and velocity for each parameter +mx, my = 0.0, 0.0 # first moment (momentum) +vx, vy = 0.0, 0.0 # second moment (velocity) + +for step in range(50): + gx, gy = grad_f(x, y) + + # Update momentum (exponential moving average of gradients) + mx = beta1 * mx + (1 - beta1) * gx + my = beta1 * my + (1 - beta1) * gy + + # Update velocity (exponential moving average of squared gradients) + vx = beta2 * vx + (1 - beta2) * gx ** 2 + vy = beta2 * vy + (1 - beta2) * gy ** 2 + + # Bias correction (important for early steps when m and v are near 0) + mx_hat = mx / (1 - beta1 ** (step + 1)) + my_hat = my / (1 - beta1 ** (step + 1)) + vx_hat = vx / (1 - beta2 ** (step + 1)) + vy_hat = vy / (1 - beta2 ** (step + 1)) + + # Update parameters + x -= lr * mx_hat / (vx_hat ** 0.5 + eps) + y -= lr * my_hat / (vy_hat ** 0.5 + eps) + + if step % 5 == 0: + print(f" Step {step:3d}: x={x:.4f}, y={y:.4f}, f={f(x, y):.4f}") + +print(f" Final: x={x:.4f} (target: 
3.0), y={y:.4f} (target: 7.0)") + +# ============================================================ +# Explain what Adam does +# ============================================================ +print(f""" +=== Why Adam is Better === + +Plain SGD: every parameter uses the SAME learning rate. + - The y direction has steep gradients (10x) → it overshoots or oscillates + - The x direction has gentle gradients → it moves too slowly + - You can't make both happy with one learning rate + +Adam: ADAPTS the learning rate per parameter. + - Momentum (m): smooths out gradient noise, like a rolling ball + - Velocity (v): tracks gradient magnitude per parameter + - Parameters with large gradients get SMALLER steps + - Parameters with small gradients get LARGER steps + +=== Adam in microGPT (line 176-182) === + + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1 - beta1) * p.grad # momentum + v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 # velocity + m_hat = m[i] / (1 - beta1 ** (step + 1)) # bias correction + v_hat = v[i] / (1 - beta2 ** (step + 1)) # bias correction + p.data -= lr * m_hat / (v_hat ** 0.5 + eps) # update! + p.grad = 0 # reset for next step + +Same algorithm, just applied to thousands of parameters at once. + +=== Learning Rate Decay === + +microGPT also decays the learning rate linearly: + lr_t = lr * (1 - step / total_steps) + +This means: take big steps early (explore), small steps late (fine-tune). +""") diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py b/gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py new file mode 100644 index 0000000..bcf6925 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/cross_entropy.py @@ -0,0 +1,121 @@ +""" +Chapter 06: Cross-Entropy Loss — The GPT Loss Function + +The cross-entropy loss measures how "surprised" the model is +by the correct answer. It's the standard loss function for +classification and language modeling. 
+ +Formula: loss = -log(probability_of_correct_token) +""" + +import math + +# ============================================================ +# The Intuition +# ============================================================ +print("=" * 60) +print("Cross-Entropy Loss: How 'Surprised' Is the Model?") +print("=" * 60) + +print(""" +If the model assigns HIGH probability to the correct answer: + loss = -log(0.9) = {:.3f} ← Low loss (good!) + +If the model assigns LOW probability to the correct answer: + loss = -log(0.1) = {:.3f} ← High loss (bad!) + +If the model assigns VERY LOW probability: + loss = -log(0.01) = {:.3f} ← Very high loss (terrible!) +""".format(-math.log(0.9), -math.log(0.1), -math.log(0.01))) + +# ============================================================ +# Step by step: from logits to loss +# ============================================================ +print("=" * 60) +print("From Model Output to Loss (Step by Step)") +print("=" * 60) + +# Pretend our vocabulary is: [a, b, c, d, e] (5 tokens) +vocab = ['a', 'b', 'c', 'd', 'e'] +correct_token = 2 # the correct next token is 'c' + +# Step 1: Model outputs raw scores (logits) +logits = [1.0, 2.0, 4.0, 1.5, 0.5] +print(f"\nStep 1 - Model outputs logits (raw scores):") +for i, (tok, score) in enumerate(zip(vocab, logits)): + marker = " ← correct" if i == correct_token else "" + print(f" '{tok}': {score:.1f}{marker}") + +# Step 2: Softmax converts to probabilities +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +probs = softmax(logits) +print(f"\nStep 2 - Softmax → probabilities:") +for i, (tok, p) in enumerate(zip(vocab, probs)): + bar = "█" * int(p * 40) + marker = " ← correct" if i == correct_token else "" + print(f" '{tok}': {p:.4f} {bar}{marker}") +print(f" Sum: {sum(probs):.4f}") + +# Step 3: Cross-entropy loss +loss = -math.log(probs[correct_token]) +print(f"\nStep 3 - Loss = -log(P(correct)) = 
-log({probs[correct_token]:.4f}) = {loss:.4f}")
+
+# ============================================================
+# What the gradients look like
+# ============================================================
+print("\n" + "=" * 60)
+print("What Gradients Tell the Model")
+print("=" * 60)
+print(f"""
+After computing loss, backpropagation gives us gradients.
+For softmax + cross-entropy, the gradient for each logit is simple:
+
+    gradient[i] = probability[i] - (1 if i is correct else 0)
+""")
+
+for i, (tok, p) in enumerate(zip(vocab, probs)):
+    target = 1.0 if i == correct_token else 0.0
+    grad = p - target
+    direction = "↓ decrease" if grad > 0 else "↑ INCREASE"
+    print(f"  '{tok}': grad = {p:.4f} - {target:.0f} = {grad:+.4f}  → {direction} this score")
+
+print("""
+The gradients push the model to:
+  - INCREASE the score of the correct token ('c')
+  - DECREASE the scores of all incorrect tokens
+  - The adjustment is proportional to how wrong the model was
+""")
+
+# ============================================================
+# Average loss over a sequence
+# ============================================================
+print("=" * 60)
+print("Average Loss Over a Sequence")
+print("=" * 60)
+
+# Simulating predictions for "hello" where model gets better over positions
+# NOTE: the loop below reads probs_for_correct[0] as P(correct), so the
+# correct-token probability is always listed FIRST in each list.
+sequence = [
+    ("h", [0.1, 0.8, 0.05, 0.05]),  # model correctly predicts 'h' with 10%
+    ("e", [0.8, 0.05, 0.05, 0.1]),  # model correctly predicts 'e' with 80%
+    ("l", [0.6, 0.2, 0.1, 0.1]),    # model correctly predicts 'l' with 60%
+    ("l", [0.7, 0.1, 0.1, 0.1]),    # model correctly predicts second 'l' with 70%
+    ("o", [0.7, 0.1, 0.1, 0.1]),    # model correctly predicts 'o' with 70%
+]
+
+print(f"\nPredicting each character in a sequence:")
+losses = []
+for char, probs_for_correct in sequence:
+    p = probs_for_correct[0]  # simplified: first prob is for correct token
+    l = -math.log(max(p, 1e-10))
+    losses.append(l)
+    print(f"  '{char}': P(correct) = {p:.2f}, loss = {l:.4f}")
+
+avg_loss = sum(losses) / 
len(losses) +print(f"\n Average loss = {avg_loss:.4f}") +print(f" This is what microGPT computes: (1/n) * sum(losses)") +print(f"\n Training goal: make this number as small as possible!") diff --git a/gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py b/gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py new file mode 100644 index 0000000..7e88988 --- /dev/null +++ b/gpt/local/course/ch06_Training_Loop_and_Optimization/training_loop.py @@ -0,0 +1,171 @@ +""" +Chapter 06: A Complete Mini Training Loop + +This script trains a tiny neural network on a simple pattern, +using the EXACT same training loop structure as microGPT. + +The pattern: learn to predict the next character in "abcabc..." +""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Autograd engine (from Chapter 03) +# ============================================================ +class Value: + __slots__ = ('data', 'grad', '_children', '_local_grads') + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): return Value(self.data**n, (self,), (n * self.data**(n-1),)) + def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),)) + def log(self): return Value(math.log(self.data), (self,), (1/self.data,)) + def __neg__(self): return self * -1 + def __sub__(self, other): return self + (-other) + def __radd__(self, other): return self + other + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other**-1 + def 
backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + +# ============================================================ +# Simple model: a single linear layer (logits = W @ one_hot_input) +# This is basically just a lookup table — the simplest "model" +# ============================================================ +vocab = ['a', 'b', 'c'] +vocab_size = len(vocab) +char_to_id = {ch: i for i, ch in enumerate(vocab)} + +# The training data: repeating "abc" +data = "abcabcabcabcabc" + +# Our "model": a weight matrix that maps each token to logits for next token +# W[i][j] = score for predicting token j when input is token i +matrix = lambda rows, cols: [[Value(random.gauss(0, 0.5)) for _ in range(cols)] + for _ in range(rows)] +W = matrix(vocab_size, vocab_size) +params = [p for row in W for p in row] + +def softmax(logits): + max_val = max(val.data for val in logits) + exps = [(val - max_val).exp() for val in logits] + total = sum(exps) + return [e / total for e in exps] + +# ============================================================ +# Training loop — same structure as microGPT! +# ============================================================ +print("=" * 50) +print("Training a tiny model to predict: a→b, b→c, c→a") +print(f"Parameters: {len(params)}") +print("=" * 50) + +learning_rate = 0.5 +beta1, beta2, eps = 0.85, 0.99, 1e-8 +m = [0.0] * len(params) +v = [0.0] * len(params) + +num_steps = 100 + +for step in range(num_steps): + # 1. Pick a training example (input token → target token) + pos = step % (len(data) - 1) + input_id = char_to_id[data[pos]] + target_id = char_to_id[data[pos + 1]] + + # 2. 
Forward pass: look up the row, get logits, compute loss + logits = W[input_id] # simple lookup: logits for this input + probs = softmax(logits) + loss = -probs[target_id].log() + + # 3. Backward pass + loss.backward() + + # 4. Adam optimizer update + lr_t = learning_rate * (1 - step / num_steps) # linear decay + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1 - beta1) * p.grad + v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 + m_hat = m[i] / (1 - beta1 ** (step + 1)) + v_hat = v[i] / (1 - beta2 ** (step + 1)) + p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps) + p.grad = 0 # reset gradient + + if step % 10 == 0: + print(f" Step {step:3d} | loss {loss.data:.4f} | " + f"input='{data[pos]}' target='{data[pos+1]}' " + f"P(correct)={probs[target_id].data:.3f}") + +# ============================================================ +# Inference: what did the model learn? +# ============================================================ +print("\n" + "=" * 50) +print("After Training: What the Model Learned") +print("=" * 50) + +print("\nPrediction probabilities:") +for input_ch in vocab: + input_id = char_to_id[input_ch] + logits = W[input_id] + probs = softmax(logits) + print(f"\n Given '{input_ch}', predict:") + for j, ch in enumerate(vocab): + bar = "█" * int(probs[j].data * 30) + print(f" '{ch}': {probs[j].data:.3f} {bar}") + +# Generate a sequence! +print("\n" + "=" * 50) +print("Generating text (starting from 'a'):") +print("=" * 50) + +current = char_to_id['a'] +generated = ['a'] +for _ in range(20): + logits = W[current] + probs = softmax(logits) + # Greedy: pick the highest probability token + next_token = max(range(vocab_size), key=lambda i: probs[i].data) + generated.append(vocab[next_token]) + current = next_token + +print(f" {''.join(generated)}") +print(f" (Should be 'abcabcabc...' if training worked!)") + +print(""" +=== This Is microGPT's Training Loop === + +What we just did: + 1. Pick a document, tokenize it ← line 156-158 + 2. 
For each position, predict next token ← line 163-168 + 3. Compute cross-entropy loss ← line 167-169 + 4. Backward pass to get gradients ← line 172 + 5. Adam optimizer updates all parameters ← line 176-182 + +The only difference in real microGPT: + - The model is a full transformer (not a lookup table) + - It processes entire sequences, not single tokens + - It has ~5000 parameters instead of 9 + +But the training loop structure is IDENTICAL. +""") diff --git a/gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md b/gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md new file mode 100644 index 0000000..840c88d --- /dev/null +++ b/gpt/local/course/ch07_microGPT_Full_Walkthrough/README.md @@ -0,0 +1,70 @@ +# Chapter 07: microGPT — Full Walkthrough + +## What is microGPT? + +microGPT is a **complete GPT implementation in 200 lines of pure Python**. No PyTorch, no TensorFlow, no libraries (except `math`, `random`, `os`). It builds everything from scratch: + +- Autograd engine (Chapter 03) +- Neural network layers (Chapter 04) +- Multi-head attention (Chapter 05) +- Training loop with Adam optimizer (Chapter 06) + +It trains on a dataset of baby names and learns to generate new, plausible-sounding names. 
+ +**Source:** `../../8627fe009c40f57531cb18360106ce95/microgpt.py` + +--- + +## The 7 Sections of microGPT + +| Lines | Section | What It Does | +|---|---|---| +| 1-21 | Dataset | Downloads names, shuffles them | +| 23-27 | Tokenizer | Character-level: each letter → integer | +| 29-73 | Autograd | The `Value` class with forward/backward | +| 74-90 | Parameters | Initialize all weight matrices | +| 92-144 | Model | The GPT architecture (attention + MLP) | +| 146-184 | Training | Forward → loss → backward → Adam update | +| 186-200 | Inference | Generate new names from the trained model | + +--- + +## Architecture Summary + +``` +microGPT Config: + n_layer = 1 (1 transformer block) + n_embd = 16 (16-dimensional embeddings) + block_size = 16 (max sequence length) + n_head = 4 (4 attention heads) + head_dim = 4 (16 / 4 = 4 dims per head) + vocab_size = 27 (26 letters + BOS token) +``` + +Total parameters: ~5,000 (vs GPT-2's 124 million!) + +--- + +## Run the Annotated Version + +```bash +python microgpt_annotated.py # The full microGPT with detailed comments +``` + +**Warning:** This takes several minutes to run because it's pure Python (no GPU acceleration). That's the tradeoff for zero dependencies! 
+ +--- + +## Key Differences from "Real" GPT-2 + +| Feature | microGPT | GPT-2 | +|---|---|---| +| Dependencies | None | PyTorch | +| Autograd | Custom `Value` class | PyTorch autograd | +| Normalization | RMSNorm | LayerNorm | +| Activation | ReLU | GELU | +| Biases | None | Yes | +| Layers | 1 | 12-48 | +| Parameters | ~5,000 | 124M-1.5B | +| Training | Single document at a time | Batched | +| Tokenizer | Character-level | BPE (50K vocab) | diff --git a/gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py b/gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py new file mode 100644 index 0000000..7150177 --- /dev/null +++ b/gpt/local/course/ch07_microGPT_Full_Walkthrough/microgpt_annotated.py @@ -0,0 +1,382 @@ +""" +Chapter 07: microGPT — Fully Annotated + +This is the COMPLETE microgpt.py by @karpathy, with extensive +annotations for CS students. Every section maps back to the +concepts from Chapters 01-06. + +Original: ../../8627fe009c40f57531cb18360106ce95/microgpt.py + +To run: python microgpt_annotated.py +(Takes several minutes — it's pure Python, no GPU!) +You can reduce num_steps below to speed it up. +""" + +# ===================================================================== +# SECTION 1: IMPORTS & SETUP (no external dependencies!) +# ===================================================================== +# This entire GPT uses ONLY Python standard library. +# "Everything else is just efficiency." — @karpathy + +import os # os.path.exists +import math # math.log, math.exp +import random # random.seed, random.choices, random.gauss, random.shuffle +random.seed(42) # Fixed seed for reproducibility + +# ===================================================================== +# SECTION 2: DATASET (Chapter 01 — The Data) +# ===================================================================== +# Downloads a list of ~32,000 baby names. +# Each name becomes a "document" that the model learns to generate. 
+ +if not os.path.exists('input.txt'): + import urllib.request + names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt' + urllib.request.urlretrieve(names_url, 'input.txt') +docs = [line.strip() for line in open('input.txt') if line.strip()] +random.shuffle(docs) +print(f"num docs: {len(docs)}") +print(f"first 5 names: {docs[:5]}") + +# ===================================================================== +# SECTION 3: TOKENIZER (Chapter 02 — Tokenization) +# ===================================================================== +# Character-level tokenizer: each unique letter → integer +# Plus one special BOS (Beginning of Sequence) token + +uchars = sorted(set(''.join(docs))) # unique characters: ['a', 'b', ..., 'z'] +BOS = len(uchars) # BOS gets the next available ID (26) +vocab_size = len(uchars) + 1 # 27 total tokens +print(f"vocab size: {vocab_size}") +print(f"characters: {uchars}") +print(f"BOS token id: {BOS}") + +# Example: tokenizing "emma" +example = "emma" +example_tokens = [BOS] + [uchars.index(ch) for ch in example] + [BOS] +print(f"'{example}' tokenized: {example_tokens}") + +# ===================================================================== +# SECTION 4: AUTOGRAD ENGINE (Chapter 03 — The Value Class) +# ===================================================================== +# Every number in the model is a Value object that tracks: +# - Its numeric value (.data) +# - Its gradient (.grad) — filled in during backward pass +# - How it was computed (_children, _local_grads) — for chain rule + +class Value: + __slots__ = ('data', 'grad', '_children', '_local_grads') + + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + + # --- ARITHMETIC OPERATIONS --- + # Each operation records the local derivative for the chain rule. 
+ + def __add__(self, other): + # d(a+b)/da = 1, d(a+b)/db = 1 + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + + def __mul__(self, other): + # d(a*b)/da = b, d(a*b)/db = a + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + + def __pow__(self, other): + # d(a^n)/da = n * a^(n-1) + return Value(self.data**other, (self,), (other * self.data**(other-1),)) + + def log(self): + # d(log(a))/da = 1/a + return Value(math.log(self.data), (self,), (1/self.data,)) + + def exp(self): + # d(exp(a))/da = exp(a) + return Value(math.exp(self.data), (self,), (math.exp(self.data),)) + + def relu(self): + # d(relu(a))/da = 1 if a > 0, else 0 + return Value(max(0, self.data), (self,), (float(self.data > 0),)) + + # Convenience methods for operator overloading + def __neg__(self): return self * -1 + def __radd__(self, other): return self + other + def __sub__(self, other): return self + (-other) + def __rsub__(self, other): return other + (-self) + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other**-1 + def __rtruediv__(self, other): return other * self**-1 + + def backward(self): + """Backpropagation: compute gradients via reverse-mode autodiff.""" + # Step 1: Topological sort (ensure children processed before parents) + topo = [] + visited = set() + def build_topo(v): + if v not in visited: + visited.add(v) + for child in v._children: + build_topo(child) + topo.append(v) + build_topo(self) + + # Step 2: Propagate gradients backward through the graph + self.grad = 1 # d(loss)/d(loss) = 1 + for v in reversed(topo): + for child, local_grad in zip(v._children, v._local_grads): + # Chain rule: child.grad += local_derivative * parent.grad + child.grad += local_grad * v.grad + +# ===================================================================== +# SECTION 5: PARAMETER 
INITIALIZATION (Chapter 04 — Weight Matrices) +# ===================================================================== +# The "knowledge" of the model lives in these weight matrices. +# Before training, they're random. After training, they encode patterns. + +n_layer = 1 # 1 transformer block (GPT-2 uses 12-48) +n_embd = 16 # embedding dimension (GPT-2 uses 768-1600) +block_size = 16 # max context length (longest name is 15 chars) +n_head = 4 # attention heads (GPT-2 uses 12-25) +head_dim = n_embd // n_head # = 4 dims per head + +# Helper: create a matrix of Value objects initialized with small random values +matrix = lambda nout, nin, std=0.08: [ + [Value(random.gauss(0, std)) for _ in range(nin)] + for _ in range(nout) +] + +# The state_dict: all learnable parameters organized by name +state_dict = { + 'wte': matrix(vocab_size, n_embd), # Token embeddings: 27 tokens × 16 dims + 'wpe': matrix(block_size, n_embd), # Position embeddings: 16 positions × 16 dims + 'lm_head': matrix(vocab_size, n_embd) # Output layer: maps embeddings → token scores +} + +# Each transformer layer has attention and MLP weights +for i in range(n_layer): + state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd) # Query projection + state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd) # Key projection + state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd) # Value projection + state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd) # Output projection + state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd) # MLP expand (16→64) + state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd) # MLP compress (64→16) + +# Flatten all parameters into one list (for the optimizer) +params = [p for mat in state_dict.values() for row in mat for p in row] +print(f"num params: {len(params)}") + +# ===================================================================== +# SECTION 6: MODEL ARCHITECTURE (Chapter 05 — Attention + MLP) +# 
===================================================================== +# The GPT model: token → embedding → transformer blocks → logits +# Follows GPT-2 with minor changes: RMSNorm instead of LayerNorm, +# no biases, ReLU instead of GELU. + +def linear(x, w): + """Linear layer: matrix-vector multiply (Chapter 04)""" + return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w] + +def softmax(logits): + """Softmax: scores → probabilities (Chapter 04)""" + max_val = max(val.data for val in logits) + exps = [(val - max_val).exp() for val in logits] + total = sum(exps) + return [e / total for e in exps] + +def rmsnorm(x): + """RMSNorm: stabilize values (Chapter 04)""" + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] + +def gpt(token_id, pos_id, keys, values): + """ + Process one token through the GPT model. + + This is called once per token position during both training and inference. + Uses KV-cache: stores keys/values so previous positions aren't recomputed. 
+ + Args: + token_id: integer, which token (0-26) + pos_id: integer, position in sequence (0-15) + keys: list of lists, accumulated key vectors per layer + values: list of lists, accumulated value vectors per layer + + Returns: + logits: list of 27 Values, scores for each possible next token + """ + # STEP 1: Embedding lookup (Chapter 02) + # Each token gets a learned vector + a position vector + tok_emb = state_dict['wte'][token_id] # [n_embd] = [16] + pos_emb = state_dict['wpe'][pos_id] # [n_embd] = [16] + x = [t + p for t, p in zip(tok_emb, pos_emb)] # combine + x = rmsnorm(x) # normalize + + # STEP 2: Transformer blocks (Chapter 05) + for li in range(n_layer): + + # --- ATTENTION BLOCK --- + x_residual = x + x = rmsnorm(x) + + # Project to Q, K, V + q = linear(x, state_dict[f'layer{li}.attn_wq']) + k = linear(x, state_dict[f'layer{li}.attn_wk']) + v = linear(x, state_dict[f'layer{li}.attn_wv']) + + # Store K, V in cache for future tokens + keys[li].append(k) + values[li].append(v) + + # Multi-head attention + x_attn = [] + for h in range(n_head): + hs = h * head_dim # start index for this head's slice + + # Get this head's Q, K, V + q_h = q[hs:hs+head_dim] + k_h = [ki[hs:hs+head_dim] for ki in keys[li]] # all cached keys + v_h = [vi[hs:hs+head_dim] for vi in values[li]] # all cached values + + # Attention scores: Q · K / sqrt(d) + attn_logits = [ + sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 + for t in range(len(k_h)) + ] + + # Attention weights (softmax) + attn_weights = softmax(attn_logits) + + # Weighted sum of values + head_out = [ + sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) + for j in range(head_dim) + ] + x_attn.extend(head_out) # concatenate heads + + # Output projection + residual connection + x = linear(x_attn, state_dict[f'layer{li}.attn_wo']) + x = [a + b for a, b in zip(x, x_residual)] + + # --- MLP BLOCK --- + x_residual = x + x = rmsnorm(x) + x = linear(x, state_dict[f'layer{li}.mlp_fc1']) # expand: 16 → 64 + x 
= [xi.relu() for xi in x] # activate + x = linear(x, state_dict[f'layer{li}.mlp_fc2']) # compress: 64 → 16 + x = [a + b for a, b in zip(x, x_residual)] # residual + + # STEP 3: Output head (map embedding → token scores) + logits = linear(x, state_dict['lm_head']) # [vocab_size] = [27] + return logits + +# ===================================================================== +# SECTION 7: ADAM OPTIMIZER SETUP (Chapter 06) +# ===================================================================== +learning_rate, beta1, beta2, eps_adam = 0.01, 0.85, 0.99, 1e-8 +m = [0.0] * len(params) # momentum buffer +v = [0.0] * len(params) # velocity buffer + +# ===================================================================== +# SECTION 8: TRAINING LOOP (Chapter 06) +# ===================================================================== +# For each step: +# 1. Take a name, tokenize it +# 2. Forward pass: predict each next character +# 3. Compute cross-entropy loss +# 4. Backward pass: compute gradients +# 5. Adam update: adjust parameters + +num_steps = 200 # Reduced from 1000 for faster demo. Increase for better results! +print(f"\nTraining for {num_steps} steps...") +print("(Reduce num_steps in the code if this is too slow)\n") + +for step in range(num_steps): + + # 1. Pick a document (name) and tokenize it + doc = docs[step % len(docs)] + tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] + n = min(block_size, len(tokens) - 1) + + # 2-3. 
Forward pass: predict next token at each position, accumulate loss + keys_cache = [[] for _ in range(n_layer)] + values_cache = [[] for _ in range(n_layer)] + losses = [] + + for pos_id in range(n): + token_id = tokens[pos_id] + target_id = tokens[pos_id + 1] + + # Forward: get logits for this position + logits = gpt(token_id, pos_id, keys_cache, values_cache) + + # Softmax → probabilities + probs = softmax(logits) + + # Cross-entropy loss for this position + loss_t = -probs[target_id].log() + losses.append(loss_t) + + # Average loss over all positions in this name + loss = (1 / n) * sum(losses) + + # 4. Backward pass: compute all gradients + loss.backward() + + # 5. Adam optimizer: update all parameters + lr_t = learning_rate * (1 - step / num_steps) # linear LR decay + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1 - beta1) * p.grad + v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 + m_hat = m[i] / (1 - beta1 ** (step + 1)) + v_hat = v[i] / (1 - beta2 ** (step + 1)) + p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam) + p.grad = 0 # reset for next step + + if step % 20 == 0 or step == num_steps - 1: + print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}") + +# ===================================================================== +# SECTION 9: INFERENCE (Chapter 01 — Generating Text) +# ===================================================================== +# Now the model generates new names that "sound like" the training data. +# Process: start with BOS, predict next token, repeat until BOS again. 
+ +temperature = 0.5 # lower = more conservative, higher = more creative + +print("\n--- inference (new, hallucinated names) ---") +for sample_idx in range(20): + keys_cache = [[] for _ in range(n_layer)] + values_cache = [[] for _ in range(n_layer)] + token_id = BOS + sample = [] + + for pos_id in range(block_size): + logits = gpt(token_id, pos_id, keys_cache, values_cache) + # Apply temperature: divide logits before softmax + probs = softmax([l / temperature for l in logits]) + # Sample from the probability distribution + token_id = random.choices(range(vocab_size), + weights=[p.data for p in probs])[0] + if token_id == BOS: + break # end of name + sample.append(uchars[token_id]) + + print(f"sample {sample_idx+1:2d}: {''.join(sample)}") + +print(""" +=== That's the entire microGPT! === + +200 lines that implement: + ✓ Dataset loading and tokenization + ✓ Automatic differentiation (autograd) + ✓ Transformer architecture (attention + MLP) + ✓ Training with Adam optimizer + ✓ Text generation (inference) + +Next: Chapter 08 shows how PyTorch makes this 100x faster and cleaner. +""") diff --git a/gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md new file mode 100644 index 0000000..91101e4 --- /dev/null +++ b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/README.md @@ -0,0 +1,85 @@ +# Chapter 08: Scaling Up with PyTorch — Introduction to minGPT + +## Why PyTorch? + +microGPT is beautiful for learning, but it's **painfully slow**. Processing one name takes seconds because every single multiplication is a Python function call. + +PyTorch solves this by: +1. **Tensors**: Multi-dimensional arrays that run on GPU (1000x faster math) +2. **Autograd**: Automatic differentiation built-in (no need for our `Value` class) +3. **nn.Module**: Clean way to organize model layers +4. **Optimizers**: Adam, SGD, etc. 
already implemented and optimized + +--- + +## The Same Ideas, Better Tools + +| Concept | microGPT (Pure Python) | minGPT (PyTorch) | +|---|---|---| +| Numbers | `Value(3.14)` | `torch.tensor(3.14)` | +| Gradients | `value.backward()` | `tensor.backward()` | +| Linear layer | `linear(x, w)` (manual loop) | `nn.Linear(in, out)` | +| Matrix multiply | Nested for-loops | `@` operator (GPU-accelerated) | +| Softmax | Manual exp/sum | `F.softmax(x)` | +| Optimizer | Manual Adam code | `torch.optim.AdamW(...)` | + +--- + +## minGPT Project Structure + +``` +minGPT/ +├── mingpt/ +│ ├── model.py ← The GPT model (311 lines) +│ ├── trainer.py ← Training loop (110 lines) +│ ├── bpe.py ← BPE tokenizer (320 lines) +│ └── utils.py ← Config helpers (104 lines) +├── projects/ +│ ├── chargpt/ ← Character-level text generation +│ └── adder/ ← Teaching GPT to add numbers +├── demo.ipynb ← Sorting demo +└── generate.ipynb ← GPT-2 text generation +``` + +--- + +## Key PyTorch Concepts for minGPT + +### Tensors +```python +# A tensor is like a multi-dimensional array +x = torch.tensor([1.0, 2.0, 3.0]) # 1D +W = torch.randn(4, 3) # 2D (matrix) +batch = torch.randn(32, 10, 768) # 3D (batch of sequences) +``` + +### nn.Module +```python +# Every layer inherits from nn.Module +class MyLayer(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(768, 768) # weight matrix inside + + def forward(self, x): + return self.linear(x) +``` + +### Autograd +```python +x = torch.tensor(3.0, requires_grad=True) +y = x ** 2 + 2 * x +y.backward() # computes dy/dx +print(x.grad) # tensor(8.) (= 2*3 + 2) +``` + +--- + +## Run the Examples + +```bash +python pytorch_basics.py # PyTorch fundamentals +python pytorch_vs_manual.py # Side-by-side: manual vs PyTorch +``` + +**Note:** These examples require PyTorch. 
Install with: `pip install torch` diff --git a/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py new file mode 100644 index 0000000..7f03a1a --- /dev/null +++ b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_basics.py @@ -0,0 +1,200 @@ +""" +Chapter 08: PyTorch Basics for Understanding minGPT + +This introduces the PyTorch concepts you need to read minGPT's code. +Each section maps to something we built manually in earlier chapters. + +Requires: pip install torch +""" + +try: + import torch + import torch.nn as nn + from torch.nn import functional as F +except ImportError: + print("PyTorch not installed. Run: pip install torch") + print("This chapter requires PyTorch.") + exit(1) + +# ============================================================ +# 1. TENSORS — PyTorch's version of our Value class +# ============================================================ +print("=" * 60) +print("1. Tensors — Multi-dimensional arrays with autograd") +print("=" * 60) + +# Scalars, vectors, matrices, and higher +scalar = torch.tensor(3.14) +vector = torch.tensor([1.0, 2.0, 3.0]) +matrix = torch.randn(3, 4) # 3x4 matrix of random numbers + +print(f" Scalar: {scalar} (shape: {scalar.shape})") +print(f" Vector: {vector} (shape: {vector.shape})") +print(f" Matrix shape: {matrix.shape}") + +# Matrix multiplication — what took nested loops in microGPT +A = torch.randn(2, 3) +B = torch.randn(3, 4) +C = A @ B # matrix multiply — runs on GPU if available! +print(f"\n Matrix multiply: ({list(A.shape)}) @ ({list(B.shape)}) → ({list(C.shape)})") +print(f" This replaces our manual linear() function with nested loops!") + +# ============================================================ +# 2. AUTOGRAD — Automatic gradients (replaces our Value class) +# ============================================================ +print("\n" + "=" * 60) +print("2. 
Autograd — Automatic differentiation") +print("=" * 60) + +# Compare with Chapter 03's Value class +x = torch.tensor(2.0, requires_grad=True) +y = (2 * x + 3) ** 2 # y = (2*2+3)^2 = 49 + +y.backward() # compute dy/dx + +print(f" x = {x.item():.1f}") +print(f" y = (2x + 3)^2 = {y.item():.1f}") +print(f" dy/dx = {x.grad.item():.1f}") +print(f" Manual: 4*(2x+3) = 4*(2*2+3) = 28.0 ✓") + +# ============================================================ +# 3. nn.Linear — Replaces our manual linear() function +# ============================================================ +print("\n" + "=" * 60) +print("3. nn.Linear — Built-in linear layer") +print("=" * 60) + +# In microGPT: linear(x, w) with manual loops +# In minGPT: nn.Linear does the same thing, but GPU-accelerated +layer = nn.Linear(3, 2) # 3 inputs → 2 outputs + +x = torch.tensor([1.0, 2.0, 3.0]) +y = layer(x) # applies y = Wx + b + +print(f" Input: {x.tolist()} (3 values)") +print(f" Output: [{', '.join(f'{v:.4f}' for v in y.tolist())}] (2 values)") +print(f" Weight shape: {list(layer.weight.shape)}") +print(f" Bias shape: {list(layer.bias.shape)}") +print(f" Total params: {layer.weight.numel() + layer.bias.numel()}") + +# ============================================================ +# 4. nn.Embedding — Replaces our embedding lookup table +# ============================================================ +print("\n" + "=" * 60) +print("4. 
nn.Embedding — Token/position lookup table") +print("=" * 60) + +# In microGPT: state_dict['wte'][token_id] +# In minGPT: nn.Embedding(vocab_size, n_embd) +emb = nn.Embedding(10, 4) # 10 tokens, each maps to 4 numbers + +token_ids = torch.tensor([2, 5, 7]) # look up 3 tokens at once +vectors = emb(token_ids) + +print(f" Token IDs: {token_ids.tolist()}") +print(f" Output shape: {list(vectors.shape)} (3 tokens × 4 dims)") +print(f" Token 2 → [{', '.join(f'{v:.3f}' for v in vectors[0].tolist())}]") +print(f" Token 5 → [{', '.join(f'{v:.3f}' for v in vectors[1].tolist())}]") +print(f" Token 7 → [{', '.join(f'{v:.3f}' for v in vectors[2].tolist())}]") + +# ============================================================ +# 5. SOFTMAX & CROSS-ENTROPY +# ============================================================ +print("\n" + "=" * 60) +print("5. Softmax & Cross-Entropy — Built-in and optimized") +print("=" * 60) + +logits = torch.tensor([2.0, 1.0, 0.5]) +probs = F.softmax(logits, dim=-1) +print(f" Logits: {logits.tolist()}") +print(f" Softmax: [{', '.join(f'{p:.4f}' for p in probs.tolist())}]") + +# Cross-entropy loss (combines softmax + negative log likelihood) +logits_batch = torch.tensor([[2.0, 1.0, 0.5]]) # batch of 1 +targets = torch.tensor([0]) # correct answer is token 0 +loss = F.cross_entropy(logits_batch, targets) +print(f" Cross-entropy loss: {loss.item():.4f}") +print(f" Same as: -log(softmax[0]) = -log({probs[0].item():.4f}) = {-probs[0].log().item():.4f}") + +# ============================================================ +# 6. BATCHING — Processing multiple examples at once +# ============================================================ +print("\n" + "=" * 60) +print("6. Batching — The big speed advantage") +print("=" * 60) + +print(""" + microGPT processes ONE token at a time (sequential Python loops). 
+ minGPT processes MANY tokens at once using batch dimensions: + + microGPT: for each token in sequence: logits = gpt(token, pos) + minGPT: logits = model(all_tokens_at_once) # one call! + + Shape conventions in minGPT: + B = batch size (e.g., 32 documents at once) + T = sequence length (e.g., 128 tokens) + C = embedding dim (e.g., 768) + + Input: (B, T) — batch of token ID sequences + After embedding: (B, T, C) — batch of embedding sequences + Logits: (B, T, vocab_size) — predictions at every position +""") + +# Demo: batch processing +B, T, C = 2, 4, 8 # 2 sequences, 4 tokens each, 8-dim embeddings +x = torch.randn(B, T, C) +linear_layer = nn.Linear(C, C) +y = linear_layer(x) # applies to ALL positions in ALL sequences at once! +print(f" Input shape: {list(x.shape)} (B={B}, T={T}, C={C})") +print(f" Output shape: {list(y.shape)} (same shape — processed all at once!)") + +# ============================================================ +# 7. nn.Module — Organizing models +# ============================================================ +print("\n" + "=" * 60) +print("7. 
nn.Module — How minGPT organizes its model") +print("=" * 60) + +class TinyModel(nn.Module): + def __init__(self, n_in, n_hidden, n_out): + super().__init__() + self.layer1 = nn.Linear(n_in, n_hidden) + self.layer2 = nn.Linear(n_hidden, n_out) + + def forward(self, x): + x = F.relu(self.layer1(x)) + x = self.layer2(x) + return x + +model = TinyModel(3, 8, 2) +n_params = sum(p.numel() for p in model.parameters()) +print(f" Model: 3 → 8 → 2") +print(f" Total parameters: {n_params}") +print(f" Parameters are automatically tracked by nn.Module!") + +# Forward and backward in one go +x = torch.randn(5, 3) # batch of 5 inputs +y = model(x) +loss = y.sum() +loss.backward() + +print(f"\n Input shape: {list(x.shape)}") +print(f" Output shape: {list(y.shape)}") +print(f" Gradients computed for all {n_params} parameters!") + +print(""" +=== Key Takeaway === + +PyTorch gives us the same building blocks as microGPT: + Value class → torch.Tensor with autograd + manual linear() → nn.Linear + manual softmax() → F.softmax + manual Adam → torch.optim.AdamW + +But MUCH faster because: + - Operations run on GPU (or optimized CPU) + - Batch processing: handle many sequences at once + - No Python-level loops for math + +Next: Chapter 09 — How minGPT uses all of this to build a full GPT model. +""") diff --git a/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py new file mode 100644 index 0000000..ee31e64 --- /dev/null +++ b/gpt/local/course/ch08_Scaling_Up_with_PyTorch/pytorch_vs_manual.py @@ -0,0 +1,202 @@ +""" +Chapter 08: Side-by-Side — Manual Python vs PyTorch + +Shows the SAME operation implemented both ways, +so you can see exactly what PyTorch is doing for you. 
+ +Requires: pip install torch +""" + +import math +import random + +random.seed(42) + +try: + import torch + import torch.nn as nn + from torch.nn import functional as F + torch.manual_seed(42) +except ImportError: + print("PyTorch not installed. Run: pip install torch") + exit(1) + +# ============================================================ +# COMPARISON 1: Softmax +# ============================================================ +print("=" * 60) +print("Comparison 1: Softmax") +print("=" * 60) + +scores = [2.0, 1.0, 0.5] + +# MANUAL (like microGPT) +max_val = max(scores) +exps = [math.exp(s - max_val) for s in scores] +total = sum(exps) +manual_probs = [e / total for e in exps] + +# PYTORCH (like minGPT) +torch_probs = F.softmax(torch.tensor(scores), dim=-1) + +print(f" Input scores: {scores}") +print(f" Manual: [{', '.join(f'{p:.4f}' for p in manual_probs)}]") +print(f" PyTorch: {torch_probs.tolist()}") +print(f" Match: {all(abs(a - b) < 1e-6 for a, b in zip(manual_probs, torch_probs.tolist()))}") + +# ============================================================ +# COMPARISON 2: Linear Layer +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 2: Linear Layer (Matrix Multiply)") +print("=" * 60) + +x = [1.0, 2.0, 3.0] +W = [[0.1, 0.2, 0.3], + [0.4, 0.5, 0.6]] + +# MANUAL (like microGPT) +manual_out = [sum(wi * xi for wi, xi in zip(row, x)) for row in W] + +# PYTORCH (like minGPT) +x_t = torch.tensor(x) +W_t = torch.tensor(W) +torch_out = (W_t @ x_t).tolist() + +print(f" Input: {x}") +print(f" Manual: {manual_out}") +print(f" PyTorch: {torch_out}") +print(f" Match: {all(abs(a - b) < 1e-6 for a, b in zip(manual_out, torch_out))}") + +# ============================================================ +# COMPARISON 3: Autograd (Gradient Computation) +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 3: Autograd") +print("=" * 60) + +# MANUAL Value class (like 
microGPT) +class Value: + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): + return Value(self.data ** n, (self,), (n * self.data ** (n-1),)) + def __rmul__(self, other): return self * other + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + +# Manual autograd +a_manual = Value(2.0) +b_manual = Value(3.0) +c_manual = (a_manual * b_manual + 5) ** 2 # (2*3+5)^2 = 121 +c_manual.backward() + +# PyTorch autograd +a_torch = torch.tensor(2.0, requires_grad=True) +b_torch = torch.tensor(3.0, requires_grad=True) +c_torch = (a_torch * b_torch + 5) ** 2 +c_torch.backward() + +print(f" Expression: (a*b + 5)^2 where a=2, b=3") +print(f" Result: manual={c_manual.data:.1f}, pytorch={c_torch.item():.1f}") +print(f" da: manual={a_manual.grad:.1f}, pytorch={a_torch.grad.item():.1f}") +print(f" db: manual={b_manual.grad:.1f}, pytorch={b_torch.grad.item():.1f}") + +# ============================================================ +# COMPARISON 4: Cross-Entropy Loss +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 4: Cross-Entropy Loss") +print("=" * 60) + +logits = [2.0, 1.0, 0.5] +target = 0 # correct token is index 0 + +# MANUAL (like microGPT) +max_val = max(logits) +exps = [math.exp(v - max_val) for v in logits] +total = sum(exps) +probs = [e / 
total for e in exps] +manual_loss = -math.log(probs[target]) + +# PYTORCH (like minGPT) +logits_t = torch.tensor([logits]) # add batch dim +target_t = torch.tensor([target]) +torch_loss = F.cross_entropy(logits_t, target_t) + +print(f" Logits: {logits}, Target: {target}") +print(f" Manual loss: {manual_loss:.6f}") +print(f" PyTorch loss: {torch_loss.item():.6f}") +print(f" Match: {abs(manual_loss - torch_loss.item()) < 1e-5}") + +# ============================================================ +# COMPARISON 5: Speed +# ============================================================ +print("\n" + "=" * 60) +print("Comparison 5: Speed (rough estimate)") +print("=" * 60) + +import time + +size = 100 + +# Manual matrix multiply +W_manual = [[random.random() for _ in range(size)] for _ in range(size)] +x_manual = [random.random() for _ in range(size)] + +start = time.time() +for _ in range(100): + result = [sum(wi * xi for wi, xi in zip(row, x_manual)) for row in W_manual] +manual_time = time.time() - start + +# PyTorch matrix multiply +W_torch = torch.randn(size, size) +x_torch = torch.randn(size) + +start = time.time() +for _ in range(100): + result = W_torch @ x_torch +pytorch_time = time.time() - start + +speedup = manual_time / max(pytorch_time, 1e-10) +print(f" {size}x{size} matrix-vector multiply × 100 iterations:") +print(f" Manual Python: {manual_time*1000:.1f}ms") +print(f" PyTorch: {pytorch_time*1000:.1f}ms") +print(f" Speedup: ~{speedup:.0f}x") +print(f" (On GPU, the speedup would be 100-1000x more!)") + +print(""" +=== Summary === + +microGPT and minGPT implement the SAME algorithm. +The difference is purely in efficiency: + + microGPT: every operation is a Python function call + minGPT: operations are batched into fast tensor ops + +This means: + - microGPT trains for minutes on tiny data + - minGPT can train on Shakespeare, add numbers, sort lists + - Real GPT-2 trains on the entire internet + +The math is identical. The speed is not. 
+""") diff --git a/gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md new file mode 100644 index 0000000..1abf1e2 --- /dev/null +++ b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/README.md @@ -0,0 +1,79 @@ +# Chapter 09: minGPT Model — Deep Dive + +## Overview + +minGPT's model is defined in `mingpt/model.py` (~311 lines). It implements the **exact same architecture** as microGPT but using PyTorch, making it fast enough to train on real tasks. + +**Source:** `../../minGPT/mingpt/model.py` + +--- + +## The Four Classes + +### 1. `NewGELU` (line 21-27) +Activation function — smoother alternative to ReLU. +- microGPT uses `ReLU(x) = max(0, x)` +- minGPT uses `GELU(x) ≈ 0.5 * x * (1 + tanh(...))` + +### 2. `CausalSelfAttention` (line 29-71) +Multi-head attention with causal masking. +- Same math as microGPT's attention, but batched +- Uses a pre-computed triangular mask for causal attention +- All Q/K/V computed in one matrix multiply (efficiency trick) + +### 3. `Block` (line 73-93) +One transformer block = Attention + MLP + residual connections. +- Identical structure to microGPT's transformer block +- Uses LayerNorm instead of RMSNorm + +### 4. `GPT` (line 95-311) +The full model: embeddings + stacked blocks + output head. 
+- Supports multiple model sizes (gpt-nano to gpt2-xl) +- Can load pretrained GPT-2 weights from HuggingFace +- Includes generation (inference) logic + +--- + +## Model Size Configurations + +```python +'gpt-nano': n_layer=3, n_head=3, n_embd=48 # Tiny, for demos +'gpt-micro': n_layer=4, n_head=4, n_embd=128 # Small +'gpt-mini': n_layer=6, n_head=6, n_embd=192 # Medium-small +'gpt2': n_layer=12, n_head=12, n_embd=768 # 124M params (real GPT-2) +'gpt2-xl': n_layer=48, n_head=25, n_embd=1600 # 1.5B params +``` + +--- + +## The Forward Pass (simplified) + +``` +Input token IDs: (batch_size, seq_len) + ↓ +Token Embedding + Position Embedding + ↓ +Dropout + ↓ +Block 0: LayerNorm → Attention → (+) → LayerNorm → MLP → (+) +Block 1: LayerNorm → Attention → (+) → LayerNorm → MLP → (+) + ... +Block N: LayerNorm → Attention → (+) → LayerNorm → MLP → (+) + ↓ +Final LayerNorm + ↓ +Linear head → logits (batch_size, seq_len, vocab_size) + ↓ +Cross-entropy loss (if targets provided) +``` + +--- + +## Run the Examples + +```bash +python model_walkthrough.py # Annotated walkthrough of minGPT's model.py +python model_sizes.py # Compare different GPT configurations +``` + +**Requires:** `pip install torch` diff --git a/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py new file mode 100644 index 0000000..9eed563 --- /dev/null +++ b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_sizes.py @@ -0,0 +1,146 @@ +""" +Chapter 09: GPT Model Sizes — From Nano to GPT-2 + +Shows how the same architecture scales from tiny to huge +by changing just three numbers: n_layer, n_head, n_embd. + +Requires: pip install torch +""" + +try: + import torch + import torch.nn as nn + import math +except ImportError: + print("PyTorch not installed. 
def count_gpt_params(vocab_size, block_size, n_layer, n_head, n_embd):
    """Analytically count the parameters of a GPT-2-style model.

    Mirrors minGPT's architecture: learned token + position embeddings,
    n_layer transformer blocks (fused-QKV attention, a 4x-expand MLP and
    two LayerNorms each), a final LayerNorm, and an output head counted
    separately (even though real GPT-2 ties it to the token embedding).

    Note: n_head never appears in the arithmetic — splitting the same
    embedding width across more heads does not add weights.

    Returns:
        (total, params, per_layer, layer_total) where `params` maps the
        top-level components to counts and `per_layer` breaks down one block.
    """
    per_layer = {
        # Attention: one fused Q/K/V projection plus the output projection.
        'c_attn_weight': n_embd * (3 * n_embd),
        'c_attn_bias': 3 * n_embd,
        'c_proj_weight': n_embd * n_embd,
        'c_proj_bias': n_embd,
        # MLP: expand to 4*n_embd, activate, compress back.
        'mlp_fc_weight': n_embd * (4 * n_embd),
        'mlp_fc_bias': 4 * n_embd,
        'mlp_proj_weight': (4 * n_embd) * n_embd,
        'mlp_proj_bias': n_embd,
        # Two LayerNorms per block, each a weight + bias vector.
        'ln1': 2 * n_embd,
        'ln2': 2 * n_embd,
    }
    layer_total = sum(per_layer.values())

    params = {
        'wte': vocab_size * n_embd,          # token embedding table
        'wpe': block_size * n_embd,          # position embedding table
        'all_layers': layer_total * n_layer, # all transformer blocks
        'ln_f': 2 * n_embd,                  # final LayerNorm
        'lm_head': n_embd * vocab_size,      # un-tied output projection
    }
    return sum(params.values()), params, per_layer, layer_total
1024, 36, 20, 1280), + ("GPT-2 XL", 50257, 1024, 48, 25, 1600), +] + +print(f"\n{'Model':<16} {'Layers':>6} {'Heads':>6} {'Embd':>6} {'Params':>14} {'Relative':>10}") +print("-" * 70) + +base_params = None +for name, vocab, block, n_layer, n_head, n_embd in configs: + total, _, _, _ = count_gpt_params(vocab, block, n_layer, n_head, n_embd) + if base_params is None: + base_params = total + relative = total / base_params + + if total < 1_000_000: + param_str = f"{total:,}" + else: + param_str = f"{total/1e6:.1f}M" + + print(f"{name:<16} {n_layer:>6} {n_head:>6} {n_embd:>6} {param_str:>14} {relative:>9.0f}x") + +# ============================================================ +# Detailed breakdown of one model +# ============================================================ +print("\n" + "=" * 70) +print("Detailed Breakdown: gpt-mini (n_layer=6, n_head=6, n_embd=192)") +print("=" * 70) + +total, params, per_layer, layer_total = count_gpt_params(50257, 1024, 6, 6, 192) + +print(f"\n Token embeddings (wte): {params['wte']:>10,} ({params['wte']/total*100:.1f}%)") +print(f" Position embeddings (wpe): {params['wpe']:>10,} ({params['wpe']/total*100:.1f}%)") +print(f" All transformer layers: {params['all_layers']:>10,} ({params['all_layers']/total*100:.1f}%)") +print(f" Final LayerNorm: {params['ln_f']:>10,} ({params['ln_f']/total*100:.1f}%)") +print(f" Output head (lm_head): {params['lm_head']:>10,} ({params['lm_head']/total*100:.1f}%)") +print(f" {'─'*40}") +print(f" TOTAL: {total:>10,}") + +print(f"\n Per-layer breakdown ({layer_total:,} params per layer × 6 layers):") +for name, count in per_layer.items(): + print(f" {name:<20} {count:>8,} ({count/layer_total*100:.1f}%)") + +# ============================================================ +# What each parameter "does" +# ============================================================ +print(""" +=== What Each Component Stores === + +Token Embeddings (wte): + A lookup table: token ID → vector. Captures "meaning" of each token. 
+ Similar tokens (e.g., "cat", "dog") end up with similar vectors. + +Position Embeddings (wpe): + A lookup table: position → vector. Captures "where in the sequence". + Without this, the model can't tell position 1 from position 100. + +Attention Weights (c_attn, c_proj): + Control HOW tokens communicate. Q/K weights determine what each + token "looks for" and "advertises". V/output weights control what + information flows between tokens. + +MLP Weights (mlp_fc, mlp_proj): + Per-token processing. Each token's representation gets "refined" + through an expand→activate→compress pipeline. This is where a lot + of the model's "knowledge" is stored (facts, patterns, etc). + +=== The Scaling Insight === + +Notice: most parameters are in the transformer layers (not embeddings). +Doubling n_embd roughly QUADRUPLES the parameter count (because weight +matrices are n_embd × n_embd). This is why bigger models are so expensive. +""") diff --git a/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py new file mode 100644 index 0000000..c21462e --- /dev/null +++ b/gpt/local/course/ch09_minGPT_Model_Deep_Dive/model_walkthrough.py @@ -0,0 +1,263 @@ +""" +Chapter 09: minGPT Model Walkthrough + +This script builds a minGPT model step-by-step, explaining +every component and how it maps to the microGPT concepts +from Chapter 07. + +Source: ../../minGPT/mingpt/model.py + +Requires: pip install torch +""" + +try: + import math + import torch + import torch.nn as nn + from torch.nn import functional as F +except ImportError: + print("PyTorch not installed. 
torch.manual_seed(42)

# ============================================================
# COMPONENT 1: NewGELU Activation
# ============================================================
# microGPT uses ReLU(x) = max(0, x); GPT-2 swaps in the smoother GELU.

print("=" * 60)
print("Component 1: NewGELU Activation")
print("=" * 60)

class NewGELU(nn.Module):
    """GELU (Gaussian Error Linear Unit), tanh approximation.

    The activation GPT-2 uses in place of ReLU: nearly identical for
    positive inputs, but it lets small negative values leak through,
    giving smoother gradients during training.
    """
    def forward(self, x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

gelu = NewGELU()
test_vals = torch.tensor([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
print(f" Input: {test_vals.tolist()}")
print(f" GELU: [{', '.join(f'{v:.3f}' for v in gelu(test_vals).tolist())}]")
print(f" ReLU: [{', '.join(f'{v:.3f}' for v in F.relu(test_vals).tolist())}]")
print(" Notice: GELU is smoother — small negatives get small negative outputs")

# ============================================================
# COMPONENT 2: CausalSelfAttention
# ============================================================
print("\n" + "=" * 60)
print("Component 2: CausalSelfAttention")
print("=" * 60)
+ + microGPT equivalent: the attention section in gpt() function (lines 114-133) + + Key differences from microGPT: + - Processes ALL positions at once (batched), not one-by-one + - Q, K, V computed in ONE matrix multiply (c_attn), not three + - Uses a triangular mask instead of KV-cache for causal masking + - Includes dropout for regularization + """ + def __init__(self, n_embd, n_head, block_size): + super().__init__() + assert n_embd % n_head == 0 + # Combined Q, K, V projection (efficiency: one matmul instead of three) + self.c_attn = nn.Linear(n_embd, 3 * n_embd) + # Output projection + self.c_proj = nn.Linear(n_embd, n_embd) + # Causal mask: lower triangular matrix + self.register_buffer("bias", + torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)) + self.n_head = n_head + self.n_embd = n_embd + + def forward(self, x): + B, T, C = x.size() # batch, sequence length, embedding dim + + # Compute Q, K, V all at once, then split + q, k, v = self.c_attn(x).split(self.n_embd, dim=2) + + # Reshape for multi-head: (B, T, C) → (B, n_head, T, head_dim) + k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + + # Attention scores: (B, nh, T, hs) × (B, nh, hs, T) → (B, nh, T, T) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + + # Causal mask: set future positions to -inf + att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf')) + + # Softmax → attention weights + att = F.softmax(att, dim=-1) + + # Weighted sum of values + y = att @ v # (B, nh, T, T) × (B, nh, T, hs) → (B, nh, T, hs) + + # Concatenate heads: (B, nh, T, hs) → (B, T, C) + y = y.transpose(1, 2).contiguous().view(B, T, C) + + # Output projection + y = self.c_proj(y) + return y + +# Demo +n_embd, n_head, block_size = 16, 4, 8 +attn = CausalSelfAttention(n_embd, n_head, block_size) +x = torch.randn(2, 5, 
n_embd) # batch=2, seq_len=5 +y = attn(x) +print(f" Config: n_embd={n_embd}, n_head={n_head}") +print(f" Input: {list(x.shape)} (batch=2, tokens=5, dim={n_embd})") +print(f" Output: {list(y.shape)} (same shape — attention transforms, doesn't resize)") +print(f" Params: {sum(p.numel() for p in attn.parameters())}") + +print(f"\n Causal mask (first 5×5):") +mask = attn.bias[0, 0, :5, :5] +for i in range(5): + row = ['✓' if mask[i, j] == 1 else '✗' for j in range(5)] + print(f" pos {i}: {' '.join(row)} (can attend to {int(mask[i].sum())} positions)") + +# ============================================================ +# COMPONENT 3: Block (Transformer Block) +# ============================================================ +print("\n" + "=" * 60) +print("Component 3: Transformer Block") +print("=" * 60) + +class Block(nn.Module): + """ + One transformer block: Attention + MLP with residual connections. + + microGPT equivalent: one iteration of the `for li in range(n_layer)` loop + + Structure: + x → LayerNorm → Attention → (+x) → LayerNorm → MLP → (+x) → output + """ + def __init__(self, n_embd, n_head, block_size): + super().__init__() + self.ln_1 = nn.LayerNorm(n_embd) # Pre-attention norm + self.attn = CausalSelfAttention(n_embd, n_head, block_size) + self.ln_2 = nn.LayerNorm(n_embd) # Pre-MLP norm + self.mlp = nn.ModuleDict(dict( + c_fc = nn.Linear(n_embd, 4 * n_embd), # Expand + c_proj = nn.Linear(4 * n_embd, n_embd), # Compress + act = NewGELU(), + )) + + def forward(self, x): + # Attention with residual connection + x = x + self.attn(self.ln_1(x)) + # MLP with residual connection + m = self.mlp + x = x + m.c_proj(m.act(m.c_fc(self.ln_2(x)))) + return x + +block = Block(n_embd, n_head, block_size) +x = torch.randn(2, 5, n_embd) +y = block(x) +n_params = sum(p.numel() for p in block.parameters()) +print(f" Input: {list(x.shape)}") +print(f" Output: {list(y.shape)}") +print(f" Block params: {n_params}") +print(f" Components: LayerNorm + Attention + LayerNorm + 
MLP(expand→GELU→compress)") + +# ============================================================ +# COMPONENT 4: Full GPT Model +# ============================================================ +print("\n" + "=" * 60) +print("Component 4: Full GPT Model") +print("=" * 60) + +class MiniGPT(nn.Module): + """ + Simplified version of minGPT's GPT class for illustration. + Same architecture, just without the config system and pretrained loading. + """ + def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd): + super().__init__() + self.block_size = block_size + + self.transformer = nn.ModuleDict(dict( + wte = nn.Embedding(vocab_size, n_embd), # Token embeddings + wpe = nn.Embedding(block_size, n_embd), # Position embeddings + h = nn.ModuleList([Block(n_embd, n_head, block_size) + for _ in range(n_layer)]), # Transformer blocks + ln_f = nn.LayerNorm(n_embd), # Final norm + )) + self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # Output projection + + def forward(self, idx, targets=None): + B, T = idx.size() + pos = torch.arange(0, T, dtype=torch.long, device=idx.device) + + # Embeddings + tok_emb = self.transformer.wte(idx) # (B, T, n_embd) + pos_emb = self.transformer.wpe(pos) # (T, n_embd) → broadcasts to (B, T, n_embd) + x = tok_emb + pos_emb + + # Transformer blocks + for block in self.transformer.h: + x = block(x) + + # Final norm + output projection + x = self.transformer.ln_f(x) + logits = self.lm_head(x) # (B, T, vocab_size) + + # Compute loss if targets given + loss = None + if targets is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) + + return logits, loss + +# Create a tiny GPT +model = MiniGPT( + vocab_size=27, # 26 letters + BOS (like microGPT) + block_size=16, # max sequence length + n_layer=3, # 3 transformer blocks + n_head=3, # 3 attention heads + n_embd=48, # 48-dim embeddings +) + +n_params = sum(p.numel() for p in model.parameters()) +print(f" Model: vocab=27, block_size=16, layers=3, heads=3, 
embd=48") +print(f" Total parameters: {n_params:,}") + +# Test forward pass +idx = torch.randint(0, 27, (2, 10)) # batch of 2 sequences, 10 tokens each +targets = torch.randint(0, 27, (2, 10)) # target tokens + +logits, loss = model(idx, targets) +print(f"\n Forward pass:") +print(f" Input shape: {list(idx.shape)} (2 sequences × 10 tokens)") +print(f" Logits shape: {list(logits.shape)} (2 × 10 × 27 vocab)") +print(f" Loss: {loss.item():.4f}") +print(f" (Random init → loss ≈ -log(1/27) = {-math.log(1/27):.4f})") + +print(""" +=== How This Maps to microGPT === + +minGPT Component → microGPT Equivalent +───────────────────────────────────────────────── +nn.Embedding(wte) → state_dict['wte'][token_id] +nn.Embedding(wpe) → state_dict['wpe'][pos_id] +CausalSelfAttention → The attention loop in gpt() +Block → One iteration of for li in range(n_layer) +nn.LayerNorm → rmsnorm() +NewGELU → .relu() +nn.Linear(lm_head) → linear(x, state_dict['lm_head']) +F.cross_entropy → -probs[target_id].log() + +Same architecture. Same math. Different scale and speed. +""") diff --git a/gpt/local/course/ch10_Training_and_Inference/README.md b/gpt/local/course/ch10_Training_and_Inference/README.md new file mode 100644 index 0000000..d4f6719 --- /dev/null +++ b/gpt/local/course/ch10_Training_and_Inference/README.md @@ -0,0 +1,80 @@ +# Chapter 10: Training & Inference with minGPT + +## Overview + +This chapter covers the two remaining pieces of minGPT: +1. **Trainer** (`mingpt/trainer.py`) — The training loop +2. **Generation** (`GPT.generate()`) — How the model creates new text + +Plus two real demo projects: +- **Sorting demo** — Teaching GPT to sort numbers +- **CharGPT** — Training on Shakespeare to generate new plays + +--- + +## The Trainer (mingpt/trainer.py) + +minGPT's Trainer is ~110 lines of clean PyTorch boilerplate: + +```python +while True: + x, y = next(data_iter) # 1. Get batch of data + logits, loss = model(x, y) # 2. Forward pass + model.zero_grad() # 3. 
Reset gradients + loss.backward() # 4. Backward pass (autograd) + clip_grad_norm_(model.parameters()) # 5. Prevent gradient explosion + optimizer.step() # 6. Adam update +``` + +Compare to microGPT's training loop — identical structure! + +--- + +## The Optimizer Setup + +minGPT is careful about **weight decay** (regularization): +- **Decay** weights in Linear layers (prevents overfitting) +- **Don't decay** biases, LayerNorm weights, embeddings + +This is handled in `GPT.configure_optimizers()`. + +--- + +## Text Generation (Inference) + +`GPT.generate()` works like microGPT's inference section: + +```python +for each new token to generate: + 1. Forward pass → get logits for last position + 2. Apply temperature (divide logits by temperature) + 3. Optionally apply top-k filtering + 4. Softmax → probabilities + 5. Sample from distribution (or take argmax) + 6. Append new token to sequence +``` + +--- + +## Demo Projects + +### 1. Sorting (demo.ipynb) +Teaches GPT to sort lists of numbers: `[2, 0, 1] → [0, 1, 2]` +- Shows that GPT can learn algorithmic tasks +- Uses gpt-nano (~90K params), trains in ~1 minute + +### 2. CharGPT (projects/chargpt/) +Character-level language model on any text file (e.g., Shakespeare) +- Uses gpt-mini (~1M params) +- Generates new text that mimics the training data style + +--- + +## Run the Examples + +```bash +python trainer_explained.py # Annotated training loop +python generate_text.py # Text generation from scratch +``` + +**Requires:** `pip install torch` diff --git a/gpt/local/course/ch10_Training_and_Inference/generate_text.py b/gpt/local/course/ch10_Training_and_Inference/generate_text.py new file mode 100644 index 0000000..ba35aa1 --- /dev/null +++ b/gpt/local/course/ch10_Training_and_Inference/generate_text.py @@ -0,0 +1,206 @@ +""" +Chapter 10: Text Generation — How GPT Creates Text + +This script explains and demonstrates the text generation process, +mapping to both microGPT's and minGPT's generation code. 
+ +Requires: pip install torch +""" + +try: + import torch + import torch.nn as nn + from torch.nn import functional as F + import math +except ImportError: + print("PyTorch not installed. Run: pip install torch") + exit(1) + +torch.manual_seed(42) + +# ============================================================ +# TEXT GENERATION EXPLAINED +# ============================================================ +print("=" * 60) +print("How GPT Generates Text") +print("=" * 60) +print(""" +Generation is an iterative process: + + 1. Start with a prompt (or just a start token) + 2. Feed it through the model → get logits for ALL vocab tokens + 3. Take only the LAST position's logits (prediction for next token) + 4. Apply temperature and optional top-k filtering + 5. Convert to probabilities (softmax) + 6. Sample a token from the distribution + 7. Append it to the sequence + 8. Repeat from step 2 +""") + +# ============================================================ +# STEP-BY-STEP GENERATION DEMO (no model, just the mechanics) +# ============================================================ +print("=" * 60) +print("Step-by-Step: Temperature & Sampling") +print("=" * 60) + +vocab = ['a', 'b', 'c', 'd', 'e'] +logits = torch.tensor([2.0, 5.0, 1.0, 0.5, 0.1]) + +print(f"\nRaw logits: {logits.tolist()}") +print(f"Vocab: {vocab}") + +# Temperature effect +print("\n--- Temperature Effect ---") +for temp in [0.1, 0.5, 1.0, 1.5, 3.0]: + scaled = logits / temp + probs = F.softmax(scaled, dim=-1) + print(f" temp={temp:.1f}: [{', '.join(f'{p:.3f}' for p in probs.tolist())}]", end="") + if temp < 0.5: + print(" ← very peaked (almost deterministic)") + elif temp < 1.0: + print(" ← confident") + elif temp == 1.0: + print(" ← normal") + else: + print(" ← more random") + +# Top-k filtering +print("\n--- Top-k Filtering ---") +print(" Only consider the k most likely tokens, set rest to -inf") +for k in [1, 2, 3, 5]: + filtered = logits.clone() + if k < len(logits): + v, _ = torch.topk(logits, 
k) + filtered[filtered < v[-1]] = float('-inf') + probs = F.softmax(filtered, dim=-1) + top_tokens = [vocab[i] for i in range(len(vocab)) if probs[i] > 0] + print(f" top_k={k}: [{', '.join(f'{p:.3f}' for p in probs.tolist())}] " + f"candidates: {top_tokens}") + +# Sampling vs Greedy +print("\n--- Sampling vs Greedy ---") +probs = F.softmax(logits, dim=-1) +print(f" Probabilities: [{', '.join(f'{p:.3f}' for p in probs.tolist())}]") + +# Greedy: always pick the most likely +_, greedy_idx = torch.topk(probs, k=1) +print(f" Greedy (argmax): always picks '{vocab[greedy_idx.item()]}' (highest prob)") + +# Sampling: randomly pick, weighted by probability +print(f" Sampling (10 tries): ", end="") +samples = [] +for _ in range(10): + idx = torch.multinomial(probs, num_samples=1) + samples.append(vocab[idx.item()]) +print(' '.join(samples)) +print(f" ('{vocab[1]}' appears most often because it has highest probability)") + +# ============================================================ +# THE GENERATE FUNCTION (from minGPT) +# ============================================================ +print("\n" + "=" * 60) +print("The Generate Function — Code Comparison") +print("=" * 60) + +print(""" +microGPT (lines 189-200): minGPT (model.py lines 283-310): +───────────────────────── ────────────────────────────────── +token_id = BOS idx = starting_tokens +for pos_id in range(block_size): for _ in range(max_new_tokens): + logits = gpt(token_id, pos_id, ...) idx_cond = idx[:, -block_size:] + probs = softmax([l/temp for l in ..]) logits, _ = self(idx_cond) + token_id = random.choices(...) logits = logits[:, -1, :] / temp + if token_id == BOS: break probs = F.softmax(logits, dim=-1) + sample.append(uchars[token_id]) idx_next = torch.multinomial(...) + idx = torch.cat((idx, idx_next), 1) + +Same algorithm: + 1. Get logits from model + 2. Scale by temperature + 3. Softmax → probabilities + 4. Sample next token + 5. 
Append and repeat + +Key difference: minGPT processes in batches and handles +sequences longer than block_size by cropping. +""") + +# ============================================================ +# HOW CHARGPT GENERATES SHAKESPEARE +# ============================================================ +print("=" * 60) +print("Real Application: CharGPT on Shakespeare") +print("=" * 60) + +print(""" +The minGPT project 'chargpt' trains on Shakespeare's text: + + Source: ../../minGPT/projects/chargpt/chargpt.py + + 1. Load text file (e.g., all of Shakespeare) + 2. Build character vocabulary ({'a':0, 'b':1, ..., 'z':25, ' ':26, ...}) + 3. Create a CharDataset: + - Each training example is a random 128-character chunk + - Input: characters [0:127], Target: characters [1:128] + 4. Train gpt-mini (6 layers, 192 dim) for thousands of iterations + 5. Every 500 steps, generate a sample starting from "O God, O God!" + + After training, it produces text like: + "O God, O God! what shall I say to thee? + That I have lost my father, and my friend, + And all my mother's sons, and all my kin..." + + It learned: + - English spelling and grammar + - Shakespeare's vocabulary and style + - Poetic meter (roughly) + - Character names and dialogue patterns + + All from raw character sequences — no rules, no grammar engine, + just pattern matching at scale. 
+""") + +# ============================================================ +# THE COMPLETE GENERATION PIPELINE +# ============================================================ +print("=" * 60) +print("Summary: The Complete Generation Pipeline") +print("=" * 60) + +print(""" + ┌─────────────────────────────────────────────────┐ + │ GENERATION │ + │ │ + │ Prompt: "Hello" │ + │ ↓ │ + │ Tokenize: [H, e, l, l, o] → [7, 4, 11, 11, 14]│ + │ ↓ │ + │ ┌─── LOOP (repeat for each new token) ─────┐ │ + │ │ │ │ + │ │ Feed tokens through GPT model │ │ + │ │ ↓ │ │ + │ │ Get logits at last position │ │ + │ │ ↓ │ │ + │ │ Apply temperature (divide by T) │ │ + │ │ ↓ │ │ + │ │ Optional: top-k filtering │ │ + │ │ ↓ │ │ + │ │ Softmax → probabilities │ │ + │ │ ↓ │ │ + │ │ Sample next token │ │ + │ │ ↓ │ │ + │ │ Append to sequence │ │ + │ │ │ │ + │ └───────────────────────────────────────────┘ │ + │ ↓ │ + │ Decode tokens → text: "Hello world, I am..." │ + │ │ + └─────────────────────────────────────────────────┘ + + Controls: + temperature < 1.0 → more focused, repetitive + temperature > 1.0 → more creative, random + top_k = small → only consider top few options + do_sample = False → always pick best (greedy/deterministic) +""") diff --git a/gpt/local/course/ch10_Training_and_Inference/trainer_explained.py b/gpt/local/course/ch10_Training_and_Inference/trainer_explained.py new file mode 100644 index 0000000..1dbead4 --- /dev/null +++ b/gpt/local/course/ch10_Training_and_Inference/trainer_explained.py @@ -0,0 +1,272 @@ +""" +Chapter 10: minGPT Trainer — Annotated + +This builds a simplified version of minGPT's Trainer and trains +a tiny GPT on a sorting task (from demo.ipynb). 
torch.manual_seed(3407)

# ============================================================
# MODEL: Simplified GPT (from Chapter 09)
# ============================================================

class NewGELU(nn.Module):
    """Tanh-approximated GELU — the activation GPT-2 uses instead of ReLU."""
    def forward(self, x):
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask."""
    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        # One fused projection produces Q, K and V in a single matmul.
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        self.c_proj = nn.Linear(n_embd, n_embd)
        # Lower-triangular mask, shaped (1, 1, T, T) to broadcast over batch/heads.
        mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("bias", mask.view(1, 1, block_size, block_size))
        self.n_head = n_head
        self.n_embd = n_embd

    def forward(self, x):
        bsz, seqlen, dim = x.size()
        head_dim = dim // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(bsz, seqlen, self.n_head, head_dim).transpose(1, 2)
        k = k.view(bsz, seqlen, self.n_head, head_dim).transpose(1, 2)
        v = v.view(bsz, seqlen, self.n_head, head_dim).transpose(1, 2)
        # Scaled dot-product scores; future positions are masked to -inf
        # so the softmax assigns them zero weight.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        scores = scores.masked_fill(self.bias[:, :, :seqlen, :seqlen] == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        out = weights @ v
        # Re-merge the heads: (B, nh, T, hd) -> (B, T, C).
        out = out.transpose(1, 2).contiguous().view(bsz, seqlen, dim)
        return self.c_proj(out)

class Block(nn.Module):
    """One transformer block: pre-norm attention + residual, pre-norm MLP + residual."""
    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        self.ln_1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size)
        self.ln_2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            NewGELU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))

class GPT(nn.Module):
    """Minimal GPT: embeddings -> n_layer Blocks -> final LayerNorm -> linear head."""
    def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd):
        super().__init__()
        self.block_size = block_size
        self.wte = nn.Embedding(vocab_size, n_embd)
        self.wpe = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head, block_size) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # GPT-2-style init: N(0, 0.02) weights, zero biases.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx: (B, T) integer token ids, T <= block_size.
        _, seqlen = idx.size()
        positions = torch.arange(seqlen, dtype=torch.long, device=idx.device)
        x = self.wte(idx) + self.wpe(positions)
        for block in self.blocks:
            x = block(x)
        logits = self.head(self.ln_f(x))
        if targets is None:
            return logits, None
        # ignore_index=-1 lets the dataset mask out positions it does not score.
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Greedy decoding: repeatedly append the argmax next token,
        cropping the context to the last block_size tokens if needed."""
        for _ in range(max_new_tokens):
            context = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            logits, _ = self(context)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            _, nxt = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, nxt), dim=1)
        return idx
+ """ + def __init__(self, split, length=6, num_digits=3): + self.split = split + self.length = length + self.num_digits = num_digits + + def __len__(self): + return 10000 + + def get_vocab_size(self): + return self.num_digits + + def get_block_size(self): + return self.length * 2 - 1 + + def __getitem__(self, idx): + import pickle + while True: + inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long) + h = hash(pickle.dumps(inp.tolist())) + if (h % 4 == 0) == (self.split == 'test'): + break + sol = torch.sort(inp)[0] + cat = torch.cat((inp, sol), dim=0) + x = cat[:-1].clone() + y = cat[1:].clone() + y[:self.length - 1] = -1 # mask: don't predict at input positions + return x, y + +# ============================================================ +# TRAINING LOOP (annotated version of mingpt/trainer.py) +# ============================================================ +print("=" * 60) +print("Training GPT to Sort Numbers") +print("=" * 60) + +# Create dataset and model +train_dataset = SortDataset('train') +test_dataset = SortDataset('test') + +model = GPT( + vocab_size=train_dataset.get_vocab_size(), # 3 digits: 0, 1, 2 + block_size=train_dataset.get_block_size(), # 11 positions + n_layer=3, n_head=3, n_embd=48 # gpt-nano +) + +n_params = sum(p.numel() for p in model.parameters()) +print(f"\nModel: gpt-nano, {n_params:,} parameters") +print(f"Task: sort lists of {train_dataset.length} digits ({train_dataset.num_digits} possible values)") +print(f"Example: [2, 0, 1] → [0, 1, 2]\n") + +# Setup (mirrors mingpt/trainer.py) +device = 'cuda' if torch.cuda.is_available() else 'cpu' +model = model.to(device) +optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1) + +# Training loop +train_loader = DataLoader(train_dataset, batch_size=64, num_workers=0, + sampler=torch.utils.data.RandomSampler(train_dataset, replacement=True, num_samples=int(1e10))) +data_iter = iter(train_loader) + +max_iters = 1000 +print(f"Training for {max_iters} 
model.train()
t0 = time.time()

for iter_num in range(max_iters):
    # ---- Step 1: Get batch ----
    batch = next(data_iter)
    x, y = [t.to(device) for t in batch]

    # ---- Step 2: Forward pass ----
    logits, loss = model(x, y)

    # ---- Step 3-4: Backward pass ----
    model.zero_grad(set_to_none=True)
    loss.backward()

    # ---- Step 5: Gradient clipping (prevent explosions) ----
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # ---- Step 6: Optimizer step ----
    optimizer.step()

    if iter_num % 100 == 0:
        elapsed = time.time() - t0
        print(f" iter {iter_num:4d} | loss {loss.item():.5f} | time {elapsed:.1f}s")

elapsed = time.time() - t0
print(f"\nTraining complete in {elapsed:.1f}s")

# ============================================================
# EVALUATION: Test if it actually learned to sort!
# ============================================================
print("\n" + "=" * 60)
print("Evaluation: Can it sort?")
print("=" * 60)

model.eval()
n = train_dataset.length

def eval_split(split, max_batches=50):
    """Measure exact-match sorting accuracy on one split.

    For each batch: feed the unsorted prefix into model.generate(),
    take the generated suffix as the predicted sorted list, and count
    sequences where every position matches torch.sort of the input.

    Args:
        split: 'train' or 'test' — selects which dataset to evaluate.
        max_batches: cap on batches (100 examples each) to keep it fast.

    Returns:
        (correct, total) counts over the evaluated examples.
    """
    dataset = train_dataset if split == 'train' else test_dataset
    loader = DataLoader(dataset, batch_size=100, num_workers=0)
    correct = 0
    total = 0
    for b, (x, _y) in enumerate(loader):
        x = x.to(device)
        inp = x[:, :n]
        with torch.no_grad():
            cat = model.generate(inp, n)
        sol_pred = cat[:, n:]
        sol_true = torch.sort(inp)[0]
        # BUG FIX: sol_true lives on `device`; comparing a CPU tensor with a
        # CUDA tensor raises a RuntimeError, so move BOTH sides to CPU.
        correct += (sol_pred.cpu() == sol_true.cpu()).all(1).sum().item()
        total += x.size(0)
        if b + 1 >= max_batches:
            break
    return correct, total

train_correct, train_total = eval_split('train')
test_correct, test_total = eval_split('test')

print(f"\n Train: {train_correct}/{train_total} = {100*train_correct/train_total:.1f}% correct")
print(f" Test: {test_correct}/{test_total} = {100*test_correct/test_total:.1f}% correct")

# Show some examples
print("\n Sample predictions:")
loader = DataLoader(test_dataset, batch_size=5, num_workers=0)
next(iter(loader)) +inp = x[:, :n].to(device) +with torch.no_grad(): + cat = model.generate(inp, n) +for i in range(5): + input_list = inp[i].tolist() + pred_list = cat[i, n:].tolist() + true_list = sorted(input_list) + status = "✓" if pred_list == true_list else "✗" + print(f" {input_list} → {pred_list} (expected {true_list}) {status}") + +print(""" +=== What This Demonstrates === + +GPT learned to SORT numbers — a task it was never explicitly programmed for! +It learned the algorithm purely from examples: + - See thousands of (unsorted → sorted) pairs + - Learn the pattern through gradient descent + - Generalize to new, unseen inputs + +This is the power of the transformer architecture: given enough data +and parameters, it can learn surprisingly complex functions. + +The same architecture, scaled up with more data and parameters, +learns to write code, answer questions, translate languages... +""") diff --git a/gpt/local/course/ch11_Side_by_Side_Comparison/README.md b/gpt/local/course/ch11_Side_by_Side_Comparison/README.md new file mode 100644 index 0000000..b801b2a --- /dev/null +++ b/gpt/local/course/ch11_Side_by_Side_Comparison/README.md @@ -0,0 +1,46 @@ +# Chapter 11: Side-by-Side — microGPT vs minGPT + +## The Same Algorithm, Two Implementations + +Both implementations by Andrej Karpathy implement the **exact same GPT architecture**. The difference is in tooling and scale, not in ideas. + +--- + +## Philosophy Comparison + +| Aspect | microGPT | minGPT | +|---|---|---| +| **Goal** | "This is the complete algorithm. Everything else is just efficiency." 
| "Small, clean, interpretable and educational" | +| **Dependencies** | Zero (pure Python) | PyTorch | +| **Lines of code** | ~200 (single file) | ~850 (across 4 files) | +| **Speed** | Minutes per 200 steps | Seconds per 2000 steps | +| **GPU support** | No | Yes | +| **Batching** | 1 document at a time | Configurable batch size | +| **Best for** | Understanding the math | Building real projects | + +--- + +## Architecture Mapping + +``` +microGPT minGPT +──────── ────── +Value class → torch.Tensor + autograd +state_dict['wte'][id] → nn.Embedding(vocab, n_embd) +state_dict['wpe'][id] → nn.Embedding(block_size, n_embd) +rmsnorm(x) → nn.LayerNorm(n_embd) +linear(x, w) [manual loop] → nn.Linear(in, out) [GPU matmul] +xi.relu() → NewGELU() +softmax(logits) → F.softmax(logits, dim=-1) +-probs[target].log() → F.cross_entropy(logits, targets) +Manual Adam loop → torch.optim.AdamW(...) +KV-cache (sequential) → Triangular mask (parallel) +``` + +--- + +## Run the Comparison + +```bash +python comparison.py # Detailed side-by-side code comparison +``` diff --git a/gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py b/gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py new file mode 100644 index 0000000..eb93d8e --- /dev/null +++ b/gpt/local/course/ch11_Side_by_Side_Comparison/comparison.py @@ -0,0 +1,290 @@ +""" +Chapter 11: microGPT vs minGPT — Detailed Comparison + +This script walks through each section of both implementations, +showing the equivalent code side-by-side with explanations. + +No dependencies required — this is a reading/reference script. 
+""" + +# ============================================================ +# Helper to format code snippets +# ============================================================ +def compare(title, micro_code, min_code, explanation): + print("=" * 70) + print(f" {title}") + print("=" * 70) + print(f"\n microGPT (pure Python):") + for line in micro_code.strip().split('\n'): + print(f" {line}") + print(f"\n minGPT (PyTorch):") + for line in min_code.strip().split('\n'): + print(f" {line}") + print(f"\n → {explanation}\n") + + +# ============================================================ +# COMPARISON 1: Tokenization +# ============================================================ +compare( + "1. TOKENIZATION", + """ +uchars = sorted(set(''.join(docs))) +BOS = len(uchars) +vocab_size = len(uchars) + 1 +tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] +""", + """ +# CharDataset (projects/chargpt/chargpt.py) +chars = sorted(list(set(data))) +self.stoi = {ch:i for i,ch in enumerate(chars)} +self.itos = {i:ch for i,ch in enumerate(chars)} +dix = [self.stoi[s] for s in chunk] +x = torch.tensor(dix[:-1], dtype=torch.long) +""", + "Same idea: map characters to integers. minGPT wraps it in a Dataset class\n" + " and returns PyTorch tensors instead of plain lists." +) + +# ============================================================ +# COMPARISON 2: Embeddings +# ============================================================ +compare( + "2. EMBEDDINGS", + """ +# Token + Position embedding (inline in gpt function) +tok_emb = state_dict['wte'][token_id] # list lookup +pos_emb = state_dict['wpe'][pos_id] # list lookup +x = [t + p for t, p in zip(tok_emb, pos_emb)] +""", + """ +# GPT.__init__: +self.transformer.wte = nn.Embedding(vocab_size, n_embd) +self.transformer.wpe = nn.Embedding(block_size, n_embd) + +# GPT.forward: +tok_emb = self.transformer.wte(idx) # (B, T, n_embd) +pos_emb = self.transformer.wpe(pos) # (1, T, n_embd) +x = tok_emb + pos_emb # broadcasts! 
+""", + "microGPT looks up one embedding at a time (one token, one position).\n" + " minGPT looks up ALL embeddings for ALL tokens in the batch at once." +) + +# ============================================================ +# COMPARISON 3: Attention +# ============================================================ +compare( + "3. MULTI-HEAD ATTENTION", + """ +# Process one token at a time, accumulate KV cache +q = linear(x, state_dict[f'layer{li}.attn_wq']) +k = linear(x, state_dict[f'layer{li}.attn_wk']) +v = linear(x, state_dict[f'layer{li}.attn_wv']) +keys[li].append(k) +values[li].append(v) + +for h in range(n_head): + hs = h * head_dim + q_h = q[hs:hs+head_dim] + k_h = [ki[hs:hs+head_dim] for ki in keys[li]] + v_h = [vi[hs:hs+head_dim] for vi in values[li]] + attn_logits = [sum(...) / head_dim**0.5 for t in ...] + attn_weights = softmax(attn_logits) + head_out = [sum(...) for j in range(head_dim)] + x_attn.extend(head_out) +""", + """ +# Process ALL tokens at once with masked attention +q, k, v = self.c_attn(x).split(self.n_embd, dim=2) +k = k.view(B, T, nh, hs).transpose(1, 2) # (B, nh, T, hs) +q = q.view(B, T, nh, hs).transpose(1, 2) +v = v.view(B, T, nh, hs).transpose(1, 2) + +att = (q @ k.transpose(-2, -1)) * (1.0 / sqrt(hs)) +att = att.masked_fill(self.bias[:,:,:T,:T] == 0, -inf) +att = F.softmax(att, dim=-1) +y = att @ v +y = y.transpose(1, 2).contiguous().view(B, T, C) +""", + "This is the BIGGEST difference between the two implementations:\n" + " - microGPT: sequential (one token at a time, Python loops over heads)\n" + " - minGPT: parallel (all tokens, all heads, all batches in one matmul)\n" + " Same math, but minGPT is orders of magnitude faster." +) + +# ============================================================ +# COMPARISON 4: MLP Block +# ============================================================ +compare( + "4. 
MLP (FEEDFORWARD) BLOCK", + """ +x = linear(x, state_dict[f'layer{li}.mlp_fc1']) # expand +x = [xi.relu() for xi in x] # activate +x = linear(x, state_dict[f'layer{li}.mlp_fc2']) # compress +""", + """ +# Block.__init__: +self.mlp = nn.ModuleDict(dict( + c_fc = nn.Linear(n_embd, 4 * n_embd), + c_proj = nn.Linear(4 * n_embd, n_embd), + act = NewGELU(), +)) + +# Block.forward: +self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x)))) +""", + "Identical structure: expand → activate → compress.\n" + " microGPT uses ReLU, minGPT uses GELU (smoother).\n" + " minGPT adds dropout for regularization." +) + +# ============================================================ +# COMPARISON 5: Normalization +# ============================================================ +compare( + "5. NORMALIZATION", + """ +def rmsnorm(x): + ms = sum(xi * xi for xi in x) / len(x) + scale = (ms + 1e-5) ** -0.5 + return [xi * scale for xi in x] +""", + """ +# Uses PyTorch's built-in LayerNorm +self.ln_1 = nn.LayerNorm(n_embd) +self.ln_2 = nn.LayerNorm(n_embd) + +# In forward: +x = x + self.attn(self.ln_1(x)) +""", + "RMSNorm (microGPT) only scales by magnitude.\n" + " LayerNorm (minGPT) also shifts by mean and has learnable parameters.\n" + " Both serve the same purpose: keep values in a stable range." +) + +# ============================================================ +# COMPARISON 6: Residual Connections +# ============================================================ +compare( + "6. RESIDUAL CONNECTIONS", + """ +x_residual = x +x = rmsnorm(x) +# ... attention ... +x = [a + b for a, b in zip(x, x_residual)] # ADD residual + +x_residual = x +x = rmsnorm(x) +# ... MLP ... 
+x = [a + b for a, b in zip(x, x_residual)] # ADD residual +""", + """ +def forward(self, x): + x = x + self.attn(self.ln_1(x)) # residual around attention + x = x + self.mlpf(self.ln_2(x)) # residual around MLP + return x +""", + "Identical concept: output = input + transformation(input).\n" + " minGPT is more concise because PyTorch handles the element-wise add." +) + +# ============================================================ +# COMPARISON 7: Training Loop +# ============================================================ +compare( + "7. TRAINING LOOP", + """ +for step in range(num_steps): + doc = docs[step % len(docs)] + tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] + # ... forward each token through gpt() ... + loss = (1/n) * sum(losses) + loss.backward() + # Manual Adam: + for i, p in enumerate(params): + m[i] = beta1 * m[i] + (1-beta1) * p.grad + v[i] = beta2 * v[i] + (1-beta2) * p.grad**2 + # ... bias correction, update ... + p.data -= lr_t * m_hat / (v_hat**0.5 + eps) + p.grad = 0 +""", + """ +while True: + batch = next(data_iter) + x, y = [t.to(self.device) for t in batch] + logits, self.loss = model(x, y) + model.zero_grad(set_to_none=True) + self.loss.backward() + clip_grad_norm_(model.parameters(), config.grad_norm_clip) + self.optimizer.step() +""", + "Same loop: forward → loss → backward → update.\n" + " microGPT: 1 document at a time, manual Adam, ~15 lines.\n" + " minGPT: batched, built-in optimizer, gradient clipping, ~10 lines." +) + +# ============================================================ +# COMPARISON 8: Inference +# ============================================================ +compare( + "8. 
TEXT GENERATION (INFERENCE)", + """ +token_id = BOS +for pos_id in range(block_size): + logits = gpt(token_id, pos_id, keys, values) + probs = softmax([l / temperature for l in logits]) + token_id = random.choices(range(vocab_size), + weights=[p.data for p in probs])[0] + if token_id == BOS: break + sample.append(uchars[token_id]) +""", + """ +for _ in range(max_new_tokens): + idx_cond = idx if idx.size(1) <= self.block_size \\ + else idx[:, -self.block_size:] + logits, _ = self(idx_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + v, _ = torch.topk(logits, top_k) + logits[logits < v[:, [-1]]] = -float('Inf') + probs = F.softmax(logits, dim=-1) + if do_sample: + idx_next = torch.multinomial(probs, num_samples=1) + else: + _, idx_next = torch.topk(probs, k=1, dim=-1) + idx = torch.cat((idx, idx_next), dim=1) +""", + "microGPT: sequential, one token at a time, KV-cache.\n" + " minGPT: can process batches, supports top-k filtering, greedy mode.\n" + " Both: predict one token, append, repeat." +) + +# ============================================================ +# SUMMARY TABLE +# ============================================================ +print("=" * 70) +print(" SUMMARY: When to Use Which") +print("=" * 70) +print(""" + ┌──────────────────────────────────────────────────────────────────┐ + │ USE microGPT WHEN YOU WANT TO: │ + │ • Understand exactly how every piece of GPT works │ + │ • See autograd, attention, training loop without any magic │ + │ • Learn the algorithm, not the framework │ + │ • Have a single-file reference with zero dependencies │ + │ │ + │ USE minGPT WHEN YOU WANT TO: │ + │ • Actually train on real data (Shakespeare, code, etc.) │ + │ • Experiment with different model sizes │ + │ • Use GPU acceleration │ + │ • Load pretrained GPT-2 weights │ + │ • Build projects on top of a clean GPT implementation │ + │ │ + │ THE KEY INSIGHT: │ + │ microGPT IS minGPT, just without the efficiency tricks. 
│ + │ If you understand microGPT, you understand minGPT. │ + │ If you understand minGPT, you understand GPT-2. │ + │ The architecture is the same at every scale. │ + └──────────────────────────────────────────────────────────────────┘ +""") diff --git a/gpt/local/course/ch12_Exercises_and_Next_Steps/README.md b/gpt/local/course/ch12_Exercises_and_Next_Steps/README.md new file mode 100644 index 0000000..9087d05 --- /dev/null +++ b/gpt/local/course/ch12_Exercises_and_Next_Steps/README.md @@ -0,0 +1,102 @@ +# Chapter 12: Exercises & Next Steps + +## Congratulations! + +You now understand the complete GPT architecture — from raw text to generated output. Here are exercises to solidify your understanding, organized by difficulty. + +--- + +## Beginner Exercises (Chapters 01-04) + +### Exercise 1: Bigram Language Model +Build a character-level bigram model (like Ch01) but train it using gradient descent and the Value class (Ch03) instead of counting. + +### Exercise 2: Custom Tokenizer +Modify the character tokenizer from Ch02 to handle uppercase letters, spaces, and punctuation. Test it on a paragraph of text. + +### Exercise 3: Autograd Extensions +Add `tanh()` and `sigmoid()` operations to the Value class. Verify their gradients by comparing with numerical differentiation. + +### Exercise 4: Softmax Temperature Explorer +Write a script that visualizes how different temperatures (0.1 to 5.0) affect the output distribution for a fixed set of logits. + +--- + +## Intermediate Exercises (Chapters 05-07) + +### Exercise 5: Attention Visualization +Modify the attention code from Ch05 to print attention weights for each head. Feed in a known pattern (e.g., "abab") and see which positions attend to which. 
+ +### Exercise 6: microGPT Hyperparameter Tuning +Modify microGPT's hyperparameters and measure the effect on loss: +- Try `n_head = 1, 2, 4, 8` +- Try `n_embd = 8, 16, 32` +- Try `n_layer = 1, 2, 3` + +### Exercise 7: Different Dataset +Modify microGPT to train on a different character-level dataset (e.g., country names, short words, or DNA sequences like "ATCGATCG"). + +--- + +## Advanced Exercises (Chapters 08-11) + +### Exercise 8: minGPT Character Generator +Use minGPT's `chargpt` project to train on a text file of your choice. Try song lyrics, code, or recipes. + +### Exercise 9: Add Dropout to microGPT +Implement dropout in pure Python and add it to microGPT's attention and MLP blocks. Does it help with overfitting? + +### Exercise 10: Implement Beam Search +Instead of sampling one token at a time, implement beam search for text generation: maintain the top-k most likely sequences at each step. + +### Exercise 11: Weight Sharing +In real GPT-2, the token embedding matrix (`wte`) and the output head (`lm_head`) share weights. Implement this in either microGPT or the simplified minGPT from Ch09. + +--- + +## Challenge Exercises + +### Exercise 12: Build nanoGPT +Combine the best of both implementations: write a GPT that uses PyTorch but is contained in a single file under 300 lines. (This is essentially what Karpathy did with nanoGPT!) + +### Exercise 13: Positional Encoding Variants +Replace learned positional embeddings with sinusoidal positional encodings (from the original "Attention is All You Need" paper). Compare results. 
+ +--- + +## Next Steps & Further Reading + +### Karpathy's Other Projects +- **nanoGPT**: The successor to minGPT — more practical, reproduces benchmarks + - https://github.com/karpathy/nanoGPT +- **makemore**: Character-level name generation (the dataset microGPT uses) + - https://github.com/karpathy/makemore +- **Karpathy's YouTube**: Full video lectures building these from scratch + - https://www.youtube.com/c/AndrejKarpathy + +### Papers to Read +1. **"Attention Is All You Need"** (2017) — The original transformer paper +2. **"Improving Language Understanding by Generative Pre-Training"** (GPT-1, 2018) +3. **"Language Models are Few-Shot Learners"** (GPT-3, 2020) + +### Topics to Explore Next +- **Tokenization deep-dive**: SentencePiece, tiktoken +- **Training at scale**: Distributed training, mixed precision +- **Fine-tuning**: LoRA, RLHF, instruction tuning +- **Inference optimization**: KV-cache, quantization, speculative decoding +- **Other architectures**: BERT (encoder), T5 (encoder-decoder) + +--- + +## Quick Reference Card + +``` +GPT in one paragraph: + Tokenize text into integers. Embed each token into a vector. + Add position embeddings. Pass through N transformer blocks, + each doing: normalize → multi-head attention → residual → + normalize → MLP → residual. Final linear layer maps to + vocab-sized logits. Softmax gives next-token probabilities. + Train with cross-entropy loss and Adam optimizer. + Generate by sampling one token at a time. +``` diff --git a/gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py b/gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py new file mode 100644 index 0000000..234c6f7 --- /dev/null +++ b/gpt/local/course/ch12_Exercises_and_Next_Steps/exercise_solutions.py @@ -0,0 +1,250 @@ +""" +Chapter 12: Exercise Solution Starters + +This file contains STARTER CODE for some of the exercises. +Complete the TODO sections to solve each exercise. 
+""" + +import math +import random + +random.seed(42) + +# ============================================================ +# Value class (from Chapter 03) — needed for exercises +# ============================================================ +class Value: + __slots__ = ('data', 'grad', '_children', '_local_grads') + def __init__(self, data, children=(), local_grads=()): + self.data = data + self.grad = 0 + self._children = children + self._local_grads = local_grads + def __add__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data + other.data, (self, other), (1, 1)) + def __mul__(self, other): + other = other if isinstance(other, Value) else Value(other) + return Value(self.data * other.data, (self, other), (other.data, self.data)) + def __pow__(self, n): return Value(self.data**n, (self,), (n * self.data**(n-1),)) + def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),)) + def log(self): return Value(math.log(self.data), (self,), (1/self.data,)) + def relu(self): return Value(max(0, self.data), (self,), (float(self.data > 0),)) + def __neg__(self): return self * -1 + def __radd__(self, other): return self + other + def __sub__(self, other): return self + (-other) + def __rmul__(self, other): return self * other + def __truediv__(self, other): return self * other**-1 + def backward(self): + topo, visited = [], set() + def build(v): + if v not in visited: + visited.add(v) + for c in v._children: build(c) + topo.append(v) + build(self) + self.grad = 1 + for v in reversed(topo): + for child, lg in zip(v._children, v._local_grads): + child.grad += lg * v.grad + + +# ============================================================ +# EXERCISE 3: Add tanh() and sigmoid() to Value class +# ============================================================ +print("=" * 50) +print("Exercise 3: Extending the Value Class") +print("=" * 50) + +def tanh(self): + """ + TODO: Implement tanh for the Value class. 
+ + tanh(x) = (exp(2x) - 1) / (exp(2x) + 1) + d(tanh(x))/dx = 1 - tanh(x)^2 + + Hint: You can compute the forward value using math.tanh() + and the local gradient using the derivative formula above. + """ + t = math.tanh(self.data) + return Value(t, (self,), (1 - t**2,)) + +# Attach to Value class +Value.tanh = tanh + +# Test it +x = Value(0.5) +y = x.tanh() +y.backward() +print(f" tanh({x.data}) = {y.data:.6f}") +print(f" d(tanh)/dx = {x.grad:.6f}") +print(f" Expected: tanh(0.5) = {math.tanh(0.5):.6f}") +print(f" Expected grad: 1 - tanh(0.5)^2 = {1 - math.tanh(0.5)**2:.6f}") + + +def sigmoid(self): + """ + TODO: Implement sigmoid for the Value class. + + sigmoid(x) = 1 / (1 + exp(-x)) + d(sigmoid(x))/dx = sigmoid(x) * (1 - sigmoid(x)) + """ + s = 1.0 / (1.0 + math.exp(-self.data)) + return Value(s, (self,), (s * (1 - s),)) + +Value.sigmoid = sigmoid + +x = Value(1.0) +y = x.sigmoid() +y.backward() +s = 1.0 / (1.0 + math.exp(-1.0)) +print(f"\n sigmoid({x.data}) = {y.data:.6f}") +print(f" d(sigmoid)/dx = {x.grad:.6f}") +print(f" Expected: sigmoid(1.0) = {s:.6f}") +print(f" Expected grad: {s * (1 - s):.6f}") + + +# ============================================================ +# EXERCISE 4: Temperature Explorer +# ============================================================ +print("\n" + "=" * 50) +print("Exercise 4: Temperature Explorer") +print("=" * 50) + +def softmax(logits): + max_val = max(logits) + exps = [math.exp(v - max_val) for v in logits] + total = sum(exps) + return [e / total for e in exps] + +logits = [3.0, 1.5, 0.5, 0.1, -0.5] +tokens = ['A', 'B', 'C', 'D', 'E'] + +print(f"\nLogits: {logits}") +print(f"Tokens: {tokens}\n") + +temperatures = [0.1, 0.25, 0.5, 1.0, 2.0, 5.0] + +print(f"{'Temp':>6} | ", end="") +for tok in tokens: + print(f" {tok:>5}", end="") +print(f" | {'Entropy':>8} | Description") +print("-" * 70) + +for temp in temperatures: + scaled = [l / temp for l in logits] + probs = softmax(scaled) + + # Compute entropy: -sum(p * log(p)) + 
entropy = -sum(p * math.log(p + 1e-10) for p in probs) + + desc = "" + if temp < 0.3: + desc = "Nearly deterministic" + elif temp < 0.8: + desc = "Confident" + elif temp <= 1.2: + desc = "Balanced" + elif temp < 3.0: + desc = "Creative / random" + else: + desc = "Nearly uniform" + + print(f"{temp:>6.2f} | ", end="") + for p in probs: + print(f" {p:>5.3f}", end="") + print(f" | {entropy:>8.4f} | {desc}") + +print(""" + Low temperature → low entropy → model picks the "best" token + High temperature → high entropy → model picks more randomly + + Temperature = 1.0 is the "natural" distribution + Temperature → 0 approaches argmax (greedy decoding) + Temperature → ∞ approaches uniform random +""") + + +# ============================================================ +# EXERCISE 1 STARTER: Bigram Model with Gradient Descent +# ============================================================ +print("=" * 50) +print("Exercise 1 Starter: Bigram Model with Gradient Descent") +print("=" * 50) + +# Dataset +names = ["emma", "olivia", "ava", "sophia", "mia", "luna", "ella", "aria"] +chars = sorted(set(''.join(names))) +BOS = len(chars) +vocab_size = len(chars) + 1 + +print(f"\nVocab: {chars + ['']}") +print(f"Vocab size: {vocab_size}") + +# The model: a weight matrix W where W[i][j] = score for token j following token i +W = [[Value(random.gauss(0, 0.5)) for _ in range(vocab_size)] + for _ in range(vocab_size)] +params = [p for row in W for p in row] + +def softmax_value(logits): + max_val = max(v.data for v in logits) + exps = [(v - max_val).exp() for v in logits] + total = sum(exps) + return [e / total for e in exps] + +# TODO: Complete the training loop +# Hint: For each name, create pairs (current_char, next_char) +# and minimize cross-entropy loss + +lr = 0.1 +for step in range(200): + total_loss = Value(0) + count = 0 + + for name in names: + tokens = [BOS] + [chars.index(ch) for ch in name] + [BOS] + for i in range(len(tokens) - 1): + current = tokens[i] + target = tokens[i + 
1] + logits = W[current] + probs = softmax_value(logits) + total_loss = total_loss + (-probs[target].log()) + count += 1 + + avg_loss = total_loss * (1.0 / count) + avg_loss.backward() + + for p in params: + p.data -= lr * p.grad + p.grad = 0 + + if step % 20 == 0: + print(f" Step {step:3d} | Loss: {avg_loss.data:.4f}") + +# Generate! +print("\n Generated names:") +for i in range(10): + name = [] + current = BOS + for _ in range(20): + logits = W[current] + probs = softmax_value(logits) + weights = [p.data for p in probs] + current = random.choices(range(vocab_size), weights=weights, k=1)[0] + if current == BOS: + break + name.append(chars[current]) + print(f" {i+1:2d}. {''.join(name)}") + +print(""" +=== Next Steps === + +1. Try all the exercises above +2. Read the original source code with your new understanding +3. Watch Karpathy's YouTube videos for visual walkthroughs +4. Build something with minGPT — train on YOUR data! +5. Graduate to nanoGPT for real-world experiments + +You now understand how GPT works, from the ground up. +The rest is just scale. Good luck! +""")