From 95069214f6b09063969546ca21decf3d3c07c0b2 Mon Sep 17 00:00:00 2001
From: Jose Carlos Rodriguez
Date: Mon, 9 Feb 2026 16:10:38 -0400
Subject: [PATCH 01/20] T6 M0: Technical plan + analysis notebook for
 multi-objective vector scores

---
 docs/T6_technical_plan.md               | 837 +++++++++++++++++++++
 examples/notebooks/t6_m0_analysis.ipynb | 950 ++++++++++++++++++++++++
 2 files changed, 1787 insertions(+)
 create mode 100644 docs/T6_technical_plan.md
 create mode 100644 examples/notebooks/t6_m0_analysis.ipynb

diff --git a/docs/T6_technical_plan.md b/docs/T6_technical_plan.md
new file mode 100644
index 00000000..e37c0c8c
--- /dev/null
+++ b/docs/T6_technical_plan.md
@@ -0,0 +1,837 @@
# T6 Technical Plan — Multi-Objective Vector Scores for Trainer Selection

**Version:** 1.0 (Refined)
**Author:** Carlos Rodriguez
**Date:** February 9, 2026
**Status:** M0 Deliverable — Analysis + Architecture + Interface Spec

**Target repos / branches:**
- **Primary (implementation + PR):** [`AgentOpt/OpenTrace@experimental`](https://github.com/AgentOpt/OpenTrace/tree/experimental)
- **Benchmark integration (M3):** [`AgentOpt/Trace-Bench`](https://github.com/AgentOpt/Trace-Bench)

---

## Table of Contents

1. [Executive Summary](#1-executive-summary)
2. [Goals, Non-Goals, Success Criteria](#2-goals-non-goals-success-criteria)
3. [Current Code Reality (Baseline)](#3-current-code-reality-baseline)
4. [Proposed Architecture (Minimal Delta)](#4-proposed-architecture-minimal-delta)
5. [Public API & Data Contracts](#5-public-api--data-contracts)
6. [Module Modifications (Files to Create / Modify)](#6-module-modifications)
7. [Edge Cases & Defensive Design](#7-edge-cases--defensive-design)
8. [Milestones & Validation Gates](#8-milestones--validation-gates)
9. [Test Plan](#9-test-plan)
10. [Risks & Mitigation](#10-risks--mitigation)
11. [Design Decisions (Resolved)](#11-design-decisions-resolved)
12. [Appendix: Code Touchpoints](#12-appendix-code-touchpoints)

---

## 1. Executive Summary

Today, trainer selection in Trace is driven by a **single scalar score**. Guides return `Tuple[float, str]` via `get_feedback()`, evaluators produce `np.array` of floats, and trainers (`BasicSearchAlgorithm`, `BeamsearchAlgorithm`) select candidates via scalar comparison (`max(candidates, key=lambda x: x[0])` and `sorted(..., key=lambda x: x[0])` respectively). This blocks trainer-side search from exploiting multiple metrics like `{accuracy, latency_ms, cost}`.

### What this plan adds

| Component | Change |
|-----------|--------|
| **Score contract** | `Dict[str, float]` returned by guides (optional), with backward-compatible scalar fallback |
| **ObjectiveConfig** | Frozen dataclass defining selection mode: `scalar` (default), `weighted`, or `pareto` |
| **objectives.py** (new) | All multi-objective logic isolated in pure, testable functions |
| **Evaluators** | Vector-score aggregation helpers (`evaluate_vector`, `aggregate_vector_scores`) |
| **BasicSearchAlgorithm** | Selection via `select_best(candidates, objective_config)` |
| **BeamsearchAlgorithm** | Selection via `select_top_k(candidates, objective_config, k)` |
| **PrioritySearch** (optional) | Scalarize heap priority via ObjectiveConfig; store dict for logging |
| **Benchmarks** (M3) | 3 simple benchmarks integrated into Trace-Bench |

### Guiding principles

- **Backward compatibility is non-negotiable.** `mode="scalar"` (the default) preserves identical behavior.
+- **Isolate complexity.** All multi-objective logic lives in `objectives.py` — pure functions, easy to test. +- **Minimal churn.** Trainers gain an optional `objective_config` parameter; existing call sites are untouched. +- **Determinism.** Fixed `seed` → deterministic selection, especially Pareto tie-breaks. + +--- + +## 2. Goals, Non-Goals, Success Criteria + +### 2.1 Goals + +| ID | Goal | Acceptance Signal | +|----|------|-------------------| +| G1 | **Backward compatibility** | Existing scalar-score guides/trainers produce identical results when `objective_config` is `None` or `mode="scalar"` | +| G2 | **Vector score support** | Guide returns `{"accuracy": 1.0, "latency_ms": 120.0}` and trainers select candidates using weighted or Pareto mode | +| G3 | **Determinism** | Fixed `seed` → identical selection across runs (tested in CI) | +| G4 | **Actionability** | Every milestone: Colab notebook + pytest coverage (M1+) | +| G5 | **Benchmarks** | 3 benchmarks defined, integrated into Trace-Bench, runnable from notebooks | + +### 2.2 Non-goals (explicit) + +- No multi-objective UCB (MO-UCB) — too risky for v1 scope. +- No Pareto archive / non-dominated set management inside PrioritySearch. +- No changes to optimizer internals or new telemetry infrastructure. +- No modification to `get_feedback()` return signature (we use a helper instead). + +### 2.3 Crisp success criteria + +All of the following must be true: + +1. Scalar-only trainers still work and produce same results by default. +2. Multi-objective guide dict works end-to-end for BasicSearch + Beamsearch. +3. Deterministic behavior with fixed seed (tests + notebook). +4. Each milestone delivers a runnable Colab notebook. +5. From M1 onward, new functions have pytest tests and CI is green. +6. M3: three benchmarks exist, run, and Trace-Bench integration works. + +--- + +## 3. Current Code Reality (Baseline) + +### 3.1 Guide — scalar score contract + +```python +# opto/trainer/guide.py + +class Guide: + def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]: + raise NotImplementedError + + def metric(self, query, response, reference=None, **kwargs) -> float: + return self.get_feedback(query, response, reference)[0] # extracts scalar +``` + +**Implication:** `metric()` always returns `float`. Multi-metric feedback is not usable for selection. + +### 3.2 Evaluators — scalar arrays + +```python +# opto/trainer/evaluators.py + +def evaluate(agent, guide, inputs, infos, ...) -> np.ndarray: + # Calls guide.metric() per example → float + # Returns np.array of shape (N,) or (N, num_samples) +``` + +**Implication:** All scores are numeric scalars aggregated via `np.mean()`. + +### 3.3 BasicSearchAlgorithm — scalar max selection + +```python +# opto/trainer/algorithms/basic_algorithms.py :: BasicSearchAlgorithm.optimizer_step() + +def validate(): + scores = evaluate(self.agent, self.validate_guide, ...) + return np.mean(scores) if all([s is not None for s in scores]) else -np.inf + +# Selection: +candidates.append((score, update_dict)) # score is float +best_score, best_update = max(candidates, key=lambda x: x[0]) # scalar max +``` + +**Insertion point:** Replace `max(candidates, ...)` with `select_best(candidates, objective_config)`. 
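To make the swap concrete ahead of the full spec in §5.5, here is a minimal sketch of the target call site. `ObjectiveConfig` and `select_best` are the planned M1 names (not existing code), and the candidate payloads are dummies:

```python
from opto.trainer.objectives import ObjectiveConfig, select_best  # planned module (M1)

# (score, payload) pairs, exactly as BasicSearch builds them today
candidates = [
    ({"accuracy": 0.9, "latency_ms": 80.0}, "proposal_A"),
    ({"accuracy": 0.8, "latency_ms": 20.0}, "proposal_B"),
]
config = ObjectiveConfig(mode="weighted",
                         weights={"accuracy": 0.7, "latency_ms": 0.3},
                         minimize={"latency_ms"})

best_idx = select_best(candidates, config)  # passing None falls back to the scalar max
best_score, best_update = candidates[best_idx]
```

Here `proposal_B` wins: once `latency_ms` is negated for minimization, its weighted sum (0.56 - 6.0 = -5.44) beats `proposal_A`'s (0.63 - 24.0 = -23.37).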
+ +### 3.4 BeamsearchAlgorithm — scalar sort selection + +```python +# opto/trainer/algorithms/beamsearch_algorithm.py :: BeamsearchAlgorithm.select() + +scored_candidates.append((validation_score, candidate_params)) # float +sorted_candidates = sorted(scored_candidates, key=lambda x: x[0], reverse=True) +selected_candidates = sorted_candidates[:beam_width] # take top-k by scalar +``` + +**Insertion point:** Replace scalar sort with `select_top_k(scored_candidates, objective_config, k=beam_width)`. + +### 3.5 Shared patterns across both trainers + +| Pattern | BasicSearch | Beamsearch | +|---------|-------------|------------| +| Validate | `np.mean(scores)` → float | `np.mean(validation_scores)` → float | +| Store | `(score, update_dict)` | `(validation_score, candidate_params)` | +| Select | `max(candidates, key=λ x: x[0])` | `sorted(candidates, key=λ x: x[0])[:k]` | +| Fallback | `-np.inf` | `-np.inf` | + +Both converge to the same abstraction: **given a list of `(score, params)` pairs, select the best or top-k.** This is exactly what `objectives.py` will provide. + +### 3.6 Existing infrastructure we leverage + +- **Logger abstraction:** `BaseLogger` with `log(name, value, step)` — can log each metric in a vector score. +- **StubLLM / DummyLLM:** Wraps deterministic callables — usable for CI and no-keys notebooks. +- **`batch_run` / `async_run`:** Parallelism utilities already in place. + +--- + +## 4. Proposed Architecture (Minimal Delta) + +### 4.1 Core idea + +Isolate all multi-objective logic into one new module (`opto/trainer/objectives.py`) containing **pure functions**: + +``` +normalize_score() → scalar ↔ dict conversion +apply_minimize() → flip signs for minimize metrics +weighted_scalarize()→ dict → float via weighted sum +pareto_rank() → dominance ranking + tie-break +select_best() → given candidates + config, return best index +select_top_k() → given candidates + config, return top-k indices +``` + +Trainers call these functions instead of inline `max()` / `sorted()`. When `objective_config` is `None`, the functions fall through to scalar comparison — **identical to current behavior**. + +### 4.2 Data flow (target) + +``` +Guide.get_feedback() + │ + ├── returns (float, str) ← existing path, unchanged + └── returns (Dict[str,float], str) ← new path (via get_score_dict helper) + │ + ▼ +Evaluator.evaluate_vector() + │ + ├── per-example: List[Dict[str, float]] + └── aggregated: Dict[str, float] (mean per metric) + │ + ▼ +Trainer selection (objectives.py) + │ + ├── mode="scalar" → max(mean_scores) ← unchanged + ├── mode="weighted" → max(weighted_scalarize()) ← new + └── mode="pareto" → pareto_rank() + tie-break ← new +``` + +### 4.3 Backward compatibility guarantee + +The entire vector-score path is **opt-in**: + +1. If `objective_config` is `None` → existing scalar path, no new code executed. +2. If guide returns `float` and `objective_config` is provided → `normalize_score()` wraps it as `{"score": float}`, weights default to `{"score": 1.0}`. +3. If guide returns `Dict[str, float]` and `objective_config` is `None` → `mean(values)` used as scalar fallback, preserving scalar selection. + +--- + +## 5. Public API & Data Contracts + +### 5.1 Score types + +```python +from typing import Union, Dict + +ScalarScore = float +VectorScore = Dict[str, float] # JSON-serializable, all values finite +ScoreLike = Union[int, float, bool, Dict[str, float]] +``` + +**Contract:** +- "Higher is better" by default for all metrics. 
+- Metrics to minimize are declared in `ObjectiveConfig.minimize` (semantics: negate internally). +- All dict values must be finite floats. `NaN` / `±inf` in a dict raises `ValueError`. +- `int` and `bool` scalar scores are accepted and converted to `float` (e.g., `LLMJudge` returns `int` 0/1, test guides return `bool`). + +### 5.2 ObjectiveConfig + +```python +from dataclasses import dataclass, field +from typing import Literal, Optional, Dict, Tuple + +@dataclass(frozen=True) +class ObjectiveConfig: + """Configuration for multi-objective candidate selection. + + Attributes: + mode: Selection strategy. + - "scalar": Use existing scalar comparison (default, backward-compatible). + - "weighted": Scalarize via weighted sum, then select max. + - "pareto": Pareto dominance ranking with configurable tie-break. + weights: Per-metric weights for weighted scalarization. + Missing metrics use missing_value. Metrics not present in the weights dict + are ignored (not included in the weighted sum). + If empty dict in weighted mode, all present metrics get equal weight 1.0. + minimize: Frozenset of metric names where lower is better (users can pass set; auto-converted). + These are negated internally before comparison ("higher-is-better" normalization). + missing_value: Score assigned to missing metrics in a candidate's score dict. + Default: float('-inf') (effectively disqualifies candidates missing required metrics). + pareto_metrics: Subset of metrics to use for Pareto dominance. + If None, all metrics present across candidates are used. + tie_break: Strategy for breaking ties among Pareto-equivalent candidates. + - "weighted": Fall back to weighted scalarization among tied candidates. + - "lexicographic": Sort by metrics in alphabetical order. + - "random_seeded": Seeded random shuffle. + seed: Random seed for deterministic tie-breaking. + """ + mode: Literal["scalar", "weighted", "pareto"] = "scalar" + weights: Dict[str, float] = field(default_factory=dict) + minimize: frozenset = field(default_factory=frozenset) + missing_value: float = float("-inf") + pareto_metrics: Optional[Tuple[str, ...]] = None + tie_break: Literal["weighted", "lexicographic", "random_seeded"] = "weighted" + seed: int = 0 + + def __post_init__(self): + # Convert set → frozenset for true immutability + hashability + if isinstance(self.minimize, set): + object.__setattr__(self, 'minimize', frozenset(self.minimize)) + # Validate weights are non-negative + for k, v in self.weights.items(): + if v < 0: + raise ValueError(f"Weight for '{k}' must be non-negative, got {v}") + # Validate pareto_metrics + if self.pareto_metrics is not None and len(self.pareto_metrics) == 0: + raise ValueError("pareto_metrics must be None (auto) or non-empty tuple") +``` + +**Validation rules (enforced in `__post_init__`):** +- `minimize` is stored as `frozenset` for true immutability (users can pass `set` for convenience; it's auto-converted). +- `mode="weighted"` with empty `weights` → auto-assign equal weight 1.0 to all encountered metrics. +- `mode="pareto"` with `pareto_metrics=None` → use union of all metric keys across candidates. +- `mode="pareto"` with `pareto_metrics=()` → `ValueError`. +- All weight values must be non-negative. +- `minimize` metric names must be valid strings (warning if not found in any candidate). + +### 5.3 Guide helper method + +```python +# Added to Guide base class (non-breaking) + +class Guide: + # ... existing methods unchanged ... 
+ + def get_score_dict(self, query: str, response: str, reference=None, **kwargs) -> Dict[str, float]: + """Return the evaluation score as a dictionary. + + Wraps get_feedback() for backward compatibility: + - If get_feedback returns (float, str): returns {"score": float} + - If get_feedback returns (dict, str): returns dict directly + + Subclasses returning multi-metric scores should override get_feedback() + to return (Dict[str, float], str) instead of (float, str). + """ + score, _ = self.get_feedback(query, response, reference, **kwargs) + if isinstance(score, dict): + return score + return {"score": float(score)} + + def metric(self, query: str, response: str, reference=None, **kwargs) -> float: + """Always returns float. For dict scores, returns mean of values as scalar fallback. + + This ensures evaluate() and the training loop (which call metric()) remain + completely safe. Dict scores only flow through get_score_dict() → evaluate_vector(). + """ + score, _ = self.get_feedback(query, response, reference, **kwargs) + if isinstance(score, dict): + return float(np.mean(list(score.values()))) + return float(score) +``` + +**Why this approach:** +- `get_score_dict()` is a new method — zero risk of breaking existing subclasses. +- `metric()` always returns `float` — the existing `evaluate()` function (which calls `guide.metric()` and passes results to `np.array()`) and the training loop (which calls `np.mean(scores)`) are completely unaffected. +- Dict scores are only accessible via `get_score_dict()` → `evaluate_vector()`, keeping the two data paths cleanly separated. + +### 5.4 Evaluator additions + +```python +# Added to opto/trainer/evaluators.py + +def evaluate_vector(agent, guide, inputs, infos, min_score=None, + num_samples=1, num_threads=None, description=None + ) -> list: + """Like evaluate(), but returns List[ScoreLike] (float or dict per example). + + Uses guide.get_score_dict() to obtain dict scores per example. + When guide returns scalar, get_score_dict() wraps it as {"score": float}. + + When num_samples > 1: for each example, collects num_samples score dicts, + computes per-key mean across the samples, and returns one aggregated dict + per example. Final output is always List[Dict[str, float]] of length N. + """ + ... + +def aggregate_vector_scores(scores: list) -> Union[float, Dict[str, float]]: + """Aggregate per-example scores into a single summary score. + + - If all scores are float: returns np.mean (existing behavior). + - If all scores are dict: returns per-metric mean dict. + - Mixed float/dict: normalizes all to dict via normalize_score(), then averages. + + Args: + scores: List of float or Dict[str, float] values. + + Returns: + float (if all scalar) or Dict[str, float] (if any dicts present). + """ + ... +``` + +### 5.5 objectives.py — complete function signatures + +```python +# opto/trainer/objectives.py (NEW FILE) + +from typing import Union, Dict, List, Set, Optional, Tuple, Literal +from dataclasses import dataclass, field + +# --- ObjectiveConfig defined here (see §5.2) --- + +# --- Score type aliases --- +ScalarScore = float +VectorScore = Dict[str, float] +ScoreLike = Union[float, Dict[str, float]] + +# --- Pure utility functions --- + +def normalize_score(score: ScoreLike) -> Dict[str, float]: + """Convert any score to dict form. + + - int/float/bool → {"score": float(value)} + - Dict[str, float] → returned as-is (validated: all values finite) + + Handles int (LLMJudge returns 0/1) and bool (test guides) via isinstance(score, (int, float, bool)). 
+ + Raises: + TypeError: if score is not int, float, bool, or dict + ValueError: if dict contains non-finite values or is empty + """ + ... + +def apply_minimize(score_dict: Dict[str, float], + minimize: Set[str]) -> Dict[str, float]: + """Negate values for minimize metrics (higher-is-better normalization). + + Returns a new dict with minimize metrics negated. + Metrics not in minimize set are unchanged. + """ + ... + +def weighted_scalarize(score_dict: Dict[str, float], + weights: Dict[str, float], + missing_value: float = float("-inf")) -> float: + """Compute weighted sum of score dict. + + For each metric in weights: + - If present in score_dict: weight * value + - If missing: weight * missing_value + + Metrics in score_dict but NOT in weights are ignored. + If weights is empty, all metrics get equal weight 1.0. + + Returns: + Weighted scalar score. + """ + ... + +def dominates(a: Dict[str, float], b: Dict[str, float], + metrics: Optional[Tuple[str, ...]] = None) -> bool: + """Check if candidate 'a' Pareto-dominates candidate 'b'. + + a dominates b iff: + - a[m] >= b[m] for all metrics m, AND + - a[m] > b[m] for at least one metric m + + Both dicts must already be in "higher-is-better" form (post apply_minimize). + Missing metrics are treated as missing_value (caller should handle before call). + + Args: + a, b: Score dicts (higher-is-better normalized). + metrics: Subset of metrics to compare. If None, use union of keys. + """ + ... + +def pareto_rank(candidates: List[Dict[str, float]], + metrics: Optional[Tuple[str, ...]] = None) -> List[int]: + """Assign Pareto rank to each candidate (0 = non-dominated front). + + Uses standard non-dominated sorting. + + Args: + candidates: List of score dicts (higher-is-better normalized). + metrics: Subset of metrics for dominance. If None, use all present. + + Returns: + List of integer ranks (same length as candidates). Rank 0 = Pareto front. + """ + ... + +def select_best(candidates: List[Tuple[ScoreLike, any]], + objective_config: Optional['ObjectiveConfig'] = None) -> int: + """Select the single best candidate index. + + Args: + candidates: List of (score, payload) tuples. + objective_config: Selection config. If None, uses scalar max (backward-compatible). + + Returns: + Index of best candidate. + + Behavior by mode: + - scalar/None: max(score) where score is float (or mean of dict values). + - weighted: max(weighted_scalarize(normalize(score), config.weights)). + - pareto: rank candidates, tie-break among rank-0 front, return winner. + + Call-site transformation (BasicSearch): + # Current: + best_score, best_update = max(candidates, key=lambda x: x[0]) + # Target: + best_idx = select_best(candidates, objective_config) + best_score, best_update = candidates[best_idx] + """ + ... + +def select_top_k(candidates: List[Tuple[ScoreLike, any]], + objective_config: Optional['ObjectiveConfig'] = None, + k: int = 1) -> List[int]: + """Select the top-k candidate indices. + + Same logic as select_best, but returns k indices. + + For pareto mode: returns rank-0 front (up to k). If front < k, + includes rank-1 candidates by tie-break order, etc. + + Deterministic ordering guaranteed with fixed seed. + """ + ... +``` + +--- + +## 6. 
Module Modifications + +### 6.1 Files to CREATE + +| File | Contents | Milestone | +|------|----------|-----------| +| `opto/trainer/objectives.py` | `ObjectiveConfig`, `normalize_score`, `apply_minimize`, `weighted_scalarize`, `dominates`, `pareto_rank`, `select_best`, `select_top_k` | M1 | +| `tests/test_objectives.py` | Unit tests for all functions in objectives.py | M1 | +| `tests/test_evaluators_vector.py` | Tests for evaluate_vector + aggregate_vector_scores | M1 | +| `tests/test_trainers_multiobjective.py` | Integration tests for BasicSearch + Beamsearch with ObjectiveConfig | M2 | +| `examples/notebooks/t6_m0_analysis.ipynb` | M0 analysis notebook | M0 | +| `examples/notebooks/t6_m1_vector_scores.ipynb` | M1 demo notebook | M1 | +| `examples/notebooks/t6_m2_trainers.ipynb` | M2 demo notebook | M2 | +| `examples/notebooks/t6_m3_benchmarks.ipynb` | M3 benchmark notebook | M3 | +| `docs/T6_technical_plan.md` | This document | M0 | +| `docs/multi_objective_scores.md` | User-facing documentation | M4 | + +### 6.2 Files to MODIFY + +| File | Change | Milestone | +|------|--------|-----------| +| `opto/trainer/guide.py` | Add `get_score_dict()` method to `Guide` base class. Update `metric()` to collapse dict scores to `float` via `mean(values)` (return type stays `float`). | M1 | +| `opto/trainer/evaluators.py` | Add `evaluate_vector()` and `aggregate_vector_scores()`. Existing `evaluate()` unchanged. | M1 | +| `opto/trainer/algorithms/basic_algorithms.py` | Add `objective_config` param to `BasicSearchAlgorithm.train()`. Replace `max(candidates, ...)` with `select_best()` in `optimizer_step()`. | M1 (minimal) / M2 (robust) | +| `opto/trainer/algorithms/beamsearch_algorithm.py` | Add `objective_config` param to `BeamsearchAlgorithm.train()`. Replace scalar sort in `select()` with `select_top_k()`. | M2 | +| `opto/features/priority_search/priority_search.py` | (Optional) Add `objective_config` param. Scalarize heap key via weighted mode. Store dict for logging. Pareto falls back to weighted. | M2 | + +### 6.3 Files NOT modified + +- `opto/trace/` — no changes to trace primitives. +- `opto/optimizers/` — optimizers are upstream of selection; they produce candidates, not rank them. +- Existing tests — no modifications; they validate backward compatibility by continuing to pass. + +--- + +## 7. 
Edge Cases & Defensive Design

### 7.1 Score validation

| Case | Behavior |
|------|----------|
| `score = 0.85` (float) | `normalize_score()` → `{"score": 0.85}` |
| `score = 1` (int) | `normalize_score()` → `{"score": 1.0}` (LLMJudge returns int 0/1) |
| `score = True` (bool) | `normalize_score()` → `{"score": 1.0}` (test guides return bool) |
| `score = {"accuracy": 0.9, "latency_ms": 120.0}` | Returned as-is after validation |
| `score = {}` (empty dict) | `ValueError("Score dict must not be empty")` |
| `score = {"accuracy": float('nan')}` | `ValueError("Score dict contains non-finite value")` |
| `score = {"accuracy": float('inf')}` | `ValueError("Score dict contains non-finite value")` |
| `score = "text"` (wrong type) | `TypeError("Score must be int, float, bool, or Dict[str, float]")` |

### 7.2 Missing metrics across candidates

| Case | Behavior |
|------|----------|
| Candidate A has `{accuracy, latency}`, B has `{accuracy}` | B gets `latency = missing_value` (default `-inf`) |
| `weights = {"accuracy": 0.7, "latency": 0.3}`, candidate missing `latency` | Weighted sum uses `0.3 * missing_value` |
| All candidates missing a weighted metric | Warning logged; metric still contributes `weight * missing_value` |

### 7.3 Mixed scalar/dict batches

| Case | Behavior |
|------|----------|
| All scores are `float` (or `int`/`bool`) | `aggregate_vector_scores()` returns `float` via `np.mean()` (existing behavior) |
| All scores are `dict` with same keys | `aggregate_vector_scores()` returns per-metric mean `Dict[str, float]` |
| Mixed `float` and `dict` in same batch | Scalars are normalized to dict via `normalize_score()`, then averaged per metric (see §5.4); a warning is logged |

A mixed batch most likely indicates a bug in the guide implementation (e.g., returning `float` on some inputs and `dict` on others), so the warning makes the normalization visible instead of aggregating silently.

### 7.4 Single-metric dict

| Case | Behavior |
|------|----------|
| Guide returns `{"accuracy": 0.9}` with `mode="weighted"` | Weighted sum = `weight * 0.9` (trivially correct) |
| Guide returns `{"accuracy": 0.9}` with `mode="pareto"` | Pareto degenerates to scalar max (single dimension — no tradeoffs). Warning logged. |

### 7.5 Tie-breaking

| Case | Behavior |
|------|----------|
| Two candidates with identical weighted score | Deterministic: lower original index wins (stable sort) |
| Pareto front with 3 equivalent candidates, `tie_break="weighted"` | Fall back to weighted scalarization among the 3; select max |
| Pareto front with 3 equivalent candidates, `tie_break="lexicographic"` | Sort by metric names alphabetically, compare values in order |
| Pareto front with 3 equivalent candidates, `tie_break="random_seeded"` | Seeded shuffle with `config.seed`; same seed → same order always |

### 7.6 ObjectiveConfig validation

| Case | Behavior |
|------|----------|
| `mode="weighted"`, `weights={}` | Auto-assign equal weight 1.0 to all metrics encountered at selection time |
| `mode="pareto"`, `pareto_metrics=()` (empty tuple) | `ValueError("pareto_metrics must be None (auto) or non-empty tuple")` |
| `weights={"accuracy": -0.5}` (negative weight) | `ValueError("Weight for 'accuracy' must be non-negative, got -0.5")` |
| `minimize={"unknown_metric"}` | Warning logged at selection time if metric never appears; no error (tolerant) |
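For concreteness, the sketch below exercises these validation rules exactly as the §5.2 `__post_init__` would enforce them. It targets the planned `opto/trainer/objectives.py` module (M1), so it is illustrative rather than runnable against today's codebase:

```python
# Illustrative sketch of ObjectiveConfig validation (planned M1 module; see section 5.2).
from opto.trainer.objectives import ObjectiveConfig

ObjectiveConfig(mode="weighted", weights={})   # OK: equal weights assigned at selection time

cfg = ObjectiveConfig(mode="pareto", minimize={"latency_ms"})
print(type(cfg.minimize).__name__)             # frozenset; a plain set is auto-converted

try:
    ObjectiveConfig(weights={"accuracy": -0.5})
except ValueError as e:
    print(e)  # Weight for 'accuracy' must be non-negative, got -0.5

try:
    ObjectiveConfig(mode="pareto", pareto_metrics=())
except ValueError as e:
    print(e)  # pareto_metrics must be None (auto) or non-empty tuple
```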
### 7.7 Training loop safety

The training loop has a **separate data path** from evaluation/selection. In `standard_optimization_step()` (basic_algorithms.py:46) and `standard_forward()` (sampler.py:130):

```python
score, feedback = guide(x, target.data, info)
```

This `score` flows into `MinibatchAlgorithm.update()`, where `np.mean(scores)` is computed (basic_algorithms.py:511). **This path must always receive `float`.**

| Constraint | Enforcement |
|-----------|-------------|
| `guide.__call__()` / `get_feedback()` return type is **NOT widened** | No changes to `get_feedback()` signature; it still returns `Tuple[float, str]` |
| Training loop always receives scalar `score` | `metric()` always returns `float` (collapses dict via `mean(values)` if needed) |
| Dict scores flow through a separate path | `get_score_dict()` → `evaluate_vector()` → `select_best()` / `select_top_k()` |
| A multi-objective guide must return `(float, str)` from `get_feedback()` for the training loop | The float is a collapsed scalar summary; the full dict is extracted via `get_score_dict()` during selection |

**Two data paths (by design):**
```
Training loop: guide() → score (float) → np.mean(scores) ← UNCHANGED
Selection path: get_score_dict() → evaluate_vector() → objectives.py ← NEW
```

---

## 8. Milestones & Validation Gates

### Milestone 0 — Analysis + technical plan + interface spec

**Deliverables:**
- `docs/T6_technical_plan.md` — this document, finalized
- `examples/notebooks/t6_m0_analysis.ipynb` — Colab-ready notebook

**Notebook demonstrates:**
- Current Guide score contract (`get_feedback` → `Tuple[float, str]`, `metric` → `float`)
- Where scalar selection happens in BasicSearch (`max(candidates, ...)`) and Beamsearch (`sorted(...)[:k]`)
- Planned behavior prototype: deterministic toy guide returning dict metrics, showing weighted vs Pareto selection on dummy candidates

**SMART validation:**
- Plan includes final API signatures and precise file list (create/modify) ✓
- Notebook runs without API keys ✓
- Notebook prints: current score contract, selection touchpoints, planned selection outputs ✓

---

### Milestone 1 — ObjectiveConfig + utilities + evaluator support + BasicSearch minimal

**Deliverables:**
- `opto/trainer/objectives.py` (new)
- `opto/trainer/guide.py` (add `get_score_dict`)
- `opto/trainer/evaluators.py` (add `evaluate_vector`, `aggregate_vector_scores`)
- `opto/trainer/algorithms/basic_algorithms.py` (BasicSearch: accept/use ObjectiveConfig)
- `tests/test_objectives.py`, `tests/test_evaluators_vector.py`
- `examples/notebooks/t6_m1_vector_scores.ipynb`

**Notebook demonstrates:**
- StubLLM mode: BasicSearchAlgorithm on small candidate set (5-10) with deterministic dummy guide returning dict metrics (see the sketch below)
- Shows: (a) scalar baseline, (b) weighted mode, (c) Pareto mode, (d) deterministic tie-break under fixed seed
- Real LLM mode (optional): tiny dataset (≤5 items) producing ≥2 metrics

**SMART validation:**
- `pytest -q` passes (all new functions covered)
- Notebook runs in Colab: weighted selection result changes when weights change
- Pareto returns tradeoffs and is deterministic under fixed seed
- Scalar path produces identical results to pre-change behavior

---
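For reference, the "deterministic dummy guide returning dict metrics" used throughout the M1 notebook could look like the sketch below. It assumes the §5.3 `Guide` additions have landed and that the base class needs no constructor arguments; the second metric is an arbitrary placeholder:

```python
from opto.trainer.guide import Guide  # get_score_dict()/metric() additions land in M1

class ToyVectorGuide(Guide):
    """Deterministic multi-metric guide that runs without API keys."""

    def get_feedback(self, query, response, reference=None, **kwargs):
        scores = {
            "accuracy": 1.0 if response == reference else 0.0,
            "response_len": float(len(response)),  # placeholder second metric
        }
        feedback = "Correct!" if response == reference else f"Expected '{reference}'"
        return scores, feedback

guide = ToyVectorGuide()
print(guide.get_score_dict("What is 2+2?", "4", "4"))  # {'accuracy': 1.0, 'response_len': 1.0}
print(guide.metric("What is 2+2?", "4", "4"))          # 1.0, dict collapsed via mean(values)
```

---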
### Milestone 2 — Trainer upgrades (Beamsearch + robust BasicSearch)

**Deliverables:**
- `opto/trainer/algorithms/beamsearch_algorithm.py` (accept ObjectiveConfig, vector selection)
- Expanded BasicSearch tests (edge cases, missing metrics, tie-break policies)
- Optional: minimal PrioritySearch support (weighted scalarization for heap, dict stored for logging)
- `tests/test_trainers_multiobjective.py`
- `examples/notebooks/t6_m2_trainers.ipynb`

**Notebook demonstrates:**
- BasicSearch + Beamsearch in: scalar mode (baseline), weighted mode, Pareto mode
- StubLLM + real LLM sections

**SMART validation:**
- `pytest -q` green
- Integration test confirms: weighted vs Pareto select different candidates where expected
- Scalar-only example produces same final best score when `objective_config=None`
- Deterministic tie-break is stable across runs

---

### Milestone 3 — Benchmarks (Trace-Bench integration)

**Deliverables:**
- PR to Trace-Bench: benchmark configs/tasks + notebook
- 3 benchmarks:
  1. **Accuracy vs latency** (toy QA dataset)
  2. **Accuracy vs response length** (penalize verbosity)
  3. **Accuracy vs tool calls** (penalize excessive tool usage)
- `examples/notebooks/t6_m3_benchmarks.ipynb`

**SMART validation:**
- Notebook outputs per-benchmark table: weighted-mode best candidate metrics + Pareto-mode set of tradeoffs
- Benchmarks run in StubLLM mode (fast/deterministic) and real LLM mode (small sample)
- Trace-Bench run completes without private datasets
- `pytest -q` green (smoke tests for benchmark integration)

---

### Milestone 4 — Documentation + polished notebooks

**Deliverables:**
- `docs/multi_objective_scores.md` — user-facing documentation
- README update with pointers to docs and notebooks
- Polished "How-to" notebook: installs from GitHub, runs BasicSearch weighted + Pareto, prints metric tradeoffs

**SMART validation:**
- Fresh Colab runtime runs how-to notebook without manual patching
- CI green, no behavioral changes beyond documentation/polish

---

## 9. 
Test Plan + +### 9.1 Unit tests — `tests/test_objectives.py` (M1) + +| Test | Validates | +|------|-----------| +| `test_normalize_score_from_float` | `0.85` → `{"score": 0.85}` | +| `test_normalize_score_from_dict` | `{"a": 1.0, "b": 2.0}` → same dict | +| `test_normalize_score_empty_dict_raises` | `{}` → `ValueError` | +| `test_normalize_score_nan_raises` | `{"a": float('nan')}` → `ValueError` | +| `test_normalize_score_wrong_type_raises` | `"text"` → `TypeError` | +| `test_apply_minimize` | `{"acc": 0.9, "lat": 100}` with `minimize={"lat"}` → `{"acc": 0.9, "lat": -100}` | +| `test_apply_minimize_empty_set` | No metrics negated | +| `test_weighted_scalarize_basic` | `{"a": 0.8, "b": 0.2}` with `weights={"a": 0.7, "b": 0.3}` → `0.7*0.8 + 0.3*0.2` | +| `test_weighted_scalarize_missing_metric` | Missing metric uses `missing_value` | +| `test_weighted_scalarize_empty_weights` | Equal weight 1.0 for all metrics | +| `test_dominates_true` | A dominates B (all ≥, at least one >) | +| `test_dominates_false_equal` | A == B → does not dominate | +| `test_dominates_false_tradeoff` | A better on one, B better on another | +| `test_pareto_rank_simple` | 3 candidates with clear rank 0, 1, 2 | +| `test_pareto_rank_all_nondominated` | All candidates rank 0 | +| `test_select_best_scalar_mode` | Falls back to scalar max | +| `test_select_best_weighted_mode` | Returns highest weighted score | +| `test_select_best_pareto_mode` | Returns Pareto-optimal by tie-break | +| `test_select_best_none_config` | `objective_config=None` → scalar max (backward compat) | +| `test_select_top_k_weighted` | Returns k highest weighted scores | +| `test_select_top_k_pareto` | Returns k from Pareto front + spillover | +| `test_deterministic_tie_break_seeded` | Same seed → same result across 100 runs | +| `test_deterministic_tie_break_different_seeds` | Different seeds → potentially different result | + +### 9.2 Unit tests — `tests/test_evaluators_vector.py` (M1) + +| Test | Validates | +|------|-----------| +| `test_aggregate_vector_scores_all_scalar` | `[0.8, 0.9, 0.7]` → `np.mean` (backward compat) | +| `test_aggregate_vector_scores_all_dict` | Per-metric mean computed correctly | +| `test_aggregate_vector_scores_mixed` | Scalars normalized to dict, then averaged | +| `test_evaluate_vector_returns_correct_types` | Returns list of ScoreLike matching guide output | + +### 9.3 Integration tests — `tests/test_trainers_multiobjective.py` (M2) + +| Test | Validates | +|------|-----------| +| `test_basicsearch_scalar_unchanged` | Default behavior identical to pre-change | +| `test_basicsearch_weighted_selects_expected` | Weighted mode picks correct candidate | +| `test_basicsearch_pareto_selects_expected` | Pareto mode picks different candidate than weighted | +| `test_beamsearch_scalar_unchanged` | Default behavior identical | +| `test_beamsearch_weighted_selects_top_k` | Weighted mode picks correct top-k | +| `test_beamsearch_pareto_selects_front` | Pareto mode returns non-dominated front | +| `test_deterministic_across_runs` | Fixed seed → same selections in 5 repeated runs | + +### 9.4 Notebook validation (human / Trace team) + +Each notebook contains: +- **StubLLM (no keys) section:** deterministic dummy guide, runs quickly +- **Real LLM section (optional):** small N (5-20 examples), prints cost/latency caveats, requires API key + +--- + +## 10. 
Risks & Mitigation

| Risk | Severity | Mitigation |
|------|----------|------------|
| **R1: Missing metrics across candidates** | Medium | `missing_value` in ObjectiveConfig (default `-inf`). Enforce metric presence for configured weights (or warn). |
| **R2: Pareto nondeterminism** | High | Deterministic ordering via stable sort + explicit tie-break rules. Seeded randomness only when requested. |
| **R3: Multi-thread eval ordering** | Medium | Tests run with `num_threads=1` to guarantee stability. Document thread-safety considerations. |
| **R4: Breaking Guide subclasses** | High | Use `get_score_dict()` helper — never change `get_feedback()` signature. `metric()` keeps returning `float` (dict scores are collapsed to their mean), so existing callers are unaffected. |
| **R5: Performance regression** | Low | `objectives.py` functions are O(n²) for Pareto ranking on n candidates, but n is typically ≤20 (num_proposals). No concern at this scale. |
| **R6: Mixed scalar/dict in same batch** | Medium | `aggregate_vector_scores()` normalizes scalars to dicts via `normalize_score()` and logs a warning, since a mixed batch usually indicates a bug in the guide (see §7.3). |
| **R7: Training loop receives dict score** | High | `guide.__call__()` / `get_feedback()` return type is NOT widened. `metric()` always returns `float`. Dict scores only flow through `get_score_dict()` → `evaluate_vector()`. See §7.7. |

---

## 11. Design Decisions (Resolved)

### D1: Where to implement scalar→dict normalization?

**Decision: Option A — `Guide.get_score_dict()` helper + `objectives.normalize_score()`**

- `get_score_dict()` on Guide provides a clean entry point for subclasses.
- `normalize_score()` in objectives.py is the canonical utility (pure function, testable).
- Avoids widening `get_feedback()` return type (higher churn, breaks typing).

### D2: Pareto selection definition

**Decision: Option A — Standard dominance on aggregated metrics, return single best by tie-break.**

- `select_best()` returns one winner. `select_top_k()` returns k winners.
- Trainers don't need to manage a "front" — they just get indices.
- Beamsearch naturally uses `select_top_k(k=beam_width)`.

### D3: PrioritySearch scope

**Decision: Minimal (in-scope).**

- Scalarize heap priority via `weighted_scalarize()`.
- Store full `score_dict` on each candidate for logging.
- `mode="pareto"` falls back to weighted with documented warning.
- Pareto archive is out-of-scope for v1.

---

## 12. 
Appendix: Code Touchpoints + +### OpenTrace / experimental + +| File | URL | +|------|-----| +| Guide base | [guide.py](https://github.com/AgentOpt/OpenTrace/blob/experimental/opto/trainer/guide.py) | +| Evaluators | [evaluators.py](https://github.com/AgentOpt/OpenTrace/blob/experimental/opto/trainer/evaluators.py) | +| BasicSearch | [basic_algorithms.py](https://github.com/AgentOpt/OpenTrace/blob/experimental/opto/trainer/algorithms/basic_algorithms.py) | +| Beamsearch | [beamsearch_algorithm.py](https://github.com/AgentOpt/OpenTrace/blob/experimental/opto/trainer/algorithms/beamsearch_algorithm.py) | +| PrioritySearch | [priority_search.py](https://github.com/AgentOpt/OpenTrace/blob/experimental/opto/features/priority_search/priority_search.py) | + +### Trace-Bench + +| File | URL | +|------|-----| +| Repo | [Trace-Bench](https://github.com/AgentOpt/Trace-Bench) | + +### Selection logic summary (current → target) + +| Trainer | Current Code | Target Code | +|---------|-------------|-------------| +| BasicSearch | `max(candidates, key=lambda x: x[0])` | `select_best(candidates, objective_config)` | +| Beamsearch | `sorted(candidates, key=lambda x: x[0], reverse=True)[:k]` | `select_top_k(candidates, objective_config, k)` | +| PrioritySearch | scalar heap key | `weighted_scalarize(score_dict, config)` for heap key | diff --git a/examples/notebooks/t6_m0_analysis.ipynb b/examples/notebooks/t6_m0_analysis.ipynb new file mode 100644 index 00000000..90eefcad --- /dev/null +++ b/examples/notebooks/t6_m0_analysis.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "275808ea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nT6 Milestone 0 — Analysis Notebook\\n\\nThis notebook is the M0 deliverable for the T6 Multi-Objective Vector Scores project.\\nIt demonstrates:\\n 1. Current baseline behavior (Guide score contract, evaluator aggregation, trainer selection)\\n 2. Exact code touchpoints and signatures in the OpenTrace codebase\\n 3. Planned behavior prototype: Pareto front vs weighted selection on deterministic toy candidates\\n\\nRuns end-to-end WITHOUT API keys.\\n'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "T6 Milestone 0 — Analysis Notebook\n", + "\n", + "This notebook is the M0 deliverable for the T6 Multi-Objective Vector Scores project.\n", + "It demonstrates:\n", + " 1. Current baseline behavior (Guide score contract, evaluator aggregation, trainer selection)\n", + " 2. Exact code touchpoints and signatures in the OpenTrace codebase\n", + " 3. Planned behavior prototype: Pareto front vs weighted selection on deterministic toy candidates\n", + "\n", + "Runs end-to-end WITHOUT API keys.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "b1a58d26", + "metadata": {}, + "source": [ + "# T6 Multi-Objective Vector Scores — M0 Analysis\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/)\n", + "\n", + "**Milestone 0 Deliverable** — Analysis + Technical Plan + Interface Spec\n", + "\n", + "This notebook demonstrates:\n", + "1. **Current baseline**: How Guide returns scalar scores, how evaluators aggregate, where selection happens\n", + "2. **Exact touchpoints**: The specific lines of code in BasicSearch and Beamsearch that perform scalar selection\n", + "3. 
**Planned behavior**: A deterministic prototype showing weighted vs Pareto selection on toy candidates\n", + "\n", + "**No API keys required.** All examples use deterministic dummy data.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "a252270b", + "metadata": {}, + "source": [ + "## How to Validate This Milestone\n", + "\n", + "After running all cells, confirm:\n", + "- [ ] Current Guide score contract is printed (`get_feedback → Tuple[float, str]`, `metric → float`)\n", + "- [ ] Scalar selection points in BasicSearch and Beamsearch are identified\n", + "- [ ] Weighted selection produces different results when weights change\n", + "- [ ] Pareto selection returns non-dominated candidates (tradeoff set)\n", + "- [ ] Deterministic tie-break produces identical results across repeated runs with same seed" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "067cd49e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "T6 M0 Analysis — Multi-Objective Vector Scores\n", + "======================================================================\n" + ] + } + ], + "source": [ + "# Setup — no external dependencies beyond numpy\n", + "import numpy as np\n", + "from typing import Dict, List, Tuple, Optional, Set, Union, Literal\n", + "from dataclasses import dataclass, field\n", + "import json\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"T6 M0 Analysis — Multi-Objective Vector Scores\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "id": "54b6022f", + "metadata": {}, + "source": [ + "---\n", + "## Part 1: Current Baseline Behavior\n", + "\n", + "### 1.1 Guide Score Contract\n", + "\n", + "The `Guide` base class defines the current score interface:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2ab12cbf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "PART 1: CURRENT BASELINE BEHAVIOR\n", + "======================================================================\n", + "\n", + "=== Current Guide Score Contract ===\n", + "\n", + "class Guide:\n", + " def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n", + " raise NotImplementedError\n", + "\n", + " def metric(self, query, response, reference=None, **kwargs) -> float:\n", + " return self.get_feedback(query, response, reference)[0] # extracts scalar\n", + "\n", + "Key observations:\n", + " • get_feedback() returns Tuple[float, str] — a SCALAR score + feedback string\n", + " • metric() returns float — just extracts the first element\n", + " • LLMJudge (subclass) returns binary 0/1 scores\n", + " • No mechanism to return Dict[str, float] for multiple metrics\n", + "\n", + "Example — get_feedback(): score=1.0 (type=float), feedback='Correct!'\n", + "Example — metric(): 1.0 (type=float)\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"PART 1: CURRENT BASELINE BEHAVIOR\")\n", + "print(\"=\" * 70)\n", + "\n", + "print(\"\"\"\n", + "=== Current Guide Score Contract ===\n", + "\n", + "class Guide:\n", + " def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n", + " raise NotImplementedError\n", + "\n", + " def metric(self, query, response, reference=None, **kwargs) -> float:\n", + " return self.get_feedback(query, response, 
reference)[0] # extracts scalar\n", + "\n", + "Key observations:\n", + " • get_feedback() returns Tuple[float, str] — a SCALAR score + feedback string\n", + " • metric() returns float — just extracts the first element\n", + " • LLMJudge (subclass) returns binary 0/1 scores\n", + " • No mechanism to return Dict[str, float] for multiple metrics\n", + "\"\"\")\n", + "\n", + "# Simulate current behavior\n", + "class CurrentGuide:\n", + " \"\"\"Simulates the current Guide behavior — scalar scores only.\"\"\"\n", + " def get_feedback(self, query, response, reference=None) -> Tuple[float, str]:\n", + " score = 1.0 if response == reference else 0.0\n", + " feedback = \"Correct!\" if score == 1.0 else f\"Expected '{reference}', got '{response}'\"\n", + " return score, feedback\n", + "\n", + " def metric(self, query, response, reference=None) -> float:\n", + " return self.get_feedback(query, response, reference)[0]\n", + "\n", + "guide = CurrentGuide()\n", + "score, feedback = guide.get_feedback(\"What is 2+2?\", \"4\", \"4\")\n", + "print(f\"Example — get_feedback(): score={score} (type={type(score).__name__}), feedback='{feedback}'\")\n", + "print(f\"Example — metric(): {guide.metric('What is 2+2?', '4', '4')} (type={type(guide.metric('What is 2+2?', '4', '4')).__name__})\")" + ] + }, + { + "cell_type": "markdown", + "id": "fcbb5663", + "metadata": {}, + "source": [ + "### 1.2 Evaluator Aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "55bf7801", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Current Evaluator Behavior ===\n", + "\n", + "def evaluate(agent, guide, inputs, infos, ...) -> np.ndarray:\n", + " # For each input: calls guide.metric(input, agent(input), info) → float\n", + " # Returns np.array of shape (N,) or (N, num_samples)\n", + " # Aggregated via np.mean(scores)\n", + "\n", + "Key observations:\n", + " • All scores are numeric scalars\n", + " • Aggregation: np.mean() over all examples\n", + " • No support for Dict[str, float] scores\n", + "\n", + "Example — evaluate() returns: [0.9 0.85 0.95 0.7 0.88] (shape=(5,), dtype=float64)\n", + "Example — np.mean(scores): 0.8560 (single scalar used for selection)\n" + ] + } + ], + "source": [ + "print(\"\"\"\n", + "=== Current Evaluator Behavior ===\n", + "\n", + "def evaluate(agent, guide, inputs, infos, ...) 
-> np.ndarray:\n", + " # For each input: calls guide.metric(input, agent(input), info) → float\n", + " # Returns np.array of shape (N,) or (N, num_samples)\n", + " # Aggregated via np.mean(scores)\n", + "\n", + "Key observations:\n", + " • All scores are numeric scalars\n", + " • Aggregation: np.mean() over all examples\n", + " • No support for Dict[str, float] scores\n", + "\"\"\")\n", + "\n", + "# Simulate current evaluator\n", + "scores_array = np.array([0.9, 0.85, 0.95, 0.7, 0.88])\n", + "mean_score = np.mean(scores_array)\n", + "print(f\"Example — evaluate() returns: {scores_array} (shape={scores_array.shape}, dtype={scores_array.dtype})\")\n", + "print(f\"Example — np.mean(scores): {mean_score:.4f} (single scalar used for selection)\")" + ] + }, + { + "cell_type": "markdown", + "id": "7ab684f0", + "metadata": {}, + "source": [ + "### 1.3 Selection Points in Trainers" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b8b0032f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== BasicSearchAlgorithm — Selection Logic ===\n", + "\n", + "File: opto/trainer/algorithms/basic_algorithms.py\n", + "Method: BasicSearchAlgorithm.optimizer_step()\n", + "\n", + " def validate():\n", + " scores = evaluate(self.agent, self.validate_guide, ...)\n", + " return np.mean(scores) if all([s is not None for s in scores]) else -np.inf\n", + " ^^^^^^^^^^^^^^^^\n", + " Returns: single float\n", + "\n", + " candidates.append((score, update_dict)) # score is float\n", + " candidates.append((self.current_score, backup_dict)) # include current\n", + "\n", + " best_score, best_update = max(candidates, key=lambda x: x[0])\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " SELECTION: scalar max — single metric only\n", + "\n", + ">>> This is the PRIMARY insertion point for multi-objective selection. <<<\n", + "\n", + "BasicSearch candidates: [(0.72, 'proposal_A'), (0.85, 'proposal_B'), (0.78, 'proposal_C'), (0.85, 'current_params')]\n", + "Selected (scalar max): score=0.85, params='proposal_B'\n", + "Note: Tie between proposal_B and current_params — max() picks first occurrence (proposal_B)\n" + ] + } + ], + "source": [ + "print(\"\"\"\n", + "=== BasicSearchAlgorithm — Selection Logic ===\n", + "\n", + "File: opto/trainer/algorithms/basic_algorithms.py\n", + "Method: BasicSearchAlgorithm.optimizer_step()\n", + "\n", + " def validate():\n", + " scores = evaluate(self.agent, self.validate_guide, ...)\n", + " return np.mean(scores) if all([s is not None for s in scores]) else -np.inf\n", + " ^^^^^^^^^^^^^^^^\n", + " Returns: single float\n", + "\n", + " candidates.append((score, update_dict)) # score is float\n", + " candidates.append((self.current_score, backup_dict)) # include current\n", + "\n", + " best_score, best_update = max(candidates, key=lambda x: x[0])\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " SELECTION: scalar max — single metric only\n", + "\n", + ">>> This is the PRIMARY insertion point for multi-objective selection. 
<<<\n", + "\"\"\")\n", + "\n", + "# Simulate current BasicSearch selection\n", + "candidates_basic = [\n", + " (0.72, \"proposal_A\"),\n", + " (0.85, \"proposal_B\"),\n", + " (0.78, \"proposal_C\"),\n", + " (0.85, \"current_params\"), # tie with proposal_B\n", + "]\n", + "best_score, best_update = max(candidates_basic, key=lambda x: x[0])\n", + "print(f\"BasicSearch candidates: {[(s, name) for s, name in candidates_basic]}\")\n", + "print(f\"Selected (scalar max): score={best_score}, params='{best_update}'\")\n", + "print(f\"Note: Tie between proposal_B and current_params — max() picks first occurrence (proposal_B)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8db5aa87", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== BeamsearchAlgorithm — Selection Logic ===\n", + "\n", + "File: opto/trainer/algorithms/beamsearch_algorithm.py\n", + "Method: BeamsearchAlgorithm.select()\n", + "\n", + " scored_candidates.append((validation_score, candidate_params)) # float\n", + "\n", + " sorted_candidates = sorted(scored_candidates, key=lambda x: x[0], reverse=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " SELECTION: scalar sort descending\n", + "\n", + " selected_candidates = sorted_candidates[:beam_width] # take top-k\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " Top-k by scalar score only\n", + "\n", + ">>> This is the SECONDARY insertion point for multi-objective selection. <<<\n", + "\n", + "Beamsearch candidates: [(0.72, 'candidate_1'), (0.91, 'candidate_2'), (0.85, 'candidate_3'), (0.91, 'candidate_4'), (0.78, 'candidate_5')]\n", + "Selected (top-3 by scalar): [(0.91, 'candidate_2'), (0.91, 'candidate_4'), (0.85, 'candidate_3')]\n", + "Note: Tie between candidate_2 and candidate_4 — sorted() preserves input order (stable sort)\n" + ] + } + ], + "source": [ + "print(\"\"\"\n", + "=== BeamsearchAlgorithm — Selection Logic ===\n", + "\n", + "File: opto/trainer/algorithms/beamsearch_algorithm.py\n", + "Method: BeamsearchAlgorithm.select()\n", + "\n", + " scored_candidates.append((validation_score, candidate_params)) # float\n", + "\n", + " sorted_candidates = sorted(scored_candidates, key=lambda x: x[0], reverse=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " SELECTION: scalar sort descending\n", + "\n", + " selected_candidates = sorted_candidates[:beam_width] # take top-k\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " Top-k by scalar score only\n", + "\n", + ">>> This is the SECONDARY insertion point for multi-objective selection. 
<<<\n", + "\"\"\")\n", + "\n", + "# Simulate current Beamsearch selection\n", + "candidates_beam = [\n", + " (0.72, \"candidate_1\"),\n", + " (0.91, \"candidate_2\"),\n", + " (0.85, \"candidate_3\"),\n", + " (0.91, \"candidate_4\"), # tie with candidate_2\n", + " (0.78, \"candidate_5\"),\n", + "]\n", + "beam_width = 3\n", + "sorted_candidates = sorted(candidates_beam, key=lambda x: x[0], reverse=True)\n", + "selected = sorted_candidates[:beam_width]\n", + "print(f\"Beamsearch candidates: {[(s, name) for s, name in candidates_beam]}\")\n", + "print(f\"Selected (top-{beam_width} by scalar): {[(s, name) for s, name in selected]}\")\n", + "print(f\"Note: Tie between candidate_2 and candidate_4 — sorted() preserves input order (stable sort)\")" + ] + }, + { + "cell_type": "markdown", + "id": "7119b4a4", + "metadata": {}, + "source": [ + "### 1.4 Summary: What's Missing\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbf9e98b", + "metadata": {}, + "outputs": [], + "source": "print(\"\"\"\n=== Summary: Current Limitations ===\n\n1. Guide.metric() → float only (and stays float BY DESIGN)\n metric() will NOT be widened to return dicts.\n Dict scores flow through the NEW get_score_dict() path instead.\n\n2. evaluate() → np.array of floats\n Cannot aggregate per-metric means across examples.\n New evaluate_vector() will handle dict aggregation separately.\n\n3. BasicSearch: max(candidates, key=scalar)\n Cannot do weighted multi-metric selection or Pareto ranking\n\n4. Beamsearch: sorted(candidates, key=scalar)[:k]\n Cannot select top-k by Pareto dominance\n\n5. No ObjectiveConfig\n No way to declare minimize metrics, weights, or selection mode\n\n>>> All of the above will be addressed in M1-M2 without breaking existing behavior. <<<\n>>> Training loop (guide.__call__ → float) is NEVER modified. 
<<<\n\"\"\")" + }, + { + "cell_type": "markdown", + "id": "8e97b2fd", + "metadata": {}, + "source": [ + "---\n", + "## Part 2: Planned Behavior — Prototype\n", + "\n", + "The following cells implement the **planned multi-objective selection** as pure functions.\n", + "This is a standalone prototype (no OpenTrace dependency) demonstrating the exact behavior\n", + "that `opto/trainer/objectives.py` will provide.\n", + "\n", + "### 2.1 ObjectiveConfig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bad5944d", + "metadata": {}, + "outputs": [], + "source": "print(\"\\n\" + \"=\" * 70)\nprint(\"PART 2: PLANNED BEHAVIOR — PROTOTYPE\")\nprint(\"=\" * 70)\n\n@dataclass(frozen=True)\nclass ObjectiveConfig:\n \"\"\"Configuration for multi-objective candidate selection.\"\"\"\n mode: str = \"scalar\" # \"scalar\", \"weighted\", \"pareto\"\n weights: Dict[str, float] = field(default_factory=dict)\n minimize: frozenset = field(default_factory=frozenset)\n missing_value: float = float(\"-inf\")\n pareto_metrics: Optional[Tuple[str, ...]] = None\n tie_break: str = \"weighted\" # \"weighted\", \"lexicographic\", \"random_seeded\"\n seed: int = 0\n\n def __post_init__(self):\n # Convert set → frozenset for true immutability + hashability\n if isinstance(self.minimize, set):\n object.__setattr__(self, 'minimize', frozenset(self.minimize))\n # Validate weights are non-negative\n for k, v in self.weights.items():\n if v < 0:\n raise ValueError(f\"Weight for '{k}' must be non-negative, got {v}\")\n # Validate pareto_metrics\n if self.pareto_metrics is not None and len(self.pareto_metrics) == 0:\n raise ValueError(\"pareto_metrics must be None (auto) or non-empty tuple\")\n\nprint(\"ObjectiveConfig defined with modes: scalar | weighted | pareto\")\nprint(f\"Default config: {ObjectiveConfig()}\")\n\n# Verify set → frozenset auto-conversion\nconfig_with_set = ObjectiveConfig(minimize={\"latency_s\"})\nprint(f\"minimize=set auto-converts: type={type(config_with_set.minimize).__name__}, value={config_with_set.minimize}\")" + }, + { + "cell_type": "markdown", + "id": "478f806d", + "metadata": {}, + "source": [ + "### 2.2 Core Utility Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ed7c83c", + "metadata": {}, + "outputs": [], + "source": "# --- Score type aliases ---\nScoreLike = Union[int, float, bool, Dict[str, float]]\n\n\ndef normalize_score(score: ScoreLike) -> Dict[str, float]:\n \"\"\"Convert any score to dict form.\n \n - int/float/bool → {\"score\": float(value)}\n - Dict[str, float] → returned as-is (validated)\n \n Handles int (LLMJudge returns 0/1) and bool (test guides) explicitly.\n \"\"\"\n if isinstance(score, (bool, int, float)):\n # bool check must come before int since bool is subclass of int\n val = float(score)\n if not np.isfinite(val):\n raise ValueError(f\"Score must be finite, got {score}\")\n return {\"score\": val}\n elif isinstance(score, dict):\n if len(score) == 0:\n raise ValueError(\"Score dict must not be empty\")\n for k, v in score.items():\n if not isinstance(v, (int, float)) or not np.isfinite(float(v)):\n raise ValueError(f\"Score dict value for '{k}' must be finite float, got {v}\")\n return {k: float(v) for k, v in score.items()}\n else:\n raise TypeError(f\"Score must be int, float, bool, or Dict[str, float], got {type(score).__name__}\")\n\n\ndef apply_minimize(score_dict: Dict[str, float], minimize: set) -> Dict[str, float]:\n \"\"\"Negate values for minimize metrics (higher-is-better normalization).\"\"\"\n return 
{\n        k: -v if k in minimize else v\n        for k, v in score_dict.items()\n    }\n\n\ndef weighted_scalarize(score_dict: Dict[str, float], weights: Dict[str, float],\n                       missing_value: float = float(\"-inf\")) -> float:\n    \"\"\"Compute weighted sum. Empty weights → equal weight 1.0.\"\"\"\n    if not weights:\n        weights = {k: 1.0 for k in score_dict}\n    total = 0.0\n    for metric, weight in weights.items():\n        value = score_dict.get(metric, missing_value)\n        total += weight * value\n    return total\n\n\ndef dominates(a: Dict[str, float], b: Dict[str, float],\n              metrics: Optional[Tuple[str, ...]] = None) -> bool:\n    \"\"\"Check if candidate 'a' Pareto-dominates candidate 'b'.\n\n    a dominates b iff:\n    - a[m] >= b[m] for ALL metrics m, AND\n    - a[m] > b[m] for AT LEAST ONE metric m\n    \"\"\"\n    if metrics is None:\n        metrics = tuple(sorted(set(a.keys()) | set(b.keys())))\n\n    at_least_one_better = False\n    for m in metrics:\n        va = a.get(m, float(\"-inf\"))\n        vb = b.get(m, float(\"-inf\"))\n        if va < vb:\n            return False  # a is worse on this metric\n        if va > vb:\n            at_least_one_better = True\n    return at_least_one_better\n\n\ndef pareto_rank(candidates: List[Dict[str, float]],\n                metrics: Optional[Tuple[str, ...]] = None) -> List[int]:\n    \"\"\"Assign Pareto rank (0 = non-dominated front).\"\"\"\n    n = len(candidates)\n    ranks = [0] * n\n    current_rank = 0\n\n    remaining = set(range(n))\n    while remaining:\n        # Find non-dominated set among remaining\n        front = []\n        for i in remaining:\n            dominated = False\n            for j in remaining:\n                if i != j and dominates(candidates[j], candidates[i], metrics):\n                    dominated = True\n                    break\n            if not dominated:\n                front.append(i)\n\n        for i in front:\n            ranks[i] = current_rank\n            remaining.remove(i)\n        current_rank += 1\n\n    return ranks\n\n\ndef select_best(candidates: List[Tuple[ScoreLike, any]],\n                config: Optional[ObjectiveConfig] = None) -> int:\n    \"\"\"Select the single best candidate index.\"\"\"\n    if config is None or config.mode == \"scalar\":\n        # Backward-compatible: scalar max\n        scores = []\n        for score, _ in candidates:\n            if isinstance(score, dict):\n                scores.append(np.mean(list(score.values())))\n            else:\n                scores.append(float(score))\n        return int(np.argmax(scores))\n\n    # Normalize all scores to dict\n    score_dicts = [normalize_score(s) for s, _ in candidates]\n\n    # Apply minimize\n    score_dicts = [apply_minimize(sd, config.minimize) for sd in score_dicts]\n\n    if config.mode == \"weighted\":\n        weighted_scores = [weighted_scalarize(sd, config.weights, config.missing_value) for sd in score_dicts]\n        return int(np.argmax(weighted_scores))\n\n    elif config.mode == \"pareto\":\n        ranks = pareto_rank(score_dicts, config.pareto_metrics)\n        # Get indices of rank-0 (Pareto front)\n        front_indices = [i for i, r in enumerate(ranks) if r == 0]\n\n        if len(front_indices) == 1:\n            return front_indices[0]\n\n        # Tie-break among front\n        if config.tie_break == \"weighted\":\n            front_scores = [weighted_scalarize(score_dicts[i], config.weights, config.missing_value)\n                            for i in front_indices]\n            return front_indices[int(np.argmax(front_scores))]\n        elif config.tie_break == \"lexicographic\":\n            metrics = sorted(score_dicts[front_indices[0]].keys())\n            def lex_key(idx):\n                return tuple(score_dicts[idx].get(m, config.missing_value) for m in metrics)\n            return max(front_indices, key=lex_key)\n        elif config.tie_break == \"random_seeded\":\n            rng = np.random.RandomState(config.seed)\n            return front_indices[rng.randint(len(front_indices))]\n\n    raise ValueError(f\"Unknown mode: {config.mode}\")\n\n\ndef select_top_k(candidates: 
List[Tuple[ScoreLike, any]],\n config: Optional[ObjectiveConfig] = None,\n k: int = 1) -> List[int]:\n \"\"\"Select the top-k candidate indices.\"\"\"\n if config is None or config.mode == \"scalar\":\n scores = []\n for score, _ in candidates:\n if isinstance(score, dict):\n scores.append(np.mean(list(score.values())))\n else:\n scores.append(float(score))\n return list(np.argsort(scores)[::-1][:k])\n\n score_dicts = [normalize_score(s) for s, _ in candidates]\n score_dicts = [apply_minimize(sd, config.minimize) for sd in score_dicts]\n\n if config.mode == \"weighted\":\n weighted_scores = [weighted_scalarize(sd, config.weights, config.missing_value) for sd in score_dicts]\n return list(np.argsort(weighted_scores)[::-1][:k])\n\n elif config.mode == \"pareto\":\n ranks = pareto_rank(score_dicts, config.pareto_metrics)\n # Collect by rank, then tie-break within each rank\n result = []\n max_rank = max(ranks)\n for rank in range(max_rank + 1):\n rank_indices = [i for i, r in enumerate(ranks) if r == rank]\n # Sort within rank by tie-break\n if config.tie_break == \"weighted\":\n rank_indices.sort(\n key=lambda i: weighted_scalarize(score_dicts[i], config.weights, config.missing_value),\n reverse=True\n )\n elif config.tie_break == \"lexicographic\":\n metrics = sorted(score_dicts[rank_indices[0]].keys()) if rank_indices else []\n rank_indices.sort(\n key=lambda i: tuple(score_dicts[i].get(m, config.missing_value) for m in metrics),\n reverse=True\n )\n elif config.tie_break == \"random_seeded\":\n rng = np.random.RandomState(config.seed + rank)\n rng.shuffle(rank_indices)\n result.extend(rank_indices)\n if len(result) >= k:\n break\n return result[:k]\n\n raise ValueError(f\"Unknown mode: {config.mode}\")\n\n\nprint(\"Core utility functions defined:\")\nprint(\" \\u2022 normalize_score() — handles float, int, bool, and dict\")\nprint(\" \\u2022 apply_minimize()\")\nprint(\" \\u2022 weighted_scalarize()\")\nprint(\" \\u2022 dominates()\")\nprint(\" \\u2022 pareto_rank()\")\nprint(\" \\u2022 select_best()\")\nprint(\" \\u2022 select_top_k()\")" + }, + { + "cell_type": "markdown", + "id": "6233d2c7", + "metadata": {}, + "source": [ + "### 2.3 Validation: normalize_score()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25003e79", + "metadata": {}, + "outputs": [], + "source": "print(\"\\n--- normalize_score() examples ---\")\nprint(f\" normalize_score(0.85) = {normalize_score(0.85)}\")\nprint(f\" normalize_score({{'acc': 0.9, 'lat': 50}}) = {normalize_score({'acc': 0.9, 'lat': 50})}\")\n\n# int and bool edge cases (LLMJudge returns int 0/1, test guides return bool)\nprint(f\"\\n --- int / bool edge cases ---\")\nprint(f\" normalize_score(1) = {normalize_score(1)} # LLMJudge returns int 0/1\")\nprint(f\" normalize_score(0) = {normalize_score(0)} # LLMJudge incorrect → int 0\")\nprint(f\" normalize_score(True) = {normalize_score(True)} # test guide correct → bool\")\nprint(f\" normalize_score(False) = {normalize_score(False)} # test guide incorrect → bool\")\n\n# Error edge cases\nprint(f\"\\n --- Error edge cases ---\")\ntry:\n normalize_score({})\nexcept ValueError as e:\n print(f\" normalize_score({{}}) → ValueError: {e}\")\n\ntry:\n normalize_score(\"bad\")\nexcept TypeError as e:\n print(f\" normalize_score('bad') → TypeError: {e}\")" + }, + { + "cell_type": "markdown", + "id": "a5c0fef1", + "metadata": {}, + "source": [ + "### 2.4 Validation: apply_minimize() + weighted_scalarize()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b9e31bec", + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- apply_minimize() examples ---\n", + " Original: {'accuracy': 0.9, 'latency_ms': 120.0, 'cost': 0.05}\n", + " Minimize: {'latency_ms', 'cost'}\n", + " Result: {'accuracy': 0.9, 'latency_ms': -120.0, 'cost': -0.05}\n", + " (latency and cost negated → higher-is-better)\n", + "\n", + "--- weighted_scalarize() examples ---\n", + " Score (normalized): {'accuracy': 0.9, 'latency_ms': -120.0, 'cost': -0.05}\n", + " Weights: {'accuracy': 0.6, 'latency_ms': 0.3, 'cost': 0.1}\n", + " Weighted sum: -35.4650\n", + " = 0.6*0.9 + 0.3*(-120.0) + 0.1*(-0.05) = -35.4650\n" + ] + } + ], + "source": [ + "print(\"\\n--- apply_minimize() examples ---\")\n", + "score = {\"accuracy\": 0.9, \"latency_ms\": 120.0, \"cost\": 0.05}\n", + "minimized = apply_minimize(score, minimize={\"latency_ms\", \"cost\"})\n", + "print(f\" Original: {score}\")\n", + "print(f\" Minimize: {{'latency_ms', 'cost'}}\")\n", + "print(f\" Result: {minimized}\")\n", + "print(f\" (latency and cost negated → higher-is-better)\")\n", + "\n", + "print(\"\\n--- weighted_scalarize() examples ---\")\n", + "weights = {\"accuracy\": 0.6, \"latency_ms\": 0.3, \"cost\": 0.1}\n", + "ws = weighted_scalarize(minimized, weights)\n", + "print(f\" Score (normalized): {minimized}\")\n", + "print(f\" Weights: {weights}\")\n", + "print(f\" Weighted sum: {ws:.4f}\")\n", + "print(f\" = 0.6*0.9 + 0.3*(-120.0) + 0.1*(-0.05) = {0.6*0.9 + 0.3*(-120.0) + 0.1*(-0.05):.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f1725c01", + "metadata": {}, + "source": [ + "### 2.5 Demonstration: Weighted vs Pareto Selection\n", + "\n", + "We create 6 candidates with realistic multi-metric scores to show how weighted and Pareto selection differ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d3023945", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "DEMONSTRATION: WEIGHTED vs PARETO SELECTION\n", + "======================================================================\n", + "\n", + "Candidate Scores:\n", + " Name Accuracy Latency (s)\n", + " --------------- ---------- ------------\n", + " candidate_A 0.95 0.200\n", + " candidate_B 0.70 0.030\n", + " candidate_C 0.88 0.080\n", + " candidate_D 0.92 0.150\n", + " candidate_E 0.60 0.020\n", + " candidate_F 0.85 0.085\n", + "\n", + "--- Mode: SCALAR (baseline) ---\n", + " Selection: mean of dict values → max\n", + " Winner: candidate_A (index 0)\n", + " Score: {'accuracy': 0.95, 'latency_s': 0.2}\n", + " Note: This is the CURRENT behavior — treats multi-metric as mean scalar.\n", + "\n", + "--- Mode: WEIGHTED (accuracy-heavy) ---\n", + " Weights: accuracy=0.8, latency_s=0.2 (minimized)\n", + " Winner: candidate_A (index 0)\n", + " Weighted score: 0.7200\n", + "\n", + "--- Mode: WEIGHTED (latency-heavy) ---\n", + " Weights: accuracy=0.2, latency_s=0.8 (minimized)\n", + " Winner: candidate_B (index 1)\n", + " Weighted score: 0.1160\n", + "\n", + " >>> Changing weights changes the winner!\n", + " >>> Accuracy-heavy → candidate_A, Latency-heavy → candidate_B\n", + "\n", + "--- Mode: PARETO ---\n", + "\n", + " Pareto Ranking (after minimize normalization):\n", + " Name Accuracy Neg Latency Pareto Rank\n", + " --------------- ---------- ------------ ------------\n", + " candidate_A 0.95 -0.200 0\n", + " candidate_B 0.70 -0.030 0\n", + " candidate_C 0.88 -0.080 0\n", + " candidate_D 0.92 -0.150 0\n", + " candidate_E 0.60 -0.020 0\n", + " candidate_F 0.85 -0.085 1\n", + "\n", + " Pareto Front (Rank 0): ['candidate_A', 'candidate_B', 'candidate_C', 'candidate_D', 'candidate_E']\n", + " These candidates represent TRADEOFFS — none is dominated by another.\n", + "\n", + " After tie-break (weighted, weights={acc: 0.5, lat: 0.5}):\n", + " Winner: candidate_C (index 2)\n", + "\n", + "--- Mode: PARETO (top-k for Beamsearch, k=3) ---\n", + " Selected top-3:\n", + " #1: candidate_C (Pareto rank 0, scores: {'accuracy': 0.88, 'latency_s': 0.08})\n", + " #2: candidate_D (Pareto rank 0, scores: {'accuracy': 0.92, 'latency_s': 0.15})\n", + " #3: candidate_A (Pareto rank 0, scores: {'accuracy': 0.95, 'latency_s': 0.2})\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"DEMONSTRATION: WEIGHTED vs PARETO SELECTION\")\n", + "print(\"=\" * 70)\n", + "\n", + "# 6 candidates with accuracy (higher=better) and latency_s (lower=better)\n", + "# Using latency_s (seconds, 0-1 scale) so metrics are comparable and weight changes matter\n", + "candidates = [\n", + " ({\"accuracy\": 0.95, \"latency_s\": 0.200}, \"candidate_A\"), # high accuracy, high latency\n", + " ({\"accuracy\": 0.70, \"latency_s\": 0.030}, \"candidate_B\"), # low accuracy, low latency\n", + " ({\"accuracy\": 0.88, \"latency_s\": 0.080}, \"candidate_C\"), # balanced\n", + " ({\"accuracy\": 0.92, \"latency_s\": 0.150}, \"candidate_D\"), # good accuracy, moderate latency\n", + " ({\"accuracy\": 0.60, \"latency_s\": 0.020}, \"candidate_E\"), # lowest accuracy, fastest\n", + " ({\"accuracy\": 0.85, \"latency_s\": 0.085}, \"candidate_F\"), # similar to C\n", + "]\n", + "\n", + "print(\"\\nCandidate Scores:\")\n", + "print(f\" {'Name':<15} {'Accuracy':>10} {'Latency 
(s)':>12}\")\n", + "print(f\" {'-'*15} {'-'*10} {'-'*12}\")\n", + "for score, name in candidates:\n", + " print(f\" {name:<15} {score['accuracy']:>10.2f} {score['latency_s']:>12.3f}\")\n", + "\n", + "# --- Scalar mode (baseline) ---\n", + "print(\"\\n--- Mode: SCALAR (baseline) ---\")\n", + "config_scalar = ObjectiveConfig(mode=\"scalar\")\n", + "best_idx = select_best(candidates, config_scalar)\n", + "print(f\" Selection: mean of dict values → max\")\n", + "print(f\" Winner: {candidates[best_idx][1]} (index {best_idx})\")\n", + "print(f\" Score: {candidates[best_idx][0]}\")\n", + "print(f\" Note: This is the CURRENT behavior — treats multi-metric as mean scalar.\")\n", + "\n", + "# --- Weighted mode: accuracy-heavy ---\n", + "print(\"\\n--- Mode: WEIGHTED (accuracy-heavy) ---\")\n", + "config_weighted_acc = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n", + " minimize=frozenset({\"latency_s\"})\n", + ")\n", + "best_idx = select_best(candidates, config_weighted_acc)\n", + "print(f\" Weights: accuracy=0.8, latency_s=0.2 (minimized)\")\n", + "print(f\" Winner: {candidates[best_idx][1]} (index {best_idx})\")\n", + "score_dict = apply_minimize(candidates[best_idx][0], config_weighted_acc.minimize)\n", + "ws = weighted_scalarize(score_dict, config_weighted_acc.weights)\n", + "print(f\" Weighted score: {ws:.4f}\")\n", + "\n", + "# --- Weighted mode: latency-heavy ---\n", + "print(\"\\n--- Mode: WEIGHTED (latency-heavy) ---\")\n", + "config_weighted_lat = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.2, \"latency_s\": 0.8},\n", + " minimize=frozenset({\"latency_s\"})\n", + ")\n", + "best_idx_lat = select_best(candidates, config_weighted_lat)\n", + "print(f\" Weights: accuracy=0.2, latency_s=0.8 (minimized)\")\n", + "print(f\" Winner: {candidates[best_idx_lat][1]} (index {best_idx_lat})\")\n", + "score_dict_lat = apply_minimize(candidates[best_idx_lat][0], config_weighted_lat.minimize)\n", + "ws_lat = weighted_scalarize(score_dict_lat, config_weighted_lat.weights)\n", + "print(f\" Weighted score: {ws_lat:.4f}\")\n", + "\n", + "print(f\"\\n >>> Changing weights changes the winner!\")\n", + "print(f\" >>> Accuracy-heavy → {candidates[best_idx][1]}, Latency-heavy → {candidates[best_idx_lat][1]}\")\n", + "\n", + "# --- Pareto mode ---\n", + "print(\"\\n--- Mode: PARETO ---\")\n", + "config_pareto = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"latency_s\": 0.5}, # used for tie-breaking\n", + " minimize=frozenset({\"latency_s\"}),\n", + " tie_break=\"weighted\",\n", + " seed=42\n", + ")\n", + "\n", + "# Show full Pareto ranking\n", + "score_dicts_norm = [apply_minimize(normalize_score(s), config_pareto.minimize) for s, _ in candidates]\n", + "ranks = pareto_rank(score_dicts_norm)\n", + "\n", + "print(f\"\\n Pareto Ranking (after minimize normalization):\")\n", + "print(f\" {'Name':<15} {'Accuracy':>10} {'Neg Latency':>12} {'Pareto Rank':>12}\")\n", + "print(f\" {'-'*15} {'-'*10} {'-'*12} {'-'*12}\")\n", + "for i, ((score, name), rank) in enumerate(zip(candidates, ranks)):\n", + " nd = score_dicts_norm[i]\n", + " print(f\" {name:<15} {nd['accuracy']:>10.2f} {nd['latency_s']:>12.3f} {rank:>12}\")\n", + "\n", + "front_indices = [i for i, r in enumerate(ranks) if r == 0]\n", + "print(f\"\\n Pareto Front (Rank 0): {[candidates[i][1] for i in front_indices]}\")\n", + "print(f\" These candidates represent TRADEOFFS — none is dominated by another.\")\n", + "\n", + "best_idx_pareto = 
select_best(candidates, config_pareto)\n", + "print(f\"\\n After tie-break (weighted, weights={{acc: 0.5, lat: 0.5}}):\")\n", + "print(f\" Winner: {candidates[best_idx_pareto][1]} (index {best_idx_pareto})\")\n", + "\n", + "# --- Top-k selection (Beamsearch simulation) ---\n", + "print(\"\\n--- Mode: PARETO (top-k for Beamsearch, k=3) ---\")\n", + "top_k_indices = select_top_k(candidates, config_pareto, k=3)\n", + "print(f\" Selected top-3:\")\n", + "for rank_pos, idx in enumerate(top_k_indices):\n", + " r = ranks[idx]\n", + " print(f\" #{rank_pos+1}: {candidates[idx][1]} (Pareto rank {r}, scores: {candidates[idx][0]})\")" + ] + }, + { + "cell_type": "markdown", + "id": "c1bdf524", + "metadata": {}, + "source": [ + "### 2.6 Deterministic Tie-Break Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dc6ea71d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "DETERMINISTIC TIE-BREAK VALIDATION\n", + "======================================================================\n", + "\n", + "--- Repeated runs with seed=42 ---\n", + " 10 runs with seed=42: indices = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", + " All identical: True ✓\n", + "\n", + "--- Different seeds with random_seeded tie-break ---\n", + " seed= 0: winner = candidate_E (index 4)\n", + " seed= 1: winner = candidate_D (index 3)\n", + " seed= 2: winner = candidate_A (index 0)\n", + " seed=42: winner = candidate_D (index 3)\n", + " seed=99: winner = candidate_B (index 1)\n", + "\n", + "--- Determinism check for random_seeded (seed=42, 10 runs) ---\n", + " 10 runs: indices = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]\n", + " All identical: True ✓\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"DETERMINISTIC TIE-BREAK VALIDATION\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Run selection 10 times with same seed — must produce identical results\n", + "print(\"\\n--- Repeated runs with seed=42 ---\")\n", + "results = []\n", + "for run in range(10):\n", + " config = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", + " minimize=frozenset({\"latency_s\"}),\n", + " tie_break=\"weighted\",\n", + " seed=42\n", + " )\n", + " idx = select_best(candidates, config)\n", + " results.append(idx)\n", + "\n", + "all_same = len(set(results)) == 1\n", + "print(f\" 10 runs with seed=42: indices = {results}\")\n", + "print(f\" All identical: {all_same} ✓\" if all_same else f\" NOT identical: FAIL ✗\")\n", + "\n", + "# Different seed should potentially give different tie-break (if random_seeded)\n", + "print(\"\\n--- Different seeds with random_seeded tie-break ---\")\n", + "for seed in [0, 1, 2, 42, 99]:\n", + " config = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", + " minimize=frozenset({\"latency_s\"}),\n", + " tie_break=\"random_seeded\",\n", + " seed=seed\n", + " )\n", + " idx = select_best(candidates, config)\n", + " print(f\" seed={seed:>2}: winner = {candidates[idx][1]} (index {idx})\")\n", + "\n", + "# Verify same seed is deterministic for random_seeded too\n", + "print(\"\\n--- Determinism check for random_seeded (seed=42, 10 runs) ---\")\n", + "results_random = []\n", + "for _ in range(10):\n", + " config = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", + " minimize=frozenset({\"latency_s\"}),\n", + " 
tie_break=\"random_seeded\",\n", + " seed=42\n", + " )\n", + " idx = select_best(candidates, config)\n", + " results_random.append(idx)\n", + "all_same_random = len(set(results_random)) == 1\n", + "print(f\" 10 runs: indices = {results_random}\")\n", + "print(f\" All identical: {all_same_random} ✓\" if all_same_random else f\" NOT identical: FAIL ✗\")" + ] + }, + { + "cell_type": "markdown", + "id": "3cc966d2", + "metadata": {}, + "source": [ + "### 2.7 Edge Cases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4545dc3", + "metadata": {}, + "outputs": [], + "source": "print(\"\\n\" + \"=\" * 70)\nprint(\"EDGE CASES\")\nprint(\"=\" * 70)\n\n# Single-metric dict\nprint(\"\\n--- Single-metric dict with Pareto mode ---\")\nsingle_metric_candidates = [\n ({\"accuracy\": 0.9}, \"A\"),\n ({\"accuracy\": 0.8}, \"B\"),\n ({\"accuracy\": 0.95}, \"C\"),\n]\nconfig_single = ObjectiveConfig(mode=\"pareto\", tie_break=\"weighted\")\nbest = select_best(single_metric_candidates, config_single)\nprint(f\" Candidates: {[s for s, _ in single_metric_candidates]}\")\nprint(f\" Winner: {single_metric_candidates[best][1]} (index {best})\")\nprint(f\" Note: Pareto with 1 metric degenerates to scalar max — expected behavior.\")\n\n# Mixed float and dict\nprint(\"\\n--- Backward compat: float scores with ObjectiveConfig ---\")\nfloat_candidates = [\n (0.85, \"A\"),\n (0.92, \"B\"),\n (0.78, \"C\"),\n]\nconfig_float = ObjectiveConfig(mode=\"weighted\", weights={\"score\": 1.0})\nbest_float = select_best(float_candidates, config_float)\nprint(f\" Float candidates: {[s for s, _ in float_candidates]}\")\nprint(f\" Winner: {float_candidates[best_float][1]} (score={float_candidates[best_float][0]})\")\nprint(f\" Note: Floats normalized to {{'score': val}} — backward-compatible.\")\n\n# None config (pure backward compatibility)\nprint(\"\\n--- None config (current behavior) ---\")\nbest_none = select_best(float_candidates, None)\nprint(f\" config=None → scalar max → {float_candidates[best_none][1]} (score={float_candidates[best_none][0]})\")\nprint(f\" Identical to current max(candidates, key=lambda x: x[0])\")\n\n# Negative weight validation\nprint(\"\\n--- Negative weight validation ---\")\ntry:\n ObjectiveConfig(weights={\"accuracy\": 0.8, \"latency_s\": -0.2})\nexcept ValueError as e:\n print(f\" ObjectiveConfig(weights={{..., 'latency_s': -0.2}}) → ValueError: {e}\")\n print(f\" Note: Use minimize={{'latency_s'}} instead of negative weights.\")\n\n# Empty pareto_metrics validation\nprint(\"\\n--- Empty pareto_metrics validation ---\")\ntry:\n ObjectiveConfig(pareto_metrics=())\nexcept ValueError as e:\n print(f\" ObjectiveConfig(pareto_metrics=()) → ValueError: {e}\")\n print(f\" Note: Use None (auto-detect) or a non-empty tuple of metric names.\")" + }, + { + "cell_type": "markdown", + "id": "b510fdc9", + "metadata": {}, + "source": [ + "### 2.8 Visual Summary: Selection Behavior Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c9abcad1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "SELECTION BEHAVIOR COMPARISON\n", + "======================================================================\n", + "\n", + " Mode Winner Reasoning\n", + " ------------------------- --------------- --------------------------------------------------\n", + " scalar (baseline) candidate_A mean of dict values → max\n", + " weighted (acc=0.8) 
candidate_A weighted sum with {'accuracy': 0.8, 'latency_s': 0.2}\n", + " weighted (lat=0.8) candidate_B weighted sum with {'accuracy': 0.2, 'latency_s': 0.8}\n", + " pareto (tie=weighted) candidate_C rank-0 front, tie-break=weighted\n", + "\n", + " >>> Different modes select different candidates from the SAME pool.\n", + " >>> This is exactly the behavior objectives.py will provide to trainers.\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"SELECTION BEHAVIOR COMPARISON\")\n", + "print(\"=\" * 70)\n", + "\n", + "print(f\"\\n {'Mode':<25} {'Winner':<15} {'Reasoning'}\")\n", + "print(f\" {'-'*25} {'-'*15} {'-'*50}\")\n", + "\n", + "modes = [\n", + " (\"scalar (baseline)\", config_scalar),\n", + " (\"weighted (acc=0.8)\", config_weighted_acc),\n", + " (\"weighted (lat=0.8)\", config_weighted_lat),\n", + " (\"pareto (tie=weighted)\", config_pareto),\n", + "]\n", + "\n", + "for mode_name, config in modes:\n", + " idx = select_best(candidates, config)\n", + " name = candidates[idx][1]\n", + " if config.mode == \"scalar\":\n", + " reason = \"mean of dict values → max\"\n", + " elif config.mode == \"weighted\":\n", + " reason = f\"weighted sum with {dict(config.weights)}\"\n", + " elif config.mode == \"pareto\":\n", + " reason = f\"rank-0 front, tie-break={config.tie_break}\"\n", + " print(f\" {mode_name:<25} {name:<15} {reason}\")\n", + "\n", + "print(f\"\\n >>> Different modes select different candidates from the SAME pool.\")\n", + "print(f\" >>> This is exactly the behavior objectives.py will provide to trainers.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f1ed487", + "metadata": {}, + "source": "---\n## Part 3: Architecture Summary\n\n### Two separate data paths (by design)\n\nThe training loop and selection path are **intentionally separate**. `guide.__call__()` / `get_feedback()` return type is NOT widened — the training loop always receives `float`.\n\n```\nTRAINING LOOP (unchanged):\n guide(x, target.data, info) → (float, str)\n │\n └── score (float) → np.mean(scores) → optimizer backward\n Always float. Never dict. Training loop is completely safe.\n\nSELECTION PATH (new):\n guide.get_score_dict(query, response, reference) → Dict[str, float]\n │\n ▼\n evaluate_vector() → List[Dict[str, float]] (one dict per example)\n │\n ▼\n aggregate_vector_scores() → Dict[str, float] (mean per metric)\n │\n ▼\n objectives.py (select_best / select_top_k)\n │\n ├── mode=\"scalar\" → max(mean_scores) ← unchanged\n ├── mode=\"weighted\" → max(weighted_scalarize()) ← new\n └── mode=\"pareto\" → pareto_rank() + tie-break ← new\n```\n\n**Key safety invariant:** `metric()` always returns `float`. If a guide's `get_feedback()` returns a dict as the score, `metric()` collapses it via `mean(values)`. 
Dict scores are only accessible through `get_score_dict()`.\n\n### Files to create/modify\n\n| Action | File | Milestone |\n|--------|------|-----------|\n| CREATE | `opto/trainer/objectives.py` | M1 |\n| MODIFY | `opto/trainer/guide.py` — add `get_score_dict()`, update `metric()` to collapse dicts to float | M1 |\n| MODIFY | `opto/trainer/evaluators.py` — add `evaluate_vector()`, `aggregate_vector_scores()` | M1 |\n| MODIFY | `basic_algorithms.py` | M1-M2 |\n| MODIFY | `beamsearch_algorithm.py` | M2 |\n| OPTIONAL | `priority_search.py` | M2 |" + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3e97bc57", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "M0 ANALYSIS COMPLETE\n", + "======================================================================\n", + "\n", + "Deliverables verified:\n", + " ✓ Current Guide score contract documented (Tuple[float, str])\n", + " ✓ Scalar selection points identified (BasicSearch max, Beamsearch sorted[:k])\n", + " ✓ Weighted selection produces different results with different weights\n", + " ✓ Pareto selection returns non-dominated tradeoff set\n", + " ✓ Deterministic tie-break verified (same seed → same result, 10 runs)\n", + " ✓ Edge cases validated (empty dict, single metric, float compat, None config)\n", + " ✓ Architecture summary with file list and data flow\n", + "\n", + "See docs/T6_technical_plan.md for the complete refined technical plan.\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"M0 ANALYSIS COMPLETE\")\n", + "print(\"=\" * 70)\n", + "print(\"\"\"\n", + "Deliverables verified:\n", + " ✓ Current Guide score contract documented (Tuple[float, str])\n", + " ✓ Scalar selection points identified (BasicSearch max, Beamsearch sorted[:k])\n", + " ✓ Weighted selection produces different results with different weights\n", + " ✓ Pareto selection returns non-dominated tradeoff set\n", + " ✓ Deterministic tie-break verified (same seed → same result, 10 runs)\n", + " ✓ Edge cases validated (empty dict, single metric, float compat, None config)\n", + " ✓ Architecture summary with file list and data flow\n", + "\n", + "See docs/T6_technical_plan.md for the complete refined technical plan.\n", + "\"\"\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From 3b2a0b29c0b3ba9cea64675340626f74b4f3c4c7 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Tue, 10 Feb 2026 17:20:55 -0400 Subject: [PATCH 02/20] T6 M0: Apply Xavier's review fixes (paths, dates, motivation, real LLM required) --- examples/notebooks/t6_m0_analysis.ipynb | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/examples/notebooks/t6_m0_analysis.ipynb b/examples/notebooks/t6_m0_analysis.ipynb index 90eefcad..2549d76a 100644 --- a/examples/notebooks/t6_m0_analysis.ipynb +++ b/examples/notebooks/t6_m0_analysis.ipynb @@ -35,22 +35,7 @@ "cell_type": "markdown", "id": "b1a58d26", "metadata": {}, - "source": [ - "# T6 
Multi-Objective Vector Scores — M0 Analysis\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/)\n", - "\n", - "**Milestone 0 Deliverable** — Analysis + Technical Plan + Interface Spec\n", - "\n", - "This notebook demonstrates:\n", - "1. **Current baseline**: How Guide returns scalar scores, how evaluators aggregate, where selection happens\n", - "2. **Exact touchpoints**: The specific lines of code in BasicSearch and Beamsearch that perform scalar selection\n", - "3. **Planned behavior**: A deterministic prototype showing weighted vs Pareto selection on toy candidates\n", - "\n", - "**No API keys required.** All examples use deterministic dummy data.\n", - "\n", - "---" - ] + "source": "# T6 Multi-Objective Vector Scores — M0 Analysis\n\n[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/pull/61/head/examples/notebooks/t6_m0_analysis.ipynb)\n\n**Milestone 0 Deliverable** — Analysis + Technical Plan + Interface Spec\n\nThis notebook demonstrates:\n1. **Current baseline**: How Guide returns scalar scores, how evaluators aggregate, where selection happens\n2. **Exact touchpoints**: The specific lines of code in BasicSearch and Beamsearch that perform scalar selection\n3. **Planned behavior**: A deterministic prototype showing weighted vs Pareto selection on toy candidates\n\n**Motivation (why score-as-dict):** adding extra metrics into the *feedback dict/text* can help optimizers (OptoPrime/OPRO), but trainers typically only use the scalar score for ranking/UCB and ignore additional feedback structure. To enable Pareto/weighted multi-objective selection at the trainer level, we need vector score (score-as-dict) with backward-compatible scalar reduction.\n\n**No API keys required for M0.** All examples use deterministic dummy data. (From M1 onward, milestone notebooks must validate both StubLLM and real LLM modes.)\n\n---" }, { "cell_type": "markdown", @@ -405,7 +390,7 @@ "id": "fbf9e98b", "metadata": {}, "outputs": [], - "source": "print(\"\"\"\n=== Summary: Current Limitations ===\n\n1. Guide.metric() → float only (and stays float BY DESIGN)\n metric() will NOT be widened to return dicts.\n Dict scores flow through the NEW get_score_dict() path instead.\n\n2. evaluate() → np.array of floats\n Cannot aggregate per-metric means across examples.\n New evaluate_vector() will handle dict aggregation separately.\n\n3. BasicSearch: max(candidates, key=scalar)\n Cannot do weighted multi-metric selection or Pareto ranking\n\n4. Beamsearch: sorted(candidates, key=scalar)[:k]\n Cannot select top-k by Pareto dominance\n\n5. No ObjectiveConfig\n No way to declare minimize metrics, weights, or selection mode\n\n>>> All of the above will be addressed in M1-M2 without breaking existing behavior. <<<\n>>> Training loop (guide.__call__ → float) is NEVER modified. <<<\n\"\"\")" + "source": "print(\"\"\"\n=== Summary: Current Limitations ===\n\n0. Extra metrics in feedback are not usable by trainers today.\n Trainers typically rank/UCB using only the scalar score, and do not inspect feedback structure.\n\n1. Guide.metric() → float only (and stays float BY DESIGN)\n metric() will NOT be widened to return dicts.\n Dict scores flow through the NEW get_score_dict() path instead.\n\n2. evaluate() → np.array of floats\n Cannot aggregate per-metric means across examples.\n New evaluate_vector() will handle dict aggregation separately.\n\n3. 
BasicSearch: max(candidates, key=scalar)\n Cannot do weighted multi-metric selection or Pareto ranking\n\n4. Beamsearch: sorted(candidates, key=scalar)[:k]\n Cannot select top-k by Pareto dominance\n\n5. No ObjectiveConfig\n No way to declare minimize metrics, weights, or selection mode\n\n>>> All of the above will be addressed in M1-M2 without breaking existing behavior. <<<\n>>> Training loop (guide.__call__ → float) is NEVER modified. <<<\n\"\"\")" }, { "cell_type": "markdown", From 249bde6e187b6fc211a344fd328f6bcd35f92ff7 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Tue, 10 Feb 2026 17:22:59 -0400 Subject: [PATCH 03/20] T6 M0: Apply Xavier's review fixes to technical plan --- docs/T6_technical_plan.md | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/T6_technical_plan.md b/docs/T6_technical_plan.md index e37c0c8c..87f3e764 100644 --- a/docs/T6_technical_plan.md +++ b/docs/T6_technical_plan.md @@ -2,7 +2,7 @@ **Version:** 1.0 (Refined) **Author:** Carlos Rodriguez -**Date:** February 9, 2025 +**Date:** February 9, 2026 **Status:** M0 Deliverable — Analysis + Architecture + Interface Spec **Target repos / branches:** @@ -32,6 +32,9 @@ Today, trainer selection in Trace is driven by a **single scalar score**. Guides return `Tuple[float, str]` via `get_feedback()`, evaluators produce `np.array` of floats, and trainers (`BasicSearchAlgorithm`, `BeamsearchAlgorithm`) select candidates via scalar comparison (`max(candidates, key=lambda x: x[0])` and `sorted(..., key=lambda x: x[0])` respectively). This blocks trainer-side search from exploiting multiple metrics like `{accuracy, latency_ms, cost}`. +**Motivation note (from team discussion):** +Putting multiple metrics into the *feedback dict/text* is useful for optimizers (OptoPrime/OPRO), but trainers (BasicSearch/UCB/PrioritySearch/GEPA) typically only inspect the **scalar score** for ranking/UCB and ignore additional feedback structure. Therefore, enabling **vector score / score-as-dict** (with backward-compatible scalar reduction) is required for multi-objective trainer selection. + ### What this plan adds | Component | Change | @@ -516,7 +519,7 @@ def select_top_k(candidates: List[Tuple[ScoreLike, any]], | File | Change | Milestone | |------|--------|-----------| -| `opto/trainer/guide.py` | Add `get_score_dict()` method to `Guide` base class. Update `metric()` to collapse dict scores to `float` via `mean(values)` (return type stays `float`). | M1 | +| `opto/trainer/guide.py` | Add `get_score_dict()` method to `Guide` base class. Keep training loop scalar-safe (`metric()` returns `float`). Dict/vector scores are accessed via `get_score_dict()` for trainer-side selection. | M1 | | `opto/trainer/evaluators.py` | Add `evaluate_vector()` and `aggregate_vector_scores()`. Existing `evaluate()` unchanged. | M1 | | `opto/trainer/algorithms/basic_algorithms.py` | Add `objective_config` param to `BasicSearchAlgorithm.train()`. Replace `max(candidates, ...)` with `select_best()` in `optimizer_step()`. | M1 (minimal) / M2 (robust) | | `opto/trainer/algorithms/beamsearch_algorithm.py` | Add `objective_config` param to `BeamsearchAlgorithm.train()`. Replace scalar sort in `select()` with `select_top_k()`. 
| M2 | @@ -592,7 +595,7 @@ This `score` flows into `MinibatchAlgorithm.update()` where `np.mean(scores)` is | Constraint | Enforcement | |-----------|-------------| | `guide.__call__()` / `get_feedback()` return type is **NOT widened** | No changes to `get_feedback()` signature; it still returns `Tuple[float, str]` | -| Training loop always receives scalar `score` | `metric()` always returns `float` (collapses dict via `mean(values)` if needed) | +| Training loop always receives scalar `score` | `metric()` always returns `float`. Vector/dict scores are not used by the training loop and are accessed via `get_score_dict()` for trainer-side selection. | | Dict scores flow through a separate path | `get_score_dict()` → `evaluate_vector()` → `select_best()` / `select_top_k()` | | A multi-objective guide must return `(float, str)` from `get_feedback()` for the training loop | The float is a collapsed scalar summary; the full dict is extracted via `get_score_dict()` during selection | @@ -619,7 +622,7 @@ Selection path: get_score_dict() → evaluate_vector() → objectives.py ← **Deliverables:** - `docs/T6_technical_plan.md` — this document, finalized -- `notebooks/t6_m0_analysis.ipynb` — Colab-ready notebook +- `examples/notebooks/t6_m0_analysis.ipynb` — Colab-ready notebook **Notebook demonstrates:** - Current Guide score contract (`get_feedback` → `Tuple[float, str]`, `metric` → `float`) @@ -641,12 +644,12 @@ Selection path: get_score_dict() → evaluate_vector() → objectives.py ← - `opto/trainer/evaluators.py` (add `evaluate_vector`, `aggregate_vector_scores`) - `opto/trainer/algorithms/basic_algorithms.py` (BasicSearch: accept/use ObjectiveConfig) - `tests/test_objectives.py`, `tests/test_evaluators_vector.py` -- `notebooks/t6_m1_vector_scores.ipynb` +- `examples/notebooks/t6_m1_vector_scores.ipynb` **Notebook demonstrates:** - StubLLM mode: BasicSearchAlgorithm on small candidate set (5-10) with deterministic dummy guide returning dict metrics - Shows: (a) scalar baseline, (b) weighted mode, (c) Pareto mode, (d) deterministic tie-break under fixed seed -- Real LLM mode (optional): tiny dataset (≤5 items) producing ≥2 metrics +- Real LLM mode (required): tiny dataset (≤5 items) producing ≥2 metrics **SMART validation:** - `pytest -q` passes (all new functions covered) @@ -663,7 +666,7 @@ Selection path: get_score_dict() → evaluate_vector() → objectives.py ← - Expanded BasicSearch tests (edge cases, missing metrics, tie-break policies) - Optional: minimal PrioritySearch support (weighted scalarization for heap, dict stored for logging) - `tests/test_trainers_multiobjective.py` -- `notebooks/t6_m2_trainers.ipynb` +- `examples/notebooks/t6_m2_trainers.ipynb` **Notebook demonstrates:** - BasicSearch + Beamsearch in: scalar mode (baseline), weighted mode, Pareto mode @@ -681,11 +684,18 @@ Selection path: get_score_dict() → evaluate_vector() → objectives.py ← **Deliverables:** - PR to Trace-Bench: benchmark configs/tasks + notebook + - **Trace-Bench touchpoints (update `main` if default branch differs):** + - https://github.com/AgentOpt/Trace-Bench/blob/main/LLM4AD/trainers_benchmark.py + - https://github.com/AgentOpt/Trace-Bench/blob/main/LLM4AD/trainers_benchmark_tasks_validation.py + - https://github.com/AgentOpt/Trace-Bench/blob/main/LLM4AD/benchmark_tasks/index.json + - https://github.com/AgentOpt/Trace-Bench/tree/main/LLM4AD/benchmark_tasks + - https://github.com/AgentOpt/Trace-Bench/blob/main/LLM4AD/llm4ad_loader.py + - 
https://github.com/AgentOpt/Trace-Bench/blob/main/tests/test_lite_optimize_llm4ad.py - 3 benchmarks: 1. **Accuracy vs latency** (toy QA dataset) 2. **Accuracy vs response length** (penalize verbosity) 3. **Accuracy vs tool calls** (penalize excessive tool usage) -- `notebooks/t6_m3_benchmarks.ipynb` +- Trace-Bench notebook: `notebooks/t6_multiobjective_benchmarks.ipynb` (in Trace-Bench repo) **SMART validation:** - Notebook outputs per-benchmark table: weighted-mode best candidate metrics + Pareto-mode set of tradeoffs @@ -763,7 +773,7 @@ Selection path: get_score_dict() → evaluate_vector() → objectives.py ← Each notebook contains: - **StubLLM (no keys) section:** deterministic dummy guide, runs quickly -- **Real LLM section (optional):** small N (5-20 examples), prints cost/latency caveats, requires API key +- **Real LLM section (required):** small N (5-20 examples), prints cost/latency caveats, requires API key --- From 2213a191bc2ba06274df1e657ba9a2e434e308ce Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Thu, 12 Feb 2026 11:50:07 -0400 Subject: [PATCH 04/20] =?UTF-8?q?T6=20M1:=20Multi-objective=20vector=20sco?= =?UTF-8?q?res=20=E2=80=94=20ObjectiveConfig,=20objectives.py,=20evaluate?= =?UTF-8?q?=5Fvector,=20BasicSearch=20integration,=2059=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/notebooks/t6_m1_vector_scores.ipynb | 810 +++++++++++++++++++ opto/trainer/algorithms/basic_algorithms.py | 95 ++- opto/trainer/evaluators.py | 74 +- opto/trainer/guide.py | 16 + opto/trainer/objectives.py | 312 +++++++ tests/unit_tests/test_evaluators_vector.py | 154 ++++ tests/unit_tests/test_objectives.py | 383 +++++++++ 7 files changed, 1823 insertions(+), 21 deletions(-) create mode 100644 examples/notebooks/t6_m1_vector_scores.ipynb create mode 100644 opto/trainer/objectives.py create mode 100644 tests/unit_tests/test_evaluators_vector.py create mode 100644 tests/unit_tests/test_objectives.py diff --git a/examples/notebooks/t6_m1_vector_scores.ipynb b/examples/notebooks/t6_m1_vector_scores.ipynb new file mode 100644 index 00000000..637322d0 --- /dev/null +++ b/examples/notebooks/t6_m1_vector_scores.ipynb @@ -0,0 +1,810 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a0000001", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "T6 Milestone 1 — Multi-Objective Vector Scores\n", + "\n", + "This notebook is the M1 deliverable for the T6 Multi-Objective Vector Scores project.\n", + "It demonstrates:\n", + " 1. ObjectiveConfig creation and validation\n", + " 2. MultiMetricGuide with get_score_dict()\n", + " 3. evaluate_vector() + aggregate_vector_scores()\n", + " 4. Full BasicSearchAlgorithm.train() with DummyLLM + objective_config\n", + " 5. Scalar baseline comparison (backward compat)\n", + " 6. 
Pareto mode demo + deterministic tiebreak\n", + "\n", + "Part A runs end-to-end WITHOUT API keys (StubLLM / DummyLLM).\n", + "Part B requires an OpenRouter API key (Colab secrets or environment variable).\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "a0000002", + "metadata": {}, + "source": [ + "# T6 Multi-Objective Vector Scores — M1 Implementation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/pull/61/head/examples/notebooks/t6_m1_vector_scores.ipynb)\n", + "\n", + "**Milestone 1 Deliverable** — Core multi-objective infrastructure\n", + "\n", + "This notebook demonstrates the M1 implementation:\n", + "1. **ObjectiveConfig**: Frozen dataclass for multi-objective selection configuration\n", + "2. **Vector score path**: `get_score_dict()` → `evaluate_vector()` → `aggregate_vector_scores()` → `select_best()`\n", + "3. **BasicSearch integration**: Training with `objective_config` parameter (weighted + Pareto modes)\n", + "4. **Backward compatibility**: `objective_config=None` produces identical behavior to baseline\n", + "\n", + "**Part A (StubLLM):** No API keys required. Uses `DummyLLM` for deterministic end-to-end training.\n", + "\n", + "**Part B (Real LLM):** Requires `OPENROUTER_API_KEY` via Colab secrets or env var. Uses `google/gemini-2.0-flash-001`.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "a0000003", + "metadata": {}, + "source": [ + "## How to Validate This Milestone\n", + "\n", + "After running all cells, confirm:\n", + "- [ ] ObjectiveConfig creation and validation work correctly\n", + "- [ ] MultiMetricGuide returns `Dict[str, float]` from `get_score_dict()`\n", + "- [ ] `evaluate_vector()` returns `List[Dict[str, float]]`\n", + "- [ ] `aggregate_vector_scores()` computes per-metric means\n", + "- [ ] BasicSearch with `objective_config=None` (scalar) trains successfully\n", + "- [ ] BasicSearch with weighted `objective_config` selects differently than scalar\n", + "- [ ] Pareto mode produces deterministic results with same seed\n", + "- [ ] Real LLM section (Part B) trains with actual model + multi-metric guide" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000004", + "metadata": {}, + "outputs": [], + "source": "import sys, os\n\n# Ensure OpenTrace root is on the path (needed when running from examples/notebooks/)\n_repo_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))\nif os.path.isdir(os.path.join(_repo_root, 'opto')):\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n# Also handle running directly from the repo root\nif os.path.isdir(os.path.join(os.getcwd(), 'opto')):\n if os.getcwd() not in sys.path:\n sys.path.insert(0, os.getcwd())\n\nimport numpy as np\nfrom typing import Dict, Tuple, Optional\n\nprint(\"=\" * 70)\nprint(\"T6 M1 \\u2014 Multi-Objective Vector Scores\")\nprint(\"=\" * 70)" + }, + { + "cell_type": "markdown", + "id": "a0000005", + "metadata": {}, + "source": [ + "---\n", + "## Part A: StubLLM (No API Key Required)\n", + "\n", + "### A.1 ObjectiveConfig Creation & Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000006", + "metadata": {}, + "outputs": [], + "source": [ + "from opto.trainer.objectives import (\n", + " ObjectiveConfig, normalize_score, apply_minimize,\n", + " weighted_scalarize, dominates, pareto_rank, select_best, select_top_k,\n", + ")\n", + "\n", + "print(\"--- ObjectiveConfig defaults 
---\")\n", + "config_default = ObjectiveConfig()\n", + "print(f\" mode={config_default.mode}, weights={config_default.weights}, \"\n", + " f\"minimize={config_default.minimize}\")\n", + "\n", + "print(\"\\n--- ObjectiveConfig: weighted mode ---\")\n", + "config_weighted = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n", + " minimize=frozenset({\"latency_s\"}),\n", + ")\n", + "print(f\" mode={config_weighted.mode}\")\n", + "print(f\" weights={config_weighted.weights}\")\n", + "print(f\" minimize={config_weighted.minimize}\")\n", + "\n", + "print(\"\\n--- ObjectiveConfig: Pareto mode ---\")\n", + "config_pareto = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", + " minimize=frozenset({\"latency_s\"}),\n", + " tie_break=\"weighted\",\n", + " seed=42,\n", + ")\n", + "print(f\" mode={config_pareto.mode}, tie_break={config_pareto.tie_break}, seed={config_pareto.seed}\")\n", + "\n", + "print(\"\\n--- ObjectiveConfig: set auto-converts to frozenset ---\")\n", + "config_set = ObjectiveConfig(minimize={\"lat\"})\n", + "print(f\" type(minimize)={type(config_set.minimize).__name__} (auto-converted from set)\")\n", + "\n", + "print(\"\\n--- Validation: negative weight ---\")\n", + "try:\n", + " ObjectiveConfig(weights={\"a\": -0.5})\n", + "except ValueError as e:\n", + " print(f\" Caught: {e}\")\n", + "\n", + "print(\"\\n--- Validation: bad mode ---\")\n", + "try:\n", + " ObjectiveConfig(mode=\"unknown\")\n", + "except ValueError as e:\n", + " print(f\" Caught: {e}\")\n", + "\n", + "print(\"\\n--- Frozen (immutable) ---\")\n", + "try:\n", + " config_default.mode = \"weighted\"\n", + "except AttributeError as e:\n", + " print(f\" Caught: {e}\")\n", + "\n", + "print(\"\\nObjectiveConfig validation: all checks passed.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000007", + "metadata": {}, + "source": [ + "### A.2 MultiMetricGuide with `get_score_dict()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000008", + "metadata": {}, + "outputs": [], + "source": [ + "from opto.trainer.guide import Guide\n", + "\n", + "\n", + "class MultiMetricGuide(Guide):\n", + " \"\"\"Guide that returns multi-metric score dicts.\n", + "\n", + " Evaluates accuracy (exact match) and brevity (inverse length difference).\n", + " The training loop still calls get_feedback() -> (float, str).\n", + " The selection path calls get_score_dict() -> Dict[str, float].\n", + " \"\"\"\n", + "\n", + " def get_feedback(self, query, response, reference=None, **kwargs):\n", + " accuracy = 1.0 if str(response).strip().lower() == str(reference).strip().lower() else 0.0\n", + " len_diff = abs(len(str(response)) - len(str(reference)))\n", + " brevity = 1.0 / (1.0 + len_diff)\n", + " feedback = f\"Expected '{reference}', got '{response}'. \"\n", + " if accuracy < 1.0:\n", + " feedback += \"Incorrect. 
Please provide the exact expected answer.\"\n",
+    "        else:\n",
+    "            feedback += \"Correct!\"\n",
+    "        # Training loop gets scalar (accuracy) + feedback string\n",
+    "        return accuracy, feedback\n",
+    "\n",
+    "    def get_score_dict(self, query, response, reference=None, **kwargs):\n",
+    "        accuracy = 1.0 if str(response).strip().lower() == str(reference).strip().lower() else 0.0\n",
+    "        len_diff = abs(len(str(response)) - len(str(reference)))\n",
+    "        brevity = 1.0 / (1.0 + len_diff)\n",
+    "        return {\"accuracy\": accuracy, \"brevity\": brevity}\n",
+    "\n",
+    "\n",
+    "# Demonstrate both paths\n",
+    "guide = MultiMetricGuide()\n",
+    "\n",
+    "print(\"--- Training path: get_feedback() -> (float, str) ---\")\n",
+    "score, feedback = guide.get_feedback(\"Q: 2+2\", \"4\", \"4\")\n",
+    "print(f\"  score={score} (type={type(score).__name__})\")\n",
+    "print(f\"  feedback='{feedback}'\")\n",
+    "\n",
+    "print(\"\\n--- Selection path: get_score_dict() -> Dict[str, float] ---\")\n",
+    "sd = guide.get_score_dict(\"Q: 2+2\", \"4\", \"4\")\n",
+    "print(f\"  score_dict={sd}\")\n",
+    "\n",
+    "print(\"\\n--- metric() still returns float (backward compat) ---\")\n",
+    "m = guide.metric(\"Q: 2+2\", \"4\", \"4\")\n",
+    "print(f\"  metric()={m} (type={type(m).__name__})\")\n",
+    "\n",
+    "print(\"\\n--- Base Guide without get_score_dict override wraps scalar ---\")\n",
+    "class ScalarOnlyGuide(Guide):\n",
+    "    def get_feedback(self, query, response, reference=None, **kwargs):\n",
+    "        return 0.75, \"some feedback\"\n",
+    "\n",
+    "fallback = ScalarOnlyGuide()\n",
+    "print(f\"  get_score_dict()={fallback.get_score_dict('q', 'r', 'ref')}\")\n",
+    "print(\"  (wrapped as {'score': 0.75} automatically)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a0000009",
+   "metadata": {},
+   "source": [
+    "### A.3 `evaluate_vector()` + `aggregate_vector_scores()`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0000010",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from opto import trace\n",
+    "from opto.trainer.evaluators import evaluate_vector, aggregate_vector_scores\n",
+    "\n",
+    "\n",
+    "@trace.model\n",
+    "class StubAgent:\n",
+    "    \"\"\"Agent with a trainable string parameter. 
Returns it directly.\"\"\"\n", + " def __init__(self, answer):\n", + " self.answer = trace.node(answer, trainable=True)\n", + "\n", + " def forward(self, x):\n", + " return self.answer\n", + "\n", + "\n", + "agent = StubAgent(\"4\")\n", + "guide = MultiMetricGuide()\n", + "\n", + "inputs = [\"What is 2+2?\", \"What is 3+1?\", \"What is 5-1?\"]\n", + "infos = [\"4\", \"4\", \"4\" ] # all expect \"4\"\n", + "\n", + "print(\"--- evaluate_vector() ---\")\n", + "score_dicts = evaluate_vector(agent, guide, inputs, infos, num_threads=1)\n", + "for i, sd in enumerate(score_dicts):\n", + " print(f\" Example {i}: {sd}\")\n", + "\n", + "print(\"\\n--- aggregate_vector_scores() ---\")\n", + "agg = aggregate_vector_scores(score_dicts)\n", + "print(f\" Aggregated (per-metric mean): {agg}\")\n", + "\n", + "# Now test with wrong answer\n", + "agent_wrong = StubAgent(\"five\")\n", + "print(\"\\n--- Wrong answer agent ---\")\n", + "score_dicts_wrong = evaluate_vector(agent_wrong, guide, inputs, infos, num_threads=1)\n", + "for i, sd in enumerate(score_dicts_wrong):\n", + " print(f\" Example {i}: {sd}\")\n", + "agg_wrong = aggregate_vector_scores(score_dicts_wrong)\n", + "print(f\" Aggregated: {agg_wrong}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000011", + "metadata": {}, + "source": [ + "### A.4 Selection with `select_best()` and `select_top_k()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000012", + "metadata": {}, + "outputs": [], + "source": [ + "# Candidates: (score_dict, payload) tuples\n", + "candidates = [\n", + " ({\"accuracy\": 0.95, \"latency_s\": 0.200}, \"prompt_A\"),\n", + " ({\"accuracy\": 0.70, \"latency_s\": 0.030}, \"prompt_B\"),\n", + " ({\"accuracy\": 0.88, \"latency_s\": 0.080}, \"prompt_C\"),\n", + " ({\"accuracy\": 0.60, \"latency_s\": 0.020}, \"prompt_D\"),\n", + "]\n", + "\n", + "print(\"Candidates:\")\n", + "for s, name in candidates:\n", + " print(f\" {name}: {s}\")\n", + "\n", + "# Scalar mode (backward-compat)\n", + "print(\"\\n--- select_best(config=None) [scalar, backward-compat] ---\")\n", + "idx = select_best(candidates, None)\n", + "print(f\" Winner: {candidates[idx][1]} (index {idx})\")\n", + "\n", + "# Weighted: accuracy-heavy\n", + "print(\"\\n--- select_best(weighted, accuracy=0.8) ---\")\n", + "config_acc = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n", + " minimize=frozenset({\"latency_s\"}),\n", + ")\n", + "idx = select_best(candidates, config_acc)\n", + "print(f\" Winner: {candidates[idx][1]} (index {idx})\")\n", + "\n", + "# Weighted: latency-heavy\n", + "print(\"\\n--- select_best(weighted, latency_s=0.8) ---\")\n", + "config_lat = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.2, \"latency_s\": 0.8},\n", + " minimize=frozenset({\"latency_s\"}),\n", + ")\n", + "idx = select_best(candidates, config_lat)\n", + "print(f\" Winner: {candidates[idx][1]} (index {idx})\")\n", + "\n", + "# Pareto mode\n", + "print(\"\\n--- select_best(pareto, tie_break=weighted) ---\")\n", + "config_par = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", + " minimize=frozenset({\"latency_s\"}),\n", + " tie_break=\"weighted\",\n", + ")\n", + "score_dicts_norm = [apply_minimize(normalize_score(s), config_par.minimize) for s, _ in candidates]\n", + "ranks = pareto_rank(score_dicts_norm)\n", + "print(f\" Pareto ranks: {ranks}\")\n", + "print(f\" Front (rank 0): {[candidates[i][1] for i, r in enumerate(ranks) if 
r == 0]}\")\n", + "idx = select_best(candidates, config_par)\n", + "print(f\" Winner (after tie-break): {candidates[idx][1]} (index {idx})\")\n", + "\n", + "# Deterministic check\n", + "print(\"\\n--- Determinism: 10 runs with same config ---\")\n", + "results = [select_best(candidates, config_par) for _ in range(10)]\n", + "print(f\" Results: {results}\")\n", + "print(f\" All identical: {len(set(results)) == 1}\")\n", + "\n", + "# Top-k\n", + "print(\"\\n--- select_top_k(pareto, k=2) ---\")\n", + "top2 = select_top_k(candidates, config_par, k=2)\n", + "print(f\" Top 2: {[candidates[i][1] for i in top2]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000013", + "metadata": {}, + "source": [ + "### A.5 Full Training: BasicSearch with DummyLLM (scalar baseline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000014", + "metadata": {}, + "outputs": [], + "source": [ + "from opto.utils.llm import DummyLLM\n", + "from opto.optimizers import OptoPrimeV2\n", + "from opto.trainer.algorithms.basic_algorithms import BasicSearchAlgorithm\n", + "\n", + "# --- Dataset: simple Q&A ---\n", + "dataset = dict(\n", + " inputs=[\"What is 2+2?\", \"What is 3+1?\", \"What is 10-6?\"],\n", + " infos= [\"4\", \"4\", \"4\" ],\n", + ")\n", + "\n", + "# --- DummyLLM: always proposes the same system prompt ---\n", + "def stub_llm_fn(*args, **kwargs):\n", + " \"\"\"Deterministic LLM stub: always returns a fixed response.\"\"\"\n", + " return \"You are a math assistant. Always answer with just the number.\"\n", + "\n", + "dummy_llm = DummyLLM(stub_llm_fn)\n", + "\n", + "# --- Agent ---\n", + "@trace.model\n", + "class MathAgent:\n", + " def __init__(self, llm):\n", + " self.system_prompt = trace.node(\n", + " \"You are a helpful assistant.\", trainable=True\n", + " )\n", + " self.llm = llm\n", + "\n", + " @trace.bundle()\n", + " def call_llm(self, system_prompt, question):\n", + " resp = self.llm(\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": question},\n", + " ]\n", + " )\n", + " return resp.choices[0].message.content\n", + "\n", + " def forward(self, x):\n", + " return self.call_llm(self.system_prompt, x)\n", + "\n", + "# --- Scalar baseline (objective_config=None) ---\n", + "print(\"=\" * 70)\n", + "print(\"TRAINING: Scalar baseline (objective_config=None)\")\n", + "print(\"=\" * 70)\n", + "\n", + "agent_scalar = MathAgent(dummy_llm)\n", + "optimizer_scalar = OptoPrimeV2(agent_scalar.parameters(), llm=dummy_llm)\n", + "trainer_scalar = BasicSearchAlgorithm(\n", + " agent=agent_scalar, optimizer=optimizer_scalar\n", + ")\n", + "\n", + "guide_scalar = MultiMetricGuide()\n", + "scores_scalar, test_score_scalar = trainer_scalar.train(\n", + " guide=guide_scalar,\n", + " train_dataset=dataset,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + " objective_config=None, # scalar baseline\n", + ")\n", + "\n", + "print(f\"\\nScalar training scores: {scores_scalar}\")\n", + "print(f\"current_score: {trainer_scalar.current_score}\")\n", + "print(f\"current_score_dict: {trainer_scalar.current_score_dict}\")\n", + "print(\"(current_score_dict is None because scalar mode does not use vector path)\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000015", + "metadata": {}, + "source": [ + "### A.6 Full Training: BasicSearch with DummyLLM (weighted mode)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000016", + "metadata": {}, + "outputs": [], 
+ "source": [ + "print(\"=\" * 70)\n", + "print(\"TRAINING: Weighted mode (objective_config.mode='weighted')\")\n", + "print(\"=\" * 70)\n", + "\n", + "config_weighted_train = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.7, \"brevity\": 0.3},\n", + ")\n", + "\n", + "agent_weighted = MathAgent(dummy_llm)\n", + "optimizer_weighted = OptoPrimeV2(agent_weighted.parameters(), llm=dummy_llm)\n", + "trainer_weighted = BasicSearchAlgorithm(\n", + " agent=agent_weighted, optimizer=optimizer_weighted\n", + ")\n", + "\n", + "guide_weighted = MultiMetricGuide()\n", + "scores_weighted, test_score_weighted = trainer_weighted.train(\n", + " guide=guide_weighted,\n", + " train_dataset=dataset,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + " objective_config=config_weighted_train,\n", + ")\n", + "\n", + "print(f\"\\nWeighted training scores: {scores_weighted}\")\n", + "print(f\"current_score (float): {trainer_weighted.current_score}\")\n", + "print(f\"current_score_dict: {trainer_weighted.current_score_dict}\")\n", + "print(\"(current_score_dict stores the vector score selected by weighted mode)\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000017", + "metadata": {}, + "source": [ + "### A.7 Full Training: BasicSearch with DummyLLM (Pareto mode)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000018", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"=\" * 70)\n", + "print(\"TRAINING: Pareto mode (objective_config.mode='pareto')\")\n", + "print(\"=\" * 70)\n", + "\n", + "config_pareto_train = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"brevity\": 0.5},\n", + " tie_break=\"weighted\",\n", + " seed=42,\n", + ")\n", + "\n", + "agent_pareto = MathAgent(dummy_llm)\n", + "optimizer_pareto = OptoPrimeV2(agent_pareto.parameters(), llm=dummy_llm)\n", + "trainer_pareto = BasicSearchAlgorithm(\n", + " agent=agent_pareto, optimizer=optimizer_pareto\n", + ")\n", + "\n", + "guide_pareto = MultiMetricGuide()\n", + "scores_pareto, test_score_pareto = trainer_pareto.train(\n", + " guide=guide_pareto,\n", + " train_dataset=dataset,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + " objective_config=config_pareto_train,\n", + ")\n", + "\n", + "print(f\"\\nPareto training scores: {scores_pareto}\")\n", + "print(f\"current_score (float): {trainer_pareto.current_score}\")\n", + "print(f\"current_score_dict: {trainer_pareto.current_score_dict}\")\n", + "\n", + "# Verify determinism: run again with same seed\n", + "print(\"\\n--- Determinism: re-run with same seed ---\")\n", + "agent_pareto2 = MathAgent(dummy_llm)\n", + "optimizer_pareto2 = OptoPrimeV2(agent_pareto2.parameters(), llm=dummy_llm)\n", + "trainer_pareto2 = BasicSearchAlgorithm(\n", + " agent=agent_pareto2, optimizer=optimizer_pareto2\n", + ")\n", + "scores_pareto2, _ = trainer_pareto2.train(\n", + " guide=MultiMetricGuide(),\n", + " train_dataset=dataset,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + " objective_config=config_pareto_train,\n", + ")\n", + "print(f\"Run 1 current_score_dict: {trainer_pareto.current_score_dict}\")\n", + "print(f\"Run 2 current_score_dict: {trainer_pareto2.current_score_dict}\")\n", + "match = trainer_pareto.current_score_dict == trainer_pareto2.current_score_dict\n", + "print(f\"Deterministic: {match}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000019", + 
"metadata": {}, + "source": [ + "### A.8 Summary: StubLLM Section" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000020", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"PART A COMPLETE — StubLLM Section\")\n", + "print(\"=\" * 70)\n", + "print(\"\"\"\n", + "Verified:\n", + " ✓ ObjectiveConfig creation, validation, and immutability\n", + " ✓ MultiMetricGuide: get_feedback() -> (float, str) for training loop\n", + " ✓ MultiMetricGuide: get_score_dict() -> Dict[str, float] for selection path\n", + " ✓ evaluate_vector() returns List[Dict[str, float]]\n", + " ✓ aggregate_vector_scores() computes per-metric means\n", + " ✓ select_best(): scalar, weighted, Pareto modes all work\n", + " ✓ BasicSearch training: scalar baseline (objective_config=None)\n", + " ✓ BasicSearch training: weighted mode with vector score selection\n", + " ✓ BasicSearch training: Pareto mode with deterministic tie-break\n", + " ✓ current_score stays float, current_score_dict stores vector\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0000021", + "metadata": {}, + "source": [ + "---\n", + "## Part B: Real LLM (API Key Required)\n", + "\n", + "This section trains a real LLM agent using `CustomLLM` with OpenRouter.\n", + "\n", + "**Requirements:**\n", + "- **Colab:** Set `OPENROUTER_API_KEY` in Colab Secrets (key icon in sidebar)\n", + "- **Local:** `export OPENROUTER_API_KEY=sk-or-v1-...` in your shell, or set in `.env`\n", + "\n", + "Uses model `google/gemini-2.0-flash-001` via OpenRouter (very cheap, fast)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000022", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Try Colab secrets first, then environment variable\n", + "api_key = None\n", + "try:\n", + " from google.colab import userdata\n", + " api_key = userdata.get('OPENROUTER_API_KEY')\n", + " print(\"API key loaded from Colab secrets.\")\n", + "except (ImportError, Exception):\n", + " pass\n", + "\n", + "if not api_key:\n", + " api_key = os.environ.get('OPENROUTER_API_KEY')\n", + " if api_key:\n", + " print(\"API key loaded from environment variable.\")\n", + "\n", + "if not api_key:\n", + " # Try loading from .env file in project root\n", + " env_path = os.path.join(os.getcwd(), '.env')\n", + " if not os.path.exists(env_path):\n", + " env_path = os.path.join(os.path.dirname(os.getcwd()), '.env')\n", + " if os.path.exists(env_path):\n", + " with open(env_path) as f:\n", + " for line in f:\n", + " line = line.strip()\n", + " if line.startswith('OPENROUTER_API_KEY='):\n", + " api_key = line.split('=', 1)[1].strip()\n", + " print(f\"API key loaded from {env_path}\")\n", + " break\n", + "\n", + "if not api_key:\n", + " print(\"WARNING: No OPENROUTER_API_KEY found. Part B cells will be skipped.\")\n", + " print(\"Set it via: Colab Secrets, env var, or .env file.\")\n", + "else:\n", + " # Configure CustomLLM environment\n", + " os.environ['TRACE_CUSTOMLLM_URL'] = 'https://openrouter.ai/api/v1'\n", + " os.environ['TRACE_CUSTOMLLM_API_KEY'] = api_key\n", + " os.environ['TRACE_CUSTOMLLM_MODEL'] = 'google/gemini-2.0-flash-001'\n", + " print(\"CustomLLM configured for OpenRouter (google/gemini-2.0-flash-001).\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000023", + "metadata": {}, + "outputs": [], + "source": [ + "# Skip this cell if no API key\n", + "if not api_key:\n", + " print(\"Skipping: no API key. 
Set OPENROUTER_API_KEY to run Part B.\")\n", + "else:\n", + " from opto.utils.llm import CustomLLM\n", + "\n", + " real_llm = CustomLLM(model='google/gemini-2.0-flash-001')\n", + "\n", + " # Quick smoke test\n", + " print(\"--- Smoke test: real LLM call ---\")\n", + " resp = real_llm(messages=[\n", + " {\"role\": \"user\", \"content\": \"What is 2+2? Answer with just the number.\"}\n", + " ])\n", + " print(f\" Response: {resp.choices[0].message.content}\")\n", + " print(\" LLM connection verified.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000024", + "metadata": {}, + "outputs": [], + "source": [ + "# Real LLM training with weighted multi-objective selection\n", + "if not api_key:\n", + " print(\"Skipping: no API key.\")\n", + "else:\n", + " print(\"=\" * 70)\n", + " print(\"REAL LLM TRAINING: Weighted mode with multi-metric guide\")\n", + " print(\"=\" * 70)\n", + "\n", + " # Small dataset to keep costs low\n", + " real_dataset = dict(\n", + " inputs=[\"What is 7+3?\", \"What is 15-9?\", \"What is 4*3?\"],\n", + " infos= [\"10\", \"6\", \"12\" ],\n", + " )\n", + "\n", + " real_config = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"accuracy\": 0.7, \"brevity\": 0.3},\n", + " )\n", + "\n", + " real_agent = MathAgent(real_llm)\n", + " real_optimizer = OptoPrimeV2(real_agent.parameters(), llm=real_llm)\n", + " real_trainer = BasicSearchAlgorithm(\n", + " agent=real_agent, optimizer=real_optimizer\n", + " )\n", + "\n", + " real_guide = MultiMetricGuide()\n", + " real_scores, real_test = real_trainer.train(\n", + " guide=real_guide,\n", + " train_dataset=real_dataset,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + " objective_config=real_config,\n", + " )\n", + "\n", + " print(f\"\\nReal LLM training scores: {real_scores}\")\n", + " print(f\"current_score (float): {real_trainer.current_score}\")\n", + " print(f\"current_score_dict: {real_trainer.current_score_dict}\")\n", + " print(f\"\\nFinal system prompt: {real_agent.system_prompt.data}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0000025", + "metadata": {}, + "outputs": [], + "source": [ + "# Real LLM: Pareto mode comparison\n", + "if not api_key:\n", + " print(\"Skipping: no API key.\")\n", + "else:\n", + " print(\"=\" * 70)\n", + " print(\"REAL LLM TRAINING: Pareto mode for comparison\")\n", + " print(\"=\" * 70)\n", + "\n", + " pareto_config = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"accuracy\": 0.5, \"brevity\": 0.5},\n", + " tie_break=\"weighted\",\n", + " seed=42,\n", + " )\n", + "\n", + " pareto_agent = MathAgent(real_llm)\n", + " pareto_optimizer = OptoPrimeV2(pareto_agent.parameters(), llm=real_llm)\n", + " pareto_trainer = BasicSearchAlgorithm(\n", + " agent=pareto_agent, optimizer=pareto_optimizer\n", + " )\n", + "\n", + " pareto_scores, _ = pareto_trainer.train(\n", + " guide=MultiMetricGuide(),\n", + " train_dataset=real_dataset,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + " objective_config=pareto_config,\n", + " )\n", + "\n", + " print(f\"\\nPareto training scores: {pareto_scores}\")\n", + " print(f\"current_score_dict: {pareto_trainer.current_score_dict}\")\n", + "\n", + " print(\"\\n--- Comparison ---\")\n", + " print(f\"Weighted mode final: {real_trainer.current_score_dict}\")\n", + " print(f\"Pareto mode final: {pareto_trainer.current_score_dict}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "a0000026", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"M1 NOTEBOOK COMPLETE\")\n", + "print(\"=\" * 70)\n", + "print(\"\"\"\n", + "Deliverables verified:\n", + " ✓ Part A (StubLLM): All cells run without API keys\n", + " - ObjectiveConfig creation + validation\n", + " - MultiMetricGuide with get_score_dict()\n", + " - evaluate_vector() + aggregate_vector_scores()\n", + " - BasicSearch: scalar, weighted, and Pareto modes\n", + " - Backward compatibility (objective_config=None)\n", + " - Deterministic tie-break verification\n", + "\n", + " ✓ Part B (Real LLM): Trained with actual model via OpenRouter\n", + " - Weighted and Pareto mode with real LLM proposals\n", + " - Multi-metric selection (accuracy + brevity)\n", + " - current_score_dict populated with real scores\n", + "\"\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/opto/trainer/algorithms/basic_algorithms.py b/opto/trainer/algorithms/basic_algorithms.py index 691b14a8..50ea7842 100644 --- a/opto/trainer/algorithms/basic_algorithms.py +++ b/opto/trainer/algorithms/basic_algorithms.py @@ -6,7 +6,8 @@ from opto.trainer.loader import DataLoader from opto.trainer.utils import batch_run, async_run from opto.optimizers.utils import print_color -from opto.trainer.evaluators import evaluate +from opto.trainer.evaluators import evaluate, evaluate_vector, aggregate_vector_scores +from opto.trainer.objectives import ObjectiveConfig, select_best def standard_optimization_step(agent, x, guide, info, min_score=0): @@ -533,6 +534,7 @@ def train(self, validate_dataset = None, # dataset of (x, info) pairs to evaluate the agent for candidate selection validate_guide = None, # to provide scores for the validation set num_proposals = 4, # number of proposals to get from the optimizer + objective_config = None, # optional ObjectiveConfig for multi-objective selection num_epochs = 1, # number of training epochs batch_size = 1, # batch size for updating the agent test_dataset = None, # dataset of (x, info) pairs to evaluate the agent @@ -549,6 +551,8 @@ def train(self, self.validate_guide = validate_guide or guide self.min_score = min_score self.current_score = None + self.objective_config = objective_config + self.current_score_dict = None # stores vector score when using multi-objective return super().train(guide, train_dataset, num_epochs=num_epochs, batch_size=batch_size, test_dataset=test_dataset, test_frequency=test_frequency, log_frequency=log_frequency, @@ -571,6 +575,21 @@ def validate(): description="Validating proposals") return np.mean(scores) if all([s is not None for s in scores]) else -np.inf + def validate_vector(): + """ Validate and return aggregated vector score dict. 
""" + score_dicts = evaluate_vector(self.agent, + self.validate_guide, + self.validate_dataset['inputs'], + self.validate_dataset['infos'], + min_score=self.min_score, + num_threads=num_threads, + description="Validating proposals (vector)") + return aggregate_vector_scores(score_dicts) + + # Determine whether to use vector scoring for selection + use_vector = (self.objective_config is not None + and self.objective_config.mode != "scalar") + # TODO perhaps we can ask for multiple updates in one query or use different temperatures in different queries # Generate different proposals step_kwargs = dict(bypassing=True, verbose='output' if verbose else False) # we don't print the inner full message @@ -582,25 +601,57 @@ def validate(): kwargs_list=[step_kwargs] * self.num_proposals, max_workers=num_threads, description=f"Generating {self.num_proposals} proposals") # async step + # Validate the proposals candidates = [] backup_dict = {p: copy.deepcopy(p.data) for p in self.agent.parameters()} # backup the current value - for update_dict in update_dicts: - if len(update_dict) == 0: - continue - self.optimizer.update(update_dict) # set the agent with update_dict - score = validate() # check the score on the validation set - candidates.append((score, update_dict)) - self.optimizer.update(backup_dict) # restore the backup - - # Include the current parameter as a candidate - if self.current_score is None: - self.current_score = validate() - candidates.append((self.current_score, backup_dict)) - - # Find the candidate with the best score - best_score, best_update = max(candidates, key=lambda x: x[0]) - self.current_score = best_score + + if use_vector: + # Vector path: collect (score_dict, update_dict) for multi-objective selection + vector_candidates = [] + for update_dict in update_dicts: + if len(update_dict) == 0: + continue + self.optimizer.update(update_dict) + score_dict = validate_vector() + scalar_score = float(np.mean(list(score_dict.values()))) + candidates.append((scalar_score, update_dict)) + vector_candidates.append((score_dict, update_dict)) + self.optimizer.update(backup_dict) + + # Include current parameters as a candidate + if self.current_score_dict is None: + self.current_score_dict = validate_vector() + if self.current_score is None: + self.current_score = float(np.mean(list(self.current_score_dict.values()))) + candidates.append((self.current_score, backup_dict)) + vector_candidates.append((self.current_score_dict, backup_dict)) + + # Select best via multi-objective config + best_idx = select_best(vector_candidates, self.objective_config) + best_score_dict = vector_candidates[best_idx][0] + best_update = vector_candidates[best_idx][1] + best_score = float(np.mean(list(best_score_dict.values()))) + self.current_score = best_score + self.current_score_dict = best_score_dict + else: + # Scalar path: unchanged from original behavior + for update_dict in update_dicts: + if len(update_dict) == 0: + continue + self.optimizer.update(update_dict) # set the agent with update_dict + score = validate() # check the score on the validation set + candidates.append((score, update_dict)) + self.optimizer.update(backup_dict) # restore the backup + + # Include the current parameter as a candidate + if self.current_score is None: + self.current_score = validate() + candidates.append((self.current_score, backup_dict)) + + # Find the candidate with the best score + best_score, best_update = max(candidates, key=lambda x: x[0]) + self.current_score = best_score if verbose: print_color(f"Best score: 
{best_score} out of scores {[c[0] for c in candidates]}", 'green') @@ -609,5 +660,11 @@ def validate(): # Make the best update self.optimizer.update(best_update) - # Logging - self.logger.log('Validation score', best_score, self.n_iters, color='green') \ No newline at end of file + # Logging — always log scalar for backward compatibility + self.logger.log('Validation score', best_score, self.n_iters, color='green') + + # Log individual vector metrics if available + if use_vector and isinstance(best_score_dict, dict): + for metric_name, metric_value in best_score_dict.items(): + self.logger.log(f'Validation score/{metric_name}', metric_value, + self.n_iters, color='green') \ No newline at end of file diff --git a/opto/trainer/evaluators.py b/opto/trainer/evaluators.py index d1e99c8e..d1271fe8 100644 --- a/opto/trainer/evaluators.py +++ b/opto/trainer/evaluators.py @@ -39,6 +39,76 @@ def _evaluate(agent, guide, i): scores = np.array(scores) if num_samples > 1: # scores will be of length N * num_samples - # Reshape scores into an array of shape (N, num_samples) + # Reshape scores into an array of shape (N, num_samples) scores = scores.reshape(N, num_samples) - return scores \ No newline at end of file + return scores + + +def evaluate_vector(agent, guide, inputs, infos, min_score=None, + num_threads=None, description=None): + """Evaluate the agent and return per-example score dicts. + + Like evaluate(), but calls guide.get_score_dict() instead of + guide.metric(), returning a list of Dict[str, float]. + + Args: + agent: The agent to evaluate + guide: The guide (must have get_score_dict method) + inputs: List of inputs to evaluate on + infos: List of additional information for each input + min_score: Fallback on exception. Dict or float (wrapped as + {"score": val}). None -> {"score": -inf}. + num_threads: Maximum threads for parallel evaluation + description: Progress bar description + + Returns: + List[Dict[str, float]] of length len(inputs) + """ + assert len(inputs) == len(infos), "Inputs and infos must have the same length" + N = len(inputs) + eval_description = description or f"Evaluating {N} examples (vector)" + + if min_score is None: + _fallback = {"score": float("-inf")} + elif isinstance(min_score, dict): + _fallback = min_score + else: + _fallback = {"score": float(min_score)} + + @batch_run(max_workers=num_threads, description=eval_description) + def _evaluate_vector(agent, guide, i): + try: + output = agent(inputs[i]).data + score_dict = guide.get_score_dict(inputs[i], output, infos[i]) + except ExecutionError: + score_dict = copy.copy(_fallback) + return score_dict + + indices = list(range(N)) + return _evaluate_vector(agent, guide, indices) + + +def aggregate_vector_scores(score_dicts): + """Compute the per-metric mean across a list of score dicts. + + Args: + score_dicts: List[Dict[str, float]] + + Returns: + Dict[str, float] with the mean value for each metric key. + Empty dict if input is empty. 
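+
+    Example (illustrative values; a metric missing from some dicts is
+    averaged only over the dicts that contain it):
+
+        >>> aggregate_vector_scores([{"a": 1.0}, {"a": 0.0, "b": 0.5}])
+        {'a': 0.5, 'b': 0.5}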
+ """ + if not score_dicts: + return {} + + all_keys = set() + for sd in score_dicts: + all_keys.update(sd.keys()) + + result = {} + for key in sorted(all_keys): + values = [sd[key] for sd in score_dicts + if key in sd and sd[key] is not None] + if values: + result[key] = float(np.mean(values)) + return result \ No newline at end of file diff --git a/opto/trainer/guide.py b/opto/trainer/guide.py index 19b6d3b2..4906c831 100644 --- a/opto/trainer/guide.py +++ b/opto/trainer/guide.py @@ -47,6 +47,22 @@ def metric(self, query: str, response: str, reference: Optional[str] = None, **k """ Exact match metric """ return self.get_feedback(query, response, reference)[0] + def get_score_dict(self, query: str, response: str, reference: Optional[str] = None, **kwargs) -> Dict[str, float]: + """Return the evaluation score as a dictionary. + + Default implementation wraps the scalar from get_feedback() as + {"score": float_value}. Subclasses returning multi-metric scores + should override this method to return e.g. + {"accuracy": 0.9, "fluency": 0.8, "latency_s": 0.05}. + + If get_feedback() returns a dict as its first element, that dict + is returned directly (with values cast to float). + """ + score = self.get_feedback(query, response, reference, **kwargs)[0] + if isinstance(score, dict): + return {k: float(v) for k, v in score.items()} + return {"score": float(score)} + def copy(self): """ Create a copy of the guide instance. diff --git a/opto/trainer/objectives.py b/opto/trainer/objectives.py new file mode 100644 index 00000000..3c21ca67 --- /dev/null +++ b/opto/trainer/objectives.py @@ -0,0 +1,312 @@ +"""Multi-objective configuration and selection utilities. + +Provides ObjectiveConfig and pure functions for multi-objective candidate +selection: weighted scalarization, Pareto ranking, and backward-compatible +scalar max. + +All functions are pure (no side effects) and depend only on numpy, typing, +and dataclasses. No imports from opto.trainer to avoid circular dependencies. +""" +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union +import numpy as np + + +# --- Type aliases --- +ScalarScore = float +VectorScore = Dict[str, float] +ScoreLike = Union[int, float, bool, Dict[str, float]] + + +@dataclass(frozen=True) +class ObjectiveConfig: + """Immutable configuration for multi-objective candidate selection. + + Attributes: + mode: Selection strategy. + - "scalar": existing scalar comparison (default, backward-compatible). + - "weighted": scalarize via weighted sum, then select max. + - "pareto": Pareto dominance ranking with configurable tie-break. + weights: Per-metric weights for weighted scalarization. + Missing metrics use missing_value. Metrics not in weights are ignored. + Empty dict in weighted mode -> equal weight 1.0 for all metrics. + minimize: Frozenset of metric names where lower is better. + These are negated internally ("higher-is-better" normalization). + Users can pass a set; it is auto-converted to frozenset. + missing_value: Score assigned to missing metrics (default: -inf). + pareto_metrics: Subset of metrics for Pareto dominance. + None -> use all metrics present across candidates. + tie_break: Strategy for Pareto-equivalent candidates. + - "weighted": fall back to weighted scalarization. + - "lexicographic": sort by metric names alphabetically. + - "random_seeded": seeded random shuffle. + seed: Random seed for deterministic tie-breaking. 
+ """ + mode: str = "scalar" + weights: Dict[str, float] = field(default_factory=dict) + minimize: frozenset = field(default_factory=frozenset) + missing_value: float = float("-inf") + pareto_metrics: Optional[Tuple[str, ...]] = None + tie_break: str = "weighted" + seed: int = 0 + + def __post_init__(self): + if isinstance(self.minimize, set): + object.__setattr__(self, 'minimize', frozenset(self.minimize)) + if self.mode not in ("scalar", "weighted", "pareto"): + raise ValueError( + f"mode must be 'scalar', 'weighted', or 'pareto', got '{self.mode}'" + ) + if self.tie_break not in ("weighted", "lexicographic", "random_seeded"): + raise ValueError( + f"tie_break must be 'weighted', 'lexicographic', or " + f"'random_seeded', got '{self.tie_break}'" + ) + for k, v in self.weights.items(): + if v < 0: + raise ValueError(f"Weight for '{k}' must be non-negative, got {v}") + if self.pareto_metrics is not None and len(self.pareto_metrics) == 0: + raise ValueError( + "pareto_metrics must be None (auto) or non-empty tuple" + ) + + +# --------------------------------------------------------------------------- +# Pure utility functions +# --------------------------------------------------------------------------- + +def normalize_score(score: ScoreLike) -> Dict[str, float]: + """Convert any score to dict form. + + - bool/int/float -> {"score": float(value)} + - Dict[str, float] -> returned as-is (validated: all values finite) + + Raises: + TypeError: if score is not int, float, bool, or dict. + ValueError: if dict is empty or contains non-finite values. + """ + if isinstance(score, bool): + return {"score": float(score)} + if isinstance(score, (int, float)): + val = float(score) + if not np.isfinite(val): + raise ValueError(f"Score must be finite, got {score}") + return {"score": val} + if isinstance(score, dict): + if len(score) == 0: + raise ValueError("Score dict must not be empty") + for k, v in score.items(): + if not isinstance(v, (int, float)) or not np.isfinite(float(v)): + raise ValueError( + f"Score dict value for '{k}' must be finite float, got {v}" + ) + return {k: float(v) for k, v in score.items()} + raise TypeError( + f"Score must be int, float, bool, or Dict[str, float], " + f"got {type(score).__name__}" + ) + + +def apply_minimize(score_dict: Dict[str, float], + minimize: frozenset) -> Dict[str, float]: + """Negate values for minimize metrics (higher-is-better normalization). + + Returns a new dict; metrics not in *minimize* are unchanged. + """ + return {k: -v if k in minimize else v for k, v in score_dict.items()} + + +def weighted_scalarize(score_dict: Dict[str, float], + weights: Dict[str, float], + missing_value: float = float("-inf")) -> float: + """Compute weighted sum of score dict. + + If *weights* is empty, all present metrics get equal weight 1.0. + Metrics in *score_dict* but NOT in *weights* are ignored. + """ + if not weights: + weights = {k: 1.0 for k in score_dict} + total = 0.0 + for metric, weight in weights.items(): + value = score_dict.get(metric, missing_value) + total += weight * value + return total + + +def dominates(a: Dict[str, float], b: Dict[str, float], + metrics: Optional[Tuple[str, ...]] = None) -> bool: + """Check if candidate *a* Pareto-dominates candidate *b*. + + a dominates b iff: + - a[m] >= b[m] for ALL metrics m, AND + - a[m] > b[m] for AT LEAST ONE metric m + + Both dicts must be in "higher-is-better" form (post apply_minimize). 
+ """ + if metrics is None: + metrics = tuple(sorted(set(a.keys()) | set(b.keys()))) + at_least_one_better = False + for m in metrics: + va = a.get(m, float("-inf")) + vb = b.get(m, float("-inf")) + if va < vb: + return False + if va > vb: + at_least_one_better = True + return at_least_one_better + + +def pareto_rank(candidates: List[Dict[str, float]], + metrics: Optional[Tuple[str, ...]] = None) -> List[int]: + """Assign Pareto rank to each candidate (0 = non-dominated front). + + Uses standard non-dominated sorting. + """ + n = len(candidates) + ranks = [0] * n + remaining = set(range(n)) + current_rank = 0 + + while remaining: + front = [] + for i in remaining: + dominated = False + for j in remaining: + if i != j and dominates(candidates[j], candidates[i], metrics): + dominated = True + break + if not dominated: + front.append(i) + for i in front: + ranks[i] = current_rank + remaining.remove(i) + current_rank += 1 + + return ranks + + +def select_best(candidates: List[Tuple[ScoreLike, Any]], + config: Optional[ObjectiveConfig] = None) -> int: + """Select index of the single best candidate. + + Args: + candidates: List of (score, payload) tuples. + config: Selection config. None -> scalar max (backward-compatible). + + Returns: + Index of the best candidate. + """ + if config is None or config.mode == "scalar": + scores = [] + for score, _ in candidates: + if isinstance(score, dict): + scores.append(np.mean(list(score.values()))) + else: + scores.append(float(score)) + return int(np.argmax(scores)) + + score_dicts = [normalize_score(s) for s, _ in candidates] + score_dicts = [apply_minimize(sd, config.minimize) for sd in score_dicts] + + if config.mode == "weighted": + weighted = [ + weighted_scalarize(sd, config.weights, config.missing_value) + for sd in score_dicts + ] + return int(np.argmax(weighted)) + + if config.mode == "pareto": + ranks = pareto_rank(score_dicts, config.pareto_metrics) + front_indices = [i for i, r in enumerate(ranks) if r == 0] + + if len(front_indices) == 1: + return front_indices[0] + + # Tie-break among front + if config.tie_break == "weighted": + front_scores = [ + weighted_scalarize( + score_dicts[i], config.weights, config.missing_value + ) + for i in front_indices + ] + return front_indices[int(np.argmax(front_scores))] + + if config.tie_break == "lexicographic": + metrics = sorted(score_dicts[front_indices[0]].keys()) + + def lex_key(idx): + return tuple( + score_dicts[idx].get(m, config.missing_value) for m in metrics + ) + + return max(front_indices, key=lex_key) + + if config.tie_break == "random_seeded": + rng = np.random.RandomState(config.seed) + return front_indices[rng.randint(len(front_indices))] + + raise ValueError(f"Unknown mode: {config.mode}") + + +def select_top_k(candidates: List[Tuple[ScoreLike, Any]], + config: Optional[ObjectiveConfig] = None, + k: int = 1) -> List[int]: + """Select the top-k candidate indices. + + Same logic as select_best but returns *k* indices. + For Pareto mode: rank-0 front first (up to k), then rank-1, etc. 
+ """ + if config is None or config.mode == "scalar": + scores = [] + for score, _ in candidates: + if isinstance(score, dict): + scores.append(np.mean(list(score.values()))) + else: + scores.append(float(score)) + return list(np.argsort(scores)[::-1][:k]) + + score_dicts = [normalize_score(s) for s, _ in candidates] + score_dicts = [apply_minimize(sd, config.minimize) for sd in score_dicts] + + if config.mode == "weighted": + weighted = [ + weighted_scalarize(sd, config.weights, config.missing_value) + for sd in score_dicts + ] + return list(np.argsort(weighted)[::-1][:k]) + + if config.mode == "pareto": + ranks = pareto_rank(score_dicts, config.pareto_metrics) + result: List[int] = [] + max_rank = max(ranks) + for rank in range(max_rank + 1): + rank_indices = [i for i, r in enumerate(ranks) if r == rank] + if config.tie_break == "weighted": + rank_indices.sort( + key=lambda i: weighted_scalarize( + score_dicts[i], config.weights, config.missing_value + ), + reverse=True, + ) + elif config.tie_break == "lexicographic": + metrics = ( + sorted(score_dicts[rank_indices[0]].keys()) + if rank_indices else [] + ) + rank_indices.sort( + key=lambda i: tuple( + score_dicts[i].get(m, config.missing_value) + for m in metrics + ), + reverse=True, + ) + elif config.tie_break == "random_seeded": + rng = np.random.RandomState(config.seed + rank) + rng.shuffle(rank_indices) + result.extend(rank_indices) + if len(result) >= k: + break + return result[:k] + + raise ValueError(f"Unknown mode: {config.mode}") diff --git a/tests/unit_tests/test_evaluators_vector.py b/tests/unit_tests/test_evaluators_vector.py new file mode 100644 index 00000000..61cfa1f1 --- /dev/null +++ b/tests/unit_tests/test_evaluators_vector.py @@ -0,0 +1,154 @@ +"""Tests for evaluate_vector and aggregate_vector_scores in opto.trainer.evaluators.""" +import pytest +import numpy as np +from opto import trace +from opto.trainer.evaluators import evaluate_vector, aggregate_vector_scores +from opto.trainer.guide import Guide + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +@trace.model +class SimpleAgent: + """Deterministic agent: returns input + param.""" + def __init__(self, param): + self.param = trace.node(param, trainable=True) + + def forward(self, x): + return x + self.param + + +class MultiMetricGuide(Guide): + """Guide returning multi-metric score dict.""" + def __init__(self, target): + super().__init__() + self.target = target + + def get_feedback(self, query, response, reference=None, **kwargs): + accuracy = float(response == self.target) + brevity = 1.0 / max(abs(response - self.target) + 1, 1) + feedback = f"response={response}, target={self.target}" + return accuracy, feedback + + def get_score_dict(self, query, response, reference=None, **kwargs): + accuracy = float(response == self.target) + brevity = 1.0 / max(abs(response - self.target) + 1, 1) + return {"accuracy": accuracy, "brevity": brevity} + + +class ScalarGuide(Guide): + """Guide using only scalar get_feedback (no get_score_dict override).""" + def __init__(self, target): + super().__init__() + self.target = target + + def get_feedback(self, query, response, reference=None, **kwargs): + score = float(response == self.target) + feedback = f"response={response}" + return score, feedback + + +# --------------------------------------------------------------------------- +# evaluate_vector +# 
--------------------------------------------------------------------------- + +def test_evaluate_vector_basic(): + """evaluate_vector returns a list of dicts with correct metric values.""" + agent = SimpleAgent(10) + guide = MultiMetricGuide(target=11) + inputs = [1, 2, 3] + infos = [None, None, None] + + results = evaluate_vector(agent, guide, inputs, infos, num_threads=1) + + assert len(results) == 3 + assert isinstance(results[0], dict) + # input=1 + param=10 = 11 == target=11 -> accuracy=1.0, brevity=1.0 + assert results[0]["accuracy"] == 1.0 + assert results[0]["brevity"] == 1.0 + # input=2 + param=10 = 12 != target=11 -> accuracy=0.0 + assert results[1]["accuracy"] == 0.0 + assert results[1]["brevity"] == pytest.approx(0.5) # 1/(|12-11|+1) = 0.5 + # input=3 + param=10 = 13 != target=11 -> accuracy=0.0 + assert results[2]["accuracy"] == 0.0 + assert results[2]["brevity"] == pytest.approx(1.0 / 3.0) # 1/(|13-11|+1) + + +def test_evaluate_vector_all_keys_present(): + """Every result dict contains the same set of metric keys.""" + agent = SimpleAgent(5) + guide = MultiMetricGuide(target=10) + inputs = [1, 2, 3, 4, 5] + infos = [None] * 5 + + results = evaluate_vector(agent, guide, inputs, infos, num_threads=1) + + expected_keys = {"accuracy", "brevity"} + for rd in results: + assert set(rd.keys()) == expected_keys + + +def test_evaluate_vector_scalar_guide_fallback(): + """Guide without get_score_dict override returns {"score": float}.""" + agent = SimpleAgent(10) + guide = ScalarGuide(target=11) + inputs = [1, 2] + infos = [None, None] + + results = evaluate_vector(agent, guide, inputs, infos, num_threads=1) + + assert len(results) == 2 + # input=1 + param=10 = 11 == target=11 -> score=1.0 + assert results[0] == {"score": 1.0} + # input=2 + param=10 = 12 != target=11 -> score=0.0 + assert results[1] == {"score": 0.0} + + +def test_evaluate_vector_empty_inputs(): + """Empty inputs produce empty results.""" + agent = SimpleAgent(0) + guide = MultiMetricGuide(target=0) + + results = evaluate_vector(agent, guide, [], [], num_threads=1) + assert results == [] + + +# --------------------------------------------------------------------------- +# aggregate_vector_scores +# --------------------------------------------------------------------------- + +def test_aggregate_basic(): + """Per-metric mean is computed correctly.""" + score_dicts = [ + {"accuracy": 1.0, "brevity": 0.5}, + {"accuracy": 0.0, "brevity": 1.0}, + ] + agg = aggregate_vector_scores(score_dicts) + assert agg["accuracy"] == pytest.approx(0.5) + assert agg["brevity"] == pytest.approx(0.75) + + +def test_aggregate_empty(): + """Empty input returns empty dict.""" + assert aggregate_vector_scores([]) == {} + + +def test_aggregate_single(): + """Single dict returns the same values.""" + score_dicts = [{"a": 0.42, "b": 0.99}] + agg = aggregate_vector_scores(score_dicts) + assert agg == {"a": pytest.approx(0.42), "b": pytest.approx(0.99)} + + +def test_aggregate_missing_keys(): + """Handles dicts with partially overlapping keys.""" + score_dicts = [ + {"accuracy": 1.0}, + {"accuracy": 0.0, "brevity": 0.8}, + ] + agg = aggregate_vector_scores(score_dicts) + assert agg["accuracy"] == pytest.approx(0.5) + # brevity only present in one dict + assert agg["brevity"] == pytest.approx(0.8) diff --git a/tests/unit_tests/test_objectives.py b/tests/unit_tests/test_objectives.py new file mode 100644 index 00000000..04fbccc2 --- /dev/null +++ b/tests/unit_tests/test_objectives.py @@ -0,0 +1,383 @@ +"""Tests for opto.trainer.objectives — 
ObjectiveConfig and selection utilities.""" +import pytest +import numpy as np +from opto.trainer.objectives import ( + ObjectiveConfig, normalize_score, apply_minimize, weighted_scalarize, + dominates, pareto_rank, select_best, select_top_k, +) + + +# --------------------------------------------------------------------------- +# normalize_score +# --------------------------------------------------------------------------- + +def test_normalize_score_float(): + assert normalize_score(0.85) == {"score": 0.85} + + +def test_normalize_score_zero(): + assert normalize_score(0.0) == {"score": 0.0} + + +def test_normalize_score_int(): + assert normalize_score(1) == {"score": 1.0} + + +def test_normalize_score_int_zero(): + assert normalize_score(0) == {"score": 0.0} + + +def test_normalize_score_bool_true(): + assert normalize_score(True) == {"score": 1.0} + + +def test_normalize_score_bool_false(): + assert normalize_score(False) == {"score": 0.0} + + +def test_normalize_score_dict(): + result = normalize_score({"acc": 0.9, "lat": 50.0}) + assert result == {"acc": 0.9, "lat": 50.0} + + +def test_normalize_score_dict_with_int_values(): + result = normalize_score({"acc": 1, "lat": 0}) + assert result == {"acc": 1.0, "lat": 0.0} + + +def test_normalize_score_empty_dict_raises(): + with pytest.raises(ValueError, match="must not be empty"): + normalize_score({}) + + +def test_normalize_score_nan_raises(): + with pytest.raises(ValueError, match="finite"): + normalize_score({"a": float("nan")}) + + +def test_normalize_score_inf_raises(): + with pytest.raises(ValueError, match="finite"): + normalize_score(float("inf")) + + +def test_normalize_score_neg_inf_raises(): + with pytest.raises(ValueError, match="finite"): + normalize_score(float("-inf")) + + +def test_normalize_score_string_raises(): + with pytest.raises(TypeError, match="str"): + normalize_score("bad") + + +def test_normalize_score_none_raises(): + with pytest.raises(TypeError): + normalize_score(None) + + +# --------------------------------------------------------------------------- +# apply_minimize +# --------------------------------------------------------------------------- + +def test_apply_minimize_negates(): + result = apply_minimize({"acc": 0.9, "lat": 100.0}, frozenset({"lat"})) + assert result == {"acc": 0.9, "lat": -100.0} + + +def test_apply_minimize_empty_set(): + result = apply_minimize({"acc": 0.9, "lat": 100.0}, frozenset()) + assert result == {"acc": 0.9, "lat": 100.0} + + +def test_apply_minimize_all(): + result = apply_minimize({"a": 1.0, "b": 2.0}, frozenset({"a", "b"})) + assert result == {"a": -1.0, "b": -2.0} + + +# --------------------------------------------------------------------------- +# weighted_scalarize +# --------------------------------------------------------------------------- + +def test_weighted_scalarize_basic(): + result = weighted_scalarize({"a": 0.8, "b": 0.2}, {"a": 0.7, "b": 0.3}) + assert result == pytest.approx(0.7 * 0.8 + 0.3 * 0.2) + + +def test_weighted_scalarize_empty_weights(): + result = weighted_scalarize({"a": 1.0, "b": 2.0}, {}) + assert result == pytest.approx(3.0) # equal weight 1.0 each + + +def test_weighted_scalarize_missing_metric(): + result = weighted_scalarize({"a": 1.0}, {"a": 0.5, "b": 0.5}, missing_value=0.0) + assert result == pytest.approx(0.5) # 0.5*1.0 + 0.5*0.0 + + +def test_weighted_scalarize_ignores_extra_metrics(): + result = weighted_scalarize({"a": 1.0, "b": 2.0, "c": 99.0}, {"a": 1.0}) + assert result == pytest.approx(1.0) # only "a" is weighted + + +# 
--------------------------------------------------------------------------- +# dominates +# --------------------------------------------------------------------------- + +def test_dominates_yes(): + assert dominates({"a": 2.0, "b": 2.0}, {"a": 1.0, "b": 1.0}) is True + + +def test_dominates_yes_one_equal(): + assert dominates({"a": 2.0, "b": 1.0}, {"a": 1.0, "b": 1.0}) is True + + +def test_dominates_no_equal(): + assert dominates({"a": 1.0, "b": 1.0}, {"a": 1.0, "b": 1.0}) is False + + +def test_dominates_no_tradeoff(): + assert dominates({"a": 2.0, "b": 0.5}, {"a": 1.0, "b": 1.0}) is False + + +def test_dominates_with_metric_subset(): + assert dominates({"a": 2.0, "b": 0.5}, {"a": 1.0, "b": 1.0}, + metrics=("a",)) is True + + +# --------------------------------------------------------------------------- +# pareto_rank +# --------------------------------------------------------------------------- + +def test_pareto_rank_clear_hierarchy(): + candidates = [ + {"a": 3.0, "b": 3.0}, # dominates everything -> rank 0 + {"a": 2.0, "b": 2.0}, # dominated by [0] -> rank 1 + {"a": 1.0, "b": 1.0}, # dominated by [0],[1] -> rank 2 + ] + ranks = pareto_rank(candidates) + assert ranks == [0, 1, 2] + + +def test_pareto_rank_all_nondominated(): + candidates = [ + {"a": 3.0, "b": 1.0}, + {"a": 1.0, "b": 3.0}, + {"a": 2.0, "b": 2.0}, + ] + ranks = pareto_rank(candidates) + # All are tradeoffs — none dominates another + assert ranks == [0, 0, 0] + + +def test_pareto_rank_mixed(): + candidates = [ + {"a": 3.0, "b": 1.0}, # front 0 + {"a": 1.0, "b": 3.0}, # front 0 + {"a": 0.5, "b": 0.5}, # dominated by both -> rank 1 + ] + ranks = pareto_rank(candidates) + assert ranks[0] == 0 + assert ranks[1] == 0 + assert ranks[2] == 1 + + +# --------------------------------------------------------------------------- +# select_best +# --------------------------------------------------------------------------- + +def test_select_best_none_config(): + candidates = [(0.5, "A"), (0.9, "B"), (0.7, "C")] + assert select_best(candidates, None) == 1 + + +def test_select_best_scalar_mode(): + config = ObjectiveConfig(mode="scalar") + candidates = [(0.5, "A"), (0.9, "B"), (0.7, "C")] + assert select_best(candidates, config) == 1 + + +def test_select_best_scalar_with_dict_scores(): + """Scalar mode with dict scores uses mean of values.""" + config = ObjectiveConfig(mode="scalar") + candidates = [ + ({"a": 0.5, "b": 0.3}, "X"), # mean = 0.4 + ({"a": 0.8, "b": 0.6}, "Y"), # mean = 0.7 + ] + assert select_best(candidates, config) == 1 + + +def test_select_best_weighted(): + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 0.8, "latency_s": 0.2}, + minimize=frozenset({"latency_s"}), + ) + candidates = [ + ({"accuracy": 0.95, "latency_s": 0.200}, "A"), # 0.8*0.95 + 0.2*(-0.2) = 0.72 + ({"accuracy": 0.70, "latency_s": 0.030}, "B"), # 0.8*0.70 + 0.2*(-0.03) = 0.554 + ] + assert select_best(candidates, config) == 0 + + +def test_select_best_weighted_latency_heavy(): + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 0.2, "latency_s": 0.8}, + minimize=frozenset({"latency_s"}), + ) + candidates = [ + ({"accuracy": 0.95, "latency_s": 0.200}, "A"), # 0.2*0.95 + 0.8*(-0.2) = 0.03 + ({"accuracy": 0.70, "latency_s": 0.030}, "B"), # 0.2*0.70 + 0.8*(-0.03) = 0.116 + ] + assert select_best(candidates, config) == 1 + + +def test_select_best_pareto_tiebreak_weighted(): + config = ObjectiveConfig( + mode="pareto", + weights={"a": 0.5, "b": 0.5}, + tie_break="weighted", + ) + candidates = [ + ({"a": 0.9, "b": 0.1}, 
"X"), # front 0, weighted = 0.5 + ({"a": 0.1, "b": 0.9}, "Y"), # front 0, weighted = 0.5 + ({"a": 0.6, "b": 0.6}, "Z"), # front 0, weighted = 0.6 -> winner + ] + assert select_best(candidates, config) == 2 + + +def test_select_best_pareto_deterministic(): + config = ObjectiveConfig( + mode="pareto", + weights={"a": 0.5, "b": 0.5}, + tie_break="weighted", + seed=42, + ) + candidates = [ + ({"a": 0.9, "b": 0.1}, "X"), + ({"a": 0.1, "b": 0.9}, "Y"), + ] + results = [select_best(candidates, config) for _ in range(10)] + assert len(set(results)) == 1 # same result every time + + +def test_select_best_pareto_random_seeded_deterministic(): + config = ObjectiveConfig( + mode="pareto", + tie_break="random_seeded", + seed=42, + ) + candidates = [ + ({"a": 0.9, "b": 0.1}, "X"), + ({"a": 0.1, "b": 0.9}, "Y"), + ] + results = [select_best(candidates, config) for _ in range(20)] + assert len(set(results)) == 1 + + +def test_select_best_pareto_different_seeds_may_differ(): + results = set() + for seed in range(50): + config = ObjectiveConfig( + mode="pareto", + tie_break="random_seeded", + seed=seed, + ) + candidates = [ + ({"a": 0.9, "b": 0.1}, "X"), + ({"a": 0.1, "b": 0.9}, "Y"), + ] + results.add(select_best(candidates, config)) + # With 50 different seeds across 2 candidates, we expect both to appear + assert len(results) == 2 + + +# --------------------------------------------------------------------------- +# select_top_k +# --------------------------------------------------------------------------- + +def test_select_top_k_scalar_none_config(): + candidates = [(0.5, "A"), (0.9, "B"), (0.7, "C")] + indices = select_top_k(candidates, None, k=2) + assert len(indices) == 2 + assert indices[0] == 1 # B is best + assert indices[1] == 2 # C is second + + +@pytest.mark.parametrize("k", [1, 2, 3]) +def test_select_top_k_scalar_k(k): + candidates = [(0.5, "A"), (0.9, "B"), (0.7, "C")] + indices = select_top_k(candidates, None, k=k) + assert len(indices) == k + assert indices[0] == 1 # B always best + + +def test_select_top_k_weighted(): + config = ObjectiveConfig( + mode="weighted", + weights={"a": 1.0, "b": 1.0}, + ) + candidates = [ + ({"a": 0.5, "b": 0.5}, "X"), # weighted = 1.0 + ({"a": 0.9, "b": 0.1}, "Y"), # weighted = 1.0 + ({"a": 0.8, "b": 0.8}, "Z"), # weighted = 1.6 + ] + indices = select_top_k(candidates, config, k=2) + assert indices[0] == 2 # Z is best + + +def test_select_top_k_pareto(): + config = ObjectiveConfig( + mode="pareto", + weights={"a": 0.5, "b": 0.5}, + tie_break="weighted", + ) + candidates = [ + ({"a": 0.9, "b": 0.1}, "X"), # front 0 + ({"a": 0.1, "b": 0.9}, "Y"), # front 0 + ({"a": 0.05, "b": 0.05}, "Z"), # front 1 (dominated) + ] + indices = select_top_k(candidates, config, k=2) + assert set(indices) == {0, 1} # both front-0 candidates + + +# --------------------------------------------------------------------------- +# ObjectiveConfig validation +# --------------------------------------------------------------------------- + +def test_config_default(): + config = ObjectiveConfig() + assert config.mode == "scalar" + assert config.weights == {} + assert config.minimize == frozenset() + + +def test_config_set_to_frozenset(): + config = ObjectiveConfig(minimize={"lat"}) + assert isinstance(config.minimize, frozenset) + assert "lat" in config.minimize + + +def test_config_negative_weight_raises(): + with pytest.raises(ValueError, match="non-negative"): + ObjectiveConfig(weights={"a": -1.0}) + + +def test_config_bad_mode_raises(): + with pytest.raises(ValueError, match="mode"): + 
ObjectiveConfig(mode="unknown") + + +def test_config_bad_tie_break_raises(): + with pytest.raises(ValueError, match="tie_break"): + ObjectiveConfig(tie_break="bad") + + +def test_config_empty_pareto_metrics_raises(): + with pytest.raises(ValueError, match="non-empty"): + ObjectiveConfig(pareto_metrics=()) + + +def test_config_frozen(): + config = ObjectiveConfig() + with pytest.raises(AttributeError): + config.mode = "weighted" From 45901029613159d83f18819ff6062541d316d734 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Thu, 12 Feb 2026 12:18:05 -0400 Subject: [PATCH 05/20] T6 M1: Fix Colab install cell for Python 3.12 compatibility --- examples/notebooks/t6_m1_vector_scores.ipynb | 576 +++++++++++++++++-- 1 file changed, 530 insertions(+), 46 deletions(-) diff --git a/examples/notebooks/t6_m1_vector_scores.ipynb b/examples/notebooks/t6_m1_vector_scores.ipynb index 637322d0..6363aee2 100644 --- a/examples/notebooks/t6_m1_vector_scores.ipynb +++ b/examples/notebooks/t6_m1_vector_scores.ipynb @@ -6,23 +6,7 @@ "id": "a0000001", "metadata": {}, "outputs": [], - "source": [ - "\"\"\"\n", - "T6 Milestone 1 — Multi-Objective Vector Scores\n", - "\n", - "This notebook is the M1 deliverable for the T6 Multi-Objective Vector Scores project.\n", - "It demonstrates:\n", - " 1. ObjectiveConfig creation and validation\n", - " 2. MultiMetricGuide with get_score_dict()\n", - " 3. evaluate_vector() + aggregate_vector_scores()\n", - " 4. Full BasicSearchAlgorithm.train() with DummyLLM + objective_config\n", - " 5. Scalar baseline comparison (backward compat)\n", - " 6. Pareto mode demo + deterministic tiebreak\n", - "\n", - "Part A runs end-to-end WITHOUT API keys (StubLLM / DummyLLM).\n", - "Part B requires an OpenRouter API key (Colab secrets or environment variable).\n", - "\"\"\"" - ] + "source": "!git clone https://github.com/carlosrod723/OpenTrace.git Trace\n%cd Trace\n!git checkout t6-multi-objective-m0\n!sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n!pip install -e ." 
}, { "cell_type": "markdown", @@ -72,7 +56,7 @@ "id": "a0000004", "metadata": {}, "outputs": [], - "source": "import sys, os\n\n# Ensure OpenTrace root is on the path (needed when running from examples/notebooks/)\n_repo_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))\nif os.path.isdir(os.path.join(_repo_root, 'opto')):\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n# Also handle running directly from the repo root\nif os.path.isdir(os.path.join(os.getcwd(), 'opto')):\n if os.getcwd() not in sys.path:\n sys.path.insert(0, os.getcwd())\n\nimport numpy as np\nfrom typing import Dict, Tuple, Optional\n\nprint(\"=\" * 70)\nprint(\"T6 M1 \\u2014 Multi-Objective Vector Scores\")\nprint(\"=\" * 70)" + "source": "import numpy as np\nfrom typing import Dict, Tuple, Optional\n\nprint(\"=\" * 70)\nprint(\"T6 M1 \\u2014 Multi-Objective Vector Scores\")\nprint(\"=\" * 70)" }, { "cell_type": "markdown", @@ -87,10 +71,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a0000006", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- ObjectiveConfig defaults ---\n", + " mode=scalar, weights={}, minimize=frozenset()\n", + "\n", + "--- ObjectiveConfig: weighted mode ---\n", + " mode=weighted\n", + " weights={'accuracy': 0.8, 'latency_s': 0.2}\n", + " minimize=frozenset({'latency_s'})\n", + "\n", + "--- ObjectiveConfig: Pareto mode ---\n", + " mode=pareto, tie_break=weighted, seed=42\n", + "\n", + "--- ObjectiveConfig: set auto-converts to frozenset ---\n", + " type(minimize)=frozenset (auto-converted from set)\n", + "\n", + "--- Validation: negative weight ---\n", + " Caught: Weight for 'a' must be non-negative, got -0.5\n", + "\n", + "--- Validation: bad mode ---\n", + " Caught: mode must be 'scalar', 'weighted', or 'pareto', got 'unknown'\n", + "\n", + "--- Frozen (immutable) ---\n", + " Caught: cannot assign to field 'mode'\n", + "\n", + "ObjectiveConfig validation: all checks passed.\n" + ] + } + ], "source": [ "from opto.trainer.objectives import (\n", " ObjectiveConfig, normalize_score, apply_minimize,\n", @@ -157,10 +172,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "a0000008", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Training path: get_feedback() -> (float, str) ---\n", + " score=1.0 (type=float)\n", + " feedback='Expected '4', got '4'. 
Correct!'\n", + "\n", + "--- Selection path: get_score_dict() -> Dict[str, float] ---\n", + " score_dict={'accuracy': 1.0, 'brevity': 1.0}\n", + "\n", + "--- metric() still returns float (backward compat) ---\n", + " metric()=1.0 (type=float)\n", + "\n", + "--- Base Guide without get_score_dict override wraps scalar ---\n", + " get_score_dict()={'score': 0.75}\n", + " (wrapped as {{'score': 0.75}} automatically)\n" + ] + } + ], "source": [ "from opto.trainer.guide import Guide\n", "\n", @@ -228,10 +263,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "a0000010", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- evaluate_vector() ---\n", + "Evaluating 3 examples (vector) (Running sequentially).\n", + " Example 0: {'accuracy': 1.0, 'brevity': 1.0}\n", + " Example 1: {'accuracy': 1.0, 'brevity': 1.0}\n", + " Example 2: {'accuracy': 1.0, 'brevity': 1.0}\n", + "\n", + "--- aggregate_vector_scores() ---\n", + " Aggregated (per-metric mean): {'accuracy': 1.0, 'brevity': 1.0}\n", + "\n", + "--- Wrong answer agent ---\n", + "Evaluating 3 examples (vector) (Running sequentially).\n", + " Example 0: {'accuracy': 0.0, 'brevity': 0.25}\n", + " Example 1: {'accuracy': 0.0, 'brevity': 0.25}\n", + " Example 2: {'accuracy': 0.0, 'brevity': 0.25}\n", + " Aggregated: {'accuracy': 0.0, 'brevity': 0.25}\n" + ] + } + ], "source": [ "from opto import trace\n", "from opto.trainer.evaluators import evaluate_vector, aggregate_vector_scores\n", @@ -282,10 +339,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "a0000012", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Candidates:\n", + " prompt_A: {'accuracy': 0.95, 'latency_s': 0.2}\n", + " prompt_B: {'accuracy': 0.7, 'latency_s': 0.03}\n", + " prompt_C: {'accuracy': 0.88, 'latency_s': 0.08}\n", + " prompt_D: {'accuracy': 0.6, 'latency_s': 0.02}\n", + "\n", + "--- select_best(config=None) [scalar, backward-compat] ---\n", + " Winner: prompt_A (index 0)\n", + "\n", + "--- select_best(weighted, accuracy=0.8) ---\n", + " Winner: prompt_A (index 0)\n", + "\n", + "--- select_best(weighted, latency_s=0.8) ---\n", + " Winner: prompt_B (index 1)\n", + "\n", + "--- select_best(pareto, tie_break=weighted) ---\n", + " Pareto ranks: [0, 0, 0, 0]\n", + " Front (rank 0): ['prompt_A', 'prompt_B', 'prompt_C', 'prompt_D']\n", + " Winner (after tie-break): prompt_C (index 2)\n", + "\n", + "--- Determinism: 10 runs with same config ---\n", + " Results: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", + " All identical: True\n", + "\n", + "--- select_top_k(pareto, k=2) ---\n", + " Top 2: ['prompt_C', 'prompt_A']\n" + ] + } + ], "source": [ "# Candidates: (score_dict, payload) tuples\n", "candidates = [\n", @@ -361,10 +451,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "a0000014", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "TRAINING: Scalar baseline (objective_config=None)\n", + "======================================================================\n", + "Evaluating agent (iteration 0) (Running sequentially).\n", + "[Step 0] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating 
proposals (Running sequentially).\n", + "[Step 0] \u001b[92mValidation score: 0.0\u001b[0m\n", + "Evaluating agent (iteration 1) (Running sequentially).\n", + "[Step 1] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Instantaneous train score: 0.0\n", + "[Step 1] Average train score: 0.0\n", + "[Step 1] \u001b[91mParameter: str:2: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 1] \u001b[92mValidation score: 0.0\u001b[0m\n", + "Evaluating agent (iteration 2) (Running sequentially).\n", + "[Step 2] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 2\n", + "[Step 2] Instantaneous train score: 0.0\n", + "[Step 2] Average train score: 0.0\n", + "[Step 2] \u001b[91mParameter: str:2: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 2] \u001b[92mValidation score: 0.0\u001b[0m\n", + "Evaluating agent (iteration 3) (Running sequentially).\n", + "[Step 3] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 3\n", + "[Step 3] Instantaneous train score: 0.0\n", + "[Step 3] Average train score: 0.0\n", + "[Step 3] \u001b[91mParameter: str:2: You are a helpful assistant.\u001b[0m\n", + "\n", + "Scalar training scores: [np.float64(0.0), np.float64(0.0), np.float64(0.0)]\n", + "current_score: 0.0\n", + "current_score_dict: None\n", + "(current_score_dict is None because scalar mode does not use vector path)\n" + ] + } + ], "source": [ "from opto.utils.llm import DummyLLM\n", "from opto.optimizers import OptoPrimeV2\n", @@ -443,10 +578,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "a0000016", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "TRAINING: Weighted mode (objective_config.mode='weighted')\n", + "======================================================================\n", + "Evaluating agent (iteration 0) (Running sequentially).\n", + "[Step 0] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 0] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 1) (Running sequentially).\n", + "[Step 1] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Instantaneous train score: 0.0\n", + "[Step 1] Average train score: 0.0\n", + "[Step 1] \u001b[91mParameter: str:9: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 1] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 2) (Running sequentially).\n", + "[Step 2] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 2\n", + "[Step 2] Instantaneous train score: 0.0\n", + "[Step 2] Average train score: 0.0\n", + "[Step 2] \u001b[91mParameter: str:9: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 2] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 3) (Running sequentially).\n", + "[Step 3] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 3\n", + "[Step 3] Instantaneous train score: 0.0\n", + "[Step 3] Average train score: 0.0\n", + "[Step 3] \u001b[91mParameter: str:9: You are a helpful assistant.\u001b[0m\n", + "\n", + "Weighted training scores: [np.float64(0.0), np.float64(0.0), np.float64(0.0)]\n", + "current_score (float): 0.00819672131147541\n", + "current_score_dict: {'accuracy': 0.0, 'brevity': 0.01639344262295082}\n", + "(current_score_dict stores the vector score selected by weighted mode)\n" + ] + } + ], "source": [ "print(\"=\" * 70)\n", "print(\"TRAINING: Weighted mode (objective_config.mode='weighted')\")\n", @@ -490,10 +676,101 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "a0000018", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "TRAINING: Pareto mode (objective_config.mode='pareto')\n", + "======================================================================\n", + "Evaluating agent (iteration 0) (Running sequentially).\n", + "[Step 0] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 0] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 1) (Running sequentially).\n", + "[Step 1] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Instantaneous train score: 0.0\n", + "[Step 1] Average train score: 0.0\n", + "[Step 1] \u001b[91mParameter: str:16: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 1] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 2) (Running sequentially).\n", + "[Step 2] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 2\n", + "[Step 2] Instantaneous train score: 0.0\n", + "[Step 2] Average train score: 0.0\n", + "[Step 2] \u001b[91mParameter: str:16: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 2] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 3) (Running sequentially).\n", + "[Step 3] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 3\n", + "[Step 3] Instantaneous train score: 0.0\n", + "[Step 3] Average train score: 0.0\n", + "[Step 3] \u001b[91mParameter: str:16: You are a helpful assistant.\u001b[0m\n", + "\n", + "Pareto training scores: [np.float64(0.0), np.float64(0.0), np.float64(0.0)]\n", + "current_score (float): 0.00819672131147541\n", + "current_score_dict: {'accuracy': 0.0, 'brevity': 0.01639344262295082}\n", + "\n", + "--- Determinism: re-run with same seed ---\n", + "Evaluating agent (iteration 0) (Running sequentially).\n", + "[Step 0] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 0] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 1) (Running sequentially).\n", + "[Step 1] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Instantaneous train score: 0.0\n", + "[Step 1] Average train score: 0.0\n", + "[Step 1] \u001b[91mParameter: str:23: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 1] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 2) (Running sequentially).\n", + "[Step 2] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 2\n", + "[Step 2] Instantaneous train score: 0.0\n", + "[Step 2] Average train score: 0.0\n", + "[Step 2] \u001b[91mParameter: str:23: You are a helpful assistant.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "[Step 2] \u001b[92mValidation score: 0.00819672131147541\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/accuracy: 0.0\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/brevity: 0.01639344262295082\u001b[0m\n", + "Evaluating agent (iteration 3) (Running sequentially).\n", + "[Step 3] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 3\n", + "[Step 3] Instantaneous train score: 0.0\n", + "[Step 3] Average train score: 0.0\n", + "[Step 3] \u001b[91mParameter: str:23: You are a helpful assistant.\u001b[0m\n", + "Run 1 current_score_dict: {'accuracy': 0.0, 'brevity': 0.01639344262295082}\n", + "Run 2 current_score_dict: {'accuracy': 0.0, 'brevity': 0.01639344262295082}\n", + "Deterministic: True\n" + ] + } + ], "source": [ "print(\"=\" * 70)\n", "print(\"TRAINING: Pareto mode (objective_config.mode='pareto')\")\n", @@ -559,10 +836,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "a0000020", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "PART A COMPLETE — StubLLM Section\n", + "======================================================================\n", + "\n", + "Verified:\n", + " ✓ ObjectiveConfig creation, validation, and immutability\n", + " ✓ MultiMetricGuide: get_feedback() -> (float, str) for training loop\n", + " ✓ MultiMetricGuide: get_score_dict() -> Dict[str, float] for selection path\n", + " ✓ evaluate_vector() returns List[Dict[str, float]]\n", + " ✓ aggregate_vector_scores() computes per-metric means\n", + " ✓ select_best(): scalar, weighted, Pareto modes all work\n", + " ✓ BasicSearch training: scalar baseline (objective_config=None)\n", + " ✓ BasicSearch training: weighted mode with vector score selection\n", + " ✓ BasicSearch training: Pareto mode with deterministic tie-break\n", + " ✓ current_score stays float, current_score_dict stores vector\n", + "\n" + ] + } + ], "source": [ "print(\"\\n\" + \"=\" * 70)\n", "print(\"PART A COMPLETE — StubLLM Section\")\n", @@ -601,10 +902,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "a0000022", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API key loaded from environment variable.\n", + "CustomLLM configured for OpenRouter (google/gemini-2.0-flash-001).\n" + ] + } + ], "source": [ "import os\n", "\n", @@ -649,10 +959,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "a0000023", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Smoke test: real LLM call ---\n", + " Response: 4\n", + "\n", + " LLM connection verified.\n" + ] + } + ], "source": [ "# Skip this cell if no API key\n", "if not api_key:\n", @@ -673,10 +994,74 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "a0000024", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "REAL LLM TRAINING: Weighted mode with multi-metric guide\n", + "======================================================================\n", + "Evaluating agent (iteration 0) (Running sequentially).\n", + "[Step 0] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 0] \u001b[92mValidation score: 0.75\u001b[0m\n", + "[Step 0] \u001b[92mValidation 
score/accuracy: 1.0\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/brevity: 0.5\u001b[0m\n", + "Checking improvement (iteration 0) (Running sequentially).\n", + "\u001b[92mUpdate accepted: Current score 0.0, New score 1.0\u001b[0m\n", + "Evaluating agent (iteration 1) (Running sequentially).\n", + "[Step 1] \u001b[92mAverage test score: 1.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Instantaneous train score: 0.0\n", + "[Step 1] Average train score: 0.0\n", + "[Step 1] \u001b[91mParameter: str:30: You are a helpful assistant. Your task is to calculate the answer to the question. You should respond with the numerical answer only.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 1] \u001b[92mValidation score: 0.75\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/accuracy: 1.0\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/brevity: 0.5\u001b[0m\n", + "Checking improvement (iteration 1) (Running sequentially).\n", + "\u001b[91mUpdate rejected: Current score 1.0, New score 1.0\u001b[0m\n", + "Evaluating agent (iteration 2) (Running sequentially).\n", + "[Step 2] \u001b[92mAverage test score: 1.0\u001b[0m\n", + "Epoch: 0. Iteration: 2\n", + "[Step 2] Instantaneous train score: 1.0\n", + "[Step 2] Average train score: 0.5\n", + "[Step 2] \u001b[91mParameter: str:30: You are a helpful assistant. Your task is to calculate the answer to the question. You should respond with the numerical answer only.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 2] \u001b[92mValidation score: 0.75\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/accuracy: 1.0\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/brevity: 0.5\u001b[0m\n", + "Checking improvement (iteration 2) (Running sequentially).\n", + "\u001b[91mUpdate rejected: Current score 1.0, New score 1.0\u001b[0m\n", + "Evaluating agent (iteration 3) (Running sequentially).\n", + "[Step 3] \u001b[92mAverage test score: 1.0\u001b[0m\n", + "Epoch: 0. Iteration: 3\n", + "[Step 3] Instantaneous train score: 1.0\n", + "[Step 3] Average train score: 0.6666666666666666\n", + "[Step 3] \u001b[91mParameter: str:30: You are a helpful assistant. Your task is to calculate the answer to the question. You should respond with the numerical answer only.\u001b[0m\n", + "\n", + "Real LLM training scores: [np.float64(0.0), np.float64(1.0), np.float64(1.0)]\n", + "current_score (float): 0.75\n", + "current_score_dict: {'accuracy': 1.0, 'brevity': 0.5}\n", + "\n", + "Final system prompt: You are a helpful assistant. Your task is to calculate the answer to the question. 
You should respond with the numerical answer only.\n" + ] + } + ], "source": [ "# Real LLM training with weighted multi-objective selection\n", "if not api_key:\n", @@ -722,10 +1107,75 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "a0000025", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "REAL LLM TRAINING: Pareto mode for comparison\n", + "======================================================================\n", + "Evaluating agent (iteration 0) (Running sequentially).\n", + "[Step 0] \u001b[92mAverage test score: 0.0\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 0] \u001b[92mValidation score: 0.75\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/accuracy: 1.0\u001b[0m\n", + "[Step 0] \u001b[92mValidation score/brevity: 0.5\u001b[0m\n", + "Checking improvement (iteration 0) (Running sequentially).\n", + "\u001b[92mUpdate accepted: Current score 0.0, New score 1.0\u001b[0m\n", + "Evaluating agent (iteration 1) (Running sequentially).\n", + "[Step 1] \u001b[92mAverage test score: 1.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Instantaneous train score: 0.0\n", + "[Step 1] Average train score: 0.0\n", + "[Step 1] \u001b[91mParameter: str:37: You are a helpful assistant. Your task is to answer math questions. You should only provide the numerical answer without any explanation or problem description.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 1] \u001b[92mValidation score: 0.75\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/accuracy: 1.0\u001b[0m\n", + "[Step 1] \u001b[92mValidation score/brevity: 0.5\u001b[0m\n", + "Checking improvement (iteration 1) (Running sequentially).\n", + "\u001b[91mUpdate rejected: Current score 1.0, New score 1.0\u001b[0m\n", + "Evaluating agent (iteration 2) (Running sequentially).\n", + "[Step 2] \u001b[92mAverage test score: 1.0\u001b[0m\n", + "Epoch: 0. Iteration: 2\n", + "[Step 2] Instantaneous train score: 1.0\n", + "[Step 2] Average train score: 0.5\n", + "[Step 2] \u001b[91mParameter: str:37: You are a helpful assistant. Your task is to answer math questions. 
You should only provide the numerical answer without any explanation or problem description.\u001b[0m\n", + "Forward pass (batch size: 1) (Running sequentially).\n", + "Generating 2 proposals (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "Validating proposals (vector) (Running sequentially).\n", + "[Step 2] \u001b[92mValidation score: 0.8333333333333333\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/accuracy: 1.0\u001b[0m\n", + "[Step 2] \u001b[92mValidation score/brevity: 0.6666666666666666\u001b[0m\n", + "Checking improvement (iteration 2) (Running sequentially).\n", + "\u001b[91mUpdate rejected: Current score 1.0, New score 1.0\u001b[0m\n", + "Evaluating agent (iteration 3) (Running sequentially).\n", + "[Step 3] \u001b[92mAverage test score: 1.0\u001b[0m\n", + "Epoch: 0. Iteration: 3\n", + "[Step 3] Instantaneous train score: 1.0\n", + "[Step 3] Average train score: 0.6666666666666666\n", + "[Step 3] \u001b[91mParameter: str:37: You are a helpful assistant. Your task is to answer math questions. You should only provide the numerical answer without any explanation or problem description.\u001b[0m\n", + "\n", + "Pareto training scores: [np.float64(0.0), np.float64(1.0), np.float64(1.0)]\n", + "current_score_dict: {'accuracy': 1.0, 'brevity': 0.6666666666666666}\n", + "\n", + "--- Comparison ---\n", + "Weighted mode final: {'accuracy': 1.0, 'brevity': 0.5}\n", + "Pareto mode final: {'accuracy': 1.0, 'brevity': 0.6666666666666666}\n" + ] + } + ], "source": [ "# Real LLM: Pareto mode comparison\n", "if not api_key:\n", @@ -768,10 +1218,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "a0000026", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "M1 NOTEBOOK COMPLETE\n", + "======================================================================\n", + "\n", + "Deliverables verified:\n", + " ✓ Part A (StubLLM): All cells run without API keys\n", + " - ObjectiveConfig creation + validation\n", + " - MultiMetricGuide with get_score_dict()\n", + " - evaluate_vector() + aggregate_vector_scores()\n", + " - BasicSearch: scalar, weighted, and Pareto modes\n", + " - Backward compatibility (objective_config=None)\n", + " - Deterministic tie-break verification\n", + "\n", + " ✓ Part B (Real LLM): Trained with actual model via OpenRouter\n", + " - Weighted and Pareto mode with real LLM proposals\n", + " - Multi-metric selection (accuracy + brevity)\n", + " - current_score_dict populated with real scores\n", + "\n" + ] + } + ], "source": [ "print(\"\\n\" + \"=\" * 70)\n", "print(\"M1 NOTEBOOK COMPLETE\")\n", @@ -796,13 +1272,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.11.0" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" } }, "nbformat": 4, From 3b8d2ededb29da7c695370185cec5d2beaad3c2f Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Thu, 12 Feb 2026 16:50:34 -0400 Subject: [PATCH 06/20] T6 M1: Fix scalar objective computation, document config=None fallback, add weight-sensitivity demo --- examples/notebooks/t6_m1_vector_scores.ipynb | 14 ++++ 
opto/trainer/algorithms/basic_algorithms.py | 25 +++++-- opto/trainer/objectives.py | 12 ++++ tests/unit_tests/test_objectives.py | 69 ++++++++++++++++++++ 4 files changed, 114 insertions(+), 6 deletions(-) diff --git a/examples/notebooks/t6_m1_vector_scores.ipynb b/examples/notebooks/t6_m1_vector_scores.ipynb index 6363aee2..acedde2d 100644 --- a/examples/notebooks/t6_m1_vector_scores.ipynb +++ b/examples/notebooks/t6_m1_vector_scores.ipynb @@ -441,6 +441,20 @@ "print(f\" Top 2: {[candidates[i][1] for i in top2]}\")" ] }, + { + "cell_type": "markdown", + "id": "h7x96u4z4dn", + "source": "### A.4b Weight Sensitivity Demonstration\n\nTwo candidates with a genuine tradeoff: A has higher accuracy, B has higher brevity.\nChanging the weights should flip the winner.", + "metadata": {} + }, + { + "cell_type": "code", + "id": "w0zvqaxrl98", + "source": "# Weight sensitivity: changing weights flips the winner\nfrom opto.trainer.objectives import ObjectiveConfig, select_best, weighted_scalarize\n\ncandidates = [\n ({\"accuracy\": 0.95, \"brevity\": 0.3}, \"candidate_A\"), # high accuracy, low brevity\n ({\"accuracy\": 0.70, \"brevity\": 0.9}, \"candidate_B\"), # low accuracy, high brevity\n]\n\nprint(\"Candidates:\")\nfor score, name in candidates:\n print(f\" {name}: {score}\")\n\n# Accuracy-heavy weights\nconfig_acc = ObjectiveConfig(mode=\"weighted\", weights={\"accuracy\": 0.9, \"brevity\": 0.1})\nwinner_acc = select_best(candidates, config_acc)\nscore_A_acc = weighted_scalarize(candidates[0][0], config_acc.weights)\nscore_B_acc = weighted_scalarize(candidates[1][0], config_acc.weights)\nprint(f\"\\n--- Accuracy-heavy (accuracy=0.9, brevity=0.1) ---\")\nprint(f\" A: 0.9*0.95 + 0.1*0.3 = {score_A_acc:.3f}\")\nprint(f\" B: 0.9*0.70 + 0.1*0.9 = {score_B_acc:.3f}\")\nprint(f\" Winner: {candidates[winner_acc][1]}\")\n\n# Brevity-heavy weights\nconfig_brev = ObjectiveConfig(mode=\"weighted\", weights={\"accuracy\": 0.1, \"brevity\": 0.9})\nwinner_brev = select_best(candidates, config_brev)\nscore_A_brev = weighted_scalarize(candidates[0][0], config_brev.weights)\nscore_B_brev = weighted_scalarize(candidates[1][0], config_brev.weights)\nprint(f\"\\n--- Brevity-heavy (accuracy=0.1, brevity=0.9) ---\")\nprint(f\" A: 0.1*0.95 + 0.9*0.3 = {score_A_brev:.3f}\")\nprint(f\" B: 0.1*0.70 + 0.9*0.9 = {score_B_brev:.3f}\")\nprint(f\" Winner: {candidates[winner_brev][1]}\")\n\n# Verify the flip\nassert winner_acc == 0, \"Accuracy-heavy should pick candidate_A\"\nassert winner_brev == 1, \"Brevity-heavy should pick candidate_B\"\nprint(f\"\\n✓ Weight sensitivity confirmed: accuracy-heavy → A, brevity-heavy → B\")", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, { "cell_type": "markdown", "id": "a0000013", diff --git a/opto/trainer/algorithms/basic_algorithms.py b/opto/trainer/algorithms/basic_algorithms.py index 50ea7842..5edb3a3e 100644 --- a/opto/trainer/algorithms/basic_algorithms.py +++ b/opto/trainer/algorithms/basic_algorithms.py @@ -7,7 +7,17 @@ from opto.trainer.utils import batch_run, async_run from opto.optimizers.utils import print_color from opto.trainer.evaluators import evaluate, evaluate_vector, aggregate_vector_scores -from opto.trainer.objectives import ObjectiveConfig, select_best +from opto.trainer.objectives import ObjectiveConfig, select_best, apply_minimize, weighted_scalarize + + +def _objective_scalar(score_dict, config): + """Compute scalar objective consistent with selection mode. 
+ + Uses weighted_scalarize(apply_minimize(...)) so the logged scalar + reflects the same weights and minimize settings used for selection. + """ + minimized = apply_minimize(score_dict, config.minimize) + return weighted_scalarize(minimized, config.weights, config.missing_value) def standard_optimization_step(agent, x, guide, info, min_score=0): @@ -614,7 +624,7 @@ def validate_vector(): continue self.optimizer.update(update_dict) score_dict = validate_vector() - scalar_score = float(np.mean(list(score_dict.values()))) + scalar_score = _objective_scalar(score_dict, self.objective_config) candidates.append((scalar_score, update_dict)) vector_candidates.append((score_dict, update_dict)) self.optimizer.update(backup_dict) @@ -623,7 +633,7 @@ def validate_vector(): if self.current_score_dict is None: self.current_score_dict = validate_vector() if self.current_score is None: - self.current_score = float(np.mean(list(self.current_score_dict.values()))) + self.current_score = _objective_scalar(self.current_score_dict, self.objective_config) candidates.append((self.current_score, backup_dict)) vector_candidates.append((self.current_score_dict, backup_dict)) @@ -631,7 +641,7 @@ def validate_vector(): best_idx = select_best(vector_candidates, self.objective_config) best_score_dict = vector_candidates[best_idx][0] best_update = vector_candidates[best_idx][1] - best_score = float(np.mean(list(best_score_dict.values()))) + best_score = _objective_scalar(best_score_dict, self.objective_config) self.current_score = best_score self.current_score_dict = best_score_dict else: @@ -660,8 +670,11 @@ def validate_vector(): # Make the best update self.optimizer.update(best_update) - # Logging — always log scalar for backward compatibility - self.logger.log('Validation score', best_score, self.n_iters, color='green') + # Logging — scalar objective for backward compatibility + if use_vector: + self.logger.log('Validation objective', best_score, self.n_iters, color='green') + else: + self.logger.log('Validation score', best_score, self.n_iters, color='green') # Log individual vector metrics if available if use_vector and isinstance(best_score_dict, dict): diff --git a/opto/trainer/objectives.py b/opto/trainer/objectives.py index 3c21ca67..385e15b1 100644 --- a/opto/trainer/objectives.py +++ b/opto/trainer/objectives.py @@ -195,6 +195,12 @@ def select_best(candidates: List[Tuple[ScoreLike, Any]], Returns: Index of the best candidate. + + Notes: + When *config* is None or mode='scalar', dict scores are collapsed to + mean(values) for backward compatibility. For explicit multi-objective + control, pass an ObjectiveConfig with mode='weighted' or 'pareto' + and appropriate weights. """ if config is None or config.mode == "scalar": scores = [] @@ -256,6 +262,12 @@ def select_top_k(candidates: List[Tuple[ScoreLike, Any]], Same logic as select_best but returns *k* indices. For Pareto mode: rank-0 front first (up to k), then rank-1, etc. + + Notes: + When *config* is None or mode='scalar', dict scores are collapsed to + mean(values) for backward compatibility. For explicit multi-objective + control, pass an ObjectiveConfig with mode='weighted' or 'pareto' + and appropriate weights. 
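+
+        Example (sketch; the metric names here are invented, and "lat" is
+        a minimize metric, so it is negated before the weighted sum):
+
+            cands = [({"acc": 0.9, "lat": 0.2}, "A"),
+                     ({"acc": 0.7, "lat": 0.05}, "B")]
+            cfg = ObjectiveConfig(mode="weighted",
+                                  weights={"acc": 0.8, "lat": 0.2},
+                                  minimize={"lat"})
+            select_top_k(cands, cfg, k=2)
+            # weighted objectives: A -> 0.8*0.9 - 0.2*0.2 = 0.68
+            #                      B -> 0.8*0.7 - 0.2*0.05 = 0.55
+            # so index 0 (A) is ranked ahead of index 1 (B)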
""" if config is None or config.mode == "scalar": scores = [] diff --git a/tests/unit_tests/test_objectives.py b/tests/unit_tests/test_objectives.py index 04fbccc2..331eea1d 100644 --- a/tests/unit_tests/test_objectives.py +++ b/tests/unit_tests/test_objectives.py @@ -381,3 +381,72 @@ def test_config_frozen(): config = ObjectiveConfig() with pytest.raises(AttributeError): config.mode = "weighted" + + +# --------------------------------------------------------------------------- +# Objective scalar consistency (Fix 1 verification) +# --------------------------------------------------------------------------- + +def test_weighted_objective_not_mean(): + """Weighted objective uses weighted_scalarize, not mean(values). + + Xavier's example: score_dict={'accuracy':1.0,'brevity':0.5} with + weights={'accuracy':0.7,'brevity':0.3} should be 0.85, not 0.75 (mean). + """ + score_dict = {"accuracy": 1.0, "brevity": 0.5} + weights = {"accuracy": 0.7, "brevity": 0.3} + + objective = weighted_scalarize(apply_minimize(score_dict, frozenset()), weights) + assert objective == pytest.approx(0.85) # 0.7*1.0 + 0.3*0.5 + + naive_mean = float(np.mean(list(score_dict.values()))) + assert naive_mean == pytest.approx(0.75) + assert objective != pytest.approx(naive_mean) + + +def test_weighted_objective_stub_example(): + """StubLLM example: weighted objective differs from naive mean. + + score_dict={'accuracy':0.0,'brevity':0.01639...} with + weights={'accuracy':0.7,'brevity':0.3} should be ~0.00492, not ~0.00820. + """ + score_dict = {"accuracy": 0.0, "brevity": 0.01639344262295082} + weights = {"accuracy": 0.7, "brevity": 0.3} + + objective = weighted_scalarize(apply_minimize(score_dict, frozenset()), weights) + expected = 0.7 * 0.0 + 0.3 * 0.01639344262295082 + assert objective == pytest.approx(expected) # ~0.004918 + + naive_mean = float(np.mean(list(score_dict.values()))) + assert naive_mean == pytest.approx(0.00819672131147541) + assert objective != pytest.approx(naive_mean) + + +def test_weighted_objective_with_minimize(): + """Minimize metrics are negated before scalarization.""" + score_dict = {"accuracy": 0.95, "latency_s": 0.200} + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 0.8, "latency_s": 0.2}, + minimize=frozenset({"latency_s"}), + ) + + minimized = apply_minimize(score_dict, config.minimize) + assert minimized == {"accuracy": 0.95, "latency_s": -0.200} + + objective = weighted_scalarize(minimized, config.weights, config.missing_value) + assert objective == pytest.approx(0.8 * 0.95 + 0.2 * (-0.200)) # 0.72 + + +def test_weight_sensitivity_flips_winner(): + """Changing weights flips which candidate wins.""" + candidates = [ + ({"accuracy": 0.95, "brevity": 0.3}, "A"), # high acc, low brev + ({"accuracy": 0.70, "brevity": 0.9}, "B"), # low acc, high brev + ] + + config_acc = ObjectiveConfig(mode="weighted", weights={"accuracy": 0.9, "brevity": 0.1}) + assert select_best(candidates, config_acc) == 0 # A wins + + config_brev = ObjectiveConfig(mode="weighted", weights={"accuracy": 0.1, "brevity": 0.9}) + assert select_best(candidates, config_brev) == 1 # B wins From 7401ca2666d39ba1f5f37cf5775ddaf825b2b5c0 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Mon, 16 Feb 2026 11:29:47 -0400 Subject: [PATCH 07/20] T6 M1: Apply Ching-An review - to_score_dict rename, configurable scalarize_dict, aggregate to objectives.py --- docs/T6_technical_plan.md | 38 ++-- examples/notebooks/t6_m1_vector_scores.ipynb | 195 +------------------ opto/trainer/evaluators.py | 19 +- 
opto/trainer/objectives.py | 120 ++++++++++-- tests/unit_tests/test_objectives.py | 161 +++++++++++---- 5 files changed, 258 insertions(+), 275 deletions(-) diff --git a/docs/T6_technical_plan.md b/docs/T6_technical_plan.md index 87f3e764..45a3c2fd 100644 --- a/docs/T6_technical_plan.md +++ b/docs/T6_technical_plan.md @@ -172,7 +172,7 @@ Both converge to the same abstraction: **given a list of `(score, params)` pairs Isolate all multi-objective logic into one new module (`opto/trainer/objectives.py`) containing **pure functions**: ``` -normalize_score() → scalar ↔ dict conversion +to_score_dict() → scalar/dict to dict conversion (neutral name) apply_minimize() → flip signs for minimize metrics weighted_scalarize()→ dict → float via weighted sum pareto_rank() → dominance ranking + tie-break @@ -209,8 +209,8 @@ Trainer selection (objectives.py) The entire vector-score path is **opt-in**: 1. If `objective_config` is `None` → existing scalar path, no new code executed. -2. If guide returns `float` and `objective_config` is provided → `normalize_score()` wraps it as `{"score": float}`, weights default to `{"score": 1.0}`. -3. If guide returns `Dict[str, float]` and `objective_config` is `None` → `mean(values)` used as scalar fallback, preserving scalar selection. +2. If guide returns `float` and `objective_config` is provided → `to_score_dict()` wraps it as `{"score": float}`, weights default to `{"score": 1.0}`. +3. If guide returns `Dict[str, float]` and `objective_config` is `None` → `ValueError` is raised (no hidden hard-coded dict→scalar reduction). Pass an explicit `ObjectiveConfig(mode="scalar", scalarize_dict="mean")` to reduce via mean, or `scalarize_dict="score"` to use a single key. --- @@ -356,7 +356,7 @@ def aggregate_vector_scores(scores: list) -> Union[float, Dict[str, float]]: - If all scores are float: returns np.mean (existing behavior). - If all scores are dict: returns per-metric mean dict. - - Mixed float/dict: normalizes all to dict via normalize_score(), then averages. + - Mixed float/dict: normalizes all to dict via to_score_dict(), then averages. Args: scores: List of float or Dict[str, float] values. @@ -384,13 +384,14 @@ ScoreLike = Union[float, Dict[str, float]] # --- Pure utility functions --- -def normalize_score(score: ScoreLike) -> Dict[str, float]: - """Convert any score to dict form. +def to_score_dict(score: ScoreLike) -> Dict[str, float]: + """Convert any score to dict form (neutral name). - int/float/bool → {"score": float(value)} - Dict[str, float] → returned as-is (validated: all values finite) Handles int (LLMJudge returns 0/1) and bool (test guides) via isinstance(score, (int, float, bool)). 
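+
+    Example (sketch; the values mirror the unit tests added in this series):
+        to_score_dict(True)        -> {"score": 1.0}
+        to_score_dict({"acc": 1})  -> {"acc": 1.0}   (int values cast to float)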
+ Backward-compatible alias: `normalize_score = to_score_dict` Raises: TypeError: if score is not int, float, bool, or dict @@ -504,7 +505,7 @@ def select_top_k(candidates: List[Tuple[ScoreLike, any]], | File | Contents | Milestone | |------|----------|-----------| -| `opto/trainer/objectives.py` | `ObjectiveConfig`, `normalize_score`, `apply_minimize`, `weighted_scalarize`, `dominates`, `pareto_rank`, `select_best`, `select_top_k` | M1 | +| `opto/trainer/objectives.py` | `ObjectiveConfig`, `to_score_dict`, `apply_minimize`, `weighted_scalarize`, `dominates`, `pareto_rank`, `select_best`, `select_top_k`, `score_dict_to_scalar`, `to_scalar_score`, `aggregate_score_dicts` | M1 | | `tests/test_objectives.py` | Unit tests for all functions in objectives.py | M1 | | `tests/test_evaluators_vector.py` | Tests for evaluate_vector + aggregate_vector_scores | M1 | | `tests/test_trainers_multiobjective.py` | Integration tests for BasicSearch + Beamsearch with ObjectiveConfig | M2 | @@ -539,9 +540,9 @@ def select_top_k(candidates: List[Tuple[ScoreLike, any]], | Case | Behavior | |------|----------| -| `score = 0.85` (float) | `normalize_score()` → `{"score": 0.85}` | -| `score = 1` (int) | `normalize_score()` → `{"score": 1.0}` (LLMJudge returns int 0/1) | -| `score = True` (bool) | `normalize_score()` → `{"score": 1.0}` (test guides return bool) | +| `score = 0.85` (float) | `to_score_dict()` → `{"score": 0.85}` | +| `score = 1` (int) | `to_score_dict()` → `{"score": 1.0}` (LLMJudge returns int 0/1) | +| `score = True` (bool) | `to_score_dict()` → `{"score": 1.0}` (test guides return bool) | | `score = {"accuracy": 0.9, "latency_ms": 120.0}` | Returned as-is after validation | | `score = {}` (empty dict) | `ValueError("Score dict must not be empty")` | | `score = {"accuracy": float('nan')}` | `ValueError("Score dict contains non-finite value")` | @@ -724,11 +725,11 @@ Selection path: get_score_dict() → evaluate_vector() → objectives.py ← | Test | Validates | |------|-----------| -| `test_normalize_score_from_float` | `0.85` → `{"score": 0.85}` | -| `test_normalize_score_from_dict` | `{"a": 1.0, "b": 2.0}` → same dict | -| `test_normalize_score_empty_dict_raises` | `{}` → `ValueError` | -| `test_normalize_score_nan_raises` | `{"a": float('nan')}` → `ValueError` | -| `test_normalize_score_wrong_type_raises` | `"text"` → `TypeError` | +| `test_to_score_dict_from_float` | `0.85` → `{"score": 0.85}` | +| `test_to_score_dict_from_dict` | `{"a": 1.0, "b": 2.0}` → same dict | +| `test_to_score_dict_empty_dict_raises` | `{}` → `ValueError` | +| `test_to_score_dict_nan_raises` | `{"a": float('nan')}` → `ValueError` | +| `test_to_score_dict_wrong_type_raises` | `"text"` → `TypeError` | | `test_apply_minimize` | `{"acc": 0.9, "lat": 100}` with `minimize={"lat"}` → `{"acc": 0.9, "lat": -100}` | | `test_apply_minimize_empty_set` | No metrics negated | | `test_weighted_scalarize_basic` | `{"a": 0.8, "b": 0.2}` with `weights={"a": 0.7, "b": 0.3}` → `0.7*0.8 + 0.3*0.2` | @@ -793,12 +794,15 @@ Each notebook contains: ## 11. Design Decisions (Resolved) +> **Post-review update (Ching-An, Feb 2026):** All dict→scalar reduction is now controlled by `ObjectiveConfig.scalarize_dict` (values: `"score"`, `"mean"`, `"weighted"`). Guide produces raw metrics only. `normalize_score` renamed to `to_score_dict` (neutral name; backward-compat alias kept). `aggregate_score_dicts()` moved from evaluators to objectives.py (Objective-side policy). Dict scores with `config=None` now raise `ValueError` (no hidden hard-coded reduction). 
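+
+A minimal sketch of the resulting call pattern (the metric names and values below are invented for illustration):
+
+```python
+from opto.trainer.objectives import ObjectiveConfig, select_best
+
+cands = [({"accuracy": 0.9, "brevity": 0.4}, "A"),
+         ({"accuracy": 0.7, "brevity": 0.9}, "B")]
+
+# Dict scores with config=None raise ValueError, so name the reduction explicitly:
+cfg = ObjectiveConfig(mode="scalar", scalarize_dict="mean")
+best_idx = select_best(cands, cfg)  # mean reduction: A=0.65, B=0.80 -> index 1
+```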
+ ### D1: Where to implement scalar→dict normalization? -**Decision: Option A — `Guide.get_score_dict()` helper + `objectives.normalize_score()`** +**Decision: Option A — `Guide.get_score_dict()` helper + `objectives.to_score_dict()`** - `get_score_dict()` on Guide provides a clean entry point for subclasses. -- `normalize_score()` in objectives.py is the canonical utility (pure function, testable). +- `to_score_dict()` in objectives.py is the canonical utility (pure function, testable). Renamed from `normalize_score` per Ching-An's review (neutral name; backward-compat alias kept). +- All dict→scalar reduction is controlled by `ObjectiveConfig` (via `scalarize_dict` field). No hidden hard-coded defaults. - Avoids widening `get_feedback()` return type (higher churn, breaks typing). ### D2: Pareto selection definition diff --git a/examples/notebooks/t6_m1_vector_scores.ipynb b/examples/notebooks/t6_m1_vector_scores.ipynb index acedde2d..52bc5c73 100644 --- a/examples/notebooks/t6_m1_vector_scores.ipynb +++ b/examples/notebooks/t6_m1_vector_scores.ipynb @@ -6,7 +6,7 @@ "id": "a0000001", "metadata": {}, "outputs": [], - "source": "!git clone https://github.com/carlosrod723/OpenTrace.git Trace\n%cd Trace\n!git checkout t6-multi-objective-m0\n!sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n!pip install -e ." + "source": "import os, sys\n\n# In Colab: clone and install from GitHub\n# Locally: add repo root to sys.path so opto is importable\ntry:\n import google.colab\n IN_COLAB = True\nexcept ImportError:\n IN_COLAB = False\n\nif IN_COLAB:\n !git clone https://github.com/carlosrod723/OpenTrace.git Trace\n %cd Trace\n !git checkout t6-multi-objective-m0\n !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n !pip install -e .\nelse:\n # Local: ensure repo root is on sys.path\n _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n import opto\n print(f\"Using local opto from: {os.path.dirname(opto.__file__)}\")" }, { "cell_type": "markdown", @@ -71,96 +71,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "a0000006", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--- ObjectiveConfig defaults ---\n", - " mode=scalar, weights={}, minimize=frozenset()\n", - "\n", - "--- ObjectiveConfig: weighted mode ---\n", - " mode=weighted\n", - " weights={'accuracy': 0.8, 'latency_s': 0.2}\n", - " minimize=frozenset({'latency_s'})\n", - "\n", - "--- ObjectiveConfig: Pareto mode ---\n", - " mode=pareto, tie_break=weighted, seed=42\n", - "\n", - "--- ObjectiveConfig: set auto-converts to frozenset ---\n", - " type(minimize)=frozenset (auto-converted from set)\n", - "\n", - "--- Validation: negative weight ---\n", - " Caught: Weight for 'a' must be non-negative, got -0.5\n", - "\n", - "--- Validation: bad mode ---\n", - " Caught: mode must be 'scalar', 'weighted', or 'pareto', got 'unknown'\n", - "\n", - "--- Frozen (immutable) ---\n", - " Caught: cannot assign to field 'mode'\n", - "\n", - "ObjectiveConfig validation: all checks passed.\n" - ] - } - ], - "source": [ - "from opto.trainer.objectives import (\n", - " ObjectiveConfig, normalize_score, apply_minimize,\n", - " weighted_scalarize, dominates, pareto_rank, select_best, select_top_k,\n", - ")\n", - "\n", - "print(\"--- ObjectiveConfig defaults ---\")\n", - "config_default = 
ObjectiveConfig()\n", - "print(f\" mode={config_default.mode}, weights={config_default.weights}, \"\n", - " f\"minimize={config_default.minimize}\")\n", - "\n", - "print(\"\\n--- ObjectiveConfig: weighted mode ---\")\n", - "config_weighted = ObjectiveConfig(\n", - " mode=\"weighted\",\n", - " weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n", - " minimize=frozenset({\"latency_s\"}),\n", - ")\n", - "print(f\" mode={config_weighted.mode}\")\n", - "print(f\" weights={config_weighted.weights}\")\n", - "print(f\" minimize={config_weighted.minimize}\")\n", - "\n", - "print(\"\\n--- ObjectiveConfig: Pareto mode ---\")\n", - "config_pareto = ObjectiveConfig(\n", - " mode=\"pareto\",\n", - " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", - " minimize=frozenset({\"latency_s\"}),\n", - " tie_break=\"weighted\",\n", - " seed=42,\n", - ")\n", - "print(f\" mode={config_pareto.mode}, tie_break={config_pareto.tie_break}, seed={config_pareto.seed}\")\n", - "\n", - "print(\"\\n--- ObjectiveConfig: set auto-converts to frozenset ---\")\n", - "config_set = ObjectiveConfig(minimize={\"lat\"})\n", - "print(f\" type(minimize)={type(config_set.minimize).__name__} (auto-converted from set)\")\n", - "\n", - "print(\"\\n--- Validation: negative weight ---\")\n", - "try:\n", - " ObjectiveConfig(weights={\"a\": -0.5})\n", - "except ValueError as e:\n", - " print(f\" Caught: {e}\")\n", - "\n", - "print(\"\\n--- Validation: bad mode ---\")\n", - "try:\n", - " ObjectiveConfig(mode=\"unknown\")\n", - "except ValueError as e:\n", - " print(f\" Caught: {e}\")\n", - "\n", - "print(\"\\n--- Frozen (immutable) ---\")\n", - "try:\n", - " config_default.mode = \"weighted\"\n", - "except AttributeError as e:\n", - " print(f\" Caught: {e}\")\n", - "\n", - "print(\"\\nObjectiveConfig validation: all checks passed.\")" - ] + "outputs": [], + "source": "from opto.trainer.objectives import (\n ObjectiveConfig, to_score_dict, apply_minimize,\n weighted_scalarize, dominates, pareto_rank, select_best, select_top_k,\n)\n\nprint(\"--- ObjectiveConfig defaults ---\")\nconfig_default = ObjectiveConfig()\nprint(f\" mode={config_default.mode}, weights={config_default.weights}, \"\n f\"minimize={config_default.minimize}\")\n\nprint(\"\\n--- ObjectiveConfig: weighted mode ---\")\nconfig_weighted = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n minimize=frozenset({\"latency_s\"}),\n)\nprint(f\" mode={config_weighted.mode}\")\nprint(f\" weights={config_weighted.weights}\")\nprint(f\" minimize={config_weighted.minimize}\")\n\nprint(\"\\n--- ObjectiveConfig: Pareto mode ---\")\nconfig_pareto = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n minimize=frozenset({\"latency_s\"}),\n tie_break=\"weighted\",\n seed=42,\n)\nprint(f\" mode={config_pareto.mode}, tie_break={config_pareto.tie_break}, seed={config_pareto.seed}\")\n\nprint(\"\\n--- ObjectiveConfig: set auto-converts to frozenset ---\")\nconfig_set = ObjectiveConfig(minimize={\"lat\"})\nprint(f\" type(minimize)={type(config_set.minimize).__name__} (auto-converted from set)\")\n\nprint(\"\\n--- Validation: negative weight ---\")\ntry:\n ObjectiveConfig(weights={\"a\": -0.5})\nexcept ValueError as e:\n print(f\" Caught: {e}\")\n\nprint(\"\\n--- Validation: bad mode ---\")\ntry:\n ObjectiveConfig(mode=\"unknown\")\nexcept ValueError as e:\n print(f\" Caught: {e}\")\n\nprint(\"\\n--- Frozen (immutable) ---\")\ntry:\n config_default.mode = \"weighted\"\nexcept AttributeError as e:\n print(f\" Caught: 
{e}\")\n\nprint(\"\\nObjectiveConfig validation: all checks passed.\")" }, { "cell_type": "markdown", @@ -339,107 +254,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "a0000012", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Candidates:\n", - " prompt_A: {'accuracy': 0.95, 'latency_s': 0.2}\n", - " prompt_B: {'accuracy': 0.7, 'latency_s': 0.03}\n", - " prompt_C: {'accuracy': 0.88, 'latency_s': 0.08}\n", - " prompt_D: {'accuracy': 0.6, 'latency_s': 0.02}\n", - "\n", - "--- select_best(config=None) [scalar, backward-compat] ---\n", - " Winner: prompt_A (index 0)\n", - "\n", - "--- select_best(weighted, accuracy=0.8) ---\n", - " Winner: prompt_A (index 0)\n", - "\n", - "--- select_best(weighted, latency_s=0.8) ---\n", - " Winner: prompt_B (index 1)\n", - "\n", - "--- select_best(pareto, tie_break=weighted) ---\n", - " Pareto ranks: [0, 0, 0, 0]\n", - " Front (rank 0): ['prompt_A', 'prompt_B', 'prompt_C', 'prompt_D']\n", - " Winner (after tie-break): prompt_C (index 2)\n", - "\n", - "--- Determinism: 10 runs with same config ---\n", - " Results: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", - " All identical: True\n", - "\n", - "--- select_top_k(pareto, k=2) ---\n", - " Top 2: ['prompt_C', 'prompt_A']\n" - ] - } - ], - "source": [ - "# Candidates: (score_dict, payload) tuples\n", - "candidates = [\n", - " ({\"accuracy\": 0.95, \"latency_s\": 0.200}, \"prompt_A\"),\n", - " ({\"accuracy\": 0.70, \"latency_s\": 0.030}, \"prompt_B\"),\n", - " ({\"accuracy\": 0.88, \"latency_s\": 0.080}, \"prompt_C\"),\n", - " ({\"accuracy\": 0.60, \"latency_s\": 0.020}, \"prompt_D\"),\n", - "]\n", - "\n", - "print(\"Candidates:\")\n", - "for s, name in candidates:\n", - " print(f\" {name}: {s}\")\n", - "\n", - "# Scalar mode (backward-compat)\n", - "print(\"\\n--- select_best(config=None) [scalar, backward-compat] ---\")\n", - "idx = select_best(candidates, None)\n", - "print(f\" Winner: {candidates[idx][1]} (index {idx})\")\n", - "\n", - "# Weighted: accuracy-heavy\n", - "print(\"\\n--- select_best(weighted, accuracy=0.8) ---\")\n", - "config_acc = ObjectiveConfig(\n", - " mode=\"weighted\",\n", - " weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n", - " minimize=frozenset({\"latency_s\"}),\n", - ")\n", - "idx = select_best(candidates, config_acc)\n", - "print(f\" Winner: {candidates[idx][1]} (index {idx})\")\n", - "\n", - "# Weighted: latency-heavy\n", - "print(\"\\n--- select_best(weighted, latency_s=0.8) ---\")\n", - "config_lat = ObjectiveConfig(\n", - " mode=\"weighted\",\n", - " weights={\"accuracy\": 0.2, \"latency_s\": 0.8},\n", - " minimize=frozenset({\"latency_s\"}),\n", - ")\n", - "idx = select_best(candidates, config_lat)\n", - "print(f\" Winner: {candidates[idx][1]} (index {idx})\")\n", - "\n", - "# Pareto mode\n", - "print(\"\\n--- select_best(pareto, tie_break=weighted) ---\")\n", - "config_par = ObjectiveConfig(\n", - " mode=\"pareto\",\n", - " weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n", - " minimize=frozenset({\"latency_s\"}),\n", - " tie_break=\"weighted\",\n", - ")\n", - "score_dicts_norm = [apply_minimize(normalize_score(s), config_par.minimize) for s, _ in candidates]\n", - "ranks = pareto_rank(score_dicts_norm)\n", - "print(f\" Pareto ranks: {ranks}\")\n", - "print(f\" Front (rank 0): {[candidates[i][1] for i, r in enumerate(ranks) if r == 0]}\")\n", - "idx = select_best(candidates, config_par)\n", - "print(f\" Winner (after tie-break): {candidates[idx][1]} (index {idx})\")\n", - "\n", - "# 
Deterministic check\n", - "print(\"\\n--- Determinism: 10 runs with same config ---\")\n", - "results = [select_best(candidates, config_par) for _ in range(10)]\n", - "print(f\" Results: {results}\")\n", - "print(f\" All identical: {len(set(results)) == 1}\")\n", - "\n", - "# Top-k\n", - "print(\"\\n--- select_top_k(pareto, k=2) ---\")\n", - "top2 = select_top_k(candidates, config_par, k=2)\n", - "print(f\" Top 2: {[candidates[i][1] for i in top2]}\")" - ] + "outputs": [], + "source": "# Candidates: (score_dict, payload) tuples\ncandidates = [\n ({\"accuracy\": 0.95, \"latency_s\": 0.200}, \"prompt_A\"),\n ({\"accuracy\": 0.70, \"latency_s\": 0.030}, \"prompt_B\"),\n ({\"accuracy\": 0.88, \"latency_s\": 0.080}, \"prompt_C\"),\n ({\"accuracy\": 0.60, \"latency_s\": 0.020}, \"prompt_D\"),\n]\n\nprint(\"Candidates:\")\nfor s, name in candidates:\n print(f\" {name}: {s}\")\n\n# Scalar mode with explicit config (dict scores require ObjectiveConfig)\nprint(\"\\n--- select_best(scalar, scalarize_dict='mean') ---\")\nconfig_scalar = ObjectiveConfig(mode=\"scalar\", scalarize_dict=\"mean\")\nidx = select_best(candidates, config_scalar)\nprint(f\" Winner: {candidates[idx][1]} (index {idx})\")\nprint(\" (Uses mean of dict values as scalar — explicit via scalarize_dict='mean')\")\n\n# Weighted: accuracy-heavy\nprint(\"\\n--- select_best(weighted, accuracy=0.8) ---\")\nconfig_acc = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"accuracy\": 0.8, \"latency_s\": 0.2},\n minimize=frozenset({\"latency_s\"}),\n)\nidx = select_best(candidates, config_acc)\nprint(f\" Winner: {candidates[idx][1]} (index {idx})\")\n\n# Weighted: latency-heavy\nprint(\"\\n--- select_best(weighted, latency_s=0.8) ---\")\nconfig_lat = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"accuracy\": 0.2, \"latency_s\": 0.8},\n minimize=frozenset({\"latency_s\"}),\n)\nidx = select_best(candidates, config_lat)\nprint(f\" Winner: {candidates[idx][1]} (index {idx})\")\n\n# Pareto mode\nprint(\"\\n--- select_best(pareto, tie_break=weighted) ---\")\nconfig_par = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"accuracy\": 0.5, \"latency_s\": 0.5},\n minimize=frozenset({\"latency_s\"}),\n tie_break=\"weighted\",\n)\nscore_dicts_norm = [apply_minimize(to_score_dict(s), config_par.minimize) for s, _ in candidates]\nranks = pareto_rank(score_dicts_norm)\nprint(f\" Pareto ranks: {ranks}\")\nprint(f\" Front (rank 0): {[candidates[i][1] for i, r in enumerate(ranks) if r == 0]}\")\nidx = select_best(candidates, config_par)\nprint(f\" Winner (after tie-break): {candidates[idx][1]} (index {idx})\")\n\n# Deterministic check\nprint(\"\\n--- Determinism: 10 runs with same config ---\")\nresults = [select_best(candidates, config_par) for _ in range(10)]\nprint(f\" Results: {results}\")\nprint(f\" All identical: {len(set(results)) == 1}\")\n\n# Top-k\nprint(\"\\n--- select_top_k(pareto, k=2) ---\")\ntop2 = select_top_k(candidates, config_par, k=2)\nprint(f\" Top 2: {[candidates[i][1] for i in top2]}\")\n\n# Dict scores + config=None raises ValueError (no hidden reduction)\nprint(\"\\n--- Dict scores + config=None raises ValueError ---\")\ntry:\n select_best(candidates, None)\nexcept ValueError as e:\n print(f\" Caught: {e}\")\n print(\" (Pass explicit ObjectiveConfig to define dict→scalar reduction)\")" }, { "cell_type": "markdown", diff --git a/opto/trainer/evaluators.py b/opto/trainer/evaluators.py index d1271fe8..9af4dacb 100644 --- a/opto/trainer/evaluators.py +++ b/opto/trainer/evaluators.py @@ -2,6 +2,7 @@ from opto.trace import ExecutionError 
import copy import numpy as np +from opto.trainer.objectives import aggregate_score_dicts def evaluate(agent, guide, inputs, infos, min_score=None, num_samples=1, num_threads=None, description=None): @@ -91,6 +92,9 @@ def _evaluate_vector(agent, guide, i): def aggregate_vector_scores(score_dicts): """Compute the per-metric mean across a list of score dicts. + Thin wrapper — delegates to objectives.aggregate_score_dicts + (Objective-side policy per reviewer). + Args: score_dicts: List[Dict[str, float]] @@ -98,17 +102,4 @@ def aggregate_vector_scores(score_dicts): Dict[str, float] with the mean value for each metric key. Empty dict if input is empty. """ - if not score_dicts: - return {} - - all_keys = set() - for sd in score_dicts: - all_keys.update(sd.keys()) - - result = {} - for key in sorted(all_keys): - values = [sd[key] for sd in score_dicts - if key in sd and sd[key] is not None] - if values: - result[key] = float(np.mean(values)) - return result \ No newline at end of file + return aggregate_score_dicts(score_dicts) \ No newline at end of file diff --git a/opto/trainer/objectives.py b/opto/trainer/objectives.py index 385e15b1..5d8e6bd4 100644 --- a/opto/trainer/objectives.py +++ b/opto/trainer/objectives.py @@ -41,6 +41,12 @@ class ObjectiveConfig: - "lexicographic": sort by metric names alphabetically. - "random_seeded": seeded random shuffle. seed: Random seed for deterministic tie-breaking. + + scalarize_dict: How to reduce dict scores to a scalar (when mode="scalar"). + - "score": use score_key (default; avoids hidden behavior) + - "mean": mean(values) (explicitly requested; diagnostic/backcompat) + - "weighted": weighted_scalarize() (explicitly requested) + score_key: Key used when scalarize_dict="score" (default: "score") """ mode: str = "scalar" weights: Dict[str, float] = field(default_factory=dict) @@ -49,6 +55,8 @@ class ObjectiveConfig: pareto_metrics: Optional[Tuple[str, ...]] = None tie_break: str = "weighted" seed: int = 0 + scalarize_dict: str = "score" + score_key: str = "score" def __post_init__(self): if isinstance(self.minimize, set): @@ -62,6 +70,13 @@ def __post_init__(self): f"tie_break must be 'weighted', 'lexicographic', or " f"'random_seeded', got '{self.tie_break}'" ) + if self.scalarize_dict not in ("score", "mean", "weighted"): + raise ValueError( + f"scalarize_dict must be 'score', 'mean', or 'weighted', " + f"got '{self.scalarize_dict}'" + ) + if not isinstance(self.score_key, str) or not self.score_key: + raise ValueError("score_key must be a non-empty string") for k, v in self.weights.items(): if v < 0: raise ValueError(f"Weight for '{k}' must be non-negative, got {v}") @@ -75,7 +90,7 @@ def __post_init__(self): # Pure utility functions # --------------------------------------------------------------------------- -def normalize_score(score: ScoreLike) -> Dict[str, float]: +def to_score_dict(score: ScoreLike) -> Dict[str, float]: """Convert any score to dict form. - bool/int/float -> {"score": float(value)} @@ -107,6 +122,60 @@ def normalize_score(score: ScoreLike) -> Dict[str, float]: ) +# Backward-compatible alias (deprecated name) +normalize_score = to_score_dict + + +def score_dict_to_scalar(score_dict: Dict[str, float], + config: ObjectiveConfig) -> float: + """Reduce a score dict to a scalar according to ObjectiveConfig. + + Applies apply_minimize first, then reduces per config.scalarize_dict: + - "score": return sd[config.score_key] + - "mean": return mean(sd.values()) + - "weighted": return weighted_scalarize(sd, config.weights, ...) 
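+
+    Example (sketch; the dict literal is invented for illustration):
+        score_dict_to_scalar({"accuracy": 1.0, "brevity": 0.5},
+                             ObjectiveConfig(scalarize_dict="mean"))  # -> 0.75
+        # With the default scalarize_dict="score", this dict would raise
+        # ValueError because it lacks a "score" key.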
+
+    This exists to avoid hard-coding any dict->scalar behavior in Guide/Evaluator.
+    """
+    sd = to_score_dict(score_dict)
+    sd = apply_minimize(sd, config.minimize)
+
+    if config.scalarize_dict == "score":
+        if config.score_key not in sd:
+            raise ValueError(
+                f"Dict score missing key '{config.score_key}'. "
+                "Either include it, or set ObjectiveConfig.scalarize_dict "
+                "to 'mean' or 'weighted'."
+            )
+        return float(sd[config.score_key])
+
+    if config.scalarize_dict == "mean":
+        return float(np.mean(list(sd.values())))
+
+    if config.scalarize_dict == "weighted":
+        return float(weighted_scalarize(sd, config.weights, config.missing_value))
+
+    raise ValueError(f"Unknown scalarize_dict: {config.scalarize_dict}")
+
+
+def to_scalar_score(score: ScoreLike,
+                    config: Optional[ObjectiveConfig]) -> float:
+    """Convert scalar or dict score to scalar using ObjectiveConfig.
+
+    Scalar scores pass through as float(score). Dict scores require
+    an explicit ObjectiveConfig to define reduction (no hidden defaults).
+    """
+    if isinstance(score, dict):
+        if config is None:
+            raise ValueError(
+                "Dict score encountered but ObjectiveConfig is None. "
+                "Pass ObjectiveConfig(mode='scalar', scalarize_dict=...) "
+                "to define reduction."
+            )
+        return score_dict_to_scalar(score, config)
+    return float(score)
+
+
 def apply_minimize(score_dict: Dict[str, float],
                    minimize: frozenset) -> Dict[str, float]:
     """Negate values for minimize metrics (higher-is-better normalization).
@@ -198,20 +267,15 @@ def select_best(candidates: List[Tuple[ScoreLike, Any]],
 
     Notes:
         When *config* is None or mode='scalar', dict scores are collapsed to
-        mean(values) for backward compatibility. For explicit multi-objective
-        control, pass an ObjectiveConfig with mode='weighted' or 'pareto'
-        and appropriate weights.
+        a scalar via to_scalar_score(): an explicit config selects the
+        reduction through its scalarize_dict field, and dict scores with
+        config=None raise ValueError (no hidden hard-coded reduction).
     """
     if config is None or config.mode == "scalar":
-        scores = []
-        for score, _ in candidates:
-            if isinstance(score, dict):
-                scores.append(np.mean(list(score.values())))
-            else:
-                scores.append(float(score))
+        scores = [to_scalar_score(score, config) for score, _ in candidates]
         return int(np.argmax(scores))
 
-    score_dicts = [normalize_score(s) for s, _ in candidates]
+    score_dicts = [to_score_dict(s) for s, _ in candidates]
     score_dicts = [apply_minimize(sd, config.minimize) for sd in score_dicts]
 
     if config.mode == "weighted":
@@ -265,20 +329,15 @@ def select_top_k(candidates: List[Tuple[ScoreLike, Any]],
 
     Notes:
         When *config* is None or mode='scalar', dict scores are collapsed to
-        mean(values) for backward compatibility. For explicit multi-objective
-        control, pass an ObjectiveConfig with mode='weighted' or 'pareto'
-        and appropriate weights.
+        a scalar via to_scalar_score(): an explicit config selects the
+        reduction through its scalarize_dict field, and dict scores with
+        config=None raise ValueError (no hidden hard-coded reduction).
""" if config is None or config.mode == "scalar": - scores = [] - for score, _ in candidates: - if isinstance(score, dict): - scores.append(np.mean(list(score.values()))) - else: - scores.append(float(score)) + scores = [to_scalar_score(score, config) for score, _ in candidates] return list(np.argsort(scores)[::-1][:k]) - score_dicts = [normalize_score(s) for s, _ in candidates] + score_dicts = [to_score_dict(s) for s, _ in candidates] score_dicts = [apply_minimize(sd, config.minimize) for sd in score_dicts] if config.mode == "weighted": @@ -322,3 +381,22 @@ def select_top_k(candidates: List[Tuple[ScoreLike, Any]], return result[:k] raise ValueError(f"Unknown mode: {config.mode}") + + +def aggregate_score_dicts(score_dicts: List[Dict[str, float]]) -> Dict[str, float]: + """Compute per-metric mean across a list of score dicts. + + This is Objective-side policy (per reviewer): evaluators call this + rather than defining aggregation logic themselves. + """ + if not score_dicts: + return {} + all_keys = set() + for sd in score_dicts: + all_keys.update(sd.keys()) + result: Dict[str, float] = {} + for key in sorted(all_keys): + values = [sd[key] for sd in score_dicts if key in sd and sd[key] is not None] + if values: + result[key] = float(np.mean(values)) + return result diff --git a/tests/unit_tests/test_objectives.py b/tests/unit_tests/test_objectives.py index 331eea1d..e68f6c4e 100644 --- a/tests/unit_tests/test_objectives.py +++ b/tests/unit_tests/test_objectives.py @@ -2,77 +2,139 @@ import pytest import numpy as np from opto.trainer.objectives import ( - ObjectiveConfig, normalize_score, apply_minimize, weighted_scalarize, + ObjectiveConfig, to_score_dict, apply_minimize, weighted_scalarize, dominates, pareto_rank, select_best, select_top_k, + score_dict_to_scalar, to_scalar_score, aggregate_score_dicts, ) # --------------------------------------------------------------------------- -# normalize_score +# to_score_dict (alias normalize_score kept for backwards-compat) # --------------------------------------------------------------------------- -def test_normalize_score_float(): - assert normalize_score(0.85) == {"score": 0.85} +def test_to_score_dict_float(): + assert to_score_dict(0.85) == {"score": 0.85} -def test_normalize_score_zero(): - assert normalize_score(0.0) == {"score": 0.0} +def test_to_score_dict_zero(): + assert to_score_dict(0.0) == {"score": 0.0} -def test_normalize_score_int(): - assert normalize_score(1) == {"score": 1.0} +def test_to_score_dict_int(): + assert to_score_dict(1) == {"score": 1.0} -def test_normalize_score_int_zero(): - assert normalize_score(0) == {"score": 0.0} +def test_to_score_dict_int_zero(): + assert to_score_dict(0) == {"score": 0.0} -def test_normalize_score_bool_true(): - assert normalize_score(True) == {"score": 1.0} +def test_to_score_dict_bool_true(): + assert to_score_dict(True) == {"score": 1.0} -def test_normalize_score_bool_false(): - assert normalize_score(False) == {"score": 0.0} +def test_to_score_dict_bool_false(): + assert to_score_dict(False) == {"score": 0.0} -def test_normalize_score_dict(): - result = normalize_score({"acc": 0.9, "lat": 50.0}) +def test_to_score_dict_dict(): + result = to_score_dict({"acc": 0.9, "lat": 50.0}) assert result == {"acc": 0.9, "lat": 50.0} -def test_normalize_score_dict_with_int_values(): - result = normalize_score({"acc": 1, "lat": 0}) +def test_to_score_dict_dict_with_int_values(): + result = to_score_dict({"acc": 1, "lat": 0}) assert result == {"acc": 1.0, "lat": 0.0} -def 
test_normalize_score_empty_dict_raises(): +def test_to_score_dict_empty_dict_raises(): with pytest.raises(ValueError, match="must not be empty"): - normalize_score({}) + to_score_dict({}) -def test_normalize_score_nan_raises(): +def test_to_score_dict_nan_raises(): with pytest.raises(ValueError, match="finite"): - normalize_score({"a": float("nan")}) + to_score_dict({"a": float("nan")}) -def test_normalize_score_inf_raises(): +def test_to_score_dict_inf_raises(): with pytest.raises(ValueError, match="finite"): - normalize_score(float("inf")) + to_score_dict(float("inf")) -def test_normalize_score_neg_inf_raises(): +def test_to_score_dict_neg_inf_raises(): with pytest.raises(ValueError, match="finite"): - normalize_score(float("-inf")) + to_score_dict(float("-inf")) -def test_normalize_score_string_raises(): +def test_to_score_dict_string_raises(): with pytest.raises(TypeError, match="str"): - normalize_score("bad") + to_score_dict("bad") -def test_normalize_score_none_raises(): +def test_to_score_dict_none_raises(): with pytest.raises(TypeError): - normalize_score(None) + to_score_dict(None) + + +def test_backward_compat_alias(): + """normalize_score still works as alias.""" + from opto.trainer.objectives import normalize_score + assert normalize_score(0.5) == {"score": 0.5} + + +# --------------------------------------------------------------------------- +# score_dict_to_scalar / to_scalar_score +# --------------------------------------------------------------------------- + +def test_score_dict_to_scalar_score_key(): + config = ObjectiveConfig(scalarize_dict="score") + assert score_dict_to_scalar({"score": 0.9, "extra": 0.1}, config) == pytest.approx(0.9) + + +def test_score_dict_to_scalar_score_key_missing_raises(): + config = ObjectiveConfig(scalarize_dict="score") + with pytest.raises(ValueError, match="missing key"): + score_dict_to_scalar({"acc": 0.9}, config) + + +def test_score_dict_to_scalar_mean(): + config = ObjectiveConfig(scalarize_dict="mean") + result = score_dict_to_scalar({"a": 0.8, "b": 0.2}, config) + assert result == pytest.approx(0.5) + + +def test_score_dict_to_scalar_weighted(): + config = ObjectiveConfig(scalarize_dict="weighted", weights={"a": 0.7, "b": 0.3}) + result = score_dict_to_scalar({"a": 1.0, "b": 0.5}, config) + assert result == pytest.approx(0.7 * 1.0 + 0.3 * 0.5) + + +def test_to_scalar_score_float_passthrough(): + assert to_scalar_score(0.75, None) == pytest.approx(0.75) + + +def test_to_scalar_score_dict_none_config_raises(): + with pytest.raises(ValueError, match="ObjectiveConfig is None"): + to_scalar_score({"a": 0.5}, None) + + +def test_to_scalar_score_dict_with_config(): + config = ObjectiveConfig(scalarize_dict="mean") + assert to_scalar_score({"a": 0.8, "b": 0.2}, config) == pytest.approx(0.5) + + +# --------------------------------------------------------------------------- +# aggregate_score_dicts +# --------------------------------------------------------------------------- + +def test_aggregate_score_dicts_basic(): + result = aggregate_score_dicts([{"a": 1.0, "b": 0.5}, {"a": 0.0, "b": 1.0}]) + assert result["a"] == pytest.approx(0.5) + assert result["b"] == pytest.approx(0.75) + + +def test_aggregate_score_dicts_empty(): + assert aggregate_score_dicts([]) == {} # --------------------------------------------------------------------------- @@ -195,12 +257,29 @@ def test_select_best_scalar_mode(): assert select_best(candidates, config) == 1 -def test_select_best_scalar_with_dict_scores(): - """Scalar mode with dict scores uses mean of 
values.""" - config = ObjectiveConfig(mode="scalar") +def test_select_best_scalar_with_dict_scores_requires_config(): + """Dict scores require explicit scalarization config (no hidden hard-coded mean).""" + candidates = [({"a": 0.5, "b": 0.3}, "X")] + with pytest.raises(ValueError, match="ObjectiveConfig is None"): + select_best(candidates, None) + + +def test_select_best_scalar_with_dict_scores_score_key_default(): + """Default scalarize_dict='score' uses the 'score' key.""" + config = ObjectiveConfig(mode="scalar") # scalarize_dict="score" by default candidates = [ - ({"a": 0.5, "b": 0.3}, "X"), # mean = 0.4 - ({"a": 0.8, "b": 0.6}, "Y"), # mean = 0.7 + ({"score": 0.4, "a": 0.5}, "X"), + ({"score": 0.7, "a": 0.8}, "Y"), + ] + assert select_best(candidates, config) == 1 + + +def test_select_best_scalar_with_dict_scores_mean_configurable(): + """Explicit scalarize_dict='mean' uses mean of all values.""" + config = ObjectiveConfig(mode="scalar", scalarize_dict="mean") + candidates = [ + ({"a": 0.5, "b": 0.3}, "X"), # mean 0.4 + ({"a": 0.8, "b": 0.6}, "Y"), # mean 0.7 ] assert select_best(candidates, config) == 1 @@ -349,6 +428,8 @@ def test_config_default(): assert config.mode == "scalar" assert config.weights == {} assert config.minimize == frozenset() + assert config.scalarize_dict == "score" + assert config.score_key == "score" def test_config_set_to_frozenset(): @@ -377,6 +458,16 @@ def test_config_empty_pareto_metrics_raises(): ObjectiveConfig(pareto_metrics=()) +def test_config_bad_scalarize_dict_raises(): + with pytest.raises(ValueError, match="scalarize_dict"): + ObjectiveConfig(scalarize_dict="bad") + + +def test_config_empty_score_key_raises(): + with pytest.raises(ValueError, match="score_key"): + ObjectiveConfig(score_key="") + + def test_config_frozen(): config = ObjectiveConfig() with pytest.raises(AttributeError): From e2a66de124be90bfa80774a8a68b4bc3ba5df1aa Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 19:51:17 -0400 Subject: [PATCH 08/20] T6 M2 prep: align tech plan per Xavier review, add Allen's multi_objective_convex_fn.py --- docs/T6_technical_plan.md | 28 +- examples/multi_objective_convex_fn.py | 663 ++++++++++++++++++++++++++ 2 files changed, 673 insertions(+), 18 deletions(-) create mode 100644 examples/multi_objective_convex_fn.py diff --git a/docs/T6_technical_plan.md b/docs/T6_technical_plan.md index 45a3c2fd..60891818 100644 --- a/docs/T6_technical_plan.md +++ b/docs/T6_technical_plan.md @@ -301,30 +301,22 @@ class Guide: # ... existing methods unchanged ... def get_score_dict(self, query: str, response: str, reference=None, **kwargs) -> Dict[str, float]: - """Return the evaluation score as a dictionary. + """Return evaluation score as a dict (multi-objective selection path). - Wraps get_feedback() for backward compatibility: - - If get_feedback returns (float, str): returns {"score": float} - - If get_feedback returns (dict, str): returns dict directly + Default implementation wraps the scalar training score from get_feedback() as: + {"score": float_value} - Subclasses returning multi-metric scores should override get_feedback() - to return (Dict[str, float], str) instead of (float, str). - """ - score, _ = self.get_feedback(query, response, reference, **kwargs) - if isinstance(score, dict): - return score - return {"score": float(score)} - - def metric(self, query: str, response: str, reference=None, **kwargs) -> float: - """Always returns float. For dict scores, returns mean of values as scalar fallback. 
+ Guides that need multiple metrics should override *get_score_dict()* and return + e.g. {"accuracy": 0.9, "brevity": 0.8, "latency_s": 0.05}. - This ensures evaluate() and the training loop (which call metric()) remain - completely safe. Dict scores only flow through get_score_dict() → evaluate_vector(). + Note: get_feedback() should remain scalar (float) for training-loop backward + compatibility. If a subclass returns a dict from get_feedback(), metric() and + scalar evaluators may break; prefer overriding get_score_dict(). """ score, _ = self.get_feedback(query, response, reference, **kwargs) if isinstance(score, dict): - return float(np.mean(list(score.values()))) - return float(score) + return {k: float(v) for k, v in score.items()} + return {"score": float(score)} ``` **Why this approach:** diff --git a/examples/multi_objective_convex_fn.py b/examples/multi_objective_convex_fn.py new file mode 100644 index 00000000..50f11428 --- /dev/null +++ b/examples/multi_objective_convex_fn.py @@ -0,0 +1,663 @@ +import re +import numpy as np +import cvxpy as cp +from opto.trace.utils import dedent + +def np_random(seed: int | None = None) -> tuple[np.random.Generator, int]: + if seed is not None and not (isinstance(seed, int) and 0 <= seed): + if isinstance(seed, int) is False: + raise Exception(f"Seed must be a python integer, actual type: {type(seed)}") + else: + raise Exception(f"Seed must be greater or equal to zero, actual value: {seed}") + seed_seq = np.random.SeedSequence(seed) + np_seed = seed_seq.entropy + rng = np.random.Generator(np.random.PCG64(seed_seq)) + return rng, np_seed + + +def _norm_term(x: np.ndarray, norm_coef: float, norm_kind: str) -> float: + if norm_coef == 0.0: + return 0.0 + if norm_kind == "l2sq": + return float(norm_coef * (x[0] ** 2 + x[1] ** 2)) + if norm_kind == "l2": + return float(norm_coef * np.sqrt(x[0] ** 2 + x[1] ** 2)) + if norm_kind == "l1": + return float(norm_coef * (abs(x[0]) + abs(x[1]))) + raise ValueError("norm_kind must be one of {'l2sq','l2','l1'}") + + +def _rosenbrock_cubic_global_min(a: float, b: float, lam: float) -> tuple[np.ndarray, float]: + """ + For f(u,v)=(a-u)^2 + b(v-u^2)^2 + lam(u^2+v^2), b>0, lam>=0. + Returns (x_star, f_star). + """ + # lam == 0: classic Rosenbrock minimum at (a, a^2) with value 0. 
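+    # lam > 0: stationarity gives v = b*u^2/(b + lam) (from df/dv = 0), and
+    # substituting into df/du = 0 yields the cubic solved below:
+    #   (2*b*lam/(b + lam)) * u^3 + (1 + lam) * u - a = 0.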
+ if lam == 0.0: + x_star = np.array([a, a ** 2], dtype=float) + f_star = 0.0 + return x_star, f_star + + # Solve cubic: c3*u^3 + (1+lam)*u - a = 0 with c3 = 2*b*lam/(b+lam) + c3 = 2.0 * b * lam / (b + lam) + c1 = 1.0 + lam + roots = np.roots([c3, 0.0, c1, -a]) + + best = None + for r in roots: + if abs(r.imag) < 1e-10: + u = float(r.real) + v = (b / (b + lam)) * u * u + x = np.array([u, v], dtype=float) + # evaluate full objective + base = (a - u) ** 2 + b * (v - u * u) ** 2 + f = float(base + lam * (u * u + v * v)) + if best is None or f < best[1]: + best = (x, f) + + if best is None: + raise RuntimeError("Unexpected: cubic had no real root.") + return best + + +# --------------------------- +# SOS / moment relaxation for Six-Hump Camel on a box +# --------------------------- + +def _monomials_upto_degree(k: int) -> list[tuple[int, int]]: + out: list[tuple[int, int]] = [] + for deg in range(k + 1): + for i in range(deg + 1): + j = deg - i + out.append((i, j)) + return out + +def _add_mono(a: tuple[int, int], b: tuple[int, int]) -> tuple[int, int]: + return (a[0] + b[0], a[1] + b[1]) + +def _build_moment_matrix(y: dict[tuple[int,int], cp.Expression], basis: list[tuple[int,int]]) -> cp.Expression: + m = len(basis) + blocks = [] + for i in range(m): + row = [] + for j in range(m): + row.append(y[_add_mono(basis[i], basis[j])]) + blocks.append(row) + return cp.bmat(blocks) + +def _build_localizing_matrix_linear( + y: dict[tuple[int,int], cp.Expression], + basis: list[tuple[int,int]], + g_lin: dict[tuple[int,int], float], # g(x,y) = c00 + c10 x + c01 y +) -> cp.Expression: + m = len(basis) + blocks = [] + for i in range(m): + row = [] + for j in range(m): + a = _add_mono(basis[i], basis[j]) + expr = 0 + for beta, c in g_lin.items(): + expr += c * y[_add_mono(a, beta)] + row.append(expr) + blocks.append(row) + return cp.bmat(blocks) + +def _six_hump_coeffs(lam_l2sq: float = 0.0) -> dict[tuple[int,int], float]: + """ + Base six-hump camel: + f(x,y) = 4x^2 -2.1 x^4 + (1/3) x^6 + x y - 4y^2 + 4y^4 + + With l2sq regularizer: + f(x,y) + lam*(x^2 + y^2) + => add lam to the (2,0) and (0,2) coefficients. + """ + lam = float(lam_l2sq) + return { + (2, 0): 4.0 + lam, + (4, 0): -2.1, + (6, 0): 1.0 / 3.0, + (1, 1): 1.0, + (0, 2): -4.0 + lam, + (0, 4): 4.0, + } + + +def six_hump_sos_certificate_on_box( + bound: float = 2.0, + order_d: int = 3, + solver: str = "SCS", + verbose: bool = False, + lam_l2sq: float = 0.0, +) -> tuple[float, str]: + """ + Moment relaxation (Lasserre) order d for Six-Hump Camel (+ optional l2sq) on [-bound, bound]^2. + Returns (lower_bound, status). lower_bound is the SOS certificate gamma. 
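+
+    Sanity check (illustrative): with lam_l2sq=0 the global minimum of the
+    six-hump camel on this box is about -1.0316, so the returned gamma
+    should sit at or slightly below that value, up to solver tolerance.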
+ """ + if order_d < 3: + raise ValueError("For degree-6 polynomial, use order_d >= 3") + + coeff = _six_hump_coeffs(lam_l2sq=lam_l2sq) + max_deg = 2 * order_d + all_monos = _monomials_upto_degree(max_deg) + + y: dict[tuple[int,int], cp.Variable] = {m: cp.Variable() for m in all_monos} + constraints = [y[(0, 0)] == 1.0] + + basis_d = _monomials_upto_degree(order_d) + M = _build_moment_matrix(y, basis_d) + constraints.append(M >> 0) + + basis_d1 = _monomials_upto_degree(order_d - 1) + g_list = [ + {(0,0): bound, (1,0): -1.0, (0,1): 0.0}, + {(0,0): bound, (1,0): 1.0, (0,1): 0.0}, + {(0,0): bound, (1,0): 0.0, (0,1): -1.0}, + {(0,0): bound, (1,0): 0.0, (0,1): 1.0}, + ] + for g in g_list: + L = _build_localizing_matrix_linear(y, basis_d1, g) + constraints.append(L >> 0) + + obj = cp.Minimize(sum(c * y[m] for m, c in coeff.items())) + prob = cp.Problem(obj, constraints) + prob.solve(solver=solver, verbose=verbose) + + return float(prob.value), str(prob.status) + +class LossLandscapeBase: + def __init__( + self, + callable_func, + x_low, + x_high, + optimal_sol, + feedback=0, + seed=None, + precision_digit=2, + horizon=10, + # multi-objective / regularization knobs + norm_coef: float = 0.0, + norm_kind: str = "l2sq", + # done criterion uses certificate + done_tol: float = 1e-2, + ): + self.x_low = x_low + self.x_high = x_high + + self._np_random = None + self.stop_keywords = ["reach", "stay", "stop"] + + # base (unregularized) function + self.base_func = callable_func + + self.norm_coef = float(norm_coef) + self.norm_kind = str(norm_kind) + self.done_tol = float(done_tol) + + # wrapped function used everywhere in env: base + norm + def augmented(x: np.ndarray) -> float: + x = np.asarray(x, dtype=float) + return float(self.base_func(x) + _norm_term(x, self.norm_coef, self.norm_kind)) + + self.callable_func = augmented + + self.prev_x = None + self.left_attempts = horizon + + self.optimal_sol = optimal_sol + self.precision_digit = precision_digit + self.horizon = horizon + self._seed = self.seed(seed) + + # subclass sets this (certificate-based) in _init_certificate() + self.certificate_y: float | None = None + self.certificate_meta: dict = {} + + self._init_certificate() + if self.certificate_y is None: + raise RuntimeError("Subclass must set self.certificate_y in _init_certificate().") + + # Use certificate as min_y for reward range + done checks + self.min_y = float(self.certificate_y) + + self.reward_range = (self.get_min_reward(), -self.min_y) + + if self.norm_coef != 0.0: + norm_desc = { + "l2sq": "||x||_2^2 (squared L2 norm)", + "l2": "||x||_2 (L2 norm)", + "l1": "||x||_1 (L1 norm)", + }.get(self.norm_kind, self.norm_kind) + + objective_line = ( + f"Your goal is to minimize the total objective:\n" + f" y(x) = f(x) + {self.norm_coef} * {norm_desc}\n" + f"where f(x) is the base function output and x is a 2D vector." + ) + else: + objective_line = ( + "Your goal is to minimize the function output:\n" + " y(x) = f(x)\n" + "where f(x) is the base function output and x is a 2D vector." + ) + + self.docstring = dedent(f""" + You are trying to minimize an objective by choosing the input x. + + {objective_line} + + You get to observe y once you choose x, where x is a 2-dimensional vector: + x = [x1, x2], with real-valued coordinates. + + The allowed range for x1 and x2 is [{self.x_low}, {self.x_high}]. + Please do not choose x outside of this range. + + You have {self.horizon} attempts. + You can choose to stop at any time by outputting a message containing one of: {", ".join(self.stop_keywords)}. 
+ + Output format: + x = [x1, x2] + """).strip() + + self.called_reset = False + + def _init_certificate(self) -> None: + """ + Subclasses must set: + self.certificate_y: float (target min value / certificate) + Optionally: + self.certificate_meta: dict with info (solver status, x*, etc.) + """ + raise NotImplementedError + + def get_min_reward(self): + # conservative: evaluate on corners of box for reward lower bound + x_range = [self.x_low, self.x_high] + y_vals = [self.callable_func(np.array([x_range[i], x_range[j]])) for i in range(2) for j in range(2)] + y_max = max(y_vals) + return -float(y_max) + + def get_optimal_solution(self): + return self.optimal_sol + + def reset(self, **kwargs): + if "seed" in kwargs: + self._seed = self.seed(kwargs["seed"]) + + x = self.np_random.uniform(self.x_low, self.x_high, size=2) + x = np.round(x, self.precision_digit) + self.prev_x = x + + y = self.callable_func(x) + self.left_attempts = self.horizon + + # obs = f"x={x.tolist()}\nFunction outputs y = {y}\nYou have {self.left_attempts} attempts left!\n" + loss_line, info = self._format_loss_report(x) + obs = loss_line + obs += "Please output the next x that will make this function output the smallest y.\n" + obs += "Format: x = [x1, x2]\n" + obs += "Output:" + + self.called_reset = True + return obs + + def seed(self, seed=None): + self._np_random, seed = np_random(seed) + return [seed] + + @property + def np_random(self): + if self._np_random is None: + self.seed() + return self._np_random # type: ignore + + def text_extract(self, text): + for stop_word in self.stop_keywords: + if stop_word in text: + return None, True + + pattern = r"\[(-?\d+\.?\d*(?:e[-+]?\d+)?),\s*(-?\d+\.?\d*(?:e[-+]?\d+)?)\]" + match = re.search(pattern, text) + if match is None: + return None, False + numbers = [float(g) for g in match.groups()] + return np.array(numbers, dtype=float), False + + def _is_success(self, loss: float) -> bool: + # Done criterion: close to certificate/guarantee. + # Note: certificate_y is a lower bound for SOS cases; if it's tight, this is meaningful. 
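+        # Illustration: for the unregularized camel the certificate is tight
+        # (gamma is approximately the true minimum, about -1.0316), so a point
+        # near x* = (0.0898, -0.7126) passes this check; a loose relaxation
+        # would need a larger done_tol or a higher order_d to be reachable.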
+ return abs(float(loss) - float(self.certificate_y)) <= self.done_tol + + def _eval_losses(self, x: np.ndarray) -> tuple[float, float, float]: + x = np.asarray(x, dtype=float) + base = float(self.base_func(x)) + reg = float(_norm_term(x, self.norm_coef, self.norm_kind)) + total = base + reg + return base, reg, total + + def _format_loss_report(self, x: np.ndarray) -> tuple[str, dict]: + base, reg, total = self._eval_losses(x) + info = { + "base_loss": base, + "reg_loss": reg, + "total_loss": total, + "certificate_y": float(self.certificate_y), + "gap": float(total - float(self.certificate_y)), + } + + if self.norm_coef != 0.0: + # optional: report the raw norm too (not multiplied by coef) + if self.norm_kind == "l2sq": + norm_val = float(x[0] ** 2 + x[1] ** 2) + elif self.norm_kind == "l2": + norm_val = float(np.sqrt(x[0] ** 2 + x[1] ** 2)) + elif self.norm_kind == "l1": + norm_val = float(abs(x[0]) + abs(x[1])) + else: + norm_val = None + + info["norm_value"] = norm_val + info["norm_kind"] = self.norm_kind + info["norm_coef"] = float(self.norm_coef) + + line = ( + f"Function outputs total y = {total}\n" + f" base f(x) = {base}\n" + f" regularizer = {reg} (coef={self.norm_coef}, kind={self.norm_kind}, norm={norm_val})\n" + ) + else: + line = f"Function outputs y = {total}\n" + + return line, info + + def step(self, action): + if not self.called_reset: + raise Exception("must call env.reset() first before step()") + + x, stop = self.text_extract(action) + + if x is None and stop is False: + feedback = ( + f"You entered an invalid action: {action}" + + f" Please enter a valid action within ({self.x_low, self.x_high})" + ) + return None, -1, True, { + "feedback": feedback, + "success": False, + "base_loss": None, + "reg_loss": None, + "total_loss": None, + "certificate_y": float(self.certificate_y), + "gap": None, + } + + if stop: + base, reg, total = self._eval_losses(self.prev_x) + success = self._is_success(total) + feedback = f"You have chosen to stop at {self.prev_x}." + feedback += " You have reached the (certified) minimum!" if success else " You have not reached the (certified) minimum!" + return None, total, True, { + "feedback": feedback, + "success": success, + "base_loss": base, + "reg_loss": reg, + "total_loss": total, + "certificate_y": float(self.certificate_y), + "gap": float(total - float(self.certificate_y)), + } + + if np.any(x < self.x_low) or np.any(x > self.x_high): + base, reg, total = self._eval_losses(self.prev_x) + feedback = f"x must be within [{self.x_low}, {self.x_high}]. You gave {x.tolist()}." + return None, total, True, { + "feedback": feedback, + "success": False, + "base_loss": base, + "reg_loss": reg, + "total_loss": total, + "certificate_y": float(self.certificate_y), + "gap": float(total - float(self.certificate_y)), + } + + base, reg, total = self._eval_losses(x) + + if self._is_success(total): + feedback = f"Function outputs y: {total}\nYou have reached the (certified) minimum!" 
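+            # Reward convention: return -total, so lower loss means higher
+            # reward, consistent with self.reward_range = (min_reward, -min_y).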
+ return feedback, -total, True, { + "feedback": feedback, + "success": True, + "base_loss": base, + "reg_loss": reg, + "total_loss": total, + "certificate_y": float(self.certificate_y), + "gap": float(total - float(self.certificate_y)), + } + + loss_line, info = self._format_loss_report(x) + obs = loss_line + obs += f"You have {self.left_attempts} attempts left!\n" + obs += "Please output the next x that will make this function output the smallest y.\n" + obs += "Format: x = [x1, x2]\n" + obs += "Output:" + + self.prev_x = x + self.left_attempts -= 1 + + r = np.clip(float(-total), self.get_min_reward(), -self.min_y) + feedback = f"You chose {action}. Choose different numbers such that you can minimize y." + return obs, r, False, { + "feedback": feedback, + "success": False, + "base_loss": base, + "reg_loss": reg, + "total_loss": total, + "certificate_y": float(self.certificate_y), + "gap": float(total - float(self.certificate_y)), + } + + +class Rosenbrock(LossLandscapeBase): + def __init__( + self, + a=1.0, + b=1.0, + feedback=0, + seed=None, + horizon=10, + precision_digit=2, + norm_coef: float = 1.0, + norm_kind: str = "l2sq", + done_tol: float = 1e-2, + ): + self.a = float(a) + self.b = float(b) + + def base(x: np.ndarray) -> float: + return float((self.a - x[0]) ** 2 + self.b * (x[1] - x[0] ** 2) ** 2) + + super().__init__( + callable_func=base, + x_low=-5, + x_high=10, + optimal_sol=np.ones(2), + feedback=feedback, + seed=seed, + precision_digit=precision_digit, + horizon=horizon, + norm_coef=norm_coef, + norm_kind=norm_kind, + done_tol=done_tol, + ) + + def _init_certificate(self) -> None: + if self.norm_kind != "l2sq": + raise ValueError("Rosenbrock cubic certificate requires norm_kind='l2sq'.") + + if self.norm_coef < 0: + raise ValueError("For a meaningful global certificate, norm_coef should be >= 0.") + + x_star, f_star = _rosenbrock_cubic_global_min(self.a, self.b, self.norm_coef) + + self.certificate_y = float(f_star) + self.optimal_sol = x_star + self.certificate_meta = {"method": "cubic", "x_star": x_star, "f_star": float(f_star)} + + +class SixHumpCamel(LossLandscapeBase): + def __init__( + self, + feedback=0, + seed=None, + horizon=10, + precision_digit=4, + norm_coef: float = 1.0, + norm_kind: str = "l2sq", + done_tol: float = 1e-3, + sos_solver: str = "SCS", + sos_order_d: int = 3, + sos_verbose: bool = False, + ): + self.sos_solver = sos_solver + self.sos_order_d = sos_order_d + self.sos_verbose = sos_verbose + + def base(x: np.ndarray) -> float: + u, v = float(x[0]), float(x[1]) + return float((4 - 2.1 * u ** 2 + (u ** 4) / 3) * u ** 2 + u * v + (-4 + 4 * v ** 2) * v ** 2) + + super().__init__( + callable_func=base, + x_low=-2, + x_high=2, + optimal_sol=[np.array([0.0898, -0.7126]), np.array([-0.0898, 0.7126])], + feedback=feedback, + seed=seed, + precision_digit=precision_digit, + horizon=horizon, + norm_coef=norm_coef, + norm_kind=norm_kind, + done_tol=done_tol, + ) + + def _init_certificate(self) -> None: + if self.norm_coef != 0.0 and self.norm_kind != "l2sq": + raise ValueError( + "SixHumpCamel SOS certificate supports norm_coef==0 or norm_kind=='l2sq'. " + "For l1/l2 you need epigraph variables." 
+            )
+
+        gamma, status = six_hump_sos_certificate_on_box(
+            bound=2.0,
+            order_d=self.sos_order_d,
+            solver=self.sos_solver,
+            verbose=self.sos_verbose,
+            lam_l2sq=self.norm_coef,
+        )
+        self.certificate_y = float(gamma)
+        self.certificate_meta = {
+            "method": "moment_sdp",
+            "gamma": float(gamma),
+            "status": status,
+            "lam_l2sq": float(self.norm_coef),
+            "bound": 2.0,
+            "order_d": self.sos_order_d,
+            "solver": self.sos_solver,
+        }
+
+
+# ============ Multi-objective test harness (Approach 1: BasicSearch + ObjectiveConfig) ============
+import copy
+from typing import Tuple
+
+from opto import trace
+from opto import trainer
+from opto.trainer.guide import Guide
+from opto.trainer.loggers import TensorboardLogger
+from opto.trainer.objectives import ObjectiveConfig
+from opto.trainer.algorithms.basic_algorithms import BasicSearchAlgorithm as SearchAlgorithm
+
+
+class RewardGuide(Guide):
+    """
+    Multi-objective metrics:
+
+    - base_loss: minimize
+    - reg_loss: minimize
+
+    (The trainer's ObjectiveConfig decides how to combine/compare.)
+    """
+
+    def __init__(self, env: LossLandscapeBase):
+        self.env = env
+
+    def _score_action_on_env_copy(self, action: str):
+        # Score on a deep copy so trainer-side selection does not advance the real env.
+        env_copy = copy.deepcopy(self.env)
+        obs, reward, done, info = env_copy.step(action)
+        return obs, reward, done, info
+
+    def get_feedback(self, query: str, response: str, reference=None, **kwargs) -> Tuple[float, str]:
+        # Legacy scalar path: advances the real env.
+        obs, reward, done, info = self.env.step(response)
+        return float(reward), ((obs + "\n\n") if obs else "") + info.get("feedback", "")
+
+    def get_score_dict(self, query: str, response: str, reference=None, **kwargs) -> dict[str, float]:
+        # Vector score path for trainer-side selection:
+        obs, reward, done, info = self._score_action_on_env_copy(response)
+
+        base_loss = info.get("base_loss")
+        reg_loss = info.get("reg_loss")
+
+        # If the action is invalid, the env sets losses to None. Map to a large
+        # finite penalty (to_score_dict rejects inf/nan) so it never gets selected.
+        if base_loss is None or reg_loss is None:
+            base_loss = 1e9
+            reg_loss = 1e9
+
+        return {
+            "base_loss": float(base_loss),   # minimize
+            "reg_loss": float(reg_loss),     # minimize
+        }
+
+
+def main():
+    env = SixHumpCamel(horizon=200)
+    train_dataset = dict(inputs=[None], infos=[None])
+
+    instruction = env.reset()
+    initial_input = instruction.split("\n")[0].strip()
+    param = trace.node(initial_input, description="Input x into the hidden function to get y.", trainable=True)
+
+    guide = RewardGuide(env)
+    logger = TensorboardLogger(log_dir="./logs/basicsearch_multiobjective_on_loss_landscape")
+
+    # Both metrics are losses, so mark them as minimize and combine with equal weights.
+    objective_config = ObjectiveConfig(
+        mode="weighted",
+        weights={"base_loss": 1.0, "reg_loss": 1.0},
+        minimize=frozenset({"base_loss", "reg_loss"}),
+        seed=0,
+    )
+
+    trainer.train(
+        model=param,
+        algorithm=SearchAlgorithm,
+        train_dataset=train_dataset,
+        logger=logger,
+        score_range=[-10, 10],
+        num_epochs=1,
+        num_steps=5,
+        batch_size=1,
+        num_batches=2,
+        verbose=False,
+        guide=guide,
+        objective_config=objective_config,
+        # basic search knobs (keep small for smoke test)
+        num_candidates=4,
+        num_proposals=4,
+        optimizer_kwargs={
+            "objective": "You have a task of guessing two numbers. 
Output x=[x1,x2] and minimize y.", + "memory_size": 10, + }, + ) + + +if __name__ == "__main__": + main() From 270a1b6fcd1d4218aa84b8bee7cd4bc34d58a19c Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 20:17:01 -0400 Subject: [PATCH 09/20] T6 M2: Multi-objective support for BeamsearchAlgorithm + PrioritySearch + 12 integration tests --- .../priority_search/priority_search.py | 144 ++++++- .../algorithms/beamsearch_algorithm.py | 178 +++++--- .../test_trainers_multiobjective.py | 402 ++++++++++++++++++ 3 files changed, 664 insertions(+), 60 deletions(-) create mode 100644 tests/unit_tests/test_trainers_multiobjective.py diff --git a/opto/features/priority_search/priority_search.py b/opto/features/priority_search/priority_search.py index 86bcbd60..828b8837 100644 --- a/opto/features/priority_search/priority_search.py +++ b/opto/features/priority_search/priority_search.py @@ -10,6 +10,10 @@ from opto.trainer.algorithms.basic_algorithms import batchify from opto.features.priority_search.search_template import SearchTemplate, Samples, BatchRollout, save_train_config from opto.features.priority_search.utils import set_module_parameters, remap_update_dict, create_module_from_update_dict, is_module_copy, deepcopy_module +from opto.trainer.objectives import ( + ObjectiveConfig, to_score_dict, apply_minimize, + weighted_scalarize, pareto_rank, aggregate_score_dicts +) class ModuleCandidate: @@ -85,6 +89,19 @@ def mean_score(self): return None return safe_mean([r['score'] for r in self.rollouts]) + def mean_score_dict(self) -> Optional[Dict[str, float]]: + """Compute the per-metric mean score dict across rollouts. + + Returns None if no rollouts have 'score_dict' entries. + """ + if not self.rollouts: + return None + score_dicts = [r.get("score_dict") for r in self.rollouts] + score_dicts = [sd for sd in score_dicts if isinstance(sd, dict)] + if not score_dicts: + return None + return aggregate_score_dicts(score_dicts) + def compute_score_confidence(self, min_score, max_score, scaling_constant=1.0, total_trials=1): """Compute the UCB, mean, LCB score for the candidate. After queried, the number of confidence queries is incremented. @@ -213,6 +230,76 @@ def _criterion(x): return p if p is not None else 0 return max(self.memory, key=lambda x: _criterion(x)) + +class ParetoHeapMemory(HeapMemory): + """Heap-backed memory that can pop using Pareto-front selection among top-K items. + + Keeps scalar priority for push/best efficiency. When mode='pareto', + pop() selects from the Pareto front among the top-K candidates by scalar + priority, with tie-breaking via weighted scalarization. + + push() and best() are inherited unchanged from HeapMemory. + """ + + def __init__(self, size=None, processing_fun: Callable = None, *, + pareto_k: int = 20, + score_dict_fn: Optional[Callable] = None, + objective_config: Optional[ObjectiveConfig] = None): + super().__init__(size=size, processing_fun=processing_fun) + self.pareto_k = pareto_k + self.score_dict_fn = score_dict_fn + self.objective_config = objective_config + + def pop(self): + """Pop a candidate, using Pareto-front selection when configured. + + If objective_config is None, mode != 'pareto', or score_dict_fn is + missing, falls back to standard heapq.heappop (scalar priority). 
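+
+        Sketch of the pareto branch: given score dicts
+            A={"acc": 0.9, "brev": 0.1}, B={"acc": 0.1, "brev": 0.9},
+            C={"acc": 0.05, "brev": 0.05},
+        C is dominated, so pop() picks between A and B on the front and
+        breaks the tie by weighted scalarization (mean of metric values
+        when no weights are configured).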
+ """ + if not self.memory: + raise IndexError("pop from an empty heap memory") + + cfg = self.objective_config + if cfg is None or cfg.mode != "pareto" or self.score_dict_fn is None: + return heapq.heappop(self.memory) + + # Extract top-K by scalar priority (heap stores -score, so nsmallest = highest scores) + k = min(self.pareto_k, len(self.memory)) + topk = heapq.nsmallest(k, self.memory) + candidates = [c for _, c in topk] + + # Get score dicts for each candidate + score_dicts = [] + for c in candidates: + sd = self.score_dict_fn(c) + sd = to_score_dict(sd) if sd is not None else None + score_dicts.append(sd) + + # Fallback to standard pop if any candidate lacks a score dict + if any(sd is None for sd in score_dicts): + return heapq.heappop(self.memory) + + # Apply minimize normalization and compute Pareto ranks + score_dicts = [apply_minimize(sd, cfg.minimize) for sd in score_dicts] + ranks = pareto_rank(score_dicts, metrics=cfg.pareto_metrics) + front_idx = [i for i, r in enumerate(ranks) if r == 0] + + # Tie-break among front by weighted scalarization + def _tie_break(i): + sd = score_dicts[i] + if cfg.weights: + return float(weighted_scalarize(sd, cfg.weights, cfg.missing_value)) + return float(np.mean(list(sd.values()))) + + chosen_local = max(front_idx, key=_tie_break) + chosen_item = topk[chosen_local] + + # Remove chosen item from heap (O(n), acceptable for small K) + self.memory.remove(chosen_item) + heapq.heapify(self.memory) + return chosen_item + + # TODO check saving and loading class PrioritySearch(SearchTemplate): """ A search algorithm that uses a priority queue to explore the parameter space and propose new candidates. @@ -241,6 +328,8 @@ def train(self, guide, # guide to provide feedback train_dataset, # dataset of (x, info) pairs to train the agent *, + # multi-objective + objective_config: Optional[ObjectiveConfig] = None, # ObjectiveConfig for multi-objective selection (None = scalar) # validation validate_dataset = None, # same format as train_dataset; if None, use the current batch. validate_guide = None, # to provide scores for the validation set @@ -318,6 +407,7 @@ def train(self, short_term_memory_size=short_term_memory_size, memory_update_frequency=memory_update_frequency, decouple_optimizers=decouple_optimizers, + objective_config=objective_config, ) self._enforce_using_data_collecting_candidates = True @@ -354,7 +444,8 @@ def _initialize_search_parameters(self, *, long_term_memory_size, short_term_memory_size, memory_update_frequency, - decouple_optimizers): + decouple_optimizers, + objective_config=None): """Initialize search parameters and memory structures. Args: @@ -369,6 +460,7 @@ def _initialize_search_parameters(self, *, short_term_memory_size (int): Size of the short-term memory memory_update_frequency (int): The candidates are merged into long-term memory after this many iterations. decouple_optimizers (bool): Whether to decouple optimizers for each candidate + objective_config (ObjectiveConfig, optional): Multi-objective selection config. None = scalar. """ # Validate and adjust num_candidates based on number of optimizers if num_candidates < len(self._optimizers): @@ -410,8 +502,26 @@ def _initialize_search_parameters(self, *, else: print(f"PrioritySearch initialized with both short-term and long-term memory. 
Candidates will be merged into long-term memory every {memory_update_frequency} iterations.") - self.long_term_memory = HeapMemory(size=long_term_memory_size, processing_fun=self.compress_candidate_memory) - self.short_term_memory = HeapMemory(size=short_term_memory_size) + self.objective_config = objective_config + use_pareto_memory = (objective_config is not None + and objective_config.mode == "pareto") + if use_pareto_memory: + self.long_term_memory = ParetoHeapMemory( + size=long_term_memory_size, + processing_fun=self.compress_candidate_memory, + pareto_k=20, + score_dict_fn=lambda c: c.mean_score_dict(), + objective_config=objective_config, + ) + self.short_term_memory = ParetoHeapMemory( + size=short_term_memory_size, + pareto_k=20, + score_dict_fn=lambda c: c.mean_score_dict(), + objective_config=objective_config, + ) + else: + self.long_term_memory = HeapMemory(size=long_term_memory_size, processing_fun=self.compress_candidate_memory) + self.short_term_memory = HeapMemory(size=short_term_memory_size) self.memory_update_frequency = memory_update_frequency def update(self, @@ -635,6 +745,20 @@ def validate(self, for c, rollouts in matched_candidates_and_samples.items(): # rollouts is a list of BatchRollouts results[c] = [ r for rr in rollouts for r in rr.to_list()] # we only need the list of dicts + # Populate score_dict in each rollout when multi-objective is active + cfg = getattr(self, 'objective_config', None) + if cfg is not None and cfg.mode != "scalar": + guide = self.validate_sampler.guide + for c, rollout_list in results.items(): + for rollout in rollout_list: + try: + sd = guide.get_score_dict( + rollout['x'], rollout['target'], rollout['info'] + ) + rollout['score_dict'] = sd + except Exception: + pass # guide may not support get_score_dict; skip gracefully + return results def match_candidates_and_samples( @@ -769,8 +893,18 @@ def compute_exploration_priority(self, candidate) -> float: """ if not isinstance(candidate, ModuleCandidate): raise TypeError("candidate must be an instance of ModuleCandidate.") - # By default, we compute the mean score of the rollouts + # Multi-objective priority: use weighted scalarization of score_dict when available + if getattr(self, 'objective_config', None) is not None: + sd = candidate.mean_score_dict() + if sd is not None: + cfg = self.objective_config + sd = apply_minimize(to_score_dict(sd), cfg.minimize) + if cfg.weights: + return float(weighted_scalarize(sd, cfg.weights, cfg.missing_value)) + return float(np.mean(list(sd.values()))) + + # Fall through to existing scalar logic if self.score_function == 'mean': # Compute the mean score of the candidate's rollouts return candidate.mean_score() @@ -795,7 +929,7 @@ def compress_candidate_memory(self, candidate: ModuleCandidate) -> ModuleCandida def _process_rollout(rollout): # rollout is a dict containing module, x, info, target, score, feedback for k in rollout: - if k not in ['score']: + if k not in ['score', 'score_dict']: rollout[k] = None candidate = copy.copy(candidate) # make a copy of the candidate to avoid modifying the original one candidate.rollouts = copy.deepcopy(candidate.rollouts) # deep copy the rollouts to avoid modifying the original one diff --git a/opto/trainer/algorithms/beamsearch_algorithm.py b/opto/trainer/algorithms/beamsearch_algorithm.py index 0beac524..fc8352bf 100644 --- a/opto/trainer/algorithms/beamsearch_algorithm.py +++ b/opto/trainer/algorithms/beamsearch_algorithm.py @@ -3,7 +3,9 @@ from typing import Union, List, Tuple, Dict, Any, Optional from 
opto.trainer.utils import async_run, batch_run from opto.optimizers.utils import print_color -from opto.trainer.algorithms.basic_algorithms import MinibatchAlgorithm, evaluate, batchify +from opto.trainer.algorithms.basic_algorithms import MinibatchAlgorithm, evaluate, batchify, _objective_scalar +from opto.trainer.evaluators import evaluate_vector, aggregate_vector_scores +from opto.trainer.objectives import ObjectiveConfig, select_top_k, apply_minimize, weighted_scalarize class BeamsearchAlgorithm(MinibatchAlgorithm): @@ -20,6 +22,7 @@ def train(self, guide, train_dataset, *, + objective_config=None, # ObjectiveConfig for multi-objective selection (None = scalar) validate_dataset=None, # dataset for selecting the best candidates validate_guide=None, # guide for validation validation_dataset_size=5, # size of validation minibatch for each evaluation @@ -39,8 +42,11 @@ def train(self, ): """ Performs beam search to find optimal parameters. - + Args: + objective_config: ObjectiveConfig for multi-objective selection. + None or mode='scalar' uses existing scalar comparison (backward-compatible). + mode='weighted' or 'pareto' enables vector scoring in select(). beam_width: Number of candidates to keep at each level of the beam search num_proposals: Number of proposals to generate per beam candidate max_depth: Maximum depth of the beam search @@ -50,6 +56,7 @@ def train(self, test_frequency: How often to evaluate on test set (every N steps) Other parameters are the same as MinibatchAlgorithm.train() """ + self.objective_config = objective_config self.total_samples = 0 print_color(f"Running BeamsearchAlgorithm with beam_width={beam_width}, max_depth={max_depth}", 'blue') @@ -168,7 +175,18 @@ def train(self, self.logger.log('Average validation score', np.mean(scores), step_num, color='cyan') self.logger.log('Min validation score', min(scores), step_num, color='yellow') self.logger.log('Max validation score', max(scores), step_num, color='magenta') - + + # Log per-metric values when using vector scoring + if (getattr(self, 'objective_config', None) is not None + and self.objective_config.mode != "scalar" + and hasattr(self, '_last_selected_score_dicts') + and self._last_selected_score_dicts): + best_score_dict = self._last_selected_score_dicts[best_idx] + if isinstance(best_score_dict, dict): + for metric_name, metric_val in best_score_dict.items(): + self.logger.log(f'Validation score/{metric_name}', metric_val, + step_num, color='green') + # Evaluate on test set every test_frequency steps if test_dataset is not None and ((depth + 1) % test_frequency == 0): # Update agent with best parameters from this depth @@ -370,7 +388,7 @@ def expand(self, return candidates - def select(self, + def select(self, candidates: List[Dict], validate_guide, validation_mini_dataset, @@ -380,7 +398,7 @@ def select(self, return_scores: bool = False) -> Union[List[Dict], Tuple[List[Dict], List[float]]]: """ Evaluates all candidates and selects the top beam_width candidates based on validation scores. 
- + Args: candidates: List of parameter dictionaries for each candidate validate_guide: Guide for validation @@ -389,63 +407,111 @@ def select(self, num_threads: Number of threads to use min_score: Minimum score when errors occur return_scores: Whether to return scores along with parameters - + Returns: If return_scores is False: List of selected candidates' parameters If return_scores is True: Tuple of (list of parameters, list of scores) """ + use_vector = (getattr(self, 'objective_config', None) is not None + and self.objective_config.mode != "scalar") + # Store current parameters to restore later current_params = {p: copy.deepcopy(p.data) for p in self.optimizer.parameters} - - # List to store (score, params) pairs - scored_candidates = [] - - # Evaluate each candidate - for candidate_idx, candidate_params in enumerate(candidates): - self.optimizer.update(candidate_params) - - # Evaluate on validation minibatch using evaluate function - validation_scores = evaluate( - self.agent, - validate_guide, - validation_mini_dataset['inputs'], - validation_mini_dataset['infos'], - min_score=min_score, - num_threads=num_threads, - description=f"Validating candidate {candidate_idx+1}/{len(candidates)}" - ) - - validation_score = np.mean(validation_scores) if all([s is not None for s in validation_scores]) else -np.inf - scored_candidates.append((validation_score, candidate_params)) - - print_color(f"Candidate {candidate_idx+1}: Validation score: {validation_score:.4f}", 'cyan') - - # Restore original parameters - self.optimizer.update(current_params) - - # Extract scores for logging - scores = [score for score, _ in scored_candidates] - - # If the number of candidates is less than or equal to beam_width, keep all of them - if len(scored_candidates) <= beam_width: - print_color(f"Keeping all {len(scored_candidates)} candidates as num_candidates <= beam_width. Scores: {[f'{s:.4f}' for s in scores]}", 'green') - selected_params = [params for _, params in scored_candidates] + + if use_vector: + # --- Vector path: multi-objective selection --- + scored_candidates = [] # (scalar_score, params) for logging/return + vector_candidates = [] # (score_dict, params) for multi-objective ranking + + for candidate_idx, candidate_params in enumerate(candidates): + self.optimizer.update(candidate_params) + + score_dicts = evaluate_vector( + self.agent, + validate_guide, + validation_mini_dataset['inputs'], + validation_mini_dataset['infos'], + min_score=min_score, + num_threads=num_threads, + description=f"Validating candidate {candidate_idx+1}/{len(candidates)} (vector)" + ) + score_dict = aggregate_vector_scores(score_dicts) + scalar_score = _objective_scalar(score_dict, self.objective_config) + + scored_candidates.append((scalar_score, candidate_params)) + vector_candidates.append((score_dict, candidate_params)) + + print_color(f"Candidate {candidate_idx+1}: Validation score: {scalar_score:.4f} {score_dict}", 'cyan') + + # Restore original parameters + self.optimizer.update(current_params) + + scores = [s for s, _ in scored_candidates] + + if len(scored_candidates) <= beam_width: + print_color(f"Keeping all {len(scored_candidates)} candidates as num_candidates <= beam_width. 
Scores: {[f'{s:.4f}' for s in scores]}", 'green') + selected_params = [params for _, params in scored_candidates] + # Store score_dicts for per-metric logging in train() + self._last_selected_score_dicts = [sd for sd, _ in vector_candidates] + if return_scores: + return selected_params, scores + return selected_params + + # Multi-objective ranking via select_top_k + top_indices = select_top_k(vector_candidates, self.objective_config, k=beam_width) + selected_params = [vector_candidates[i][1] for i in top_indices] + selected_scores = [scores[i] for i in top_indices] + # Store score_dicts for per-metric logging in train() + self._last_selected_score_dicts = [vector_candidates[i][0] for i in top_indices] + + print_color(f"Selected top {beam_width} beams with scores: {[f'{s:.4f}' for s in selected_scores]}", 'green') if return_scores: - return selected_params, scores + return selected_params, selected_scores + return selected_params + + else: + # --- Scalar path: completely unchanged --- + scored_candidates = [] + + for candidate_idx, candidate_params in enumerate(candidates): + self.optimizer.update(candidate_params) + + validation_scores = evaluate( + self.agent, + validate_guide, + validation_mini_dataset['inputs'], + validation_mini_dataset['infos'], + min_score=min_score, + num_threads=num_threads, + description=f"Validating candidate {candidate_idx+1}/{len(candidates)}" + ) + + validation_score = np.mean(validation_scores) if all([s is not None for s in validation_scores]) else -np.inf + scored_candidates.append((validation_score, candidate_params)) + + print_color(f"Candidate {candidate_idx+1}: Validation score: {validation_score:.4f}", 'cyan') + + # Restore original parameters + self.optimizer.update(current_params) + + scores = [score for score, _ in scored_candidates] + + if len(scored_candidates) <= beam_width: + print_color(f"Keeping all {len(scored_candidates)} candidates as num_candidates <= beam_width. Scores: {[f'{s:.4f}' for s in scores]}", 'green') + selected_params = [params for _, params in scored_candidates] + if return_scores: + return selected_params, scores + return selected_params + + sorted_candidates = sorted(scored_candidates, key=lambda x: x[0], reverse=True) + selected_candidates = sorted_candidates[:beam_width] + selected_scores = [score for score, _ in selected_candidates] + selected_params = [params for _, params in selected_candidates] + + print_color(f"Selected top {beam_width} beams with scores: {[f'{s:.4f}' for s in selected_scores]}", 'green') + if return_scores: + return selected_params, selected_scores return selected_params - - # Sort candidates by score (descending) - sorted_candidates = sorted(scored_candidates, key=lambda x: x[0], reverse=True) - - # Select top beam_width candidates - selected_candidates = sorted_candidates[:beam_width] - selected_scores = [score for score, _ in selected_candidates] - selected_params = [params for _, params in selected_candidates] - - print_color(f"Selected top {beam_width} beams with scores: {[f'{s:.4f}' for s in selected_scores]}", 'green') - if return_scores: - return selected_params, selected_scores - return selected_params @@ -464,6 +530,7 @@ def train(self, guide, train_dataset, *, + objective_config=None, # ObjectiveConfig for multi-objective selection (None = scalar) validate_dataset=None, validate_guide=None, validation_dataset_size=5, @@ -487,7 +554,8 @@ def train(self, Default is 1, which keeps only the best candidate. 
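+
+            Example (illustrative): to rank beams by a weighted combination
+            of metrics, pass e.g.
+                objective_config=ObjectiveConfig(
+                    mode="weighted", weights={"accuracy": 0.7, "brevity": 0.3})
+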
Other args are the same as BeamsearchAlgorithm.train() """ - self.total_samples = 0 + self.objective_config = objective_config + self.total_samples = 0 self.min_score = kwargs.get('min_score', 0) print_color(f"Running BeamsearchHistoryAlgorithm with beam_width={beam_width}, max_depth={max_depth}, max_history_size={max_history_size}", 'blue') diff --git a/tests/unit_tests/test_trainers_multiobjective.py b/tests/unit_tests/test_trainers_multiobjective.py new file mode 100644 index 00000000..012a00bd --- /dev/null +++ b/tests/unit_tests/test_trainers_multiobjective.py @@ -0,0 +1,402 @@ +"""Tests for M2 multi-objective support in BeamsearchAlgorithm and PrioritySearch. + +Uses DummyLLM and deterministic guides — no API keys required. +""" +import pytest +import re +import numpy as np +import heapq + +from opto import trace +from opto.trainer.guide import Guide +from opto.trainer.objectives import ObjectiveConfig +from opto.trainer.algorithms.beamsearch_algorithm import ( + BeamsearchAlgorithm, + BeamsearchHistoryAlgorithm, +) +from opto.features.priority_search.priority_search import ( + PrioritySearch, + ModuleCandidate, + HeapMemory, + ParetoHeapMemory, +) +from opto.optimizers import OptoPrimeV2 +from opto.utils.llm import DummyLLM + + +# --------------------------------------------------------------------------- +# Fixtures: Guide, Agent, LLM, Dataset +# --------------------------------------------------------------------------- + +class ScalarGuide(Guide): + """Simple scalar guide: exact-match returns 1.0/0.0.""" + + def get_feedback(self, query, response, reference=None, **kwargs): + score = float(str(response).strip() == str(reference).strip()) + feedback = "Correct" if score == 1.0 else "Incorrect" + return score, feedback + + +class MultiMetricGuide(Guide): + """Multi-metric guide: scalar accuracy from get_feedback, + accuracy + brevity from get_score_dict.""" + + def get_feedback(self, query, response, reference=None, **kwargs): + accuracy = float(str(response).strip() == str(reference).strip()) + feedback = "Correct" if accuracy == 1.0 else "Incorrect" + return accuracy, feedback + + def get_score_dict(self, query, response, reference=None, **kwargs): + accuracy = float(str(response).strip() == str(reference).strip()) + brevity = max(0.0, 1.0 - len(str(response)) / 100.0) + return {"accuracy": accuracy, "brevity": brevity} + + +@trace.model +class StubAgent: + def __init__(self): + self.param = trace.node("default answer", trainable=True) + + def forward(self, x): + return self.param + + +# Simple dataset +DATASET = { + "inputs": ["What is 2+2?", "Capital of France?", "Color of sky?"], + "infos": ["4", "Paris", "blue"], +} + +SUGGESTED_VALUE = "4" + + +def _llm_callable(messages, **kwargs): + """Dummy LLM callable returning a fixed value.""" + problem = messages[1]["content"] + name = re.findall(r'', problem) + name = name[0] if name else "unknown" + return f""" + Dummy reasoning. 
+ + {name} + {SUGGESTED_VALUE} + + """ + + +def _make_beamsearch(): + """Create a BeamsearchAlgorithm instance with DummyLLM.""" + agent = StubAgent() + llm = DummyLLM(_llm_callable) + optimizer = OptoPrimeV2(agent.parameters(), llm=llm) + algo = BeamsearchAlgorithm(agent, optimizer) + return algo + + +def _make_priority_search(): + """Create a PrioritySearch instance with DummyLLM.""" + agent = StubAgent() + llm = DummyLLM(_llm_callable) + optimizer = OptoPrimeV2(agent.parameters(), llm=llm) + algo = PrioritySearch(agent, optimizer) + return algo + + +# --------------------------------------------------------------------------- +# Backward compatibility +# --------------------------------------------------------------------------- + +class TestBackwardCompat: + + def test_beamsearch_scalar_baseline(self): + """BeamsearchAlgorithm with objective_config=None trains and returns scores.""" + algo = _make_beamsearch() + metrics, final_score = algo.train( + guide=ScalarGuide(), + train_dataset=DATASET, + objective_config=None, + beam_width=2, + num_proposals=1, + max_depth=1, + batch_size=1, + num_threads=2, + ) + assert "best_validation_scores" in metrics + assert isinstance(final_score, (int, float)) + + def test_priority_search_scalar_baseline(self): + """PrioritySearch with objective_config=None trains without error.""" + algo = _make_priority_search() + algo.train( + guide=ScalarGuide(), + train_dataset=DATASET, + objective_config=None, + batch_size=1, + num_batches=1, + num_epochs=1, + num_candidates=2, + num_proposals=1, + num_threads=2, + long_term_memory_size=3, + memory_update_frequency=0, + verbose=False, + ) + # If we get here without exception, backward compat is maintained + + +# --------------------------------------------------------------------------- +# Beamsearch selection +# --------------------------------------------------------------------------- + +class TestBeamsearchSelection: + + def test_beamsearch_weighted_mode(self): + """Weighted mode trains and populates _last_selected_score_dicts.""" + algo = _make_beamsearch() + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 0.7, "brevity": 0.3}, + ) + metrics, final_score = algo.train( + guide=MultiMetricGuide(), + train_dataset=DATASET, + objective_config=config, + beam_width=2, + num_proposals=1, + max_depth=1, + batch_size=1, + num_threads=2, + ) + assert isinstance(final_score, (int, float)) + # _last_selected_score_dicts should have been populated by select() + assert hasattr(algo, "_last_selected_score_dicts") + if algo._last_selected_score_dicts: + sd = algo._last_selected_score_dicts[0] + assert isinstance(sd, dict) + assert "accuracy" in sd + assert "brevity" in sd + + def test_beamsearch_pareto_mode(self): + """Pareto mode trains without error.""" + algo = _make_beamsearch() + config = ObjectiveConfig(mode="pareto") + metrics, final_score = algo.train( + guide=MultiMetricGuide(), + train_dataset=DATASET, + objective_config=config, + beam_width=2, + num_proposals=1, + max_depth=1, + batch_size=1, + num_threads=2, + ) + assert isinstance(final_score, (int, float)) + + def test_beamsearch_history_forwards_config(self): + """BeamsearchHistoryAlgorithm accepts and stores objective_config.""" + agent = StubAgent() + llm = DummyLLM(_llm_callable) + optimizer = OptoPrimeV2(agent.parameters(), llm=llm) + algo = BeamsearchHistoryAlgorithm(agent, optimizer) + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 1.0}, + ) + algo.train( + guide=MultiMetricGuide(), + train_dataset=DATASET, + 
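+            # Forwarding check: train() should stash this config on the algorithm.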
objective_config=config, + beam_width=2, + num_proposals=1, + max_depth=1, + batch_size=1, + num_threads=2, + ) + assert algo.objective_config is config + + +# --------------------------------------------------------------------------- +# ParetoHeapMemory unit tests +# --------------------------------------------------------------------------- + +class TestParetoHeapMemory: + + def test_fallback_when_no_config(self): + """When config is None, pop() behaves like standard heappop.""" + phm = ParetoHeapMemory(size=10, pareto_k=5) + phm.memory = [(-3.0, "c3"), (-2.0, "c2"), (-1.0, "c1")] + heapq.heapify(phm.memory) + neg_score, data = phm.pop() + # heappop returns the smallest (most negative = highest score) + assert neg_score == -3.0 + assert data == "c3" + + def test_pareto_pop_selects_from_front(self): + """When config mode='pareto', pop() selects from Pareto front.""" + # Candidate A: good accuracy, bad brevity + # Candidate B: bad accuracy, good brevity + # Candidate C: dominated by both A and B + score_dicts = { + "A": {"accuracy": 0.9, "brevity": 0.1}, + "B": {"accuracy": 0.1, "brevity": 0.9}, + "C": {"accuracy": 0.05, "brevity": 0.05}, + } + + config = ObjectiveConfig( + mode="pareto", + weights={"accuracy": 0.7, "brevity": 0.3}, + ) + phm = ParetoHeapMemory( + size=10, + pareto_k=10, + score_dict_fn=lambda c: score_dicts[c], + objective_config=config, + ) + # Push all three (scalar priority doesn't matter for pareto pop) + phm.memory = [(-0.5, "A"), (-0.5, "B"), (-0.1, "C")] + heapq.heapify(phm.memory) + + neg_score, chosen = phm.pop() + # C is dominated, so chosen must be A or B + assert chosen in ("A", "B"), f"Expected A or B from Pareto front, got {chosen}" + # With weights accuracy=0.7, brevity=0.3: + # A: 0.7*0.9 + 0.3*0.1 = 0.66 + # B: 0.7*0.1 + 0.3*0.9 = 0.34 + # Tie-break by weighted scalarize → A wins + assert chosen == "A" + + def test_missing_score_dict_fallback(self): + """When score_dict_fn returns None, falls back to heappop.""" + config = ObjectiveConfig(mode="pareto") + phm = ParetoHeapMemory( + size=10, + pareto_k=10, + score_dict_fn=lambda c: None, # always returns None + objective_config=config, + ) + phm.memory = [(-5.0, "best"), (-2.0, "mid"), (-1.0, "worst")] + heapq.heapify(phm.memory) + + neg_score, data = phm.pop() + # Falls back to heappop → highest priority (most negative) + assert neg_score == -5.0 + assert data == "best" + + +# --------------------------------------------------------------------------- +# PrioritySearch multi-objective +# --------------------------------------------------------------------------- + +class TestPrioritySearchMultiObjective: + + def test_weighted_priority(self): + """With weighted config, compute_exploration_priority uses weighted scalarization.""" + algo = _make_priority_search() + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 0.6, "brevity": 0.4}, + ) + # Initialize search params to set self.objective_config and self.score_function + algo._initialize_search_parameters( + num_candidates=2, + num_proposals=1, + validate_exploration_candidates=True, + use_best_candidate_to_explore=True, + score_function="mean", + score_range=(0, 1), + ucb_exploration_constant=1.0, + long_term_memory_size=None, + short_term_memory_size=None, + memory_update_frequency=0, + decouple_optimizers=True, + objective_config=config, + ) + + candidate = ModuleCandidate(algo.agent) + # Add rollouts with score_dict + candidate.rollouts = [ + {"module": None, "x": "q", "info": "a", "target": "a", + "score": 0.8, "feedback": "ok", + 
"score_dict": {"accuracy": 0.9, "brevity": 0.7}}, + {"module": None, "x": "q", "info": "a", "target": "a", + "score": 0.6, "feedback": "ok", + "score_dict": {"accuracy": 0.7, "brevity": 0.5}}, + ] + + priority = algo.compute_exploration_priority(candidate) + # Mean score_dict: accuracy=0.8, brevity=0.6 + # Weighted: 0.6*0.8 + 0.4*0.6 = 0.48 + 0.24 = 0.72 + assert isinstance(priority, float) + assert abs(priority - 0.72) < 1e-6, f"Expected ~0.72, got {priority}" + + def test_score_dict_in_rollouts(self): + """After validate(), rollouts contain score_dict entries when multi-objective is active.""" + algo = _make_priority_search() + config = ObjectiveConfig( + mode="weighted", + weights={"accuracy": 0.7, "brevity": 0.3}, + ) + algo.train( + guide=MultiMetricGuide(), + train_dataset=DATASET, + objective_config=config, + batch_size=1, + num_batches=1, + num_epochs=1, + num_candidates=2, + num_proposals=1, + num_threads=2, + long_term_memory_size=3, + memory_update_frequency=0, + verbose=False, + ) + # Check that at least some candidates in memory have score_dict in rollouts + found_score_dict = False + for neg_priority, candidate in algo.long_term_memory: + for rollout in candidate.rollouts: + if "score_dict" in rollout and rollout["score_dict"] is not None: + found_score_dict = True + sd = rollout["score_dict"] + assert "accuracy" in sd + assert "brevity" in sd + break + if found_score_dict: + break + assert found_score_dict, "Expected at least one rollout with score_dict in memory" + + +# --------------------------------------------------------------------------- +# ModuleCandidate.mean_score_dict +# --------------------------------------------------------------------------- + +class TestModuleCandidateMeanScoreDict: + + def test_mean_score_dict(self): + """Returns correct per-metric mean when rollouts have score_dict.""" + agent = StubAgent() + candidate = ModuleCandidate(agent) + candidate.rollouts = [ + {"module": None, "x": "q", "info": "a", "target": "a", + "score": 0.9, "feedback": "ok", + "score_dict": {"accuracy": 1.0, "brevity": 0.8}}, + {"module": None, "x": "q", "info": "a", "target": "a", + "score": 0.7, "feedback": "ok", + "score_dict": {"accuracy": 0.6, "brevity": 0.4}}, + ] + sd = candidate.mean_score_dict() + assert sd is not None + assert abs(sd["accuracy"] - 0.8) < 1e-6 + assert abs(sd["brevity"] - 0.6) < 1e-6 + + def test_mean_score_dict_none_when_no_score_dict(self): + """Returns None when rollouts lack score_dict.""" + agent = StubAgent() + candidate = ModuleCandidate(agent) + candidate.rollouts = [ + {"module": None, "x": "q", "info": "a", "target": "a", + "score": 0.9, "feedback": "ok"}, + {"module": None, "x": "q", "info": "a", "target": "a", + "score": 0.7, "feedback": "ok"}, + ] + sd = candidate.mean_score_dict() + assert sd is None From a888b93736c8c0cdbe092dbe6154a74e25e88651 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 20:41:31 -0400 Subject: [PATCH 10/20] T6 M2: Add validation notebook with convex function multi-objective demo + plots --- examples/notebooks/t6_m2_trainers.ipynb | 678 ++++++++++++++++++++++++ 1 file changed, 678 insertions(+) create mode 100644 examples/notebooks/t6_m2_trainers.ipynb diff --git a/examples/notebooks/t6_m2_trainers.ipynb b/examples/notebooks/t6_m2_trainers.ipynb new file mode 100644 index 00000000..be30a155 --- /dev/null +++ b/examples/notebooks/t6_m2_trainers.ipynb @@ -0,0 +1,678 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "cell-setup", + "metadata": {}, + 
"outputs": [], + "source": [ + "import os, sys\n", + "\n", + "# In Colab: clone and install from GitHub\n", + "# Locally: add repo root to sys.path so opto is importable\n", + "try:\n", + " import google.colab\n", + " IN_COLAB = True\n", + "except ImportError:\n", + " IN_COLAB = False\n", + "\n", + "if IN_COLAB:\n", + " !git clone https://github.com/carlosrod723/OpenTrace.git Trace\n", + " %cd Trace\n", + " !git checkout t6-multi-objective-m0\n", + " !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n", + " !pip install -e .\n", + " !pip install cvxpy matplotlib pandas\n", + "else:\n", + " # Local: ensure repo root is on sys.path\n", + " _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n", + " _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n", + " if _repo_root not in sys.path:\n", + " sys.path.insert(0, _repo_root)\n", + " import opto\n", + " print(f\"Using local opto from: {os.path.dirname(opto.__file__)}\")\n", + "\n", + "# Verify cvxpy is available (required for SixHumpCamel SOS certificate)\n", + "try:\n", + " import cvxpy\n", + " print(f\"cvxpy {cvxpy.__version__} available\")\n", + "except ImportError:\n", + " raise ImportError(\"cvxpy is required: pip install cvxpy\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-title", + "metadata": {}, + "source": [ + "# T6 M2 — BeamsearchAlgorithm & PrioritySearch Multi-Objective\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/pull/61/head/examples/notebooks/t6_m2_trainers.ipynb)\n", + "\n", + "**Milestone 2 Deliverable** — Multi-objective support in BeamsearchAlgorithm and PrioritySearch\n", + "\n", + "This notebook demonstrates M2 using Allen's SixHumpCamel convex function environment:\n", + "1. **BasicSearch** (M1 baseline): scalar, weighted, and Pareto modes\n", + "2. **BeamsearchAlgorithm** (M2): weighted mode with `select()` vector path\n", + "3. **PrioritySearch** (M2): weighted and Pareto modes with `ParetoHeapMemory`\n", + "\n", + "All cells run in **StubLLM mode** (no API keys required). 
Uses `DummyLLM` for deterministic training.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "cell-checklist", + "metadata": {}, + "source": [ + "## How to Validate This Milestone\n", + "\n", + "After running all cells, confirm:\n", + "- [ ] BasicSearch trains with scalar, weighted, and Pareto `objective_config` (M1 baseline)\n", + "- [ ] BeamsearchAlgorithm trains with weighted `objective_config` and populates `_last_selected_score_dicts`\n", + "- [ ] PrioritySearch trains with weighted and Pareto `objective_config`\n", + "- [ ] PrioritySearch memory contains candidates with `score_dict` in rollouts\n", + "- [ ] Score progression graph shows per-mode validation curves\n", + "- [ ] Comparison scatter shows `base_loss` vs `reg_loss` for all algorithm+mode combos\n", + "- [ ] Summary table aggregates results across all runs\n", + "- [ ] Backward compatibility: `objective_config=None` trains identically to pre-M2 behavior" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-imports", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import copy\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from typing import Tuple, Dict\n", + "\n", + "from opto import trace\n", + "from opto.trainer.guide import Guide\n", + "from opto.trainer.objectives import ObjectiveConfig\n", + "from opto.utils.llm import DummyLLM\n", + "from opto.optimizers import OptoPrimeV2\n", + "from opto.trainer.algorithms.basic_algorithms import BasicSearchAlgorithm\n", + "from opto.trainer.algorithms.beamsearch_algorithm import BeamsearchAlgorithm\n", + "from opto.features.priority_search.priority_search import PrioritySearch\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"T6 M2 — BeamsearchAlgorithm & PrioritySearch Multi-Objective\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "id": "cell-parta-header", + "metadata": {}, + "source": [ + "---\n", + "## Part A: SixHumpCamel Convex Function — All Three Algorithms\n", + "\n", + "Uses Allen's `SixHumpCamel` loss landscape with L2-squared regularization.\n", + "Two objectives (both to **minimize**):\n", + "- `base_loss`: the six-hump camel function value\n", + "- `reg_loss`: the L2-squared regularization term\n", + "\n", + "Known global optima near `[0.0898, -0.7126]` and `[-0.0898, 0.7126]`.\n", + "\n", + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-setup-env", + "metadata": {}, + "outputs": [], + "source": [ + "# Import SixHumpCamel from Allen's example file\n", + "_examples_dir = os.path.join(_repo_root, 'examples')\n", + "if _examples_dir not in sys.path:\n", + " sys.path.insert(0, _examples_dir)\n", + "from multi_objective_convex_fn import SixHumpCamel\n", + "\n", + "\n", + "# --- RewardGuide (defined here with correct copy import) ---\n", + "class RewardGuide(Guide):\n", + " \"\"\"Multi-objective guide for convex function environments.\n", + "\n", + " get_feedback() -> (float, str): advances real env (training loop).\n", + " get_score_dict() -> Dict[str, float]: uses deepcopy (selection path).\n", + " \"\"\"\n", + "\n", + " def __init__(self, env):\n", + " self.env = env\n", + "\n", + " def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n", + " obs, reward, done, info = self.env.step(str(response))\n", + " feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n", + " return float(reward), feedback\n", + "\n", + " def get_score_dict(self, 
query, response, reference=None, **kwargs) -> Dict[str, float]:\n", + " env_copy = copy.deepcopy(self.env)\n", + " obs, reward, done, info = env_copy.step(str(response))\n", + " base_loss = info.get(\"base_loss\")\n", + " reg_loss = info.get(\"reg_loss\")\n", + " if base_loss is None or reg_loss is None:\n", + " base_loss = float(\"inf\")\n", + " reg_loss = float(\"inf\")\n", + " return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n", + "\n", + "\n", + "# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n", + "@trace.model\n", + "class ConvexAgent:\n", + " def __init__(self, initial_value):\n", + " self.param = trace.node(\n", + " initial_value, trainable=True,\n", + " description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.param\n", + "\n", + "\n", + "# --- DummyLLM callable: proposes x = [float, float] values ---\n", + "class ConvexLLMCallable:\n", + " \"\"\"Returns cycling proposals near the SixHumpCamel optima.\"\"\"\n", + "\n", + " PROPOSALS = [\n", + " \"x = [0.1, -0.7]\", # near optimum 1\n", + " \"x = [-0.1, 0.7]\", # near optimum 2\n", + " \"x = [0.5, -0.3]\", # moderate\n", + " \"x = [0.09, -0.71]\", # very close to optimum 1\n", + " ]\n", + "\n", + " def __init__(self):\n", + " self.idx = 0\n", + "\n", + " def __call__(self, messages, **kwargs):\n", + " problem = messages[1][\"content\"]\n", + " name = re.findall(r'', problem)\n", + " name = name[0] if name else \"unknown\"\n", + " value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n", + " self.idx += 1\n", + " return (\n", + " f\" Exploring the loss landscape near known optima. \\n\"\n", + " f\"\\n\"\n", + " f\" {name} \\n\"\n", + " f\" {value} \\n\"\n", + " f\"\"\n", + " )\n", + "\n", + "\n", + "# --- Factory: create fresh env + agent + optimizer + guide per run ---\n", + "DATASET = dict(inputs=[None], infos=[None])\n", + "\n", + "\n", + "def make_basicsearch_run():\n", + " env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n", + " env.reset(seed=42)\n", + " guide = RewardGuide(env)\n", + " agent = ConvexAgent(\"x = [0.0, 0.0]\")\n", + " llm = DummyLLM(ConvexLLMCallable())\n", + " optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n", + " algo = BasicSearchAlgorithm(agent, optimizer)\n", + " return algo, guide\n", + "\n", + "\n", + "def make_beamsearch_run():\n", + " env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n", + " env.reset(seed=42)\n", + " guide = RewardGuide(env)\n", + " agent = ConvexAgent(\"x = [0.0, 0.0]\")\n", + " llm = DummyLLM(ConvexLLMCallable())\n", + " optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n", + " algo = BeamsearchAlgorithm(agent, optimizer)\n", + " return algo, guide\n", + "\n", + "\n", + "def make_priority_search_run():\n", + " env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n", + " env.reset(seed=42)\n", + " guide = RewardGuide(env)\n", + " agent = ConvexAgent(\"x = [0.0, 0.0]\")\n", + " llm = DummyLLM(ConvexLLMCallable())\n", + " optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n", + " algo = PrioritySearch(agent, optimizer)\n", + " return algo, guide\n", + "\n", + "\n", + "# Objective configs\n", + "CONFIG_WEIGHTED = ObjectiveConfig(\n", + " mode=\"weighted\",\n", + " weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n", + " minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n", + " seed=0,\n", + ")\n", + "\n", + "CONFIG_PARETO = ObjectiveConfig(\n", + " mode=\"pareto\",\n", + " weights={\"base_loss\": 0.7, \"reg_loss\": 
0.3},\n", + " minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n", + " tie_break=\"weighted\",\n", + " seed=42,\n", + ")\n", + "\n", + "# Results collector\n", + "results = {}\n", + "print(\"Setup complete. SixHumpCamel environment + DummyLLM ready.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-basicsearch-runs", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# BasicSearch: scalar, weighted, pareto (M1 baseline for comparison)\n", + "# =====================================================================\n", + "BASIC_KWARGS = dict(\n", + " train_dataset=DATASET,\n", + " num_proposals=2,\n", + " num_epochs=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + ")\n", + "\n", + "# --- Scalar ---\n", + "print(\"=\" * 60)\n", + "print(\"BasicSearch: SCALAR mode\")\n", + "print(\"=\" * 60)\n", + "algo_bs_scalar, guide_bs_scalar = make_basicsearch_run()\n", + "scores_bs_scalar, test_bs_scalar = algo_bs_scalar.train(\n", + " guide=guide_bs_scalar, objective_config=None, **BASIC_KWARGS\n", + ")\n", + "results['BasicSearch/scalar'] = {\n", + " 'val_scores': scores_bs_scalar,\n", + " 'final_score': test_bs_scalar,\n", + " 'score_dict': getattr(algo_bs_scalar, 'current_score_dict', None),\n", + "}\n", + "print(f\"\\nScalar validation scores: {scores_bs_scalar}\")\n", + "print(f\"Final score: {test_bs_scalar}\")\n", + "\n", + "# --- Weighted ---\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"BasicSearch: WEIGHTED mode\")\n", + "print(\"=\" * 60)\n", + "algo_bs_weighted, guide_bs_weighted = make_basicsearch_run()\n", + "scores_bs_weighted, test_bs_weighted = algo_bs_weighted.train(\n", + " guide=guide_bs_weighted, objective_config=CONFIG_WEIGHTED, **BASIC_KWARGS\n", + ")\n", + "results['BasicSearch/weighted'] = {\n", + " 'val_scores': scores_bs_weighted,\n", + " 'final_score': test_bs_weighted,\n", + " 'score_dict': getattr(algo_bs_weighted, 'current_score_dict', None),\n", + "}\n", + "print(f\"\\nWeighted validation scores: {scores_bs_weighted}\")\n", + "print(f\"Final score: {test_bs_weighted}\")\n", + "print(f\"Score dict: {algo_bs_weighted.current_score_dict}\")\n", + "\n", + "# --- Pareto ---\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"BasicSearch: PARETO mode\")\n", + "print(\"=\" * 60)\n", + "algo_bs_pareto, guide_bs_pareto = make_basicsearch_run()\n", + "scores_bs_pareto, test_bs_pareto = algo_bs_pareto.train(\n", + " guide=guide_bs_pareto, objective_config=CONFIG_PARETO, **BASIC_KWARGS\n", + ")\n", + "results['BasicSearch/pareto'] = {\n", + " 'val_scores': scores_bs_pareto,\n", + " 'final_score': test_bs_pareto,\n", + " 'score_dict': getattr(algo_bs_pareto, 'current_score_dict', None),\n", + "}\n", + "print(f\"\\nPareto validation scores: {scores_bs_pareto}\")\n", + "print(f\"Final score: {test_bs_pareto}\")\n", + "print(f\"Score dict: {algo_bs_pareto.current_score_dict}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-score-progression", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# Graph 1: Score Progression — BasicSearch across modes\n", + "# =====================================================================\n", + "fig, ax = plt.subplots(1, 1, figsize=(8, 5))\n", + "\n", + "for label, marker, color in [\n", + " ('BasicSearch/scalar', 'o', '#1f77b4'),\n", + " ('BasicSearch/weighted', 's', '#ff7f0e'),\n", + " ('BasicSearch/pareto', '^', 
'#2ca02c'),\n", + "]:\n", + " data = results.get(label, {})\n", + " val_scores = data.get('val_scores', [])\n", + " if val_scores:\n", + " ax.plot(range(len(val_scores)), val_scores,\n", + " marker=marker, color=color, label=label, linewidth=2, markersize=6)\n", + "\n", + "ax.set_xlabel('Step', fontsize=12)\n", + "ax.set_ylabel('Validation Score (scalar)', fontsize=12)\n", + "ax.set_title('BasicSearch Score Progression — SixHumpCamel', fontsize=14)\n", + "ax.legend(fontsize=10)\n", + "ax.grid(True, alpha=0.3)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"Graph 1: Score progression for BasicSearch across all three objective modes.\")\n", + "print(\"With DummyLLM, scores depend on the cycling proposals near optima.\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-partb-header", + "metadata": {}, + "source": [ + "---\n", + "## Part B: BeamsearchAlgorithm (M2)\n", + "\n", + "Tests the `select()` method's vector scoring path:\n", + "- `evaluate_vector()` computes per-metric scores for each beam candidate\n", + "- `select_top_k()` ranks candidates using the `ObjectiveConfig`\n", + "- Per-metric validation scores are logged (e.g., `Validation score/base_loss`)\n", + "- `_last_selected_score_dicts` stores the score_dicts from the final selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-beamsearch-run", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# BeamsearchAlgorithm: weighted mode\n", + "# =====================================================================\n", + "print(\"=\" * 60)\n", + "print(\"Beamsearch: WEIGHTED mode\")\n", + "print(\"=\" * 60)\n", + "\n", + "algo_beam_w, guide_beam_w = make_beamsearch_run()\n", + "metrics_beam_w, final_beam_w = algo_beam_w.train(\n", + " guide=guide_beam_w,\n", + " train_dataset=DATASET,\n", + " objective_config=CONFIG_WEIGHTED,\n", + " beam_width=2,\n", + " num_proposals=2,\n", + " max_depth=1,\n", + " batch_size=1,\n", + " num_threads=1,\n", + ")\n", + "\n", + "# Extract results\n", + "beam_score_dicts = getattr(algo_beam_w, '_last_selected_score_dicts', None)\n", + "beam_best_sd = beam_score_dicts[0] if beam_score_dicts else None\n", + "\n", + "results['Beamsearch/weighted'] = {\n", + " 'val_scores': metrics_beam_w.get('best_validation_scores', []),\n", + " 'final_score': final_beam_w,\n", + " 'score_dict': beam_best_sd,\n", + "}\n", + "\n", + "print(f\"\\nValidation scores by depth: {metrics_beam_w.get('best_validation_scores', [])}\")\n", + "print(f\"Final test score: {final_beam_w}\")\n", + "print(f\"_last_selected_score_dicts: {beam_score_dicts}\")\n", + "print(f\"Best candidate score_dict: {beam_best_sd}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-partc-header", + "metadata": {}, + "source": [ + "---\n", + "## Part C: PrioritySearch (M2)\n", + "\n", + "Tests the full PrioritySearch multi-objective pipeline:\n", + "- `validate()` populates `score_dict` in rollouts via `guide.get_score_dict()`\n", + "- `ModuleCandidate.mean_score_dict()` aggregates per-metric means across rollouts\n", + "- `compute_exploration_priority()` uses weighted scalarization when `objective_config` is set\n", + "- **Weighted mode:** `HeapMemory` with weighted scalar priority\n", + "- **Pareto mode:** `ParetoHeapMemory` with Pareto-front-aware `pop()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-priority-weighted", + "metadata": {}, + "outputs": [], + "source": [ + "# 
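Before running the PrioritySearch cells, it helps to see what the two selection rules do with loss-style (minimized) metrics. This is an illustrative sketch only; the helper names `_oriented`, `weighted_score`, `dominates`, and `pareto_front` are hypothetical and not the `opto.trainer.objectives` API, but the logic mirrors the `minimize` / `weights` / `tie_break="weighted"` semantics of `CONFIG_PARETO` above:

```python
from typing import Dict, FrozenSet, List

def _oriented(sd: Dict[str, float], minimize: FrozenSet[str]) -> Dict[str, float]:
    # Flip minimized metrics so that larger-is-better holds for every key.
    return {k: (-v if k in minimize else v) for k, v in sd.items()}

def weighted_score(sd, weights, minimize=frozenset()):
    o = _oriented(sd, minimize)
    return sum(weights.get(k, 0.0) * v for k, v in o.items())

def dominates(a: Dict[str, float], b: Dict[str, float]) -> bool:
    # a dominates b iff a is no worse on every metric and strictly better on one.
    return all(a[k] >= b[k] for k in a) and any(a[k] > b[k] for k in a)

def pareto_front(score_dicts: List[Dict[str, float]], minimize=frozenset()):
    o = [_oriented(sd, minimize) for sd in score_dicts]
    return [i for i, a in enumerate(o)
            if not any(dominates(b, a) for j, b in enumerate(o) if j != i)]

mins = frozenset({"base_loss", "reg_loss"})
cands = [
    {"base_loss": 0.2, "reg_loss": 0.8},   # strong on base_loss
    {"base_loss": 0.8, "reg_loss": 0.2},   # strong on reg_loss
    {"base_loss": 0.9, "reg_loss": 0.9},   # dominated by both others
]
front = pareto_front(cands, minimize=mins)                          # -> [0, 1]
w = {"base_loss": 0.7, "reg_loss": 0.3}                             # CONFIG_PARETO weights
best = max(front, key=lambda i: weighted_score(cands[i], w, mins))  # -> 0
```

The dominated point never enters the front; the weighted scalarization is applied only to break ties among non-dominated candidates, which is why index 0 (the low-`base_loss` candidate, with weight 0.7 on `base_loss`) wins here.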
=====================================================================\n", + "# PrioritySearch: weighted mode\n", + "# =====================================================================\n", + "print(\"=\" * 60)\n", + "print(\"PrioritySearch: WEIGHTED mode\")\n", + "print(\"=\" * 60)\n", + "\n", + "algo_ps_w, guide_ps_w = make_priority_search_run()\n", + "algo_ps_w.train(\n", + " guide=guide_ps_w,\n", + " train_dataset=DATASET,\n", + " objective_config=CONFIG_WEIGHTED,\n", + " batch_size=1,\n", + " num_batches=1,\n", + " num_epochs=1,\n", + " num_candidates=2,\n", + " num_proposals=1,\n", + " num_threads=1,\n", + " long_term_memory_size=5,\n", + " memory_update_frequency=0,\n", + " verbose=False,\n", + ")\n", + "\n", + "# Extract best candidate from memory\n", + "ps_w_sd = None\n", + "ps_w_final = None\n", + "if hasattr(algo_ps_w, 'long_term_memory') and algo_ps_w.long_term_memory:\n", + " best_neg, best_cand = min(algo_ps_w.long_term_memory, key=lambda x: x[0])\n", + " ps_w_sd = best_cand.mean_score_dict()\n", + " ps_w_final = float(-best_neg)\n", + " print(f\"\\nBest candidate priority: {ps_w_final:.4f}\")\n", + " print(f\"Best candidate mean_score_dict: {ps_w_sd}\")\n", + " print(f\"Number of rollouts: {len(best_cand.rollouts)}\")\n", + " # Check that rollouts have score_dict\n", + " has_sd = any('score_dict' in r and r['score_dict'] is not None\n", + " for r in best_cand.rollouts)\n", + " print(f\"Rollouts contain score_dict: {has_sd}\")\n", + "else:\n", + " print(\"No candidates in long_term_memory\")\n", + "\n", + "results['PrioritySearch/weighted'] = {\n", + " 'val_scores': [], # PrioritySearch doesn't return per-step scores\n", + " 'final_score': ps_w_final,\n", + " 'score_dict': ps_w_sd,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-priority-pareto", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# PrioritySearch: Pareto mode (uses ParetoHeapMemory)\n", + "# =====================================================================\n", + "print(\"=\" * 60)\n", + "print(\"PrioritySearch: PARETO mode\")\n", + "print(\"=\" * 60)\n", + "\n", + "algo_ps_p, guide_ps_p = make_priority_search_run()\n", + "algo_ps_p.train(\n", + " guide=guide_ps_p,\n", + " train_dataset=DATASET,\n", + " objective_config=CONFIG_PARETO,\n", + " batch_size=1,\n", + " num_batches=1,\n", + " num_epochs=1,\n", + " num_candidates=2,\n", + " num_proposals=1,\n", + " num_threads=1,\n", + " long_term_memory_size=5,\n", + " memory_update_frequency=0,\n", + " verbose=False,\n", + ")\n", + "\n", + "# Extract best candidate from memory\n", + "ps_p_sd = None\n", + "ps_p_final = None\n", + "if hasattr(algo_ps_p, 'long_term_memory') and algo_ps_p.long_term_memory:\n", + " # Check the memory type (should be ParetoHeapMemory)\n", + " mem_type = type(algo_ps_p.long_term_memory).__name__\n", + " print(f\"Memory type: {mem_type}\")\n", + "\n", + " best_neg_p, best_cand_p = min(algo_ps_p.long_term_memory, key=lambda x: x[0])\n", + " ps_p_sd = best_cand_p.mean_score_dict()\n", + " ps_p_final = float(-best_neg_p)\n", + " print(f\"\\nBest candidate priority: {ps_p_final:.4f}\")\n", + " print(f\"Best candidate mean_score_dict: {ps_p_sd}\")\n", + " print(f\"Number of rollouts: {len(best_cand_p.rollouts)}\")\n", + " has_sd_p = any('score_dict' in r and r['score_dict'] is not None\n", + " for r in best_cand_p.rollouts)\n", + " print(f\"Rollouts contain score_dict: {has_sd_p}\")\n", + "\n", + " # Show all candidates 
in memory\n", + " print(f\"\\nAll candidates in memory ({len(algo_ps_p.long_term_memory)}):\")\n", + " for neg_p, cand in sorted(algo_ps_p.long_term_memory, key=lambda x: x[0]):\n", + " sd = cand.mean_score_dict()\n", + " print(f\" priority={-neg_p:.4f}, score_dict={sd}\")\n", + "else:\n", + " print(\"No candidates in long_term_memory\")\n", + "\n", + "results['PrioritySearch/pareto'] = {\n", + " 'val_scores': [],\n", + " 'final_score': ps_p_final,\n", + " 'score_dict': ps_p_sd,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-scatter", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# Graph 2: Comparison Scatter — base_loss vs reg_loss\n", + "# =====================================================================\n", + "fig, ax = plt.subplots(1, 1, figsize=(8, 6))\n", + "\n", + "markers = {\n", + " 'BasicSearch': 'o',\n", + " 'Beamsearch': 's',\n", + " 'PrioritySearch': '^',\n", + "}\n", + "colors = {\n", + " 'scalar': '#1f77b4',\n", + " 'weighted': '#ff7f0e',\n", + " 'pareto': '#2ca02c',\n", + "}\n", + "\n", + "for run_name, run_data in results.items():\n", + " sd = run_data.get('score_dict')\n", + " if sd is None or 'base_loss' not in sd or 'reg_loss' not in sd:\n", + " continue\n", + " algo_name, mode_name = run_name.split('/')\n", + " ax.scatter(\n", + " sd['base_loss'], sd['reg_loss'],\n", + " marker=markers.get(algo_name, 'x'),\n", + " color=colors.get(mode_name, 'gray'),\n", + " s=120, edgecolors='black', linewidths=0.8,\n", + " label=run_name, zorder=5,\n", + " )\n", + "\n", + "ax.set_xlabel('base_loss (lower is better)', fontsize=12)\n", + "ax.set_ylabel('reg_loss (lower is better)', fontsize=12)\n", + "ax.set_title('Multi-Objective Comparison — base_loss vs reg_loss', fontsize=14)\n", + "ax.legend(fontsize=9, loc='upper right')\n", + "ax.grid(True, alpha=0.3)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"Graph 2: Each point represents the best candidate's score_dict from a training run.\")\n", + "print(\"Ideal candidates are in the bottom-left (low base_loss AND low reg_loss).\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-summary", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# Summary Table\n", + "# =====================================================================\n", + "rows = []\n", + "for run_name, run_data in results.items():\n", + " algo_name, mode_name = run_name.split('/')\n", + " sd = run_data.get('score_dict')\n", + " rows.append({\n", + " 'Algorithm': algo_name,\n", + " 'Mode': mode_name,\n", + " 'Final Scalar Score': f\"{run_data.get('final_score', 'N/A')}\",\n", + " 'base_loss': f\"{sd['base_loss']:.4f}\" if sd and 'base_loss' in sd else 'N/A',\n", + " 'reg_loss': f\"{sd['reg_loss']:.4f}\" if sd and 'reg_loss' in sd else 'N/A',\n", + " })\n", + "\n", + "df = pd.DataFrame(rows)\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"SUMMARY: Multi-Objective Training Results\")\n", + "print(\"=\" * 70)\n", + "print(df.to_string(index=False))\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"M2 NOTEBOOK COMPLETE\")\n", + "print(\"=\" * 70)\n", + "print(\"\"\"\n", + "Deliverables verified:\n", + " Part A (BasicSearch): scalar, weighted, Pareto modes on SixHumpCamel\n", + " - Backward compatible (objective_config=None)\n", + " - Weighted mode populates current_score_dict\n", + " - Pareto mode selects from 
non-dominated front\n", + "\n", + " Part B (BeamsearchAlgorithm): weighted mode with vector select()\n", + " - evaluate_vector() computes per-metric scores for beam candidates\n", + " - select_top_k() ranks candidates via ObjectiveConfig\n", + " - _last_selected_score_dicts populated for per-metric logging\n", + "\n", + " Part C (PrioritySearch): weighted + Pareto modes\n", + " - validate() populates score_dict in rollouts\n", + " - mean_score_dict() aggregates per-metric means\n", + " - compute_exploration_priority() uses weighted scalarization\n", + " - ParetoHeapMemory used in Pareto mode\n", + " - Rollouts contain score_dict entries\n", + "\"\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From bdd5260f1fef9926a5b65a83a7a91e5ea2ec38dc Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 20:45:58 -0400 Subject: [PATCH 11/20] fix: define _repo_root in Colab setup cell for notebook --- examples/notebooks/t6_m2_trainers.ipynb | 37 ++----------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/examples/notebooks/t6_m2_trainers.ipynb b/examples/notebooks/t6_m2_trainers.ipynb index be30a155..2e6d955a 100644 --- a/examples/notebooks/t6_m2_trainers.ipynb +++ b/examples/notebooks/t6_m2_trainers.ipynb @@ -6,40 +6,7 @@ "id": "cell-setup", "metadata": {}, "outputs": [], - "source": [ - "import os, sys\n", - "\n", - "# In Colab: clone and install from GitHub\n", - "# Locally: add repo root to sys.path so opto is importable\n", - "try:\n", - " import google.colab\n", - " IN_COLAB = True\n", - "except ImportError:\n", - " IN_COLAB = False\n", - "\n", - "if IN_COLAB:\n", - " !git clone https://github.com/carlosrod723/OpenTrace.git Trace\n", - " %cd Trace\n", - " !git checkout t6-multi-objective-m0\n", - " !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n", - " !pip install -e .\n", - " !pip install cvxpy matplotlib pandas\n", - "else:\n", - " # Local: ensure repo root is on sys.path\n", - " _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n", - " _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n", - " if _repo_root not in sys.path:\n", - " sys.path.insert(0, _repo_root)\n", - " import opto\n", - " print(f\"Using local opto from: {os.path.dirname(opto.__file__)}\")\n", - "\n", - "# Verify cvxpy is available (required for SixHumpCamel SOS certificate)\n", - "try:\n", - " import cvxpy\n", - " print(f\"cvxpy {cvxpy.__version__} available\")\n", - "except ImportError:\n", - " raise ImportError(\"cvxpy is required: pip install cvxpy\")" - ] + "source": "import os, sys\n\n# In Colab: clone and install from GitHub\n# Locally: add repo root to sys.path so opto is importable\ntry:\n import google.colab\n IN_COLAB = True\nexcept ImportError:\n IN_COLAB = False\n\nif IN_COLAB:\n !git clone https://github.com/carlosrod723/OpenTrace.git Trace\n %cd Trace\n !git checkout t6-multi-objective-m0\n !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n !pip install -e .\n !pip install cvxpy matplotlib pandas\n _repo_root = os.getcwd() # /content/Trace after %cd\nelse:\n # Local: ensure repo root is on sys.path\n _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n 
import opto\n print(f\"Using local opto from: {os.path.dirname(opto.__file__)}\")\n\nprint(f\"Repo root: {_repo_root}\")\n\n# Verify cvxpy is available (required for SixHumpCamel SOS certificate)\ntry:\n import cvxpy\n print(f\"cvxpy {cvxpy.__version__} available\")\nexcept ImportError:\n raise ImportError(\"cvxpy is required: pip install cvxpy\")" }, { "cell_type": "markdown", @@ -675,4 +642,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file From 94a17312f588df6d6a36995581ed897e685e904b Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 20:50:27 -0400 Subject: [PATCH 12/20] fix: reset Colab working directory before clone in setup cell --- examples/notebooks/t6_m2_trainers.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks/t6_m2_trainers.ipynb b/examples/notebooks/t6_m2_trainers.ipynb index 2e6d955a..83fa4827 100644 --- a/examples/notebooks/t6_m2_trainers.ipynb +++ b/examples/notebooks/t6_m2_trainers.ipynb @@ -6,7 +6,7 @@ "id": "cell-setup", "metadata": {}, "outputs": [], - "source": "import os, sys\n\n# In Colab: clone and install from GitHub\n# Locally: add repo root to sys.path so opto is importable\ntry:\n import google.colab\n IN_COLAB = True\nexcept ImportError:\n IN_COLAB = False\n\nif IN_COLAB:\n !git clone https://github.com/carlosrod723/OpenTrace.git Trace\n %cd Trace\n !git checkout t6-multi-objective-m0\n !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n !pip install -e .\n !pip install cvxpy matplotlib pandas\n _repo_root = os.getcwd() # /content/Trace after %cd\nelse:\n # Local: ensure repo root is on sys.path\n _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n import opto\n print(f\"Using local opto from: {os.path.dirname(opto.__file__)}\")\n\nprint(f\"Repo root: {_repo_root}\")\n\n# Verify cvxpy is available (required for SixHumpCamel SOS certificate)\ntry:\n import cvxpy\n print(f\"cvxpy {cvxpy.__version__} available\")\nexcept ImportError:\n raise ImportError(\"cvxpy is required: pip install cvxpy\")" + "source": "import os, sys\n\n# In Colab: clone and install from GitHub\n# Locally: add repo root to sys.path so opto is importable\ntry:\n import google.colab\n IN_COLAB = True\nexcept ImportError:\n IN_COLAB = False\n\nif IN_COLAB:\n %cd /content\n !rm -rf Trace # clean slate\n !git clone https://github.com/carlosrod723/OpenTrace.git Trace\n %cd Trace\n !git checkout t6-multi-objective-m0\n !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n !pip install -e .\n !pip install cvxpy matplotlib pandas\n _repo_root = os.getcwd() # /content/Trace after %cd\nelse:\n # Local: ensure repo root is on sys.path\n _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n import opto\n print(f\"Using local opto from: {os.path.dirname(opto.__file__)}\")\n\nprint(f\"Repo root: {_repo_root}\")\n\n# Verify cvxpy is available (required for SixHumpCamel SOS certificate)\ntry:\n import cvxpy\n print(f\"cvxpy {cvxpy.__version__} available\")\nexcept ImportError:\n raise ImportError(\"cvxpy is required: pip install cvxpy\")" }, { "cell_type": "markdown", From a47cdc1fa0011e980e3d3ae8e744d310043d5783 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 
21:03:11 -0400 Subject: [PATCH 13/20] fix: improve notebook charts - more training steps, proper loss evaluation, better plots --- examples/notebooks/t6_m2_trainers.ipynb | 456 +----------------------- 1 file changed, 8 insertions(+), 448 deletions(-) diff --git a/examples/notebooks/t6_m2_trainers.ipynb b/examples/notebooks/t6_m2_trainers.ipynb index 83fa4827..c9082a1d 100644 --- a/examples/notebooks/t6_m2_trainers.ipynb +++ b/examples/notebooks/t6_m2_trainers.ipynb @@ -99,140 +99,7 @@ "id": "cell-setup-env", "metadata": {}, "outputs": [], - "source": [ - "# Import SixHumpCamel from Allen's example file\n", - "_examples_dir = os.path.join(_repo_root, 'examples')\n", - "if _examples_dir not in sys.path:\n", - " sys.path.insert(0, _examples_dir)\n", - "from multi_objective_convex_fn import SixHumpCamel\n", - "\n", - "\n", - "# --- RewardGuide (defined here with correct copy import) ---\n", - "class RewardGuide(Guide):\n", - " \"\"\"Multi-objective guide for convex function environments.\n", - "\n", - " get_feedback() -> (float, str): advances real env (training loop).\n", - " get_score_dict() -> Dict[str, float]: uses deepcopy (selection path).\n", - " \"\"\"\n", - "\n", - " def __init__(self, env):\n", - " self.env = env\n", - "\n", - " def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n", - " obs, reward, done, info = self.env.step(str(response))\n", - " feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n", - " return float(reward), feedback\n", - "\n", - " def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n", - " env_copy = copy.deepcopy(self.env)\n", - " obs, reward, done, info = env_copy.step(str(response))\n", - " base_loss = info.get(\"base_loss\")\n", - " reg_loss = info.get(\"reg_loss\")\n", - " if base_loss is None or reg_loss is None:\n", - " base_loss = float(\"inf\")\n", - " reg_loss = float(\"inf\")\n", - " return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n", - "\n", - "\n", - "# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n", - "@trace.model\n", - "class ConvexAgent:\n", - " def __init__(self, initial_value):\n", - " self.param = trace.node(\n", - " initial_value, trainable=True,\n", - " description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.param\n", - "\n", - "\n", - "# --- DummyLLM callable: proposes x = [float, float] values ---\n", - "class ConvexLLMCallable:\n", - " \"\"\"Returns cycling proposals near the SixHumpCamel optima.\"\"\"\n", - "\n", - " PROPOSALS = [\n", - " \"x = [0.1, -0.7]\", # near optimum 1\n", - " \"x = [-0.1, 0.7]\", # near optimum 2\n", - " \"x = [0.5, -0.3]\", # moderate\n", - " \"x = [0.09, -0.71]\", # very close to optimum 1\n", - " ]\n", - "\n", - " def __init__(self):\n", - " self.idx = 0\n", - "\n", - " def __call__(self, messages, **kwargs):\n", - " problem = messages[1][\"content\"]\n", - " name = re.findall(r'', problem)\n", - " name = name[0] if name else \"unknown\"\n", - " value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n", - " self.idx += 1\n", - " return (\n", - " f\" Exploring the loss landscape near known optima. 
\\n\"\n", - " f\"\\n\"\n", - " f\" {name} \\n\"\n", - " f\" {value} \\n\"\n", - " f\"\"\n", - " )\n", - "\n", - "\n", - "# --- Factory: create fresh env + agent + optimizer + guide per run ---\n", - "DATASET = dict(inputs=[None], infos=[None])\n", - "\n", - "\n", - "def make_basicsearch_run():\n", - " env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n", - " env.reset(seed=42)\n", - " guide = RewardGuide(env)\n", - " agent = ConvexAgent(\"x = [0.0, 0.0]\")\n", - " llm = DummyLLM(ConvexLLMCallable())\n", - " optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n", - " algo = BasicSearchAlgorithm(agent, optimizer)\n", - " return algo, guide\n", - "\n", - "\n", - "def make_beamsearch_run():\n", - " env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n", - " env.reset(seed=42)\n", - " guide = RewardGuide(env)\n", - " agent = ConvexAgent(\"x = [0.0, 0.0]\")\n", - " llm = DummyLLM(ConvexLLMCallable())\n", - " optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n", - " algo = BeamsearchAlgorithm(agent, optimizer)\n", - " return algo, guide\n", - "\n", - "\n", - "def make_priority_search_run():\n", - " env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n", - " env.reset(seed=42)\n", - " guide = RewardGuide(env)\n", - " agent = ConvexAgent(\"x = [0.0, 0.0]\")\n", - " llm = DummyLLM(ConvexLLMCallable())\n", - " optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n", - " algo = PrioritySearch(agent, optimizer)\n", - " return algo, guide\n", - "\n", - "\n", - "# Objective configs\n", - "CONFIG_WEIGHTED = ObjectiveConfig(\n", - " mode=\"weighted\",\n", - " weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n", - " minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n", - " seed=0,\n", - ")\n", - "\n", - "CONFIG_PARETO = ObjectiveConfig(\n", - " mode=\"pareto\",\n", - " weights={\"base_loss\": 0.7, \"reg_loss\": 0.3},\n", - " minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n", - " tie_break=\"weighted\",\n", - " seed=42,\n", - ")\n", - "\n", - "# Results collector\n", - "results = {}\n", - "print(\"Setup complete. 
SixHumpCamel environment + DummyLLM ready.\")" - ] + "source": "# Import SixHumpCamel from Allen's example file\n_examples_dir = os.path.join(_repo_root, 'examples')\nif _examples_dir not in sys.path:\n sys.path.insert(0, _examples_dir)\nfrom multi_objective_convex_fn import SixHumpCamel\n\n\n# --- RewardGuide (defined here with correct copy import) ---\nclass RewardGuide(Guide):\n \"\"\"Multi-objective guide for convex function environments.\n\n get_feedback() -> (float, str): advances real env (training loop).\n get_score_dict() -> Dict[str, float]: uses deepcopy (selection path).\n \"\"\"\n\n def __init__(self, env):\n self.env = env\n\n def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n obs, reward, done, info = self.env.step(str(response))\n feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n return float(reward), feedback\n\n def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n env_copy = copy.deepcopy(self.env)\n obs, reward, done, info = env_copy.step(str(response))\n base_loss = info.get(\"base_loss\")\n reg_loss = info.get(\"reg_loss\")\n if base_loss is None or reg_loss is None:\n base_loss = float(\"inf\")\n reg_loss = float(\"inf\")\n return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n\n\n# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n@trace.model\nclass ConvexAgent:\n def __init__(self, initial_value):\n self.param = trace.node(\n initial_value, trainable=True,\n description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n )\n\n def forward(self, x):\n return self.param\n\n\n# --- DummyLLM callable: proposes x = [float, float] values ---\nclass ConvexLLMCallable:\n \"\"\"Returns cycling proposals spanning the SixHumpCamel landscape.\"\"\"\n\n PROPOSALS = [\n \"x = [0.09, -0.71]\", # very close to optimum 1\n \"x = [-0.09, 0.71]\", # very close to optimum 2\n \"x = [0.1, -0.7]\", # near optimum 1\n \"x = [-0.1, 0.7]\", # near optimum 2\n \"x = [0.5, -0.3]\", # moderate region\n \"x = [-0.5, 0.3]\", # moderate symmetric\n \"x = [0.2, -0.5]\", # exploring\n \"x = [-0.3, 0.6]\", # exploring\n \"x = [1.0, -1.0]\", # far from optima (high reg)\n \"x = [0.0, 0.0]\", # origin (zero loss)\n ]\n\n def __init__(self):\n self.idx = 0\n\n def __call__(self, messages, **kwargs):\n problem = messages[1][\"content\"]\n name = re.findall(r'', problem)\n name = name[0] if name else \"unknown\"\n value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n self.idx += 1\n return (\n f\" Exploring the loss landscape. 
\\n\"\n f\"\\n\"\n f\" {name} \\n\"\n f\" {value} \\n\"\n f\"\"\n )\n\n\n# --- Post-training evaluation: get actual losses from the final parameter ---\ndef evaluate_final_losses(param_value):\n \"\"\"Evaluate a parameter string on a fresh SixHumpCamel env.\n\n Returns dict with base_loss, reg_loss, total_loss (all actual values).\n \"\"\"\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n x, stop = env.text_extract(str(param_value))\n if x is None:\n return {\"base_loss\": float(\"nan\"), \"reg_loss\": float(\"nan\"), \"total_loss\": float(\"nan\")}\n base, reg, total = env._eval_losses(x)\n return {\"base_loss\": float(base), \"reg_loss\": float(reg), \"total_loss\": float(total)}\n\n\n# --- Factory: create fresh env + agent + optimizer + guide per run ---\nDATASET = dict(inputs=[None], infos=[None])\n\n\ndef make_basicsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BasicSearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_beamsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BeamsearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_priority_search_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = PrioritySearch(agent, optimizer)\n return algo, guide, agent\n\n\n# Objective configs\nCONFIG_WEIGHTED = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n seed=0,\n)\n\nCONFIG_PARETO = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"base_loss\": 0.7, \"reg_loss\": 0.3},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n tie_break=\"weighted\",\n seed=42,\n)\n\n# Results collector\nresults = {}\nprint(\"Setup complete. 
SixHumpCamel environment + DummyLLM ready.\")\nprint(f\"DummyLLM has {len(ConvexLLMCallable.PROPOSALS)} diverse proposals.\")" }, { "cell_type": "code", @@ -240,68 +107,7 @@ "id": "cell-basicsearch-runs", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# BasicSearch: scalar, weighted, pareto (M1 baseline for comparison)\n", - "# =====================================================================\n", - "BASIC_KWARGS = dict(\n", - " train_dataset=DATASET,\n", - " num_proposals=2,\n", - " num_epochs=1,\n", - " batch_size=1,\n", - " num_threads=1,\n", - ")\n", - "\n", - "# --- Scalar ---\n", - "print(\"=\" * 60)\n", - "print(\"BasicSearch: SCALAR mode\")\n", - "print(\"=\" * 60)\n", - "algo_bs_scalar, guide_bs_scalar = make_basicsearch_run()\n", - "scores_bs_scalar, test_bs_scalar = algo_bs_scalar.train(\n", - " guide=guide_bs_scalar, objective_config=None, **BASIC_KWARGS\n", - ")\n", - "results['BasicSearch/scalar'] = {\n", - " 'val_scores': scores_bs_scalar,\n", - " 'final_score': test_bs_scalar,\n", - " 'score_dict': getattr(algo_bs_scalar, 'current_score_dict', None),\n", - "}\n", - "print(f\"\\nScalar validation scores: {scores_bs_scalar}\")\n", - "print(f\"Final score: {test_bs_scalar}\")\n", - "\n", - "# --- Weighted ---\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"BasicSearch: WEIGHTED mode\")\n", - "print(\"=\" * 60)\n", - "algo_bs_weighted, guide_bs_weighted = make_basicsearch_run()\n", - "scores_bs_weighted, test_bs_weighted = algo_bs_weighted.train(\n", - " guide=guide_bs_weighted, objective_config=CONFIG_WEIGHTED, **BASIC_KWARGS\n", - ")\n", - "results['BasicSearch/weighted'] = {\n", - " 'val_scores': scores_bs_weighted,\n", - " 'final_score': test_bs_weighted,\n", - " 'score_dict': getattr(algo_bs_weighted, 'current_score_dict', None),\n", - "}\n", - "print(f\"\\nWeighted validation scores: {scores_bs_weighted}\")\n", - "print(f\"Final score: {test_bs_weighted}\")\n", - "print(f\"Score dict: {algo_bs_weighted.current_score_dict}\")\n", - "\n", - "# --- Pareto ---\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"BasicSearch: PARETO mode\")\n", - "print(\"=\" * 60)\n", - "algo_bs_pareto, guide_bs_pareto = make_basicsearch_run()\n", - "scores_bs_pareto, test_bs_pareto = algo_bs_pareto.train(\n", - " guide=guide_bs_pareto, objective_config=CONFIG_PARETO, **BASIC_KWARGS\n", - ")\n", - "results['BasicSearch/pareto'] = {\n", - " 'val_scores': scores_bs_pareto,\n", - " 'final_score': test_bs_pareto,\n", - " 'score_dict': getattr(algo_bs_pareto, 'current_score_dict', None),\n", - "}\n", - "print(f\"\\nPareto validation scores: {scores_bs_pareto}\")\n", - "print(f\"Final score: {test_bs_pareto}\")\n", - "print(f\"Score dict: {algo_bs_pareto.current_score_dict}\")" - ] + "source": "# =====================================================================\n# BasicSearch: scalar, weighted, pareto (M1 baseline for comparison)\n# num_epochs=5 gives 5 training steps (1-item dataset, batch_size=1)\n# =====================================================================\nBASIC_KWARGS = dict(\n train_dataset=DATASET,\n num_proposals=4,\n num_epochs=5,\n batch_size=1,\n num_threads=1,\n)\n\n# --- Scalar ---\nprint(\"=\" * 60)\nprint(\"BasicSearch: SCALAR mode (5 epochs)\")\nprint(\"=\" * 60)\nalgo_bs_scalar, guide_bs_scalar, agent_bs_scalar = make_basicsearch_run()\nscores_bs_scalar, test_bs_scalar = algo_bs_scalar.train(\n guide=guide_bs_scalar, objective_config=None, **BASIC_KWARGS\n)\neval_scalar = 
evaluate_final_losses(agent_bs_scalar.param.data)\nresults['BasicSearch/scalar'] = {\n 'val_scores': scores_bs_scalar,\n 'final_score': test_bs_scalar,\n 'eval_losses': eval_scalar,\n 'final_param': str(agent_bs_scalar.param.data),\n}\nprint(f\"\\nFinal param: {agent_bs_scalar.param.data}\")\nprint(f\"Validation scores ({len(scores_bs_scalar)} steps): {scores_bs_scalar}\")\nprint(f\"Evaluated losses: {eval_scalar}\")\n\n# --- Weighted ---\nprint(\"\\n\" + \"=\" * 60)\nprint(\"BasicSearch: WEIGHTED mode (5 epochs)\")\nprint(\"=\" * 60)\nalgo_bs_weighted, guide_bs_weighted, agent_bs_weighted = make_basicsearch_run()\nscores_bs_weighted, test_bs_weighted = algo_bs_weighted.train(\n guide=guide_bs_weighted, objective_config=CONFIG_WEIGHTED, **BASIC_KWARGS\n)\neval_weighted = evaluate_final_losses(agent_bs_weighted.param.data)\nresults['BasicSearch/weighted'] = {\n 'val_scores': scores_bs_weighted,\n 'final_score': test_bs_weighted,\n 'eval_losses': eval_weighted,\n 'final_param': str(agent_bs_weighted.param.data),\n}\nprint(f\"\\nFinal param: {agent_bs_weighted.param.data}\")\nprint(f\"Validation scores ({len(scores_bs_weighted)} steps): {scores_bs_weighted}\")\nprint(f\"Evaluated losses: {eval_weighted}\")\n\n# --- Pareto ---\nprint(\"\\n\" + \"=\" * 60)\nprint(\"BasicSearch: PARETO mode (5 epochs)\")\nprint(\"=\" * 60)\nalgo_bs_pareto, guide_bs_pareto, agent_bs_pareto = make_basicsearch_run()\nscores_bs_pareto, test_bs_pareto = algo_bs_pareto.train(\n guide=guide_bs_pareto, objective_config=CONFIG_PARETO, **BASIC_KWARGS\n)\neval_pareto = evaluate_final_losses(agent_bs_pareto.param.data)\nresults['BasicSearch/pareto'] = {\n 'val_scores': scores_bs_pareto,\n 'final_score': test_bs_pareto,\n 'eval_losses': eval_pareto,\n 'final_param': str(agent_bs_pareto.param.data),\n}\nprint(f\"\\nFinal param: {agent_bs_pareto.param.data}\")\nprint(f\"Validation scores ({len(scores_bs_pareto)} steps): {scores_bs_pareto}\")\nprint(f\"Evaluated losses: {eval_pareto}\")" }, { "cell_type": "code", @@ -309,34 +115,7 @@ "id": "cell-score-progression", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# Graph 1: Score Progression — BasicSearch across modes\n", - "# =====================================================================\n", - "fig, ax = plt.subplots(1, 1, figsize=(8, 5))\n", - "\n", - "for label, marker, color in [\n", - " ('BasicSearch/scalar', 'o', '#1f77b4'),\n", - " ('BasicSearch/weighted', 's', '#ff7f0e'),\n", - " ('BasicSearch/pareto', '^', '#2ca02c'),\n", - "]:\n", - " data = results.get(label, {})\n", - " val_scores = data.get('val_scores', [])\n", - " if val_scores:\n", - " ax.plot(range(len(val_scores)), val_scores,\n", - " marker=marker, color=color, label=label, linewidth=2, markersize=6)\n", - "\n", - "ax.set_xlabel('Step', fontsize=12)\n", - "ax.set_ylabel('Validation Score (scalar)', fontsize=12)\n", - "ax.set_title('BasicSearch Score Progression — SixHumpCamel', fontsize=14)\n", - "ax.legend(fontsize=10)\n", - "ax.grid(True, alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "print(\"Graph 1: Score progression for BasicSearch across all three objective modes.\")\n", - "print(\"With DummyLLM, scores depend on the cycling proposals near optima.\")" - ] + "source": "# =====================================================================\n# Graph 1: Score Progression — BasicSearch validation scores across modes\n# With 5 epochs, each mode produces 5 data points showing optimization\n# 
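A note on the evaluation pattern in the cell above: `SixHumpCamel.step()` advances episode state, so candidate scoring must never run through the live training env. The sketch below restates the side-effect-free pattern used by both `RewardGuide.get_score_dict()` and `evaluate_final_losses()`; it assumes only the `step()` signature shown earlier, and `peek_score` is a hypothetical name:

```python
import copy

def peek_score(env, response):
    # Score a candidate response without consuming the live env's horizon:
    # deepcopy first, then step the copy and discard it.
    env_copy = copy.deepcopy(env)
    _, reward, _, info = env_copy.step(str(response))
    return float(reward), info
```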
=====================================================================\nfig, ax = plt.subplots(1, 1, figsize=(9, 5))\n\nfor label, marker, color in [\n ('BasicSearch/scalar', 'o-', '#1f77b4'),\n ('BasicSearch/weighted', 's-', '#ff7f0e'),\n ('BasicSearch/pareto', '^-', '#2ca02c'),\n]:\n data = results.get(label, {})\n val_scores = data.get('val_scores', [])\n if val_scores:\n steps = list(range(len(val_scores)))\n ax.plot(steps, val_scores, marker, color=color, label=label,\n linewidth=2, markersize=7)\n\nax.set_xlabel('Training Step', fontsize=12)\nax.set_ylabel('Validation Score (reward)', fontsize=12)\nax.set_title('BasicSearch Score Progression — SixHumpCamel (5 epochs)', fontsize=14)\nax.legend(fontsize=10)\nax.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.show()\n\nprint(\"Score progression across 5 training epochs.\")\nprint(\"Higher reward = lower total loss (reward = -total_loss).\")\nprint(\"Different objective modes may select different proposals, producing different curves.\")" }, { "cell_type": "markdown", @@ -359,41 +138,7 @@ "id": "cell-beamsearch-run", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# BeamsearchAlgorithm: weighted mode\n", - "# =====================================================================\n", - "print(\"=\" * 60)\n", - "print(\"Beamsearch: WEIGHTED mode\")\n", - "print(\"=\" * 60)\n", - "\n", - "algo_beam_w, guide_beam_w = make_beamsearch_run()\n", - "metrics_beam_w, final_beam_w = algo_beam_w.train(\n", - " guide=guide_beam_w,\n", - " train_dataset=DATASET,\n", - " objective_config=CONFIG_WEIGHTED,\n", - " beam_width=2,\n", - " num_proposals=2,\n", - " max_depth=1,\n", - " batch_size=1,\n", - " num_threads=1,\n", - ")\n", - "\n", - "# Extract results\n", - "beam_score_dicts = getattr(algo_beam_w, '_last_selected_score_dicts', None)\n", - "beam_best_sd = beam_score_dicts[0] if beam_score_dicts else None\n", - "\n", - "results['Beamsearch/weighted'] = {\n", - " 'val_scores': metrics_beam_w.get('best_validation_scores', []),\n", - " 'final_score': final_beam_w,\n", - " 'score_dict': beam_best_sd,\n", - "}\n", - "\n", - "print(f\"\\nValidation scores by depth: {metrics_beam_w.get('best_validation_scores', [])}\")\n", - "print(f\"Final test score: {final_beam_w}\")\n", - "print(f\"_last_selected_score_dicts: {beam_score_dicts}\")\n", - "print(f\"Best candidate score_dict: {beam_best_sd}\")" - ] + "source": "# =====================================================================\n# BeamsearchAlgorithm: weighted mode\n# max_depth=2 gives 2 beam search levels with validation at each\n# =====================================================================\nprint(\"=\" * 60)\nprint(\"Beamsearch: WEIGHTED mode (depth=2, width=3)\")\nprint(\"=\" * 60)\n\nalgo_beam_w, guide_beam_w, agent_beam_w = make_beamsearch_run()\nmetrics_beam_w, final_beam_w = algo_beam_w.train(\n guide=guide_beam_w,\n train_dataset=DATASET,\n objective_config=CONFIG_WEIGHTED,\n beam_width=3,\n num_proposals=3,\n max_depth=2,\n batch_size=1,\n num_threads=1,\n)\n\n# Post-training evaluation on fresh env\neval_beam_w = evaluate_final_losses(agent_beam_w.param.data)\n\n# Extract score_dicts from selection\nbeam_score_dicts = getattr(algo_beam_w, '_last_selected_score_dicts', None)\nbeam_best_sd = beam_score_dicts[0] if beam_score_dicts else None\n\nresults['Beamsearch/weighted'] = {\n 'val_scores': metrics_beam_w.get('best_validation_scores', []),\n 'final_score': final_beam_w,\n 'eval_losses': eval_beam_w,\n 
'final_param': str(agent_beam_w.param.data),\n}\n\nprint(f\"\\nFinal param: {agent_beam_w.param.data}\")\nprint(f\"Validation scores by depth: {metrics_beam_w.get('best_validation_scores', [])}\")\nprint(f\"_last_selected_score_dicts: {beam_score_dicts}\")\nprint(f\"Evaluated losses: {eval_beam_w}\")" }, { "cell_type": "markdown", @@ -417,53 +162,7 @@ "id": "cell-priority-weighted", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# PrioritySearch: weighted mode\n", - "# =====================================================================\n", - "print(\"=\" * 60)\n", - "print(\"PrioritySearch: WEIGHTED mode\")\n", - "print(\"=\" * 60)\n", - "\n", - "algo_ps_w, guide_ps_w = make_priority_search_run()\n", - "algo_ps_w.train(\n", - " guide=guide_ps_w,\n", - " train_dataset=DATASET,\n", - " objective_config=CONFIG_WEIGHTED,\n", - " batch_size=1,\n", - " num_batches=1,\n", - " num_epochs=1,\n", - " num_candidates=2,\n", - " num_proposals=1,\n", - " num_threads=1,\n", - " long_term_memory_size=5,\n", - " memory_update_frequency=0,\n", - " verbose=False,\n", - ")\n", - "\n", - "# Extract best candidate from memory\n", - "ps_w_sd = None\n", - "ps_w_final = None\n", - "if hasattr(algo_ps_w, 'long_term_memory') and algo_ps_w.long_term_memory:\n", - " best_neg, best_cand = min(algo_ps_w.long_term_memory, key=lambda x: x[0])\n", - " ps_w_sd = best_cand.mean_score_dict()\n", - " ps_w_final = float(-best_neg)\n", - " print(f\"\\nBest candidate priority: {ps_w_final:.4f}\")\n", - " print(f\"Best candidate mean_score_dict: {ps_w_sd}\")\n", - " print(f\"Number of rollouts: {len(best_cand.rollouts)}\")\n", - " # Check that rollouts have score_dict\n", - " has_sd = any('score_dict' in r and r['score_dict'] is not None\n", - " for r in best_cand.rollouts)\n", - " print(f\"Rollouts contain score_dict: {has_sd}\")\n", - "else:\n", - " print(\"No candidates in long_term_memory\")\n", - "\n", - "results['PrioritySearch/weighted'] = {\n", - " 'val_scores': [], # PrioritySearch doesn't return per-step scores\n", - " 'final_score': ps_w_final,\n", - " 'score_dict': ps_w_sd,\n", - "}" - ] + "source": "# =====================================================================\n# PrioritySearch: weighted mode (more epochs for real exploration)\n# =====================================================================\nprint(\"=\" * 60)\nprint(\"PrioritySearch: WEIGHTED mode (2 epochs, 2 batches)\")\nprint(\"=\" * 60)\n\nalgo_ps_w, guide_ps_w, agent_ps_w = make_priority_search_run()\nalgo_ps_w.train(\n guide=guide_ps_w,\n train_dataset=DATASET,\n objective_config=CONFIG_WEIGHTED,\n batch_size=1,\n num_batches=2,\n num_epochs=2,\n num_candidates=3,\n num_proposals=2,\n num_threads=1,\n long_term_memory_size=10,\n memory_update_frequency=0,\n verbose=False,\n)\n\n# Post-training evaluation on fresh env\neval_ps_w = evaluate_final_losses(agent_ps_w.param.data)\n\n# Extract best candidate from memory\nps_w_sd = None\nps_w_final = None\nif hasattr(algo_ps_w, 'long_term_memory') and algo_ps_w.long_term_memory:\n best_neg, best_cand = min(algo_ps_w.long_term_memory, key=lambda x: x[0])\n ps_w_sd = best_cand.mean_score_dict()\n ps_w_final = float(-best_neg)\n print(f\"\\nBest candidate priority: {ps_w_final:.4f}\")\n print(f\"Best candidate mean_score_dict: {ps_w_sd}\")\n has_sd = any('score_dict' in r and r['score_dict'] is not None\n for r in best_cand.rollouts)\n print(f\"Rollouts contain score_dict: {has_sd}\")\nelse:\n print(\"No candidates in 
long_term_memory\")\n\nprint(f\"Final param: {agent_ps_w.param.data}\")\nprint(f\"Evaluated losses: {eval_ps_w}\")\n\nresults['PrioritySearch/weighted'] = {\n 'val_scores': [],\n 'final_score': ps_w_final,\n 'eval_losses': eval_ps_w,\n 'final_param': str(agent_ps_w.param.data),\n}" }, { "cell_type": "code", @@ -471,62 +170,7 @@ "id": "cell-priority-pareto", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# PrioritySearch: Pareto mode (uses ParetoHeapMemory)\n", - "# =====================================================================\n", - "print(\"=\" * 60)\n", - "print(\"PrioritySearch: PARETO mode\")\n", - "print(\"=\" * 60)\n", - "\n", - "algo_ps_p, guide_ps_p = make_priority_search_run()\n", - "algo_ps_p.train(\n", - " guide=guide_ps_p,\n", - " train_dataset=DATASET,\n", - " objective_config=CONFIG_PARETO,\n", - " batch_size=1,\n", - " num_batches=1,\n", - " num_epochs=1,\n", - " num_candidates=2,\n", - " num_proposals=1,\n", - " num_threads=1,\n", - " long_term_memory_size=5,\n", - " memory_update_frequency=0,\n", - " verbose=False,\n", - ")\n", - "\n", - "# Extract best candidate from memory\n", - "ps_p_sd = None\n", - "ps_p_final = None\n", - "if hasattr(algo_ps_p, 'long_term_memory') and algo_ps_p.long_term_memory:\n", - " # Check the memory type (should be ParetoHeapMemory)\n", - " mem_type = type(algo_ps_p.long_term_memory).__name__\n", - " print(f\"Memory type: {mem_type}\")\n", - "\n", - " best_neg_p, best_cand_p = min(algo_ps_p.long_term_memory, key=lambda x: x[0])\n", - " ps_p_sd = best_cand_p.mean_score_dict()\n", - " ps_p_final = float(-best_neg_p)\n", - " print(f\"\\nBest candidate priority: {ps_p_final:.4f}\")\n", - " print(f\"Best candidate mean_score_dict: {ps_p_sd}\")\n", - " print(f\"Number of rollouts: {len(best_cand_p.rollouts)}\")\n", - " has_sd_p = any('score_dict' in r and r['score_dict'] is not None\n", - " for r in best_cand_p.rollouts)\n", - " print(f\"Rollouts contain score_dict: {has_sd_p}\")\n", - "\n", - " # Show all candidates in memory\n", - " print(f\"\\nAll candidates in memory ({len(algo_ps_p.long_term_memory)}):\")\n", - " for neg_p, cand in sorted(algo_ps_p.long_term_memory, key=lambda x: x[0]):\n", - " sd = cand.mean_score_dict()\n", - " print(f\" priority={-neg_p:.4f}, score_dict={sd}\")\n", - "else:\n", - " print(\"No candidates in long_term_memory\")\n", - "\n", - "results['PrioritySearch/pareto'] = {\n", - " 'val_scores': [],\n", - " 'final_score': ps_p_final,\n", - " 'score_dict': ps_p_sd,\n", - "}" - ] + "source": "# =====================================================================\n# PrioritySearch: Pareto mode (uses ParetoHeapMemory)\n# =====================================================================\nprint(\"=\" * 60)\nprint(\"PrioritySearch: PARETO mode (2 epochs, 2 batches)\")\nprint(\"=\" * 60)\n\nalgo_ps_p, guide_ps_p, agent_ps_p = make_priority_search_run()\nalgo_ps_p.train(\n guide=guide_ps_p,\n train_dataset=DATASET,\n objective_config=CONFIG_PARETO,\n batch_size=1,\n num_batches=2,\n num_epochs=2,\n num_candidates=3,\n num_proposals=2,\n num_threads=1,\n long_term_memory_size=10,\n memory_update_frequency=0,\n verbose=False,\n)\n\n# Post-training evaluation on fresh env\neval_ps_p = evaluate_final_losses(agent_ps_p.param.data)\n\n# Extract best candidate from memory\nps_p_sd = None\nps_p_final = None\nif hasattr(algo_ps_p, 'long_term_memory') and algo_ps_p.long_term_memory:\n mem_type = type(algo_ps_p.long_term_memory).__name__\n 
print(f\"Memory type: {mem_type}\")\n\n best_neg_p, best_cand_p = min(algo_ps_p.long_term_memory, key=lambda x: x[0])\n ps_p_sd = best_cand_p.mean_score_dict()\n ps_p_final = float(-best_neg_p)\n print(f\"\\nBest candidate priority: {ps_p_final:.4f}\")\n print(f\"Best candidate mean_score_dict: {ps_p_sd}\")\n has_sd_p = any('score_dict' in r and r['score_dict'] is not None\n for r in best_cand_p.rollouts)\n print(f\"Rollouts contain score_dict: {has_sd_p}\")\n\n print(f\"\\nAll candidates in memory ({len(algo_ps_p.long_term_memory)}):\")\n for neg_p, cand in sorted(algo_ps_p.long_term_memory, key=lambda x: x[0]):\n sd = cand.mean_score_dict()\n print(f\" priority={-neg_p:.4f}, score_dict={sd}\")\nelse:\n print(\"No candidates in long_term_memory\")\n\nprint(f\"Final param: {agent_ps_p.param.data}\")\nprint(f\"Evaluated losses: {eval_ps_p}\")\n\nresults['PrioritySearch/pareto'] = {\n 'val_scores': [],\n 'final_score': ps_p_final,\n 'eval_losses': eval_ps_p,\n 'final_param': str(agent_ps_p.param.data),\n}" }, { "cell_type": "code", @@ -534,47 +178,7 @@ "id": "cell-scatter", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# Graph 2: Comparison Scatter — base_loss vs reg_loss\n", - "# =====================================================================\n", - "fig, ax = plt.subplots(1, 1, figsize=(8, 6))\n", - "\n", - "markers = {\n", - " 'BasicSearch': 'o',\n", - " 'Beamsearch': 's',\n", - " 'PrioritySearch': '^',\n", - "}\n", - "colors = {\n", - " 'scalar': '#1f77b4',\n", - " 'weighted': '#ff7f0e',\n", - " 'pareto': '#2ca02c',\n", - "}\n", - "\n", - "for run_name, run_data in results.items():\n", - " sd = run_data.get('score_dict')\n", - " if sd is None or 'base_loss' not in sd or 'reg_loss' not in sd:\n", - " continue\n", - " algo_name, mode_name = run_name.split('/')\n", - " ax.scatter(\n", - " sd['base_loss'], sd['reg_loss'],\n", - " marker=markers.get(algo_name, 'x'),\n", - " color=colors.get(mode_name, 'gray'),\n", - " s=120, edgecolors='black', linewidths=0.8,\n", - " label=run_name, zorder=5,\n", - " )\n", - "\n", - "ax.set_xlabel('base_loss (lower is better)', fontsize=12)\n", - "ax.set_ylabel('reg_loss (lower is better)', fontsize=12)\n", - "ax.set_title('Multi-Objective Comparison — base_loss vs reg_loss', fontsize=14)\n", - "ax.legend(fontsize=9, loc='upper right')\n", - "ax.grid(True, alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "print(\"Graph 2: Each point represents the best candidate's score_dict from a training run.\")\n", - "print(\"Ideal candidates are in the bottom-left (low base_loss AND low reg_loss).\")" - ] + "source": "# =====================================================================\n# Graph 2: Comparison Scatter — base_loss vs reg_loss\n# Each point = final parameter evaluated on a fresh SixHumpCamel env\n# =====================================================================\nfig, ax = plt.subplots(1, 1, figsize=(8, 6))\n\nmarkers = {\n 'BasicSearch': 'o',\n 'Beamsearch': 's',\n 'PrioritySearch': '^',\n}\ncolors = {\n 'scalar': '#1f77b4',\n 'weighted': '#ff7f0e',\n 'pareto': '#2ca02c',\n}\n\nfor run_name, run_data in results.items():\n el = run_data.get('eval_losses')\n if el is None or 'base_loss' not in el or 'reg_loss' not in el:\n continue\n # Skip NaN entries (e.g. 
if text_extract failed)\n if np.isnan(el['base_loss']) or np.isnan(el['reg_loss']):\n continue\n algo_name, mode_name = run_name.split('/')\n ax.scatter(\n el['base_loss'], el['reg_loss'],\n marker=markers.get(algo_name, 'x'),\n color=colors.get(mode_name, 'gray'),\n s=120, edgecolors='black', linewidths=0.8,\n label=run_name, zorder=5,\n )\n\nax.set_xlabel('base_loss (lower is better)', fontsize=12)\nax.set_ylabel('reg_loss (lower is better)', fontsize=12)\nax.set_title('Multi-Objective Comparison — base_loss vs reg_loss', fontsize=14)\nax.legend(fontsize=9, loc='upper right')\nax.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.show()\n\nprint(\"Graph 2: Each point represents the final parameter evaluated on a fresh SixHumpCamel env.\")\nprint(\"Ideal candidates are in the bottom-left (low base_loss AND low reg_loss).\")" }, { "cell_type": "code", @@ -582,51 +186,7 @@ "id": "cell-summary", "metadata": {}, "outputs": [], - "source": [ - "# =====================================================================\n", - "# Summary Table\n", - "# =====================================================================\n", - "rows = []\n", - "for run_name, run_data in results.items():\n", - " algo_name, mode_name = run_name.split('/')\n", - " sd = run_data.get('score_dict')\n", - " rows.append({\n", - " 'Algorithm': algo_name,\n", - " 'Mode': mode_name,\n", - " 'Final Scalar Score': f\"{run_data.get('final_score', 'N/A')}\",\n", - " 'base_loss': f\"{sd['base_loss']:.4f}\" if sd and 'base_loss' in sd else 'N/A',\n", - " 'reg_loss': f\"{sd['reg_loss']:.4f}\" if sd and 'reg_loss' in sd else 'N/A',\n", - " })\n", - "\n", - "df = pd.DataFrame(rows)\n", - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"SUMMARY: Multi-Objective Training Results\")\n", - "print(\"=\" * 70)\n", - "print(df.to_string(index=False))\n", - "\n", - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"M2 NOTEBOOK COMPLETE\")\n", - "print(\"=\" * 70)\n", - "print(\"\"\"\n", - "Deliverables verified:\n", - " Part A (BasicSearch): scalar, weighted, Pareto modes on SixHumpCamel\n", - " - Backward compatible (objective_config=None)\n", - " - Weighted mode populates current_score_dict\n", - " - Pareto mode selects from non-dominated front\n", - "\n", - " Part B (BeamsearchAlgorithm): weighted mode with vector select()\n", - " - evaluate_vector() computes per-metric scores for beam candidates\n", - " - select_top_k() ranks candidates via ObjectiveConfig\n", - " - _last_selected_score_dicts populated for per-metric logging\n", - "\n", - " Part C (PrioritySearch): weighted + Pareto modes\n", - " - validate() populates score_dict in rollouts\n", - " - mean_score_dict() aggregates per-metric means\n", - " - compute_exploration_priority() uses weighted scalarization\n", - " - ParetoHeapMemory used in Pareto mode\n", - " - Rollouts contain score_dict entries\n", - "\"\"\")" - ] + "source": "# =====================================================================\n# Summary Table\n# =====================================================================\nrows = []\nfor run_name, run_data in results.items():\n algo_name, mode_name = run_name.split('/')\n el = run_data.get('eval_losses')\n rows.append({\n 'Algorithm': algo_name,\n 'Mode': mode_name,\n 'Final Scalar Score': f\"{run_data.get('final_score', 'N/A')}\",\n 'base_loss': f\"{el['base_loss']:.4f}\" if el and 'base_loss' in el and not np.isnan(el['base_loss']) else 'N/A',\n 'reg_loss': f\"{el['reg_loss']:.4f}\" if el and 'reg_loss' in el and not np.isnan(el['reg_loss']) else 'N/A',\n 'total_loss': 
f\"{el['total_loss']:.4f}\" if el and 'total_loss' in el and not np.isnan(el['total_loss']) else 'N/A',\n 'Final Param': run_data.get('final_param', 'N/A'),\n })\n\ndf = pd.DataFrame(rows)\nprint(\"\\n\" + \"=\" * 70)\nprint(\"SUMMARY: Multi-Objective Training Results\")\nprint(\"=\" * 70)\nprint(df.to_string(index=False))\n\nprint(\"\\n\" + \"=\" * 70)\nprint(\"M2 NOTEBOOK COMPLETE\")\nprint(\"=\" * 70)\nprint(\"\"\"\nDeliverables verified:\n Part A (BasicSearch): scalar, weighted, Pareto modes on SixHumpCamel\n - Backward compatible (objective_config=None)\n - Weighted mode populates current_score_dict\n - Pareto mode selects from non-dominated front\n\n Part B (BeamsearchAlgorithm): weighted mode with vector select()\n - evaluate_vector() computes per-metric scores for beam candidates\n - select_top_k() ranks candidates via ObjectiveConfig\n - _last_selected_score_dicts populated for per-metric logging\n\n Part C (PrioritySearch): weighted + Pareto modes\n - validate() populates score_dict in rollouts\n - mean_score_dict() aggregates per-metric means\n - compute_exploration_priority() uses weighted scalarization\n - ParetoHeapMemory used in Pareto mode\n - Rollouts contain score_dict entries\n\"\"\")" } ], "metadata": { From 0a79496a6458adee8c70a53167223b4ad201851c Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Wed, 18 Feb 2026 21:49:50 -0400 Subject: [PATCH 14/20] fix: move DATASET definition to imports cell for reliability --- examples/notebooks/t6_m2_trainers.ipynb | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/examples/notebooks/t6_m2_trainers.ipynb b/examples/notebooks/t6_m2_trainers.ipynb index c9082a1d..ba76e84f 100644 --- a/examples/notebooks/t6_m2_trainers.ipynb +++ b/examples/notebooks/t6_m2_trainers.ipynb @@ -53,27 +53,7 @@ "id": "cell-imports", "metadata": {}, "outputs": [], - "source": [ - "import re\n", - "import copy\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "from typing import Tuple, Dict\n", - "\n", - "from opto import trace\n", - "from opto.trainer.guide import Guide\n", - "from opto.trainer.objectives import ObjectiveConfig\n", - "from opto.utils.llm import DummyLLM\n", - "from opto.optimizers import OptoPrimeV2\n", - "from opto.trainer.algorithms.basic_algorithms import BasicSearchAlgorithm\n", - "from opto.trainer.algorithms.beamsearch_algorithm import BeamsearchAlgorithm\n", - "from opto.features.priority_search.priority_search import PrioritySearch\n", - "\n", - "print(\"=\" * 70)\n", - "print(\"T6 M2 — BeamsearchAlgorithm & PrioritySearch Multi-Objective\")\n", - "print(\"=\" * 70)" - ] + "source": "import re\nimport copy\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\nfrom typing import Tuple, Dict\n\nfrom opto import trace\nfrom opto.trainer.guide import Guide\nfrom opto.trainer.objectives import ObjectiveConfig\nfrom opto.utils.llm import DummyLLM\nfrom opto.optimizers import OptoPrimeV2\nfrom opto.trainer.algorithms.basic_algorithms import BasicSearchAlgorithm\nfrom opto.trainer.algorithms.beamsearch_algorithm import BeamsearchAlgorithm\nfrom opto.features.priority_search.priority_search import PrioritySearch\n\n# Single-item dataset used by all algorithms (SixHumpCamel ignores inputs/infos)\nDATASET = dict(inputs=[None], infos=[None])\n\nprint(\"=\" * 70)\nprint(\"T6 M2 — BeamsearchAlgorithm & PrioritySearch Multi-Objective\")\nprint(\"=\" * 70)" }, { "cell_type": "markdown", @@ -99,7 +79,7 @@ "id": "cell-setup-env", 
"metadata": {}, "outputs": [], - "source": "# Import SixHumpCamel from Allen's example file\n_examples_dir = os.path.join(_repo_root, 'examples')\nif _examples_dir not in sys.path:\n sys.path.insert(0, _examples_dir)\nfrom multi_objective_convex_fn import SixHumpCamel\n\n\n# --- RewardGuide (defined here with correct copy import) ---\nclass RewardGuide(Guide):\n \"\"\"Multi-objective guide for convex function environments.\n\n get_feedback() -> (float, str): advances real env (training loop).\n get_score_dict() -> Dict[str, float]: uses deepcopy (selection path).\n \"\"\"\n\n def __init__(self, env):\n self.env = env\n\n def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n obs, reward, done, info = self.env.step(str(response))\n feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n return float(reward), feedback\n\n def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n env_copy = copy.deepcopy(self.env)\n obs, reward, done, info = env_copy.step(str(response))\n base_loss = info.get(\"base_loss\")\n reg_loss = info.get(\"reg_loss\")\n if base_loss is None or reg_loss is None:\n base_loss = float(\"inf\")\n reg_loss = float(\"inf\")\n return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n\n\n# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n@trace.model\nclass ConvexAgent:\n def __init__(self, initial_value):\n self.param = trace.node(\n initial_value, trainable=True,\n description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n )\n\n def forward(self, x):\n return self.param\n\n\n# --- DummyLLM callable: proposes x = [float, float] values ---\nclass ConvexLLMCallable:\n \"\"\"Returns cycling proposals spanning the SixHumpCamel landscape.\"\"\"\n\n PROPOSALS = [\n \"x = [0.09, -0.71]\", # very close to optimum 1\n \"x = [-0.09, 0.71]\", # very close to optimum 2\n \"x = [0.1, -0.7]\", # near optimum 1\n \"x = [-0.1, 0.7]\", # near optimum 2\n \"x = [0.5, -0.3]\", # moderate region\n \"x = [-0.5, 0.3]\", # moderate symmetric\n \"x = [0.2, -0.5]\", # exploring\n \"x = [-0.3, 0.6]\", # exploring\n \"x = [1.0, -1.0]\", # far from optima (high reg)\n \"x = [0.0, 0.0]\", # origin (zero loss)\n ]\n\n def __init__(self):\n self.idx = 0\n\n def __call__(self, messages, **kwargs):\n problem = messages[1][\"content\"]\n name = re.findall(r'', problem)\n name = name[0] if name else \"unknown\"\n value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n self.idx += 1\n return (\n f\" Exploring the loss landscape. 
\\n\"\n f\"\\n\"\n f\" {name} \\n\"\n f\" {value} \\n\"\n f\"\"\n )\n\n\n# --- Post-training evaluation: get actual losses from the final parameter ---\ndef evaluate_final_losses(param_value):\n \"\"\"Evaluate a parameter string on a fresh SixHumpCamel env.\n\n Returns dict with base_loss, reg_loss, total_loss (all actual values).\n \"\"\"\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n x, stop = env.text_extract(str(param_value))\n if x is None:\n return {\"base_loss\": float(\"nan\"), \"reg_loss\": float(\"nan\"), \"total_loss\": float(\"nan\")}\n base, reg, total = env._eval_losses(x)\n return {\"base_loss\": float(base), \"reg_loss\": float(reg), \"total_loss\": float(total)}\n\n\n# --- Factory: create fresh env + agent + optimizer + guide per run ---\nDATASET = dict(inputs=[None], infos=[None])\n\n\ndef make_basicsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BasicSearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_beamsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BeamsearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_priority_search_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = PrioritySearch(agent, optimizer)\n return algo, guide, agent\n\n\n# Objective configs\nCONFIG_WEIGHTED = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n seed=0,\n)\n\nCONFIG_PARETO = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"base_loss\": 0.7, \"reg_loss\": 0.3},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n tie_break=\"weighted\",\n seed=42,\n)\n\n# Results collector\nresults = {}\nprint(\"Setup complete. 
SixHumpCamel environment + DummyLLM ready.\")\nprint(f\"DummyLLM has {len(ConvexLLMCallable.PROPOSALS)} diverse proposals.\")" + "source": "# Import SixHumpCamel from Allen's example file\n_examples_dir = os.path.join(_repo_root, 'examples')\nif _examples_dir not in sys.path:\n sys.path.insert(0, _examples_dir)\nfrom multi_objective_convex_fn import SixHumpCamel\n\n\n# --- RewardGuide (defined here with correct copy import) ---\nclass RewardGuide(Guide):\n \"\"\"Multi-objective guide for convex function environments.\n\n get_feedback() -> (float, str): advances real env (training loop).\n get_score_dict() -> Dict[str, float]: uses deepcopy (selection path).\n \"\"\"\n\n def __init__(self, env):\n self.env = env\n\n def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n obs, reward, done, info = self.env.step(str(response))\n feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n return float(reward), feedback\n\n def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n env_copy = copy.deepcopy(self.env)\n obs, reward, done, info = env_copy.step(str(response))\n base_loss = info.get(\"base_loss\")\n reg_loss = info.get(\"reg_loss\")\n if base_loss is None or reg_loss is None:\n base_loss = float(\"inf\")\n reg_loss = float(\"inf\")\n return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n\n\n# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n@trace.model\nclass ConvexAgent:\n def __init__(self, initial_value):\n self.param = trace.node(\n initial_value, trainable=True,\n description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n )\n\n def forward(self, x):\n return self.param\n\n\n# --- DummyLLM callable: proposes x = [float, float] values ---\nclass ConvexLLMCallable:\n \"\"\"Returns cycling proposals spanning the SixHumpCamel landscape.\"\"\"\n\n PROPOSALS = [\n \"x = [0.09, -0.71]\", # very close to optimum 1\n \"x = [-0.09, 0.71]\", # very close to optimum 2\n \"x = [0.1, -0.7]\", # near optimum 1\n \"x = [-0.1, 0.7]\", # near optimum 2\n \"x = [0.5, -0.3]\", # moderate region\n \"x = [-0.5, 0.3]\", # moderate symmetric\n \"x = [0.2, -0.5]\", # exploring\n \"x = [-0.3, 0.6]\", # exploring\n \"x = [1.0, -1.0]\", # far from optima (high reg)\n \"x = [0.0, 0.0]\", # origin (zero loss)\n ]\n\n def __init__(self):\n self.idx = 0\n\n def __call__(self, messages, **kwargs):\n problem = messages[1][\"content\"]\n name = re.findall(r'', problem)\n name = name[0] if name else \"unknown\"\n value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n self.idx += 1\n return (\n f\" Exploring the loss landscape. 
\\n\"\n f\"\\n\"\n f\" {name} \\n\"\n f\" {value} \\n\"\n f\"\"\n )\n\n\n# --- Post-training evaluation: get actual losses from the final parameter ---\ndef evaluate_final_losses(param_value):\n \"\"\"Evaluate a parameter string on a fresh SixHumpCamel env.\n\n Returns dict with base_loss, reg_loss, total_loss (all actual values).\n \"\"\"\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n x, stop = env.text_extract(str(param_value))\n if x is None:\n return {\"base_loss\": float(\"nan\"), \"reg_loss\": float(\"nan\"), \"total_loss\": float(\"nan\")}\n base, reg, total = env._eval_losses(x)\n return {\"base_loss\": float(base), \"reg_loss\": float(reg), \"total_loss\": float(total)}\n\n\n# --- Factory: create fresh env + agent + optimizer + guide per run ---\n\ndef make_basicsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BasicSearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_beamsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BeamsearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_priority_search_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = PrioritySearch(agent, optimizer)\n return algo, guide, agent\n\n\n# Objective configs\nCONFIG_WEIGHTED = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n seed=0,\n)\n\nCONFIG_PARETO = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"base_loss\": 0.7, \"reg_loss\": 0.3},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n tie_break=\"weighted\",\n seed=42,\n)\n\n# Results collector\nresults = {}\nprint(\"Setup complete. 
SixHumpCamel environment + DummyLLM ready.\")\nprint(f\"DummyLLM has {len(ConvexLLMCallable.PROPOSALS)} diverse proposals.\")" }, { "cell_type": "code", From ca1349b0591ef57bb2ddccde844a0cd3e912017c Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Thu, 19 Feb 2026 17:02:27 -0400 Subject: [PATCH 15/20] fix: RewardGuide get_feedback() now uses deepcopy - prevents env horizon exhaustion during multi-candidate validation --- examples/notebooks/t6_m2_trainers.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks/t6_m2_trainers.ipynb b/examples/notebooks/t6_m2_trainers.ipynb index ba76e84f..8984f43e 100644 --- a/examples/notebooks/t6_m2_trainers.ipynb +++ b/examples/notebooks/t6_m2_trainers.ipynb @@ -79,7 +79,7 @@ "id": "cell-setup-env", "metadata": {}, "outputs": [], - "source": "# Import SixHumpCamel from Allen's example file\n_examples_dir = os.path.join(_repo_root, 'examples')\nif _examples_dir not in sys.path:\n sys.path.insert(0, _examples_dir)\nfrom multi_objective_convex_fn import SixHumpCamel\n\n\n# --- RewardGuide (defined here with correct copy import) ---\nclass RewardGuide(Guide):\n \"\"\"Multi-objective guide for convex function environments.\n\n get_feedback() -> (float, str): advances real env (training loop).\n get_score_dict() -> Dict[str, float]: uses deepcopy (selection path).\n \"\"\"\n\n def __init__(self, env):\n self.env = env\n\n def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n obs, reward, done, info = self.env.step(str(response))\n feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n return float(reward), feedback\n\n def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n env_copy = copy.deepcopy(self.env)\n obs, reward, done, info = env_copy.step(str(response))\n base_loss = info.get(\"base_loss\")\n reg_loss = info.get(\"reg_loss\")\n if base_loss is None or reg_loss is None:\n base_loss = float(\"inf\")\n reg_loss = float(\"inf\")\n return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n\n\n# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n@trace.model\nclass ConvexAgent:\n def __init__(self, initial_value):\n self.param = trace.node(\n initial_value, trainable=True,\n description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n )\n\n def forward(self, x):\n return self.param\n\n\n# --- DummyLLM callable: proposes x = [float, float] values ---\nclass ConvexLLMCallable:\n \"\"\"Returns cycling proposals spanning the SixHumpCamel landscape.\"\"\"\n\n PROPOSALS = [\n \"x = [0.09, -0.71]\", # very close to optimum 1\n \"x = [-0.09, 0.71]\", # very close to optimum 2\n \"x = [0.1, -0.7]\", # near optimum 1\n \"x = [-0.1, 0.7]\", # near optimum 2\n \"x = [0.5, -0.3]\", # moderate region\n \"x = [-0.5, 0.3]\", # moderate symmetric\n \"x = [0.2, -0.5]\", # exploring\n \"x = [-0.3, 0.6]\", # exploring\n \"x = [1.0, -1.0]\", # far from optima (high reg)\n \"x = [0.0, 0.0]\", # origin (zero loss)\n ]\n\n def __init__(self):\n self.idx = 0\n\n def __call__(self, messages, **kwargs):\n problem = messages[1][\"content\"]\n name = re.findall(r'', problem)\n name = name[0] if name else \"unknown\"\n value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n self.idx += 1\n return (\n f\" Exploring the loss landscape. 
\\n\"\n f\"\\n\"\n f\" {name} \\n\"\n f\" {value} \\n\"\n f\"\"\n )\n\n\n# --- Post-training evaluation: get actual losses from the final parameter ---\ndef evaluate_final_losses(param_value):\n \"\"\"Evaluate a parameter string on a fresh SixHumpCamel env.\n\n Returns dict with base_loss, reg_loss, total_loss (all actual values).\n \"\"\"\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n x, stop = env.text_extract(str(param_value))\n if x is None:\n return {\"base_loss\": float(\"nan\"), \"reg_loss\": float(\"nan\"), \"total_loss\": float(\"nan\")}\n base, reg, total = env._eval_losses(x)\n return {\"base_loss\": float(base), \"reg_loss\": float(reg), \"total_loss\": float(total)}\n\n\n# --- Factory: create fresh env + agent + optimizer + guide per run ---\n\ndef make_basicsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BasicSearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_beamsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BeamsearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_priority_search_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = PrioritySearch(agent, optimizer)\n return algo, guide, agent\n\n\n# Objective configs\nCONFIG_WEIGHTED = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n seed=0,\n)\n\nCONFIG_PARETO = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"base_loss\": 0.7, \"reg_loss\": 0.3},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n tie_break=\"weighted\",\n seed=42,\n)\n\n# Results collector\nresults = {}\nprint(\"Setup complete. 
SixHumpCamel environment + DummyLLM ready.\")\nprint(f\"DummyLLM has {len(ConvexLLMCallable.PROPOSALS)} diverse proposals.\")" + "source": "# Import SixHumpCamel from Allen's example file\n_examples_dir = os.path.join(_repo_root, 'examples')\nif _examples_dir not in sys.path:\n sys.path.insert(0, _examples_dir)\nfrom multi_objective_convex_fn import SixHumpCamel\n\n\n# --- RewardGuide (defined here with correct copy import) ---\nclass RewardGuide(Guide):\n \"\"\"Multi-objective guide for convex function environments.\n\n Both get_feedback() and get_score_dict() evaluate on a deepcopy of\n the environment so that candidate scoring never advances the real env.\n The SixHumpCamel loss computation is stateless (only depends on x),\n so deepcopy is safe and avoids burning through the env's horizon\n during multi-candidate validation.\n \"\"\"\n\n def __init__(self, env):\n self.env = env\n\n def _score_on_copy(self, response):\n \"\"\"Evaluate response on a deepcopy — env state stays frozen.\"\"\"\n env_copy = copy.deepcopy(self.env)\n obs, reward, done, info = env_copy.step(str(response))\n return obs, reward, done, info\n\n def get_feedback(self, query, response, reference=None, **kwargs) -> Tuple[float, str]:\n obs, reward, done, info = self._score_on_copy(response)\n feedback = ((obs + \"\\n\\n\") if obs else \"\") + info.get(\"feedback\", \"\")\n return float(reward), feedback\n\n def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n obs, reward, done, info = self._score_on_copy(response)\n base_loss = info.get(\"base_loss\")\n reg_loss = info.get(\"reg_loss\")\n if base_loss is None or reg_loss is None:\n base_loss = float(\"inf\")\n reg_loss = float(\"inf\")\n return {\"base_loss\": float(base_loss), \"reg_loss\": float(reg_loss)}\n\n\n# --- Agent: wraps a trace node that holds the x = [x1, x2] string ---\n@trace.model\nclass ConvexAgent:\n def __init__(self, initial_value):\n self.param = trace.node(\n initial_value, trainable=True,\n description=\"Input x into the hidden function to minimize y. Format: x = [x1, x2]\"\n )\n\n def forward(self, x):\n return self.param\n\n\n# --- DummyLLM callable: proposes x = [float, float] values ---\nclass ConvexLLMCallable:\n \"\"\"Returns cycling proposals spanning the SixHumpCamel landscape.\"\"\"\n\n PROPOSALS = [\n \"x = [0.09, -0.71]\", # very close to optimum 1\n \"x = [-0.09, 0.71]\", # very close to optimum 2\n \"x = [0.1, -0.7]\", # near optimum 1\n \"x = [-0.1, 0.7]\", # near optimum 2\n \"x = [0.5, -0.3]\", # moderate region\n \"x = [-0.5, 0.3]\", # moderate symmetric\n \"x = [0.2, -0.5]\", # exploring\n \"x = [-0.3, 0.6]\", # exploring\n \"x = [1.0, -1.0]\", # far from optima (high reg)\n \"x = [0.0, 0.0]\", # origin (zero loss)\n ]\n\n def __init__(self):\n self.idx = 0\n\n def __call__(self, messages, **kwargs):\n problem = messages[1][\"content\"]\n name = re.findall(r'', problem)\n name = name[0] if name else \"unknown\"\n value = self.PROPOSALS[self.idx % len(self.PROPOSALS)]\n self.idx += 1\n return (\n f\" Exploring the loss landscape. 
\\n\"\n f\"\\n\"\n f\" {name} \\n\"\n f\" {value} \\n\"\n f\"\"\n )\n\n\n# --- Post-training evaluation: get actual losses from the final parameter ---\ndef evaluate_final_losses(param_value):\n \"\"\"Evaluate a parameter string on a fresh SixHumpCamel env.\n\n Returns dict with base_loss, reg_loss, total_loss (all actual values).\n \"\"\"\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n x, stop = env.text_extract(str(param_value))\n if x is None:\n return {\"base_loss\": float(\"nan\"), \"reg_loss\": float(\"nan\"), \"total_loss\": float(\"nan\")}\n base, reg, total = env._eval_losses(x)\n return {\"base_loss\": float(base), \"reg_loss\": float(reg), \"total_loss\": float(total)}\n\n\n# --- Factory: create fresh env + agent + optimizer + guide per run ---\n\ndef make_basicsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BasicSearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_beamsearch_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = BeamsearchAlgorithm(agent, optimizer)\n return algo, guide, agent\n\n\ndef make_priority_search_run():\n env = SixHumpCamel(horizon=200, norm_coef=1.0, seed=42)\n env.reset(seed=42)\n guide = RewardGuide(env)\n agent = ConvexAgent(\"x = [0.0, 0.0]\")\n llm = DummyLLM(ConvexLLMCallable())\n optimizer = OptoPrimeV2(agent.parameters(), llm=llm)\n algo = PrioritySearch(agent, optimizer)\n return algo, guide, agent\n\n\n# Objective configs\nCONFIG_WEIGHTED = ObjectiveConfig(\n mode=\"weighted\",\n weights={\"base_loss\": 1.0, \"reg_loss\": 1.0},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n seed=0,\n)\n\nCONFIG_PARETO = ObjectiveConfig(\n mode=\"pareto\",\n weights={\"base_loss\": 0.7, \"reg_loss\": 0.3},\n minimize=frozenset({\"base_loss\", \"reg_loss\"}),\n tie_break=\"weighted\",\n seed=42,\n)\n\n# Results collector\nresults = {}\nprint(\"Setup complete. SixHumpCamel environment + DummyLLM ready.\")\nprint(f\"DummyLLM has {len(ConvexLLMCallable.PROPOSALS)} diverse proposals.\")" }, { "cell_type": "code", From 427cb3f621b35b8712f8fb2593af50e4c1e054e1 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Thu, 19 Feb 2026 17:51:50 -0400 Subject: [PATCH 16/20] fix: Beamsearch train() returns final_validation_score instead of hardcoded 0.0 when no test_dataset --- opto/trainer/algorithms/beamsearch_algorithm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/opto/trainer/algorithms/beamsearch_algorithm.py b/opto/trainer/algorithms/beamsearch_algorithm.py index fc8352bf..43655f0c 100644 --- a/opto/trainer/algorithms/beamsearch_algorithm.py +++ b/opto/trainer/algorithms/beamsearch_algorithm.py @@ -308,8 +308,10 @@ def train(self, print_color(f"Depth {depth}: Test score = {score:.4f}", 'cyan') # For API consistency with other algorithms - return metrics, final_test_score if final_test_score is not None else 0.0 - + # Prefer test score when available; fall back to the validation score + # from the final select() call (not 0.0, which hides the real result). 
+ return metrics, final_test_score if final_test_score is not None else final_validation_score + def _sample_minibatch(self, dataset, batch_size): """Sample a minibatch from the dataset.""" indices = np.random.choice(len(dataset['inputs']), min(batch_size, len(dataset['inputs'])), replace=False) @@ -752,7 +754,9 @@ def train(self, for d, s in zip(metrics['test_depths'], metrics['test_scores']): print_color(f"Depth {d}: Test score = {s:.4f}", 'cyan') - return metrics, final_test_score if final_test_score is not None else -np.inf + # Prefer test score when available; fall back to the validation score + # from the final select() call. + return metrics, final_test_score if final_test_score is not None else final_validation_score def expand(self, beam_params: Dict, From 106bd111912167cae0c4012bbd006f05002867a7 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Fri, 20 Feb 2026 11:05:43 -0400 Subject: [PATCH 17/20] T6 M2: BBEH notebook with dotenv + OpenAI direct support --- examples/notebooks/t6_m2_bbeh.ipynb | 1197 +++++++++++++++++++++++++++ 1 file changed, 1197 insertions(+) create mode 100644 examples/notebooks/t6_m2_bbeh.ipynb diff --git a/examples/notebooks/t6_m2_bbeh.ipynb b/examples/notebooks/t6_m2_bbeh.ipynb new file mode 100644 index 00000000..3e2329d5 --- /dev/null +++ b/examples/notebooks/t6_m2_bbeh.ipynb @@ -0,0 +1,1197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cell-title", + "metadata": {}, + "source": [ + "# T6 M2 — BBEH Boolean Expressions with Multi-Objective Instrumentation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/pull/61/head/examples/notebooks/t6_m2_bbeh.ipynb)\n", + "\n", + "**Milestone 2 Deliverable** — Multi-objective scoring on a real LLM task\n", + "\n", + "This notebook demonstrates multi-objective optimization on the **BBEH boolean_expressions** benchmark\n", + "using the **PAL (Program-Aided Language model)** strategy from Xavier’s original experiment.\n", + "\n", + "Two objectives are tracked:\n", + "- **accuracy** (binary: 1.0 = correct, 0.0 = wrong)\n", + "- **execution_time_s** (wall-clock seconds for the generated Python code)\n", + "\n", + "The `LangGraphGuide.get_score_dict()` method returns both metrics per example,\n", + "enabling the M2 multi-objective infrastructure to track and visualize tradeoffs.\n", + "\n", + "**Requires a real LLM API key** (OpenRouter recommended, default model: `openai/gpt-5-nano`).\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-config", + "metadata": {}, + "outputs": [], + "source": "import os\n\n# -----------------------\n# Load .env file (if present) so API keys are available via os.getenv()\n# -----------------------\ntry:\n from dotenv import load_dotenv\n # Walk up from notebook dir to find .env (works locally and in Colab)\n _env_candidates = [\".env\", \"../.env\", \"../../.env\", \"../../../.env\"]\n for _ep in _env_candidates:\n if os.path.exists(_ep):\n load_dotenv(_ep, override=False)\n print(f\"Loaded .env from: {os.path.abspath(_ep)}\")\n break\n else:\n print(\"No .env file found (will use existing env vars).\")\nexcept ImportError:\n print(\"python-dotenv not installed (pip install python-dotenv). 
Using existing env vars.\")\n\n# -----------------------\n# Core defaults (edit me)\n# -----------------------\nBBEH_TASK_NAME = os.getenv(\"BBEH_TASK_NAME\", \"bbeh_boolean_expressions\")\n\n# Data split\nN_TRAIN = int(os.getenv(\"N_TRAIN\", \"20\"))\nN_VAL = int(os.getenv(\"N_VAL\", \"10\"))\nSEED = int(os.getenv(\"SEED\", \"0\"))\n\n# CurriculumBuffer Mode B\nVALIDATE_ON_LAST_N = int(os.getenv(\"VALIDATE_ON_LAST_N\", \"2\"))\nACCUMULATION_STEPS = int(os.getenv(\"ACCUMULATION_STEPS\", \"2\"))\n\n# Optimization loop controls\nLEARNING_RETRY = int(os.getenv(\"LEARNING_RETRY\", \"20\"))\nMAX_ATTEMPTS = int(os.getenv(\"MAX_ATTEMPTS\", \"10\"))\n\nSKIP_OPTIMIZATION = os.getenv(\"SKIP_OPTIMIZATION\", \"0\") == \"1\"\n\n# Output\nOUTPUT_FOLDER = os.getenv(\"OUTPUT_FOLDER\", \"./trace_runs\")\n\n# Optional verbosity toggles\nSHOW_MERMAID_GRAPH = os.getenv(\"SHOW_MERMAID_GRAPH\", \"0\") == \"1\"\nSHOW_OPT_TRACE = os.getenv(\"SHOW_OPT_TRACE\", \"0\") == \"1\"\n\ntry:\n import google.colab\n IN_COLAB = True\nexcept ImportError:\n IN_COLAB = False\n\nprint(\"Config:\")\nprint(f\" {BBEH_TASK_NAME=}\")\nprint(f\" {N_TRAIN=}, {N_VAL=}, {SEED=}\")\nprint(f\" {VALIDATE_ON_LAST_N=}, {ACCUMULATION_STEPS=}\")\nprint(f\" {LEARNING_RETRY=}, {MAX_ATTEMPTS=}\")\nprint(f\" {SKIP_OPTIMIZATION=}\")\nprint(f\" {OUTPUT_FOLDER=}\")" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-setup", + "metadata": {}, + "outputs": [], + "source": [ + "import os, sys\n", + "\n", + "if IN_COLAB:\n", + " if not os.path.exists('/content/Trace'):\n", + " print(\"Setting up Trace...\")\n", + " %pip install langgraph langchain langchain_openai datasets tqdm langchain_community litellm dspy black matplotlib pandas\n", + " %git clone https://github.com/AgentOpt/OpenTrace.git Trace\n", + " %cd Trace\n", + " %git pull origin experimental && git checkout experimental\n", + " %sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n", + " %pip install -e .\n", + " sys.path.append('/content/Trace')\n", + "else:\n", + " # Local: add repo root to sys.path\n", + " _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n", + " _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n", + " if _repo_root not in sys.path:\n", + " sys.path.insert(0, _repo_root)\n", + "\n", + "# Clone BBEH benchmark tasks\n", + "if not os.path.exists('bbeh'):\n", + " !git clone https://github.com/google-deepmind/bbeh.git\n", + "else:\n", + " print(\"bbeh/ already exists, skipping clone.\")\n", + "\n", + "# Soft-import display\n", + "try:\n", + " from IPython.display import display\n", + "except Exception:\n", + " def display(*args, **kwargs):\n", + " return None\n", + "\n", + "print(f\"{IN_COLAB=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-llm-config", + "metadata": {}, + "outputs": [], + "source": "import os\nfrom langchain_core.messages import HumanMessage, SystemMessage\nfrom langchain_openai import ChatOpenAI\n\n# -----------------------\n# LLM config — auto-detect from available API keys\n# -----------------------\n# Priority: LLM_SERVICE env var (explicit override) > OPENAI_API_KEY > OPENROUTER_API_KEY > CUSTOMLLM_API_KEY\n# When OPENAI_API_KEY is available, uses gpt-5-nano directly via OpenAI (no OpenRouter prefix).\n\ndef _get_secret(name: str) -> str | None:\n try:\n from google.colab import userdata\n v = userdata.get(name)\n if v:\n return v\n except Exception:\n pass\n return os.getenv(name)\n\nOPENAI_API_KEY = 
_get_secret(\"OPENAI_API_KEY\")\nOPENROUTER_API_KEY = _get_secret(\"OPENROUTER_API_KEY\")\nCUSTOMLLM_API_KEY = _get_secret(\"CUSTOMLLM_API_KEY\")\nCUSTOMLLM_URL = os.getenv(\"CUSTOMLLM_URL\", \"http://localhost:4000/\")\n\n# Auto-detect service if not explicitly set\n_explicit_service = os.getenv(\"LLM_SERVICE\")\nif _explicit_service:\n LLM_SERVICE = _explicit_service\nelif OPENAI_API_KEY:\n LLM_SERVICE = \"openai\"\nelif OPENROUTER_API_KEY:\n LLM_SERVICE = \"openrouter\"\nelif CUSTOMLLM_API_KEY:\n LLM_SERVICE = \"customllm\"\nelse:\n raise ValueError(\n \"No API key found. Set OPENAI_API_KEY, OPENROUTER_API_KEY, or CUSTOMLLM_API_KEY \"\n \"(via env var, .env file, or Colab secret).\"\n )\n\n# Model name: OpenRouter uses \"openai/gpt-5-nano\" prefix, OpenAI uses \"gpt-5-nano\" directly\n_default_model = os.getenv(\"LLM_GENERAL_MODEL\")\nif _default_model:\n LLM_GENERAL_MODEL = _default_model\nelif LLM_SERVICE == \"openai\":\n LLM_GENERAL_MODEL = \"gpt-5-nano\"\nelse:\n LLM_GENERAL_MODEL = \"openai/gpt-5-nano\"\n\nif LLM_SERVICE == \"openai\":\n if not OPENAI_API_KEY:\n raise ValueError(\"OPENAI_API_KEY missing (set env var, .env file, or Colab secret).\")\n os.environ[\"OPENAI_BASE_URL\"] = \"https://api.openai.com/v1\"\n os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\nelif LLM_SERVICE == \"openrouter\":\n if not OPENROUTER_API_KEY:\n raise ValueError(\"OPENROUTER_API_KEY missing (set env var, .env file, or Colab secret).\")\n os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n os.environ[\"OPENAI_API_KEY\"] = OPENROUTER_API_KEY\nelif LLM_SERVICE == \"customllm\":\n if not CUSTOMLLM_API_KEY:\n raise ValueError(\"CUSTOMLLM_API_KEY missing (set env var, .env file, or Colab secret).\")\n os.environ[\"OPENAI_BASE_URL\"] = CUSTOMLLM_URL\n os.environ[\"OPENAI_API_KEY\"] = CUSTOMLLM_API_KEY\nelse:\n raise ValueError(f\"Unknown LLM_SERVICE: {LLM_SERVICE!r}\")\n\nllm = ChatOpenAI(model_name=LLM_GENERAL_MODEL, temperature=0)\n\ndef llm_call(prompt: str, system_instructions: str = \"\") -> str:\n msgs = [HumanMessage(content=prompt)]\n if system_instructions:\n msgs.insert(0, SystemMessage(content=system_instructions))\n return llm.invoke(msgs).content\n\nprint(\"LLM ready:\", {\"service\": LLM_SERVICE, \"model\": LLM_GENERAL_MODEL})" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-framework", + "metadata": {}, + "outputs": [], + "source": [ + "import os, json, random, inspect, time\n", + "from copy import deepcopy\n", + "from typing import Dict, Tuple, Optional\n", + "\n", + "# ---- Trace imports (OpenTrace / opto) ----\n", + "try:\n", + " from opto.trace import node, bundle\n", + " from opto.trace.bundle import FunModule\n", + " from opto.optimizers.optoprime_v2 import OptoPrimeV2 as OptoPrime\n", + " from opto.trainer.guide import Guide as _TraceGuide\n", + " from opto.trainer.algorithms.basic_algorithms import Minibatch as _TraceMinibatch\n", + "except Exception as e:\n", + " raise ImportError(\n", + " \"Could not import OpenTrace (opto.*). 
\"\n", + " \"Make sure OpenTrace is installed and on sys.path.\"\n", + " ) from e\n", + "\n", + "\n", + "# -----------------------\n", + "# Small helpers\n", + "# -----------------------\n", + "def set_dict(state: dict, key, value):\n", + " (state.data if hasattr(state, \"data\") else state)[key] = value\n", + "\n", + "def get_no_node(x):\n", + " return x.data if hasattr(x, \"data\") else x\n", + "\n", + "def _snapshot_params(parameters):\n", + " snap = {}\n", + " for p in parameters:\n", + " try:\n", + " snap[p.name] = deepcopy(p.data)\n", + " except Exception:\n", + " snap[p.name] = p.data\n", + " return snap\n", + "\n", + "def _params_changed(before, after) -> bool:\n", + " if before.keys() != after.keys():\n", + " return True\n", + " for k in before.keys():\n", + " if str(before[k]) != str(after[k]):\n", + " return True\n", + " return False\n", + "\n", + "def _replace_in_scope_by_identity(scope: dict, old_obj, new_obj) -> list[str]:\n", + " replaced = []\n", + " for k, v in list(scope.items()):\n", + " if v is old_obj:\n", + " scope[k] = new_obj\n", + " replaced.append(k)\n", + " return replaced\n", + "\n", + "def bind_function(func, *, trainable=True, traceable_code=True, allow_external_dependencies=True):\n", + " \"\"\"Safely bundle() a python function into a Trace FunModule (only once).\"\"\"\n", + " if func is None or not callable(func):\n", + " return func\n", + " if isinstance(func, FunModule):\n", + " return func\n", + " fm = bundle(trainable=trainable,\n", + " traceable_code=traceable_code,\n", + " allow_external_dependencies=allow_external_dependencies)(func)\n", + " try:\n", + " fm.__signature__ = inspect.signature(fm._fun)\n", + " except Exception:\n", + " pass\n", + " return fm\n", + "\n", + "\n", + "# -----------------------\n", + "# Guide: graph output -> (score, feedback) + multi-objective score_dict\n", + "# -----------------------\n", + "class LangGraphGuide(_TraceGuide):\n", + " \"\"\"Guide for LangGraph-based agents with multi-objective scoring.\n", + "\n", + " get_feedback() returns the original scalar (accuracy) score and feedback string.\n", + " get_score_dict() returns {accuracy, execution_time_s} for multi-objective tracking.\n", + "\n", + " The execution_time_s is populated per-call via _last_execution_time_s,\n", + " set externally after each graph invocation from the instrumented execute_code() node.\n", + " \"\"\"\n", + "\n", + " def __init__(self, feedback_func, *, answer_key=\"final_answer\", allowed_answer_set=None):\n", + " self.feedback_func = feedback_func\n", + " self.answer_key = answer_key\n", + " self.allowed = allowed_answer_set\n", + " self._last_execution_time_s = 0.0\n", + "\n", + " def _extract_answer(self, response):\n", + " \"\"\"Extract the answer value from a graph response.\"\"\"\n", + " try:\n", + " if isinstance(response, dict) or (hasattr(response, \"data\") and isinstance(get_no_node(response), dict)):\n", + " return get_no_node(get_no_node(response)[self.answer_key])\n", + " else:\n", + " return get_no_node(response)\n", + " except Exception:\n", + " return get_no_node(response)\n", + "\n", + " def get_feedback(self, query, response, reference, **kwargs):\n", + " extracted = self._extract_answer(response)\n", + " if self.allowed is not None:\n", + " ok, fb = self.feedback_func(extracted, reference, self.allowed)\n", + " else:\n", + " ok, fb = self.feedback_func(extracted, reference)\n", + " return float(bool(ok)), fb\n", + "\n", + " def get_score_dict(self, query, response, reference=None, **kwargs) -> Dict[str, float]:\n", + " 
\"\"\"Return multi-objective scores: accuracy and execution time.\n", + "\n", + " accuracy: 1.0 if correct, 0.0 if wrong (to maximize)\n", + " execution_time_s: wall-clock seconds for code execution (to minimize)\n", + " \"\"\"\n", + " extracted = self._extract_answer(response)\n", + " if self.allowed is not None:\n", + " ok, _ = self.feedback_func(extracted, reference, self.allowed)\n", + " else:\n", + " ok, _ = self.feedback_func(extracted, reference)\n", + " return {\n", + " \"accuracy\": float(bool(ok)),\n", + " \"execution_time_s\": self._last_execution_time_s,\n", + " }\n", + "\n", + " def copy(self):\n", + " g = LangGraphGuide(self.feedback_func, answer_key=self.answer_key, allowed_answer_set=self.allowed)\n", + " g._last_execution_time_s = self._last_execution_time_s\n", + " return g\n", + "\n", + "\n", + "# -----------------------\n", + "# CurriculumBuffer\n", + "# -----------------------\n", + "class CurriculumBuffer:\n", + " \"\"\"Mode A (fixed pool) if training_pool is provided; Mode B (curriculum) otherwise.\"\"\"\n", + " def __init__(self, training_pool=None, *, history_size=2, sample_with_replacement=True, seed=None):\n", + " self.pool = list(training_pool) if training_pool else []\n", + " self.history = []\n", + " self.history_size = int(history_size)\n", + " self.replacement = bool(sample_with_replacement)\n", + " self._rng = random.Random(seed)\n", + "\n", + " @property\n", + " def is_fixed_pool(self) -> bool:\n", + " return len(self.pool) > 0\n", + "\n", + " def add_success(self, example: dict):\n", + " self.history.append(example)\n", + " if len(self.history) > self.history_size:\n", + " self.history.pop(0)\n", + "\n", + " def sample_batch(self, batch_size: int, *, current_question=None, current_solution=None) -> list[dict]:\n", + " if self.is_fixed_pool:\n", + " k = batch_size if self.replacement else min(batch_size, len(self.pool))\n", + " return self._rng.choices(self.pool, k=k) if self.replacement else self._rng.sample(self.pool, k=k)\n", + " batch = []\n", + " max_steps = min(batch_size, 1 + len(self.history))\n", + " for i in range(max_steps):\n", + " if i == 0:\n", + " batch.append({\"question\": current_question, \"solution\": current_solution})\n", + " else:\n", + " ex = self.history[-i]\n", + " batch.append({\"question\": ex[\"question\"], \"solution\": ex.get(\"solution\", ex.get(\"answer\"))})\n", + " return batch\n", + "\n", + "\n", + "# -----------------------\n", + "# Trainer\n", + "# -----------------------\n", + "class LangGraphTrainer(_TraceMinibatch):\n", + " def __init__(self, *, graph_root_function: str, graph_agents_functions: list[str], scope: dict,\n", + " optimizer, parameters: list):\n", + " object.__init__(self)\n", + " self.root_name = graph_root_function\n", + " self.agent_names = list(graph_agents_functions)\n", + " self.scope = scope\n", + " self.optimizer = optimizer\n", + " self.parameters = list(parameters)\n", + " self._original_root = scope[graph_root_function]\n", + " self._original_agents = {n: scope[n] for n in graph_agents_functions if n in scope}\n", + "\n", + " def restore_originals(self):\n", + " self.scope[self.root_name] = self._original_root\n", + " for name, orig in self._original_agents.items():\n", + " self.scope[name] = orig\n", + "\n", + " def _check_corruption(self) -> bool:\n", + " restored = False\n", + " for name in self.agent_names:\n", + " agent = self.scope.get(name)\n", + " if isinstance(agent, FunModule) and getattr(agent, \"_fun\", None) is None:\n", + " print(f\"\\u26a0\\ufe0f corruption: '{name}' has 
._fun=None. Restoring original.\")\n", + " self.scope[name] = self._original_agents[name]\n", + " restored = True\n", + " return restored\n", + "\n", + " def _run_one(self, question, solution, guide: LangGraphGuide):\n", + " answer_key = guide.answer_key\n", + " try:\n", + " answer = self.scope[self.root_name](question)\n", + " score, feedback = guide.get_feedback(question, answer, solution)\n", + " ok = score >= 1.0\n", + " except Exception as e:\n", + " ok = False\n", + " feedback = f\"ERROR: {e}\"\n", + " answer = {answer_key: node(\"DUMMY_ANSWER\")}\n", + " return answer, ok, feedback\n", + "\n", + " def train(self, *, guide: LangGraphGuide, buffer: CurriculumBuffer,\n", + " question=None, solution=None,\n", + " target_updates=20, max_attempts=10, batch_size=3,\n", + " test_optimization=True, stop_on_success=True,\n", + " run_dir=\".\", save_steps=True,\n", + " validation_set=None):\n", + " if validation_set is None:\n", + " validation_set = []\n", + "\n", + " answer_key = guide.answer_key\n", + " best_state = None\n", + " last_state = None\n", + " history = []\n", + " modified = False\n", + " updates_done = 0\n", + " global_attempt = 0\n", + "\n", + " os.makedirs(run_dir, exist_ok=True)\n", + "\n", + " while updates_done < int(target_updates):\n", + " step_attempt = 0\n", + " step_changed = False\n", + "\n", + " while step_attempt < int(max_attempts) and not step_changed:\n", + " step_attempt += 1\n", + " global_attempt += 1\n", + " attempt = global_attempt\n", + " print(f\"[opt] attempt={attempt} update_step={updates_done+1}/{target_updates} try={step_attempt}/{max_attempts}\")\n", + "\n", + " self.optimizer.zero_feedback()\n", + "\n", + " batch_examples = buffer.sample_batch(\n", + " int(batch_size),\n", + " current_question=question,\n", + " current_solution=solution,\n", + " )\n", + "\n", + " answers = []\n", + " feedbacks = []\n", + " batch_all_correct = True\n", + "\n", + " for ex in batch_examples:\n", + " eq = ex[\"question\"]\n", + " es = ex.get(\"solution\", ex.get(\"answer\"))\n", + " ans, ok, fb = self._run_one(eq, es, guide)\n", + " batch_all_correct = batch_all_correct and ok\n", + " answers.append(ans)\n", + " feedbacks.append(fb)\n", + "\n", + " if len(feedbacks) == 1:\n", + " common_feedback = feedbacks[0]\n", + " else:\n", + " common_feedback = \"\\n\".join([f\"Feedback #{i+1}: {fb}\" for i, fb in enumerate(feedbacks)])\n", + "\n", + " for ans in answers:\n", + " ans_node = ans.get(answer_key, ans) if isinstance(ans, dict) else ans\n", + " if not hasattr(ans_node, \"backward\"):\n", + " ans_node = node(str(ans_node))\n", + " self.optimizer.backward(\n", + " ans_node,\n", + " common_feedback,\n", + " visualize=bool(SHOW_OPT_TRACE),\n", + " print_limit=30,\n", + " )\n", + "\n", + " before = _snapshot_params(self.parameters)\n", + " self.optimizer.step(verbose=True)\n", + " after = _snapshot_params(self.parameters)\n", + " step_changed = _params_changed(before, after)\n", + "\n", + " if self._check_corruption():\n", + " step_changed = False\n", + "\n", + " if not step_changed:\n", + " print(\"[opt] no parameter change, retrying...\")\n", + " continue\n", + "\n", + " updates_done += 1\n", + " modified = True\n", + " last_state = {p.name: p.data for p in self.parameters}\n", + "\n", + " val_acc = None\n", + " if validation_set:\n", + " n_ok = 0\n", + " for v in validation_set:\n", + " _, vok, _ = self._run_one(v[\"question\"], v.get(\"solution\", v.get(\"answer\")), guide)\n", + " n_ok += int(vok)\n", + " val_acc = n_ok / float(len(validation_set))\n", + "\n", + " if 
save_steps:\n", + " try:\n", + " step_path = os.path.join(run_dir, f\"step_{updates_done:03d}_state.txt\")\n", + " with open(step_path, \"w\") as f:\n", + " for nm, val in last_state.items():\n", + " f.write(f\"{nm}: {val}\\n\")\n", + " except Exception as e:\n", + " print(f\"\\u26a0\\ufe0f could not save step state: {e}\")\n", + "\n", + " if test_optimization and question is not None:\n", + " _, cur_ok, cur_fb = self._run_one(question, solution, guide)\n", + " val_ok = True\n", + " for v in validation_set:\n", + " _, vok, _ = self._run_one(v[\"question\"], v.get(\"solution\", v.get(\"answer\")), guide)\n", + " if not vok:\n", + " val_ok = False\n", + " break\n", + " if cur_ok and val_ok:\n", + " best_state = last_state\n", + " print(\"[opt] gate PASS:\", cur_fb)\n", + " if stop_on_success:\n", + " hist_entry = {\n", + " \"update_step\": updates_done,\n", + " \"attempt\": attempt,\n", + " \"batch_size\": int(batch_size),\n", + " \"mode\": \"fixed\" if buffer.is_fixed_pool else \"curriculum\",\n", + " \"train_batch_all_correct\": batch_all_correct,\n", + " \"val_acc\": val_acc,\n", + " \"gate_pass\": True,\n", + " }\n", + " history.append(hist_entry)\n", + " with open(os.path.join(run_dir, \"history.jsonl\"), \"a\") as f:\n", + " f.write(json.dumps(hist_entry, default=str) + \"\\n\")\n", + " return modified, history, best_state, last_state\n", + "\n", + " hist_entry = {\n", + " \"update_step\": updates_done,\n", + " \"attempt\": attempt,\n", + " \"batch_size\": int(batch_size),\n", + " \"mode\": \"fixed\" if buffer.is_fixed_pool else \"curriculum\",\n", + " \"train_batch_all_correct\": batch_all_correct,\n", + " \"val_acc\": val_acc,\n", + " \"gate_pass\": bool(best_state is not None),\n", + " }\n", + " history.append(hist_entry)\n", + " try:\n", + " with open(os.path.join(run_dir, \"history.jsonl\"), \"a\") as f:\n", + " f.write(json.dumps(hist_entry, default=str) + \"\\n\")\n", + " except Exception:\n", + " pass\n", + "\n", + " if stop_on_success and best_state is not None:\n", + " return modified, history, best_state, last_state\n", + "\n", + " if not step_changed:\n", + " print(f\"\\u26a0\\ufe0f stopping early: couldn't get a parameter update after {max_attempts} tries.\")\n", + " break\n", + "\n", + " return modified, history, best_state, last_state\n", + "\n", + "\n", + "# -----------------------\n", + "# optimize_langgraph (thin facade)\n", + "# -----------------------\n", + "def optimize_langgraph(\n", + " *,\n", + " graph_root_function: str,\n", + " graph_agents_functions: list[str],\n", + " question: str,\n", + " solution: str,\n", + " graph_prompts_list=None,\n", + " answer_feedback_func=None,\n", + " allowed_answer_set=None,\n", + " answer_key=\"final_answer\",\n", + " validation_set=None,\n", + " training_pool=None,\n", + " batch_size=None,\n", + " accumulation_steps=1,\n", + " sample_with_replacement=True,\n", + " seed=None,\n", + " updating_steps=None,\n", + " retry=5,\n", + " max_attempts=10,\n", + " stop_on_success=True,\n", + " test_optimization=True,\n", + " train_graph_agents_functions=True,\n", + " memory_size=1,\n", + " save_steps=True,\n", + " dump_prefix=\"\",\n", + " output_folder=None,\n", + " scope=None,\n", + " optimizer_cls=None,\n", + " trainer_cls=None,\n", + "):\n", + " if optimizer_cls is None:\n", + " optimizer_cls = OptoPrime\n", + " if trainer_cls is None:\n", + " trainer_cls = LangGraphTrainer\n", + " if scope is None:\n", + " scope = globals()\n", + " if validation_set is None:\n", + " validation_set = []\n", + " if seed is not None:\n", + " 
random.seed(seed)\n", + "\n", + " if isinstance(scope.get(graph_root_function), FunModule):\n", + " scope[graph_root_function] = scope[graph_root_function]._fun\n", + "\n", + " parameters = []\n", + " for name in graph_agents_functions:\n", + " if name not in scope:\n", + " raise KeyError(f\"'{name}' not found in scope.\")\n", + " scope[name] = bind_function(scope[name], trainable=train_graph_agents_functions)\n", + " parameters.extend(scope[name].parameters())\n", + "\n", + " if graph_prompts_list is not None:\n", + " for i, prompt in enumerate(list(graph_prompts_list)):\n", + " if hasattr(prompt, \"data\") and hasattr(prompt, \"name\"):\n", + " parameters.append(prompt)\n", + " continue\n", + " new_prompt = node(str(prompt), trainable=True)\n", + " _replace_in_scope_by_identity(scope, prompt, new_prompt)\n", + " graph_prompts_list[i] = new_prompt\n", + " parameters.append(new_prompt)\n", + "\n", + " if not parameters:\n", + " raise ValueError(\"No trainable parameters found (agents/prompts list is empty).\")\n", + "\n", + " opt = optimizer_cls(\n", + " parameters,\n", + " memory_size=memory_size,\n", + " objective=[\n", + " \"Improve the agent so it solves the task reliably.\",\n", + " \"Prefer simple, robust edits to prompts/code.\"\n", + " ],\n", + " )\n", + "\n", + " guide = LangGraphGuide(\n", + " feedback_func=answer_feedback_func,\n", + " answer_key=answer_key,\n", + " allowed_answer_set=allowed_answer_set,\n", + " )\n", + "\n", + " effective_batch_size = int(batch_size) if batch_size is not None else max(1, 1 + int(accumulation_steps))\n", + "\n", + " buffer = CurriculumBuffer(\n", + " training_pool=training_pool,\n", + " history_size=max(len(validation_set), 2) if validation_set else 2,\n", + " sample_with_replacement=sample_with_replacement,\n", + " seed=seed,\n", + " )\n", + " if (not buffer.is_fixed_pool) and validation_set:\n", + " for v in validation_set:\n", + " buffer.add_success(v)\n", + "\n", + " target_updates = int(updating_steps) if updating_steps is not None else int(retry)\n", + " _max_attempts = int(max_attempts)\n", + "\n", + " base_dir = output_folder or \".\"\n", + " os.makedirs(base_dir, exist_ok=True)\n", + " run_name = (\n", + " f\"{dump_prefix}{graph_root_function}\"\n", + " f\"__mode-{'fixed' if buffer.is_fixed_pool else 'curr'}\"\n", + " f\"__bs{effective_batch_size}\"\n", + " f\"__updates{target_updates}\"\n", + " f\"__maxA{_max_attempts}\"\n", + " f\"__mem{memory_size}\"\n", + " f\"__seed{seed if seed is not None else 'none'}\"\n", + " )\n", + " run_dir = os.path.join(base_dir, run_name)\n", + " os.makedirs(run_dir, exist_ok=True)\n", + "\n", + " trainer = trainer_cls(\n", + " graph_root_function=graph_root_function,\n", + " graph_agents_functions=graph_agents_functions,\n", + " scope=scope,\n", + " optimizer=opt,\n", + " parameters=parameters,\n", + " )\n", + " modified, history, best_state, last_state = trainer.train(\n", + " guide=guide,\n", + " buffer=buffer,\n", + " question=question,\n", + " solution=solution,\n", + " target_updates=target_updates,\n", + " max_attempts=_max_attempts,\n", + " batch_size=effective_batch_size,\n", + " test_optimization=test_optimization,\n", + " stop_on_success=stop_on_success,\n", + " save_steps=save_steps,\n", + " run_dir=run_dir,\n", + " validation_set=validation_set,\n", + " )\n", + "\n", + " chosen_state = best_state if best_state is not None else last_state\n", + " dump_filename = None\n", + " if modified and chosen_state is not None:\n", + " dump_filename = os.path.join(run_dir, \"best_state.txt\")\n", + " with 
open(dump_filename, \"w\") as f:\n", + " for nm, val in chosen_state.items():\n", + " f.write(f\"{nm}: {val}\\n\")\n", + "\n", + " if (not test_optimization) or (best_state is None):\n", + " trainer.restore_originals()\n", + "\n", + " return modified, dump_filename, history, chosen_state, run_dir\n", + "\n", + "\n", + "print(\"Framework loaded: LangGraphGuide (with get_score_dict), CurriculumBuffer, LangGraphTrainer, optimize_langgraph\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-pal-strategy", + "metadata": {}, + "outputs": [], + "source": [ + "import re, time\n", + "from langgraph.graph import StateGraph, START, END\n", + "\n", + "# -----------------------\n", + "# Strategy: PAL (Program-Aided Language model)\n", + "# -----------------------\n", + "prompt_parse_problem = node(\n", + " \"Read the problem and write Python code that sets a variable named `result` to the final answer.\\n\"\n", + " \"- Output ONLY valid Python (no markdown fences).\\n\"\n", + " \"- If the task is multiple-choice, set result to the option label exactly (e.g., '(A)').\\n\\n\"\n", + " \"Problem:\\n\",\n", + " trainable=True,\n", + " description=\"PAL prompt that generates python code producing a `result`.\"\n", + ")\n", + "\n", + "# Global variable to capture execution time from the graph invocation.\n", + "# This is read by run_solver_on_example() to populate the guide's score_dict.\n", + "_last_exec_time_s = 0.0\n", + "\n", + "def parse_problem(state: dict):\n", + " question = get_no_node(state.get(\"question\", \"\"))\n", + " prompt = prompt_parse_problem + question\n", + " code_str = llm_call(get_no_node(prompt))\n", + " return {\"code\": code_str.strip(), \"question\": question}\n", + "\n", + "def execute_code(state: dict):\n", + " \"\"\"Execute LLM-generated Python code, measuring wall-clock time.\n", + "\n", + " M2 instrumentation: wraps the exec() call with time.perf_counter()\n", + " so execution_time_s is available as a second objective.\n", + " \"\"\"\n", + " global _last_exec_time_s\n", + "\n", + " def strip_python_tags(code: str) -> str:\n", + " return re.sub(\n", + " r'(?s)(?:.*?```(?:python)?\\s*\\n(.*?)(?:\\n```.*)?|(.*))',\n", + " lambda m: m.group(1) if m.group(1) is not None else m.group(2),\n", + " code,\n", + " )\n", + "\n", + " update = {}\n", + " try:\n", + " code_to_run = strip_python_tags(get_no_node(state.get(\"code\", \"\")))\n", + " local_vars = {}\n", + "\n", + " # --- M2: measure wall-clock time for code execution ---\n", + " t0 = time.perf_counter()\n", + " exec(code_to_run, {}, local_vars) # noqa: S102 - intentional PAL strategy\n", + " t1 = time.perf_counter()\n", + " _last_exec_time_s = t1 - t0\n", + "\n", + " local_vars.pop(\"__builtins__\", None)\n", + "\n", + " if \"result\" in local_vars:\n", + " update[\"final_answer\"] = node(local_vars[\"result\"])\n", + " elif len(local_vars) == 1:\n", + " update[\"final_answer\"] = node(next(iter(local_vars.values())))\n", + " else:\n", + " update[\"final_answer\"] = node(None)\n", + "\n", + " except Exception as e:\n", + " _last_exec_time_s = 0.0\n", + " update[\"final_answer\"] = node(None)\n", + " update[\"error\"] = str(e)\n", + "\n", + " return update\n", + "\n", + "def create_graph_solve_with_PAL_Strategy():\n", + " g = StateGraph(dict)\n", + " g.add_node(\"parse\", parse_problem)\n", + " g.add_node(\"calculate\", execute_code)\n", + " g.add_edge(START, \"parse\")\n", + " g.add_edge(\"parse\", \"calculate\")\n", + " g.add_edge(\"calculate\", END)\n", + " return g\n", + "\n", + "def 
solve_with_PAL_Strategy(problem: str) -> dict:\n", + " global _last_exec_time_s\n", + " _last_exec_time_s = 0.0 # reset before each invocation\n", + "\n", + " g = create_graph_solve_with_PAL_Strategy()\n", + " compiled = g.compile()\n", + "\n", + " if SHOW_MERMAID_GRAPH:\n", + " try:\n", + " from IPython.display import Image, display\n", + " display(Image(compiled.get_graph(xray=1).draw_mermaid_png()))\n", + " except Exception:\n", + " pass\n", + "\n", + " result = compiled.invoke({\"question\": get_no_node(problem)})\n", + " if \"final_answer\" not in result:\n", + " return {\"final_answer\": node(\"No solution found\")}\n", + " if isinstance(result[\"final_answer\"], str):\n", + " return {\"final_answer\": node(result[\"final_answer\"])}\n", + " return result\n", + "\n", + "# Default graph spec\n", + "GRAPH_ROOT = \"solve_with_PAL_Strategy\"\n", + "GRAPH_AGENTS = [\"parse_problem\", \"execute_code\"]\n", + "GRAPH_PROMPTS = [prompt_parse_problem]\n", + "\n", + "print(\"PAL strategy loaded (with execution time instrumentation).\")\n", + "print(\"execute_code() measures wall-clock time for exec() via time.perf_counter().\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-bbeh-data", + "metadata": {}, + "outputs": [], + "source": [ + "import os, json, random, string\n", + "\n", + "# -----------------------\n", + "# BBEH dataset loader\n", + "# -----------------------\n", + "def _find_bbeh_tasks_dir() -> str:\n", + " candidates = [\n", + " \"bbeh/benchmark_tasks\",\n", + " \"bbeh/bbeh/benchmark_tasks\",\n", + " \"benchmark_tasks\",\n", + " ]\n", + " for c in candidates:\n", + " if os.path.exists(c):\n", + " return c\n", + " raise FileNotFoundError(\n", + " \"Could not locate BBEH benchmark_tasks folder.\\n\"\n", + " \"Clone the repo first, e.g. `git clone https://github.com/google-deepmind/bbeh.git`.\"\n", + " )\n", + "\n", + "bbeh_tasks_dir = _find_bbeh_tasks_dir()\n", + "print(\"BBEH tasks dir:\", bbeh_tasks_dir)\n", + "\n", + "LIMITED_BBEH_OUTPUT_TASKS = {\n", + " \"bbeh_boolean_expressions\": {\"(A)\", \"(B)\", \"(C)\", \"(D)\", \"(E)\"},\n", + "}\n", + "\n", + "def normalize_answer(ans) -> str:\n", + " if ans is None:\n", + " return \"\"\n", + " ans = str(ans).strip().lower()\n", + " ans = ans.translate(str.maketrans(\"\", \"\", string.punctuation))\n", + " ans = ans.replace(\" \", \"\")\n", + " return ans\n", + "\n", + "def feedback_answer_bbeh(predicted, target, allowed_set=None):\n", + " pred_raw = get_no_node(predicted)\n", + " pred_norm = normalize_answer(pred_raw)\n", + " target_norm = normalize_answer(target)\n", + "\n", + " allowed_norm = None\n", + " if allowed_set:\n", + " allowed_norm = {normalize_answer(a) for a in allowed_set}\n", + "\n", + " if pred_norm == target_norm:\n", + " return True, f\"SUCCESS: '{pred_raw}'\"\n", + " msg = f\"FAILED: '{pred_raw}' != '{target}'. 
Fix the code/prompt to solve similar problems.\"\n",
+   "    if allowed_norm is not None and pred_norm not in allowed_norm:\n",
+   "        msg += f\" (final answer must be one of: {sorted(allowed_set)})\"\n",
+   "    return False, msg\n",
+   "\n",
+   "def load_bbeh_examples(task_name: str, *, n_train: int, n_val: int, seed: int = 0):\n",
+   "    task_path = os.path.join(bbeh_tasks_dir, task_name, \"task.json\")\n",
+   "    if not os.path.exists(task_path):\n",
+   "        raise FileNotFoundError(f\"Task not found: {task_path}\")\n",
+   "\n",
+   "    with open(task_path, \"r\") as f:\n",
+   "        task = json.load(f)\n",
+   "\n",
+   "    examples = task.get(\"examples\", [])\n",
+   "    rng = random.Random(seed)\n",
+   "    rng.shuffle(examples)\n",
+   "\n",
+   "    allowed = LIMITED_BBEH_OUTPUT_TASKS.get(task_name)\n",
+   "    def _format_q(q: str) -> str:\n",
+   "        if allowed:\n",
+   "            return q + f\"\\n\\nAllowed final answer: {sorted(allowed)}\"\n",
+   "        return q\n",
+   "\n",
+   "    items = [{\"question\": _format_q(ex[\"input\"]), \"solution\": ex[\"target\"]} for ex in examples]\n",
+   "\n",
+   "    train = items[:n_train]\n",
+   "    val = items[n_train:n_train + n_val]\n",
+   "    return train, val, allowed\n",
+   "\n",
+   "train_set, val_set, allowed_set = load_bbeh_examples(\n",
+   "    BBEH_TASK_NAME,\n",
+   "    n_train=N_TRAIN,\n",
+   "    n_val=N_VAL,\n",
+   "    seed=SEED,\n",
+   ")\n",
+   "\n",
+   "print(f\"Loaded {len(train_set)} train and {len(val_set)} val examples for {BBEH_TASK_NAME}\")"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "cell-training-header",
+  "metadata": {},
+  "source": [
+   "---\n",
+   "## Training Loop with Multi-Objective Metric Collection\n",
+   "\n",
+   "The training loop below is identical to Xavier's original curriculum training,\n",
+   "with one addition: after each example (train or val), we collect the score dict\n",
+   "returned by `LangGraphGuide.get_score_dict()` to record both **accuracy** and\n",
+   "**execution_time_s**.\n",
+   "\n",
+   "This per-example metric log feeds the graphs in the next section.\n",
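+   "\n",
+   "As a minimal sketch of how these per-example score dicts could later drive\n",
+   "trainer-side selection (this notebook only *collects* them), a weighted\n",
+   "scalarization in the spirit of the plan's `weighted` mode might look like the\n",
+   "snippet below. The helper name `weighted_score` and the weight values are\n",
+   "illustrative assumptions, not part of the Trace API:\n",
+   "\n",
+   "```python\n",
+   "# Hypothetical sketch: collapse a score_dict into one scalar for selection.\n",
+   "# Signs encode direction: accuracy is rewarded, execution time is penalized.\n",
+   "WEIGHTS = {\"accuracy\": 1.0, \"execution_time_s\": -0.1}\n",
+   "\n",
+   "def weighted_score(score_dict: dict) -> float:\n",
+   "    # Missing metrics contribute 0.0, so partial dicts never raise.\n",
+   "    return sum(w * score_dict.get(k, 0.0) for k, w in WEIGHTS.items())\n",
+   "\n",
+   "# e.g. weighted_score({\"accuracy\": 1.0, \"execution_time_s\": 0.2}) == 0.98\n",
+   "```\n",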
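+   "\n",
+   "Relatedly, the scatter in Graph 2 further below can be read through Pareto\n",
+   "dominance: one point dominates another if it is no worse on both objectives\n",
+   "and strictly better on at least one. A self-contained sketch (assuming higher\n",
+   "accuracy and lower execution time are better; `dominates` and `pareto_front`\n",
+   "are illustrative names, not existing helpers):\n",
+   "\n",
+   "```python\n",
+   "def dominates(a: dict, b: dict) -> bool:\n",
+   "    # a dominates b: no worse on both objectives, strictly better on one.\n",
+   "    return (a[\"accuracy\"] >= b[\"accuracy\"]\n",
+   "            and a[\"execution_time_s\"] <= b[\"execution_time_s\"]\n",
+   "            and (a[\"accuracy\"] > b[\"accuracy\"]\n",
+   "                 or a[\"execution_time_s\"] < b[\"execution_time_s\"]))\n",
+   "\n",
+   "def pareto_front(points: list) -> list:\n",
+   "    # Keep only the points that no other point dominates.\n",
+   "    return [p for p in points if not any(dominates(q, p) for q in points)]\n",
+   "```"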
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-training", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Dict, Tuple\n", + "import time\n", + "\n", + "# -----------------------\n", + "# Multi-objective instrumented solver + evaluator\n", + "# -----------------------\n", + "\n", + "# Build the guide with multi-objective support\n", + "guide = LangGraphGuide(\n", + " feedback_func=feedback_answer_bbeh,\n", + " answer_key=\"final_answer\",\n", + " allowed_answer_set=allowed_set,\n", + ")\n", + "\n", + "def run_solver_on_example(ex: dict) -> Tuple[bool, str, str, Dict[str, float]]:\n", + " \"\"\"Run solver and return (ok, pred, feedback, score_dict).\n", + "\n", + " score_dict contains {accuracy, execution_time_s} from get_score_dict().\n", + " \"\"\"\n", + " global _last_exec_time_s\n", + " _last_exec_time_s = 0.0\n", + "\n", + " out = solve_with_PAL_Strategy(ex[\"question\"])\n", + " pred = get_no_node(out.get(\"final_answer\"))\n", + " ok, fb = feedback_answer_bbeh(pred, ex[\"solution\"], allowed_set)\n", + "\n", + " # Populate guide's execution time from the global, then get score_dict\n", + " guide._last_execution_time_s = _last_exec_time_s\n", + " score_dict = guide.get_score_dict(ex[\"question\"], out, ex[\"solution\"])\n", + "\n", + " return ok, str(pred), fb, score_dict\n", + "\n", + "def evaluate(examples: List[dict], *, name: str) -> Tuple[float, List[Dict[str, float]]]:\n", + " \"\"\"Evaluate examples, returning (accuracy, list of score_dicts).\"\"\"\n", + " n_ok = 0\n", + " all_score_dicts = []\n", + " for i, ex in enumerate(examples, 1):\n", + " ok, pred, fb, sd = run_solver_on_example(ex)\n", + " n_ok += int(ok)\n", + " all_score_dicts.append(sd)\n", + " print(f\"[{name}] {i:02d}/{len(examples)} ok={ok} pred={pred} \"\n", + " f\"exec_time={sd['execution_time_s']:.4f}s :: {fb}\")\n", + " acc = n_ok / max(1, len(examples))\n", + " mean_time = sum(sd['execution_time_s'] for sd in all_score_dicts) / max(1, len(all_score_dicts))\n", + " print(f\"[{name}] accuracy = {acc:.3f} ({n_ok}/{len(examples)}), mean exec_time = {mean_time:.4f}s\")\n", + " return acc, all_score_dicts\n", + "\n", + "\n", + "# =====================================================================\n", + "# Baseline evaluation\n", + "# =====================================================================\n", + "print(\"=\" * 60)\n", + "print(\"BASELINE evaluation on validation set\")\n", + "print(\"=\" * 60)\n", + "baseline_acc, baseline_score_dicts = evaluate(val_set, name=\"baseline/val\")\n", + "\n", + "# =====================================================================\n", + "# Per-step metric collection during curriculum training\n", + "# =====================================================================\n", + "# Stores {step, phase, accuracy, execution_time_s, example_idx} per observation\n", + "metric_log = []\n", + "step_counter = 0\n", + "\n", + "# Record baseline metrics\n", + "for i, sd in enumerate(baseline_score_dicts):\n", + " metric_log.append({\n", + " \"step\": 0,\n", + " \"phase\": \"baseline\",\n", + " \"example_idx\": i,\n", + " **sd,\n", + " })\n", + "\n", + "# =====================================================================\n", + "# Curriculum training (Mode B) with metric collection\n", + "# =====================================================================\n", + "if SKIP_OPTIMIZATION:\n", + " print(\"SKIP_OPTIMIZATION=1 -> skipping optimization/training.\")\n", + "else:\n", + " last_successes: List[dict] = []\n", + 
"\n", + " for idx, ex in enumerate(train_set, 1):\n", + " step_counter += 1\n", + " ok, pred, fb, sd = run_solver_on_example(ex)\n", + " print(f\"[train] {idx:02d}/{len(train_set)} ok={ok} pred={pred} \"\n", + " f\"exec_time={sd['execution_time_s']:.4f}s :: {fb}\")\n", + "\n", + " # Log pre-optimization metric\n", + " metric_log.append({\n", + " \"step\": step_counter,\n", + " \"phase\": \"train_pre\",\n", + " \"example_idx\": idx - 1,\n", + " **sd,\n", + " })\n", + "\n", + " if ok:\n", + " last_successes.append(ex)\n", + " last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n", + " continue\n", + "\n", + " # Optimize on the failing example\n", + " modified, dump_file, history, chosen_state, run_dir = optimize_langgraph(\n", + " graph_root_function=GRAPH_ROOT,\n", + " graph_agents_functions=GRAPH_AGENTS,\n", + " graph_prompts_list=GRAPH_PROMPTS,\n", + " question=ex[\"question\"],\n", + " solution=ex[\"solution\"],\n", + " answer_feedback_func=feedback_answer_bbeh,\n", + " allowed_answer_set=allowed_set,\n", + " validation_set=last_successes,\n", + " accumulation_steps=ACCUMULATION_STEPS,\n", + " retry=LEARNING_RETRY,\n", + " max_attempts=MAX_ATTEMPTS,\n", + " test_optimization=True,\n", + " stop_on_success=True,\n", + " seed=SEED,\n", + " dump_prefix=f\"BBEH_{BBEH_TASK_NAME}__PAL__\",\n", + " output_folder=OUTPUT_FOLDER,\n", + " )\n", + "\n", + " print(\"[train] optimize_langgraph:\", {\"modified\": modified, \"dump_file\": dump_file, \"run_dir\": run_dir})\n", + " if history:\n", + " print(\"[train] last history entry:\", history[-1])\n", + "\n", + " # Re-test after optimization\n", + " ok2, pred2, fb2, sd2 = run_solver_on_example(ex)\n", + " print(f\"[train] after-opt ok={ok2} pred={pred2} \"\n", + " f\"exec_time={sd2['execution_time_s']:.4f}s :: {fb2}\")\n", + "\n", + " # Log post-optimization metric\n", + " metric_log.append({\n", + " \"step\": step_counter,\n", + " \"phase\": \"train_post\",\n", + " \"example_idx\": idx - 1,\n", + " **sd2,\n", + " })\n", + "\n", + " if ok2:\n", + " last_successes.append(ex)\n", + " last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n", + "\n", + "# =====================================================================\n", + "# Post-training evaluation\n", + "# =====================================================================\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"POST-TRAINING evaluation on validation set\")\n", + "print(\"=\" * 60)\n", + "final_acc, final_score_dicts = evaluate(val_set, name=\"final/val\")\n", + "\n", + "# Record final eval metrics\n", + "step_counter += 1\n", + "for i, sd in enumerate(final_score_dicts):\n", + " metric_log.append({\n", + " \"step\": step_counter,\n", + " \"phase\": \"final\",\n", + " \"example_idx\": i,\n", + " **sd,\n", + " })\n", + "\n", + "print(f\"\\nSummary: baseline_val_acc={baseline_acc:.3f}, final_val_acc={final_acc:.3f}\")\n", + "print(f\"Total metric observations collected: {len(metric_log)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-graphs-header", + "metadata": {}, + "source": [ + "---\n", + "## Multi-Objective Graphs\n", + "\n", + "Three visualizations from the collected metric log:\n", + "\n", + "1. **Score progression** — accuracy and execution_time_s over training steps (dual y-axis)\n", + "2. **Scatter plot** — accuracy vs execution_time_s per example (baseline vs final)\n", + "3. 
**Summary table** — aggregate metrics before and after training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-graph-progression", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "# =====================================================================\n", + "# Graph 1: Score Progression (dual y-axis)\n", + "# Shows mean accuracy (left axis) and mean execution_time_s (right axis)\n", + "# aggregated per training step.\n", + "# =====================================================================\n", + "\n", + "# Aggregate metrics by step\n", + "steps_seen = sorted(set(m[\"step\"] for m in metric_log))\n", + "step_acc = []\n", + "step_time = []\n", + "\n", + "for s in steps_seen:\n", + " entries = [m for m in metric_log if m[\"step\"] == s]\n", + " mean_acc = np.mean([e[\"accuracy\"] for e in entries])\n", + " mean_time = np.mean([e[\"execution_time_s\"] for e in entries])\n", + " step_acc.append(mean_acc)\n", + " step_time.append(mean_time)\n", + "\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "\n", + "color_acc = '#1f77b4'\n", + "color_time = '#ff7f0e'\n", + "\n", + "ax1.set_xlabel('Training Step', fontsize=12)\n", + "ax1.set_ylabel('Mean Accuracy', fontsize=12, color=color_acc)\n", + "line1 = ax1.plot(steps_seen, step_acc, 'o-', color=color_acc, linewidth=2,\n", + " markersize=7, label='accuracy')\n", + "ax1.tick_params(axis='y', labelcolor=color_acc)\n", + "ax1.set_ylim(-0.05, 1.05)\n", + "\n", + "ax2 = ax1.twinx()\n", + "ax2.set_ylabel('Mean Execution Time (s)', fontsize=12, color=color_time)\n", + "line2 = ax2.plot(steps_seen, step_time, 's--', color=color_time, linewidth=2,\n", + " markersize=7, label='execution_time_s')\n", + "ax2.tick_params(axis='y', labelcolor=color_time)\n", + "\n", + "# Combined legend\n", + "lines = line1 + line2\n", + "labels = [l.get_label() for l in lines]\n", + "ax1.legend(lines, labels, loc='center right', fontsize=10)\n", + "\n", + "ax1.set_title('BBEH boolean_expressions \\u2014 Score Progression (Multi-Objective)', fontsize=14)\n", + "ax1.grid(True, alpha=0.3)\n", + "fig.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"Graph 1: Dual-axis score progression.\")\n", + "print(\" Left axis (blue): mean accuracy per step (higher is better).\")\n", + "print(\" Right axis (orange): mean execution time per step (lower is better).\")\n", + "print(\" Step 0 = baseline, intermediate = training, last = final eval.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-graph-scatter", + "metadata": {}, + "outputs": [], + "source": [ + "# =====================================================================\n", + "# Graph 2: Scatter — Accuracy vs Execution Time (baseline vs final)\n", + "# Each point = one validation example.\n", + "# Jitter on accuracy axis to separate overlapping correct/incorrect dots.\n", + "# =====================================================================\n", + "\n", + "baseline_entries = [m for m in metric_log if m[\"phase\"] == \"baseline\"]\n", + "final_entries = [m for m in metric_log if m[\"phase\"] == \"final\"]\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "\n", + "rng = np.random.RandomState(42)\n", + "\n", + "if baseline_entries:\n", + " b_acc = np.array([e[\"accuracy\"] for e in baseline_entries])\n", + " b_time = np.array([e[\"execution_time_s\"] for e in baseline_entries])\n", + " jitter_b = rng.uniform(-0.03, 0.03, size=len(b_acc))\n", + " ax.scatter(b_time, b_acc + 
jitter_b, marker='o', c='#1f77b4', s=100,\n", + " edgecolors='black', linewidths=0.8, alpha=0.7, label='Baseline', zorder=5)\n", + "\n", + "if final_entries:\n", + " f_acc = np.array([e[\"accuracy\"] for e in final_entries])\n", + " f_time = np.array([e[\"execution_time_s\"] for e in final_entries])\n", + " jitter_f = rng.uniform(-0.03, 0.03, size=len(f_acc))\n", + " ax.scatter(f_time, f_acc + jitter_f, marker='^', c='#2ca02c', s=120,\n", + " edgecolors='black', linewidths=0.8, alpha=0.7, label='After Training', zorder=5)\n", + "\n", + "ax.set_xlabel('Execution Time (seconds)', fontsize=12)\n", + "ax.set_ylabel('Accuracy (1=correct, 0=wrong)', fontsize=12)\n", + "ax.set_title('BBEH boolean_expressions \\u2014 Accuracy vs Execution Time', fontsize=14)\n", + "ax.set_ylim(-0.15, 1.15)\n", + "ax.set_yticks([0.0, 0.5, 1.0])\n", + "ax.legend(fontsize=11)\n", + "ax.grid(True, alpha=0.3)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"Graph 2: Each point is one validation example.\")\n", + "print(\" Ideal: top-left (high accuracy, low execution time).\")\n", + "print(\" Blue circles = baseline, green triangles = after curriculum training.\")\n", + "print(\" Small jitter on y-axis to separate overlapping points.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-summary-table", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# =====================================================================\n", + "# Summary Table: Baseline vs Final metrics\n", + "# =====================================================================\n", + "\n", + "baseline_entries = [m for m in metric_log if m[\"phase\"] == \"baseline\"]\n", + "final_entries = [m for m in metric_log if m[\"phase\"] == \"final\"]\n", + "\n", + "def agg_metrics(entries):\n", + " if not entries:\n", + " return {\"accuracy\": \"N/A\", \"exec_time_s (mean)\": \"N/A\",\n", + " \"exec_time_s (max)\": \"N/A\", \"n_examples\": 0}\n", + " accs = [e[\"accuracy\"] for e in entries]\n", + " times = [e[\"execution_time_s\"] for e in entries]\n", + " return {\n", + " \"accuracy\": f\"{np.mean(accs):.3f}\",\n", + " \"exec_time_s (mean)\": f\"{np.mean(times):.4f}\",\n", + " \"exec_time_s (max)\": f\"{np.max(times):.4f}\",\n", + " \"n_examples\": len(entries),\n", + " }\n", + "\n", + "rows = [\n", + " {\"Phase\": \"Baseline (val)\", **agg_metrics(baseline_entries)},\n", + " {\"Phase\": \"After Training (val)\", **agg_metrics(final_entries)},\n", + "]\n", + "\n", + "# Also add training metrics if available\n", + "train_pre = [m for m in metric_log if m[\"phase\"] == \"train_pre\"]\n", + "train_post = [m for m in metric_log if m[\"phase\"] == \"train_post\"]\n", + "if train_pre:\n", + " rows.append({\"Phase\": \"Train pre-opt (all)\", **agg_metrics(train_pre)})\n", + "if train_post:\n", + " rows.append({\"Phase\": \"Train post-opt (failed)\", **agg_metrics(train_post)})\n", + "\n", + "df = pd.DataFrame(rows)\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"SUMMARY: Multi-Objective Metrics\")\n", + "print(\"=\" * 70)\n", + "print(df.to_string(index=False))\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"M2 BBEH NOTEBOOK COMPLETE\")\n", + "print(\"=\" * 70)\n", + "print(\"\"\"\n", + "Multi-Objective Instrumentation:\n", + " - execute_code() measures wall-clock time via time.perf_counter()\n", + " - LangGraphGuide.get_score_dict() returns {accuracy, execution_time_s}\n", + " - Per-example metrics collected during baseline, training, and final 
eval\n", + "\n", + "Graphs:\n", + " Graph 1: Dual-axis score progression (accuracy + exec time over steps)\n", + " Graph 2: Accuracy vs execution time scatter (baseline vs final)\n", + " Summary: Aggregate metrics table\n", + "\n", + "This demonstrates the M2 multi-objective infrastructure on a real LLM task.\n", + "The same get_score_dict() interface works with BasicSearch, BeamsearchAlgorithm,\n", + "and PrioritySearch (see t6_m2_trainers.ipynb for those algorithms).\n", + "\"\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From d16ef41e00c2cb6d321b22daf7b1f1ec0a4f12e5 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Fri, 20 Feb 2026 11:10:40 -0400 Subject: [PATCH 18/20] fix: BBEH notebook Colab setup cell --- examples/notebooks/t6_m2_bbeh.ipynb | 36 +---------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/examples/notebooks/t6_m2_bbeh.ipynb b/examples/notebooks/t6_m2_bbeh.ipynb index 3e2329d5..8ff311e6 100644 --- a/examples/notebooks/t6_m2_bbeh.ipynb +++ b/examples/notebooks/t6_m2_bbeh.ipynb @@ -40,41 +40,7 @@ "id": "cell-setup", "metadata": {}, "outputs": [], - "source": [ - "import os, sys\n", - "\n", - "if IN_COLAB:\n", - " if not os.path.exists('/content/Trace'):\n", - " print(\"Setting up Trace...\")\n", - " %pip install langgraph langchain langchain_openai datasets tqdm langchain_community litellm dspy black matplotlib pandas\n", - " %git clone https://github.com/AgentOpt/OpenTrace.git Trace\n", - " %cd Trace\n", - " %git pull origin experimental && git checkout experimental\n", - " %sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n", - " %pip install -e .\n", - " sys.path.append('/content/Trace')\n", - "else:\n", - " # Local: add repo root to sys.path\n", - " _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n", - " _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n", - " if _repo_root not in sys.path:\n", - " sys.path.insert(0, _repo_root)\n", - "\n", - "# Clone BBEH benchmark tasks\n", - "if not os.path.exists('bbeh'):\n", - " !git clone https://github.com/google-deepmind/bbeh.git\n", - "else:\n", - " print(\"bbeh/ already exists, skipping clone.\")\n", - "\n", - "# Soft-import display\n", - "try:\n", - " from IPython.display import display\n", - "except Exception:\n", - " def display(*args, **kwargs):\n", - " return None\n", - "\n", - "print(f\"{IN_COLAB=}\")" - ] + "source": "import os, sys, subprocess\n\nif IN_COLAB:\n if not os.path.exists('/content/Trace'):\n print(\"Setting up Trace...\")\n !pip install langgraph langchain langchain_openai datasets tqdm langchain_community litellm dspy black matplotlib pandas\n !git clone https://github.com/AgentOpt/OpenTrace.git /content/Trace\n %cd /content/Trace\n !git pull origin experimental && git checkout experimental\n !sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n !pip install -e .\n sys.path.append('/content/Trace')\nelse:\n # Local: add repo root to sys.path\n _nb_dir = os.path.dirname(os.path.abspath(\"__file__\"))\n _repo_root = os.path.abspath(os.path.join(_nb_dir, \"..\", \"..\"))\n if _repo_root not in sys.path:\n sys.path.insert(0, _repo_root)\n\n# Clone BBEH benchmark tasks\nif not os.path.exists('bbeh'):\n !git clone https://github.com/google-deepmind/bbeh.git\nelse:\n 
print(\"bbeh/ already exists, skipping clone.\")\n\n# Soft-import display\ntry:\n from IPython.display import display\nexcept Exception:\n def display(*args, **kwargs):\n return None\n\nprint(f\"{IN_COLAB=}\")" }, { "cell_type": "code", From 5c1b7e3c594bebc894fd9e7f213a6f3e9c95e3bc Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Fri, 20 Feb 2026 11:24:52 -0400 Subject: [PATCH 19/20] fix: BBEH notebook measures end-to-end graph time (LLM + exec) not just exec() --- examples/notebooks/t6_m2_bbeh.ipynb | 129 +--------------------------- 1 file changed, 2 insertions(+), 127 deletions(-) diff --git a/examples/notebooks/t6_m2_bbeh.ipynb b/examples/notebooks/t6_m2_bbeh.ipynb index 8ff311e6..5d202b27 100644 --- a/examples/notebooks/t6_m2_bbeh.ipynb +++ b/examples/notebooks/t6_m2_bbeh.ipynb @@ -4,27 +4,7 @@ "cell_type": "markdown", "id": "cell-title", "metadata": {}, - "source": [ - "# T6 M2 — BBEH Boolean Expressions with Multi-Objective Instrumentation\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/pull/61/head/examples/notebooks/t6_m2_bbeh.ipynb)\n", - "\n", - "**Milestone 2 Deliverable** — Multi-objective scoring on a real LLM task\n", - "\n", - "This notebook demonstrates multi-objective optimization on the **BBEH boolean_expressions** benchmark\n", - "using the **PAL (Program-Aided Language model)** strategy from Xavier’s original experiment.\n", - "\n", - "Two objectives are tracked:\n", - "- **accuracy** (binary: 1.0 = correct, 0.0 = wrong)\n", - "- **execution_time_s** (wall-clock seconds for the generated Python code)\n", - "\n", - "The `LangGraphGuide.get_score_dict()` method returns both metrics per example,\n", - "enabling the M2 multi-objective infrastructure to track and visualize tradeoffs.\n", - "\n", - "**Requires a real LLM API key** (OpenRouter recommended, default model: `openai/gpt-5-nano`).\n", - "\n", - "---" - ] + "source": "# T6 M2 — BBEH Boolean Expressions with Multi-Objective Instrumentation\n\n[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/pull/61/head/examples/notebooks/t6_m2_bbeh.ipynb)\n\n**Milestone 2 Deliverable** — Multi-objective scoring on a real LLM task\n\nThis notebook demonstrates multi-objective optimization on the **BBEH boolean_expressions** benchmark\nusing the **PAL (Program-Aided Language model)** strategy from Xavier's original experiment.\n\nTwo objectives are tracked:\n- **accuracy** (binary: 1.0 = correct, 0.0 = wrong)\n- **execution_time_s** (end-to-end wall-clock seconds per example: LLM call + code execution)\n\nThe `LangGraphGuide.get_score_dict()` method returns both metrics per example,\nenabling the M2 multi-objective infrastructure to track and visualize tradeoffs.\n\n**Requires a real LLM API key** (OpenRouter recommended, default model: `openai/gpt-5-nano`).\n\n---" }, { "cell_type": "code", @@ -568,112 +548,7 @@ "id": "cell-pal-strategy", "metadata": {}, "outputs": [], - "source": [ - "import re, time\n", - "from langgraph.graph import StateGraph, START, END\n", - "\n", - "# -----------------------\n", - "# Strategy: PAL (Program-Aided Language model)\n", - "# -----------------------\n", - "prompt_parse_problem = node(\n", - " \"Read the problem and write Python code that sets a variable named `result` to the final answer.\\n\"\n", - " \"- Output ONLY valid Python (no markdown fences).\\n\"\n", - " \"- If the task is 
multiple-choice, set result to the option label exactly (e.g., '(A)').\\n\\n\"\n", - " \"Problem:\\n\",\n", - " trainable=True,\n", - " description=\"PAL prompt that generates python code producing a `result`.\"\n", - ")\n", - "\n", - "# Global variable to capture execution time from the graph invocation.\n", - "# This is read by run_solver_on_example() to populate the guide's score_dict.\n", - "_last_exec_time_s = 0.0\n", - "\n", - "def parse_problem(state: dict):\n", - " question = get_no_node(state.get(\"question\", \"\"))\n", - " prompt = prompt_parse_problem + question\n", - " code_str = llm_call(get_no_node(prompt))\n", - " return {\"code\": code_str.strip(), \"question\": question}\n", - "\n", - "def execute_code(state: dict):\n", - " \"\"\"Execute LLM-generated Python code, measuring wall-clock time.\n", - "\n", - " M2 instrumentation: wraps the exec() call with time.perf_counter()\n", - " so execution_time_s is available as a second objective.\n", - " \"\"\"\n", - " global _last_exec_time_s\n", - "\n", - " def strip_python_tags(code: str) -> str:\n", - " return re.sub(\n", - " r'(?s)(?:.*?```(?:python)?\\s*\\n(.*?)(?:\\n```.*)?|(.*))',\n", - " lambda m: m.group(1) if m.group(1) is not None else m.group(2),\n", - " code,\n", - " )\n", - "\n", - " update = {}\n", - " try:\n", - " code_to_run = strip_python_tags(get_no_node(state.get(\"code\", \"\")))\n", - " local_vars = {}\n", - "\n", - " # --- M2: measure wall-clock time for code execution ---\n", - " t0 = time.perf_counter()\n", - " exec(code_to_run, {}, local_vars) # noqa: S102 - intentional PAL strategy\n", - " t1 = time.perf_counter()\n", - " _last_exec_time_s = t1 - t0\n", - "\n", - " local_vars.pop(\"__builtins__\", None)\n", - "\n", - " if \"result\" in local_vars:\n", - " update[\"final_answer\"] = node(local_vars[\"result\"])\n", - " elif len(local_vars) == 1:\n", - " update[\"final_answer\"] = node(next(iter(local_vars.values())))\n", - " else:\n", - " update[\"final_answer\"] = node(None)\n", - "\n", - " except Exception as e:\n", - " _last_exec_time_s = 0.0\n", - " update[\"final_answer\"] = node(None)\n", - " update[\"error\"] = str(e)\n", - "\n", - " return update\n", - "\n", - "def create_graph_solve_with_PAL_Strategy():\n", - " g = StateGraph(dict)\n", - " g.add_node(\"parse\", parse_problem)\n", - " g.add_node(\"calculate\", execute_code)\n", - " g.add_edge(START, \"parse\")\n", - " g.add_edge(\"parse\", \"calculate\")\n", - " g.add_edge(\"calculate\", END)\n", - " return g\n", - "\n", - "def solve_with_PAL_Strategy(problem: str) -> dict:\n", - " global _last_exec_time_s\n", - " _last_exec_time_s = 0.0 # reset before each invocation\n", - "\n", - " g = create_graph_solve_with_PAL_Strategy()\n", - " compiled = g.compile()\n", - "\n", - " if SHOW_MERMAID_GRAPH:\n", - " try:\n", - " from IPython.display import Image, display\n", - " display(Image(compiled.get_graph(xray=1).draw_mermaid_png()))\n", - " except Exception:\n", - " pass\n", - "\n", - " result = compiled.invoke({\"question\": get_no_node(problem)})\n", - " if \"final_answer\" not in result:\n", - " return {\"final_answer\": node(\"No solution found\")}\n", - " if isinstance(result[\"final_answer\"], str):\n", - " return {\"final_answer\": node(result[\"final_answer\"])}\n", - " return result\n", - "\n", - "# Default graph spec\n", - "GRAPH_ROOT = \"solve_with_PAL_Strategy\"\n", - "GRAPH_AGENTS = [\"parse_problem\", \"execute_code\"]\n", - "GRAPH_PROMPTS = [prompt_parse_problem]\n", - "\n", - "print(\"PAL strategy loaded (with execution time 
instrumentation).\")\n", - "print(\"execute_code() measures wall-clock time for exec() via time.perf_counter().\")" - ] + "source": "import re, time\nfrom langgraph.graph import StateGraph, START, END\n\n# -----------------------\n# Strategy: PAL (Program-Aided Language model)\n# -----------------------\nprompt_parse_problem = node(\n \"Read the problem and write Python code that sets a variable named `result` to the final answer.\\n\"\n \"- Output ONLY valid Python (no markdown fences).\\n\"\n \"- If the task is multiple-choice, set result to the option label exactly (e.g., '(A)').\\n\\n\"\n \"Problem:\\n\",\n trainable=True,\n description=\"PAL prompt that generates python code producing a `result`.\"\n)\n\n# Global variable to capture execution time from the graph invocation.\n# This is read by run_solver_on_example() to populate the guide's score_dict.\n_last_exec_time_s = 0.0\n\ndef parse_problem(state: dict):\n question = get_no_node(state.get(\"question\", \"\"))\n prompt = prompt_parse_problem + question\n code_str = llm_call(get_no_node(prompt))\n return {\"code\": code_str.strip(), \"question\": question}\n\ndef execute_code(state: dict):\n \"\"\"Execute LLM-generated Python code.\n\n The PAL strategy: exec() the code produced by the LLM and extract the\n `result` variable as the final answer.\n \"\"\"\n def strip_python_tags(code: str) -> str:\n return re.sub(\n r'(?s)(?:.*?```(?:python)?\\s*\\n(.*?)(?:\\n```.*)?|(.*))',\n lambda m: m.group(1) if m.group(1) is not None else m.group(2),\n code,\n )\n\n update = {}\n try:\n code_to_run = strip_python_tags(get_no_node(state.get(\"code\", \"\")))\n local_vars = {}\n exec(code_to_run, {}, local_vars) # noqa: S102 - intentional PAL strategy\n local_vars.pop(\"__builtins__\", None)\n\n if \"result\" in local_vars:\n update[\"final_answer\"] = node(local_vars[\"result\"])\n elif len(local_vars) == 1:\n update[\"final_answer\"] = node(next(iter(local_vars.values())))\n else:\n update[\"final_answer\"] = node(None)\n\n except Exception as e:\n update[\"final_answer\"] = node(None)\n update[\"error\"] = str(e)\n\n return update\n\ndef create_graph_solve_with_PAL_Strategy():\n g = StateGraph(dict)\n g.add_node(\"parse\", parse_problem)\n g.add_node(\"calculate\", execute_code)\n g.add_edge(START, \"parse\")\n g.add_edge(\"parse\", \"calculate\")\n g.add_edge(\"calculate\", END)\n return g\n\ndef solve_with_PAL_Strategy(problem: str) -> dict:\n global _last_exec_time_s\n _last_exec_time_s = 0.0 # reset before each invocation\n\n g = create_graph_solve_with_PAL_Strategy()\n compiled = g.compile()\n\n if SHOW_MERMAID_GRAPH:\n try:\n from IPython.display import Image, display\n display(Image(compiled.get_graph(xray=1).draw_mermaid_png()))\n except Exception:\n pass\n\n # --- M2: measure end-to-end graph execution time (LLM call + code exec) ---\n t0 = time.perf_counter()\n result = compiled.invoke({\"question\": get_no_node(problem)})\n t1 = time.perf_counter()\n _last_exec_time_s = t1 - t0\n\n if \"final_answer\" not in result:\n return {\"final_answer\": node(\"No solution found\")}\n if isinstance(result[\"final_answer\"], str):\n return {\"final_answer\": node(result[\"final_answer\"])}\n return result\n\n# Default graph spec\nGRAPH_ROOT = \"solve_with_PAL_Strategy\"\nGRAPH_AGENTS = [\"parse_problem\", \"execute_code\"]\nGRAPH_PROMPTS = [prompt_parse_problem]\n\nprint(\"PAL strategy loaded (with end-to-end timing instrumentation).\")\nprint(\"solve_with_PAL_Strategy() measures total graph time (LLM + code exec) via time.perf_counter().\")" 
}, { "cell_type": "code", From b8f502323c9552ca7a29eec889affc6e9f5a7886 Mon Sep 17 00:00:00 2001 From: Jose Carlos Rodriguez Date: Fri, 20 Feb 2026 11:32:17 -0400 Subject: [PATCH 20/20] fix: BBEH notebook resilient to Trace bundle corruption after failed optimization --- examples/notebooks/t6_m2_bbeh.ipynb | 163 +--------------------------- 1 file changed, 1 insertion(+), 162 deletions(-) diff --git a/examples/notebooks/t6_m2_bbeh.ipynb b/examples/notebooks/t6_m2_bbeh.ipynb index 5d202b27..d53632d5 100644 --- a/examples/notebooks/t6_m2_bbeh.ipynb +++ b/examples/notebooks/t6_m2_bbeh.ipynb @@ -662,168 +662,7 @@ "id": "cell-training", "metadata": {}, "outputs": [], - "source": [ - "from typing import List, Dict, Tuple\n", - "import time\n", - "\n", - "# -----------------------\n", - "# Multi-objective instrumented solver + evaluator\n", - "# -----------------------\n", - "\n", - "# Build the guide with multi-objective support\n", - "guide = LangGraphGuide(\n", - " feedback_func=feedback_answer_bbeh,\n", - " answer_key=\"final_answer\",\n", - " allowed_answer_set=allowed_set,\n", - ")\n", - "\n", - "def run_solver_on_example(ex: dict) -> Tuple[bool, str, str, Dict[str, float]]:\n", - " \"\"\"Run solver and return (ok, pred, feedback, score_dict).\n", - "\n", - " score_dict contains {accuracy, execution_time_s} from get_score_dict().\n", - " \"\"\"\n", - " global _last_exec_time_s\n", - " _last_exec_time_s = 0.0\n", - "\n", - " out = solve_with_PAL_Strategy(ex[\"question\"])\n", - " pred = get_no_node(out.get(\"final_answer\"))\n", - " ok, fb = feedback_answer_bbeh(pred, ex[\"solution\"], allowed_set)\n", - "\n", - " # Populate guide's execution time from the global, then get score_dict\n", - " guide._last_execution_time_s = _last_exec_time_s\n", - " score_dict = guide.get_score_dict(ex[\"question\"], out, ex[\"solution\"])\n", - "\n", - " return ok, str(pred), fb, score_dict\n", - "\n", - "def evaluate(examples: List[dict], *, name: str) -> Tuple[float, List[Dict[str, float]]]:\n", - " \"\"\"Evaluate examples, returning (accuracy, list of score_dicts).\"\"\"\n", - " n_ok = 0\n", - " all_score_dicts = []\n", - " for i, ex in enumerate(examples, 1):\n", - " ok, pred, fb, sd = run_solver_on_example(ex)\n", - " n_ok += int(ok)\n", - " all_score_dicts.append(sd)\n", - " print(f\"[{name}] {i:02d}/{len(examples)} ok={ok} pred={pred} \"\n", - " f\"exec_time={sd['execution_time_s']:.4f}s :: {fb}\")\n", - " acc = n_ok / max(1, len(examples))\n", - " mean_time = sum(sd['execution_time_s'] for sd in all_score_dicts) / max(1, len(all_score_dicts))\n", - " print(f\"[{name}] accuracy = {acc:.3f} ({n_ok}/{len(examples)}), mean exec_time = {mean_time:.4f}s\")\n", - " return acc, all_score_dicts\n", - "\n", - "\n", - "# =====================================================================\n", - "# Baseline evaluation\n", - "# =====================================================================\n", - "print(\"=\" * 60)\n", - "print(\"BASELINE evaluation on validation set\")\n", - "print(\"=\" * 60)\n", - "baseline_acc, baseline_score_dicts = evaluate(val_set, name=\"baseline/val\")\n", - "\n", - "# =====================================================================\n", - "# Per-step metric collection during curriculum training\n", - "# =====================================================================\n", - "# Stores {step, phase, accuracy, execution_time_s, example_idx} per observation\n", - "metric_log = []\n", - "step_counter = 0\n", - "\n", - "# Record baseline metrics\n", - "for i, sd in 
enumerate(baseline_score_dicts):\n", - " metric_log.append({\n", - " \"step\": 0,\n", - " \"phase\": \"baseline\",\n", - " \"example_idx\": i,\n", - " **sd,\n", - " })\n", - "\n", - "# =====================================================================\n", - "# Curriculum training (Mode B) with metric collection\n", - "# =====================================================================\n", - "if SKIP_OPTIMIZATION:\n", - " print(\"SKIP_OPTIMIZATION=1 -> skipping optimization/training.\")\n", - "else:\n", - " last_successes: List[dict] = []\n", - "\n", - " for idx, ex in enumerate(train_set, 1):\n", - " step_counter += 1\n", - " ok, pred, fb, sd = run_solver_on_example(ex)\n", - " print(f\"[train] {idx:02d}/{len(train_set)} ok={ok} pred={pred} \"\n", - " f\"exec_time={sd['execution_time_s']:.4f}s :: {fb}\")\n", - "\n", - " # Log pre-optimization metric\n", - " metric_log.append({\n", - " \"step\": step_counter,\n", - " \"phase\": \"train_pre\",\n", - " \"example_idx\": idx - 1,\n", - " **sd,\n", - " })\n", - "\n", - " if ok:\n", - " last_successes.append(ex)\n", - " last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n", - " continue\n", - "\n", - " # Optimize on the failing example\n", - " modified, dump_file, history, chosen_state, run_dir = optimize_langgraph(\n", - " graph_root_function=GRAPH_ROOT,\n", - " graph_agents_functions=GRAPH_AGENTS,\n", - " graph_prompts_list=GRAPH_PROMPTS,\n", - " question=ex[\"question\"],\n", - " solution=ex[\"solution\"],\n", - " answer_feedback_func=feedback_answer_bbeh,\n", - " allowed_answer_set=allowed_set,\n", - " validation_set=last_successes,\n", - " accumulation_steps=ACCUMULATION_STEPS,\n", - " retry=LEARNING_RETRY,\n", - " max_attempts=MAX_ATTEMPTS,\n", - " test_optimization=True,\n", - " stop_on_success=True,\n", - " seed=SEED,\n", - " dump_prefix=f\"BBEH_{BBEH_TASK_NAME}__PAL__\",\n", - " output_folder=OUTPUT_FOLDER,\n", - " )\n", - "\n", - " print(\"[train] optimize_langgraph:\", {\"modified\": modified, \"dump_file\": dump_file, \"run_dir\": run_dir})\n", - " if history:\n", - " print(\"[train] last history entry:\", history[-1])\n", - "\n", - " # Re-test after optimization\n", - " ok2, pred2, fb2, sd2 = run_solver_on_example(ex)\n", - " print(f\"[train] after-opt ok={ok2} pred={pred2} \"\n", - " f\"exec_time={sd2['execution_time_s']:.4f}s :: {fb2}\")\n", - "\n", - " # Log post-optimization metric\n", - " metric_log.append({\n", - " \"step\": step_counter,\n", - " \"phase\": \"train_post\",\n", - " \"example_idx\": idx - 1,\n", - " **sd2,\n", - " })\n", - "\n", - " if ok2:\n", - " last_successes.append(ex)\n", - " last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n", - "\n", - "# =====================================================================\n", - "# Post-training evaluation\n", - "# =====================================================================\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"POST-TRAINING evaluation on validation set\")\n", - "print(\"=\" * 60)\n", - "final_acc, final_score_dicts = evaluate(val_set, name=\"final/val\")\n", - "\n", - "# Record final eval metrics\n", - "step_counter += 1\n", - "for i, sd in enumerate(final_score_dicts):\n", - " metric_log.append({\n", - " \"step\": step_counter,\n", - " \"phase\": \"final\",\n", - " \"example_idx\": i,\n", - " **sd,\n", - " })\n", - "\n", - "print(f\"\\nSummary: baseline_val_acc={baseline_acc:.3f}, final_val_acc={final_acc:.3f}\")\n", - "print(f\"Total metric observations collected: {len(metric_log)}\")" - ] + "source": "from typing import List, 
Dict, Tuple\nimport time\n\n# -----------------------\n# Multi-objective instrumented solver + evaluator\n# -----------------------\n\n# Build the guide with multi-objective support\nguide = LangGraphGuide(\n feedback_func=feedback_answer_bbeh,\n answer_key=\"final_answer\",\n allowed_answer_set=allowed_set,\n)\n\ndef run_solver_on_example(ex: dict) -> Tuple[bool, str, str, Dict[str, float]]:\n \"\"\"Run solver and return (ok, pred, feedback, score_dict).\n\n score_dict contains {accuracy, execution_time_s} from get_score_dict().\n \"\"\"\n global _last_exec_time_s\n _last_exec_time_s = 0.0\n\n out = solve_with_PAL_Strategy(ex[\"question\"])\n pred = get_no_node(out.get(\"final_answer\"))\n ok, fb = feedback_answer_bbeh(pred, ex[\"solution\"], allowed_set)\n\n # Populate guide's execution time from the global, then get score_dict\n guide._last_execution_time_s = _last_exec_time_s\n score_dict = guide.get_score_dict(ex[\"question\"], out, ex[\"solution\"])\n\n return ok, str(pred), fb, score_dict\n\ndef evaluate(examples: List[dict], *, name: str) -> Tuple[float, List[Dict[str, float]]]:\n \"\"\"Evaluate examples, returning (accuracy, list of score_dicts).\"\"\"\n n_ok = 0\n all_score_dicts = []\n for i, ex in enumerate(examples, 1):\n ok, pred, fb, sd = run_solver_on_example(ex)\n n_ok += int(ok)\n all_score_dicts.append(sd)\n print(f\"[{name}] {i:02d}/{len(examples)} ok={ok} pred={pred} \"\n f\"exec_time={sd['execution_time_s']:.4f}s :: {fb}\")\n acc = n_ok / max(1, len(examples))\n mean_time = sum(sd['execution_time_s'] for sd in all_score_dicts) / max(1, len(all_score_dicts))\n print(f\"[{name}] accuracy = {acc:.3f} ({n_ok}/{len(examples)}), mean exec_time = {mean_time:.4f}s\")\n return acc, all_score_dicts\n\n\n# =====================================================================\n# Baseline evaluation\n# =====================================================================\nprint(\"=\" * 60)\nprint(\"BASELINE evaluation on validation set\")\nprint(\"=\" * 60)\nbaseline_acc, baseline_score_dicts = evaluate(val_set, name=\"baseline/val\")\n\n# =====================================================================\n# Per-step metric collection during curriculum training\n# =====================================================================\n# Stores {step, phase, accuracy, execution_time_s, example_idx} per observation\nmetric_log = []\nstep_counter = 0\n\n# Record baseline metrics\nfor i, sd in enumerate(baseline_score_dicts):\n metric_log.append({\n \"step\": 0,\n \"phase\": \"baseline\",\n \"example_idx\": i,\n **sd,\n })\n\n# =====================================================================\n# Curriculum training (Mode B) with metric collection\n# =====================================================================\nif SKIP_OPTIMIZATION:\n print(\"SKIP_OPTIMIZATION=1 -> skipping optimization/training.\")\nelse:\n last_successes: List[dict] = []\n\n for idx, ex in enumerate(train_set, 1):\n step_counter += 1\n ok, pred, fb, sd = run_solver_on_example(ex)\n print(f\"[train] {idx:02d}/{len(train_set)} ok={ok} pred={pred} \"\n f\"exec_time={sd['execution_time_s']:.4f}s :: {fb}\")\n\n # Log pre-optimization metric\n metric_log.append({\n \"step\": step_counter,\n \"phase\": \"train_pre\",\n \"example_idx\": idx - 1,\n **sd,\n })\n\n if ok:\n last_successes.append(ex)\n last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n continue\n\n # Optimize on the failing example\n modified, dump_file, history, chosen_state, run_dir = optimize_langgraph(\n graph_root_function=GRAPH_ROOT,\n 
graph_agents_functions=GRAPH_AGENTS,\n graph_prompts_list=GRAPH_PROMPTS,\n question=ex[\"question\"],\n solution=ex[\"solution\"],\n answer_feedback_func=feedback_answer_bbeh,\n allowed_answer_set=allowed_set,\n validation_set=last_successes,\n accumulation_steps=ACCUMULATION_STEPS,\n retry=LEARNING_RETRY,\n max_attempts=MAX_ATTEMPTS,\n test_optimization=True,\n stop_on_success=True,\n seed=SEED,\n dump_prefix=f\"BBEH_{BBEH_TASK_NAME}__PAL__\",\n output_folder=OUTPUT_FOLDER,\n )\n\n print(\"[train] optimize_langgraph:\", {\"modified\": modified, \"dump_file\": dump_file, \"run_dir\": run_dir})\n if history:\n print(\"[train] last history entry:\", history[-1])\n\n # Re-test after optimization.\n # Wrapped in try/except: when optimization fails to update params,\n # the Trace bundle state can be corrupted (Node objects where dicts\n # are expected), causing ExecutionError in the re-test.\n try:\n ok2, pred2, fb2, sd2 = run_solver_on_example(ex)\n print(f\"[train] after-opt ok={ok2} pred={pred2} \"\n f\"exec_time={sd2['execution_time_s']:.4f}s :: {fb2}\")\n\n # Log post-optimization metric\n metric_log.append({\n \"step\": step_counter,\n \"phase\": \"train_post\",\n \"example_idx\": idx - 1,\n **sd2,\n })\n\n if ok2:\n last_successes.append(ex)\n last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n except Exception as e:\n print(f\"[train] after-opt re-test failed (graph state corrupted): {type(e).__name__}: {e}\")\n print(\"[train] skipping this example and continuing.\")\n\n# =====================================================================\n# Post-training evaluation\n# =====================================================================\nprint(\"\\n\" + \"=\" * 60)\nprint(\"POST-TRAINING evaluation on validation set\")\nprint(\"=\" * 60)\nfinal_acc, final_score_dicts = evaluate(val_set, name=\"final/val\")\n\n# Record final eval metrics\nstep_counter += 1\nfor i, sd in enumerate(final_score_dicts):\n metric_log.append({\n \"step\": step_counter,\n \"phase\": \"final\",\n \"example_idx\": i,\n **sd,\n })\n\nprint(f\"\\nSummary: baseline_val_acc={baseline_acc:.3f}, final_val_acc={final_acc:.3f}\")\nprint(f\"Total metric observations collected: {len(metric_log)}\")" }, { "cell_type": "markdown",