diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..198f6d55 --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +# OpenRouter Configuration +# Copy this file to .env and fill in your values +# Get your API key from: https://openrouter.ai/keys + +OPENROUTER_API_KEY=sk-or-v1-your-key-here +OPENROUTER_MODEL=meta-llama/llama-3.1-8b-instruct:free +OPENROUTER_BASE_URL=https://openrouter.ai/api/v1 +USE_STUB_LLM=false diff --git a/.gitignore b/.gitignore index 17aa1eeb..fa6aa449 100644 --- a/.gitignore +++ b/.gitignore @@ -168,4 +168,6 @@ OAI_CONFIG_LIST *.gv.pdf # jupyter book API output -docs/api/* \ No newline at end of file +docs/api/* + +uv.lock \ No newline at end of file diff --git a/01_m1_instrument_and_optimize(5)(1).ipynb b/01_m1_instrument_and_optimize(5)(1).ipynb new file mode 100644 index 00000000..191ee79a --- /dev/null +++ b/01_m1_instrument_and_optimize(5)(1).ipynb @@ -0,0 +1,2072 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "YgoQVioB1KMO" + }, + "source": [ + "# M1: Instrument & Optimize a LangGraph Agent\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/main/examples/notebooks/01_m1_instrument_and_optimize.ipynb)\n", + "\n", + "This notebook demonstrates the **M1 core value proposition**: drop-in OTEL\n", + "instrumentation and end-to-end optimization for any LangGraph agent.\n", + "\n", + "## What this notebook proves\n", + "\n", + "| Gate | Verified |\n", + "|------|----------|\n", + "| `instrument_graph()` wraps a LangGraph with OTEL tracing | Section 4 |\n", + "| `param.*` + `param.*.trainable` attributes on spans | Section 5 |\n", + "| OTLP → TGJ → `ParameterNode` + `MessageNode` | Section 6 |\n", + "| Child spans do NOT break temporal chaining | Section 6 |\n", + "| `apply_updates()` changes prompt templates via bindings | Section 7 |\n", + "| `optimize_graph()` full loop (StubLLM — 
deterministic) | Section 8 |\n", + "| `optimize_graph()` live provider (OpenRouter, guarded) | Section 9 |\n", + "\n", + "## Modes\n", + "\n", + "- **StubLLM mode** (Sections 4-8): runs without any API keys — deterministic, CI-safe.\n", + "- **Live LLM mode** (Section 9): requires `OPENROUTER_API_KEY` via Colab Secrets or `.env`.\n", + "\n", + "## Table of Contents\n", + "\n", + "1. [Install Dependencies](#1-install-dependencies)\n", + "2. [Configuration](#2-configuration)\n", + "3. [Define a Minimal LangGraph](#3-define-a-minimal-langgraph)\n", + "4. [Instrument the Graph (StubLLM)](#4-instrument-the-graph-stubllm)\n", + "5. [Inspect OTLP Spans & param.* Attributes](#5-inspect-otlp-spans--param-attributes)\n", + "6. [OTLP → TGJ → Trace Nodes](#6-otlp--tgj--trace-nodes)\n", + "7. [Bindings & apply_updates()](#7-bindings--apply_updates)\n", + "8. [optimize_graph() — StubLLM End-to-End](#8-optimize_graph--stubllm-end-to-end)\n", + "9. [Live LLM Mode (OpenRouter)](#9-live-llm-mode-openrouter)\n", + "10. [Save Artifacts](#10-save-artifacts)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y0L_AwXi1KMQ" + }, + "source": [ + "---\n", + "## 1. Install Dependencies\n", + "\n", + "Run this cell once to install all required packages." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:58:37.234100Z", + "iopub.status.busy": "2026-02-12T07:58:37.233113Z", + "iopub.status.idle": "2026-02-12T07:58:48.042859Z", + "shell.execute_reply": "2026-02-12T07:58:48.039301Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bjhySi_q1KMQ", + "outputId": "787b1b22-ada0-45e1-c10d-0bc21099726f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "M\tsetup.py\n", + "Already on 'm1-for-upstream'\n", + "Your branch is up to date with 'origin/m1-for-upstream'.\n", + "/content/NewTrace\n", + " Installing build dependencies ... 
\u001b[?25l\u001b[?25hdone\n", + " Checking if build backend supports build_editable ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build editable ... \u001b[?25l\u001b[?25hdone\n", + " Preparing editable metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building editable for trace-opt (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "[INFO] OpenTrace ref: m1-for-upstream\n", + "\n", + "==================================================\n", + "All dependencies installed!\n", + "==================================================\n" + ] + } + ], + "source": [ + "!pip install -q langgraph>=1.0.0 opentelemetry-api>=1.38.0 opentelemetry-sdk>=1.38.0 \\\n", + " python-dotenv>=1.0.0 requests>=2.28.0 typing_extensions>=4.0.0 graphviz>=0.20.1\n", + "\n", + "# Install OpenTrace (the project itself) in editable mode\n", + "# If running on Colab, install from the repo and checkout OPENTRACE_REF\n", + "import os\n", + "try:\n", + " import google.colab # noqa: F401\n", + " IN_COLAB = True\n", + "\n", + " OPENTRACE_FOLDER = \"NewTrace\"\n", + " OPENTRACE_REPO = f\"https://github.com/mjehanzaib999/{OPENTRACE_FOLDER}.git\"\n", + " OPENTRACE_REF = os.environ.get(\"OPENTRACE_REF\", \"m1-for-upstream\")\n", + "\n", + " if not os.path.exists(f\"/content/{OPENTRACE_FOLDER}\"):\n", + " !git clone {OPENTRACE_REPO} /content/{OPENTRACE_FOLDER}\n", + " !git -C /content/{OPENTRACE_FOLDER} checkout {OPENTRACE_REF}\n", + " %cd /content/{OPENTRACE_FOLDER}\n", + " %alias sed sed\n", + " %sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n", + " !pip install -q -e /content/{OPENTRACE_FOLDER}\n", + "\n", + " print(f\"[INFO] OpenTrace ref: {OPENTRACE_REF}\")\n", + "except ImportError:\n", + " IN_COLAB = False\n", + " # Assume local dev: project already installed via pip install -e .\n", + "\n", + "print(\"\\n\" + \"=\" * 50)\n", + "print(\"All dependencies installed!\")\n", + "print(\"=\" * 50)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "id": "puVi4D_Z1KMQ" + }, + "source": [ + "**Persistent output (Colab):** When running on Colab the next cell mounts\n", + "Google Drive so artifacts survive session restarts. Locally they go into\n", + "`./notebook_outputs/`." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:58:48.234683Z", + "iopub.status.busy": "2026-02-12T07:58:48.233679Z", + "iopub.status.idle": "2026-02-12T07:58:48.254178Z", + "shell.execute_reply": "2026-02-12T07:58:48.252166Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7G2vMKKj1KMR", + "outputId": "5c153c06-89f0-4d30-a2ad-3965870c9650" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + "Run folder (Google Drive, OpenTrace): /content/drive/MyDrive/OpenTrace_runs/M1/main/run_20260219_220544\n" + ] + } + ], + "source": [ + "import os\n", + "from datetime import datetime\n", + "\n", + "RUN_FOLDER = None\n", + "try:\n", + " import google.colab\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\", force_remount=False)\n", + " OPENTRACE_REF = os.environ.get(\"OPENTRACE_REF\", \"main\")\n", + " base = f\"/content/drive/MyDrive/OpenTrace_runs/M1/{OPENTRACE_REF}\"\n", + " os.makedirs(base, exist_ok=True)\n", + " RUN_FOLDER = os.path.join(base, f\"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}\")\n", + " os.makedirs(RUN_FOLDER, exist_ok=True)\n", + " print(f\"Run folder (Google Drive, OpenTrace): {RUN_FOLDER}\")\n", + "except Exception:\n", + " RUN_FOLDER = os.path.abspath(os.path.join(os.getcwd(), \"notebook_outputs\", \"m1\"))\n", + " os.makedirs(RUN_FOLDER, exist_ok=True)\n", + " print(f\"Run folder (local): {RUN_FOLDER}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DuBjDelH1KMR" + }, + "source": [ + 
"---\n", + "## 2. Configuration\n", + "\n", + "API keys are retrieved **automatically** — never paste keys into cells:\n", + "\n", + "| Priority | Source | How to set |\n", + "|----------|--------|------------|\n", + "| 1 | **Colab Secrets** | Click the key icon → add `OPENROUTER_API_KEY` |\n", + "| 2 | **Environment variable** | `export OPENROUTER_API_KEY=sk-or-v1-...` |\n", + "| 3 | **`.env` file** | `OPENROUTER_API_KEY=sk-or-v1-...` in project root |\n", + "\n", + "Sections 4-8 use **StubLLM** (no key needed). Section 9 uses a live\n", + "provider and is skipped automatically when no key is available." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:58:48.269399Z", + "iopub.status.busy": "2026-02-12T07:58:48.268397Z", + "iopub.status.idle": "2026-02-12T07:58:48.324887Z", + "shell.execute_reply": "2026-02-12T07:58:48.321207Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CsxUsb3p1KMS", + "outputId": "7fb0d196-c989-4869-e48e-3a31373ff99a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[INFO] API key loaded from Colab Secrets.\n", + "\n", + "API key: [SET]\n", + "Model: meta-llama/llama-3.1-8b-instruct:free\n", + "Budget: max_tokens=256, temperature=0\n" + ] + } + ], + "source": [ + "from __future__ import annotations\n", + "import os, json\n", + "\n", + "# Model config (free tier on OpenRouter)\n", + "OPENROUTER_MODEL = \"meta-llama/llama-3.1-8b-instruct:free\"\n", + "OPENROUTER_BASE_URL = \"https://openrouter.ai/api/v1\"\n", + "\n", + "# Budget guard for live mode\n", + "MAX_TOKENS_PER_CALL = 256\n", + "LIVE_TEMPERATURE = 0 # deterministic\n", + "\n", + "# ---------- key retrieval (Colab Secrets → env → .env file) ----------\n", + "OPENROUTER_API_KEY = \"\"\n", + "\n", + "try:\n", + " from google.colab import userdata\n", + " OPENROUTER_API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", + " if 
OPENROUTER_API_KEY:\n", + " print(\"[INFO] API key loaded from Colab Secrets.\")\n", + "except (ImportError, ModuleNotFoundError):\n", + " pass\n", + "\n", + "if not OPENROUTER_API_KEY:\n", + " OPENROUTER_API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", + " if OPENROUTER_API_KEY:\n", + " print(\"[INFO] API key loaded from environment variable.\")\n", + "\n", + "if not OPENROUTER_API_KEY:\n", + " try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + " OPENROUTER_API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", + " if OPENROUTER_API_KEY:\n", + " print(\"[INFO] API key loaded from .env file.\")\n", + " except ImportError:\n", + " pass\n", + "\n", + "HAS_API_KEY = bool(OPENROUTER_API_KEY)\n", + "os.environ[\"OPENROUTER_API_KEY\"] = OPENROUTER_API_KEY\n", + "\n", + "print(f\"\\nAPI key: {'[SET]' if HAS_API_KEY else '[NOT SET — live mode will be skipped]'}\")\n", + "print(f\"Model: {OPENROUTER_MODEL}\")\n", + "print(f\"Budget: max_tokens={MAX_TOKENS_PER_CALL}, temperature={LIVE_TEMPERATURE}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i1EvhtCN1KMS" + }, + "source": [ + "---\n", + "## 3. Define a Minimal LangGraph\n", + "\n", + "A simple **planner → synthesizer** pipeline. Node functions close over\n", + "`tracing_llm` and `templates` so that `apply_updates()` propagates to\n", + "the next invocation automatically." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:58:48.337340Z", + "iopub.status.busy": "2026-02-12T07:58:48.336340Z", + "iopub.status.idle": "2026-02-12T07:58:55.612322Z", + "shell.execute_reply": "2026-02-12T07:58:55.609666Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "R-wjaWKz1KMS", + "outputId": "fe2e078c-af71-49d2-ea08-b9d422acafc8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Graph builder defined.\n", + " Nodes: planner, executor, web_researcher, wikidata_researcher, synthesizer, evaluator\n", + " DEMO_QUERIES: 3 queries\n" + ] + } + ], + "source": [ + "from typing import Any, Dict, List, Optional\n", + "from typing_extensions import TypedDict, Literal\n", + "from langgraph.graph import StateGraph, START, END\n", + "from langgraph.types import Command\n", + "\n", + "# Keep the notebook aligned with JSON_OTEL_trace_optim_demo_LANGGRAPH.py\n", + "DEMO_QUERIES = [\n", + " \"Summarize the causes and key events of the French Revolution.\",\n", + " \"Give 3 factual relationships about Tesla, Inc. with entity IDs.\",\n", + " \"What is the Wikidata ID for CRISPR and list 2 related entities?\",\n", + "]\n", + "\n", + "class AgentState(TypedDict, total=False):\n", + " query: str\n", + " plan: Dict[str, Any]\n", + " current_step: int\n", + " contexts: List[str]\n", + " agent_query: str\n", + " final_answer: str\n", + " eval_score: float\n", + " eval_feedback: str\n", + "\n", + "def wikipedia_search(query: str) -> str:\n", + " \"\"\"Wikipedia tool. 
Falls back gracefully if wikipedia package/network is unavailable.\"\"\"\n", + " try:\n", + " import wikipedia\n", + " wikipedia.set_lang(\"en\")\n", + " hits = wikipedia.search(query, results=2)\n", + " out = []\n", + " for h in hits:\n", + " try:\n", + " s = wikipedia.summary(h, sentences=3, auto_suggest=False, redirect=True)\n", + " out.append(f\"### {h}\\n{s}\")\n", + " except Exception:\n", + " continue\n", + " return \"\\n\\n\".join(out) or \"No Wikipedia results.\"\n", + " except Exception:\n", + " return \"Wikipedia search unavailable.\"\n", + "\n", + "def wikidata_search(query: str) -> str:\n", + " \"\"\"Wikidata search tool (wbsearchentities).\"\"\"\n", + " import requests\n", + " try:\n", + " r = requests.get(\n", + " \"https://www.wikidata.org/w/api.php\",\n", + " params={\n", + " \"action\": \"wbsearchentities\",\n", + " \"format\": \"json\",\n", + " \"language\": \"en\",\n", + " \"search\": query[:100],\n", + " \"limit\": 5,\n", + " },\n", + " timeout=10,\n", + " )\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " results = [\n", + " f\"- {item.get('label','')}: {item.get('description','')} ({item.get('id','')})\"\n", + " for item in data.get(\"search\", [])\n", + " ]\n", + " return \"\\n\".join(results) if results else \"No Wikidata entities found.\"\n", + " except Exception:\n", + " return f\"Wikidata search unavailable. Query: {query[:50]}...\"\n", + "\n", + "def build_graph(tracing_llm, templates: Dict[str, str]):\n", + " \"\"\"\n", + " Build a multi-node LangGraph aligned with JSON_OTEL_trace_optim_demo_LANGGRAPH.py:\n", + " planner -> executor -> (web_researcher|wikidata_researcher|synthesizer) -> evaluator\n", + " \"\"\"\n", + "\n", + " def planner_node(state: AgentState) -> Command[Literal[\"executor\"]]:\n", + " template = templates.get(\n", + " \"planner_prompt\",\n", + " \"Return JSON plan with steps for query: {query}. 
Use agents: web_researcher, wikidata_researcher, synthesizer.\",\n", + " )\n", + " prompt = template.replace(\"{query}\", state.get(\"query\", \"\"))\n", + "\n", + " raw = tracing_llm.node_call(\n", + " span_name=\"planner\",\n", + " template_name=\"planner_prompt\",\n", + " template=template,\n", + " optimizable_key=\"planner\",\n", + " user_query=state.get(\"query\", \"\"),\n", + " extra_inputs={\"user_query\": state.get(\"query\", \"\")},\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"Return JSON only. Keys: 1,2,... each step has {agent,action,goal,query}.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " max_tokens=400,\n", + " temperature=0,\n", + " )\n", + " plan: Dict[str, Any]\n", + " try:\n", + " import json\n", + " plan = json.loads(raw)\n", + " except Exception:\n", + " q = (state.get(\"query\", \"\") or \"\").lower()\n", + " plan = {\n", + " \"1\": {\"agent\": \"web_researcher\", \"action\": \"search\", \"goal\": \"collect context\", \"query\": state.get(\"query\", \"\")},\n", + " \"2\": {\"agent\": \"wikidata_researcher\" if (\"wikidata\" in q or \"entity id\" in q or \"id\" in q) else \"synthesizer\",\n", + " \"action\": \"search\" if (\"wikidata\" in q or \"entity id\" in q or \"id\" in q) else \"answer\",\n", + " \"goal\": \"entities or final answer\", \"query\": state.get(\"query\", \"\")},\n", + " \"3\": {\"agent\": \"synthesizer\", \"action\": \"answer\", \"goal\": \"final answer\", \"query\": state.get(\"query\", \"\")},\n", + " }\n", + "\n", + " return Command(update={\"plan\": plan, \"current_step\": 1, \"contexts\": []}, goto=\"executor\")\n", + "\n", + " def executor_node(state: AgentState) -> Command[Literal[\"web_researcher\", \"wikidata_researcher\", \"synthesizer\"]]:\n", + " step = int(state.get(\"current_step\", 1) or 1)\n", + " plan = state.get(\"plan\", {}) or {}\n", + " plan_step = plan.get(str(step), {})\n", + " if not plan_step:\n", + " return Command(update={}, goto=\"synthesizer\")\n", + 
"\n", + " template = templates.get(\n", + " \"executor_prompt\",\n", + " \"Given step {step} of plan: {plan_step}\\nFor query: {query}\\nReturn JSON: {goto,query}. goto in [web_researcher,wikidata_researcher,synthesizer].\",\n", + " )\n", + " prompt = (\n", + " template.replace(\"{step}\", str(step))\n", + " .replace(\"{plan_step}\", str(plan_step))\n", + " .replace(\"{query}\", state.get(\"query\", \"\"))\n", + " )\n", + "\n", + " raw = tracing_llm.node_call(\n", + " span_name=\"executor\",\n", + " template_name=\"executor_prompt\",\n", + " template=template,\n", + " optimizable_key=\"executor\",\n", + " user_query=state.get(\"query\", \"\"),\n", + " extra_inputs={\"step\": str(step), \"user_query\": state.get(\"query\", \"\")},\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"Return JSON only with keys goto and query.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " max_tokens=200,\n", + " temperature=0,\n", + " )\n", + "\n", + " goto = str(plan_step.get(\"agent\", \"synthesizer\"))\n", + " q2 = str(plan_step.get(\"query\", state.get(\"query\", \"\")))\n", + " try:\n", + " import json\n", + " d = json.loads(raw)\n", + " goto = str(d.get(\"goto\", goto))\n", + " q2 = str(d.get(\"query\", q2))\n", + " except Exception:\n", + " pass\n", + "\n", + " if goto not in (\"web_researcher\", \"wikidata_researcher\", \"synthesizer\"):\n", + " goto = \"synthesizer\"\n", + "\n", + " return Command(update={\"agent_query\": q2}, goto=goto)\n", + "\n", + " def web_researcher_node(state: AgentState) -> Command[Literal[\"executor\"]]:\n", + " q = state.get(\"agent_query\", state.get(\"query\", \"\"))\n", + " with tracing_llm.tracer.start_as_current_span(\"web_researcher\") as sp:\n", + " sp.set_attribute(\"inputs.user_query\", state.get(\"query\", \"\"))\n", + " sp.set_attribute(\"inputs.agent_query\", q)\n", + " ctx = wikipedia_search(q)\n", + " sp.set_attribute(\"outputs.context.preview\", (ctx or \"\")[:500])\n", + " contexts = 
list(state.get(\"contexts\", []) or [])\n", + " contexts.append(ctx)\n", + " step = int(state.get(\"current_step\", 1) or 1) + 1\n", + " return Command(update={\"contexts\": contexts, \"current_step\": step}, goto=\"executor\")\n", + "\n", + " def wikidata_researcher_node(state: AgentState) -> Command[Literal[\"executor\"]]:\n", + " q = state.get(\"agent_query\", state.get(\"query\", \"\"))\n", + " with tracing_llm.tracer.start_as_current_span(\"wikidata_researcher\") as sp:\n", + " sp.set_attribute(\"inputs.user_query\", state.get(\"query\", \"\"))\n", + " sp.set_attribute(\"inputs.agent_query\", q)\n", + " ctx = wikidata_search(q)\n", + " sp.set_attribute(\"outputs.context.preview\", (ctx or \"\")[:500])\n", + " contexts = list(state.get(\"contexts\", []) or [])\n", + " contexts.append(ctx)\n", + " step = int(state.get(\"current_step\", 1) or 1) + 1\n", + " return Command(update={\"contexts\": contexts, \"current_step\": step}, goto=\"executor\")\n", + "\n", + " def synthesizer_node(state: AgentState) -> Command[Literal[\"evaluator\"]]:\n", + " template = templates.get(\n", + " \"synthesizer_prompt\",\n", + " \"Answer the query: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include them. 
Be factual.\",\n", + " )\n", + " contexts = \"\\n\\n\".join(state.get(\"contexts\", []) or [])\n", + " prompt = template.replace(\"{query}\", state.get(\"query\", \"\")).replace(\"{contexts}\", contexts[:4000])\n", + "\n", + " ans = tracing_llm.node_call(\n", + " span_name=\"synthesizer\",\n", + " template_name=\"synthesizer_prompt\",\n", + " template=template,\n", + " optimizable_key=\"synthesizer\",\n", + " user_query=state.get(\"query\", \"\"),\n", + " extra_inputs={\"user_query\": state.get(\"query\", \"\")},\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a careful assistant.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " max_tokens=500,\n", + " temperature=0,\n", + " )\n", + " return Command(update={\"final_answer\": ans}, goto=\"evaluator\")\n", + "\n", + " def evaluator_node(state: AgentState) -> Command[Literal[\"__end__\"]]:\n", + " import re\n", + " q = (state.get(\"query\", \"\") or \"\").lower()\n", + " ans = (state.get(\"final_answer\", \"\") or \"\")\n", + " ctx = \"\\n\".join(state.get(\"contexts\", []) or \"\")\n", + " wants_ids = (\"wikidata\" in q) or (\"entity id\" in q) or (\"id\" in q and \"tesla\" in q)\n", + " has_qid = bool(re.search(r\"\\bQ\\d{2,}\\b\", ans))\n", + " uses_ctx = len(ctx.strip()) > 0\n", + " score = 0.25\n", + " reasons = []\n", + " if uses_ctx:\n", + " score += 0.25\n", + " reasons.append(\"uses_context\")\n", + " if wants_ids and has_qid:\n", + " score += 0.35\n", + " reasons.append(\"has_qids\")\n", + " if \"french revolution\" in q and len(ans) > 200:\n", + " score += 0.10\n", + " reasons.append(\"sufficient_detail\")\n", + " score = min(score, 0.95)\n", + " feedback = {\"score\": score, \"reasons\": reasons}\n", + " with tracing_llm.tracer.start_as_current_span(\"evaluator\") as sp:\n", + " sp.set_attribute(\"eval.score\", str(score))\n", + " sp.set_attribute(\"eval.reasons\", \",\".join(reasons))\n", + " return Command(update={\"eval_score\": score, \"eval_feedback\": 
str(feedback)}, goto=END)\n", + "\n", + " workflow = StateGraph(AgentState)\n", + " workflow.add_node(\"planner\", planner_node)\n", + " workflow.add_node(\"executor\", executor_node)\n", + " workflow.add_node(\"web_researcher\", web_researcher_node)\n", + " workflow.add_node(\"wikidata_researcher\", wikidata_researcher_node)\n", + " workflow.add_node(\"synthesizer\", synthesizer_node)\n", + " workflow.add_node(\"evaluator\", evaluator_node)\n", + "\n", + " workflow.add_edge(START, \"planner\")\n", + " workflow.add_edge(\"synthesizer\", \"evaluator\")\n", + "\n", + " return workflow.compile()\n", + "\n", + "print(\"Graph builder defined.\")\n", + "print(f\" Nodes: planner, executor, web_researcher, wikidata_researcher, synthesizer, evaluator\")\n", + "print(f\" DEMO_QUERIES: {len(DEMO_QUERIES)} queries\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JNuBgcbM1KMT" + }, + "source": [ + "### StubLLM\n", + "\n", + "A deterministic LLM that returns canned responses (no API calls)." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:58:55.622865Z", + "iopub.status.busy": "2026-02-12T07:58:55.621865Z", + "iopub.status.idle": "2026-02-12T07:58:55.641281Z", + "shell.execute_reply": "2026-02-12T07:58:55.639271Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FbFIkFCh1KMT", + "outputId": "efeb191f-09fb-4b36-99e1-95f1dd31dff8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "StubLLM ready (multi-node graph aware, prompt-template-sensitive).\n" + ] + } + ], + "source": [ + "class StubLLM:\n", + " \"\"\"Deterministic LLM stub for the multi-node graph.\n", + "\n", + " Produces JSON plans for planner, routing JSON for executor,\n", + " and text answers for synthesizer. 
When the prompt template includes\n", + " optimization signals (\"step-by-step\", \"thorough\"), the stub produces\n", + " richer plans and more detailed answers so eval_fn returns a higher\n", + " score — demonstrating non-saturating optimization.\n", + " \"\"\"\n", + " model = \"stub-llm\"\n", + "\n", + " def __init__(self):\n", + " self.call_count = 0\n", + "\n", + " def __call__(self, messages=None, **kwargs):\n", + " self.call_count += 1\n", + " import json as _json\n", + "\n", + " content = f\"Stub response #{self.call_count}\"\n", + " if messages:\n", + " user_text = \"\"\n", + " system_text = \"\"\n", + " for m in messages:\n", + " if m.get(\"role\") == \"user\":\n", + " user_text = (m.get(\"content\") or \"\").lower()\n", + " elif m.get(\"role\") == \"system\":\n", + " system_text = (m.get(\"content\") or \"\").lower()\n", + "\n", + " # Detect if the prompt template has been optimized\n", + " is_enhanced = any(kw in user_text for kw in (\"step-by-step\", \"thorough\", \"detailed\"))\n", + "\n", + " if \"return json only\" in system_text and \"step\" in system_text:\n", + " # Planner: return a JSON plan\n", + " q = user_text\n", + " if is_enhanced:\n", + " # Optimized prompt -> richer plan with more steps\n", + " steps = {\n", + " \"1\": {\"agent\": \"web_researcher\", \"action\": \"search\", \"goal\": \"gather primary context\", \"query\": user_text[:80]},\n", + " \"2\": {\"agent\": \"wikidata_researcher\", \"action\": \"search\", \"goal\": \"find entity IDs\", \"query\": user_text[:80]},\n", + " \"3\": {\"agent\": \"web_researcher\", \"action\": \"verify\", \"goal\": \"cross-check facts\", \"query\": user_text[:80]},\n", + " \"4\": {\"agent\": \"synthesizer\", \"action\": \"answer\", \"goal\": \"produce comprehensive answer\", \"query\": user_text[:80]},\n", + " }\n", + " else:\n", + " # Baseline prompt -> simpler plan\n", + " steps = {\n", + " \"1\": {\"agent\": \"web_researcher\", \"action\": \"search\", \"goal\": \"collect context\", \"query\": 
user_text[:80]},\n", + " \"2\": {\"agent\": \"synthesizer\", \"action\": \"answer\", \"goal\": \"final answer\", \"query\": user_text[:80]},\n", + " }\n", + " content = _json.dumps(steps)\n", + "\n", + " elif \"return json only\" in system_text and \"goto\" in system_text:\n", + " # Executor: return routing JSON\n", + " content = _json.dumps({\"goto\": \"synthesizer\", \"query\": user_text[:80]})\n", + "\n", + " elif \"careful assistant\" in system_text:\n", + " # Synthesizer: return a text answer\n", + " if \"french revolution\" in user_text:\n", + " content = (\n", + " \"The French Revolution (1789-1799) was caused by fiscal crisis, social inequality, \"\n", + " \"and Enlightenment ideas. Key events include the Storming of the Bastille (July 14, 1789), \"\n", + " \"the Declaration of the Rights of Man, the Reign of Terror, and Napoleon's rise to power.\"\n", + " )\n", + " elif \"tesla\" in user_text:\n", + " content = (\n", + " \"Tesla, Inc. (Q478214) is an American electric vehicle manufacturer. \"\n", + " \"Key relationships: 1) Founded by Elon Musk (Q317521). \"\n", + " \"2) Headquartered in Austin, Texas (Q16559). \"\n", + " \"3) Produces the Model S, Model 3, Model X, and Model Y vehicles.\"\n", + " )\n", + " elif \"crispr\" in user_text:\n", + " content = (\n", + " \"CRISPR (Q22328579) is a gene-editing technology. \"\n", + " \"Related entities: 1) Cas9 protein (Q24721710) - the molecular scissors. 
\"\n", + " \"2) Jennifer Doudna (Q467958) - co-discoverer of CRISPR-Cas9.\"\n", + " )\n", + " else:\n", + " content = f\"Based on the collected context, here is a comprehensive answer about the topic.\"\n", + " else:\n", + " content = f\"Stub response #{self.call_count}: Generic LLM output.\"\n", + "\n", + " class _Msg:\n", + " pass\n", + " msg = _Msg()\n", + " msg.content = content\n", + " class _Choice:\n", + " pass\n", + " choice = _Choice()\n", + " choice.message = msg\n", + " class _Resp:\n", + " pass\n", + " resp = _Resp()\n", + " resp.choices = [choice]\n", + " return resp\n", + "\n", + "stub_llm = StubLLM()\n", + "print(\"StubLLM ready (multi-node graph aware, prompt-template-sensitive).\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uX7xCbP21KMT" + }, + "source": [ + "---\n", + "## 4. Instrument the Graph (StubLLM)\n", + "\n", + "One function call — `instrument_graph()` — wraps the LangGraph with full\n", + "OTEL tracing, creates a `TelemetrySession`, and sets up `Binding` objects\n", + "that map `param.*` keys to the live template dict." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:58:55.651617Z", + "iopub.status.busy": "2026-02-12T07:58:55.650609Z", + "iopub.status.idle": "2026-02-12T07:59:07.295195Z", + "shell.execute_reply": "2026-02-12T07:59:07.294185Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wLBpRJmc1KMT", + "outputId": "fe0458cd-ebe6-4b0e-f570-e48634770cb3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Instrumented graph ready.\n", + " Templates: ['executor_prompt', 'planner_prompt', 'synthesizer_prompt']\n", + " Bindings: ['executor_prompt', 'planner_prompt', 'synthesizer_prompt']\n", + " output_key: final_answer\n" + ] + } + ], + "source": [ + "from opto.trace.io import instrument_graph, apply_updates\n", + "\n", + "INITIAL_TEMPLATES = {\n", + " \"planner_prompt\": \"Create a JSON plan for: {query}. Use web_researcher and synthesizer; include wikidata_researcher if IDs are requested.\",\n", + " \"executor_prompt\": \"Given step {step} of plan: {plan_step} for query: {query}. 
Return JSON {goto,query}.\",\n", + " \"synthesizer_prompt\": \"Answer: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include Wikidata QIDs.\",\n", + "}\n", + "\n", + "ig = instrument_graph(\n", + " graph=None,\n", + " service_name=\"m1-notebook\",\n", + " trainable_keys={\"planner\", \"executor\", \"synthesizer\"},\n", + " llm=stub_llm,\n", + " initial_templates=INITIAL_TEMPLATES,\n", + " emit_genai_child_spans=True,\n", + " provider_name=\"stub\",\n", + " llm_span_name=\"llm.chat.completion\",\n", + " input_key=\"query\",\n", + " output_key=\"final_answer\",\n", + ")\n", + "\n", + "# Build and attach the graph (node funcs close over tracing_llm + templates)\n", + "ig.graph = build_graph(ig.tracing_llm, ig.templates)\n", + "\n", + "print(\"Instrumented graph ready.\")\n", + "print(f\" Templates: {sorted(ig.templates.keys())}\")\n", + "print(f\" Bindings: {sorted(ig.bindings.keys())}\")\n", + "print(f\" output_key: {ig.output_key}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.302370Z", + "iopub.status.busy": "2026-02-12T07:59:07.301358Z", + "iopub.status.idle": "2026-02-12T07:59:07.321120Z", + "shell.execute_reply": "2026-02-12T07:59:07.320110Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FvQBzbGZ1KMT", + "outputId": "32d2f61c-f122-44d7-ebc0-76ff30a6150e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Result keys: ['agent_query', 'contexts', 'current_step', 'eval_feedback', 'eval_score', 'final_answer', 'plan', 'query']\n", + "\n", + "Final answer (79 chars):\n", + " Based on the collected context, here is a comprehensive answer about the topic.\n", + "\n", + "Plan:\n", + " {\n", + " \"1\": {\n", + " \"agent\": \"web_researcher\",\n", + " \"action\": \"search\",\n", + " \"goal\": \"collect context\",\n", + " \"query\": \"create a json plan for: what is reinforcement learning?. 
use web_researcher and \"\n", + " },\n", + " \"2\": {\n", + " \"agent\": \"synthesizer\",\n", + " \"action\": \"answer\",\n", + " \"goal\": \"final answer\",\n", + " \"query\": \n", + "\n", + "Contexts collected: 0\n", + "Eval score: 0.25\n" + ] + } + ], + "source": [ + "# --- Single invocation ---\n", + "result = ig.invoke({\"query\": \"What is reinforcement learning?\"})\n", + "\n", + "print(\"Result keys:\", sorted(result.keys()))\n", + "ans_len = len(str(result.get('final_answer', '')))\n", + "print(f\"\\nFinal answer ({ans_len} chars):\")\n", + "print(f\" {str(result.get('final_answer', '(none)'))[:300]}\")\n", + "print(f\"\\nPlan:\")\n", + "import json as _json\n", + "try:\n", + " print(f\" {_json.dumps(result.get('plan', {}), indent=2)[:300]}\")\n", + "except Exception:\n", + " print(f\" {str(result.get('plan', '(none)'))[:300]}\")\n", + "print(f\"\\nContexts collected: {len(result.get('contexts', []))}\")\n", + "print(f\"Eval score: {result.get('eval_score', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zCV8nYAd1KMT" + }, + "source": [ + "---\n", + "## 5. Inspect OTLP Spans & `param.*` Attributes\n", + "\n", + "After invocation the `TelemetrySession` holds all captured OTEL spans.\n", + "`flush_otlp()` exports them as an OTLP JSON payload." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.329697Z", + "iopub.status.busy": "2026-02-12T07:59:07.328119Z", + "iopub.status.idle": "2026-02-12T07:59:07.342552Z", + "shell.execute_reply": "2026-02-12T07:59:07.341545Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TbVDqsVQ1KMT", + "outputId": "6f1ba3d5-ad17-4643-aa62-7214d445c136" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total spans captured: 8\n", + "\n", + "Unique trace IDs: 1 (D9: should be 1)\n", + "Root invocation span: m1-notebook.invoke (id=e7c67eee901a...)\n", + "\n", + " Span: llm.chat.completion parent=12073348\n", + " gen_ai.operation.name = chat\n", + " gen_ai.output.preview = {\"1\": {\"agent\": \"web_researcher\", \"action\": \"search\", \"goal\": \"collect context\",\n", + " gen_ai.provider.name = stub\n", + " gen_ai.request.model = stub-llm\n", + " trace.temporal_ignore = true\n", + "\n", + " Span: planner parent=e7c67eee\n", + " gen_ai.model = stub-llm\n", + " inputs.gen_ai.prompt = Create a JSON plan for: What is reinforcement learning?. Use web_researcher and \n", + " inputs.user_query = What is reinforcement learning?\n", + " param.planner_prompt = Create a JSON plan for: {query}. 
Use web_researcher and synthesizer; include wik\n", + " param.planner_prompt.trainable = True\n", + "\n", + " Span: llm.chat.completion parent=fe7b1f50\n", + " gen_ai.operation.name = chat\n", + " gen_ai.output.preview = {\"goto\": \"synthesizer\", \"query\": \"given step 1 of plan: {'agent': 'web_researche\n", + " gen_ai.provider.name = stub\n", + " gen_ai.request.model = stub-llm\n", + " trace.temporal_ignore = true\n", + "\n", + " Span: executor parent=e7c67eee\n", + " gen_ai.model = stub-llm\n", + " inputs.gen_ai.prompt = Given step 1 of plan: {'agent': 'web_researcher', 'action': 'search', 'goal': 'c\n", + " inputs.step = 1\n", + " inputs.user_query = What is reinforcement learning?\n", + " param.executor_prompt = Given step {step} of plan: {plan_step} for query: {query}. Return JSON {goto,que\n", + " param.executor_prompt.trainable = True\n", + "\n", + " Span: llm.chat.completion parent=711b2caa\n", + " gen_ai.operation.name = chat\n", + " gen_ai.output.preview = Based on the collected context, here is a comprehensive answer about the topic.\n", + " gen_ai.provider.name = stub\n", + " gen_ai.request.model = stub-llm\n", + " trace.temporal_ignore = true\n", + "\n", + " Span: synthesizer parent=e7c67eee\n", + " gen_ai.model = stub-llm\n", + " inputs.gen_ai.prompt = Answer: What is reinforcement learning?\n", + "Context:\n", + "\n", + "If asked for IDs, include Wiki\n", + " inputs.user_query = What is reinforcement learning?\n", + " param.synthesizer_prompt = Answer: {query}\n", + "Context:\n", + "{contexts}\n", + "If asked for IDs, include Wikidata QIDs.\n", + " param.synthesizer_prompt.trainable = True\n", + "\n", + " Span: evaluator parent=e7c67eee\n", + " eval.reasons = \n", + " eval.score = 0.25\n", + "\n", + " Span: m1-notebook.invoke parent=\n", + "\n" + ] + } + ], + "source": [ + "otlp = ig.session.flush_otlp(clear=True)\n", + "\n", + "spans = otlp[\"resourceSpans\"][0][\"scopeSpans\"][0][\"spans\"]\n", + "print(f\"Total spans captured: 
{len(spans)}\\n\")\n", + "\n", + "# D9: Verify single trace ID per invocation\n", + "trace_ids = {s[\"traceId\"] for s in spans}\n", + "print(f\"Unique trace IDs: {len(trace_ids)} (D9: should be 1)\")\n", + "assert len(trace_ids) == 1, f\"Expected 1 trace ID, got {len(trace_ids)}\"\n", + "\n", + "# D9: Verify root invocation span exists\n", + "root_spans = [s for s in spans if s[\"name\"].endswith(\".invoke\")]\n", + "assert root_spans, \"Missing root invocation span (*.invoke). D9 invariant failed.\"\n", + "root_id = root_spans[0][\"spanId\"]\n", + "print(f\"Root invocation span: {root_spans[0]['name']} (id={root_id[:12]}...)\")\n", + "print()\n", + "\n", + "for sp in spans:\n", + " attrs = {}\n", + " for a in sp.get(\"attributes\", []):\n", + " val = a.get(\"value\", {})\n", + " attrs[a[\"key\"]] = val.get(\"stringValue\", val.get(\"boolValue\", val.get(\"intValue\", \"\")))\n", + " print(f\" Span: {sp['name']:<35} parent={sp.get('parentSpanId','(root)')[:8]}\")\n", + " for k, v in sorted(attrs.items()):\n", + " if k.startswith(\"param.\"):\n", + " print(f\" {k} = {str(v)[:80]}\")\n", + " elif k.startswith(\"gen_ai.\") or k == \"trace.temporal_ignore\":\n", + " print(f\" {k} = {str(v)[:80]}\")\n", + " elif k.startswith(\"eval.\") or k.startswith(\"inputs.\") or k.startswith(\"outputs.\"):\n", + " print(f\" {k} = {str(v)[:80]}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w1CaYiNH1KMT" + }, + "source": [ + "**Checkpoint:** The output above should show:\n", + "- `planner` and `synthesizer` spans with `param.` and `param..trainable = True`\n", + "- Child LLM spans (configurable name, e.g. `llm.chat.completion`) with `gen_ai.*` attributes and `trace.temporal_ignore = true`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G-3Xhr_h1KMT" + }, + "source": [ + "---\n", + "## 6. 
OTLP → TGJ → Trace Nodes\n", + "\n", + "Convert the OTLP payload to **Trace-Graph JSON (TGJ)**, then ingest it\n", + "into `ParameterNode` / `MessageNode` objects — the exact format the\n", + "optimizer consumes." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.350295Z", + "iopub.status.busy": "2026-02-12T07:59:07.349305Z", + "iopub.status.idle": "2026-02-12T07:59:07.369083Z", + "shell.execute_reply": "2026-02-12T07:59:07.367068Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a_wrw0kb1KMU", + "outputId": "77adc6dd-d155-4e6d-b9c5-ea28800a8efc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "TGJ documents: 1\n", + "\n", + "ParameterNode (trainable): 3\n", + " m1-notebook/0/planner_prompt7 trainable=True\n", + " m1-notebook/0/executor_prompt7 trainable=True\n", + " m1-notebook/0/synthesizer_prompt7 trainable=True\n", + "\n", + "Unique trainable params: ['executor_prompt7', 'planner_prompt7', 'synthesizer_prompt7']\n", + "[OK] No duplicate ParameterNodes (C7).\n", + "\n", + "MessageNode: 7\n", + " m1-notebook/0/planner7 parents=['lit_4812', 'lit_60399', 'planner_prompt7']\n", + " m1-notebook/0/llm.chat.completion21 parents=['planner7']\n", + " m1-notebook/0/llm.chat.completion23 parents=['synthesizer7']\n", + " m1-notebook/0/executor7 parents=['lit_41842', 'lit_41527', 'lit_603910', 'planner7', 'executor_prompt7']\n", + " m1-notebook/0/llm.chat.completion22 parents=['executor7']\n", + " m1-notebook/0/synthesizer7 parents=['lit_29423', 'lit_603911', 'executor7', 'synthesizer_prompt7']\n", + " m1-notebook/0/evaluator7 parents=['synthesizer7']\n", + "[WARN] No top-level message nodes found.\n" + ] + } + ], + "source": [ + "from opto.trace.io import otlp_traces_to_trace_json, ingest_tgj\n", + "from opto.trace.nodes import ParameterNode, MessageNode\n", + "\n", + "# Re-invoke so we have fresh spans for this section\n", + 
"ig.invoke({\"query\": DEMO_QUERIES[0]})\n", + "otlp = ig.session.flush_otlp(clear=True)\n", + "\n", + "# --- OTLP -> TGJ ---\n", + "docs = otlp_traces_to_trace_json(\n", + " otlp,\n", + " agent_id_hint=\"m1-notebook\",\n", + " use_temporal_hierarchy=True,\n", + ")\n", + "print(f\"TGJ documents: {len(docs)}\")\n", + "\n", + "# --- TGJ -> Trace Nodes ---\n", + "nodes = ingest_tgj(docs[0])\n", + "\n", + "# ingest_tgj stores each node under both its ID and name key,\n", + "# so deduplicate by object identity when iterating values.\n", + "param_nodes = list({id(n): n for n in nodes.values()\n", + " if isinstance(n, ParameterNode) and n.trainable}.values())\n", + "msg_nodes = list({id(n): n for n in nodes.values()\n", + " if isinstance(n, MessageNode)}.values())\n", + "\n", + "print(f\"\\nParameterNode (trainable): {len(param_nodes)}\")\n", + "for p in param_nodes:\n", + " print(f\" {p.py_name} trainable={p.trainable}\")\n", + "\n", + "# C7: Verify unique trainable param count == expected template keys\n", + "unique_param_names = set()\n", + "for p in param_nodes:\n", + " name = p.py_name.split(\"/\")[-1] if \"/\" in p.py_name else p.py_name\n", + " unique_param_names.add(name)\n", + "print(f\"\\nUnique trainable params: {sorted(unique_param_names)}\")\n", + "\n", + "assert len(unique_param_names) == len(param_nodes), \\\n", + " f\"Duplicate ParameterNodes: {len(param_nodes)} nodes but {len(unique_param_names)} unique names\"\n", + "print(\"[OK] No duplicate ParameterNodes (C7).\")\n", + "\n", + "print(f\"\\nMessageNode: {len(msg_nodes)}\")\n", + "for m in msg_nodes:\n", + " print(f\" {m.py_name} parents={[p.py_name.split('/')[-1] for p in m.parents]}\")\n", + "\n", + "# C8: Verify output node is a top-level node (not a child LLM span)\n", + "tgj_nodes = docs[0][\"nodes\"]\n", + "top_level_msg = []\n", + "for m in msg_nodes:\n", + " m_name = m.py_name.split(\"/\")[-1] if \"/\" in m.py_name else m.py_name\n", + " for nid, n in tgj_nodes.items():\n", + " if 
n.get(\"kind\") == \"msg\" and n.get(\"name\") == m_name:\n", + " otel_info = (n.get(\"info\") or {}).get(\"otel\", {})\n", + " is_child = str(otel_info.get(\"temporal_ignore\", \"false\")).lower() in (\"true\", \"1\", \"yes\")\n", + " if not is_child:\n", + " top_level_msg.append((m, n))\n", + " break\n", + "\n", + "if top_level_msg:\n", + " output_node, output_tgj = top_level_msg[-1]\n", + " print(f\"\\nOutput node (sink): {output_node.py_name}\")\n", + " print(f\" temporal_ignore=false -> OK (not a child span)\")\n", + " print(\"[OK] Output node is a top-level node (C8).\")\n", + "else:\n", + " print(\"[WARN] No top-level message nodes found.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.375448Z", + "iopub.status.busy": "2026-02-12T07:59:07.374447Z", + "iopub.status.idle": "2026-02-12T07:59:07.387535Z", + "shell.execute_reply": "2026-02-12T07:59:07.386526Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tJVT_y5A1KMU", + "outputId": "8ba71636-cfa2-44da-99f9-269a73b9fecd" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Child LLM spans detected (via temporal_ignore): 3\n", + "Top-level message nodes: 4\n", + " [OK] Node executor temporal parent → 74f1f9f8d28c... (not a child span)\n", + " [OK] Node synthesizer temporal parent → 292009cef08c... (not a child span)\n", + " [OK] Node evaluator temporal parent → dd31d71df559... 
(not a child span)\n", + "\n", + "[OK] Temporal chaining verified — no top-level node points to child spans.\n" + ] + } + ], + "source": [ + "# --- Verify temporal chain: child spans did NOT break chaining ---\n", + "tgj_nodes = docs[0][\"nodes\"]\n", + "\n", + "# Collect child LLM span IDs using trace.temporal_ignore marker (D10)\n", + "llm_span_ids = set()\n", + "for nid, n in tgj_nodes.items():\n", + " otel_info = (n.get(\"info\") or {}).get(\"otel\", {})\n", + " if str(otel_info.get(\"temporal_ignore\", \"false\")).lower() in (\"true\", \"1\", \"yes\"):\n", + " llm_span_ids.add(otel_info.get(\"span_id\"))\n", + "\n", + "print(f\"Child LLM spans detected (via temporal_ignore): {len(llm_span_ids)}\")\n", + "assert len(llm_span_ids) > 0, \"No child LLM spans found — temporal_ignore detection failed.\"\n", + "\n", + "# Check that no top-level node has a temporal parent pointing to a child LLM span\n", + "top_level_nodes = [\n", + " (nid, n) for nid, n in tgj_nodes.items()\n", + " if n.get(\"kind\") == \"msg\"\n", + " and str((n.get(\"info\") or {}).get(\"otel\", {}).get(\"temporal_ignore\", \"false\")).lower() not in (\"true\", \"1\", \"yes\")\n", + "]\n", + "\n", + "print(f\"Top-level message nodes: {len(top_level_nodes)}\")\n", + "clean = True\n", + "for nid, n in top_level_nodes:\n", + " parent_ref = n.get(\"inputs\", {}).get(\"parent\", \"\")\n", + " if parent_ref and \":\" in parent_ref:\n", + " _, ref_id = parent_ref.rsplit(\":\", 1)\n", + " if ref_id in llm_span_ids:\n", + " print(f\" [BUG] Node {n.get('name')} temporal parent points to child LLM span {ref_id[:12]}...\")\n", + " clean = False\n", + " else:\n", + " print(f\" [OK] Node {n.get('name')} temporal parent → {ref_id[:12]}... 
(not a child span)\")\n", + "\n", + "assert clean, \"Temporal parent incorrectly points to a child LLM span!\"\n", + "print(\"\\n[OK] Temporal chaining verified — no top-level node points to child spans.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eO5KdxjF1KMU" + }, + "source": [ + "---\n", + "## 7. Bindings & `apply_updates()`\n", + "\n", + "Bindings map optimizer output keys to live template values.\n", + "`apply_updates()` pushes new values through the bindings so the\n", + "**next** `invoke()` automatically uses the updated prompt." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.394844Z", + "iopub.status.busy": "2026-02-12T07:59:07.394844Z", + "iopub.status.idle": "2026-02-12T07:59:07.406751Z", + "shell.execute_reply": "2026-02-12T07:59:07.404735Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-VdXEdAi1KMU", + "outputId": "645d52e5-26b1-42df-a431-e5b8b9c2f797" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "============================================================\n", + "BEFORE apply_updates\n", + "============================================================\n", + " planner_prompt: 'Create a JSON plan for: {query}. Use web_researcher and synthesizer; include wikidata_researcher if IDs are requested.'\n", + " executor_prompt: 'Given step {step} of plan: {plan_step} for query: {query}. Return JSON {goto,query}.'\n", + " synthesizer_prompt: 'Answer: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include Wikidata QIDs.'\n", + "\n", + "============================================================\n", + "AFTER apply_updates\n", + "============================================================\n", + " planner_prompt: 'Create a detailed, step-by-step plan for: {query}. 
Use web_researcher, wikidata_researcher, synthesizer.'\n", + " executor_prompt: 'Given step {step} of plan: {plan_step} for query: {query}. Return JSON {goto,query}.'\n", + " synthesizer_prompt: 'Answer: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include Wikidata QIDs.'\n", + "\n", + "[OK] Binding → templates propagation verified.\n" + ] + } + ], + "source": [ + "print(\"=\" * 60)\n", + "print(\"BEFORE apply_updates\")\n", + "print(\"=\" * 60)\n", + "for k, b in ig.bindings.items():\n", + " print(f\" {k}: {b.get()!r}\")\n", + "\n", + "# Simulate an optimizer suggesting a new planner prompt\n", + "apply_updates(\n", + " {\"planner_prompt\": \"Create a detailed, step-by-step plan for: {query}. Use web_researcher, wikidata_researcher, synthesizer.\"},\n", + " ig.bindings,\n", + ")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"AFTER apply_updates\")\n", + "print(\"=\" * 60)\n", + "for k, b in ig.bindings.items():\n", + " print(f\" {k}: {b.get()!r}\")\n", + "\n", + "# Verify the change is visible in ig.templates too\n", + "assert \"detailed\" in ig.templates[\"planner_prompt\"]\n", + "print(\"\\n[OK] Binding → templates propagation verified.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.413969Z", + "iopub.status.busy": "2026-02-12T07:59:07.412959Z", + "iopub.status.idle": "2026-02-12T07:59:07.428527Z", + "shell.execute_reply": "2026-02-12T07:59:07.427517Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ws2nrnk31KMU", + "outputId": "fffdb9b0-dfc5-41e4-f6fd-bea70125d94c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "param.planner_prompt in span:\n", + " Create a detailed, step-by-step plan for: {query}. 
Use web_researcher, wikidata_researcher, synthesizer.\n", + "\n", + "[OK] Updated template appears in OTLP span after re-invoke.\n" + ] + } + ], + "source": [ + "# Invoke again and confirm the updated template appears in the OTLP span\n", + "ig.invoke({\"query\": \"test update\"})\n", + "otlp_after = ig.session.flush_otlp(clear=True)\n", + "\n", + "spans_after = otlp_after[\"resourceSpans\"][0][\"scopeSpans\"][0][\"spans\"]\n", + "planner_sp = next(s for s in spans_after if s[\"name\"] == \"planner\")\n", + "planner_attrs = {\n", + " a[\"key\"]: a[\"value\"][\"stringValue\"] for a in planner_sp[\"attributes\"]\n", + "}\n", + "\n", + "print(f\"param.planner_prompt in span:\")\n", + "print(f\" {planner_attrs['param.planner_prompt']}\")\n", + "\n", + "assert \"detailed\" in planner_attrs[\"param.planner_prompt\"]\n", + "print(\"\\n[OK] Updated template appears in OTLP span after re-invoke.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.436041Z", + "iopub.status.busy": "2026-02-12T07:59:07.435043Z", + "iopub.status.idle": "2026-02-12T07:59:07.444869Z", + "shell.execute_reply": "2026-02-12T07:59:07.443860Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "79NfK4K41KMU", + "outputId": "b88c3f12-1a90-41a3-9202-3f4038f18e3b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Templates reset to original values:\n", + " executor_prompt: 'Given step {step} of plan: {plan_step} for query: {query}. Return JSON {goto,query}.'\n", + " planner_prompt: 'Create a JSON plan for: {query}. 
Use web_researcher and synthesizer; include wikidata_researcher if IDs are requested.'\n", + " synthesizer_prompt: 'Answer: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include Wikidata QIDs.'\n" + ] + } + ], + "source": [ + "# Reset templates back to original for the optimization demo\n", + "apply_updates(INITIAL_TEMPLATES, ig.bindings)\n", + "print(\"Templates reset to original values:\")\n", + "for k in sorted(INITIAL_TEMPLATES):\n", + " print(f\" {k}: {ig.templates[k]!r}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yFhz51PW1KMU" + }, + "source": [ + "---\n", + "## 8. `optimize_graph()` — StubLLM End-to-End\n", + "\n", + "Run the full optimization loop with **StubLLM** (deterministic, no API\n", + "calls). This verifies the complete pipeline:\n", + "\n", + "```\n", + "instrument → invoke → flush OTLP → TGJ → ingest → optimizer → apply_updates\n", + "```\n", + "\n", + "We use a simple length-based `eval_fn` and a mock optimizer to\n", + "demonstrate prompt value changes across iterations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.451868Z", + "iopub.status.busy": "2026-02-12T07:59:07.450869Z", + "iopub.status.idle": "2026-02-12T07:59:07.466046Z", + "shell.execute_reply": "2026-02-12T07:59:07.465038Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1NUgjPH91KMU", + "outputId": "76dce824-f2e3-42d2-86af-7e72b915a0f9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mock optimizer and eval_fn ready.\n" + ] + } + ], + "source": [ + "from opto.trace.io import optimize_graph, EvalResult\n", + "\n", + "# ---- Mock optimizer (returns deterministic updates) ----\n", + "class MockOptimizer:\n", + " def __init__(self, param_nodes=None, **kw):\n", + " self.calls = []\n", + " def zero_feedback(self):\n", + " self.calls.append(\"zero_feedback\")\n", + " def backward(self, output_node, feedback_text):\n", + " self.calls.append(\"backward\")\n", + " def step(self):\n", + " self.calls.append(\"step\")\n", + " return {\n", + " \"planner_prompt\": \"Create a thorough, step-by-step JSON plan for: {query}. 
Use web_researcher, wikidata_researcher, synthesizer.\",\n", + " }\n", + "\n", + "# ---- Eval_fn: prefer evaluator score produced by the graph; fallback to structure scoring ----\n", + "def stub_eval_fn(payload):\n", + " result = payload.get(\"result\") or {}\n", + " ans = str(payload.get(\"answer\", \"\") or \"\")\n", + " if ans.strip().startswith(\"[ERROR]\") or not ans.strip():\n", + " return EvalResult(score=0.0, feedback=\"LLM failure/empty answer\")\n", + "\n", + " plan = {}\n", + " if isinstance(result, dict):\n", + " plan = result.get(\"plan\", {}) or {}\n", + " plan_steps = len(list(plan.keys())) if isinstance(plan, dict) else 0\n", + "\n", + " # Score: base + reward plan richness (up to 3 steps) + small reward for length\n", + " score = 0.2 + 0.2 * min(plan_steps, 3) + min(len(ans) / 1200.0, 0.15)\n", + " score = min(score, 0.95)\n", + " return EvalResult(score=score, feedback=f\"plan_steps={plan_steps}, score={score:.2f}\")\n", + "\n", + "print(\"Mock optimizer and eval_fn ready.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.472683Z", + "iopub.status.busy": "2026-02-12T07:59:07.471675Z", + "iopub.status.idle": "2026-02-12T07:59:07.552476Z", + "shell.execute_reply": "2026-02-12T07:59:07.550368Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ozkaDqi51KMU", + "outputId": "bfdda1a4-bb6e-4008-b513-d49544938b19" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "============================================================\n", + "TEMPLATE BEFORE OPTIMIZATION\n", + "============================================================\n", + " planner_prompt: 'Create a JSON plan for: {query}. Use web_researcher and synthesizer; include wikidata_researcher if IDs are requested.'\n", + "\n", + " Running baseline...\n", + " Query 1/3: Summarize the causes and key events of t... 
score=0.7500000000000001\n", + " Query 2/3: Give 3 factual relationships about Tesla... score=0.7500000000000001\n", + " Query 3/3: What is the Wikidata ID for CRISPR and l... score=0.7500000000000001\n", + " Baseline average: 0.7500\n", + " Iteration 1/2...\n", + " Query 1/3: Summarize the causes and key events of t... score=0.7500000000000001\n", + " Query 2/3: Give 3 factual relationships about Tesla... score=0.7500000000000001\n", + " Query 3/3: What is the Wikidata ID for CRISPR and l... score=0.7500000000000001\n", + " Iteration 1 average: 0.7500\n", + " Iteration 2/2...\n", + " Query 1/3: Summarize the causes and key events of t... score=0.95\n", + " Query 2/3: Give 3 factual relationships about Tesla... score=0.95\n", + " Query 3/3: What is the Wikidata ID for CRISPR and l... score=0.95\n", + " Iteration 2 average: 0.9500 * NEW BEST\n", + "\n", + "============================================================\n", + "TEMPLATE AFTER OPTIMIZATION\n", + "============================================================\n", + " planner_prompt: 'Create a thorough, step-by-step JSON plan for: {query}. 
Use web_researcher, wikidata_researcher, synthesizer.'\n", + "\n", + "============================================================\n", + "OPTIMIZATION RESULTS\n", + "============================================================\n", + " Baseline score: 0.7500\n", + " Best score: 0.9500\n", + " Best iteration: 2\n", + " Score history: [0.75, 0.75, 0.95]\n", + " Optimizer calls: ['zero_feedback', 'backward', 'step', 'zero_feedback', 'backward', 'step']\n", + " Final params: ['planner_prompt', 'executor_prompt', 'synthesizer_prompt']\n", + " Best params: ['planner_prompt', 'executor_prompt', 'synthesizer_prompt']\n", + " Best updates: ['planner_prompt']\n" + ] + } + ], + "source": [ + "# -- Use the same 3 queries as the reference demo --\n", + "QUERIES = DEMO_QUERIES\n", + "\n", + "mock_opt = MockOptimizer()\n", + "\n", + "print(\"=\" * 60)\n", + "print(\"TEMPLATE BEFORE OPTIMIZATION\")\n", + "print(\"=\" * 60)\n", + "print(f\" planner_prompt: {ig.templates['planner_prompt']!r}\")\n", + "print()\n", + "\n", + "opt_result = optimize_graph(\n", + " ig,\n", + " queries=QUERIES,\n", + " iterations=2,\n", + " optimizer=mock_opt,\n", + " eval_fn=stub_eval_fn,\n", + " apply_updates_flag=True,\n", + ")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"TEMPLATE AFTER OPTIMIZATION\")\n", + "print(\"=\" * 60)\n", + "print(f\" planner_prompt: {ig.templates['planner_prompt']!r}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"OPTIMIZATION RESULTS\")\n", + "print(\"=\" * 60)\n", + "print(f\" Baseline score: {opt_result.baseline_score:.4f}\")\n", + "print(f\" Best score: {opt_result.best_score:.4f}\")\n", + "print(f\" Best iteration: {opt_result.best_iteration}\")\n", + "print(f\" Score history: {[round(s, 4) for s in opt_result.score_history]}\")\n", + "print(f\" Optimizer calls: {mock_opt.calls}\")\n", + "print(f\" Final params: {list(opt_result.final_parameters.keys())}\")\n", + "print(f\" Best params: {list(opt_result.best_parameters.keys())}\")\n", + 
"print(f\" Best updates: {list(opt_result.best_updates.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.559993Z", + "iopub.status.busy": "2026-02-12T07:59:07.558992Z", + "iopub.status.idle": "2026-02-12T07:59:07.571810Z", + "shell.execute_reply": "2026-02-12T07:59:07.570297Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hyKXgrbD1KMV", + "outputId": "4456d009-5f00-48bf-be8f-ee2118ea8d24" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[OK] Score improved by 0.2000\n", + "[OK] StubLLM end-to-end optimization verified!\n", + " - Template changed across iterations\n", + " - All runs contain OTLP data\n", + " - Score: baseline=0.7500, best=0.9500 (non-saturating)\n", + " - Optimizer was called (zero_feedback -> backward -> step)\n", + " - apply_updates propagated to bindings\n", + "\n", + "============================================================\n", + "OPTIMIZATION TABLE\n", + "============================================================\n", + "Iter Avg Score Best Score Best Iter Best Params\n", + "------------------------------------------------------------\n", + "0 0.7500 0.7500 0 []\n", + "1 0.7500 0.7500 0 []\n", + "2 0.9500 0.9500 2 ['planner_prompt', 'executor_prompt', 'synthesizer_prompt']\n" + ] + } + ], + "source": [ + "# ---- Verify M1 acceptance: template changed between iter 0 and final ----\n", + "assert ig.templates[\"planner_prompt\"] != INITIAL_TEMPLATES[\"planner_prompt\"], \\\n", + " \"Prompt should have changed after optimization!\"\n", + "assert \"step-by-step\" in ig.templates[\"planner_prompt\"].lower(), \\\n", + " f\"Expected 'step-by-step' in optimized planner_prompt, got: {ig.templates['planner_prompt']!r}\"\n", + "\n", + "# Verify OTLP data present in all runs\n", + "for i, runs in enumerate(opt_result.all_runs):\n", + " for r in runs:\n", + " assert \"resourceSpans\" in 
r.otlp, f\"Run in iter {i} missing OTLP data\"\n", + "\n", + "# Verify non-saturating scoring\n", + "assert opt_result.best_score < 1.0, \\\n", + " f\"Score should not saturate at 1.0: {opt_result.best_score:.4f}\"\n", + "assert opt_result.best_score >= opt_result.baseline_score, \\\n", + " f\"Optimization should not degrade: best={opt_result.best_score:.4f} baseline={opt_result.baseline_score:.4f}\"\n", + "\n", + "improvement = opt_result.best_score - opt_result.baseline_score\n", + "if improvement > 0:\n", + " print(f\"[OK] Score improved by {improvement:.4f}\")\n", + "else:\n", + " print(f\"[INFO] Scores equal (baseline already near cap): best={opt_result.best_score:.4f}\")\n", + "\n", + "print(\"[OK] StubLLM end-to-end optimization verified!\")\n", + "print(\" - Template changed across iterations\")\n", + "print(\" - All runs contain OTLP data\")\n", + "print(f\" - Score: baseline={opt_result.baseline_score:.4f}, best={opt_result.best_score:.4f} (non-saturating)\")\n", + "print(\" - Optimizer was called (zero_feedback -> backward -> step)\")\n", + "print(\" - apply_updates propagated to bindings\")\n", + "\n", + "# Print optimization table\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"OPTIMIZATION TABLE\")\n", + "print(\"=\" * 60)\n", + "print(f\"{'Iter':<6} {'Avg Score':<12} {'Best Score':<12} {'Best Iter':<12} {'Best Params'}\")\n", + "print(\"-\" * 60)\n", + "best_so_far = float(\"-inf\")\n", + "best_iter_so_far = 0\n", + "for i, sc in enumerate(opt_result.score_history):\n", + " if sc > best_so_far:\n", + " best_so_far = sc\n", + " best_iter_so_far = i\n", + " bp = list(opt_result.best_parameters.keys()) if i == opt_result.best_iteration else []\n", + " print(f\"{i:<6} {sc:<12.4f} {best_so_far:<12.4f} {best_iter_so_far:<12} {bp}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "npyJxRjs1KMV" + }, + "source": [ + "---\n", + "## 9. 
Live LLM Mode (OpenRouter)\n", + "\n", + "This section runs the same pipeline against a **real LLM provider**\n", + "(OpenRouter). It is **automatically skipped** if no API key is available.\n", + "\n", + "Constraints per M1 acceptance:\n", + "- Tiny dataset (≤3 items)\n", + "- Deterministic settings (`temperature=0`)\n", + "- Budget guard (`max_tokens=256` per call)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.581005Z", + "iopub.status.busy": "2026-02-12T07:59:07.579994Z", + "iopub.status.idle": "2026-02-12T07:59:07.603100Z", + "shell.execute_reply": "2026-02-12T07:59:07.602018Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YJKCzGfB1KMV", + "outputId": "1431e24c-e677-4041-f183-ae06d1825766" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "OpenRouterLLM class ready.\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "class OpenRouterLLM:\n", + " \"\"\"Minimal OpenRouter client (OpenAI-compatible interface).\n", + "\n", + " A1: On HTTP errors, this class now **raises** instead of converting\n", + " the error to assistant content. 
TracingLLM will catch and re-raise\n", + " as LLMCallError so the caller can score the run as 0.\n", + " \"\"\"\n", + "\n", + " def __init__(self, api_key, model, base_url, *, max_tokens=256, temperature=0):\n", + " self.api_key = api_key\n", + " self.model = model\n", + " self.base_url = base_url\n", + " self.max_tokens = max_tokens\n", + " self.temperature = temperature\n", + " self.call_count = 0\n", + "\n", + " def __call__(self, messages=None, **kwargs):\n", + " self.call_count += 1\n", + " headers = {\n", + " \"Authorization\": f\"Bearer {self.api_key}\",\n", + " \"Content-Type\": \"application/json\",\n", + " }\n", + " payload = {\n", + " \"model\": self.model,\n", + " \"messages\": messages,\n", + " \"temperature\": self.temperature,\n", + " \"max_tokens\": self.max_tokens,\n", + " }\n", + " # A1: Let HTTP errors propagate — do NOT wrap them as content\n", + " resp = requests.post(\n", + " f\"{self.base_url}/chat/completions\",\n", + " headers=headers, json=payload, timeout=60,\n", + " )\n", + " resp.raise_for_status()\n", + " data = resp.json()\n", + "\n", + " return self._wrap(data)\n", + "\n", + " @staticmethod\n", + " def _wrap(data):\n", + " class _M:\n", + " pass\n", + " class _C:\n", + " pass\n", + " class _R:\n", + " pass\n", + " r = _R()\n", + " r.choices = []\n", + " for c in data.get(\"choices\", [{\"message\": {\"content\": \"\"}}]):\n", + " ch = _C()\n", + " m = _M()\n", + " m.content = c.get(\"message\", {}).get(\"content\", \"\")\n", + " ch.message = m\n", + " r.choices.append(ch)\n", + " return r\n", + "\n", + "print(\"OpenRouterLLM class ready.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:07.609620Z", + "iopub.status.busy": "2026-02-12T07:59:07.608112Z", + "iopub.status.idle": "2026-02-12T07:59:09.143370Z", + "shell.execute_reply": "2026-02-12T07:59:09.141411Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cQqBF-_R1KMV", 
+ "outputId": "bbb66fdc-f666-44ec-b1c4-637d1e928243" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "============================================================\n", + "LIVE LLM MODE (OpenRouter)\n", + "============================================================\n", + "\n", + "[FAIL] LLMCallError during live invocation: LLM provider call failed: 404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions\n", + " Skipping live optimization. Score = 0.\n" + ] + } + ], + "source": [ + "from opto.trace.io import LLMCallError\n", + "\n", + "if not HAS_API_KEY:\n", + " print(\"[SKIP] No OPENROUTER_API_KEY — live mode skipped.\")\n", + " print(\" To enable: add the key in Colab Secrets or a .env file.\")\n", + " live_ok = False\n", + "else:\n", + " print(\"=\" * 60)\n", + " print(\"LIVE LLM MODE (OpenRouter)\")\n", + " print(\"=\" * 60)\n", + "\n", + " live_llm = OpenRouterLLM(\n", + " api_key=OPENROUTER_API_KEY,\n", + " model=OPENROUTER_MODEL,\n", + " base_url=OPENROUTER_BASE_URL,\n", + " max_tokens=MAX_TOKENS_PER_CALL,\n", + " temperature=LIVE_TEMPERATURE,\n", + " )\n", + "\n", + " live_templates = dict(INITIAL_TEMPLATES)\n", + "\n", + " live_ig = instrument_graph(\n", + " graph=None,\n", + " service_name=\"m1-live\",\n", + " trainable_keys={\"planner\", \"executor\", \"synthesizer\"},\n", + " llm=live_llm,\n", + " initial_templates=live_templates,\n", + " emit_genai_child_spans=True,\n", + " provider_name=\"openrouter\",\n", + " llm_span_name=\"openrouter.chat.completion\",\n", + " input_key=\"query\",\n", + " output_key=\"final_answer\",\n", + " )\n", + " live_graph = build_graph(live_ig.tracing_llm, live_ig.templates)\n", + " live_ig.graph = live_graph\n", + "\n", + " live_ok = False\n", + " try:\n", + " live_result = live_ig.invoke({\"query\": \"What is gradient descent?\"})\n", + " ans = str(live_result.get(\"final_answer\", \"\") or \"\")\n", + " if ans.startswith(\"[ERROR]\") or not ans.strip():\n", + 
" print(f\"[FAIL] Live LLM returned error or empty: {ans[:200]}\")\n", + " else:\n", + " print(f\"\\nLive answer ({len(ans)} chars):\")\n", + " print(f\" {ans[:300]}\")\n", + "\n", + " live_otlp = live_ig.session.flush_otlp(clear=False)\n", + " live_spans = live_otlp[\"resourceSpans\"][0][\"scopeSpans\"][0][\"spans\"]\n", + " trace_ids = {s[\"traceId\"] for s in live_spans}\n", + " has_root = any(str(sp.get(\"name\",\"\")).endswith(\".invoke\") for sp in live_spans)\n", + "\n", + " print(f\"\\nSpans captured: {len(live_spans)} unique_trace_ids={len(trace_ids)} has_root_invoke={has_root}\")\n", + "\n", + " # Verify trace invariants\n", + " if len(trace_ids) != 1:\n", + " print(f\"[WARN] Expected single trace ID, got {len(trace_ids)}\")\n", + " if not has_root:\n", + " print(\"[WARN] No root *.invoke span found\")\n", + "\n", + " # Check provider metadata\n", + " for sp in live_spans:\n", + " for a in sp.get(\"attributes\", []):\n", + " if a[\"key\"] == \"gen_ai.provider.name\":\n", + " prov = a[\"value\"].get(\"stringValue\", \"\")\n", + " print(f\" gen_ai.provider.name = {prov}\")\n", + " if prov != \"openrouter\":\n", + " print(f\" [WARN] Expected 'openrouter', got '{prov}'\")\n", + "\n", + " live_ok = True\n", + " print(\"\\n[OK] Live LLM trace validated!\")\n", + "\n", + " except LLMCallError as e:\n", + " print(f\"\\n[FAIL] LLMCallError during live invocation: {e}\")\n", + " print(\" Skipping live optimization. 
Score = 0.\")\n", + " except Exception as e:\n", + " print(f\"\\n[FAIL] Unexpected error during live invocation: {e}\")\n", + " print(\" Skipping live optimization.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:09.152911Z", + "iopub.status.busy": "2026-02-12T07:59:09.151899Z", + "iopub.status.idle": "2026-02-12T07:59:09.728081Z", + "shell.execute_reply": "2026-02-12T07:59:09.727073Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cRplJAic1KMV", + "outputId": "ab5a04ae-d847-4997-8031-68f50adc483c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[SKIP] Live invocation failed — live optimization skipped.\n" + ] + } + ], + "source": [ + "if HAS_API_KEY and live_ok:\n", + " LIVE_QUERIES = DEMO_QUERIES[:2]\n", + "\n", + " print(\"=\" * 60)\n", + " print(f\"LIVE OPTIMIZATION (1 iteration, {len(LIVE_QUERIES)} queries)\")\n", + " print(\"=\" * 60)\n", + "\n", + " # Reset templates for a fresh optimization\n", + " apply_updates(INITIAL_TEMPLATES, live_ig.bindings)\n", + " print(f\" planner_prompt BEFORE: {live_ig.templates['planner_prompt']!r}\")\n", + "\n", + " live_mock_opt = MockOptimizer()\n", + "\n", + " live_opt_result = optimize_graph(\n", + " live_ig,\n", + " queries=LIVE_QUERIES,\n", + " iterations=1,\n", + " optimizer=live_mock_opt,\n", + " eval_fn=stub_eval_fn,\n", + " apply_updates_flag=True,\n", + " )\n", + "\n", + " print(f\"\\n planner_prompt AFTER: {live_ig.templates['planner_prompt']!r}\")\n", + " print(f\" Baseline score: {live_opt_result.baseline_score:.4f}\")\n", + " print(f\" Best score: {live_opt_result.best_score:.4f}\")\n", + " print(f\" Score history: {[round(s, 4) for s in live_opt_result.score_history]}\")\n", + " print(f\" Total LLM calls: {live_llm.call_count}\")\n", + "\n", + " # --- Live OTLP inspection ---\n", + " live_otlp_final = live_ig.session.flush_otlp(clear=True)\n", + " try:\n", + 
" live_spans = live_otlp_final[\"resourceSpans\"][0][\"scopeSpans\"][0][\"spans\"]\n", + " trace_ids = {s[\"traceId\"] for s in live_spans}\n", + " has_root = any(str(sp.get(\"name\",\"\")).endswith(\".invoke\") for sp in live_spans)\n", + " print(f\"\\n Live OTLP: {len(live_spans)} spans, {len(trace_ids)} trace IDs, root_invoke={has_root}\")\n", + " except (KeyError, IndexError) as e:\n", + " print(f\"\\n [WARN] Could not inspect live OTLP: {e}\")\n", + "else:\n", + " if not HAS_API_KEY:\n", + " print(\"[SKIP] No API key — live optimization skipped.\")\n", + " else:\n", + " print(\"[SKIP] Live invocation failed — live optimization skipped.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f8oGUzML1KMV" + }, + "source": [ + "---\n", + "## 10. Save Artifacts\n", + "\n", + "Save OTLP traces, TGJ documents, and optimization summary to the run\n", + "folder (Google Drive on Colab, local fallback)." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "execution": { + "iopub.execute_input": "2026-02-12T07:59:09.732598Z", + "iopub.status.busy": "2026-02-12T07:59:09.732598Z", + "iopub.status.idle": "2026-02-12T07:59:09.818823Z", + "shell.execute_reply": "2026-02-12T07:59:09.817814Z" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4MnYiHER1KMV", + "outputId": "30db790b-95ed-44ab-a9e7-89d79e65c2f6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "============================================================\n", + "SAVING ARTIFACTS\n", + "============================================================\n", + " /content/drive/MyDrive/OpenTrace_runs/M1/main/run_20260219_220544/stub_sample_otlp.json\n", + " /content/drive/MyDrive/OpenTrace_runs/M1/main/run_20260219_220544/stub_all_traces.json (9 traces)\n", + " /content/drive/MyDrive/OpenTrace_runs/M1/main/run_20260219_220544/stub_sample_tgj.json\n", + " 
/content/drive/MyDrive/OpenTrace_runs/M1/main/run_20260219_220544/stub_summary.json\n", + "\n", + "All artifacts saved to: /content/drive/MyDrive/OpenTrace_runs/M1/main/run_20260219_220544\n" + ] + } + ], + "source": [ + "print(\"=\" * 60)\n", + "print(\"SAVING ARTIFACTS\")\n", + "print(\"=\" * 60)\n", + "\n", + "# --- Save StubLLM optimization traces ---\n", + "if opt_result.all_runs and opt_result.all_runs[0]:\n", + " # Sample trace\n", + " sample_otlp = opt_result.all_runs[0][0].otlp\n", + " p = os.path.join(RUN_FOLDER, \"stub_sample_otlp.json\")\n", + " with open(p, \"w\") as f:\n", + " json.dump(sample_otlp, f, indent=2)\n", + " print(f\" {p}\")\n", + "\n", + " # All optimization traces\n", + " all_traces = []\n", + " for iter_idx, runs in enumerate(opt_result.all_runs):\n", + " label = \"baseline\" if iter_idx == 0 else f\"iteration_{iter_idx}\"\n", + " for ri, run in enumerate(runs):\n", + " all_traces.append({\n", + " \"iteration\": label,\n", + " \"query_index\": ri,\n", + " \"score\": run.score,\n", + " \"otlp\": run.otlp,\n", + " })\n", + " p = os.path.join(RUN_FOLDER, \"stub_all_traces.json\")\n", + " with open(p, \"w\") as f:\n", + " json.dump(all_traces, f, indent=2)\n", + " print(f\" {p} ({len(all_traces)} traces)\")\n", + "\n", + " # TGJ from first run\n", + " tgj_docs = otlp_traces_to_trace_json(\n", + " sample_otlp, agent_id_hint=\"m1-notebook\", use_temporal_hierarchy=True,\n", + " )\n", + " p = os.path.join(RUN_FOLDER, \"stub_sample_tgj.json\")\n", + " with open(p, \"w\") as f:\n", + " json.dump(tgj_docs, f, indent=2)\n", + " print(f\" {p}\")\n", + "\n", + "# --- Summary ---\n", + "summary = {\n", + " \"mode\": \"stub\",\n", + " \"baseline_score\": opt_result.baseline_score,\n", + " \"best_score\": opt_result.best_score,\n", + " \"best_iteration\": opt_result.best_iteration,\n", + " \"score_history\": opt_result.score_history,\n", + " \"final_parameters\": opt_result.final_parameters,\n", + "}\n", + "p = os.path.join(RUN_FOLDER, 
\"stub_summary.json\")\n", + "with open(p, \"w\") as f:\n", + " json.dump(summary, f, indent=2)\n", + "print(f\" {p}\")\n", + "\n", + "# --- Save live traces if available ---\n", + "if HAS_API_KEY and 'live_opt_result' in dir():\n", + " live_traces = []\n", + " for iter_idx, runs in enumerate(live_opt_result.all_runs):\n", + " label = \"baseline\" if iter_idx == 0 else f\"iteration_{iter_idx}\"\n", + " for ri, run in enumerate(runs):\n", + " live_traces.append({\n", + " \"iteration\": label,\n", + " \"query_index\": ri,\n", + " \"score\": run.score,\n", + " \"otlp\": run.otlp,\n", + " })\n", + " p = os.path.join(RUN_FOLDER, \"live_all_traces.json\")\n", + " with open(p, \"w\") as f:\n", + " json.dump(live_traces, f, indent=2)\n", + " print(f\" {p} ({len(live_traces)} traces)\")\n", + "\n", + " live_summary = {\n", + " \"mode\": \"live\",\n", + " \"model\": OPENROUTER_MODEL,\n", + " \"baseline_score\": live_opt_result.baseline_score,\n", + " \"best_score\": live_opt_result.best_score,\n", + " \"score_history\": live_opt_result.score_history,\n", + " \"final_parameters\": live_opt_result.final_parameters,\n", + " \"total_llm_calls\": live_llm.call_count,\n", + " }\n", + " p = os.path.join(RUN_FOLDER, \"live_summary.json\")\n", + " with open(p, \"w\") as f:\n", + " json.dump(live_summary, f, indent=2)\n", + " print(f\" {p}\")\n", + "\n", + "print(f\"\\nAll artifacts saved to: {RUN_FOLDER}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uCByY8OZ1KMV" + }, + "source": [ + "---\n", + "## Summary\n", + "\n", + "This notebook demonstrated the full M1 pipeline:\n", + "\n", + "1. **`instrument_graph()`** — one-liner to add OTEL tracing to a LangGraph\n", + "2. **`param.*` attributes** — spans carry trainable prompt values\n", + "3. **OTLP → TGJ → `ParameterNode` + `MessageNode`** — optimizer-compatible trace graph\n", + "4. **Temporal integrity** — child `gen_ai.*` spans don't break chaining\n", + "5. 
**`apply_updates()`** — bindings propagate optimizer output to live templates\n", + "6. **`optimize_graph()`** — end-to-end loop (StubLLM deterministic + live provider)\n", + "7. **Artifacts persisted** — OTLP JSON, TGJ JSON, and summaries saved to disk\n", + "\n", + "All verifications passed with StubLLM (CI-safe, deterministic). When\n", + "`OPENROUTER_API_KEY` is set, the live section additionally proves\n", + "real-provider tracing with `param.*` and `gen_ai.*` attributes." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/OpenTrace_LangGraph_BBEH_boolean_expressions_PAL_curriculum_clean(3).ipynb b/OpenTrace_LangGraph_BBEH_boolean_expressions_PAL_curriculum_clean(3).ipynb new file mode 100644 index 00000000..2835a7ba --- /dev/null +++ b/OpenTrace_LangGraph_BBEH_boolean_expressions_PAL_curriculum_clean(3).ipynb @@ -0,0 +1,1618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13f77031", + "metadata": { + "id": "13f77031" + }, + "source": [ + "# OpenTrace + LangGraph + BBEH (clean)\n", + "\n", + "This notebook is a cleaned/compacted version of the original experiment notebook.\n", + "\n", + "Defaults:\n", + "- **Strategy:** `solve_with_PAL_Strategy`\n", + "- **Benchmark:** **BBEH** → `bbeh_boolean_expressions` (no GSM8K)\n", + "- **Optimization sampling:** **CurriculumBuffer Mode B** (curriculum: current example + last successes)\n", + "- **No strategy sweep** and **no end-of-notebook plots/graphs** (optional trace visuals are disabled by default)\n", + "\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "231a31f4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "231a31f4", + "outputId": "eadaf641-43cd-42d1-e9f1-bc2a267f991f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Config:\n", + " BBEH_TASK_NAME='bbeh_boolean_expressions'\n", + " N_TRAIN=20, N_VAL=10, SEED=0\n", + " VALIDATE_ON_LAST_N=2, ACCUMULATION_STEPS=2\n", + " LEARNING_RETRY=20, MAX_ATTEMPTS=10\n", + " SKIP_OPTIMIZATION=False\n", + " OUTPUT_FOLDER='./trace_runs'\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "# -----------------------\n", + "# Core defaults (edit me)\n", + "# -----------------------\n", + "BBEH_TASK_NAME = os.getenv(\"BBEH_TASK_NAME\", \"bbeh_boolean_expressions\")\n", + "\n", + "# Data split (BBEH tasks are stored as JSON \"examples\"; we just shuffle + slice)\n", + "N_TRAIN = int(os.getenv(\"N_TRAIN\", \"20\"))\n", + "N_VAL = int(os.getenv(\"N_VAL\", \"10\"))\n", + "SEED = int(os.getenv(\"SEED\", \"0\"))\n", + "\n", + "# CurriculumBuffer Mode B (curriculum):\n", + "# - keep last N successful examples as validation history\n", + "# - when optimizing on a failing example, train on (current + history) via accumulation_steps\n", + "VALIDATE_ON_LAST_N = int(os.getenv(\"VALIDATE_ON_LAST_N\", \"2\"))\n", + "ACCUMULATION_STEPS = int(os.getenv(\"ACCUMULATION_STEPS\", \"2\")) # effective_batch_size = 1 + ACCUMULATION_STEPS\n", + "\n", + "# Optimization loop controls\n", + "LEARNING_RETRY = int(os.getenv(\"LEARNING_RETRY\", \"20\")) # target update-steps per optimize_langgraph() call\n", + "MAX_ATTEMPTS = int(os.getenv(\"MAX_ATTEMPTS\", \"10\")) # tries per update-step to get a real parameter change\n", + "\n", + "SKIP_OPTIMIZATION = os.getenv(\"SKIP_OPTIMIZATION\", \"0\") == \"1\"\n", + "\n", + "# Output\n", + "OUTPUT_FOLDER = os.getenv(\"OUTPUT_FOLDER\", \"./trace_runs\")\n", + "\n", + "# Optional verbosity toggles (kept OFF by default)\n", + "SHOW_MERMAID_GRAPH = 
os.getenv(\"SHOW_MERMAID_GRAPH\", \"0\") == \"1\"\n", + "SHOW_OPT_TRACE = os.getenv(\"SHOW_OPT_TRACE\", \"0\") == \"1\" # Trace backward visuals\n", + "\n", + "try:\n", + " import google.colab\n", + " IN_COLAB = True\n", + "except ImportError:\n", + " IN_COLAB = False\n", + "\n", + "print(\"Config:\")\n", + "print(f\" {BBEH_TASK_NAME=}\")\n", + "print(f\" {N_TRAIN=}, {N_VAL=}, {SEED=}\")\n", + "print(f\" {VALIDATE_ON_LAST_N=}, {ACCUMULATION_STEPS=}\")\n", + "print(f\" {LEARNING_RETRY=}, {MAX_ATTEMPTS=}\")\n", + "print(f\" {SKIP_OPTIMIZATION=}\")\n", + "print(f\" {OUTPUT_FOLDER=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dfd8b1c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5dfd8b1c", + "outputId": "2af18aed-9f61-4ea8-e1fd-149e192effc4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Setting up Trace...\n", + "Requirement already satisfied: langgraph in /usr/local/lib/python3.12/dist-packages (1.0.8)\n", + "Requirement already satisfied: langchain in /usr/local/lib/python3.12/dist-packages (1.2.10)\n", + "Collecting langchain_openai\n", + " Downloading langchain_openai-1.1.10-py3-none-any.whl.metadata (3.1 kB)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (4.67.3)\n", + "Collecting langchain_community\n", + " Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)\n", + "Collecting litellm\n", + " Downloading litellm-1.81.13-py3-none-any.whl.metadata (30 kB)\n", + "Collecting dspy\n", + " Downloading dspy-3.1.3-py3-none-any.whl.metadata (8.4 kB)\n", + "Collecting black\n", + " Downloading black-26.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (88 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.9/88.9 kB\u001b[0m 
\u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: langchain-core>=0.1 in /usr/local/lib/python3.12/dist-packages (from langgraph) (1.2.13)\n", + "Requirement already satisfied: langgraph-checkpoint<5.0.0,>=2.1.0 in /usr/local/lib/python3.12/dist-packages (from langgraph) (4.0.0)\n", + "Requirement already satisfied: langgraph-prebuilt<1.1.0,>=1.0.7 in /usr/local/lib/python3.12/dist-packages (from langgraph) (1.0.7)\n", + "Requirement already satisfied: langgraph-sdk<0.4.0,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from langgraph) (0.3.6)\n", + "Requirement already satisfied: pydantic>=2.7.4 in /usr/local/lib/python3.12/dist-packages (from langgraph) (2.12.3)\n", + "Requirement already satisfied: xxhash>=3.5.0 in /usr/local/lib/python3.12/dist-packages (from langgraph) (3.6.0)\n", + "Requirement already satisfied: openai<3.0.0,>=2.20.0 in /usr/local/lib/python3.12/dist-packages (from langchain_openai) (2.21.0)\n", + "Requirement already satisfied: tiktoken<1.0.0,>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from langchain_openai) (0.12.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from datasets) (3.24.2)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.0.2)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.32.4)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n", + "Requirement already 
satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n", + "Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (1.4.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from datasets) (26.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from datasets) (6.0.3)\n", + "Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain_community)\n", + " Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)\n", + "Requirement already satisfied: SQLAlchemy<3.0.0,>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from langchain_community) (2.0.46)\n", + "Collecting requests>=2.32.2 (from datasets)\n", + " Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.12/dist-packages (from langchain_community) (3.13.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10.0.0,>=8.1.0 in /usr/local/lib/python3.12/dist-packages (from langchain_community) (9.1.4)\n", + "Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)\n", + " Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)\n", + "Requirement already satisfied: pydantic-settings<3.0.0,>=2.10.1 in /usr/local/lib/python3.12/dist-packages (from langchain_community) (2.13.0)\n", + "Requirement already satisfied: langsmith<1.0.0,>=0.1.125 in /usr/local/lib/python3.12/dist-packages (from langchain_community) (0.7.3)\n", + "Requirement already satisfied: httpx-sse<1.0.0,>=0.4.0 in /usr/local/lib/python3.12/dist-packages (from langchain_community) (0.4.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm) (8.3.1)\n", + "Collecting fastuuid>=0.13.0 (from litellm)\n", + " Downloading 
fastuuid-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm) (4.26.0)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm) (1.2.1)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm) (0.22.2)\n", + "Requirement already satisfied: regex>=2023.10.3 in /usr/local/lib/python3.12/dist-packages (from dspy) (2025.11.3)\n", + "Requirement already satisfied: orjson>=3.9.0 in /usr/local/lib/python3.12/dist-packages (from dspy) (3.11.7)\n", + "Collecting optuna>=3.4.0 (from dspy)\n", + " Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)\n", + "Collecting diskcache>=5.6.0 (from dspy)\n", + " Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)\n", + "Collecting json-repair>=0.54.2 (from dspy)\n", + " Downloading json_repair-0.58.0-py3-none-any.whl.metadata (15 kB)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from dspy) (4.12.1)\n", + "Collecting asyncer==0.0.8 (from dspy)\n", + " Downloading asyncer-0.0.8-py3-none-any.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: cachetools>=5.5.0 in /usr/local/lib/python3.12/dist-packages (from dspy) (7.0.1)\n", + "Requirement already satisfied: cloudpickle>=3.0.0 in /usr/local/lib/python3.12/dist-packages (from dspy) (3.1.2)\n", + "Collecting gepa==0.0.26 (from gepa[dspy]==0.0.26->dspy)\n", + " Downloading 
gepa-0.0.26-py3-none-any.whl.metadata (29 kB)\n", + "Collecting mypy-extensions>=0.4.3 (from black)\n", + " Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)\n", + "Collecting pathspec>=1.0.0 (from black)\n", + " Downloading pathspec-1.0.4-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: platformdirs>=2 in /usr/local/lib/python3.12/dist-packages (from black) (4.9.2)\n", + "Collecting pytokens>=0.3.0 (from black)\n", + " Downloading pytokens-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (25.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.22.0)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.12/dist-packages (from anyio->dspy) (3.11)\n", + "Requirement already satisfied: typing_extensions>=4.5 in /usr/local/lib/python3.12/dist-packages (from anyio->dspy) (4.15.0)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from 
dataclasses-json<0.7.0,>=0.6.7->langchain_community)\n", + " Downloading marshmallow-3.26.2-py3-none-any.whl.metadata (7.3 kB)\n", + "Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain_community)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm) (0.16.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (0.24.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm) (3.23.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm) (3.0.3)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.23.0->litellm) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.23.0->litellm) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.23.0->litellm) (0.30.0)\n", + "Collecting langchain-text-splitters<2.0.0,>=1.1.0 (from 
langchain-classic<2.0.0,>=1.0.0->langchain_community)\n", + " Downloading langchain_text_splitters-1.1.1-py3-none-any.whl.metadata (3.3 kB)\n", + "Requirement already satisfied: jsonpatch<2.0.0,>=1.33.0 in /usr/local/lib/python3.12/dist-packages (from langchain-core>=0.1->langgraph) (1.33)\n", + "Requirement already satisfied: uuid-utils<1.0,>=0.12.0 in /usr/local/lib/python3.12/dist-packages (from langchain-core>=0.1->langgraph) (0.14.0)\n", + "Requirement already satisfied: ormsgpack>=1.12.0 in /usr/local/lib/python3.12/dist-packages (from langgraph-checkpoint<5.0.0,>=2.1.0->langgraph) (1.12.2)\n", + "Requirement already satisfied: requests-toolbelt>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from langsmith<1.0.0,>=0.1.125->langchain_community) (1.0.0)\n", + "Requirement already satisfied: zstandard>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from langsmith<1.0.0,>=0.1.125->langchain_community) (0.25.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai<3.0.0,>=2.20.0->langchain_openai) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai<3.0.0,>=2.20.0->langchain_openai) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai<3.0.0,>=2.20.0->langchain_openai) (1.3.1)\n", + "Requirement already satisfied: alembic>=1.5.0 in /usr/local/lib/python3.12/dist-packages (from optuna>=3.4.0->dspy) (1.18.4)\n", + "Collecting colorlog (from optuna>=3.4.0->dspy)\n", + " Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic>=2.7.4->langgraph) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic>=2.7.4->langgraph) (2.41.4)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in 
/usr/local/lib/python3.12/dist-packages (from pydantic>=2.7.4->langgraph) (0.4.2)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (2.5.0)\n", + "Requirement already satisfied: greenlet>=1 in /usr/local/lib/python3.12/dist-packages (from SQLAlchemy<3.0.0,>=1.4.0->langchain_community) (3.3.1)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.3)\n", + "Requirement already satisfied: Mako in /usr/local/lib/python3.12/dist-packages (from alembic>=1.5.0->optuna>=3.4.0->dspy) (1.3.10)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.12/dist-packages (from jsonpatch<2.0.0,>=1.33.0->langchain-core>=0.1->langgraph) (3.0.0)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", + "Requirement already satisfied: typer>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from typer-slim->huggingface-hub>=0.24.0->datasets) (0.24.0)\n", + "Requirement already satisfied: rich>=12.3.0 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim->huggingface-hub>=0.24.0->datasets) (13.9.4)\n", + "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim->huggingface-hub>=0.24.0->datasets) (0.0.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from 
rich>=12.3.0->typer>=0.24.0->typer-slim->huggingface-hub>=0.24.0->datasets) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer>=0.24.0->typer-slim->huggingface-hub>=0.24.0->datasets) (2.19.2)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer>=0.24.0->typer-slim->huggingface-hub>=0.24.0->datasets) (0.1.2)\n", + "Downloading langchain_openai-1.1.10-py3-none-any.whl (87 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.2/87.2 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_community-0.4.1-py3-none-any.whl (2.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading litellm-1.81.13-py3-none-any.whl (14.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dspy-3.1.3-py3-none-any.whl (312 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m312.4/312.4 kB\u001b[0m \u001b[31m27.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading asyncer-0.0.8-py3-none-any.whl (9.2 kB)\n", + "Downloading gepa-0.0.26-py3-none-any.whl (139 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.7/139.7 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading black-26.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m 
\u001b[31m78.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)\n", + "Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading fastuuid-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (278 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.1/278.1 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading json_repair-0.58.0-py3-none-any.whl (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.0/40.0 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_classic-1.0.1-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m56.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading mypy_extensions-1.1.0-py3-none-any.whl (5.0 kB)\n", + "Downloading optuna-4.7.0-py3-none-any.whl (413 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m413.9/413.9 kB\u001b[0m \u001b[31m34.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pathspec-1.0.4-py3-none-any.whl (55 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.2/55.2 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pytokens-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (269 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m269.8/269.8 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[?25hDownloading requests-2.32.5-py3-none-any.whl (64 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.7/64.7 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_text_splitters-1.1.1-py3-none-any.whl (35 kB)\n", + "Downloading marshmallow-3.26.2-py3-none-any.whl (50 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.0/51.0 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)\n", + "Installing collected packages: requests, pytokens, pathspec, mypy-extensions, marshmallow, json-repair, gepa, fastuuid, diskcache, colorlog, typing-inspect, black, asyncer, optuna, dataclasses-json, langchain-text-splitters, langchain_openai, langchain-classic, litellm, langchain_community, dspy\n", + " Attempting uninstall: requests\n", + " Found existing installation: requests 2.32.4\n", + " Uninstalling requests-2.32.4:\n", + " Successfully uninstalled requests-2.32.4\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed asyncer-0.0.8 black-26.1.0 colorlog-6.10.1 dataclasses-json-0.6.7 diskcache-5.6.3 dspy-3.1.3 fastuuid-0.14.0 gepa-0.0.26 json-repair-0.58.0 langchain-classic-1.0.1 langchain-text-splitters-1.1.1 langchain_community-0.4.1 langchain_openai-1.1.10 litellm-1.81.13 marshmallow-3.26.2 mypy-extensions-1.1.0 optuna-4.7.0 pathspec-1.0.4 pytokens-0.4.1 requests-2.32.5 typing-inspect-0.9.0\n", + "Cloning into 'Trace'...\n", + "remote: Enumerating objects: 7192, done.\u001b[K\n", + "remote: Counting objects: 100% (2798/2798), done.\u001b[K\n", + "remote: Compressing objects: 100% (1120/1120), done.\u001b[K\n", + "remote: Total 7192 (delta 1879), reused 1680 (delta 1678), pack-reused 4394 (from 2)\u001b[K\n", + "Receiving objects: 100% (7192/7192), 14.39 MiB | 18.12 MiB/s, done.\n", + "Resolving deltas: 100% (4623/4623), done.\n", + "/content/Trace\n", + "From https://github.com/AgentOpt/OpenTrace\n", + " * branch experimental -> FETCH_HEAD\n", + "Updating f1c908e..aa127e7\n", + "Fast-forward\n", + " .github/workflows/ci.yml | 6 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " .github/workflows/python-app.yml | 4 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " CONTRIBUTING.md | 66 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " Makefile | 19 \u001b[31m-\u001b[m\n", + " OAI_CONFIG_LIST_sample | 25 \u001b[31m-\u001b[m\n", + " OVERVIEW.md | 81 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " README.md | 83 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " SECURITY.md | 41 \u001b[31m-\u001b[m\n", + " SUPPORT.md | 25 \u001b[31m-\u001b[m\n", + " dev_deployment/README.md | 15 \u001b[32m+\u001b[m\n", + " dev_deployment/trace_dev_modal_image.py | 84 \u001b[32m+\u001b[m\n", + " docs/_config.yml | 10 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/_static/custom.css 
| 35 \u001b[31m-\u001b[m\n", + " docs/_toc.yml | 1 \u001b[32m+\u001b[m\n", + " docs/colab_kernel_clean_script.py | 29 \u001b[31m-\u001b[m\n", + " docs/examples/basic/greeting.ipynb | 183 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/examples/game/negotiation_arena.ipynb | 86 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/examples/nlp/bigbench_hard.ipynb | 34 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " .../numerical/numerical_optimization.ipynb | 24 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/examples/robotics/metaworld.ipynb | 39 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/faq/faq.md | 5 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/images/contributing_workflow.png | Bin \u001b[31m0\u001b[m -> \u001b[32m39433\u001b[m bytes\n", + " docs/intro.md | 26 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/jupyter_build.sh | 16 \u001b[31m-\u001b[m\n", + " docs/post_build_script.py | 48 \u001b[31m-\u001b[m\n", + " docs/publish.sh | 6 \u001b[31m-\u001b[m\n", + " docs/quickstart/installation.md | 22 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/quickstart/quick_start.ipynb | 19 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/quickstart/quick_start_2.ipynb | 634 \u001b[32m++\u001b[m\u001b[31m-\u001b[m\n", + " docs/quickstart/virtualhome.md | 15 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/readme.md | 29 \u001b[31m-\u001b[m\n", + " docs/references.bib | 56 \u001b[31m-\u001b[m\n", + " docs/requirements.txt | 8 \u001b[31m-\u001b[m\n", + " docs/tutorials/error_handling_tutorial.ipynb | 64 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/tutorials/minibatch.ipynb | 560 \u001b[32m++\u001b[m\u001b[31m-\u001b[m\n", + " docs/tutorials/optimization_tutorial.ipynb | 77 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " docs/tutorials/projections_tutorial.ipynb | 856 \u001b[32m++++\u001b[m\n", + " docs/tutorials/trainers.ipynb | 4324 \u001b[32m++++++++++++++++++++\u001b[m\n", + " examples/async_optimization_example.py | 370 
\u001b[32m++\u001b[m\n", + " examples/battleship.py | 18 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " examples/bbh/run_prompt_bigbench_dspy.py | 2 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " examples/bbh/run_prompt_bigbench_trace.py | 14 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " examples/greeting.py | 2 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " examples/gsm8k_trainer_example.py | 39 \u001b[32m+\u001b[m\n", + " .../run_bigbench_trace_async.py | 70 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " examples/multi_objective_convex_fn.py | 663 \u001b[32m+++\u001b[m\n", + " examples/priority_search_example.py | 63 \u001b[32m+\u001b[m\n", + " examples/priority_search_on_convex_fn.py | 261 \u001b[32m++\u001b[m\n", + " examples/priority_search_on_convex_fn_BENCH.py | 218 \u001b[32m+\u001b[m\n", + " examples/search_algo_example.py | 351 \u001b[32m++\u001b[m\n", + " .../evals/textgrad_prompt_optimization.py | 2 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " ...ad_test_time_loss_for_code_OptoPrimeMulti.ipynb | 9 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " examples/train_model.py | 41 \u001b[32m+\u001b[m\n", + " examples/train_single_node.py | 20 \u001b[32m+\u001b[m\n", + " examples/train_single_node_multi_optimizers.py.py | 32 \u001b[32m+\u001b[m\n", + " examples/virtualhome.py | 25 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " generated_docs/opto/optimizers/buffers.md | 76 \u001b[31m-\u001b[m\n", + " .../opto/optimizers/function_optimizer.md | 738 \u001b[31m----\u001b[m\n", + " generated_docs/opto/optimizers/opro.md | 79 \u001b[31m-\u001b[m\n", + " generated_docs/opto/optimizers/optimizers.md | 267 \u001b[31m--\u001b[m\n", + " generated_docs/opto/trace/broadcast.md | 54 \u001b[31m-\u001b[m\n", + " generated_docs/opto/trace/bundle.md | 469 \u001b[31m---\u001b[m\n", + " generated_docs/opto/trace/containers.md | 386 \u001b[31m--\u001b[m\n", + " generated_docs/opto/trace/errors.md | 112 \u001b[31m-\u001b[m\n", + " generated_docs/opto/trace/modules.md | 304 
\u001b[31m--\u001b[m\n", + " generated_docs/opto/trace/nodes.md | 2213 \u001b[31m----------\u001b[m\n", + " generated_docs/opto/trace/operators.md | 893 \u001b[31m----\u001b[m\n", + " .../opto/trace/propagators/graph_propagator.md | 166 \u001b[31m-\u001b[m\n", + " .../opto/trace/propagators/propagators.md | 338 \u001b[31m--\u001b[m\n", + " generated_docs/opto/trace/trace.md | 43 \u001b[31m-\u001b[m\n", + " generated_docs/opto/trace/utils.md | 320 \u001b[31m--\u001b[m\n", + " opto/{trainer/README.md => features/__init__.py} | 0\n", + " opto/features/flows/__init__.py | 0\n", + " opto/features/flows/compose.py | 229 \u001b[32m++\u001b[m\n", + " opto/features/flows/types.py | 10 \u001b[32m+\u001b[m\n", + " opto/features/gepa/__init__.py | 9 \u001b[32m+\u001b[m\n", + " opto/features/gepa/gepa_algorithms.py | 652 \u001b[32m+++\u001b[m\n", + " opto/features/predefined_agents/__init__.py | 5 \u001b[32m+\u001b[m\n", + " opto/features/predefined_agents/learner.py | 85 \u001b[32m+\u001b[m\n", + " opto/features/priority_search/__init__.py | 2 \u001b[32m+\u001b[m\n", + " opto/features/priority_search/examples.py | 214 \u001b[32m+\u001b[m\n", + " opto/features/priority_search/module_regressor.py | 304 \u001b[32m++\u001b[m\n", + " opto/features/priority_search/priority_search.py | 804 \u001b[32m++++\u001b[m\n", + " .../priority_search_with_regressor.py | 212 \u001b[32m+\u001b[m\n", + " opto/features/priority_search/sampler.py | 333 \u001b[32m++\u001b[m\n", + " opto/features/priority_search/search_template.py | 377 \u001b[32m++\u001b[m\n", + " .../priority_search/streaming_priority_search.py | 113 \u001b[32m+\u001b[m\n", + " opto/features/priority_search/utils.py | 102 \u001b[32m+\u001b[m\n", + " opto/optimizers/__init__.py | 9 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/buffers.py | 58 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/opro.py | 68 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/opro_v2.py | 339 
\u001b[32m++\u001b[m\n", + " opto/optimizers/optimizer.py | 338 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/optoprime.py | 492 \u001b[32m++\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/optoprime_batchopt.py | 160 \u001b[31m-\u001b[m\n", + " opto/optimizers/optoprime_v2.py | 680 \u001b[32m+++\u001b[m\n", + " opto/optimizers/optoprimemulti.py | 186 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/textgrad.py | 124 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/optimizers/utils.py | 121 \u001b[32m+\u001b[m\n", + " opto/trace/README.md | 1 \u001b[31m-\u001b[m\n", + " opto/trace/__init__.py | 4 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/broadcast.py | 97 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/bundle.py | 28 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/containers.py | 246 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/errors.py | 56 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/iterators.py | 101 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/modules.py | 361 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/nodes.py | 661 \u001b[32m++\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/operators.py | 24 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/projections/__init__.py | 2 \u001b[32m+\u001b[m\n", + " opto/trace/projections/code_projections.py | 31 \u001b[32m+\u001b[m\n", + " opto/trace/projections/projections.py | 86 \u001b[32m+\u001b[m\n", + " opto/trace/propagators/graph_propagator.py | 82 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/propagators/propagators.py | 245 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trace/utils.py | 212 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trainer/__init__.py | 1 \u001b[32m+\u001b[m\n", + " opto/trainer/algorithms/UCBsearch.py | 374 \u001b[32m++\u001b[m\n", + " opto/trainer/algorithms/__init__.py | 5 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " 
opto/trainer/algorithms/aggregator.py | 222 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trainer/algorithms/algorithm.py | 295 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trainer/algorithms/basic_algorithm.py | 377 \u001b[31m--\u001b[m\n", + " opto/trainer/algorithms/basic_algorithms.py | 613 \u001b[32m+++\u001b[m\n", + " opto/trainer/algorithms/beamsearch_algorithm.py | 811 \u001b[32m++++\u001b[m\n", + " opto/trainer/evaluators.py | 44 \u001b[32m+\u001b[m\n", + " opto/trainer/guide.py | 68 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trainer/loader.py | 76 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/trainer/loggers.py | 122 \u001b[32m+\u001b[m\n", + " opto/trainer/train.py | 262 \u001b[32m++\u001b[m\n", + " opto/trainer/utils.py | 176 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/utils/auto_retry.py | 80 \u001b[32m+\u001b[m\n", + " opto/utils/llm.py | 252 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " opto/version.py | 2 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " pyproject.toml | 15 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " setup.py | 8 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/features_tests/test_flows_compose.py | 78 \u001b[32m+\u001b[m\n", + " tests/llm_optimizers_tests/test_gepa_benchmark.py | 97 \u001b[32m+\u001b[m\n", + " tests/llm_optimizers_tests/test_guides.py | 19 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/llm_optimizers_tests/test_opro_v2.py | 164 \u001b[32m+\u001b[m\n", + " tests/llm_optimizers_tests/test_optimizer.py | 116 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " .../test_optimizer_optoprimemulti.py | 209 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/llm_optimizers_tests/test_optoprime_v2.py | 181 \u001b[32m+\u001b[m\n", + " tests/llm_optimizers_tests/test_trainer.py | 94 \u001b[31m-\u001b[m\n", + " .../test_trainer_refactored.py | 81 \u001b[31m-\u001b[m\n", + " tests/test_memory_leak.py | 37 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_backward.py | 29 
\u001b[32m+\u001b[m\n", + " tests/unit_tests/test_batch_run.py | 116 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_bundle.py | 167 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/unit_tests/test_copy.py | 9 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/unit_tests/test_dataloader.py | 158 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_dependencies.py | 3 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/unit_tests/test_modules.py | 422 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/unit_tests/test_nodes.py | 13 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " tests/unit_tests/test_optimizer_xml_parsing.py | 369 \u001b[32m++\u001b[m\n", + " .../test_optoprime_suggestion_processing.py | 111 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_optoprime_update.py | 55 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_priority_search.py | 271 \u001b[32m++\u001b[m\n", + " tests/unit_tests/test_projection.py | 38 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_sampler.py | 144 \u001b[32m+\u001b[m\n", + " tests/unit_tests/test_saving_loading.py | 81 \u001b[32m+\u001b[m\u001b[31m-\u001b[m\n", + " 159 files changed, 22646 insertions(+), 8795 deletions(-)\n", + " delete mode 100644 Makefile\n", + " delete mode 100644 OAI_CONFIG_LIST_sample\n", + " delete mode 100644 SECURITY.md\n", + " delete mode 100644 SUPPORT.md\n", + " create mode 100644 dev_deployment/README.md\n", + " create mode 100644 dev_deployment/trace_dev_modal_image.py\n", + " delete mode 100644 docs/_static/custom.css\n", + " delete mode 100644 docs/colab_kernel_clean_script.py\n", + " create mode 100644 docs/images/contributing_workflow.png\n", + " delete mode 100644 docs/jupyter_build.sh\n", + " delete mode 100644 docs/post_build_script.py\n", + " delete mode 100644 docs/publish.sh\n", + " delete mode 100644 docs/readme.md\n", + " delete mode 100644 docs/references.bib\n", + " delete mode 100644 docs/requirements.txt\n", + " create mode 100644 
docs/tutorials/projections_tutorial.ipynb\n", + " create mode 100644 docs/tutorials/trainers.ipynb\n", + " create mode 100644 examples/async_optimization_example.py\n", + " create mode 100644 examples/gsm8k_trainer_example.py\n", + " create mode 100644 examples/multi_objective_convex_fn.py\n", + " create mode 100644 examples/priority_search_example.py\n", + " create mode 100644 examples/priority_search_on_convex_fn.py\n", + " create mode 100644 examples/priority_search_on_convex_fn_BENCH.py\n", + " create mode 100644 examples/search_algo_example.py\n", + " create mode 100644 examples/train_model.py\n", + " create mode 100644 examples/train_single_node.py\n", + " create mode 100644 examples/train_single_node_multi_optimizers.py.py\n", + " delete mode 100644 generated_docs/opto/optimizers/buffers.md\n", + " delete mode 100644 generated_docs/opto/optimizers/function_optimizer.md\n", + " delete mode 100644 generated_docs/opto/optimizers/opro.md\n", + " delete mode 100644 generated_docs/opto/optimizers/optimizers.md\n", + " delete mode 100644 generated_docs/opto/trace/broadcast.md\n", + " delete mode 100644 generated_docs/opto/trace/bundle.md\n", + " delete mode 100644 generated_docs/opto/trace/containers.md\n", + " delete mode 100644 generated_docs/opto/trace/errors.md\n", + " delete mode 100644 generated_docs/opto/trace/modules.md\n", + " delete mode 100644 generated_docs/opto/trace/nodes.md\n", + " delete mode 100644 generated_docs/opto/trace/operators.md\n", + " delete mode 100644 generated_docs/opto/trace/propagators/graph_propagator.md\n", + " delete mode 100644 generated_docs/opto/trace/propagators/propagators.md\n", + " delete mode 100644 generated_docs/opto/trace/trace.md\n", + " delete mode 100644 generated_docs/opto/trace/utils.md\n", + " rename opto/{trainer/README.md => features/__init__.py} (100%)\n", + " create mode 100644 opto/features/flows/__init__.py\n", + " create mode 100644 opto/features/flows/compose.py\n", + " create mode 100644 
opto/features/flows/types.py\n", + " create mode 100644 opto/features/gepa/__init__.py\n", + " create mode 100644 opto/features/gepa/gepa_algorithms.py\n", + " create mode 100644 opto/features/predefined_agents/__init__.py\n", + " create mode 100644 opto/features/predefined_agents/learner.py\n", + " create mode 100644 opto/features/priority_search/__init__.py\n", + " create mode 100644 opto/features/priority_search/examples.py\n", + " create mode 100644 opto/features/priority_search/module_regressor.py\n", + " create mode 100644 opto/features/priority_search/priority_search.py\n", + " create mode 100644 opto/features/priority_search/priority_search_with_regressor.py\n", + " create mode 100644 opto/features/priority_search/sampler.py\n", + " create mode 100644 opto/features/priority_search/search_template.py\n", + " create mode 100644 opto/features/priority_search/streaming_priority_search.py\n", + " create mode 100644 opto/features/priority_search/utils.py\n", + " create mode 100644 opto/optimizers/opro_v2.py\n", + " delete mode 100644 opto/optimizers/optoprime_batchopt.py\n", + " create mode 100644 opto/optimizers/optoprime_v2.py\n", + " delete mode 100644 opto/trace/README.md\n", + " create mode 100644 opto/trace/projections/__init__.py\n", + " create mode 100644 opto/trace/projections/code_projections.py\n", + " create mode 100644 opto/trace/projections/projections.py\n", + " create mode 100644 opto/trainer/algorithms/UCBsearch.py\n", + " delete mode 100644 opto/trainer/algorithms/basic_algorithm.py\n", + " create mode 100644 opto/trainer/algorithms/basic_algorithms.py\n", + " create mode 100644 opto/trainer/algorithms/beamsearch_algorithm.py\n", + " create mode 100644 opto/trainer/evaluators.py\n", + " create mode 100644 opto/trainer/loggers.py\n", + " create mode 100644 opto/trainer/train.py\n", + " create mode 100644 opto/utils/auto_retry.py\n", + " create mode 100644 tests/features_tests/test_flows_compose.py\n", + " create mode 100644 
tests/llm_optimizers_tests/test_gepa_benchmark.py\n", + " create mode 100644 tests/llm_optimizers_tests/test_opro_v2.py\n", + " create mode 100644 tests/llm_optimizers_tests/test_optoprime_v2.py\n", + " delete mode 100644 tests/llm_optimizers_tests/test_trainer.py\n", + " delete mode 100644 tests/llm_optimizers_tests/test_trainer_refactored.py\n", + " create mode 100644 tests/test_memory_leak.py\n", + " create mode 100644 tests/unit_tests/test_batch_run.py\n", + " create mode 100644 tests/unit_tests/test_dataloader.py\n", + " create mode 100644 tests/unit_tests/test_optimizer_xml_parsing.py\n", + " create mode 100644 tests/unit_tests/test_optoprime_suggestion_processing.py\n", + " create mode 100644 tests/unit_tests/test_optoprime_update.py\n", + " create mode 100644 tests/unit_tests/test_priority_search.py\n", + " create mode 100644 tests/unit_tests/test_projection.py\n", + " create mode 100644 tests/unit_tests/test_sampler.py\n", + "Branch 'experimental' set up to track remote branch 'experimental' from 'origin'.\n", + "Switched to a new branch 'experimental'\n", + "Obtaining file:///content/Trace\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Checking if build backend supports build_editable ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build editable ... \u001b[?25l\u001b[?25hdone\n", + " Preparing editable metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: graphviz>=0.20.1 in /usr/local/lib/python3.12/dist-packages (from trace-opt==0.2.0) (0.21)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (from trace-opt==0.2.0) (8.4.2)\n", + "Collecting litellm==1.75.0 (from trace-opt==0.2.0)\n", + " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: black in /usr/local/lib/python3.12/dist-packages (from trace-opt==0.2.0) (26.1.0)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (from trace-opt==0.2.0) (1.6.1)\n", + "Collecting tensorboardX (from trace-opt==0.2.0)\n", + " Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)\n", + "Requirement already satisfied: tensorboard in /usr/local/lib/python3.12/dist-packages (from trace-opt==0.2.0) (2.19.0)\n", + "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (4.26.0)\n", + "Requirement already 
satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (2.21.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0->trace-opt==0.2.0) (0.22.2)\n", + "Requirement already satisfied: mypy-extensions>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from black->trace-opt==0.2.0) (1.1.0)\n", + "Requirement already satisfied: packaging>=22.0 in /usr/local/lib/python3.12/dist-packages (from black->trace-opt==0.2.0) (26.0)\n", + "Requirement already satisfied: pathspec>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from black->trace-opt==0.2.0) (1.0.4)\n", + "Requirement already satisfied: platformdirs>=2 in /usr/local/lib/python3.12/dist-packages (from black->trace-opt==0.2.0) (4.9.2)\n", + "Requirement already satisfied: pytokens>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from black->trace-opt==0.2.0) (0.4.1)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest->trace-opt==0.2.0) (2.3.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest->trace-opt==0.2.0) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest->trace-opt==0.2.0) (2.19.2)\n", + "Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->trace-opt==0.2.0) (2.0.2)\n", + "Requirement already satisfied: scipy>=1.6.0 
in /usr/local/lib/python3.12/dist-packages (from scikit-learn->trace-opt==0.2.0) (1.16.3)\n", + "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->trace-opt==0.2.0) (1.5.3)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->trace-opt==0.2.0) (3.6.0)\n", + "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (1.4.0)\n", + "Requirement already satisfied: grpcio>=1.48.2 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (1.78.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (3.10.2)\n", + "Requirement already satisfied: protobuf!=4.24.0,>=3.19.6 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (5.29.6)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (75.2.0)\n", + "Requirement already satisfied: six>1.9 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (1.17.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (0.7.2)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from tensorboard->trace-opt==0.2.0) (3.1.5)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from 
aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (25.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0->trace-opt==0.2.0) (1.22.0)\n", + "Requirement already satisfied: typing-extensions~=4.12 in /usr/local/lib/python3.12/dist-packages (from grpcio>=1.48.2->tensorboard->trace-opt==0.2.0) (4.15.0)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0->trace-opt==0.2.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0->trace-opt==0.2.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0->trace-opt==0.2.0) (1.0.9)\n", + "Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0->trace-opt==0.2.0) (3.11)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0->trace-opt==0.2.0) (0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0->trace-opt==0.2.0) (3.23.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0->trace-opt==0.2.0) 
(3.0.3)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0->trace-opt==0.2.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0->trace-opt==0.2.0) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0->trace-opt==0.2.0) (0.30.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0->trace-opt==0.2.0) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0->trace-opt==0.2.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0->trace-opt==0.2.0) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0->trace-opt==0.2.0) (4.67.3)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0->trace-opt==0.2.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0->trace-opt==0.2.0) (2.41.4)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0->trace-opt==0.2.0) (0.4.2)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0->trace-opt==0.2.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from 
tiktoken>=0.7.0->litellm==1.75.0->trace-opt==0.2.0) (2.32.5)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0->trace-opt==0.2.0) (1.4.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (3.24.2)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (1.2.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (6.0.3)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (0.24.0)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0->trace-opt==0.2.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0->trace-opt==0.2.0) (2.5.0)\n", + "Requirement already satisfied: typer>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from typer-slim->huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (0.24.0)\n", + "Requirement already satisfied: rich>=12.3.0 in 
/usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim->huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (13.9.4)\n", + "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim->huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (0.0.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer>=0.24.0->typer-slim->huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (4.0.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer>=0.24.0->typer-slim->huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0->trace-opt==0.2.0) (0.1.2)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m77.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading tensorboardx-2.6.4-py3-none-any.whl (87 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.2/87.2 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hBuilding wheels for collected packages: trace-opt\n", + " Building editable for trace-opt (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for trace-opt: filename=trace_opt-0.2.0-0.editable-py3-none-any.whl size=12228 sha256=3f14cd9318d1ff6795e27be7ed17f34857f93440aee91918dbf3fce8b0fc703f\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-bioru59d/wheels/8f/37/69/34b901530ef008282826b9a2e68edad902735f6dbe889eed7b\n", + "Successfully built trace-opt\n", + "Installing collected packages: tensorboardX, litellm, trace-opt\n", + " Attempting uninstall: litellm\n", + " Found existing installation: litellm 1.81.13\n", + " Uninstalling litellm-1.81.13:\n", + " Successfully uninstalled litellm-1.81.13\n", + "Successfully installed litellm-1.75.0 tensorboardX-2.6.4 trace-opt-0.2.0\n", + "Cloning into 'bbeh'...\n", + "remote: Enumerating objects: 102, done.\u001b[K\n", + "remote: Counting objects: 100% (102/102), done.\u001b[K\n", + "remote: Compressing objects: 100% (88/88), done.\u001b[K\n", + "remote: Total 102 (delta 13), reused 100 (delta 11), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (102/102), 2.81 MiB | 6.90 MiB/s, done.\n", + "Resolving deltas: 100% (13/13), done.\n", + "IN_COLAB=True, trace_repo_exists=True\n" + ] + } + ], + "source": [ + "import os, sys\n", + "\n", + "# -----------------------\n", + "# Optional: install deps\n", + "# -----------------------\n", + "# If you are in a fresh Colab/runtime, you likely need:\n", + "#\n", + "import sys\n", + "if IN_COLAB:\n", + " # test if setup has already been done : reset by !rm -rf /content/Trace\n", + " if not os.path.exists('/content/Trace'):\n", + " print(\"Setting up Trace...\")\n", + " %pip install langgraph langchain langchain_openai datasets tqdm langchain_community litellm dspy black\n", + " %alias git git\n", + " %alias sed sed\n", + " %git clone https://github.com/AgentOpt/OpenTrace.git Trace\n", + " %cd Trace\n", + " %git pull origin experimental && git checkout experimental\n", + " %sed -i 's/python_requires=\">=3.13\"/python_requires=\">=3.12\"/' setup.py\n", + " %pip 
install -e .\n", + " sys.path.append('/content/Trace')\n", + "else:\n", + " sys.path.append(os.path.expanduser(\"~/trace/Trace\"))\n", + "#\n", + "# Also clone BBEH tasks:\n", + "!git clone https://github.com/google-deepmind/bbeh.git\n", + "\n", + "# Try to auto-add a local Trace repo path (edit if needed)\n", + "IN_COLAB = False\n", + "try:\n", + " import google.colab # type: ignore\n", + " IN_COLAB = True\n", + "except Exception:\n", + " IN_COLAB = False\n", + "\n", + "trace_repo = os.getenv(\"TRACE_REPO\", \"/content/Trace\" if IN_COLAB else os.path.expanduser(\"~/trace/Trace\"))\n", + "if os.path.exists(trace_repo) and trace_repo not in sys.path:\n", + " sys.path.append(trace_repo)\n", + "\n", + "# Soft-import display (avoid hard dependency on IPython)\n", + "try:\n", + " from IPython.display import display # type: ignore\n", + "except Exception:\n", + " def display(*args, **kwargs): # noqa: D401\n", + " return None\n", + "\n", + "print(f\"{IN_COLAB=}, trace_repo_exists={os.path.exists(trace_repo)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cd5f283", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1cd5f283", + "outputId": "b385883c-5026-4fee-cc1d-3569e8f7ea36" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "LLM ready: {'service': 'openrouter', 'model': 'openai/gpt-5-nano'}\n" + ] + } + ], + "source": [ + "import os\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# -----------------------\n", + "# LLM config (defaults)\n", + "# -----------------------\n", + "LLM_SERVICE = os.getenv(\"LLM_SERVICE\", \"openrouter\") # \"openai\" | \"openrouter\" | \"customllm\"\n", + "LLM_GENERAL_MODEL = os.getenv(\"LLM_GENERAL_MODEL\", \"openai/gpt-5-nano\")\n", + "\n", + "# API keys: prefer env vars (Colab users can also use google.colab.userdata)\n", + "def _get_secret(name: str) -> str | 
import os
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

# -----------------------
# LLM config (defaults)
# -----------------------
LLM_SERVICE = os.getenv("LLM_SERVICE", "openrouter")  # "openai" | "openrouter" | "customllm"
LLM_GENERAL_MODEL = os.getenv("LLM_GENERAL_MODEL", "openai/gpt-5-nano")

# API keys: prefer env vars (Colab users can also use google.colab.userdata)
def _get_secret(name: str) -> str | None:
    """Return secret *name* from Colab userdata if available, else the environment.

    Falls back silently: any failure to import/query Colab userdata (e.g. when
    running locally) defers to ``os.getenv``.
    """
    try:
        from google.colab import userdata  # type: ignore
        v = userdata.get(name)
        if v:
            return v
    except Exception:
        pass
    return os.getenv(name)

OPENAI_API_KEY = _get_secret("OPENAI_API_KEY")
OPENROUTER_API_KEY = _get_secret("OPENROUTER_API_KEY")
CUSTOMLLM_API_KEY = _get_secret("CUSTOMLLM_API_KEY")
CUSTOMLLM_URL = os.getenv("CUSTOMLLM_URL", "http://localhost:4000/")  # if you use a local proxy

# Route the OpenAI-compatible client at the selected provider by setting the
# standard OPENAI_* environment variables that ChatOpenAI reads.
if LLM_SERVICE == "openrouter":
    if not OPENROUTER_API_KEY:
        raise ValueError("OPENROUTER_API_KEY missing (set env var or Colab secret).")
    os.environ["OPENAI_BASE_URL"] = "https://openrouter.ai/api/v1"
    os.environ["OPENAI_API_KEY"] = OPENROUTER_API_KEY
elif LLM_SERVICE == "customllm":
    if not CUSTOMLLM_API_KEY:
        raise ValueError("CUSTOMLLM_API_KEY missing (set env var or Colab secret).")
    os.environ["OPENAI_BASE_URL"] = CUSTOMLLM_URL
    os.environ["OPENAI_API_KEY"] = CUSTOMLLM_API_KEY
else:
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY missing (set env var or Colab secret).")
    os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# temperature=0 for deterministic-ish outputs during optimization runs.
llm = ChatOpenAI(model_name=LLM_GENERAL_MODEL, temperature=0)

def llm_call(prompt: str, system_instructions: str = "") -> str:
    """Single-turn chat completion; optional system message is prepended."""
    msgs = [HumanMessage(content=prompt)]
    if system_instructions:
        msgs.insert(0, SystemMessage(content=system_instructions))
    return llm.invoke(msgs).content

print("LLM ready:", {"service": LLM_SERVICE, "model": LLM_GENERAL_MODEL})
import os, json, random, inspect
from copy import deepcopy

# ---- Trace imports (OpenTrace / opto) ----
try:
    from opto.trace import node, bundle
    from opto.trace.bundle import FunModule
    from opto.optimizers.optoprime_v2 import OptoPrimeV2 as OptoPrime
    from opto.trainer.guide import Guide as _TraceGuide
    from opto.trainer.algorithms.basic_algorithms import Minibatch as _TraceMinibatch
except Exception as e:
    raise ImportError(
        "Could not import OpenTrace (opto.*). "
        "Make sure OpenTrace is installed and TRACE_REPO is on sys.path."
    ) from e


# -----------------------
# Small helpers
# -----------------------
def set_dict(state: dict, key, value):
    """Assign key=value into a plain dict or into a Trace node wrapping a dict."""
    target = state.data if hasattr(state, "data") else state
    target[key] = value

def get_no_node(x):
    """Unwrap a Trace node to its raw payload; plain values pass through."""
    return x.data if hasattr(x, "data") else x

def _snapshot_params(parameters):
    """Snapshot parameter payloads keyed by name (deepcopy, shallow fallback)."""
    snapshot = {}
    for param in parameters:
        try:
            snapshot[param.name] = deepcopy(param.data)
        except Exception:
            snapshot[param.name] = param.data
    return snapshot

def _params_changed(before, after) -> bool:
    """Report whether any parameter name or stringified value differs."""
    if before.keys() != after.keys():
        return True
    return any(str(before[k]) != str(after[k]) for k in before)

def _replace_in_scope_by_identity(scope: dict, old_obj, new_obj) -> list[str]:
    """Rebind every entry in *scope* that IS old_obj to new_obj; return the keys swapped."""
    swapped = []
    for key, val in list(scope.items()):
        if val is old_obj:
            scope[key] = new_obj
            swapped.append(key)
    return swapped
def bind_function(func, *, trainable=True, traceable_code=True, allow_external_dependencies=True):
    """Safely bundle() a python function into a Trace FunModule (only once)."""
    if func is None or not callable(func):
        return func
    if isinstance(func, FunModule):
        # Already wrapped — never double-bundle.
        return func
    wrapped = bundle(
        trainable=trainable,
        traceable_code=traceable_code,
        allow_external_dependencies=allow_external_dependencies,
    )(func)
    # Preserve signature for nicer debugging
    try:
        wrapped.__signature__ = inspect.signature(wrapped._fun)
    except Exception:
        pass
    return wrapped


# -----------------------
# Guide: graph output -> (score, feedback)
# -----------------------
class LangGraphGuide(_TraceGuide):
    """Adapts a (predicted, reference[, allowed]) feedback function to the Trace Guide API."""

    def __init__(self, feedback_func, *, answer_key="final_answer", allowed_answer_set=None):
        self.feedback_func = feedback_func
        self.answer_key = answer_key
        self.allowed = allowed_answer_set

    def get_feedback(self, query, response, reference, **kwargs):
        # response is usually a dict: {"final_answer": <node>}
        try:
            payload = get_no_node(response)
            if isinstance(payload, dict):
                extracted = get_no_node(payload[self.answer_key])
            else:
                extracted = get_no_node(response)
        except Exception:
            # Any extraction failure degrades to scoring the raw response.
            extracted = get_no_node(response)

        if self.allowed is not None:
            ok, fb = self.feedback_func(extracted, reference, self.allowed)
        else:
            ok, fb = self.feedback_func(extracted, reference)
        return float(bool(ok)), fb

    def copy(self):
        """Fresh guide sharing the same feedback function and settings."""
        return LangGraphGuide(self.feedback_func, answer_key=self.answer_key, allowed_answer_set=self.allowed)
# -----------------------
# CurriculumBuffer
# -----------------------
class CurriculumBuffer:
    """Mode A (fixed pool) if training_pool is provided; Mode B (curriculum) otherwise."""

    def __init__(self, training_pool=None, *, history_size=2, sample_with_replacement=True, seed=None):
        self.pool = list(training_pool) if training_pool else []
        self.history = []
        self.history_size = int(history_size)
        self.replacement = bool(sample_with_replacement)
        self._rng = random.Random(seed)

    @property
    def is_fixed_pool(self) -> bool:
        """True when a non-empty training pool was supplied (Mode A)."""
        return len(self.pool) > 0

    def add_success(self, example: dict):
        """Remember a solved example, evicting the oldest beyond history_size."""
        self.history.append(example)
        if len(self.history) > self.history_size:
            self.history.pop(0)

    def sample_batch(self, batch_size: int, *, current_question=None, current_solution=None) -> list[dict]:
        """Mode A: random draw from the pool. Mode B: current example + most recent successes."""
        if self.is_fixed_pool:
            if self.replacement:
                return self._rng.choices(self.pool, k=batch_size)
            return self._rng.sample(self.pool, k=min(batch_size, len(self.pool)))

        # Mode B: slot 0 is the current example, then newest-first history.
        batch = []
        steps = min(batch_size, 1 + len(self.history))
        for idx in range(steps):
            if idx == 0:
                batch.append({"question": current_question, "solution": current_solution})
            else:
                recent = self.history[-idx]
                batch.append({"question": recent["question"], "solution": recent.get("solution", recent.get("answer"))})
        return batch
# -----------------------
# Trainer
# -----------------------
class LangGraphTrainer(_TraceMinibatch):
    """Minibatch trainer that optimizes prompts/agent code bound into a LangGraph scope.

    Keeps references to the original root/agent callables so the scope can be
    rolled back after a failed optimization run.
    """

    def __init__(self, *, graph_root_function: str, graph_agents_functions: list[str], scope: dict,
                 optimizer, parameters: list):
        # Deliberately bypass _TraceMinibatch.__init__: we manage our own state.
        object.__init__(self)
        self.root_name = graph_root_function
        self.agent_names = list(graph_agents_functions)
        self.scope = scope
        self.optimizer = optimizer
        self.parameters = list(parameters)

        # originals for corruption guard / rollback
        self._original_root = scope[graph_root_function]
        self._original_agents = {n: scope[n] for n in graph_agents_functions if n in scope}

    def restore_originals(self):
        """Put the untouched root/agent callables back into the scope."""
        self.scope[self.root_name] = self._original_root
        for name, orig in self._original_agents.items():
            self.scope[name] = orig

    def _check_corruption(self) -> bool:
        """Restore any agent whose FunModule lost its wrapped function (._fun is None).

        Returns True if any agent had to be restored.
        """
        restored = False
        for name in self.agent_names:
            agent = self.scope.get(name)
            if isinstance(agent, FunModule) and getattr(agent, "_fun", None) is None:
                print(f"⚠️ corruption: '{name}' has ._fun=None. Restoring original.")
                self.scope[name] = self._original_agents[name]
                restored = True
        return restored

    def _run_one(self, question, solution, guide: LangGraphGuide):
        """Run the root graph on one example; errors become feedback, never raise."""
        answer_key = guide.answer_key
        try:
            answer = self.scope[self.root_name](question)
            score, feedback = guide.get_feedback(question, answer, solution)
            ok = score >= 1.0
        except Exception as e:
            ok = False
            feedback = f"ERROR: {e}"
            answer = {answer_key: node("DUMMY_ANSWER")}
        return answer, ok, feedback

    def train(self, *, guide: LangGraphGuide, buffer: CurriculumBuffer,
              question=None, solution=None,
              target_updates=20, max_attempts=10, batch_size=3,
              test_optimization=True, stop_on_success=True,
              run_dir=".", save_steps=True,
              validation_set=None):
        """Run up to *target_updates* successful optimizer updates.

        Returns (modified, history, best_state, last_state) where best_state is
        the parameter snapshot that passed the test_optimization gate, if any.
        """
        if validation_set is None:
            validation_set = []

        answer_key = guide.answer_key
        best_state = None
        last_state = None
        history = []
        modified = False
        updates_done = 0
        global_attempt = 0

        os.makedirs(run_dir, exist_ok=True)

        while updates_done < int(target_updates):
            step_attempt = 0
            step_changed = False

            while step_attempt < int(max_attempts) and not step_changed:
                step_attempt += 1
                global_attempt += 1
                attempt = global_attempt
                print(f"[opt] attempt={attempt} update_step={updates_done+1}/{target_updates} try={step_attempt}/{max_attempts}")

                self.optimizer.zero_feedback()

                # minibatch
                batch_examples = buffer.sample_batch(
                    int(batch_size),
                    current_question=question,
                    current_solution=solution,
                )

                answers = []
                feedbacks = []
                batch_all_correct = True

                for ex in batch_examples:
                    eq = ex["question"]
                    es = ex.get("solution", ex.get("answer"))
                    ans, ok, fb = self._run_one(eq, es, guide)
                    batch_all_correct = batch_all_correct and ok
                    answers.append(ans)
                    feedbacks.append(fb)

                # aggregate feedback
                if len(feedbacks) == 1:
                    common_feedback = feedbacks[0]
                else:
                    common_feedback = "\n".join([f"Feedback #{i+1}: {fb}" for i, fb in enumerate(feedbacks)])

                # backward
                for ans in answers:
                    ans_node = ans.get(answer_key, ans) if isinstance(ans, dict) else ans
                    if not hasattr(ans_node, "backward"):
                        ans_node = node(str(ans_node))
                    self.optimizer.backward(
                        ans_node,
                        common_feedback,
                        # FIX: SHOW_OPT_TRACE is defined in the config cell; default
                        # to False so this cell does not NameError when run standalone.
                        visualize=bool(globals().get("SHOW_OPT_TRACE", False)),
                        print_limit=30,
                    )

                # step + change detection
                before = _snapshot_params(self.parameters)
                self.optimizer.step(verbose=True)
                after = _snapshot_params(self.parameters)
                step_changed = _params_changed(before, after)

                # corruption guard: a restored agent invalidates this step
                if self._check_corruption():
                    step_changed = False

                if not step_changed:
                    print("[opt] no parameter change, retrying...")
                    continue

                # record successful update
                updates_done += 1
                modified = True
                last_state = {p.name: p.data for p in self.parameters}

                # compute val acc (optional)
                val_acc = None
                if validation_set:
                    n_ok = 0
                    for v in validation_set:
                        _, vok, _ = self._run_one(v["question"], v.get("solution", v.get("answer")), guide)
                        n_ok += int(vok)
                    val_acc = n_ok / float(len(validation_set))

                # save step snapshot (optional, best-effort)
                if save_steps:
                    try:
                        step_path = os.path.join(run_dir, f"step_{updates_done:03d}_state.txt")
                        with open(step_path, "w") as f:
                            for nm, val in last_state.items():
                                f.write(f"{nm}: {val}\n")
                    except Exception as e:
                        print(f"⚠️ could not save step state: {e}")

                # test_optimization gate: current example + validation_set must pass
                if test_optimization and question is not None:
                    _, cur_ok, cur_fb = self._run_one(question, solution, guide)
                    val_ok = True
                    for v in validation_set:
                        _, vok, _ = self._run_one(v["question"], v.get("solution", v.get("answer")), guide)
                        if not vok:
                            val_ok = False
                            break
                    if cur_ok and val_ok:
                        best_state = last_state
                        print("[opt] gate PASS:", cur_fb)
                        if stop_on_success:
                            # write history entry before stopping
                            hist_entry = {
                                "update_step": updates_done,
                                "attempt": attempt,
                                "batch_size": int(batch_size),
                                "mode": "fixed" if buffer.is_fixed_pool else "curriculum",
                                "train_batch_all_correct": batch_all_correct,
                                "val_acc": val_acc,
                                "gate_pass": True,
                            }
                            history.append(hist_entry)
                            with open(os.path.join(run_dir, "history.jsonl"), "a") as f:
                                f.write(json.dumps(hist_entry, default=str) + "\n")
                            return modified, history, best_state, last_state

                # history entry (normal)
                hist_entry = {
                    "update_step": updates_done,
                    "attempt": attempt,
                    "batch_size": int(batch_size),
                    "mode": "fixed" if buffer.is_fixed_pool else "curriculum",
                    "train_batch_all_correct": batch_all_correct,
                    "val_acc": val_acc,
                    "gate_pass": bool(best_state is not None),
                }
                history.append(hist_entry)
                try:
                    with open(os.path.join(run_dir, "history.jsonl"), "a") as f:
                        f.write(json.dumps(hist_entry, default=str) + "\n")
                except Exception:
                    pass

                if stop_on_success and best_state is not None:
                    return modified, history, best_state, last_state

            if not step_changed:
                print(f"⚠️ stopping early: couldn't get a parameter update after {max_attempts} tries.")
                break

        return modified, history, best_state, last_state
# -----------------------
# optimize_langgraph (thin facade)
# -----------------------
def optimize_langgraph(
    *,
    graph_root_function: str,
    graph_agents_functions: list[str],
    question: str,
    solution: str,
    graph_prompts_list=None,
    answer_feedback_func=None,
    allowed_answer_set=None,
    answer_key="final_answer",
    validation_set=None,
    # Mode A vs B
    training_pool=None,
    batch_size=None,
    accumulation_steps=1,
    sample_with_replacement=True,
    seed=None,
    # Loop controls
    updating_steps=None,
    retry=5,
    max_attempts=10,
    stop_on_success=True,
    test_optimization=True,
    train_graph_agents_functions=True,
    memory_size=1,
    save_steps=True,
    dump_prefix="",
    output_folder=None,
    scope=None,
    optimizer_cls=None,
    trainer_cls=None,
):
    """Bind agents/prompts in *scope*, build optimizer + guide + buffer, and train.

    Mutates *scope* in place (agent names are rebound to FunModules; prompt
    objects are swapped for trainable nodes by identity).  Returns
    (modified, dump_filename, history, chosen_state, run_dir).
    """
    if optimizer_cls is None:
        optimizer_cls = OptoPrime
    if trainer_cls is None:
        trainer_cls = LangGraphTrainer
    if scope is None:
        scope = globals()
    if validation_set is None:
        validation_set = []
    if seed is not None:
        # NOTE(review): seeds the process-global RNG, not just this run.
        random.seed(seed)

    # Bind agents + prompts.
    # If the root was previously bundled, unwrap it back to the raw function.
    if isinstance(scope.get(graph_root_function), FunModule):
        scope[graph_root_function] = scope[graph_root_function]._fun

    parameters = []
    for name in graph_agents_functions:
        if name not in scope:
            raise KeyError(f"'{name}' not found in scope.")
        scope[name] = bind_function(scope[name], trainable=train_graph_agents_functions)
        parameters.extend(scope[name].parameters())

    if graph_prompts_list is not None:
        for i, prompt in enumerate(list(graph_prompts_list)):
            # Already a Trace node (has .data and .name): use as-is.
            if hasattr(prompt, "data") and hasattr(prompt, "name"):
                parameters.append(prompt)
                continue
            # Plain string prompt: wrap and swap by identity everywhere in scope.
            new_prompt = node(str(prompt), trainable=True)
            _replace_in_scope_by_identity(scope, prompt, new_prompt)
            graph_prompts_list[i] = new_prompt
            parameters.append(new_prompt)

    if not parameters:
        raise ValueError("No trainable parameters found (agents/prompts list is empty).")

    # Optimizer, guide, buffer
    opt = optimizer_cls(
        parameters,
        memory_size=memory_size,
        objective=[
            "Improve the agent so it solves the task reliably.",
            "Prefer simple, robust edits to prompts/code."
        ],
    )

    guide = LangGraphGuide(
        feedback_func=answer_feedback_func,
        answer_key=answer_key,
        allowed_answer_set=allowed_answer_set,
    )

    # batch_size wins; otherwise derive from accumulation_steps (>= 1).
    effective_batch_size = int(batch_size) if batch_size is not None else max(1, 1 + int(accumulation_steps))

    buffer = CurriculumBuffer(
        training_pool=training_pool,
        history_size=max(len(validation_set), 2) if validation_set else 2,
        sample_with_replacement=sample_with_replacement,
        seed=seed,
    )
    # Pre-seed curriculum history from validation_set (Mode B)
    if (not buffer.is_fixed_pool) and validation_set:
        for v in validation_set:
            buffer.add_success(v)

    target_updates = int(updating_steps) if updating_steps is not None else int(retry)
    _max_attempts = int(max_attempts)

    # Run directory: name encodes the key hyperparameters for easy comparison.
    base_dir = output_folder or "."
    os.makedirs(base_dir, exist_ok=True)
    run_name = (
        f"{dump_prefix}{graph_root_function}"
        f"__mode-{'fixed' if buffer.is_fixed_pool else 'curr'}"
        f"__bs{effective_batch_size}"
        f"__updates{target_updates}"
        f"__maxA{_max_attempts}"
        f"__mem{memory_size}"
        f"__seed{seed if seed is not None else 'none'}"
    )
    run_dir = os.path.join(base_dir, run_name)
    os.makedirs(run_dir, exist_ok=True)

    # Train
    trainer = trainer_cls(
        graph_root_function=graph_root_function,
        graph_agents_functions=graph_agents_functions,
        scope=scope,
        optimizer=opt,
        parameters=parameters,
    )
    modified, history, best_state, last_state = trainer.train(
        guide=guide,
        buffer=buffer,
        question=question,
        solution=solution,
        target_updates=target_updates,
        max_attempts=_max_attempts,
        batch_size=effective_batch_size,
        test_optimization=test_optimization,
        stop_on_success=stop_on_success,
        save_steps=save_steps,
        run_dir=run_dir,
        validation_set=validation_set,
    )

    # Prefer the gate-passing snapshot; otherwise the last one produced.
    chosen_state = best_state if best_state is not None else last_state
    dump_filename = None
    if modified and chosen_state is not None:
        dump_filename = os.path.join(run_dir, "best_state.txt")
        with open(dump_filename, "w") as f:
            for nm, val in chosen_state.items():
                f.write(f"{nm}: {val}\n")

    # Rollback if we didn't get a passing best_state (keeps semantics stable)
    if (not test_optimization) or (best_state is None):
        trainer.restore_originals()

    return modified, dump_filename, history, chosen_state, run_dir
import re
from langgraph.graph import StateGraph, START, END

# -----------------------
# Strategy: PAL
# -----------------------
# Trainable prompt: asks the LLM for Python code that assigns `result`.
prompt_parse_problem = node(
    "Read the problem and write Python code that sets a variable named `result` to the final answer.\n"
    "- Output ONLY valid Python (no markdown fences).\n"
    "- If the task is multiple-choice, set result to the option label exactly (e.g., '(A)').\n\n"
    "Problem:\n",
    trainable=True,
    description="PAL prompt that generates python code producing a `result`."
)

def parse_problem(state: dict):
    """LLM step: turn the question into Python code (stored in state['code'])."""
    question = get_no_node(state.get("question", ""))
    prompt = prompt_parse_problem + question
    code_str = llm_call(get_no_node(prompt))
    return {"code": code_str.strip(), "question": question}

def execute_code(state: dict):
    """Execute the generated code and read `result` as the final answer."""
    def strip_python_tags(code: str) -> str:
        # Extract the body of a ```python fence if present; otherwise pass through.
        return re.sub(
            r'(?s)(?:.*?```(?:python)?\s*\n(.*?)(?:\n```.*)?|(.*))\Z',
            lambda m: m.group(1) if m.group(1) is not None else m.group(2),
            code,
        )

    update = {}
    try:
        code_to_run = strip_python_tags(get_no_node(state.get("code", "")))
        local_vars = {}
        # SECURITY: exec() of LLM-generated code is inherently unsafe. This is
        # acceptable only in a disposable sandbox (e.g. a Colab VM) — never run
        # this cell on a host with sensitive data or credentials.
        exec(code_to_run, {}, local_vars)
        local_vars.pop("__builtins__", None)

        if "result" in local_vars:
            update["final_answer"] = node(local_vars["result"])
        elif len(local_vars) == 1:
            # Fallback: a single defined variable is taken as the answer.
            update["final_answer"] = node(next(iter(local_vars.values())))
        else:
            update["final_answer"] = node(None)

    except Exception as e:
        update["final_answer"] = node(None)
        update["error"] = str(e)

    return update

def create_graph_solve_with_PAL_Strategy():
    """Build the linear parse -> calculate LangGraph."""
    g = StateGraph(dict)
    g.add_node("parse", parse_problem)
    g.add_node("calculate", execute_code)
    g.add_edge(START, "parse")
    g.add_edge("parse", "calculate")
    g.add_edge("calculate", END)
    return g

def solve_with_PAL_Strategy(problem: str) -> dict:
    """Compile and run the PAL graph; always returns {'final_answer': node(...)}."""
    g = create_graph_solve_with_PAL_Strategy()
    compiled = g.compile()

    # NOTE: graph visualization disabled by default.
    # FIX: SHOW_MERMAID_GRAPH is defined in the config cell; default to False so
    # this cell does not NameError when run standalone.
    if bool(globals().get("SHOW_MERMAID_GRAPH", False)):
        try:
            from IPython.display import Image, display  # type: ignore
            display(Image(compiled.get_graph(xray=1).draw_mermaid_png()))
        except Exception:
            pass

    result = compiled.invoke({"question": get_no_node(problem)})
    if "final_answer" not in result:
        return {"final_answer": node("No solution found")}
    if isinstance(result["final_answer"], str):
        return {"final_answer": node(result["final_answer"])}
    return result

# Default "graph spec" for optimize_langgraph
GRAPH_ROOT = "solve_with_PAL_Strategy"
GRAPH_AGENTS = ["parse_problem", "execute_code"]
GRAPH_PROMPTS = [prompt_parse_problem]
import os, json, random, string

# -----------------------
# BBEH dataset loader
# -----------------------
# Repo layout varies slightly depending on how you clone / where you run.
def _find_bbeh_tasks_dir() -> str:
    """Probe the known relative locations of the BBEH benchmark_tasks folder.

    Raises FileNotFoundError with cloning instructions when none exists.
    """
    candidates = [
        "bbeh/benchmark_tasks",
        "bbeh/bbeh/benchmark_tasks",
        "benchmark_tasks",
    ]
    for c in candidates:
        if os.path.exists(c):
            return c
    raise FileNotFoundError(
        "Could not locate BBEH benchmark_tasks folder.\n"
        "Clone the repo first, e.g. `git clone https://github.com/google-deepmind/bbeh.git`."
    )

bbeh_tasks_dir = _find_bbeh_tasks_dir()
print("BBEH tasks dir:", bbeh_tasks_dir)

# For this notebook we only need the task(s) with constrained outputs.
LIMITED_BBEH_OUTPUT_TASKS = {
    "bbeh_boolean_expressions": {"(A)", "(B)", "(C)", "(D)", "(E)"},
}

def normalize_answer(ans) -> str:
    """Normalize an answer for loose comparison: lowercase, no punctuation/whitespace."""
    if ans is None:
        return ""
    ans = str(ans).strip().lower()
    ans = ans.translate(str.maketrans("", "", string.punctuation))
    ans = ans.replace(" ", "")
    return ans
Fix the code/prompt to solve similar problems.\"\n", + " if allowed_norm is not None and pred_norm not in allowed_norm:\n", + " msg += f\" (final answer must be one of: {sorted(allowed_set)})\"\n", + " return False, msg\n", + "\n", + "def load_bbeh_examples(task_name: str, *, n_train: int, n_val: int, seed: int = 0):\n", + " task_path = os.path.join(bbeh_tasks_dir, task_name, \"task.json\")\n", + " if not os.path.exists(task_path):\n", + " raise FileNotFoundError(f\"Task not found: {task_path}\")\n", + "\n", + " with open(task_path, \"r\") as f:\n", + " task = json.load(f)\n", + "\n", + " examples = task.get(\"examples\", [])\n", + " rng = random.Random(seed)\n", + " rng.shuffle(examples)\n", + "\n", + " allowed = LIMITED_BBEH_OUTPUT_TASKS.get(task_name)\n", + " def _format_q(q: str) -> str:\n", + " if allowed:\n", + " return q + f\"\\n\\nAllowed final answer: {sorted(allowed)}\"\n", + " return q\n", + "\n", + " # Convert to the {question, solution} format used by optimize_langgraph\n", + " items = [{\"question\": _format_q(ex[\"input\"]), \"solution\": ex[\"target\"]} for ex in examples]\n", + "\n", + " train = items[:n_train]\n", + " val = items[n_train:n_train + n_val]\n", + " return train, val, allowed\n", + "\n", + "train_set, val_set, allowed_set = load_bbeh_examples(\n", + " BBEH_TASK_NAME,\n", + " n_train=N_TRAIN,\n", + " n_val=N_VAL,\n", + " seed=SEED,\n", + ")\n", + "\n", + "print(f\"Loaded {len(train_set)} train and {len(val_set)} val examples for {BBEH_TASK_NAME}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff5cca61", + "metadata": { + "id": "ff5cca61", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "143c6a5e-5203-4ef8-862f-934d1b6d8a70" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[baseline/val] 01/10 ok=False pred=None :: FAILED: 'None' != '(A)'. Fix the code/prompt to solve similar problems. 
(final answer must be one of: ['(A)', '(B)', '(C)', '(D)', '(E)'])\n", + "[baseline/val] 02/10 ok=True pred=(E) :: SUCCESS: '(E)'\n", + "[baseline/val] 03/10 ok=True pred=(C) :: SUCCESS: '(C)'\n", + "[baseline/val] 04/10 ok=False pred=(C) :: FAILED: '(C)' != '(D)'. Fix the code/prompt to solve similar problems.\n", + "[baseline/val] 05/10 ok=True pred=(B) :: SUCCESS: '(B)'\n", + "[baseline/val] 06/10 ok=False pred=(D) :: FAILED: '(D)' != '(E)'. Fix the code/prompt to solve similar problems.\n", + "[baseline/val] 07/10 ok=False pred=None :: FAILED: 'None' != '(E)'. Fix the code/prompt to solve similar problems. (final answer must be one of: ['(A)', '(B)', '(C)', '(D)', '(E)'])\n", + "[baseline/val] 08/10 ok=False pred=(E) :: FAILED: '(E)' != '(C)'. Fix the code/prompt to solve similar problems.\n", + "[baseline/val] 09/10 ok=False pred=(E) :: FAILED: '(E)' != '(B)'. Fix the code/prompt to solve similar problems.\n" + ] + } + ], + "source": [ + "from typing import List, Dict, Tuple\n", + "\n", + "def run_solver_on_example(ex: dict) -> Tuple[bool, str, str]:\n", + " out = solve_with_PAL_Strategy(ex[\"question\"])\n", + " pred = get_no_node(out.get(\"final_answer\"))\n", + " ok, fb = feedback_answer_bbeh(pred, ex[\"solution\"], allowed_set)\n", + " return ok, str(pred), fb\n", + "\n", + "def evaluate(examples: List[dict], *, name: str) -> float:\n", + " n_ok = 0\n", + " for i, ex in enumerate(examples, 1):\n", + " ok, pred, fb = run_solver_on_example(ex)\n", + " n_ok += int(ok)\n", + " print(f\"[{name}] {i:02d}/{len(examples)} ok={ok} pred={pred} :: {fb}\")\n", + " acc = n_ok / max(1, len(examples))\n", + " print(f\"[{name}] accuracy = {acc:.3f} ({n_ok}/{len(examples)})\")\n", + " return acc\n", + "\n", + "# -----------------------\n", + "# Baseline\n", + "# -----------------------\n", + "baseline_acc = evaluate(val_set, name=\"baseline/val\")\n", + "\n", + "# -----------------------\n", + "# Curriculum training (Mode B)\n", + "# -----------------------\n", + "if 
SKIP_OPTIMIZATION:\n", + " print(\"SKIP_OPTIMIZATION=1 -> skipping optimization/training.\")\n", + "else:\n", + " last_successes: List[dict] = []\n", + "\n", + " for idx, ex in enumerate(train_set, 1):\n", + " ok, pred, fb = run_solver_on_example(ex)\n", + " print(f\"[train] {idx:02d}/{len(train_set)} ok={ok} pred={pred} :: {fb}\")\n", + "\n", + " if ok:\n", + " last_successes.append(ex)\n", + " last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n", + " continue\n", + "\n", + " # Optimize on the failing example, with validation on last successes (curriculum)\n", + " modified, dump_file, history, chosen_state, run_dir = optimize_langgraph(\n", + " graph_root_function=GRAPH_ROOT,\n", + " graph_agents_functions=GRAPH_AGENTS,\n", + " graph_prompts_list=GRAPH_PROMPTS,\n", + " question=ex[\"question\"],\n", + " solution=ex[\"solution\"],\n", + " answer_feedback_func=feedback_answer_bbeh,\n", + " allowed_answer_set=allowed_set,\n", + " validation_set=last_successes,\n", + " accumulation_steps=ACCUMULATION_STEPS,\n", + " retry=LEARNING_RETRY,\n", + " max_attempts=MAX_ATTEMPTS,\n", + " test_optimization=True,\n", + " stop_on_success=True,\n", + " seed=SEED,\n", + " dump_prefix=f\"BBEH_{BBEH_TASK_NAME}__PAL__\",\n", + " output_folder=OUTPUT_FOLDER,\n", + " )\n", + "\n", + " print(\"[train] optimize_langgraph:\", {\"modified\": modified, \"dump_file\": dump_file, \"run_dir\": run_dir})\n", + " if history:\n", + " print(\"[train] last history entry:\", history[-1])\n", + "\n", + " # Re-test the current example after optimization\n", + " ok2, pred2, fb2 = run_solver_on_example(ex)\n", + " print(f\"[train] after-opt ok={ok2} pred={pred2} :: {fb2}\")\n", + "\n", + " if ok2:\n", + " last_successes.append(ex)\n", + " last_successes = last_successes[-VALIDATE_ON_LAST_N:]\n", + "\n", + "# -----------------------\n", + "# Post-training eval\n", + "# -----------------------\n", + "final_acc = evaluate(val_set, name=\"final/val\")\n", + "\n", + "print(\"Summary:\", 
{\"baseline_val_acc\": baseline_acc, \"final_val_acc\": final_acc})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.2" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/T1_PR_diff_reco.diff b/T1_PR_diff_reco.diff new file mode 100644 index 00000000..70a027cb --- /dev/null +++ b/T1_PR_diff_reco.diff @@ -0,0 +1,552 @@ +--- a/opto/trace/io/otel_adapter.py ++++ b/opto/trace/io/otel_adapter.py +@@ -149,6 +149,10 @@ + # prefer temporal parent for data-flow graph. + if not psid or psid in root_span_ids: + effective_psid = prev_span_id ++ ++ # Avoid dangling TGJ parent refs to skipped root invocation spans ++ if effective_psid and effective_psid in root_span_ids: ++ effective_psid = None + + if effective_psid and "parent" not in inputs: + inputs["parent"] = f"{svc}:{effective_psid}" + +--- a/opto/trace/io/langgraph_otel_runtime.py ++++ b/opto/trace/io/langgraph_otel_runtime.py +@@ -334,9 +334,20 @@ + getattr(self.llm, "model", "llm"), + ) + +- resp = self.llm(messages=messages, **llm_kwargs) +- content = resp.choices[0].message.content +- content = self._validate_content(content) ++ try: ++ resp = self.llm(messages=messages, **llm_kwargs) ++ content = resp.choices[0].message.content ++ content = self._validate_content(content) ++ except LLMCallError as e: ++ llm_sp.set_attribute("error", "true") ++ llm_sp.set_attribute("error.type", "LLMCallError") ++ llm_sp.set_attribute("error.message", str(e)[:500]) ++ raise ++ except Exception as exc: ++ llm_sp.set_attribute("error", "true") ++ llm_sp.set_attribute("error.type", type(exc).__name__) ++ llm_sp.set_attribute("error.message", str(exc)[:500]) ++ raise + + llm_sp.set_attribute( + "gen_ai.output.preview", (content or "")[:500] +@@ -345,10 +356,11 @@ + resp = self.llm(messages=messages, **llm_kwargs) + content = 
resp.choices[0].message.content + content = self._validate_content(content) +- except LLMCallError: ++ except LLMCallError as e: + # Record the error on the span and re-raise + sp.set_attribute("error", "true") + sp.set_attribute("error.type", "LLMCallError") ++ sp.set_attribute("error.message", str(e)[:500]) + raise + except Exception as exc: + # Unexpected provider error — record and raise as LLMCallError + sp.set_attribute("error", "true") + sp.set_attribute("error.type", type(exc).__name__) ++ sp.set_attribute("error.message", str(exc)[:500]) + raise LLMCallError(f"LLM provider call failed: {exc}") from exc + + return content + +--- a/opto/trace/io/optimization.py ++++ b/opto/trace/io/optimization.py +@@ -313,12 +313,17 @@ + + eval_fn = eval_fn or _default_eval_fn + ++ # If caller didn't provide output_key, fall back to the graph's configured output_key. ++ if output_key is None: ++ output_key = getattr(graph, "output_key", None) ++ + score_history: List[float] = [] + all_runs: List[List[RunResult]] = [] + best_score = float("-inf") + best_iteration = 0 + best_updates: Dict[str, Any] = {} + best_parameters: Dict[str, Any] = _snapshot_parameters(effective_bindings) ++ last_applied_updates: Dict[str, Any] = {} + + # -- lazy imports for Trace framework -- + _ingest_tgj = None +@@ -366,6 +371,7 @@ + total_iters = iterations + 1 # baseline + N iterations + + for iteration in range(total_iters): + is_baseline = iteration == 0 ++ applied_updates_for_this_iter = dict(last_applied_updates) + label = "baseline" if is_baseline else f"iter_{iteration}" + + runs: List[RunResult] = [] +@@ -504,6 +510,7 @@ + if avg_score > best_score: + best_score = avg_score + best_iteration = iteration + best_parameters = _snapshot_parameters(effective_bindings) ++ best_updates = dict(applied_updates_for_this_iter) + marker = " * NEW BEST" if not is_baseline else "" + logger.info( + "Iteration %d avg_score=%.4f%s", iteration, avg_score, marker +@@ -621,7 +628,7 @@ + if updates and 
apply_updates_flag: + try: + apply_updates(updates, effective_bindings, strict=False) +- best_updates = dict(updates) ++ last_applied_updates = dict(updates) + logger.info("Applied updates: %s", sorted(updates.keys())) + except Exception as exc: + logger.warning("apply_updates failed: %s", exc, exc_info=True) + +--- a/examples/notebooks/01_m1_instrument_and_optimize.ipynb ++++ b/examples/notebooks/01_m1_instrument_and_optimize.ipynb +@@ -8,7 +8,7 @@ + "source": [ + "# M1: Instrument & Optimize a LangGraph Agent\n", + "\n", +- "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mjehanzaib999/NewTrace/blob/feature/M1-instrument-and-optimize/examples/notebooks/01_m1_instrument_and_optimize.ipynb)\n", ++ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AgentOpt/OpenTrace/blob/main/examples/notebooks/01_m1_instrument_and_optimize.ipynb)\n", + "\n", + "This notebook demonstrates the **M1 core value proposition**: drop-in OTEL\n", + "instrumentation and end-to-end optimization for any LangGraph agent.\n", +@@ -93,23 +93,29 @@ + "!pip install -q langgraph>=1.0.0 opentelemetry-api>=1.38.0 opentelemetry-sdk>=1.38.0 \\\n", + " python-dotenv>=1.0.0 requests>=2.28.0 typing_extensions>=4.0.0 graphviz>=0.20.1\n", + "\n", +- "# Install Trace (the project itself) in editable mode\n", +- "# If running on Colab, install from the repo\n", ++ "# Install OpenTrace (the project itself) in editable mode\n", ++ "# If running on Colab, install from the repo and checkout OPENTRACE_REF\n", + "import os\n", + "try:\n", +- " import google.colab\n", ++ " import google.colab # noqa: F401\n", + " IN_COLAB = True\n", +- " if not os.path.exists(\"/content/NewTrace\"):\n", +- " !git clone --branch feature/M1-instrument-and-optimize \\\n", +- " https://github.com/mjehanzaib999/NewTrace.git /content/NewTrace\n", +- " !pip install -q -e /content/NewTrace\n", ++ "\n", 
++ " OPENTRACE_REPO = \"https://github.com/AgentOpt/OpenTrace.git\"\n", ++ " OPENTRACE_REF = os.environ.get(\"OPENTRACE_REF\", \"main\")\n", ++ "\n", ++ " if not os.path.exists(\"/content/OpenTrace\"):\n", ++ " !git clone {OPENTRACE_REPO} /content/OpenTrace\n", ++ " !git -C /content/OpenTrace checkout {OPENTRACE_REF}\n", ++ " !pip install -q -e /content/OpenTrace\n", ++ "\n", ++ " print(f\"[INFO] OpenTrace ref: {OPENTRACE_REF}\")\n", + "except ImportError:\n", + " IN_COLAB = False\n", + " # Assume local dev: project already installed via pip install -e .\n", +@@ -158,15 +164,18 @@ + " import google.colab\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\", force_remount=False)\n", +- " base = \"/content/drive/MyDrive/NewTrace_runs/M1\"\n", ++ "\n", ++ " OPENTRACE_REF = os.environ.get(\"OPENTRACE_REF\", \"main\")\n", ++ " base = f\"/content/drive/MyDrive/OpenTrace_runs/M1/{OPENTRACE_REF}\"\n", + " os.makedirs(base, exist_ok=True)\n", ++ "\n", + " RUN_FOLDER = os.path.join(base, f\"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}\")\n", + " os.makedirs(RUN_FOLDER, exist_ok=True)\n", +- " print(f\"Run folder (Google Drive): {RUN_FOLDER}\")\n", ++ " print(f\"Run folder (Google Drive, OpenTrace): {RUN_FOLDER}\")\n", + "except Exception:\n", + " RUN_FOLDER = os.path.abspath(os.path.join(os.getcwd(), \"notebook_outputs\", \"m1\"))\n", + " os.makedirs(RUN_FOLDER, exist_ok=True)\n", +@@ -306,68 +315,255 @@ + ], + "source": [ + "from typing import Any, Dict, List, Optional\n", +- "from typing_extensions import TypedDict\n", ++ "from typing_extensions import TypedDict, Literal\n", + "from langgraph.graph import StateGraph, START, END\n", +- "\n", ++ "from langgraph.types import Command\n", ++ "\n", ++ "# Keep the notebook aligned with JSON_OTEL_trace_optim_demo_LANGGRAPH.py\n", ++ "DEMO_QUERIES = [\n", ++ " \"Summarize the causes and key events of the French Revolution.\",\n", ++ " \"Give 3 factual relationships about Tesla, Inc. 
with entity IDs.\",\n", ++ " \"What is the Wikidata ID for CRISPR and list 2 related entities?\",\n", ++ "]\n", + "\n", + "class AgentState(TypedDict, total=False):\n", + " query: str\n", +- " plan: str\n", +- " answer: str\n", ++ " plan: Dict[str, Any]\n", ++ " current_step: int\n", ++ " contexts: List[str]\n", ++ " agent_query: str\n", ++ " final_answer: str\n", ++ " eval_score: float\n", ++ " eval_feedback: str\n", + "\n", ++ "def wikipedia_search(query: str) -> str:\n", ++ " \"\"\"Wikipedia tool. Falls back gracefully if wikipedia package/network is unavailable.\"\"\"\n", ++ " try:\n", ++ " import wikipedia\n", ++ " wikipedia.set_lang(\"en\")\n", ++ " hits = wikipedia.search(query, results=2)\n", ++ " out = []\n", ++ " for h in hits:\n", ++ " try:\n", ++ " s = wikipedia.summary(h, sentences=3, auto_suggest=False, redirect=True)\n", ++ " out.append(f\"### {h}\\n{s}\")\n", ++ " except Exception:\n", ++ " continue\n", ++ " return \"\\n\\n\".join(out) or \"No Wikipedia results.\"\n", ++ " except Exception:\n", ++ " return \"Wikipedia search unavailable.\"\n", ++ "\n", ++ "def wikidata_search(query: str) -> str:\n", ++ " \"\"\"Wikidata search tool (wbsearchentities).\"\"\"\n", ++ " import requests\n", ++ " try:\n", ++ " r = requests.get(\n", ++ " \"https://www.wikidata.org/w/api.php\",\n", ++ " params={\n", ++ " \"action\": \"wbsearchentities\",\n", ++ " \"format\": \"json\",\n", ++ " \"language\": \"en\",\n", ++ " \"search\": query[:100],\n", ++ " \"limit\": 5,\n", ++ " },\n", ++ " timeout=10,\n", ++ " )\n", ++ " r.raise_for_status()\n", ++ " data = r.json()\n", ++ " results = [\n", ++ " f\"- {item.get('label','')}: {item.get('description','')} ({item.get('id','')})\"\n", ++ " for item in data.get(\"search\", [])\n", ++ " ]\n", ++ " return \"\\n\".join(results) if results else \"No Wikidata entities found.\"\n", ++ " except Exception:\n", ++ " return f\"Wikidata search unavailable. 
Query: {query[:50]}...\"\n", + "\n", +- "def build_graph(tracing_llm, templates: Dict[str, str]) -> StateGraph:\n", +- " \"\"\"Build a 2-node LangGraph (planner → synthesizer).\"\"\"\n", ++ "def build_graph(tracing_llm, templates: Dict[str, str]):\n", ++ " \"\"\"\n", ++ " Build a multi-node LangGraph aligned with JSON_OTEL_trace_optim_demo_LANGGRAPH.py:\n", ++ " planner -> executor -> (web_researcher|wikidata_researcher|synthesizer) -> evaluator\n", ++ " \"\"\"\n", + "\n", +- " def planner_node(state: AgentState) -> Dict[str, Any]:\n", +- " template = templates.get(\n", +- " \"planner_prompt\",\n", +- " \"Create a concise plan for: {query}\",\n", +- " )\n", +- " prompt = template.replace(\"{query}\", state.get(\"query\", \"\"))\n", +- " response = tracing_llm.node_call(\n", +- " span_name=\"planner\",\n", +- " template_name=\"planner_prompt\",\n", +- " template=template,\n", +- " optimizable_key=\"planner\",\n", +- " messages=[\n", +- " {\"role\": \"system\", \"content\": \"You are a planning agent. Output a 3-step plan.\"},\n", +- " {\"role\": \"user\", \"content\": prompt},\n", +- " ],\n", +- " )\n", +- " return {\"plan\": response}\n", ++ " def planner_node(state: AgentState) -> Command[Literal[\"executor\"]]:\n", ++ " template = templates.get(\n", ++ " \"planner_prompt\",\n", ++ " \"Return JSON plan with steps for query: {query}. Use agents: web_researcher, wikidata_researcher, synthesizer.\",\n", ++ " )\n", ++ " prompt = template.replace(\"{query}\", state.get(\"query\", \"\"))\n", ++ "\n", ++ " raw = tracing_llm.node_call(\n", ++ " span_name=\"planner\",\n", ++ " template_name=\"planner_prompt\",\n", ++ " template=template,\n", ++ " optimizable_key=\"planner\",\n", ++ " user_query=state.get(\"query\", \"\"),\n", ++ " extra_inputs={\"user_query\": state.get(\"query\", \"\")},\n", ++ " messages=[\n", ++ " {\"role\": \"system\", \"content\": \"Return JSON only. Keys: 1,2,... 
each step has {agent,action,goal,query}.\"},\n", ++ " {\"role\": \"user\", \"content\": prompt},\n", ++ " ],\n", ++ " max_tokens=400,\n", ++ " temperature=0,\n", ++ " )\n", ++ " plan: Dict[str, Any]\n", ++ " try:\n", ++ " import json\n", ++ " plan = json.loads(raw)\n", ++ " except Exception:\n", ++ " q = (state.get(\"query\", \"\") or \"\").lower()\n", ++ " plan = {\n", ++ " \"1\": {\"agent\": \"web_researcher\", \"action\": \"search\", \"goal\": \"collect context\", \"query\": state.get(\"query\", \"\")},\n", ++ " \"2\": {\"agent\": \"wikidata_researcher\" if (\"wikidata\" in q or \"entity id\" in q or \"id\" in q) else \"synthesizer\",\n", ++ " \"action\": \"search\" if (\"wikidata\" in q or \"entity id\" in q or \"id\" in q) else \"answer\",\n", ++ " \"goal\": \"entities or final answer\", \"query\": state.get(\"query\", \"\")},\n", ++ " \"3\": {\"agent\": \"synthesizer\", \"action\": \"answer\", \"goal\": \"final answer\", \"query\": state.get(\"query\", \"\")},\n", ++ " }\n", ++ "\n", ++ " return Command(update={\"plan\": plan, \"current_step\": 1, \"contexts\": []}, goto=\"executor\")\n", + "\n", +- " def synthesizer_node(state: AgentState) -> Dict[str, Any]:\n", +- " template = templates.get(\n", +- " \"synthesizer_prompt\",\n", +- " \"Synthesize an answer for: {query}\\nPlan: {plan}\",\n", +- " )\n", +- " prompt = (\n", +- " template\n", +- " .replace(\"{query}\", state.get(\"query\", \"\"))\n", +- " .replace(\"{plan}\", state.get(\"plan\", \"\"))\n", +- " )\n", +- " response = tracing_llm.node_call(\n", +- " span_name=\"synthesizer\",\n", +- " template_name=\"synthesizer_prompt\",\n", +- " template=template,\n", +- " optimizable_key=\"synthesizer\",\n", +- " messages=[\n", +- " {\"role\": \"system\", \"content\": \"You are a synthesis agent. 
Give a concise answer.\"},\n", +- " {\"role\": \"user\", \"content\": prompt},\n", +- " ],\n", +- " )\n", +- " return {\"answer\": response}\n", ++ " def executor_node(state: AgentState) -> Command[Literal[\"web_researcher\", \"wikidata_researcher\", \"synthesizer\"]]:\n", ++ " step = int(state.get(\"current_step\", 1) or 1)\n", ++ " plan = state.get(\"plan\", {}) or {}\n", ++ " plan_step = plan.get(str(step), {})\n", ++ " if not plan_step:\n", ++ " return Command(update={}, goto=\"synthesizer\")\n", ++ "\n", ++ " template = templates.get(\n", ++ " \"executor_prompt\",\n", ++ " \"Given step {step} of plan: {plan_step}\\nFor query: {query}\\nReturn JSON: {goto,query}. goto in [web_researcher,wikidata_researcher,synthesizer].\",\n", ++ " )\n", ++ " prompt = (\n", ++ " template.replace(\"{step}\", str(step))\n", ++ " .replace(\"{plan_step}\", str(plan_step))\n", ++ " .replace(\"{query}\", state.get(\"query\", \"\"))\n", ++ " )\n", ++ "\n", ++ " raw = tracing_llm.node_call(\n", ++ " span_name=\"executor\",\n", ++ " template_name=\"executor_prompt\",\n", ++ " template=template,\n", ++ " optimizable_key=\"executor\",\n", ++ " user_query=state.get(\"query\", \"\"),\n", ++ " extra_inputs={\"step\": str(step), \"user_query\": state.get(\"query\", \"\")},\n", ++ " messages=[\n", ++ " {\"role\": \"system\", \"content\": \"Return JSON only with keys goto and query.\"},\n", ++ " {\"role\": \"user\", \"content\": prompt},\n", ++ " ],\n", ++ " max_tokens=200,\n", ++ " temperature=0,\n", ++ " )\n", ++ "\n", ++ " goto = str(plan_step.get(\"agent\", \"synthesizer\"))\n", ++ " q2 = str(plan_step.get(\"query\", state.get(\"query\", \"\")))\n", ++ " try:\n", ++ " import json\n", ++ " d = json.loads(raw)\n", ++ " goto = str(d.get(\"goto\", goto))\n", ++ " q2 = str(d.get(\"query\", q2))\n", ++ " except Exception:\n", ++ " pass\n", ++ "\n", ++ " if goto not in (\"web_researcher\", \"wikidata_researcher\", \"synthesizer\"):\n", ++ " goto = \"synthesizer\"\n", ++ "\n", ++ " return 
Command(update={\"agent_query\": q2}, goto=goto)\n", ++ "\n", ++ " def web_researcher_node(state: AgentState) -> Command[Literal[\"executor\"]]:\n", ++ " q = state.get(\"agent_query\", state.get(\"query\", \"\"))\n", ++ " with tracing_llm.tracer.start_as_current_span(\"web_researcher\") as sp:\n", ++ " sp.set_attribute(\"inputs.user_query\", state.get(\"query\", \"\"))\n", ++ " sp.set_attribute(\"inputs.agent_query\", q)\n", ++ " ctx = wikipedia_search(q)\n", ++ " sp.set_attribute(\"outputs.context.preview\", (ctx or \"\")[:500])\n", ++ " contexts = list(state.get(\"contexts\", []) or [])\n", ++ " contexts.append(ctx)\n", ++ " step = int(state.get(\"current_step\", 1) or 1) + 1\n", ++ " return Command(update={\"contexts\": contexts, \"current_step\": step}, goto=\"executor\")\n", ++ "\n", ++ " def wikidata_researcher_node(state: AgentState) -> Command[Literal[\"executor\"]]:\n", ++ " q = state.get(\"agent_query\", state.get(\"query\", \"\"))\n", ++ " with tracing_llm.tracer.start_as_current_span(\"wikidata_researcher\") as sp:\n", ++ " sp.set_attribute(\"inputs.user_query\", state.get(\"query\", \"\"))\n", ++ " sp.set_attribute(\"inputs.agent_query\", q)\n", ++ " ctx = wikidata_search(q)\n", ++ " sp.set_attribute(\"outputs.context.preview\", (ctx or \"\")[:500])\n", ++ " contexts = list(state.get(\"contexts\", []) or [])\n", ++ " contexts.append(ctx)\n", ++ " step = int(state.get(\"current_step\", 1) or 1) + 1\n", ++ " return Command(update={\"contexts\": contexts, \"current_step\": step}, goto=\"executor\")\n", ++ "\n", ++ " def synthesizer_node(state: AgentState) -> Command[Literal[\"evaluator\"]]:\n", ++ " template = templates.get(\n", ++ " \"synthesizer_prompt\",\n", ++ " \"Answer the query: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include them. 
Be factual.\",\n", ++ " )\n", ++ " contexts = \"\\n\\n\".join(state.get(\"contexts\", []) or [])\n", ++ " prompt = template.replace(\"{query}\", state.get(\"query\", \"\")).replace(\"{contexts}\", contexts[:4000])\n", ++ "\n", ++ " ans = tracing_llm.node_call(\n", ++ " span_name=\"synthesizer\",\n", ++ " template_name=\"synthesizer_prompt\",\n", ++ " template=template,\n", ++ " optimizable_key=\"synthesizer\",\n", ++ " user_query=state.get(\"query\", \"\"),\n", ++ " extra_inputs={\"user_query\": state.get(\"query\", \"\")},\n", ++ " messages=[\n", ++ " {\"role\": \"system\", \"content\": \"You are a careful assistant.\"},\n", ++ " {\"role\": \"user\", \"content\": prompt},\n", ++ " ],\n", ++ " max_tokens=500,\n", ++ " temperature=0,\n", ++ " )\n", ++ " return Command(update={\"final_answer\": ans}, goto=\"evaluator\")\n", ++ "\n", ++ " def evaluator_node(state: AgentState) -> Command[Literal[\"__end__\"]]:\n", ++ " import re\n", ++ " q = (state.get(\"query\", \"\") or \"\").lower()\n", ++ " ans = (state.get(\"final_answer\", \"\") or \"\")\n", ++ " ctx = \"\\n\".join(state.get(\"contexts\", []) or \"\")\n", ++ " wants_ids = (\"wikidata\" in q) or (\"entity id\" in q) or (\"id\" in q and \"tesla\" in q)\n", ++ " has_qid = bool(re.search(r\"\\bQ\\d{2,}\\b\", ans))\n", ++ " uses_ctx = len(ctx.strip()) > 0\n", ++ " score = 0.25\n", ++ " reasons = []\n", ++ " if uses_ctx:\n", ++ " score += 0.25\n", ++ " reasons.append(\"uses_context\")\n", ++ " if wants_ids and has_qid:\n", ++ " score += 0.35\n", ++ " reasons.append(\"has_qids\")\n", ++ " if \"french revolution\" in q and len(ans) > 200:\n", ++ " score += 0.10\n", ++ " reasons.append(\"sufficient_detail\")\n", ++ " score = min(score, 0.95)\n", ++ " feedback = {\"score\": score, \"reasons\": reasons}\n", ++ " with tracing_llm.tracer.start_as_current_span(\"evaluator\") as sp:\n", ++ " sp.set_attribute(\"eval.score\", str(score))\n", ++ " sp.set_attribute(\"eval.reasons\", \",\".join(reasons))\n", ++ " return 
Command(update={\"eval_score\": score, \"eval_feedback\": str(feedback)}, goto=END)\n", + "\n", +- " graph = StateGraph(AgentState)\n", +- " graph.add_node(\"planner\", planner_node)\n", +- " graph.add_node(\"synthesizer\", synthesizer_node)\n", +- " graph.add_edge(START, \"plann\n", ++ " workflow = StateGraph(AgentState)\n", ++ " workflow.add_node(\"planner\", planner_node)\n", ++ " workflow.add_node(\"executor\", executor_node)\n", ++ " workflow.add_node(\"web_researcher\", web_researcher_node)\n", ++ " workflow.add_node(\"wikidata_researcher\", wikidata_researcher_node)\n", ++ " workflow.add_node(\"synthesizer\", synthesizer_node)\n", ++ " workflow.add_node(\"evaluator\", evaluator_node)\n", ++ "\n", ++ " workflow.add_edge(START, \"planner\")\n", ++ " workflow.add_edge(\"synthesizer\", \"evaluator\")\n", ++ "\n", ++ " return workflow.compile()\n", + "\n" + ] + }, +@@ -474,17 +678,19 @@ + "from opto.trace.io import instrument_graph, apply_updates\n", + "\n", + "INITIAL_TEMPLATES = {\n", +- " \"planner_prompt\": \"Create a thorough, step-by-step JSON plan for: {query}. Use web_researcher, wikidata_researcher, synthesizer.\",\n", ++ " \"planner_prompt\": \"Create a JSON plan for: {query}. Use web_researcher and synthesizer; include wikidata_researcher if IDs are requested.\",\n", + " \"executor_prompt\": \"Given step {step} of plan: {plan_step} for query: {query}. 
Return JSON {goto,query}.\",\n", + " \"synthesizer_prompt\": \"Answer: {query}\\nContext:\\n{contexts}\\nIf asked for IDs, include Wikidata QIDs.\",\n", + "}\n", + "\n", + "ig = instrument_graph(\n", + " graph=None,\n", + " service_name=\"m1-notebook\",\n", +- " trainable_keys={\"planner\", \"executor\", \"synthesizer\"},\n", ++ " trainable_keys={\"planner\", \"executor\", \"synthesizer\"},\n", + " llm=StubLLM(),\n", + " initial_templates=INITIAL_TEMPLATES,\n", + " emit_genai_child_spans=True,\n", +@@ -768,20 +974,20 @@ + "# ---- Eval_fn: prefer evaluator score produced by the graph; fallback to structure scoring ----\n", + "def stub_eval_fn(payload):\n", + " result = payload.get(\"result\") or {}\n", +- " ans = str(payload.get(\"answer\", \"\") or \"\")\n", ++ " ans = str(payload.get(\"answer\", \"\") or \"\")\n", + " if ans.strip().startswith(\"[ERROR]\") or not ans.strip():\n", + " return EvalResult(score=0.0, feedback=\"LLM failure/empty answer\")\n", + "\n", +- " if isinstance(result, dict) and \"eval_score\" in result:\n", +- " try:\n", +- " s = float(result.get(\"eval_score\") or 0.0)\n", +- " return EvalResult(score=s, feedback=str(result.get(\"eval_feedback\",\"\"))[:500])\n", +- " except Exception:\n", +- " pass\n", +- "\n", +- " q = str(payload.get(\"query\",\"\") or \"\").lower()\n", +- " wants_ids = (\"wikidata\" in q) or (\"entity id\" in q) or (\"tesla\" in q and \"id\" in q)\n", +- " has_qid = (\"q\" in ans.lower()) and any(ch.isdigit() for ch in ans)\n", +- " score = 0.25 + (0.45 if wants_ids and has_qid else 0.0) + min(len(ans)/800.0, 0.25)\n", +- " score = min(score, 0.95)\n", +- " return EvalResult(score=score, feedback=f\"fallback score={score:.2f}\")\n", ++ " plan = {}\n", ++ " if isinstance(result, dict):\n", ++ " plan = result.get(\"plan\", {}) or {}\n", ++ " plan_steps = len(list(plan.keys())) if isinstance(plan, dict) else 0\n", ++ "\n", ++ " # Score: base + reward plan richness (up to 3 steps) + small reward for length\n", ++ " score = 
0.2 + 0.2 * min(plan_steps, 3) + min(len(ans) / 1200.0, 0.15)\n", ++ " score = min(score, 0.95)\n", ++ " return EvalResult(score=score, feedback=f\"plan_steps={plan_steps}, score={score:.2f}\")\n", + "\n", + "print(\"Mock optimizer and eval_fn ready.\")\n", + "\n" + ] + }, +@@ -903,18 +1109,24 @@ + " if resp.status_code != 200:\n", + " raise RuntimeError(f\"OpenRouter HTTP {resp.status_code}: {resp.text[:1000]}\")\n", + " data = resp.json()\n", + " return self._wrap(data)\n", +@@ -1027,6 +1239,10 @@ + " live_ig = instrument_graph(\n", + " graph=None,\n", + " service_name=\"m1-live\",\n", + " trainable_keys={\"planner\", \"executor\", \"synthesizer\"},\n", + " llm=live_llm,\n", + " initial_templates=live_templates,\n", + " emit_genai_child_spans=True,\n", ++ " provider_name=\"openrouter\",\n", ++ " llm_span_name=\"openrouter.chat.completion\",\n", + " )\n", +@@ -1054,6 +1270,18 @@ + " if LIVE_OK:\n", + " live_otlp = live_ig.session.flush_otlp(clear=True)\n", + " live_spans = live_otlp[\"resourceSpans\"][0][\"scopeSpans\"][0][\"spans\"]\n", + " trace_ids = {sp.get(\"traceId\") for sp in live_spans}\n", + " has_root = any(str(sp.get(\"name\",\"\")) .endswith(\".invoke\") for sp in live_spans)\n", + " print(f\"\\nSpans captured: {len(live_spans)} unique_trace_ids={len(trace_ids)} has_root_invoke={has_root}\")\n", + " if len(trace_ids) != 1 or not has_root:\n", + " raise RuntimeError(\"Trace invariants failed: expected single traceId and root *.invoke span.\")\n", + "\n", + " print(\"\\n[OK] Live LLM trace validated (HTTP 200 + non-empty answer + trace invariants).\")\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 + } diff --git a/docs/OTEL_Graph_Optim_Draft_Feedback_analysis.md b/docs/OTEL_Graph_Optim_Draft_Feedback_analysis.md new file mode 100644 index 00000000..aad61d20 --- /dev/null +++ b/docs/OTEL_Graph_Optim_Draft_Feedback_analysis.md @@ -0,0 +1,238 @@ +## 1) What “good M0” means for this 
job (non-negotiable deliverable shape) + +Milestone 0 is not “some code that runs”. It’s a **design contract** that makes M1–M3 mechanical and reviewable: + +### M0 must include (minimum) + +1. **Boilerplate inventory** (from the existing demo): list the exact blocks to eliminate and where they move (runtime init, exporter setup, node spans, OTLP flush, OTLP→TGJ conversion, diff dumps, optimizer loop, result summaries). +2. **Public API signatures** (exact function/class signatures) for: + + * `instrument_graph(...)` + * LLM/tool wrappers (auto span emission) + * `optimize_langgraph(...)` or `LangGraphOptimizer.run(...)` + * `TelemetrySession` / `UnifiedTelemetry` (OTEL + MLflow) +3. **A genericity statement**: “works for any LangGraph graph”, and what “any” means (sync/async nodes? streaming? retries? tools? subgraphs?). +4. **A telemetry coverage plan**: how spans/metrics/artifacts flow across **nodes + LLM + tools + optimizers + trainers** into OTEL and into MLflow. +5. **A deterministic testing plan** (StubLLM mode), including what is asserted in pytest. +6. **A notebook plan** for M1/M2/M3: minimal code path, no secrets committed, “Open in Colab” badge, persistent artifacts. + +--- + +## 2) Your key concern is correct: the optimization API must not be demo-specific + +Your “planner / researcher / synthesizer / evaluator” graph is just a sample. The API needs to be framed around **LangGraph as a graph runtime**, not around that single graph’s roles. + +The M0 doc must explicitly answer: + +### What is the abstraction boundary? + +There are really only two robust patterns (he should pick one, and justify): + +#### Approach A — Node wrapper / decorator instrumentation (usually most reliable) + +* Wrap each node callable with `@trace_node(...)` or `trace_node(fn, ...)`. +* Pros: works even if nodes aren’t LangChain “runnables”; consistent spans. +* Cons: requires touching node registration; but can still be “minimal change”. 
+ +#### Approach B — Callback-based instrumentation (lowest code change, but not always complete) + +LangChain / LangGraph expose a callback system intended for monitoring/logging. In LangChain docs, callbacks are explicitly positioned for observability side effects. ([reference.langchain.com][1]) + +* Pros: can be “one-liner” when supported (pass a callback handler to the compiled graph). +* Cons: many graphs won’t emit enough callback events unless nodes are implemented as LangChain components; and mixing callbacks with streaming has known foot-guns in practice. + +**M0 must pick A or B (or hybrid):** + +* Hybrid is common: callbacks for LLM/tool calls; node wrappers for node spans. + +--- + +## 3) Boilerplate reduction must be shown as a “before/after” (table + diff) + +You’re right to demand a “code before vs after” view. This is the *developer adoption* metric. Agent Lightning’s positioning (“almost zero code changes”) is exactly the framing you want to compete with. ([GitHub][2]) + +Below is a **ChatGPT-generated example** table he can paste into README (replace names with your actual APIs). This is not a claim about your repo; it’s a template. + +### Example “Before vs After” table (template) + +| Aspect | Before (manual demo) | After (proposed API) | +| -------------------------- | ---------------------------------------------------------- | ------------------------------------------------------- | +| OTEL init/exporter | manual tracer/provider/exporter wiring in every script | `session = TelemetrySession(...); session.start()` | +| Node spans | `with tracer.start_as_current_span("node"):` everywhere | `instrument_graph(graph, session, ...)` | +| LLM spans + prompt capture | manually `set_attribute("inputs.gen_ai.prompt", ...)` etc. 
| `llm = TracingLLM(base_llm, session)` (auto `gen_ai.*`) | +| OTLP flush | manual exporter flush | `session.flush_otlp()` | +| OTLP→TGJ | manual conversion calls | `optimize_langgraph(..., session=session)` | +| Apply updates | custom patching | `PatchApplier.apply(update, targets=...)` | +| Artifacts | ad-hoc json dumps | `RunArtifacts.write_run(...)` standard layout | + +### Example unified diff snippet (template) + +```diff +- tracer, exporter = init_otel_exporter(...) +- graph = build_graph(llm) +- for x in dataset: +- with tracer.start_as_current_span("planner") as sp: +- sp.set_attribute("inputs.gen_ai.prompt", prompt) +- out = llm(prompt) +- otlp = flush(exporter) +- tgj = otlp_to_tgj(otlp) +- upd = optimizer.step(tgj, scores) +- apply_updates(graph, upd) ++ session = TelemetrySession(project="langgraph-demo", mode="stub") ++ llm = TracingLLM(base_llm, session=session) ++ graph = build_graph(llm) ++ graph = instrument_graph(graph, session=session, optimizable=Optimizable(nodes="*")) ++ result = optimize_langgraph(graph, dataset, optimizer="OptoPrimeV2", session=session) +``` + +If his M0 doesn’t include something like this, he’s not meeting the “boilerplate reduction is top success metric” requirement. + +--- + +## 4) The API surface must be specified as a matrix of optimization “cases” + +You requested a table of “all the API in different cases of optimization” (prompts vs code vs params, selection, observability tuning). This is exactly what you need to force now, because otherwise he’ll implement only what the demo uses. + +Here is a concrete matrix he should include in M0. + +### API matrix (what must exist / be planned) + +| Use case | What is optimizable? 
| How dev selects targets | Required API | What is persisted | +| -------------------------- | ---------------------- | ------------------------------------------------- | --------------------------------------------------- | ----------------------------------------------- | +| Trace-only instrumentation | nothing | n/a | `instrument_graph(...)` | OTLP traces + minimal run metadata | +| Prompt optimization | prompt templates | `nodes=[...]` or `tags=[...]` or `selector=regex` | `TrainablePrompt("key")`, `optimize_langgraph(...)` | OTLP + TGJ + prompt patch/diff + summary | +| Code optimization | node code blocks | `code_nodes=[...]` | `TrainableCode(fn)` + patch applier | OTLP + TGJ + code patch + before/after snapshot | +| Hyperparam optimization | graph/node params | `param_keys=[...]` | `TrainableParam("k")` | param update log + config snapshot | +| Partial graph optimization | subset only | `selector` (node names/tags) | `Optimizable(selector=...)` | includes “skipped nodes” rationale | +| Observability “lite” | minimal spans | `capture_state=False` | `InstrumentOptions(capture=...)` | small artifacts, safe defaults | +| Observability “debug” | state I/O + truncation | `state_keys=[...]` | `CapturePolicy(truncate=..., redact=...)` | large artifacts, deterministic truncation | + +This should be in his M0 doc. If it isn’t, ask him to add it. + +--- + +## 5) OTEL semantics: define what attributes/spans you emit, and why + +This job is explicitly OTEL-first. He should anchor the design to the emerging OpenTelemetry GenAI semantic conventions (even if you store some data as artifacts for size). OpenTelemetry defines GenAI spans and related conventions (status is still evolving, but it’s the right direction). 
([OpenTelemetry][3]) + +### What to insist on in M0 + +* **Node span contract** (what attributes are always present): + + * `graph.id`, `node.name`, `node.type` + * `param.*` (Trace optimization keys) + * `inputs.*` / `outputs.*` (with truncation rules) + * error fields (exception, status) +* **LLM span contract**: + + * a dedicated child “LLM call” span is the cleanest separation + * populate `gen_ai.*` keys per OpenTelemetry conventions where feasible ([OpenTelemetry][3]) + * put full prompt/response in **artifacts**, not span attributes, if size is large (and store only hashes/short previews in attributes) + +### Agent Lightning compatibility (optional but should be planned cleanly) + +If you keep the optional “Agent Lightning semconv compatibility”, his plan must reflect the actual documented conventions: + +* Rewards are dedicated spans named `agentlightning.annotation` ([microsoft.github.io][4]) +* Reward keys use the `agentlightning.reward` prefix; example `agentlightning.reward.0.value` ([microsoft.github.io][5]) +* `emit_reward`/`emit_annotation` exist as the conceptual model (even if you won’t depend on the library) ([microsoft.github.io][6]) + +So in M0 he should decide: + +* Do we emit those spans/attrs **always**, or behind a flag? +* If we emit child spans, how do we ensure TGJ conversion doesn’t break ordering (your “temporal_ignore” idea is sensible; if he adopts it, it must be explicitly in the M0 design). + +--- + +## 6) Telemetry unification: he must show a plan for trainers + optimizers + nodes + +Your note is correct: if his work plan doesn’t explicitly cover “how telemetry is initiated and wired across all components,” he will miss M2. + +### What to demand in M0: a concrete telemetry table + +Below is the table you asked for (template; he should fill exact modules). 
+ +| Component | Today | Target telemetry hook | OTEL output | MLflow output | +| ---------------------------------- | ------------ | ---------------------------------------------------- | -------------------------------------------- | ------------------------------------------------- | +| LangGraph node execution | ad-hoc spans | `instrument_graph()` wraps nodes OR callback handler | spans per node | link run_id + store summary as artifact | +| LLM calls inside nodes | manual attrs | `TracingLLM` wrapper (child spans) | `gen_ai.*` spans/events ([OpenTelemetry][3]) | log token/cost metrics; save prompts as artifacts | +| Tool calls | inconsistent | `TracingTool` wrapper | span per tool call | metrics + tool error artifacts | +| Optimizer logs (e.g., summary_log) | in-memory | `TelemetrySession.log_event/artifact` adapter | events or span events | artifacts (jsonl), aggregate metrics | +| Trainer metrics via BaseLogger | fragmented | `BaseLogger → UnifiedTelemetry` adapter | metrics (optional) | `mlflow.log_metric` series | +| Run metadata | scattered | `TelemetrySession(run_id, iteration_id, step)` | resource attrs | params/tags + run dir artifact | + +**MLflow thread-safety must be addressed explicitly**: MLflow’s fluent API is not thread-safe; concurrent callers must use mutual exclusion, or use the lower-level client API. ([MLflow][7]) +So M0 must state one of: + +* “single-thread logging only (v1)” **or** +* “we use an internal lock for mlflow logging calls” **or** +* “we route all MLflow logging through `MlflowClient` in a single worker thread” + +### Also: don’t over-assume MLflow auto-tracing will cover LangGraph + +There are known gaps/issues around tracing LangGraph top-level calls with some autologging approaches. ([GitHub][8]) +So his plan should not hinge on “just turn on mlflow autolog and it traces the graph”. 
+ +--- + +## 7) Tests: what M0 must commit to (StubLLM + deterministic assertions) + +He must specify exactly what tests will exist, not just “we’ll add tests”. + +Minimum pytest plan: + +1. **Unit**: `instrument_graph` produces spans with required attributes for: + + * normal node completion + * node exceptions (status) + * truncation/redaction rules +2. **Unit**: wrapper LLM emits `gen_ai.*` keys (and doesn’t crash on non-JSONable attrs) ([OpenTelemetry][3]) +3. **Integration (StubLLM)**: full loop: + + * run graph on 2–3 inputs + * flush OTLP + * convert OTLP→TGJ + * optimizer produces an update (even if toy) + * apply update + * rerun shows changed prompt/code snapshot +4. **Integration (MLflow local file store)**: + + * start run + * log a metric + artifact + * verify artifact exists in store + * ensure no keys required + +--- + +## 8) Notebook notes (add these at the end of your feedback, per your request) + +Even without seeing his notebook, the acceptance requirements are clear: + +* Good that he sent a notebook already executed (so you can inspect outputs). Keep that. +* Once it’s in GitHub, the notebook must: + + 1. Include an **“Open in Colab” badge** at the top. + 2. Use **Colab Secrets** / environment injection for API keys (avoid passing keys as parameters). + 3. Auto-save run artifacts to **Google Drive** (or a stable persistent path) to avoid losing long results on runtime reset. + 4. Print the **artifact folder path** at the end (so reviewers can find outputs quickly). + 5. Provide a clear **StubLLM path** that always runs in <5–10 minutes. + +(You can reuse the same Drive helper pattern you used in the Trace‑Bench feedback.) + + +--- + +## Bottom line + +For tomorrow’s meeting, you want to be able to decide in 5–10 minutes whether his M0 is “approval-worthy”. The gating signal is: **does the doc make M1 implementation obvious and generic, with the before/after diff, API matrix, telemetry matrix, and explicit tests/notebooks plan**. 
+ +If you paste or upload his actual M0 README + notebook here later, I can add file-specific comments (naming, module layout, missing knobs, security issues, etc.). + +[1]: https://reference.langchain.com/python/langchain_core/callbacks/?utm_source=chatgpt.com "Callbacks | LangChain Reference" +[2]: https://github.com/microsoft/agent-lightning "GitHub - microsoft/agent-lightning: The absolute trainer to light up AI agents." +[3]: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/?utm_source=chatgpt.com "Semantic conventions for generative client AI spans" +[4]: https://microsoft.github.io/agent-lightning/latest/tutorials/traces/?utm_source=chatgpt.com "Work with Traces - Agent-lightning" +[5]: https://microsoft.github.io/agent-lightning/stable/reference/semconv/?utm_source=chatgpt.com "Semantic Conventions - Agent-lightning" +[6]: https://microsoft.github.io/agent-lightning/latest/reference/agent/?utm_source=chatgpt.com "Agent-lightning" +[7]: https://mlflow.org/docs/latest/python_api/mlflow.html?utm_source=chatgpt.com "module provides a high-level “fluent” API for starting and ..." +[8]: https://github.com/mlflow/mlflow/issues/12798?utm_source=chatgpt.com "[FR] Tracing for Langchain's Runnable.astream_events ..." diff --git a/docs/PossibleStategyForAPIForOptimizationDemo.md b/docs/PossibleStategyForAPIForOptimizationDemo.md new file mode 100644 index 00000000..5c72b426 --- /dev/null +++ b/docs/PossibleStategyForAPIForOptimizationDemo.md @@ -0,0 +1,719 @@ +Got it — we’ll implement **Strategy 2 (Trace-first, dual semconv)** so that your LangGraph demo emits OTEL spans that are: + +* **Agent Lightning–compatible** (via `gen_ai.*` + `agentlightning.reward.*` spans) ([microsoft.github.io][1]) +* **Still convertible to TGJ** using your existing `otlp_traces_to_trace_json` adapter (and still optimizable by Trace). + +Below I provide: + +1. A **side-by-side API comparison table** (New API vs legacy demo vs design3_4 vs Agent Lightning) +2. 
**Colored code diffs** (Agent Lightning “ExampleAgentLightning.svg” vs New API, and Deep Research agent demo vs New API)
+3. A **single unified git diff** against your current codebase (`Trace_main_code.txt` aggregated)
+4. A **short explanation you can send to a colleague**
+
+---
+
+## 1) “Per API element” table (New Strategy 2 vs legacy vs design3_4 vs Agent Lightning)
+
+> Goal: show *where* the code lives and *what it looks like*, for each “API element”.
+
+Legend:
+
+* ✅ built-in / intended
+* 🟡 possible but manual
+* ❌ not present
+
+| API element | **New API (Strategy 2)** | **Legacy demo** `JSON_OTEL_trace_optim_demo_LANGGRAPH.py` | **design3_4 demo** `...DESIGN3_4.py` | **Agent Lightning** |
+| --- | --- | --- | --- | --- |
+| Tracer + exporter init | `init_otel_runtime()` (Trace IO runtime) | Inline OTEL exporter + provider in demo | `init_otel_runtime()` from runtime and rebinding base tracer | Uses OTEL tracer/processor infrastructure; you write spans normally ([microsoft.github.io][2]) |
+| Node span creation | Node functions use `TRACER.start_as_current_span("node")` *or* `TracingLLM.node_call(span_name="planner", ...)` | Manual `TRACER.start_as_current_span(...)` all over nodes | Base nodes call `TRACING_LLM.node_call(...)` (Design 3) | `@rollout` creates “agent rollout” spans, plus normal OTEL spans ([microsoft.github.io][3]) |
+| Prompt parameter capture (Trace optimization) | **Still**: `param.<key>` + `param.<key>.trainable` on node span (same as today) | Manual `sp.set_attribute("param.*", ...)` per
node | Centralized in `TracingLLM._record_llm_call()` in runtime (Design 3) | Uses **resources** / configs for prompt templates; prompt templates are trainable resources ([GitHub][4]) |
+| LLM tracing (fine-grained, AL-compatible) | `TracingLLM.node_call()` automatically emits **child span** named `openai.chat.completion` carrying `gen_ai.*` | LLM call happens inside node span; only `gen_ai.model` + `inputs.gen_ai.prompt` manually (non-standard) | Uses runtime `TracingLLM` but previously did not guarantee `gen_ai.*`; we’ll add it | Auto instrumentation/proxy creates spans like `openai.chat.completion` and training extracts from `gen_ai.*` ([microsoft.github.io][5]) |
+| **Problem**: temporal hierarchy TGJ conversion | With child spans, you must avoid “child span becomes prev span” (we’ll fix with `trace.temporal_ignore`) | No child spans → not an issue | Not previously emitting child gen-ai spans → not an issue | Not TGJ-based; they store spans with their own sequencing logic ([microsoft.github.io][2]) |
+| Evaluation extraction for optimization | `extract_eval_metrics_from_otlp()` stays (Design 4) and becomes type-robust | Ad-hoc parser loop over OTLP spans | Uses `extract_eval_metrics_from_otlp()` already | Uses reward/annotation emitters like `emit_reward()` ([microsoft.github.io][6]) |
+| Reward emission (AL-compatible) | Evaluator emits **child span** `agentlightning.annotation` with `agentlightning.reward.0.value` | Only `eval.score` attribute (no AL reward span) | Previously only Trace eval attributes (we’ll add AL reward emission in SPANOUTNODE) | `emit_reward(value: float)` creates reward spans (wrapper around annotation) ([microsoft.github.io][6]) |
+| “One-liner” set attributes | `set_span_attributes(span, {...})` helper (new) | manual `sp.set_attribute()` repeated | runtime already centralized + we add helper | `emit_annotation({...})` ([microsoft.github.io][6]) |
+| Optimization loop | unchanged: `optimize_iteration(runs, ...)` and TGJ conversion via `otlp_traces_to_trace_json` | same | same
(design34 calls base’s `optimize_iteration`) | Training loop is RL/APO/SFT (Trainer) rather than “patch prompts/code” ([microsoft.github.io][3]) | + +--- + +## 2) Colored code comparisons (Agent Lightning vs New API, and Deep Research demo vs New API) + +### 2.A Agent Lightning “reference example” (from docs + your SVG) vs New API + +Agent Lightning’s docs show: write an agent (often `@rollout`) and emit rewards via emitters; training is done via a `Trainer` and algorithm (e.g., APO). ([microsoft.github.io][7]) + +Here’s the conceptual diff: + +```diff +# -------------------------- +# Agent Lightning (concept) +# -------------------------- ++ import agentlightning as agl ++ from agentlightning import emit_reward ++ from agentlightning import rollout ++ ++ @rollout ++ def agent(task: dict, prompt_template: str): ++ # ... call LLM / tools ... ++ # compute intermediate/final reward ++ emit_reward(0.82) ++ return result ++ ++ trainer = agl.Trainer(algorithm=agl.APO(), initial_resources={"prompt_template": prompt_template}) ++ trainer.fit(agent=agent, train_dataset=tasks) + + +# -------------------------- +# Trace New API (Strategy 2) +# -------------------------- ++ from opto.trace.io.langgraph_otel_runtime import init_otel_runtime, TracingLLM ++ from opto.trace.io.otel_semconv import emit_agentlightning_reward # reward span format ++ ++ TRACER, EXPORTER = init_otel_runtime("my-graph") ++ TRACING_LLM = TracingLLM(llm=LLM_CLIENT, tracer=TRACER, trainable_keys={"planner","executor"}) ++ ++ def planner_node(state): ++ # no manual OTEL + gen_ai work; wrapper does it ++ plan = TRACING_LLM.node_call( ++ span_name="planner", ++ template_name="planner_prompt", ++ template=state.planner_template, ++ optimizable_key="planner", ++ messages=[...], ++ ) ++ return {...} ++ ++ def evaluator_node(state): ++ with TRACER.start_as_current_span("evaluator") as sp: ++ # produce Trace eval attrs (as before) ++ sp.set_attribute("eval.score", score) ++ ... 
++        # AND ALSO produce Agent Lightning compatible reward span:
++        emit_agentlightning_reward(value=float(score), name="final_score")
+```
+
+Key point: **Strategy 2 does not try to reproduce RL training**. It only emits spans **compatible** with Lightning’s expectations while keeping your **TGJ/OPTO patch optimization** intact.
+
+---
+
+### 2.B Deep Research agent: Legacy demo vs design3_4 vs New API (Strategy 2)
+
+In the legacy demo you manually set the prompt parameters + prompt input + `gen_ai.model` inside each node span.
+In design3_4, those responsibilities move into the shared runtime `TracingLLM`.
+
+This is the “core simplification” you already did:
+
+```diff
+# Legacy demo (manual OTEL inside each node)
+ with TRACER.start_as_current_span("synthesizer") as sp:
+     sp.set_attribute("param.synthesizer_prompt", template)
+     sp.set_attribute("param.synthesizer_prompt.trainable", "synthesizer" in OPTIMIZABLE)
+-    sp.set_attribute("gen_ai.model", "llm")
+     sp.set_attribute("inputs.gen_ai.prompt", prompt)
+     _emit_code_param(sp, "synthesizer", synthesizer_node)
+     answer = LLM_CLIENT(messages=[...]).choices[0].message.content
+
+# design3_4 + New API (wrapper)
++ answer = TRACING_LLM.node_call(
++     span_name="synthesizer",
++     template_name="synthesizer_prompt",
++     template=template,
++     optimizable_key="synthesizer",
++     code_key="synthesizer",
++     code_fn=synthesizer_node,
++     user_query=state.user_query,
++     messages=[{"role":"system","content":"..."}, {"role":"user","content":prompt}],
++ )
+```
+
+What Strategy 2 adds **on top** of design3_4:
+
+* the wrapper emits a **child LLM span** named `openai.chat.completion` with `gen_ai.*` attributes (Lightning-friendly) ([OpenTelemetry][8])
+* evaluator emits a **child reward span** `agentlightning.annotation` with `agentlightning.reward.*` attributes ([microsoft.github.io][1])
+* we prevent these child spans from breaking TGJ “temporal hierarchy” conversion by marking them `trace.temporal_ignore=true` and
teaching `otel_adapter` not to advance `prev_span_id` on them. + +--- + +## 3) Unified git diff to apply (against current codebase from `Trace_main_code.txt`) + +This patch adds **one helper module**, updates the runtime `TracingLLM`, updates `otel_adapter` for temporal-ignore safety, and updates the SPANOUTNODE evaluator to emit Agent Lightning rewards. + +> ✅ This is minimal and should not break legacy demos. +> ✅ It keeps TGJ conversion stable even with child spans. + +```diff +diff --git a/opto/trace/io/__init__.py b/opto/trace/io/__init__.py +index e69de29..7b9c3a1 100644 +--- a/opto/trace/io/__init__.py ++++ b/opto/trace/io/__init__.py +@@ -0,0 +1,9 @@ ++from .otel_semconv import ( ++ set_span_attributes, ++ record_genai_chat, ++ emit_agentlightning_reward, ++) ++ ++__all__ = [ ++ "set_span_attributes", "record_genai_chat", "emit_agentlightning_reward", ++] + +diff --git a/opto/trace/io/otel_semconv.py b/opto/trace/io/otel_semconv.py +new file mode 100644 +index 0000000..b1a2c3d +--- /dev/null ++++ b/opto/trace/io/otel_semconv.py +@@ -0,0 +1,176 @@ ++from __future__ import annotations ++ ++import json ++from typing import Any, Dict, List, Optional ++ ++from opentelemetry import trace as oteltrace ++ ++ ++def _json(v: Any) -> str: ++ return json.dumps(v, ensure_ascii=False) ++ ++ ++def set_span_attributes(span, attrs: Dict[str, Any]) -> None: ++ """ ++ Convenience helper: set many span attributes at once. ++ - dict/list -> JSON string ++ - None values -> skipped ++ """ ++ for k, v in (attrs or {}).items(): ++ if v is None: ++ continue ++ if isinstance(v, (dict, list)): ++ span.set_attribute(k, _json(v)) ++ else: ++ span.set_attribute(k, v) ++ ++ ++def record_genai_chat( ++ span, ++ *, ++ provider: str, ++ model: str, ++ input_messages: List[Dict[str, Any]], ++ output_text: Optional[str] = None, ++ request_type_compat: str = "chat.completion", ++) -> None: ++ """ ++ Record OTEL GenAI semantic convention attributes in a span. 
++ ++ We store messages as JSON strings (span attrs must be primitive/sequence types). ++ """ ++ out_messages = None ++ if output_text is not None: ++ out_messages = [{"role": "assistant", "content": output_text}] ++ ++ set_span_attributes( ++ span, ++ { ++ # Spec-ish keys that many adapters expect ++ "gen_ai.operation.name": "chat", ++ "gen_ai.provider.name": provider, ++ "gen_ai.request.model": model, ++ # Back-compat / convenience for other tools (and Trace's existing heuristics) ++ "gen_ai.operation": "chat", ++ "gen_ai.model": model, ++ "gen_ai.request.type": request_type_compat, ++ # We keep these as JSON strings ++ "gen_ai.input.messages": input_messages, ++ "gen_ai.output.messages": out_messages, ++ }, ++ ) ++ ++ ++def emit_agentlightning_reward( ++ *, ++ value: float, ++ name: str = "final_score", ++ tracer_name: str = "opto.trace", ++ index: int = 0, ++ span_name: str = "agentlightning.annotation", ++ temporal_ignore: bool = True, ++ extra_attributes: Optional[Dict[str, Any]] = None, ++) -> None: ++ """ ++ Emit a reward span compatible with Agent Lightning semconv. ++ ++ Docs: emit_reward is a wrapper of emit_annotation; reward attrs use ++ agentlightning.reward..name / agentlightning.reward..value. 
++    """
++    tracer = oteltrace.get_tracer(tracer_name)
++    with tracer.start_as_current_span(span_name) as sp:
++        attrs: Dict[str, Any] = {
++            f"agentlightning.reward.{index}.name": name,
++            f"agentlightning.reward.{index}.value": float(value),
++        }
++        if temporal_ignore:
++            attrs["trace.temporal_ignore"] = True
++        if extra_attributes:
++            attrs.update(extra_attributes)
++        set_span_attributes(sp, attrs)
+
+diff --git a/opto/trace/io/langgraph_otel_runtime.py b/opto/trace/io/langgraph_otel_runtime.py
+index 4f3aa11..c0f77df 100644
+--- a/opto/trace/io/langgraph_otel_runtime.py
++++ b/opto/trace/io/langgraph_otel_runtime.py
+@@ -1,9 +1,11 @@
+ from __future__ import annotations
+
++import json
+ import time
+ from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
+
+ from opentelemetry import trace as oteltrace
+ from opentelemetry.sdk.trace import TracerProvider, ReadableSpan
+ from opentelemetry.sdk.trace.export import (
+     SimpleSpanProcessor,
+     SpanExporter,
+     SpanExportResult,
+ )
++
++from .otel_semconv import record_genai_chat, set_span_attributes
+
+
+ class InMemorySpanExporter(SpanExporter):
+@@ -56,6 +58,22 @@ def init_otel_runtime(
+     tracer = provider.get_tracer(service_name)
+     return tracer, exporter
+
+
++def _to_otlp_anyvalue(v: Any) -> Dict[str, Any]:
++    """
++    Encode a Python attr into an OTLP JSON AnyValue.
++    Keep it simple/robust: primitives keep type; everything else stringified.
++ """ ++ if isinstance(v, bool): ++ return {"boolValue": v} ++ if isinstance(v, int) and not isinstance(v, bool): ++ # OTLP JSON commonly uses strings for intValue ++ return {"intValue": str(v)} ++ if isinstance(v, float): ++ return {"doubleValue": float(v)} ++ if isinstance(v, str): ++ return {"stringValue": v} ++ return {"stringValue": str(v)} ++ ++ + def flush_otlp( + exporter: InMemorySpanExporter, + scope_name: str = "demo", +@@ -78,10 +96,10 @@ def flush_otlp( + otlp_spans: List[Dict[str, Any]] = [] + for s in spans: + attributes = getattr(s, "attributes", {}) or {} + attrs = [ +- {"key": k, "value": {"stringValue": str(v)}} ++ {"key": k, "value": _to_otlp_anyvalue(v)} + for k, v in attributes.items() + ] + kind = getattr(s, "kind", 1) + if hasattr(kind, "value"): +@@ -121,6 +139,26 @@ def flush_otlp( + } + + + class TracingLLM: +@@ -137,6 +175,10 @@ class TracingLLM: + def __init__( + self, + llm: Any, + tracer: oteltrace.Tracer, + *, + trainable_keys: Optional[Iterable[str]] = None, + emit_code_param: Optional[Any] = None, ++ provider_name: str = "openai", ++ llm_span_name: str = "openai.chat.completion", ++ emit_llm_child_span: bool = True, + ) -> None: + self.llm = llm + self.tracer = tracer + self.trainable_keys = set(trainable_keys or []) + self.emit_code_param = emit_code_param ++ self.provider_name = provider_name ++ self.llm_span_name = llm_span_name ++ self.emit_llm_child_span = emit_llm_child_span + + # ---- helpers --------------------------------------------------------- +@@ -166,8 +208,8 @@ class TracingLLM: + if code_key and code_fn is not None and self.emit_code_param: + self.emit_code_param(sp, code_key, code_fn) + +- sp.set_attribute("gen_ai.model", "llm") ++ # Keep Trace-style prompt capture on the node span (TGJ-friendly). 
+ sp.set_attribute("inputs.gen_ai.prompt", prompt) + if user_query is not None: + sp.set_attribute("inputs.user_query", user_query) +@@ -186,6 +228,17 @@ class TracingLLM: + """ + Invoke the wrapped LLM under an OTEL span. + """ + with self.tracer.start_as_current_span(span_name) as sp: + prompt = "" + if messages: + user_msgs = [m for m in messages if m.get("role") == "user"] + if user_msgs: + prompt = user_msgs[-1].get("content", "") or "" + else: + prompt = messages[-1].get("content", "") or "" + + self._record_llm_call( + sp, + template_name=template_name, + template=template, + optimizable_key=optimizable_key, + code_key=code_key, + code_fn=code_fn, + user_query=user_query, + prompt=prompt, + extra_inputs=extra_inputs or {}, + ) +- +- resp = self.llm(messages=messages, **llm_kwargs) +- # Compatible with OpenAI-style chat responses. +- return resp.choices[0].message.content ++ # Infer model name best-effort. ++ model = ( ++ str(llm_kwargs.get("model")) ++ if llm_kwargs.get("model") is not None ++ else str(getattr(self.llm, "model", "") or "unknown") ++ ) ++ ++ # Emit a child span that looks like common GenAI client spans. ++ # Important: mark it temporal-ignore so TGJ temporal parenting stays stable. ++ if self.emit_llm_child_span: ++ with self.tracer.start_as_current_span(self.llm_span_name) as llm_sp: ++ set_span_attributes(llm_sp, {"trace.temporal_ignore": True}) ++ # record request-side gen_ai.* first ++ record_genai_chat( ++ llm_sp, ++ provider=self.provider_name, ++ model=model, ++ input_messages=messages or [], ++ output_text=None, ++ ) ++ resp = self.llm(messages=messages, **llm_kwargs) ++ text = resp.choices[0].message.content ++ # now attach response-side gen_ai.* ++ record_genai_chat( ++ llm_sp, ++ provider=self.provider_name, ++ model=model, ++ input_messages=messages or [], ++ output_text=text, ++ ) ++ return text ++ ++ # Fallback: no child span; just call LLM. 
++ resp = self.llm(messages=messages, **llm_kwargs) ++ return resp.choices[0].message.content + + + DEFAULT_EVAL_METRIC_KEYS: Mapping[str, str] = { +@@ -198,15 +251,31 @@ DEFAULT_EVAL_METRIC_KEYS: Mapping[str, str] = { + } + + +-def _attrs_to_dict(attrs: List[Dict[str, Any]]) -> Dict[str, str]: ++def _anyvalue_to_py(v: Any) -> Any: ++ if not isinstance(v, dict) or not v: ++ return v ++ if "stringValue" in v: ++ return v["stringValue"] ++ if "doubleValue" in v: ++ return v["doubleValue"] ++ if "intValue" in v: ++ try: ++ return int(v["intValue"]) ++ except Exception: ++ return v["intValue"] ++ if "boolValue" in v: ++ return bool(v["boolValue"]) ++ # arrays/kvlist unsupported here; stringify ++ return str(v) ++ ++ ++def _attrs_to_dict(attrs: List[Dict[str, Any]]) -> Dict[str, Any]: + out: Dict[str, str] = {} + for a in attrs or []: + key = a.get("key") +- val = a.get("value", {}) ++ val = a.get("value", {}) + if key is None: + continue +- if isinstance(val, dict) and "stringValue" in val: +- out[key] = val["stringValue"] +- else: +- out[key] = str(val) ++ out[key] = _anyvalue_to_py(val) + return out + + + def extract_eval_metrics_from_otlp( +@@ -241,7 +310,7 @@ def extract_eval_metrics_from_otlp( + if sp.get("name") != evaluator_span_name: + continue + attrs = _attrs_to_dict(sp.get("attributes", [])) + raw_score = attrs.get(score_key) + if raw_score is not None: + try: + score = float(raw_score) + except ValueError: + score = default_score + reasons = attrs.get("eval.reasons", "") or "" +@@ -252,7 +321,7 @@ def extract_eval_metrics_from_otlp( + raw = attrs.get(attr_key) + if raw is None: + continue + try: + metrics[friendly] = float(raw) + except ValueError: + metrics[friendly] = default_metric +diff --git a/opto/trace/io/otel_adapter.py b/opto/trace/io/otel_adapter.py +index 1c0d111..2b7e222 100644 +--- a/opto/trace/io/otel_adapter.py ++++ b/opto/trace/io/otel_adapter.py +@@ -1,6 +1,7 @@ + from __future__ import annotations + from typing import Dict, Any, List + + + 
PROFILE_VERSION = "trace-json/1.0+otel" +@@ -10,6 +11,14 @@ def _sanitize(name: str) -> str: + return (name or "node").replace(":", "_") + ++def _truthy(v: Any) -> bool: ++ if isinstance(v, bool): ++ return v ++ if isinstance(v, (int, float)): ++ return v != 0 ++ if isinstance(v, str): ++ return v.strip().lower() in ("1", "true", "yes", "y", "on") ++ return bool(v) + + def _op(attrs, span): + if "gen_ai.operation" in attrs or "gen_ai.model" in attrs: + return "llm_call" +@@ -109,8 +118,12 @@ def otlp_traces_to_trace_json(otlp: Dict[str, Any], agent_id_hint: str = "", use_temporal_hierarchy: bool = False) -> List[Dict[str, Any]]: + node_id = f"{svc}:{sid}" + nodes[node_id] = rec + +- # Update prev_span_id for next iteration (temporal parenting) +- prev_span_id = sid ++ # Update prev_span_id for next iteration (temporal parenting). ++ # If a span is marked "temporal_ignore", don't let it become the sequential parent. ++ if not _truthy(attrs.get("trace.temporal_ignore")): ++ prev_span_id = sid + + docs.append( + { +diff --git a/JSON_OTEL_trace_optim_demo_LANGGRAPH_SPANOUTNODE.py b/JSON_OTEL_trace_optim_demo_LANGGRAPH_SPANOUTNODE.py +index 9abc111..9abc222 100644 +--- a/JSON_OTEL_trace_optim_demo_LANGGRAPH_SPANOUTNODE.py ++++ b/JSON_OTEL_trace_optim_demo_LANGGRAPH_SPANOUTNODE.py +@@ -1,6 +1,7 @@ + ... ++from opto.trace.io.otel_semconv import emit_agentlightning_reward + ... + def evaluator_node(state: State) -> Command[Literal[END]]: + """ + Evaluator node with multi-metric assessment. + """ +@@ -40,6 +41,12 @@ def evaluator_node(state: State) -> Command[Literal[END]]: + for k, v in metrics.items(): + sp.set_attribute(f"eval.{k}", str(v)) + sp.set_attribute("eval.score", str(score)) + sp.set_attribute("eval.reasons", reasons) + _emit_code_param(sp, "evaluator", evaluator_node) ++ ++ # Also emit an Agent Lightning compatible reward span as a child. ++ # (This is just OTEL; safe even if Agent Lightning isn't installed.) 
++    emit_agentlightning_reward(value=float(score), name="final_score") + + feedback = f"[Metrics] {list(metrics.values())} ; Reasons: {reasons}" + +diff --git a/tests/test_dual_semconv.py b/tests/test_dual_semconv.py +new file mode 100644 +index 0000000..ddee111 +--- /dev/null ++++ b/tests/test_dual_semconv.py +@@ -0,0 +1,148 @@ ++from __future__ import annotations ++ ++from typing import Any ++ ++from opto.trace.io.langgraph_otel_runtime import init_otel_runtime, TracingLLM, flush_otlp ++from opto.trace.io.otel_adapter import otlp_traces_to_trace_json ++ ++ ++class _DummyResp: ++ def __init__(self, txt: str): ++ self.choices = [type("C", (), {"message": type("M", (), {"content": txt})()})()] ++ ++ ++class DummyLLM: ++ def __call__(self, messages=None, **kwargs): ++ return _DummyResp("ok") ++ ++ ++def _find_span(otlp: dict, name: str) -> dict | None: ++ for rs in otlp.get("resourceSpans", []): ++ for ss in rs.get("scopeSpans", []): ++ for sp in ss.get("spans", []): ++ if sp.get("name") == name: ++ return sp ++ return None ++ ++ ++def _span_attrs(sp: dict) -> dict: ++ out = {} ++ for a in sp.get("attributes", []) or []: ++ k = a.get("key") ++ v = a.get("value", {}) or {} ++ # pick first value variant ++ if isinstance(v, dict) and v: ++ out[k] = next(iter(v.values())) ++ else: ++ out[k] = v ++ return out ++ ++ ++def test_tracingllm_emits_child_genai_span_and_temporal_ignore(): ++ tracer, exporter = init_otel_runtime("test-dual-semconv") ++ llm = DummyLLM() ++ tl = TracingLLM( ++ llm=llm, ++ tracer=tracer, ++ trainable_keys={"planner"}, ++ provider_name="openai", ++ llm_span_name="openai.chat.completion", ++ emit_llm_child_span=True, ++ ) ++ ++ out = tl.node_call( ++ span_name="planner", ++ template_name="planner_prompt", ++ template="Hello {x}", ++ optimizable_key="planner", ++ messages=[{"role": "user", "content": "hi"}], ++ ) ++ assert out == "ok" ++ ++ otlp = flush_otlp(exporter, scope_name="test") ++ ++ node_sp =
_find_span(otlp, "planner") ++ llm_sp = _find_span(otlp, "openai.chat.completion") ++ assert node_sp is not None ++ assert llm_sp is not None ++ ++ llm_attrs = _span_attrs(llm_sp) ++ assert llm_attrs.get("trace.temporal_ignore") in (True, "true", "True", 1, "1") ++ assert llm_attrs.get("gen_ai.operation") == "chat" ++ assert llm_attrs.get("gen_ai.provider.name") == "openai" ++ ++ ++def test_otel_adapter_temporal_hierarchy_ignores_child_spans(): ++ # Build a minimal OTLP payload with: ++ # - A (t=1) ++ # - child C (t=2, parentSpanId=A, trace.temporal_ignore=true) ++ # - B (t=3, no parentSpanId) -> should attach to A (not C) under temporal hierarchy ++ otlp = { ++ "resourceSpans": [ ++ { ++ "resource": {"attributes": []}, ++ "scopeSpans": [ ++ { ++ "scope": {"name": "x"}, ++ "spans": [ ++ { ++ "traceId": "t", ++ "spanId": "A", ++ "parentSpanId": "", ++ "name": "A", ++ "kind": "INTERNAL", ++ "startTimeUnixNano": 1, ++ "endTimeUnixNano": 1, ++ "attributes": [], ++ }, ++ { ++ "traceId": "t", ++ "spanId": "C", ++ "parentSpanId": "A", ++ "name": "openai.chat.completion", ++ "kind": "CLIENT", ++ "startTimeUnixNano": 2, ++ "endTimeUnixNano": 2, ++ "attributes": [ ++ {"key": "trace.temporal_ignore", "value": {"boolValue": True}}, ++ {"key": "gen_ai.operation", "value": {"stringValue": "chat"}}, ++ ], ++ }, ++ { ++ "traceId": "t", ++ "spanId": "B", ++ "parentSpanId": "", ++ "name": "B", ++ "kind": "INTERNAL", ++ "startTimeUnixNano": 3, ++ "endTimeUnixNano": 3, ++ "attributes": [], ++ }, ++ ], ++ } ++ ], ++ } ++ ] ++ } ++ ++ docs = otlp_traces_to_trace_json(otlp, agent_id_hint="svc", use_temporal_hierarchy=True) ++ assert len(docs) == 1 ++ nodes = docs[0]["nodes"] ++ ++ # Node IDs are svc: ++ b = nodes["svc:B"] ++ assert b["inputs"]["parent"] == "svc:A" +``` + +--- + +## 4) What changed / how to explain it to a colleague (short + clear) + +**Problem:** We want a “minimal-intrusion” LangGraph optimization demo where Trace can optimize prompts/code/graph using TGJ, but the 
emitted OTEL spans are also compatible with Agent Lightning tooling (so future RL/APO pipelines can reuse the same traces). + +**What we did (Strategy 2):** + +1. **Keep Trace as the primary instrumentation and optimization system** (TGJ conversion and OPTO optimizers unchanged). + +2. **Emit OTEL spans using two “schemas” at once**: + + * Trace-specific attrs remain: `param.*`, `inputs.gen_ai.prompt`, `eval.*` + * AgentLightning-compatible spans are added: + + * each LLM call produces a child span named `openai.chat.completion` with `gen_ai.*` fields ([OpenTelemetry][8]) + * evaluator produces a child span named `agentlightning.annotation` with `agentlightning.reward.0.value` ([microsoft.github.io][6]) + +3. **Prevent a subtle TGJ bug**: TGJ conversion with `use_temporal_hierarchy=True` links spans in time order. If we introduce child spans, they could accidentally become the “previous span” and break the main chain. + → Fix: child spans are marked `trace.temporal_ignore=true`, and `otel_adapter` ignores them when advancing `prev_span_id`. + +**Result:** + +* The demo stays clean (no repeated OTEL boilerplate). +* Trace can still optimize prompts/code. +* The same run also yields “Agent Lightning-shaped” traces for future reuse. + +--- + +A possible next step is a short “How to run in Agent Lightning later” note: export your OTLP to their store or proxy, and confirm which subset of `gen_ai.*` attributes their adapters require (with this patch you are already aligned with the standard span names and reward encoding).
+ +[1]: https://microsoft.github.io/agent-lightning/stable/reference/semconv/ "https://microsoft.github.io/agent-lightning/stable/reference/semconv/" +[2]: https://microsoft.github.io/agent-lightning/latest/tutorials/traces/ "https://microsoft.github.io/agent-lightning/latest/tutorials/traces/" +[3]: https://microsoft.github.io/agent-lightning/latest/reference/agent/ "https://microsoft.github.io/agent-lightning/latest/reference/agent/" +[4]: https://github.com/microsoft/agent-lightning "https://github.com/microsoft/agent-lightning" +[5]: https://microsoft.github.io/agent-lightning/stable/reference/algorithm/ "https://microsoft.github.io/agent-lightning/stable/reference/algorithm/" +[6]: https://microsoft.github.io/agent-lightning/stable/tutorials/emitter/ "https://microsoft.github.io/agent-lightning/stable/tutorials/emitter/" +[7]: https://microsoft.github.io/agent-lightning/latest/tutorials/write-agents/ "https://microsoft.github.io/agent-lightning/latest/tutorials/write-agents/" +[8]: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/ "https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/" diff --git a/docs/T1_technical_plan.md b/docs/T1_technical_plan.md new file mode 100644 index 00000000..a9a18e5f --- /dev/null +++ b/docs/T1_technical_plan.md @@ -0,0 +1,1231 @@ +# T1 Technical Plan: LangGraph OTEL Instrumentation API + +**Version:** 1.1 +**Date:** February 6, 2026 +**Author:** Jahanzeb Javed, Xavier Daull +**Status:** Review v1 + +This technical plan is **reusable for any LangGraph**, not tied to a specific demo graph (e.g. planner/researcher/synthesizer/evaluator). 
This doc explicitly addresses: (a) configurable evaluation via `eval_fn` that may return a numeric score *or* string feedback, (b) generic node selection (no hard-coded node names), (c) explicit `bindings={...}` + `apply_updates(...)` for robust mapping from `param.*` keys to real prompts/functions/graph knobs, and (d) `emit_reward()` + `emit_trace()` helpers; see the [README](../README.md) for the longer before/after diff + API matrix + telemetry tables. # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + +This plan also distinguishes the **optimization TGJ** (minimal, used by Trace backprop) from optional **observability TGJ/log artifacts** (full OTEL detail); merging via `merge_tgj([base_graph_doc, log_doc])` is **opt-in** and must not be required for a minimal optimization API. # 🔴 (keep optimization graph minimal while still allowing rich trace artifacts when needed) +--- + +## Table of Contents + +1. [Executive Summary](#1-executive-summary) +2. [Generalization: Supported Graphs and Instrumentation](#2-generalization-supported-graphs-and-instrumentation) +3. [Problem Analysis](#3-problem-analysis) +4. [Architecture Overview](#4-architecture-overview) +5. [Target API Specification](#5-target-api-specification) +6. [Module Modifications](#6-module-modifications) +7. [Implementation Plan](#7-implementation-plan) +8. [Agent Lightning Comparison](#8-agent-lightning-comparison) +9. [Notebooks (Deliverables from M1 onward)](#9-notebooks-deliverables-from-m1-onward) +10. [Acceptance Criteria (SMART, verifiable)](#10-acceptance-criteria-smart-verifiable) +11. [Test & Validation Plan](#11-test--validation-plan) +12. [Appendix: Prototype Snippet](#12-appendix-prototype-snippet) + +--- + +## 1. Executive Summary + +### Goal + +Create a **minimal, reusable library/API** that allows developers to: + +1. **Add OTEL instrumentation** to any LangGraph in a few lines (no copy-paste boilerplate) +2. 
**Run optimization loops** (flush OTLP → convert to TGJ → optimizer step → apply updates) +3. **Standardize telemetry** across trainers/optimizers/nodes, exportable to: + - OTEL (for optimization + debugging) + - MLflow (for monitoring: metrics + artifacts) + +### Key Deliverables + +| Deliverable | Description | +|-------------|-------------| +| `instrument_graph()` | Auto-instrument a LangGraph with OTEL tracing | +| `TracingLLM` (enhanced) | Wrapper with dual semantic conventions (Trace + Agent Lightning) | +| `TelemetrySession` | Unified session manager for OTEL + MLflow | +| `optimize_graph()` | One-liner optimization loop (# 🔴 just renamed `optimize_langgraph()` into `optimize_graph()` to align naming and future support of other graphs) | +| `emit()` helpers | Manual telemetry emission (`emit_reward()`, `emit_trace()`, custom spans/events) # 🔴 (provide a simple manual additional trace emission helper) | + +--- + +## 2. Generalization: Supported Graphs and Instrumentation + +The plan applies to **any LangGraph**, not only a fixed topology. + +**Supported graph kinds:** + +| Kind | Support | Notes | +|------|---------|--------| +| Sync graphs | Yes | `invoke()` on compiled StateGraph. | +| Async graphs | Planned | `ainvoke()` / `astream()`; same wrapper model. | +| Streaming | Planned | `stream()` / `astream()`; spans per node completion. | +| Tools | Yes | Tool calls inside nodes traced via LLM/tool wrapper. | +| Loops | Yes | Cyclic and conditional edges; one span per node execution. | + +**Instrumentation: node wrappers (not callbacks).** + +- We use **node-level wrappers** that create a session span and inject `TracingLLM` (or tool tracer) into the node execution context. We do **not** rely on LangChain/LangGraph **callbacks** for core tracing. +- **Why:** (1) Full control over span boundaries and parent-child (e.g. node → LLM child). (2) Guaranteed `param.*` and `gen_ai.*` for TGJ and Agent Lightning without depending on callback event stability. 
(3) Same behavior for any custom graph. +- If we add optional callback-based observability later, we will document exactly which events we depend on (e.g. [LangChain observability](https://docs.langchain.com/oss/python/langgraph/observability), [reference.langchain.com](https://reference.langchain.com/python/langgraph/graphs/)). + +- **Instrumentation modes (to prove non-intrusive + generic):** # 🔴 (support non-intrusive optimization without modifying original code file) +- **Inline/minimal-change mode:** user passes `TracingLLM`/templates into the graph builder; `instrument_graph(..., in_place=True)` wraps nodes directly. # 🔴 (support non-intrusive optimization without modifying original code) +- **Non-intrusive mode (required demo):** `instrument_graph(..., in_place=False, bindings=...)` wraps/patches callables at runtime and restores them after the run, so the original **source files are unchanged**; updates still occur **in memory** via bindings/setters (trade-off: you cannot add new manual `emit_*` calls inside node bodies; you can still patch the LLM, prompts, and node callables). # 🔴 (clarify that “non-intrusive” means no source-file/permanent mutation, not “no in-memory updates”) +- **Capability checklist (must be demonstrated in examples):** # 🔴 (make acceptance criteria explicit for what the API must support) +- Optimize prompts/variables (via `param.` + bindings). # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) +- Optimize functions/code (Trace `bundle(traceable_code=True, trainable=True)` on callables => `param.__code_`). # 🔴 (declare individual code/function optimization support) +- Optimize graph routing *indirectly* by exposing routing knobs as `param.*` (e.g. `param.router_policy`, `param.route_threshold`) in node/router spans emitted by `instrument_graph()`, and applying updates via `optimize_graph(..., bindings=...)` (topology/edge mutation). 
# 🔴 (routing is a trainable knob contract, not a graph rewrite) +- Trace LangGraph node execution via `instrument_graph()` (exactly one OTEL parent span per node invocation; LLM/tool spans are children). # 🔴 (span boundary contract is implemented by node wrappers, not by `trace.node(variable, trainable=True)` ? validate better option) +- Trace LLM calls via `TracingLLM`: the OTEL span that participates in optimization MUST carry `param.*` (+ `.trainable`), and also emits `gen_ai.*` keys for Agent-Lightning compatibility; child spans are deferred beyond M1. # 🔴 (optimizer links params via param.*; gen_ai.* is compatibility/observability) + +--- + +## 3. Problem Analysis + +### 3.1 Current Boilerplate in Demo Code + +The current `JSON_OTEL_trace_optim_demo_LANGGRAPH_SPANOUTNODE.py` (~1350 lines) contains extensive boilerplate that must be copied for each new LangGraph: + +| Category | Lines | Code Example | +|----------|-------|--------------| +| **OTEL Setup** | ~50 | `InMemorySpanExporter`, `TracerProvider`, `SimpleSpanProcessor` | +| **TracingLLM Class** | ~60 | Duplicate of `langgraph_otel_runtime.py` | +| **flush_otlp()** | ~25 | Span serialization to OTLP JSON | +| **Logging Helpers** | ~180 | `_init_log_dir`, `_save_run_logs`, `_rebuild_aggregate_markdown` | +| **Parameter Mapping** | ~100 | `_remap_params_in_graph`, `_ensure_code_desc_on_optimizer` | +| **Optimization Loop** | ~150 | `optimize_iteration`, TGJ conversion, backward/step | +| **Code Patching** | ~80 | `_apply_code_update`, `_emit_code_param` | # (for information: it assumes that we provided before the necessary bindings/mapping info between the otel trace namings and the real code/variables to patch so that the optimizer made it possible) +| **Total Boilerplate** | **~645** | **~48% of demo is reusable infrastructure** | + +### 3.2 Fragmented Logging Infrastructure + +| Component | Current Logger | Issue | +|-----------|---------------|-------| +| Trainers | `BaseLogger` subclasses | 
Console/TensorBoard/WandB only | +| Optimizers | In-memory `log` list | Not exportable | +| Node execution | Custom `LOG_DIR` files | Not integrated with OTEL | +| MLflow | Not implemented | Manual artifact logging | + +### 3.3 Manual LLM Wrapping + +Every node requires explicit `TracingLLM.node_call()` with all parameters: + +```python +# Current: 8 parameters per call +answer = TRACING_LLM.node_call( + span_name="synthesizer", + template_name="synthesizer_prompt", + template=template, + optimizable_key="synthesizer", + code_key="synthesizer", + code_fn=synthesizer_node, + user_query=state.user_query, + messages=[...], +) +```` + +--- + +## 4. Architecture Overview + +### 4.1 High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ User Code (LangGraph) │ +├─────────────────────────────────────────────────────────────────────┤ +│ @traced_node("planner") │ +│ def planner_node(state): ... │ +│ │ +│ graph = build_graph() │ +│ instrumented = instrument_graph(graph, trainable=["planner"]) │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Trace OTEL Instrumentation Layer │ +├─────────────────────────────────────────────────────────────────────┤ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────┐ │ +│ │ TracingLLM │ │ TelemetryS.. 
│ │ otel_semconv helpers │ │ +│ │ (enhanced) │ │ (new) │ │ - emit_reward() │ │ +│ │ │ │ │ │ - emit_trace() │ │ # 🔴 (provide a simple manual trace emission helper) +│ │ - node_call │ │ - start() │ │ - record_genai_chat() │ │ +│ │ - child LLM │ │ - flush() │ │ - set_span_attributes() │ │ +│ │ spans │ │ - to_mlflow │ │ │ │ +│ └──────────────┘ └──────────────┘ └──────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ┌───────────────┼───────────────┐ + ▼ ▼ ▼ + ┌───────────┐ ┌───────────┐ ┌───────────────┐ + │ OTEL JSON │ │ TGJ Format│ │ MLflow │ + │ (debug) │ │ (optim) │ │ (monitoring) │ + └───────────┘ └───────────┘ └───────────────┘ + │ + ▼ + ┌─────────────────────────────────────────────────┐ + │ OPTO Optimizer │ + │ (OptoPrimeV2 / TextGrad / etc.) │ + └─────────────────────────────────────────────────┘ +``` + +### 4.2 Data Flow + +``` +LangGraph Execution + │ + ▼ +┌───────────────────┐ +│ OTEL Spans │ ← Dual semantic conventions: +│ - param.* │ • Trace-specific (TGJ-compatible) +│ - gen_ai.* │ • Agent Lightning-compatible +│ - eval.* │ +└───────────────────┘ + │ + ├──────────────────────────────────────┐ + ▼ ▼ +┌───────────────────┐ ┌───────────────────┐ +│ flush_otlp() │ │ MLflow Export │ +│ → OTLP JSON │ │ → metrics/artifacts│ +└───────────────────┘ └───────────────────┘ + │ + ▼ +┌───────────────────┐ +│ otlp_to_tgj() │ +│ → Trace-Graph JSON│ +└───────────────────┘ + │ + ▼ +┌───────────────────┐ +│ ingest_tgj() │ +│ → ParameterNode │ +│ → MessageNode │ +└───────────────────┘ + │ + ▼ +┌───────────────────┐ +│ optimizer.backward│ +│ optimizer.step │ +└───────────────────┘ + │ + ▼ +┌───────────────────┐ +│ Updated prompts/ │ +│ code parameters │ +└───────────────────┘ +``` + +--- + +## 5. Target API Specification + +### 5.1 `instrument_graph()` + +**Purpose:** Auto-instrument a LangGraph StateGraph with OTEL tracing. 
+ +```python +def instrument_graph( + graph: StateGraph | CompiledGraph, + *, + session: Optional["TelemetrySession"] = None, + service_name: str = "langgraph-agent", + trainable_keys: Optional[Set[str]] = None, + enable_code_optimization: bool = False, + llm: Optional[Any] = None, + emit_genai_child_spans: bool = True, + bindings: Optional[Dict[str, "Binding"]] = None, # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + in_place: bool = False, # 🔴 (support non-intrusive optimization without modifying original code) +) -> InstrumentedGraph: + """ + Wrap a LangGraph with automatic OTEL instrumentation. + + Parameters + ---------- + graph : StateGraph | CompiledGraph + The LangGraph to instrument. + session : TelemetrySession, optional + If provided, reuse this TelemetrySession for OTEL capture and (optionally) MLflow logging; otherwise a new session is created using service_name. # 🔴 (required for clean notebook MLflow + OTEL usage) + service_name : str + OTEL service name for trace identification. + trainable_keys : Set[str], optional + Node names whose prompts are trainable. + If None, all nodes are trainable; otherwise provide explicit node names (glob/regex support is optional future work). # 🔴 (default: None => all nodes trainable; defer glob/regex matching beyond M1) + enable_code_optimization : bool + If True, emit `param.__code_*` attributes for function source optimization. + llm : Any, optional + LLM client to use for nodes. If provided, will be wrapped with TracingLLM. + emit_genai_child_spans : bool + If True, emit gen_ai.* child spans for Agent Lightning compatibility. 
+ bindings : Dict[str, Binding], optional # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + Explicit mapping from OTEL/TGJ parameter keys (e.g., "planner_prompt", "__code_planner") to getter/setter bindings used by apply_updates(); if None, bindings are auto-derived for common cases (templates dict + wrapped node fns). # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + in_place : bool # 🔴 (support non-intrusive optimization without modifying original code) + If False (default), avoid **permanent** mutation of the original graph objects: apply updates via bindings/setters and restore wrappers after the run; set True only if you accept in-place monkey-patching for lower overhead (both modes still update parameters **in memory** during optimization). # 🔴 (avoid confusion: “non-intrusive” ≠ “no in-memory updates”) + + Returns + ------- + InstrumentedGraph + Wrapper with `invoke()`, `stream()`, and access to telemetry session. + + Example + ------- + >>> graph = build_my_langgraph() + >>> instrumented = instrument_graph( + ... graph, + ... trainable_keys={"", ""}, # 🔴 (example: replace placeholders with real node names to avoid accidental training) + ... llm=my_llm_client, + ... bindings={"": binding}, # e.g., {"planner_prompt": binding} # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + ... ) + >>> result = instrumented.invoke(initial_state) + >>> otlp = instrumented.session.flush_otlp() + """ +``` + +**Output Type:** + +```python +@dataclass +class InstrumentedGraph: + """Instrumented LangGraph wrapper.""" + + graph: CompiledGraph + session: TelemetrySession + tracing_llm: TracingLLM + + def invoke(self, state: Any, **kwargs) -> Dict[str, Any]: + """Execute graph and capture telemetry.""" + ... 
+ + def stream(self, state: Any, **kwargs) -> Iterator[Dict[str, Any]]: + """Stream graph execution with telemetry.""" + ... +``` + +--- + +### 5.2 `TelemetrySession` + +**Purpose:** Unified session manager for OTEL traces and MLflow integration. + +```python +class TelemetrySession: + """ + Manages OTEL tracing session with export capabilities. + + Responsibilities: + - Initialize and manage TracerProvider + InMemorySpanExporter + - Provide flush_otlp() for trace extraction + - Export to MLflow (metrics, artifacts, parameters) # IMPORTANT: see https://github.com/AgentOpt/OpenTrace/blob/feature/mlflow/opto/features/mlflow/autolog.py # 🔴 (see previous work on this support) + - Support multiple export formats (OTLP JSON, TGJ) + """ + + def __init__( + self, + service_name: str = "trace-session", + *, + mlflow_experiment: Optional[str] = None, + mlflow_run_name: Optional[str] = None, + auto_log_to_mlflow: bool = False, + record_spans: bool = True, # 🔴 (allow disabling span recording for minimal/robust runs) + span_attribute_filter: Optional[Callable[[str, Dict[str, Any]], Dict[str, Any]]] = None, # 🔴 (allow redaction/truncation and dropping spans to avoid secrets/large payloads) + ) -> None: + """ + Initialize telemetry session. + + Parameters + ---------- + service_name : str + OTEL service/scope name. + mlflow_experiment : str, optional + MLflow experiment name. If provided, enables MLflow logging. + mlflow_run_name : str, optional + MLflow run name. Auto-generated if not provided. + auto_log_to_mlflow : bool + If True, automatically log to MLflow on flush. + record_spans : bool # 🔴 (allow disabling span recording for minimal/robust runs) + If False, disable span recording/export entirely (safe no-op); useful for minimal runs or when only MLflow metrics are desired. 
# 🔴 (define 'record_spans=False' as safe no-op (no exporter, no OTLP/TGJ output)) + span_attribute_filter : Callable[[str, Dict[str, Any]], Dict[str, Any]], optional # 🔴 (allow redaction/truncation and dropping spans to avoid secrets/large payloads) + Optional hook to filter/redact/truncate span attributes before they are attached/exported (and to disable recording of some spans by returning {}). # 🔴 (allow redaction/truncation and dropping spans to avoid secrets/large payloads) + """ + + @property + def tracer(self) -> oteltrace.Tracer: + """Get the OTEL tracer for manual span creation.""" + + @property + def exporter(self) -> InMemorySpanExporter: + """Get the span exporter for direct access.""" + + def flush_otlp(self, clear: bool = True) -> Dict[str, Any]: + """ + Flush collected spans to OTLP JSON format. + + Parameters + ---------- + clear : bool + If True, clear the exporter after flush. + + Returns + ------- + Dict[str, Any] + OTLP JSON payload compatible with otel_adapter. + """ + + def flush_tgj( + self, + agent_id_hint: str = "", + use_temporal_hierarchy: bool = True, + clear: bool = True, + ) -> List[Dict[str, Any]]: + """ + Flush collected spans to Trace-Graph JSON format. + + Returns + ------- + List[Dict[str, Any]] + List of TGJ documents ready for ingest_tgj(). + """ + + def log_to_mlflow( + self, + metrics: Dict[str, float], + params: Optional[Dict[str, Any]] = None, + artifacts: Optional[Dict[str, str]] = None, + step: Optional[int] = None, + ) -> None: + """ + Log metrics, parameters, and artifacts to MLflow. + + Parameters + ---------- + metrics : Dict[str, float] + Metrics to log (e.g., {"score": 0.85, "latency_ms": 120}). + params : Dict[str, Any], optional + Parameters to log (logged once per run). + artifacts : Dict[str, str], optional + Artifacts to log as {name: file_path}. + step : int, optional + Step number for metric logging. 
+ """ + + def export_run_bundle( + self, + output_dir: str, + *, + include_otlp: bool = True, + include_tgj: bool = True, + include_prompts: bool = True, + ) -> str: + """ + Export all session data to a directory bundle. + + Returns path to the bundle directory. + """ +``` + +--- + +### 5.3 Enhanced `TracingLLM` + +**Purpose:** LLM wrapper with dual semantic conventions for Trace and Agent Lightning compatibility. + +```python +class TracingLLM: + """ + Design-3+ wrapper around an LLM client. + + Enhancements over current implementation: ++ - (Optional) emits child `openai.chat.completion` spans with gen_ai.* attributes + - Supports Agent Lightning reward emission + """ + + def __init__( + self, + llm: Any, + tracer: oteltrace.Tracer, + *, + trainable_keys: Optional[Iterable[str]] = None, + emit_code_param: Optional[Callable] = None, + # New parameters for dual semantic conventions + provider_name: str = "openai", + llm_span_name: str = "openai.chat.completion", + emit_llm_child_span: bool = True, + ) -> None: + """ + Initialize TracingLLM. + + Parameters + ---------- + llm : Any + Underlying LLM client (OpenAI-compatible interface). + tracer : oteltrace.Tracer + OTEL tracer for span creation. + trainable_keys : Iterable[str], optional + Keys that are trainable. Empty string "" matches all. + emit_code_param : Callable, optional + Function to emit code parameters: (span, key, fn) -> None. + provider_name : str + Provider name for gen_ai.provider.name attribute. + llm_span_name : str + Name for child LLM spans (e.g., "openai.chat.completion"). + emit_llm_child_span : bool + If True, emit Agent Lightning-compatible child spans. 
+ """ + + def node_call( + self, + *, + span_name: str, + template_name: Optional[str] = None, + template: Optional[str] = None, + optimizable_key: Optional[str] = None, + code_key: Optional[str] = None, + code_fn: Any = None, + user_query: Optional[str] = None, + extra_inputs: Optional[Dict[str, str]] = None, + messages: Optional[List[Dict[str, Any]]] = None, + **llm_kwargs: Any, + ) -> str: + """ + Invoke LLM under an OTEL span with full tracing. + + Emits: + - Parent span with `param.*` and `inputs.*` (Trace-compatible) + - Child span with `gen_ai.*` (Agent Lightning-compatible) + + Returns + ------- + str + LLM response content. + """ +``` + +--- + +### 5.4 `optimize_graph()` + +**Purpose:** One-liner optimization loop. +**TGJ policy (minimal by default):** the optimizer must run on a **minimal TGJ** (`base_graph_doc`) produced from node spans + `param.*` + `eval.*`; rich OTEL details (LLM-call spans, tool spans, etc.) should be stored as OTLP/JSON artifacts and optionally as a separate `log_doc`. # 🔴 (prevent observability spans from polluting the optimization subgraph) +**Optional traces merge logs for inspection only:** if `include_log_doc=True`, create `log_doc` and optionally export `merge_tgj([base_graph_doc, log_doc])` as an artifact for UI/debugging, but do not require merge for optimization correctness. # 🔴 (support rich trace inspection without adding boilerplate to the optimization path) +**Evaluation contract:** `eval_fn` may return a numeric score, a Trace-style string feedback, or a structured dict; the runner normalizes it into a single `EvalResult` and records `eval.score` when numeric is available (required by some optimizers) while always preserving raw feedback as `eval.feedback`/`eval.reasons` artifacts (if only string feedback is available and the optimizer requires a numeric reward, fall back to a secondary `score_fn` or skip the update with a clear warning). 
# 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + +```python +@dataclass # 🔴 (public contract: EvalResult is the normalized output of eval_fn) +class EvalResult: # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + score: Optional[float] = None # 🔴 (optional numeric reward (some evals return only text feedback)) + feedback: str = "" # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + metrics: Dict[str, Any] = field(default_factory=dict) # 🔴 (free-form metrics dict for logging/diagnostics (not required by optimizers)) + +EvalFn = Callable[[Dict[str, Any]], Union[float, str, Dict[str, Any], EvalResult]] # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) +``` + +```python +def optimize_graph( + graph: InstrumentedGraph | CompiledGraph, + queries: List[str] | List[Dict[str, Any]], + *, + iterations: int = 5, + optimizer: Optional[OptoPrimeV2] = None, + optimizer_kwargs: Optional[Dict[str, Any]] = None, + eval_fn: Optional[EvalFn] = None, + initial_templates: Optional[Dict[str, str]] = None, + bindings: Optional[Dict[str, "Binding"]] = None, # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + apply_updates: bool = True, # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + include_log_doc: bool = False, # 🔴 (opt-in: export/merge rich trace info without impacting minimal optimization TGJ) + on_iteration: Optional[Callable[[int, List[RunResult], Dict[str, Any]], None]] = None, # 🔴 (optional progress hook for UI/logging integrations; keep signature stable) + log_to_mlflow: bool = False, + mlflow_session: Optional[TelemetrySession] = None, +) -> OptimizationResult: + """ + Run a complete optimization loop on a LangGraph. + + Parameters + ---------- + graph : InstrumentedGraph | CompiledGraph + The instrumented graph to optimize. 
+ queries : List[str] | List[Dict[str, Any]] + Test queries or full state dicts for each run. + iterations : int + Number of optimization iterations. + optimizer : OptoPrimeV2, optional + Pre-configured optimizer. Created if not provided. + optimizer_kwargs : Dict[str, Any], optional + Arguments for optimizer creation if not provided. + eval_fn : EvalFn, optional + Custom evaluation function. Can return float score, string feedback, or structured dict; normalized into EvalResult (Trace-style feedback + TextGrad-friendly). # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + initial_templates : Dict[str, str], optional + Initial prompt templates. Uses graph defaults if not provided. + bindings : Dict[str, Binding], optional # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + Mapping from OTEL/TGJ parameter keys to concrete setter/getter bindings (used by apply_updates to update prompts/functions/graph knobs deterministically). # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + apply_updates : bool # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + If True (default), apply updates each iteration via apply_updates(updates, bindings); if False, return updates only (caller applies manually). # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + include_log_doc : bool # 🔴 (opt-in: export/merge rich trace info without impacting minimal optimization TGJ) + If True, emit an additional `log_doc` (full spans) and optionally export `merge_tgj([base_graph_doc, log_doc])` as an artifact for inspection/UI; optimization itself still uses `base_graph_doc`. 
# 🔴 (keep optimizer path minimal while still enabling rich trace inspection) + on_iteration : Callable, optional + Callback after each iteration: (iter_num, runs, updates_dict) -> None (updates_dict keys match `param.` / bindings keys). # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + log_to_mlflow : bool + If True, log metrics to MLflow after each iteration. + mlflow_session : TelemetrySession, optional + If provided, overrides graph.session for MLflow logging only; otherwise optimize_graph logs via InstrumentedGraph.session when available. # 🔴 (clarifies single-session intent) + + Returns + ------- + OptimizationResult + Contains final parameters (templates/code/graph knobs via bindings), score history, best iteration, etc. # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + """ + +@dataclass +class OptimizationResult: + """Result of optimize_graph().""" + + baseline_score: float + best_score: float + best_iteration: int + best_updates: Dict[str, Any] # raw best update dict (param-keyed) # 🔴 (persist raw param-keyed updates for reproducibility/debugging) + final_parameters: Dict[str, Any] # resolved via bindings (prompts/code/graph knobs) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + score_history: List[float] + all_runs: List[List[RunResult]] + optimizer: OptoPrimeV2 +``` + +#### 5.4.1 Bindings + `apply_updates()` (robust update mapping) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) +Optimizer updates are keyed by OTEL/TGJ parameter names (e.g., `param.planner_prompt` → key `planner_prompt`, `param.__code_planner` → key `__code_planner`). To apply them deterministically (and to support non-intrusive optimization), we require explicit bindings from key → (get,set) and a single `apply_updates(...)` entrypoint. 
# 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) +- `bindings` is mandatory for *non-intrusive* optimization (imported graphs / module-level variables); for inline demos we can auto-derive it from the templates dict + wrapped node callables. # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) +(Implementation note: keys must match the exact `template_name` / `code_key` used in `param.*` so we never rely on fragile string parsing.) # 🔴 (deterministic mapping: param keys must exactly match bindings to avoid heuristics) + +```python +# opto/trace/io/bindings.py # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) +from dataclasses import dataclass # 🔴 (spec snippet imports (exact import list can be adjusted in implementation)) +from typing import Any, Callable, Dict, Literal # 🔴 (spec snippet typing imports (kept explicit for copy/paste clarity)) + +@dataclass # 🔴 (Binding is a small public primitive (needed by apply_updates and instrument_graph)) +class Binding: # 🔴 (Binding keys must match TGJ/OTEL param keys (prompt/code/graph knobs)) + """Minimal get/set binding for a trainable target.""" # 🔴 (binding contract: minimal get/set indirection for non-intrusive updates) + get: Callable[[], Any] # 🔴 (getter returns current value for logging + optimizer initialization) + set: Callable[[Any], None] # 🔴 (setter applies updated value in-memory (prompts/code/graph knobs)) + kind: Literal["prompt", "code", "graph"] = "prompt" # 🔴 (binding kind supports prompt/code/graph validation + reporting) + +def apply_updates( # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + updates: Dict[str, Any], # 🔴 (updates dict is keyed by param names (without 'param.' 
prefix)) + bindings: Dict[str, Binding], # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + *, # 🔴 (keyword-only args: avoid accidental positional mis-order in API) + strict: bool = True, # 🔴 (strict=True by default to fail fast on missing/unknown bindings) +) -> None: # 🔴 (apply_updates is side-effecting (mutates bound targets in memory)) + """Apply optimizer updates using the binding map (raise if strict and a key is missing).""" # 🔴 (single entrypoint for deterministic update application across prompts/code/graph) + ... # 🔴 (implementation: loop keys, set via bindings, raise on missing if strict) +``` + +--- + +### 5.5 OTEL Semantic Convention Helpers + +**Purpose:** Emit spans compatible with both Trace and Agent Lightning. + +```python +# opto/trace/io/otel_semconv.py + +def set_span_attributes(span, attrs: Dict[str, Any]) -> None: + """ + Set multiple span attributes at once. + + Handles: + - dict/list → JSON string + - None values → skipped + """ + +def record_genai_chat( + span, + *, + provider: str, + model: str, + input_messages: List[Dict[str, Any]], + output_text: Optional[str] = None, + request_type_compat: str = "chat.completion", +) -> None: + """ + Record OTEL GenAI semantic convention attributes. + + Emits: + - gen_ai.operation.name + - gen_ai.provider.name + - gen_ai.request.model + - gen_ai.input.messages (JSON) + - gen_ai.output.messages (JSON) + """ + +def emit_reward( # 🔴 (Agent Lightning-compatible reward span helper (naming + attrs contract)) + *, + value: float, + name: str = "final_score", + tracer_name: str = "opto.trace", + index: int = 0, + span_name: str = "agentlightning.annotation", + extra_attributes: Optional[Dict[str, Any]] = None, +) -> None: + """ + Emit a reward span compatible with Agent Lightning semconv. 
+ + Creates child span with: + - agentlightning.reward.<index>.name + - agentlightning.reward.<index>.value + """ +emit_agentlightning_reward = emit_reward # backwards-compat alias # 🔴 (align naming with standard emit_reward while keeping backward-compatible alias) + +def emit_trace( # 🔴 (provide a simple manual trace emission helper) + *, # 🔴 (keyword-only to keep callsites explicit and stable) + name: str, # 🔴 (required span/event name (used as OTEL span name)) + attrs: Optional[Dict[str, Any]] = None, # 🔴 (optional attributes payload (kept small; can be filtered/redacted)) + tracer_name: str = "opto.trace", # 🔴 (tracer namespace for manual spans (matches TelemetrySession default)) +) -> None: # 🔴 (emit_trace is intentionally side-effecting (records OTEL span/event)) + """Emit a lightweight OTEL span (or span event) for arbitrary debug/optimization signals.""" # 🔴 (manual lightweight span for custom signals (debug/optimization annotations)) + ... # 🔴 (implementation: start span, set attrs, end span (or add event); emit as child span under current node span when possible) +``` + +--- + +### 5.6 MLflow Integration + +**Purpose:** Standardized logging to MLflow for monitoring. + +```python +# opto/trace/io/mlflow_logger.py + +class MLflowTelemetryLogger(BaseLogger): + """ + Logger that exports telemetry to MLflow. 
+ + Integrates with TelemetrySession to provide: + - Metric logging (scores, latencies, token counts) + - Parameter logging (prompt templates, model configs) + - Artifact logging (OTLP JSON, TGJ, optimization logs) + """ + + def __init__( + self, + experiment_name: str, + run_name: Optional[str] = None, + log_dir: str = "./logs", + **kwargs, + ) -> None: + """Initialize MLflow logger.""" + + def log( + self, + name: str, + data: Any, + step: int, + **kwargs, + ) -> None: + """Log metric/param to MLflow.""" + + def log_otlp_artifact( + self, + otlp: Dict[str, Any], + artifact_name: str = "otlp_trace.json", + ) -> None: + """Log OTLP trace as artifact.""" + + def log_tgj_artifact( + self, + tgj_docs: List[Dict[str, Any]], + artifact_name: str = "trace_graph.json", + ) -> None: + """Log TGJ documents as artifact.""" + + def log_templates( + self, + templates: Dict[str, str], + step: Optional[int] = None, + ) -> None: + """Log current prompt templates as parameters or artifacts.""" +``` + +--- + +## 6. 
Module Modifications + +### 6.1 Files to Create + +| File | Purpose | +| ---------------------------------- | ---------------------------------------------------------------------- | +| `opto/trace/io/otel_semconv.py` | Semantic convention helpers | +| `opto/trace/io/mlflow_logger.py` | MLflow integration | +| `opto/trace/io/instrumentation.py` | `instrument_graph()` and `InstrumentedGraph` | +| `opto/trace/io/optimization.py` | `optimize_graph()` and related | +| `opto/trace/io/bindings.py` | `Binding` + `apply_updates()` mapping layer (param key → get/set) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) | + +### 6.2 Files to Modify + +| File | Changes | +| ----------------------------------------- | ------------------------------------------------ | +| `opto/trace/io/langgraph_otel_runtime.py` | Optional child span emission (gen_ai.* compatibility) | +| `opto/trace/io/otel_adapter.py` | Do not advance temporal chain on OTEL child spans (`parentSpanId` present) | +| `opto/trace/io/__init__.py` | Export new public APIs | +| `opto/trainer/loggers.py` | Add `MLflowTelemetryLogger` | + +### 6.3 Detailed Changes to `otel_adapter.py` # 🔴 (modification is already available in commit https://github.com/doxav/NewTrace/commit/237abb320b201abbd45a36f68b03ad951cd6011c) + +```python +# In otlp_traces_to_trace_json(), do not advance temporal chaining on OTEL child spans: +psid = sp.get("parentSpanId") +orig_has_parent = bool(psid) +... +# Before: +# prev_span_id = sid +# After: +if not orig_has_parent: + prev_span_id = sid +``` + +--- + +## 7. 
Implementation Plan + +### Phase 1: Core Infrastructure (Priority: High) + +| Task | Effort | Dependencies | +| -------------------------------------------- | ------ | --------------------------------------------- | +| Create `otel_semconv.py` with helpers | Xh | None | +| Enhance `TracingLLM` with child spans | Xh | otel_semconv.py | +| Update `otel_adapter.py` for temporal_ignore | 0h | None 🔴 (available in commit https://github.com/doxav/NewTrace/commit/237abb320b201abbd45a36f68b03ad951cd6011c) | +| Create `TelemetrySession` class | Xh | langgraph_otel_runtime.py | +| Add `bindings.py` (Binding + apply_updates) | Xh | optimize_graph(), instrument_graph() # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) | + +### Phase 2: High-Level API (Priority: High) + +| Task | Effort | Dependencies | +| ---------------------------------- | ------ | ---------------------------- | +| Implement `instrument_graph()` | Xh | TelemetrySession, TracingLLM | +| Implement `optimize_graph()` | Xh | instrument_graph | +| Create `InstrumentedGraph` wrapper | Xh | instrument_graph | + +### Phase 3: MLflow Integration (Priority: Medium) + +| Task | Effort | Dependencies | +| ------------------------------- | ------ | --------------------- | +| Create `MLflowTelemetryLogger` (OTEL/MLFlow) | Xh | BaseLogger | # 🔴 (to be cleared: identical or differences?) +| Integrate with TelemetrySession | Xh | MLflowTelemetryLogger | +| Add artifact export helpers | Xh | MLflowTelemetryLogger | + +### Phase 4: Testing & Documentation (Priority: High) + +| Task | Effort | Dependencies | +| ----------------------------- | ------ | ------------ | +| Unit tests for new modules | Xh | All modules | +| Integration test with StubLLM | Xh | All modules | +| Update README and examples | Xh | All modules | +| Prototype notebook | Xh | All modules | + +--- + +## 8. 
Agent Lightning Comparison + +### 8.1 API Comparison Table + +| Aspect | Agent Lightning | Trace (New API) | +| ------------------------------ | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Initialization** | `import agentlightning as agl` | `from opto.trace.io import instrument_graph` | +| **Agent / Graph Definition** | `@rollout` decorator | `instrument_graph(graph, ...)` (generic; supports `in_place=False` for non-intrusive wrapping) # 🔴 (support non-intrusive optimization without modifying original code) | +| **Trainable Fn/Var** | `initial_resources={...}` / agent args | Trace trainables: `trace.node(var, trainable=True)` and/or `trace.bundle(trainable=..., traceable_code=..., allow_external_dependencies=...)(fn)` + `bindings={...}` # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) | +| **LLM Calls** | Auto-instrumented via proxy | `TracingLLM.node_call()` wrapper | +| **Custom trace emission** | `emit_annotation(...)` / `emit_reward(...)` | `emit_trace(name, attrs)` + `TelemetrySession.tracer.start_as_current_span(...)` (manual spans/events) # 🔴 (provide a simple manual trace emission helper) | +| **Reward / feedback emission** | `emit_reward(value)` | `emit_reward(value, name)` (Agent Lightning semconv; `emit_agentlightning_reward` remains as an alias) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) | +| **Bindings & update apply** | Trainer updates resources internally | `apply_updates(updates, bindings)` (keys align with `param.`) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) | +| **Training Loop** | `Trainer.fit(agent, dataset)` | `optimize_graph(graph, queries)` | +| **Optimization** | RL/APO/SFT algorithms | TGJ → OPTO (OptoPrimeV2, 
TextGrad) | +| **Span Format** | `gen_ai.*` conventions | Dual: `param.*` + `gen_ai.*` (+ optional `agentlightning.reward.*`) # 🔴 (confirm we emit both param.* (optimizer) and gen_ai.* (observability) semconv) | + +### 8.2 Code Comparison + +**Agent Lightning (conceptual):** + +```python +import agentlightning as agl +from agentlightning import emit_reward, rollout + +@rollout +def agent(task: dict, prompt_template: str): + # LLM calls auto-instrumented + result = llm.chat(messages=[...]) + emit_reward(0.82) + return result + +trainer = agl.Trainer( + algorithm=agl.APO(), + initial_resources={"prompt_template": template} +) +trainer.fit(agent=agent, train_dataset=tasks) +``` + +**Trace (New API):** + +```python +from opto.trace.io import instrument_graph, optimize_graph + +# One-time instrumentation +graph = build_my_langgraph() +instrumented = instrument_graph( + graph, + trainable_keys={"", ""}, # 🔴 (example: replace placeholders with real node names to avoid accidental training) + llm=my_llm, +) + +# One-liner optimization +result = optimize_graph( + instrumented, + queries=test_queries, + iterations=5, +) +``` + +### 8.3 Key Differences + +| Feature | Agent Lightning | Trace | +| ----------------------- | ----------------------- | ------------------------------------- | +| **Optimization Target** | Prompt templates via RL | Prompts + code via gradient descent | +| **Trace Format** | Custom span storage | OTLP → TGJ → Trace nodes | +| **Feedback Signal** | Reward values | Structured feedback (score + reasons) | +| **Code Optimization** | Not supported | Supported via `__code_*` params | +| **Graph Support** | Generic agents | LangGraph-native | + +--- + +## 9) Notebooks (Deliverables from M1 onward) +Lock notebook deliverables per milestone to keep validation reviewable. 
# 🔴 (deliverables mirror Trace-Bench M0 notebook policy) + +Rule: each milestone delivers a notebook that is: # 🔴 (keep validation reviewable without running local code) +- committed with **executed outputs** (reviewers can inspect results without re-running) # 🔴 (avoid out-of-band validation) +- includes an **“Open in Colab”** badge in the first markdown cell (if repo policy permits) # 🔴 (one-click reproduction) +- writes outputs to a deterministic folder (e.g., `./logs/notebooks/<notebook_id>/`) and keeps artifacts small # 🔴 (keeps PRs reviewable) + +**Notebooks** +- **M1**: `notebooks/01_m1_instrument_and_optimize.ipynb` — runs in two modes: (a) StubLLM mode (no keys; deterministic) and (b) Live LLM mode (requires `OPENROUTER_API_KEY`, check colab secrets) to validate real-provider tracing + optimization; show that at least one `param.*` prompt value changes across iterations. # 🔴 (CI uses stub; notebook validates live) +- **M2**: `notebooks/02_m2_unified_telemetry.ipynb` — demonstrate unified telemetry surface across node spans + trainer metrics + optimizer logs (export at least one optimizer summary artifact + one metric series). # 🔴 (standard OTEL logger across Trace) +- **M3**: `notebooks/03_m3_mlflow_monitoring.ipynb` — demonstrate MLflow run containing metrics in general (any trace code) + OTLP/TGJ artifacts by constructing a `TelemetrySession(mlflow_experiment=..., auto_log_to_mlflow=True)` and passing it to `instrument_graph(session=...)` (so the same session captures OTEL and logs to MLflow). # 🔴 (monitoring integration) +--- + +## 10) Acceptance Criteria (SMART, verifiable) +Milestone-based checks (SMART) replacing the removed "Validation Criteria" table. 
# 🔴 (keeps validation minimal and verifiable) + +**Milestone definitions used in this plan:** # 🔴 (align acceptance wording with delivery phases) +- **M0**: Technical plan accepted (this document) # 🔴 (locks contracts before implementation) +- **M1**: Drop-in instrumentation + optimization driver (end-to-end): `instrument_graph` + `optimize_*` + demo refactor + Notebook M1. # 🔴 (prove core value early) +- **M2**: Standard telemetry across Trace components (trainer/optimizer/node): unified telemetry surface + Notebook M2. # 🔴 (standard OTEL logger) +- **M3**: MLflow monitoring + hardening + Notebook M3. # 🔴 (monitoring + artifacts) +- **M4 (optional)**: extra docs/notebooks polish if time. # 🔴 (do not block contract completion) + +### M0 (this document) +- **No unresolved review markers:** `grep -n "review required" T1_technical_plan_v3.md` returns **0** matches. # 🔴 (ensures the plan is unambiguous) +- **Navigation updated:** Table of contents includes sections 9–12 and anchors resolve in GitHub markdown preview. # 🔴 (prevents review friction) + +### M1 (instrumentation + optimization driver, end-to-end) +- **OTLP export works:** after emitting ≥1 manual span, `TelemetrySession.flush_otlp(clear=True)` returns OTLP JSON with ≥1 span and a second flush returns 0 spans (cleared). # 🔴 (verifies exporter + clear semantics) +- **TGJ conversion works:** `flush_tgj()` (or `otlp_to_tgj()`) produces TGJ docs that can be ingested by `ingest_tgj()` (or pass a schema validation) without exceptions. # 🔴 (verifies optimizer-compatible trace output) +- **Temporal chaining contract:** a unit test proves OTEL child spans (spans with `parentSpanId`) do **not** advance TGJ temporal chaining (i.e., they cannot become temporal parents of subsequent top-level spans). 
# 🔴 (prevents child spans from breaking sequential node chaining) +- **Bindings apply deterministically:** `apply_updates({...}, bindings, strict=True)` updates bound values in memory; missing keys raise a clear error; `strict=False` ignores unknown keys. # 🔴 (robust update application) +- **End-to-end update path (CI/StubLLM):** using a minimal LangGraph and StubLLM, `optimize_* (iterations>=2, apply_updates=True)` produces `best_updates` where keys ⊆ `bindings.keys()` and at least one bound prompt value changes between iteration 0 and final. # 🔴 (deterministic CI proof) +- **Notebook live validation:** with `OPENROUTER_API_KEY` set (check colab secrets), Notebook M1 runs the same loop against a real provider (small dataset; deterministic settings) and produces OTLP+TGJ artifacts containing at least one LLM call span plus `param.*` attributes. # 🔴 (real-world proof) +- **Tests + notebook gate:** new public APIs introduced for M1 have ≥1 pytest each; CI runs stub-only; Notebook M1 includes an “Open in Colab” badge and a live-run section. # 🔴 (hard requirement) +- **Notebook - Live run constraints:** live mode must use a tiny dataset (≤3 items), deterministic settings (`temperature=0`, fixed model name), and a hard budget guard (e.g., max tokens per call) to keep cost predictable and reduce output variance. **No secrets committed:** Notebook must read keys from environment / Colab secrets; no API keys or sensitive prompts are committed in outputs. # 🔴 (simple acceptance criteria + security) + + +### M2 (standard telemetry across Trace components) +- **Unified telemetry surface:** trainer metrics (BaseLogger), optimizer summary logs, and node spans can be exported through one telemetry surface (`TelemetrySession` / `UnifiedTelemetry`). # 🔴 (deliverable B) +- **Optimizer logs exported:** at least one optimizer summary artifact is exported (file or MLflow artifact later) and at least one metric series is emitted (e.g., `score`, `loss`, `latency_ms`). 
# 🔴 (monitoring completeness) +- **Non-intrusive instrumentation (if claimed):** `instrument_graph(..., in_place=False)` restores wrapped callables after run (no persistent graph mutation). # 🔴 (prevents accidental graph mutation) +- **Tests + notebook gate:** new public behaviors in M2 have pytest coverage, and Notebook M2 demonstrates unified telemetry with executed outputs + Colab badge. # 🔴 (hard requirement) + +### M3 (MLflow + export bundle) +- **MLflow is optional but robust:** when MLflow is unavailable/misconfigured, the run continues and logs a warning (no hard crash). # 🔴 (optional dependency hardening) +- **Bundle export is portable:** `export_run_bundle(output_dir, include_otlp=True, include_tgj=True, include_prompts=True)` creates a directory containing OTLP JSON, TGJ JSON, and a prompt snapshot file. # 🔴 (portable artifacts for review/debugging) + +### M4 (tests + docs + notebooks) +- **CI green:** unit + integration tests referenced in this plan pass in CI (stub mode; no paid LLM calls). # 🔴 (keeps PR review cheap and deterministic) +- **Docs complete:** README includes a minimal quickstart for `instrument_graph()` + `optimize_graph()`, plus a short “Bindings & apply_updates” guide. # 🔴 (developer adoption) +- **Notebooks delivered:** notebooks listed in Section 9 run end-to-end in StubLLM mode (no keys) AND include a live-provider section that runs when `OPENROUTER_API_KEY` is set. # 🔴 (reviewable + real validation) +--- + +## 11. 
Test & Validation Plan + +### 11.1 Unit Tests + +| Test File | Coverage | +| --------------------------------- | -------------------------------------------------------------------------------- | +| `tests/test_otel_semconv.py` | Semantic convention helpers | +| `tests/test_tracing_llm.py` | TracingLLM with child spans | +| `tests/test_telemetry_session.py` | Session management and export (incl span_attribute_filter) # 🔴 (allow redaction/truncation and dropping spans to avoid secrets/large payloads) | +| `tests/test_instrumentation.py` | instrument_graph() (incl bindings/in_place) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) | +| `tests/test_optimization.py` | optimize_graph() (incl EvalFn returning str/dict/float + apply_updates) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) | + +### 11.2 Integration Tests + +```python +# tests/test_integration_stubllm.py + +def test_full_optimization_flow_with_stubllm(): + """ + End-to-end test using StubLLM (no API calls). + + 1. Build a simple LangGraph + 2. Instrument with instrument_graph() + 3. Run optimize_graph() for 2 iterations + 4. 
Verify: + - OTLP spans contain expected attributes + - TGJ conversion produces valid nodes + - Optimizer produces parameter updates + - Updates are applied via bindings (or returned if apply_updates=False) # 🔴 (necessary binding between trace OTEL names and real variables/functions to allow optimizer updates) + """ +``` + +### 11.3 StubLLM for Testing + +```python +class StubLLM: + """Deterministic LLM stub for testing.""" + + def __init__(self, responses: Dict[str, str] = None): + self.responses = responses or {} + self.call_count = 0 + + def __call__(self, messages, **kwargs): + self.call_count += 1 + # Return deterministic response based on input + user_msg = messages[-1]["content"] if messages else "" + + # Match against known patterns + for pattern, response in self.responses.items(): + if pattern in user_msg: + return self._make_response(response) + + # Default response + return self._make_response('{"result": "stub response"}') + + def _make_response(self, content): + return type("R", (), { + "choices": [type("C", (), { + "message": type("M", (), {"content": content})() + })()] + })() +``` + +--- + +## 12. Appendix: Prototype Snippet + +This prototype demonstrates the target API working with a StubLLM. + +```python +""" +Prototype: instrument_graph + optimize_graph with StubLLM +============================================================ + +Run this to validate the API design before full implementation. 
+""" + +from __future__ import annotations +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Callable, Literal, Union +import json + +# ============================================================ +# STUB IMPLEMENTATIONS (to be replaced by real modules) +# ============================================================ + +class StubLLM: + """Deterministic LLM for testing.""" + + def __init__(self): + self.call_count = 0 + + def __call__(self, messages, **kwargs): + self.call_count += 1 + user_msg = messages[-1].get("content", "") if messages else "" + + # Generic heuristic responses (demo-only) + if "evaluate" in user_msg.lower(): + return self._resp('{"answer_relevance": 0.8, "groundedness": 0.7, "plan_quality": 0.9, "reasons": "Good structure"}') + return self._resp("stub response") + + def _resp(self, content): + return type("R", (), { + "choices": [type("C", (), { + "message": type("M", (), {"content": content})() + })()] + })() + + +@dataclass +class EvalResult: # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + score: Optional[float] = None # 🔴 (optional numeric reward (prototype supports text-only eval too)) + feedback: str = "" # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + metrics: Dict[str, Any] = field(default_factory=dict) # 🔴 (prototype: metrics capture parsed JSON fields for logging) + + +EvalFn = Callable[[Dict[str, Any]], Union[float, str, Dict[str, Any], EvalResult]] # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + + +def default_eval_fn(payload: Dict[str, Any]) -> EvalResult: # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + """Default eval: accept numeric score or JSON dict; always preserve textual feedback.""" # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + raw = payload.get("raw_eval", "") # 🔴 (prototype: accept evaluator output as number, 
JSON string, or dict) + if isinstance(raw, (int, float)): # 🔴 (if numeric, treat as score directly (no JSON parsing)) + return EvalResult(score=float(raw), feedback="", metrics={}) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + if isinstance(raw, str): # 🔴 (if string, attempt JSON parse; else treat as feedback text) + try: # 🔴 (prototype: JSON parse is best-effort (never crash optimization loop)) + d = json.loads(raw) # 🔴 (parse JSON-formatted evaluator output when present) + score = sum([d.get("answer_relevance", 0.5), d.get("groundedness", 0.5), d.get("plan_quality", 0.5)]) / 3 # 🔴 (demo-only scoring heuristic (simple average; weights TBD)) + return EvalResult(score=float(score), feedback=str(d.get("reasons", "")), metrics=d) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + except Exception: # 🔴 (fallback: preserve raw string as feedback when parse fails) + return EvalResult(score=None, feedback=raw, metrics={}) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + if isinstance(raw, dict): # 🔴 (if dict, treat as metrics payload and stringify feedback) + return EvalResult(score=None, feedback=str(raw), metrics=raw) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + return EvalResult(score=None, feedback=str(raw), metrics={}) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + + +# Minimal TelemetrySession stub +class TelemetrySession: + def __init__(self, service_name: str = "test", *, record_spans: bool = True, span_attribute_filter: Optional[Callable[[str, Dict[str, Any]], Dict[str, Any]]] = None): # 🔴 (allow disabling span recording for minimal/robust runs) + self.spans = [] + self.service_name = service_name + self.record_spans = record_spans # 🔴 (allow disabling span recording for minimal/robust runs) + self.span_attribute_filter = span_attribute_filter # 🔴 (allow redaction/truncation and dropping 
spans to avoid secrets/large payloads) + + def record_span(self, name: str, attrs: Dict[str, Any]): # 🔴 (stub-only: collect spans in memory to emulate exporter behaviour) + if not self.record_spans: # 🔴 (allow disabling span recording for minimal/robust runs) + return # 🔴 (early-exit when span recording is disabled (safe no-op mode)) + if self.span_attribute_filter is not None: # 🔴 (allow redaction/truncation and dropping spans to avoid secrets/large payloads) + attrs = self.span_attribute_filter(name, dict(attrs)) # 🔴 (allow redaction/truncation and dropping spans to avoid secrets/large payloads) + if attrs == {}: # allow filter to drop span # 🔴 (allow filter hook to drop spans by returning an empty dict) + return # 🔴 (early-exit when span is dropped by filter (do not record)) + self.spans.append({"name": name, "attributes": attrs}) + + def flush_otlp(self) -> Dict[str, Any]: + otlp_spans = [ + { + "spanId": f"span_{i}", + "name": s["name"], + "attributes": [ + {"key": k, "value": {"stringValue": str(v)}} + for k, v in s["attributes"].items() + ] + } + for i, s in enumerate(self.spans) + ] + self.spans.clear() + return { + "resourceSpans": [{ + "resource": {"attributes": []}, + "scopeSpans": [{ + "scope": {"name": self.service_name}, + "spans": otlp_spans + }] + }] + } + + +# Minimal TracingLLM stub +class TracingLLM: + def __init__(self, llm, session: TelemetrySession, trainable_keys=None): + self.llm = llm + self.session = session + self.trainable_keys = trainable_keys # keep None meaning "all trainable" # 🔴 (prototype: None => all nodes trainable; matches instrument_graph default) + + def node_call(self, *, span_name, template_name=None, template=None, + optimizable_key=None, messages=None, **kwargs) -> str: + # Record span + attrs = {} + if template_name and template: + attrs[f"param.{template_name}"] = template + # If trainable_keys is None => all trainable; else explicit membership # 🔴 (emit explicit trainable marker for TGJ/optimizer consumption) + 
trainable = True if self.trainable_keys is None else (optimizable_key in self.trainable_keys) # 🔴 (trainable flag depends on trainable_keys (None means all)) + attrs[f"param.{template_name}.trainable"] = trainable # 🔴 (record trainable flag alongside param value for debuggability) + attrs["gen_ai.model"] = "stub" + attrs["inputs.gen_ai.prompt"] = messages[-1]["content"] if messages else "" + + self.session.record_span(span_name, attrs) + + # Call LLM + return self.llm(messages=messages, **kwargs).choices[0].message.content + + +# ============================================================ +# PROTOTYPE: instrument_graph() +# ============================================================ + +@dataclass +class InstrumentedGraph: + """Instrumented LangGraph wrapper.""" + + graph: Any # The actual LangGraph + session: TelemetrySession + tracing_llm: TracingLLM + templates: Dict[str, str] = field(default_factory=dict) + eval_fn: EvalFn = default_eval_fn # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + + def invoke(self, state: Dict[str, Any]) -> Dict[str, Any]: + """Execute graph with telemetry capture.""" + # For prototype, simulate a minimal flow without hard-coding node names as "the API" (only the demo does). 
# 🔴 (prototype-only flow; real implementation wraps arbitrary node callables) + query = state.get("query", "") # 🔴 (prototype state shape; real graphs use user-defined state schema) + + # Simulate a generic "answer" node (demo-only) + answer = self.tracing_llm.node_call( + span_name="answer_node", + template_name="answer_prompt", + template=self.templates.get("answer_prompt", "Default answer template"), + optimizable_key="answer_node", + messages=[{"role": "user", "content": f"Answer: {query}"}], + ) + + # Simulate evaluator + raw_eval = self.tracing_llm.node_call( + span_name="evaluator", + messages=[{"role": "user", "content": f"Evaluate: {answer}"}], + ) + + er = self.eval_fn({"query": query, "answer": answer, "raw_eval": raw_eval}) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + if isinstance(er, (int, float)): # 🔴 (normalize eval_fn return types into EvalResult (float/str/dict)) + er = EvalResult(score=float(er), feedback="", metrics={}) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + elif isinstance(er, str): # 🔴 (normalize eval_fn return types into EvalResult (float/str/dict)) + er = EvalResult(score=None, feedback=er, metrics={}) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + elif isinstance(er, dict): # 🔴 (normalize eval_fn return types into EvalResult (float/str/dict)) + er = EvalResult(score=er.get("score"), feedback=str(er.get("feedback", "")), metrics=er) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + + # Record eval span (score optional; feedback always preserved) # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + eval_attrs = {"eval.feedback": er.feedback, "eval.reasons": er.feedback} # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + if er.score is not None: # 🔴 (only set eval.score when numeric is available (optimizer requirement)) + 
eval_attrs["eval.score"] = str(er.score) # 🔴 (record numeric eval.score for optimizers that require rewards) + self.session.record_span("evaluator", eval_attrs) # 🔴 (record eval attributes as a separate span/event for traceability) + + return {"answer": answer, "score": er.score, "feedback": er.feedback, "metrics": er.metrics} # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + + +def instrument_graph( + graph: Any, + *, + service_name: str = "langgraph-agent", + trainable_keys: Optional[set] = None, + llm: Optional[Any] = None, + initial_templates: Optional[Dict[str, str]] = None, + eval_fn: Optional[EvalFn] = None, # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) +) -> InstrumentedGraph: + """ + Wrap a LangGraph with automatic OTEL instrumentation. + + This is the main entry point for the new API. + """ + session = TelemetrySession(service_name) + + tracing_llm = TracingLLM( + llm=llm or StubLLM(), + session=session, + trainable_keys=trainable_keys, # None means "all trainable"; no hard-coded planner/synthesizer # 🔴 (prototype: trainable_keys=None means train all prompts by default) + ) + + return InstrumentedGraph( + graph=graph, + session=session, + tracing_llm=tracing_llm, + templates=initial_templates or {}, + eval_fn=eval_fn or default_eval_fn, # 🔴 (support evaluation as score or string feedback (Trace/TextGrad compatible)) + ) diff --git a/docs/architecture_and_strategy.md b/docs/architecture_and_strategy.md new file mode 100644 index 00000000..ae0da0a3 --- /dev/null +++ b/docs/architecture_and_strategy.md @@ -0,0 +1,986 @@ +# LangGraph OTEL Instrumentation: Architecture & Strategy + +## Table of Contents + +1. [Executive Summary](#executive-summary) +2. [Problem Statement](#problem-statement) +3. [Strategy Overview](#strategy-overview) +4. [System Architecture](#system-architecture) +5. [Component Deep Dive](#component-deep-dive) +6. [Data Flow](#data-flow) +7. 
[Semantic Conventions](#semantic-conventions) +8. [Optimization Pipeline](#optimization-pipeline) +9. [Integration Points](#integration-points) +10. [Implementation Roadmap](#implementation-roadmap) + +--- + +## Executive Summary + +This document outlines the architecture and strategy for creating a **unified OTEL instrumentation API** for LangGraph agents. The solution enables: + +- **Simplified tracing**: One function call instruments entire graphs +- **Dual compatibility**: Traces work with both Trace (TGJ) and Agent Lightning +- **Unified optimization**: Single API for running optimization loops +- **Flexible backends**: Support for multiple LLM providers + +--- + +## Problem Statement + +### Current State (Before) + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ CURRENT: Manual OTEL Instrumentation │ +│ (~645 lines of boilerplate) │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────┐ │ +│ │ OTEL Setup │ ~80 lines: TracerProvider, SpanProcessor, │ +│ │ (Boilerplate) │ InMemoryExporter, Tracer init │ +│ └──────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────┐ │ +│ │ TracingLLM Class │ ~100 lines: Wrapper class definition, │ +│ │ (Boilerplate) │ span creation, attribute setting │ +│ └──────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────┐ │ +│ │ Node Functions │ ~25 lines PER NODE: Manual span creation, │ +│ │ (Per-node code) │ attribute recording │ +│ └──────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────┐ │ +│ │ Optimization │ ~150 lines: Loop setup, trace capture, │ +│ │ Loop (Manual) │ score tracking, template update │ +│ └──────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────┐ │ +│ │ Export & Convert │ ~50 lines: OTLP export, TGJ conversion, │ +│ │ (Manual) │ file saving │ +│ └──────────────────┘ │ +│ │ +│ TOTAL: ~645 lines of repeated boilerplate across demos │ +└─────────────────────────────────────────────────────────────────────────────┘ 
+``` + +### Issues Identified + +| Issue | Impact | Lines Affected | +|-------|--------|----------------| +| OTEL setup repeated in every demo | Code duplication | ~80 lines | +| TracingLLM redefined per file | Inconsistent behavior | ~100 lines | +| Manual span creation per node | Error-prone, verbose | ~25 lines/node | +| Optimization loop copy-pasted | Hard to maintain | ~150 lines | +| No Agent Lightning compatibility | Limited observability | N/A | +| Fragmented logging | Inconsistent metrics | ~50 lines | + +--- + +## Strategy Overview + +### Chosen Approach: "Trace-first, Dual Semconv" + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ STRATEGY: Trace-First, Dual Semconv │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ DESIGN PRINCIPLES │ │ +│ ├─────────────────────────────────────────────────────────────────────┤ │ +│ │ │ │ +│ │ 1. TRACE-FIRST: Optimize for Trace framework compatibility │ │ +│ │ - param.* attributes for trainable parameters │ │ +│ │ - inputs.* / outputs.* for data flow │ │ +│ │ - Temporal hierarchy preserved for TGJ │ │ +│ │ │ │ +│ │ 2. DUAL SEMCONV: Also emit Agent Lightning conventions │ │ +│ │ - gen_ai.* attributes on child spans │ │ +│ │ - agentlightning.reward.* for evaluation metrics │ │ +│ │ - Compatible with standard OTEL dashboards │ │ +│ │ │ │ +│ │ 3. MINIMAL USER CODE: Hide complexity behind simple API │ │ +│ │ - instrument_graph() - one call to add tracing │ │ +│ │ - optimize_langgraph() - one call for optimization │ │ +│ │ - No manual span creation required │ │ +│ │ │ │ +│ │ 4. 
TEMPORAL ISOLATION: Child spans don't break TGJ │ │ +│ │ - trace.temporal_ignore attribute on GenAI spans │ │ +│ │ - Preserves node-to-node execution flow │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Target State (After) + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ TARGET: Simplified API (~10 lines) │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ from trace_api import instrument_graph, optimize_langgraph │ +│ │ +│ # ONE CALL to instrument │ +│ instrumented = instrument_graph( │ +│ graph=my_langgraph, │ +│ trainable_keys={"planner", "synthesizer"}, │ +│ ) │ +│ │ +│ # ONE CALL to optimize │ +│ result = optimize_langgraph( │ +│ instrumented, │ +│ queries=["Q1", "Q2"], │ +│ iterations=5, │ +│ ) │ +│ │ +│ print(f"Best score: {result.best_score}") │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## System Architecture + +### High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SYSTEM ARCHITECTURE │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────┐ │ +│ │ User Code │ │ +│ └──────┬──────┘ │ +│ │ │ +│ ┌───────────────┼───────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌────────────────┐ ┌──────────┐ ┌────────────────┐ │ +│ │instrument_graph│ │ invoke │ │optimize_langgraph│ │ +│ └───────┬────────┘ └────┬─────┘ └───────┬────────┘ │ +│ │ │ │ │ +│ └───────────────┼───────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ InstrumentedGraph │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ │ │ │ +│ │ │ ┌──────────────┐ ┌──────────────────┐ ┌──────────────┐ │ │ │ +│ │ │ │ StateGraph │ │ TelemetrySession 
│ │ TracingLLM │ │ │ │ +│ │ │ │ (LangGraph) │ │ (OTEL Spans) │ │ (Wrapper) │ │ │ │ +│ │ │ └──────┬───────┘ └────────┬─────────┘ └──────┬───────┘ │ │ │ +│ │ │ │ │ │ │ │ │ +│ │ │ └───────────────────┼───────────────────┘ │ │ │ +│ │ │ │ │ │ │ +│ │ └─────────────────────────────┼──────────────────────────────┘ │ │ +│ │ │ │ │ +│ └────────────────────────────────┼──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ LLM Backend │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ OpenRouterLLM │ OR │ StubLLM │ │ │ +│ │ │ (Real API calls)│ │ (Testing mode) │ │ │ +│ │ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Output Layer │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌───────────┐ │ │ +│ │ │ OTLP JSON │ │ TGJ Format │ │ MLflow │ │ Console │ │ │ +│ │ │ Export │ │ (Future) │ │ (Future) │ │ Logs │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ └───────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Component Interaction Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ COMPONENT INTERACTIONS │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ instrument_graph() │ │ +│ │ │ │ +│ │ Input: Output: │ │ +│ │ - graph (StateGraph) - InstrumentedGraph │ │ +│ │ - service_name ├── .graph (compiled) │ │ +│ │ - trainable_keys ├── .session (TelemetrySession) │ │ +│ │ - initial_templates ├── .tracing_llm (TracingLLM) │ │ +│ │ - llm (optional) └── .templates (Dict) │ │ +│ │ │ │ +│ 
└────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ creates │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ InstrumentedGraph │ │ +│ │ │ │ +│ │ .invoke(state) │ │ +│ │ │ │ │ +│ │ ├──► Initializes AgentState │ │ +│ │ ├──► Runs compiled graph │ │ +│ │ │ │ │ │ +│ │ │ ├──► planner_node() ──► TracingLLM.node_call() │ │ +│ │ │ ├──► researcher_node() ──► TracingLLM.node_call() │ │ +│ │ │ ├──► synthesizer_node() ──► TracingLLM.node_call() │ │ +│ │ │ └──► evaluator_node() ──► TracingLLM.node_call() │ │ +│ │ │ │ │ +│ │ ├──► Records evaluation metrics span │ │ +│ │ └──► Returns {answer, score, metrics, ...} │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ uses │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ TracingLLM │ │ +│ │ │ │ +│ │ .node_call(span_name, template_name, template, messages) │ │ +│ │ │ │ │ +│ │ ├──► Creates PARENT span (Trace-compatible) │ │ +│ │ │ - param.{template_name} = template │ │ +│ │ │ - param.{template_name}.trainable = true/false │ │ +│ │ │ - inputs.gen_ai.prompt = user_message │ │ +│ │ │ │ │ +│ │ ├──► Creates CHILD span (Agent Lightning-compatible) │ │ +│ │ │ - trace.temporal_ignore = "true" │ │ +│ │ │ - gen_ai.operation.name = "chat" │ │ +│ │ │ - gen_ai.provider.name = "openrouter" │ │ +│ │ │ - gen_ai.input.messages = [...] │ │ +│ │ │ - gen_ai.output.messages = [...] 
│ │ +│ │ │ │ │ +│ │ ├──► Calls underlying LLM (OpenRouter/Stub) │ │ +│ │ └──► Returns response content │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ records to │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ TelemetrySession │ │ +│ │ │ │ +│ │ .start_span(name) -> SpanContext │ │ +│ │ - Creates span with traceId, spanId, timestamps │ │ +│ │ - Returns context manager for attribute setting │ │ +│ │ │ │ +│ │ .flush_otlp() -> Dict │ │ +│ │ - Exports all spans to OTLP JSON format │ │ +│ │ - Clears internal span buffer │ │ +│ │ - Returns format compatible with otel_adapter │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Component Deep Dive + +### 1. TelemetrySession + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ TelemetrySession │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ PURPOSE: Centralized OTEL span management and export │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Internal State: │ │ +│ │ │ │ +│ │ service_name: str # Identifies the service in traces │ │ +│ │ _spans: List[Dict] # In-memory span storage │ │ +│ │ _span_counter: int # Auto-incrementing span IDs │ │ +│ │ _trace_id: str # Current trace identifier │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Methods: │ │ +│ │ │ │ +│ │ start_span(name) -> SpanContext │ │ +│ │ │ │ │ +│ │ └──► Creates span dict with: │ │ +│ │ - traceId: current trace ID │ │ +│ │ - spanId: auto-generated │ │ +│ │ - name: provided name │ │ +│ │ - startTimeUnixNano: current timestamp │ │ +│ │ - attributes: {} (empty, filled by SpanContext) │ │ +│ │ 
│ │ +│ │ flush_otlp(clear=True) -> Dict │ │ +│ │ │ │ │ +│ │ └──► Exports to OTLP JSON: │ │ +│ │ { │ │ +│ │ "resourceSpans": [{ │ │ +│ │ "scopeSpans": [{ │ │ +│ │ "scope": {"name": service_name}, │ │ +│ │ "spans": [... all spans ...] │ │ +│ │ }] │ │ +│ │ }] │ │ +│ │ } │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 2. TracingLLM + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ TracingLLM │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ PURPOSE: Wrap LLM calls with dual semantic convention spans │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Configuration: │ │ +│ │ │ │ +│ │ llm: Any # Underlying LLM client │ │ +│ │ session: TelemetrySession # For span recording │ │ +│ │ trainable_keys: Set[str] # Which nodes have trainable prompts │ │ +│ │ provider_name: str # "openrouter", "openai", etc. │ │ +│ │ emit_genai_child_span: bool # Whether to emit Agent Lightning spans│ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ node_call() Flow: │ │ +│ │ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ STEP 1: Create Parent Span (Trace-compatible) │ │ │ +│ │ │ │ │ │ +│ │ │ span_name: "planner" │ │ │ +│ │ │ attributes: │ │ │ +│ │ │ param.planner_prompt: "You are a planning agent..." │ │ │ +│ │ │ param.planner_prompt.trainable: "True" │ │ │ +│ │ │ gen_ai.model: "llama-3.1-8b" │ │ │ +│ │ │ inputs.gen_ai.prompt: "Plan for: What is AI?" 
│ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ STEP 2: Create Child Span (Agent Lightning-compatible) │ │ │ +│ │ │ │ │ │ +│ │ │ span_name: "openrouter.chat.completion" │ │ │ +│ │ │ attributes: │ │ │ +│ │ │ trace.temporal_ignore: "true" ◄── KEY ATTRIBUTE │ │ │ +│ │ │ gen_ai.operation.name: "chat" │ │ │ +│ │ │ gen_ai.provider.name: "openrouter" │ │ │ +│ │ │ gen_ai.request.model: "llama-3.1-8b" │ │ │ +│ │ │ gen_ai.input.messages: "[{role: user, ...}]" │ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ STEP 3: Call LLM │ │ │ +│ │ │ │ │ │ +│ │ │ response = llm(messages=messages, **kwargs) │ │ │ +│ │ │ content = response.choices[0].message.content │ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ STEP 4: Record Output & Return │ │ │ +│ │ │ │ │ │ +│ │ │ Child span attribute: │ │ │ +│ │ │ gen_ai.output.messages: "[{role: assistant, ...}]" │ │ │ +│ │ │ │ │ │ +│ │ │ Return: content (string) │ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3. 
InstrumentedGraph + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ InstrumentedGraph │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ PURPOSE: Wrapper that adds telemetry to LangGraph execution │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Properties: │ │ +│ │ │ │ +│ │ graph: CompiledGraph # The compiled LangGraph │ │ +│ │ session: TelemetrySession # For span export │ │ +│ │ tracing_llm: TracingLLM # For instrumented LLM calls │ │ +│ │ templates: Dict[str, str] # Prompt templates │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ invoke(state) Flow: │ │ +│ │ │ │ +│ │ INPUT: {"query": "What is AI?"} │ │ +│ │ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Build Initial State │ │ │ +│ │ │ query: "What is AI?" 
│ │ │ +│ │ │ plan: {} │ │ │ +│ │ │ research_results: [] │ │ │ +│ │ │ answer: "" │ │ │ +│ │ │ evaluation: {} │ │ │ +│ │ │ planner_template: │ │ │ +│ │ │ synthesizer_template: │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Execute Graph (generates spans via TracingLLM) │ │ │ +│ │ │ │ │ │ +│ │ │ START ──► planner ──► researcher ──► synthesizer │ │ │ +│ │ │ │ │ │ │ +│ │ │ ▼ │ │ │ +│ │ │ evaluator ──► END │ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Record Evaluation Metrics │ │ │ +│ │ │ │ │ │ +│ │ │ Span: "evaluation_metrics" │ │ │ +│ │ │ eval.score: 0.933 │ │ │ +│ │ │ eval.answer_relevance: 0.95 │ │ │ +│ │ │ eval.groundedness: 0.90 │ │ │ +│ │ │ eval.plan_quality: 0.95 │ │ │ +│ │ │ │ │ │ +│ │ │ Child Span: "agentlightning.annotation" │ │ │ +│ │ │ trace.temporal_ignore: "true" │ │ │ +│ │ │ agentlightning.reward.0.name: "final_score" │ │ │ +│ │ │ agentlightning.reward.0.value: "0.933" │ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ OUTPUT: │ │ +│ │ { │ │ +│ │ "answer": "AI is...", │ │ +│ │ "plan": {...}, │ │ +│ │ "research_results": [...], │ │ +│ │ "score": 0.933, │ │ +│ │ "metrics": {"answer_relevance": 0.95, ...}, │ │ +│ │ "reasons": "Good structure..." 
│ │ +│ │ } │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Data Flow + +### Single Execution Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SINGLE EXECUTION DATA FLOW │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ USER INPUT │ +│ │ │ +│ │ {"query": "What is AI?"} │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ PLANNER NODE │ │ +│ │ │ │ +│ │ Input: query = "What is AI?" │ │ +│ │ Template: "You are a planning agent..." │ │ +│ │ │ │ +│ │ ┌────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SPAN: planner │ │ │ +│ │ │ param.planner_prompt =