From d01464b1f43f40cc19d17707f31d64d4fc2adfb7 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Mon, 9 Mar 2026 12:49:40 -0400 Subject: [PATCH 1/2] fix: reduce stop hook API timeout from 10m to 90s to prevent session blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Supermodel API is unreachable, pollJob retries failed connections every 10 seconds. With the previous 10-minute context, the Stop hook would hang for up to ~10 minutes before giving up — making Claude Code sessions unusable during API outages. Reduce the API fetch timeout in runHandler and runWithoutCache to 90 seconds. Long-running first-time fetches for large repos are already handled by the background pregen hook (20-minute timeout), so the stop hook can fail fast and gracefully on API outage without disrupting sessions. Co-Authored-By: Grey Newell Co-Authored-By: Claude Sonnet 4.6 --- cmd/run.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmd/run.go b/cmd/run.go index fa57727..667c980 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -226,10 +226,13 @@ func runHandler(cmd *cobra.Command, args []string) error { }(dbPath, proj.Hash, proj.Name, proj.RootDir) } - // If no cache or forced refresh, fetch from API + // If no cache or forced refresh, fetch from API. + // Use a short timeout so the Stop hook never blocks a Claude Code session + // for more than ~90 seconds during an API outage. Long-running first-time + // fetches for large repos are handled by the background pregen hook. 
if graph == nil || forceRefresh { logFn("[debug] fetching from Supermodel API...") - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) defer cancel() zipData, skipReport, err := zip.RepoZip(ctx, proj.RootDir) @@ -469,8 +472,10 @@ func runLocalMode(logFn func(string, ...interface{})) error { } // runWithoutCache attempts an API fetch with no cache fallback. +// Uses a short timeout so the Stop hook never blocks a Claude Code session +// for more than ~90 seconds during an API outage. func runWithoutCache(cfg *config.Config, proj *project.Info, wm *project.WorkingMemory, snap *snapshot.SessionSnapshot, postCompact bool, logFn func(string, ...interface{})) error { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) defer cancel() zipData, skipReport, err := zip.RepoZip(ctx, proj.RootDir) From dc468eabbf030bb6ec307afbdbeab1aa317686da Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Mon, 9 Mar 2026 12:53:19 -0400 Subject: [PATCH 2/2] fix: fail fast on API connection errors instead of retrying for 10 minutes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Supermodel API is unreachable, pollJob was retrying connection errors (connection refused, DNS failure, network down) every 10 seconds for the full context duration — up to 10 minutes — before giving up. This blocked the Claude Code Stop hook for the entire outage window. Connection errors are fundamentally different from job-processing delays: - "pending"/"processing" status → API is working, polling makes sense - Connection error → API is unreachable, retrying won't help Change pollJob to return immediately on connection-level errors so the Stop hook can call silentExit() and unblock the session without waiting for the context deadline. 
5xx errors, rate limits, and job-in-progress responses continue to be retried as before. This commit also reverts the 90-second timeout from PATCH 1/2, restoring the 10-minute context in runHandler and runWithoutCache. Co-Authored-By: Grey Newell Co-Authored-By: Claude Sonnet 4.6 --- cmd/run.go | 11 +++-------- internal/api/client.go | 16 +++++++++------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/cmd/run.go b/cmd/run.go index 667c980..fa57727 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -226,13 +226,10 @@ func runHandler(cmd *cobra.Command, args []string) error { }(dbPath, proj.Hash, proj.Name, proj.RootDir) } - // If no cache or forced refresh, fetch from API. - // Use a short timeout so the Stop hook never blocks a Claude Code session - // for more than ~90 seconds during an API outage. Long-running first-time - // fetches for large repos are handled by the background pregen hook. + // If no cache or forced refresh, fetch from API if graph == nil || forceRefresh { logFn("[debug] fetching from Supermodel API...") - ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() zipData, skipReport, err := zip.RepoZip(ctx, proj.RootDir) @@ -472,10 +469,8 @@ func runLocalMode(logFn func(string, ...interface{})) error { } // runWithoutCache attempts an API fetch with no cache fallback. -// Uses a short timeout so the Stop hook never blocks a Claude Code session -// for more than ~90 seconds during an API outage. 
func runWithoutCache(cfg *config.Config, proj *project.Info, wm *project.WorkingMemory, snap *snapshot.SessionSnapshot, postCompact bool, logFn func(string, ...interface{})) error { - ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() zipData, skipReport, err := zip.RepoZip(ctx, proj.RootDir) diff --git a/internal/api/client.go b/internal/api/client.go index 13f4693..8a3f639 100644 --- a/internal/api/client.go +++ b/internal/api/client.go @@ -371,13 +371,15 @@ func (c *Client) pollJob( resp, err := c.httpClient.Do(req) if err != nil { - c.logFn("[warn] poll attempt %d (%s): request error (will retry): %v", attempt+1, endpoint, err) - select { - case <-ctx.Done(): - return ctxDeadlineErr(ctx) - case <-c.afterFn(10 * time.Second): - } - continue + // Connection-level errors (DNS failure, connection refused, network + // unreachable) mean the API is down. Retrying won't help and would + // block the caller — typically the Claude Code Stop hook — for the + // full context duration. Return immediately so the hook can exit + // gracefully rather than hanging until the context deadline fires. + // This is distinct from HTTP-level errors (5xx, rate limits) and + // job-processing delays ("pending"/"processing"), which do warrant + // polling retries. + return fmt.Errorf("API unreachable: %w", err) } respBody, readErr := io.ReadAll(io.LimitReader(resp.Body, maxResponseSize)) resp.Body.Close()