From 9af8d20c24bed5756e1979cd1adde26fe9ed5adc Mon Sep 17 00:00:00 2001 From: cjmckenna Date: Thu, 25 Jun 2026 10:36:52 -0400 Subject: [PATCH] fix: stop broker readiness race and reap prior-session orphans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related reliability problems make `task`/review dispatches intermittent and slow, most visibly on Windows: 1. Broker readiness is a fixed-timer race. ensureBrokerSession spawns a `codex app-server` broker and waits a hardcoded 2000ms for it to bind its endpoint (and 150ms when probing an existing one). A cold app-server on Windows (Defender on-open scanning + MCP server init) routinely needs longer, so the wait times out, the broker is torn down, and the caller falls back to spawning a *second* app-server — two cold starts for one request, non-deterministically. waitForBrokerEndpoint also only watches the wall clock, so a healthy-but-slow broker is killed the instant it crosses the deadline. - Make both windows configurable via CODEX_COMPANION_BROKER_TIMEOUT_MS and CODEX_COMPANION_BROKER_REUSE_PROBE_MS, and raise the win32 defaults (15000ms / 750ms; 4000ms / 250ms elsewhere). - Give waitForBrokerEndpoint an optional child handle so it abandons early when the spawned broker has actually exited, instead of burning the (now longer) timeout. 2. Orphans accumulate. Cleanup only runs at SessionEnd and only for the current session, but on Windows that hook frequently never fires (abrupt terminal close / crash / window reload). Dead-pid brokers and ghost running/queued job records from prior sessions pile up, so /codex:status reports phantom jobs and a stale activeTask can block new dispatches. - SessionStart now self-heals: neutralize dead-pid running/queued job records (state + per-job files) and tear down a dead persisted broker. Best-effort; never blocks startup. Adds tests/broker-lifecycle.test.mjs covering the timeout and the child-exit early-bail. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../codex/scripts/lib/broker-lifecycle.mjs | 26 ++++- .../codex/scripts/session-lifecycle-hook.mjs | 95 ++++++++++++++++++- tests/broker-lifecycle.test.mjs | 39 ++++++++ 3 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 tests/broker-lifecycle.test.mjs diff --git a/plugins/codex/scripts/lib/broker-lifecycle.mjs b/plugins/codex/scripts/lib/broker-lifecycle.mjs index ef763819..6bae6f51 100644 --- a/plugins/codex/scripts/lib/broker-lifecycle.mjs +++ b/plugins/codex/scripts/lib/broker-lifecycle.mjs @@ -12,6 +12,20 @@ export const PID_FILE_ENV = "CODEX_COMPANION_APP_SERVER_PID_FILE"; export const LOG_FILE_ENV = "CODEX_COMPANION_APP_SERVER_LOG_FILE"; const BROKER_STATE_FILE = "broker.json"; +// APT local patch (re: openai/codex-plugin-cc broker readiness race). +// Upstream waits a fixed 2000ms for a freshly-spawned `codex app-server` to bind +// its endpoint, then gives up and lets the caller spawn a duplicate. On Windows a +// cold app-server (Defender on-open scan + MCP fleet init) routinely needs longer, +// so the fixed timer loses the race intermittently. Make both windows env-tunable +// and raise the win32 defaults; ensureBrokerSession also now waits on child +// liveness so a healthy-but-slow broker is never torn down prematurely. +const DEFAULT_BROKER_READY_TIMEOUT_MS = + Number(process.env.CODEX_COMPANION_BROKER_TIMEOUT_MS) || + (process.platform === "win32" ? 15000 : 4000); +const DEFAULT_BROKER_REUSE_PROBE_MS = + Number(process.env.CODEX_COMPANION_BROKER_REUSE_PROBE_MS) || + (process.platform === "win32" ? 750 : 250); + export function createBrokerSessionDir(prefix = "cxc-") { return fs.mkdtempSync(path.join(os.tmpdir(), prefix)); } @@ -21,7 +35,7 @@ function connectToEndpoint(endpoint) { return net.createConnection({ path: target.path }); } -export async function waitForBrokerEndpoint(endpoint, timeoutMs = 2000) { +export async function waitForBrokerEndpoint(endpoint, timeoutMs = 2000, child = null) { const start = Date.now(); while (Date.now() - start < timeoutMs) { const ready = await new Promise((resolve) => { @@ -35,6 +49,12 @@ export async function waitForBrokerEndpoint(endpoint, timeoutMs = 2000) { if (ready) { return true; } + // APT local patch: if the spawned broker has already exited there is nothing + // left to bind the endpoint, so stop waiting immediately instead of burning + // the remainder of the (now longer) timeout. + if (child && child.exitCode !== null) { + return false; + } await new Promise((resolve) => setTimeout(resolve, 50)); } return false; @@ -104,7 +124,7 @@ async function isBrokerEndpointReady(endpoint) { return false; } try { - return await waitForBrokerEndpoint(endpoint, 150); + return await waitForBrokerEndpoint(endpoint, DEFAULT_BROKER_REUSE_PROBE_MS); } catch { return false; } @@ -146,7 +166,7 @@ export async function ensureBrokerSession(cwd, options = {}) { env: options.env ?? process.env }); - const ready = await waitForBrokerEndpoint(endpoint, options.timeoutMs ?? 2000); + const ready = await waitForBrokerEndpoint(endpoint, options.timeoutMs ?? DEFAULT_BROKER_READY_TIMEOUT_MS, child); if (!ready) { teardownBrokerSession({ endpoint, diff --git a/plugins/codex/scripts/session-lifecycle-hook.mjs b/plugins/codex/scripts/session-lifecycle-hook.mjs index 778571e6..e4f144ca 100644 --- a/plugins/codex/scripts/session-lifecycle-hook.mjs +++ b/plugins/codex/scripts/session-lifecycle-hook.mjs @@ -13,7 +13,14 @@ import { sendBrokerShutdown, teardownBrokerSession } from "./lib/broker-lifecycle.mjs"; -import { loadState, resolveStateFile, saveState } from "./lib/state.mjs"; +import { + loadState, + readJobFile, + resolveJobFile, + resolveStateFile, + saveState, + writeJobFile +} from "./lib/state.mjs"; import { TRANSCRIPT_PATH_ENV } from "./lib/claude-session-transfer.mjs"; import { resolveWorkspaceRoot } from "./lib/workspace.mjs"; @@ -74,10 +81,96 @@ function cleanupSessionJobs(cwd, sessionId) { }); } +// SessionEnd only cleans the *current* session, and on Windows that hook +// frequently never fires (abrupt terminal close / crash / window reload). So +// dead-pid brokers and ghost "running"/"queued" job records from prior sessions +// accumulate, making `/codex:status` report phantom jobs and (if `activeTask` +// ever points at one) blocking new dispatches. SessionStart already runs, so use +// it to self-heal prior-session orphans. +function isPidAlive(pid) { + if (!Number.isFinite(pid) || pid <= 0) { + return false; + } + try { + process.kill(pid, 0); + return true; + } catch (error) { + // EPERM => the process exists but we may not signal it; treat as alive. + return error?.code === "EPERM"; + } +} + +function reapStaleJobs(cwd) { + const workspaceRoot = resolveWorkspaceRoot(cwd); + const stateFile = resolveStateFile(workspaceRoot); + if (!fs.existsSync(stateFile)) { + return; + } + + const state = loadState(workspaceRoot); + let changed = false; + for (const job of state.jobs) { + const active = job.status === "queued" || job.status === "running"; + if (!active || isPidAlive(job.pid)) { + continue; + } + job.status = "failed"; + job.pid = null; + changed = true; + // Keep the per-job record consistent with the neutralized state entry. + try { + const jobFile = resolveJobFile(workspaceRoot, job.id); + if (fs.existsSync(jobFile)) { + const record = readJobFile(jobFile); + record.status = "failed"; + record.pid = null; + writeJobFile(workspaceRoot, job.id, record); + } + } catch { + // Ignore per-job file write failures during self-heal. + } + } + + if (changed) { + saveState(workspaceRoot, state); + } +} + +function reapOrphanBroker(cwd) { + const existing = loadBrokerSession(cwd); + if (!existing || isPidAlive(existing.pid)) { + return; + } + // The persisted broker's owning process is gone — tear down its stale endpoint + // so the next task spawns a fresh broker instead of probing a dead one. + teardownBrokerSession({ + endpoint: existing.endpoint ?? null, + pidFile: existing.pidFile ?? null, + logFile: existing.logFile ?? null, + sessionDir: existing.sessionDir ?? null, + pid: existing.pid ?? null, + killProcess: terminateProcessTree + }); + clearBrokerSession(cwd); +} + function handleSessionStart(input) { appendEnvVar(SESSION_ID_ENV, input.session_id); appendEnvVar(TRANSCRIPT_PATH_ENV, input.transcript_path); appendEnvVar(PLUGIN_DATA_ENV, process.env[PLUGIN_DATA_ENV]); + + // Best-effort self-heal of prior-session orphans; never block startup. + const cwd = input.cwd || process.cwd(); + try { + reapStaleJobs(cwd); + } catch { + // Ignore self-heal failures. + } + try { + reapOrphanBroker(cwd); + } catch { + // Ignore self-heal failures. + } } async function handleSessionEnd(input) { diff --git a/tests/broker-lifecycle.test.mjs b/tests/broker-lifecycle.test.mjs new file mode 100644 index 00000000..5cd77c7a --- /dev/null +++ b/tests/broker-lifecycle.test.mjs @@ -0,0 +1,39 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { waitForBrokerEndpoint } from "../plugins/codex/scripts/lib/broker-lifecycle.mjs"; + +const UNREACHABLE_ENDPOINT = + process.platform === "win32" + ? "pipe:\\\\.\\pipe\\codex-broker-lifecycle-test-does-not-exist" + : "unix:/tmp/codex-broker-lifecycle-test-does-not-exist.sock"; + +test("waitForBrokerEndpoint returns false for an unreachable endpoint within the timeout", async () => { + const start = Date.now(); + const ready = await waitForBrokerEndpoint(UNREACHABLE_ENDPOINT, 300); + const elapsed = Date.now() - start; + assert.equal(ready, false); + // Should consume roughly the whole window (probe + 50ms backoff loop), not hang. + assert.ok(elapsed >= 250 && elapsed < 2000, `unexpected elapsed ${elapsed}ms`); +}); + +test("waitForBrokerEndpoint bails immediately once the spawned broker has exited", async () => { + // A child whose exitCode is already set means nothing will ever bind the + // endpoint, so the wait must abandon early instead of burning the full timeout. + const exitedChild = { exitCode: 1 }; + const start = Date.now(); + const ready = await waitForBrokerEndpoint(UNREACHABLE_ENDPOINT, 10000, exitedChild); + const elapsed = Date.now() - start; + assert.equal(ready, false); + assert.ok(elapsed < 1000, `expected early bail, took ${elapsed}ms`); +}); + +test("waitForBrokerEndpoint keeps waiting while the spawned broker is still alive", async () => { + // exitCode null => process still booting; the wait should run to the timeout. + const liveChild = { exitCode: null }; + const start = Date.now(); + const ready = await waitForBrokerEndpoint(UNREACHABLE_ENDPOINT, 300, liveChild); + const elapsed = Date.now() - start; + assert.equal(ready, false); + assert.ok(elapsed >= 250, `expected full wait, took ${elapsed}ms`); +});