From dc09fb663ca8c0fa8069a0d71bb819e29f8d0905 Mon Sep 17 00:00:00 2001 From: Ragnar Rova Date: Mon, 15 Jun 2026 19:49:10 +0200 Subject: [PATCH] feat: add resumable review threads to /codex:review and /codex:adversarial-review Add opt-in `--resume` / `--fresh` / `--within-hours` flags so an iterative review -> fix -> re-review loop can reuse a Codex review thread instead of re-exploring the repo from a cold start every time, which sharply cuts token burn. Reuse is scoped to the git worktree and bounded by recency. - `--resume` reuses the worktree's recent review thread if one exists and otherwise starts a fresh resumable thread; `--fresh` forces a new thread; `--within-hours ` sets the reuse window (default 3h). - Native one-shot `review/start` cannot carry context between runs, so the resumable path uses a turn-based reviewer. Only persisted session-mode review jobs are eligible for resume, scoped to the same review kind (`/codex:review` vs `/codex:adversarial-review`, which share jobClass but differ by kind); native one-shot review threads are ephemeral and never selected. The newest resumable review run is reused only if it completed -- a cancelled, failed, or in-progress run (including a concurrent run still starting, before it records a thread id) is never reused, even if an older run on the same thread completed. If a persisted thread was pruned/expired the run falls back to a fresh thread instead of failing, and the reuse- continuation note is injected only when the resume actually succeeds. - The resumable plain-review prompt (prompts/review.md) is adapted from Codex's native reviewer system prompt (codex-rs/core/review_prompt.md) so that `/codex:review --resume` reviews like native `/codex:review`; only the output section is remapped to this plugin's review-output schema. - Default (no-flag) `/codex:review` and `/codex:adversarial-review` behavior is byte-identical to before -- the change is purely additive. Tests cover the new flags in tests/commands.test.mjs and tests/runtime.test.mjs (reuse, recency fallback, native-thread exclusion, kind scoping, non-completed/ newest-not-completed/still-starting exclusion, and pruned-thread fallback). Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/commands/adversarial-review.md | 9 +- plugins/codex/commands/review.md | 10 +- plugins/codex/prompts/review.md | 75 ++++++ plugins/codex/scripts/codex-companion.mjs | 132 ++++++++-- plugins/codex/scripts/lib/codex.mjs | 58 ++++- tests/commands.test.mjs | 10 +- tests/runtime.test.mjs | 248 +++++++++++++++++++ 7 files changed, 512 insertions(+), 30 deletions(-) create mode 100644 plugins/codex/prompts/review.md diff --git a/plugins/codex/commands/adversarial-review.md b/plugins/codex/commands/adversarial-review.md index da440ab4..47aedc0e 100644 --- a/plugins/codex/commands/adversarial-review.md +++ b/plugins/codex/commands/adversarial-review.md @@ -1,6 +1,6 @@ --- description: Run a Codex review that challenges the implementation approach and design choices -argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [focus ...]' +argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [--resume|--fresh] [--within-hours ] [focus ...]' disable-model-invocation: true allowed-tools: Read, Glob, Grep, Bash(node:*), Bash(git:*), AskUserQuestion --- @@ -44,6 +44,13 @@ Argument handling: - It does not support `--scope staged` or `--scope unstaged`. - Unlike `/codex:review`, it can still take extra focus text after the flags. +Context reuse (lower token burn): +- `--resume` reuses-or-creates: it reuses this git worktree's recent review thread if one exists, and otherwise starts a new resumable thread. It is safe to use on every review; you never need to seed it with `--fresh` first, and it never fails just because no prior thread exists. +- Reusing the prior thread keeps the exploration Codex already did instead of re-reading the codebase each pass, lowering token/quota burn on iterative review loops. +- Reuse is scoped to the current git worktree and only happens when the prior review thread was used within the last few hours (default 3, override with `--within-hours `); otherwise a fresh thread is started automatically. +- `--fresh` forces a new review thread for this worktree even when a recent one exists — use it only to deliberately discard the previous review context. +- Pass `--resume`/`--fresh` through verbatim; do not treat them as focus text. + Foreground flow: - Run: ```bash diff --git a/plugins/codex/commands/review.md b/plugins/codex/commands/review.md index fb70a487..1b1f4f87 100644 --- a/plugins/codex/commands/review.md +++ b/plugins/codex/commands/review.md @@ -1,6 +1,6 @@ --- description: Run a Codex code review against local git state -argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch]' +argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [--resume|--fresh] [--within-hours ]' disable-model-invocation: true allowed-tools: Read, Glob, Grep, Bash(node:*), Bash(git:*), AskUserQuestion --- @@ -39,6 +39,14 @@ Argument handling: - `/codex:review` is native-review only. It does not support staged-only review, unstaged-only review, or extra focus text. - If the user needs custom review instructions or more adversarial framing, they should use `/codex:adversarial-review`. +Context reuse (lower token burn): +- `--resume` reuses-or-creates: it reuses this git worktree's recent review thread if one exists, and otherwise starts a new resumable thread. It is safe to use on every review; you never need to seed it with `--fresh` first, and it never fails just because no prior thread exists. +- Reusing the prior thread lets Codex keep the exploration it already did instead of re-reading the codebase from scratch, which sharply cuts token/quota burn during a review → fix → re-review loop. +- Reuse is scoped to the current git worktree and only happens when the prior review thread was used within the last few hours (default 3, override with `--within-hours `); otherwise a fresh thread is started automatically. +- `--fresh` forces a brand-new review thread for this worktree even when a recent one exists — use it only to deliberately discard the previous review context (a new task, or a stale thread). +- `--resume`/`--fresh` run a turn-based reviewer (so the thread can be resumed) instead of Codex's one-shot native review mode, which cannot carry context between runs. Plain `/codex:review` with no flag is still the best choice for a single cold pass. +- Pass `--resume`/`--fresh` through verbatim; the companion script handles them. + Foreground flow: - Run: ```bash diff --git a/plugins/codex/prompts/review.md b/plugins/codex/prompts/review.md new file mode 100644 index 00000000..d8b86946 --- /dev/null +++ b/plugins/codex/prompts/review.md @@ -0,0 +1,75 @@ + + +# Review guidelines + +You are acting as a reviewer for a proposed code change made by another engineer. + +Below are default guidelines for determining whether the original author would appreciate an issue being flagged. They are not the final word: more specific guidelines you encounter (in the repository context below, a developer message, or a user message) override these. + +Flag something as a bug only when: +1. It meaningfully impacts the accuracy, performance, security, or maintainability of the code. +2. The bug is discrete and actionable (not a general issue with the codebase, and not a combination of multiple issues). +3. Fixing it does not demand a level of rigor absent from the rest of the codebase (e.g. one-off scripts do not need detailed comments and input validation). +4. The bug was introduced by the change under review — do not flag pre-existing bugs. +5. The original author would likely fix it if they were made aware of it. +6. It does not rely on unstated assumptions about the codebase or the author's intent. +7. You can identify the specific other code that is provably affected — it is not enough to speculate that a change may disrupt another part of the codebase. +8. It is clearly not an intentional change by the original author. + +When you flag a bug, the accompanying explanation should: +1. Be clear about why the issue is a bug. +2. Communicate severity accurately — do not claim an issue is more severe than it is. +3. Be brief: at most one paragraph, with no line breaks in the prose unless needed for a code fragment. +4. Avoid code chunks longer than 3 lines; wrap any code in inline code tags or a short code block. +5. Explicitly state the scenarios, environments, or inputs required for the bug to arise, and make clear that severity depends on those factors. +6. Be matter-of-fact, not accusatory or flattering (avoid "Great job…", "Thanks for…"). +7. Let the author grasp the issue without close reading. + +HOW MANY FINDINGS TO RETURN: + +Output every finding the original author would fix if they knew about it. If there is no finding a person would clearly want fixed, prefer returning none. Do not stop at the first qualifying finding; continue until you have listed every qualifying one. + +GUIDELINES: + +- Ignore trivial style unless it obscures meaning or violates a documented standard. +- Use one finding per distinct issue. +- Keep the line range as short as possible — avoid ranges longer than 5–10 lines; pick the subrange that pinpoints the problem. The location should overlap the change under review. +- Do not generate a full patch; describe the fix rather than writing the replacement code. + +PRIORITY: + +Assess each finding's priority and map it to `severity`: +- P0 → `critical`: drop everything; a universal issue that does not depend on assumptions about the inputs. +- P1 → `high`: urgent; should be addressed in the next cycle. +- P2 → `medium`: normal; to be fixed eventually. +- P3 → `low`: nice to have. + +OVERALL CORRECTNESS: + +Decide whether the change is correct — existing code and tests will not break and it is free of blocking bugs. Ignore non-blocking issues (style, formatting, typos, documentation, nits) for this verdict. Map a correct patch to `approve` and an incorrect one to `needs-attention`. + +## What to review + +Target: {{TARGET_LABEL}} + +{{REVIEW_COLLECTION_GUIDANCE}} + +## Output format + +Return only valid JSON matching the provided schema — no markdown fences and no extra prose. For each finding provide: +- `title`: ≤ 80 chars, imperative. +- `body`: one paragraph of Markdown explaining why it is a bug, citing the affected file/lines/function. +- `severity`: mapped from the priority above. +- `confidence`: a float from 0 to 1. +- `file`, `line_start`, `line_end`: the affected location, kept as short as possible and overlapping the change. +- `recommendation`: describe the fix concisely; do not provide a code patch. + +Set `verdict` from the overall-correctness decision, write `summary` as a terse 1–3 sentence justification of that verdict, and use `next_steps` for any follow-ups. + +## Repository context + +{{REVIEW_INPUT}} diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 35222fd5..c0c3603a 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -8,6 +8,7 @@ import { fileURLToPath } from "node:url"; import { parseArgs, splitRawArgumentString } from "./lib/args.mjs"; import { + buildPersistentReviewThreadName, buildPersistentTaskThreadName, DEFAULT_CONTINUE_PROMPT, findLatestTaskThread, @@ -69,14 +70,22 @@ const DEFAULT_STATUS_POLL_INTERVAL_MS = 2000; const VALID_REASONING_EFFORTS = new Set(["none", "minimal", "low", "medium", "high", "xhigh"]); const MODEL_ALIASES = new Map([["spark", "gpt-5.3-codex-spark"]]); const STOP_REVIEW_TASK_MARKER = "Run a stop-gate review of the previous Claude turn."; +const DEFAULT_REVIEW_REUSE_WINDOW_HOURS = 3; +const REVIEW_RESUME_CONTINUATION_NOTE = [ + "", + "You already reviewed this worktree earlier in this same thread.", + "Reuse what you already learned about the codebase; do not re-read files you have already seen unless they changed.", + "The working tree may have moved on since then. Concentrate on what changed, and re-check whether your earlier findings are now resolved or still open.", + "" +].join("\n"); function printUsage() { console.log( [ "Usage:", " node scripts/codex-companion.mjs setup [--enable-review-gate|--disable-review-gate] [--json]", - " node scripts/codex-companion.mjs review [--wait|--background] [--base ] [--scope ]", - " node scripts/codex-companion.mjs adversarial-review [--wait|--background] [--base ] [--scope ] [focus text]", + " node scripts/codex-companion.mjs review [--wait|--background] [--base ] [--scope ] [--resume|--fresh] [--within-hours ]", + " node scripts/codex-companion.mjs adversarial-review [--wait|--background] [--base ] [--scope ] [--resume|--fresh] [--within-hours ] [focus text]", " node scripts/codex-companion.mjs task [--background] [--write] [--resume-last|--resume|--fresh] [--model ] [--effort ] [prompt]", " node scripts/codex-companion.mjs status [job-id] [--all] [--json]", " node scripts/codex-companion.mjs result [job-id] [--json]", @@ -235,15 +244,19 @@ async function handleSetup(argv) { outputResult(options.json ? finalReport : renderSetupReport(finalReport), options.json); } -function buildAdversarialReviewPrompt(context, focusText) { - const template = loadPromptTemplate(ROOT_DIR, "adversarial-review"); - return interpolateTemplate(template, { - REVIEW_KIND: "Adversarial Review", +function buildReviewTurnPrompt(context, { reviewName, focusText = "", resuming = false } = {}) { + const templateName = reviewName === "Adversarial Review" ? "adversarial-review" : "review"; + const template = loadPromptTemplate(ROOT_DIR, templateName); + const body = interpolateTemplate(template, { + REVIEW_KIND: reviewName, TARGET_LABEL: context.target.label, USER_FOCUS: focusText || "No extra focus provided.", REVIEW_COLLECTION_GUIDANCE: context.collectionGuidance, REVIEW_INPUT: context.content }); + // Inject the reuse note only when actually resuming, so the no-flag path is + // byte-identical to the original one-shot prompt. + return resuming ? `${REVIEW_RESUME_CONTINUATION_NOTE}\n\n${body}` : body; } function ensureCodexAvailable(cwd) { @@ -352,6 +365,54 @@ async function resolveLatestTrackedTaskThread(cwd, options = {}) { return findLatestTaskThread(workspaceRoot); } +function resolveReviewReuseWindowMs(hours) { + const parsed = Number(hours); + if (!Number.isFinite(parsed) || parsed <= 0) { + return DEFAULT_REVIEW_REUSE_WINDOW_HOURS * 60 * 60 * 1000; + } + return parsed * 60 * 60 * 1000; +} + +// Reuse is keyed by the git worktree (the per-worktree state dir already scopes +// jobs) and bounded by recency, so stale exploration is not resumed. +// Only `resumable` jobs are eligible: plain native `/codex:review` runs persist +// a `threadId` too, but those threads are ephemeral and cannot be resumed. +function resolveLatestReviewThread(workspaceRoot, { withinMs = null, excludeJobId = null, kind = null } = {}) { + // Take the newest resumable review run and reuse its thread only if that run + // completed. The newest run is considered even before it records a thread id + // (a concurrent run that has not yet emitted "Thread ready"); otherwise we + // could fall back past it to an older completed run on the same thread and + // let two runs resume the same thread and interleave turns. A cancelled, + // failed, or still-running newest run therefore means: start fresh. + const candidate = sortJobsNewestFirst(listJobs(workspaceRoot)).find( + (job) => + job.id !== excludeJobId && + job.jobClass === "review" && + // Scope to the same review kind. `/codex:review` and + // `/codex:adversarial-review` share jobClass "review" but differ by + // `kind`; reusing across kinds would inject the other reviewer's prompt + // (e.g. the adversarial framing) into a plain review thread. + (kind === null || job.kind === kind) && + job.resumable === true + ); + if (!candidate || candidate.status !== "completed" || !candidate.threadId) { + return null; + } + + if (Number.isFinite(withinMs)) { + const lastUsed = Date.parse(candidate.completedAt ?? candidate.createdAt ?? ""); + if (!Number.isFinite(lastUsed) || Date.now() - lastUsed > withinMs) { + return null; + } + } + + return { + id: candidate.threadId, + jobId: candidate.id, + lastUsedAt: candidate.completedAt ?? candidate.createdAt ?? null + }; +} + async function executeReviewRun(request) { ensureCodexAvailable(request.cwd); ensureGitRepository(request.cwd); @@ -362,7 +423,8 @@ async function executeReviewRun(request) { }); const focusText = request.focusText?.trim() ?? ""; const reviewName = request.reviewName ?? "Review"; - if (reviewName === "Review") { + const sessionMode = Boolean(request.resume || request.fresh); + if (reviewName === "Review" && !sessionMode) { const reviewTarget = validateNativeReviewRequest(target, focusText); const result = await runAppServerReview(request.cwd, { target: reviewTarget, @@ -404,14 +466,39 @@ async function executeReviewRun(request) { } const context = collectReviewContext(request.cwd, target); - const prompt = buildAdversarialReviewPrompt(context, focusText); + + let resumeThreadId = null; + if (request.resume) { + const prior = resolveLatestReviewThread(resolveWorkspaceRoot(context.repoRoot), { + withinMs: request.reuseWindowMs ?? resolveReviewReuseWindowMs(), + excludeJobId: request.jobId ?? null, + kind: reviewName === "Adversarial Review" ? "adversarial-review" : "review" + }); + if (prior) { + resumeThreadId = prior.id; + } + } + + const persistThread = sessionMode; const result = await runAppServerTurn(context.repoRoot, { - prompt, + // Build the prompt from the ACTUAL outcome: if a pruned/expired thread + // forces a fresh-thread fallback, the continuation note must not claim + // prior context that the fresh thread does not have. + buildPrompt: ({ resumed }) => buildReviewTurnPrompt(context, { reviewName, focusText, resuming: resumed }), model: request.model, sandbox: "read-only", outputSchema: readOutputSchema(REVIEW_SCHEMA), - onProgress: request.onProgress + onProgress: request.onProgress, + resumeThreadId, + // If a persisted review thread was pruned/expired by Codex, don't fail the + // review — fall back to a fresh persisted thread. + resumeFallback: persistThread, + persistThread, + threadName: persistThread ? buildPersistentReviewThreadName(`${reviewName} ${target.label}`) : null }); + // The resume may have fallen back to a fresh thread (pruned/expired); only + // report reuse when the returned thread is actually the one we resumed. + const actuallyResumed = Boolean(resumeThreadId) && result.threadId === resumeThreadId; const parsed = parseStructuredOutput(result.finalMessage, { status: result.status, failureMessage: result.error?.message ?? result.stderr @@ -420,6 +507,9 @@ async function executeReviewRun(request) { review: reviewName, target, threadId: result.threadId, + sessionMode, + reused: actuallyResumed, + resumedThreadId: actuallyResumed ? resumeThreadId : null, context: { repoRoot: context.repoRoot, branch: context.branch, @@ -561,7 +651,7 @@ function getJobKindLabel(kind, jobClass) { return jobClass === "review" ? "review" : "rescue"; } -function createCompanionJob({ prefix, kind, title, workspaceRoot, jobClass, summary, write = false }) { +function createCompanionJob({ prefix, kind, title, workspaceRoot, jobClass, summary, write = false, resumable = false }) { return createJobRecord({ id: generateJobId(prefix), kind, @@ -570,7 +660,8 @@ function createCompanionJob({ prefix, kind, title, workspaceRoot, jobClass, summ workspaceRoot, jobClass, summary, - write + write, + resumable }); } @@ -681,13 +772,19 @@ function enqueueBackgroundTask(cwd, job, request) { async function handleReviewCommand(argv, config) { const { options, positionals } = parseCommandInput(argv, { - valueOptions: ["base", "scope", "model", "cwd"], - booleanOptions: ["json", "background", "wait"], + valueOptions: ["base", "scope", "model", "cwd", "within-hours"], + booleanOptions: ["json", "background", "wait", "resume", "fresh"], aliasMap: { m: "model" } }); + const resume = Boolean(options.resume); + const fresh = Boolean(options.fresh); + if (resume && fresh) { + throw new Error("Choose either --resume or --fresh."); + } + const cwd = resolveCommandCwd(options); const workspaceRoot = resolveCommandWorkspace(options); const focusText = positionals.join(" ").trim(); @@ -704,7 +801,8 @@ async function handleReviewCommand(argv, config) { title: metadata.title, workspaceRoot, jobClass: "review", - summary: metadata.summary + summary: metadata.summary, + resumable: resume || fresh }); await runForegroundCommand( job, @@ -716,6 +814,10 @@ async function handleReviewCommand(argv, config) { model: options.model, focusText, reviewName: config.reviewName, + resume, + fresh, + reuseWindowMs: resolveReviewReuseWindowMs(options["within-hours"]), + jobId: job.id, onProgress: progress }), { json: options.json } diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..daa53458 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -41,6 +41,7 @@ import { binaryAvailable } from "./process.mjs"; const SERVICE_NAME = "claude_code_codex_plugin"; const TASK_THREAD_PREFIX = "Codex Companion Task"; +const REVIEW_THREAD_PREFIX = "Codex Companion Review"; const DEFAULT_CONTINUE_PROMPT = "Continue from the current thread state. Pick the next highest-value step and follow through until the task is resolved."; @@ -103,6 +104,11 @@ function buildTaskThreadName(prompt) { return excerpt ? `${TASK_THREAD_PREFIX}: ${excerpt}` : TASK_THREAD_PREFIX; } +function buildReviewThreadName(label) { + const excerpt = shorten(label, 56); + return excerpt ? `${REVIEW_THREAD_PREFIX}: ${excerpt}` : REVIEW_THREAD_PREFIX; +} + function extractThreadId(message) { return message?.params?.threadId ?? null; } @@ -969,16 +975,9 @@ export async function runAppServerTurn(cwd, options = {}) { return withAppServer(cwd, async (client) => { let threadId; + let resumedThread = false; - if (options.resumeThreadId) { - emitProgress(options.onProgress, `Resuming thread ${options.resumeThreadId}.`, "starting"); - const response = await resumeThread(client, options.resumeThreadId, cwd, { - model: options.model, - sandbox: options.sandbox, - ephemeral: false - }); - threadId = response.thread.id; - } else { + const startFreshThread = async () => { emitProgress(options.onProgress, "Starting Codex task thread.", "starting"); const response = await startThread(client, cwd, { model: options.model, @@ -986,14 +985,45 @@ export async function runAppServerTurn(cwd, options = {}) { ephemeral: options.persistThread ? false : true, threadName: options.persistThread ? options.threadName : options.threadName ?? null }); - threadId = response.thread.id; + return response.thread.id; + }; + + if (options.resumeThreadId) { + emitProgress(options.onProgress, `Resuming thread ${options.resumeThreadId}.`, "starting"); + try { + const response = await resumeThread(client, options.resumeThreadId, cwd, { + model: options.model, + sandbox: options.sandbox, + ephemeral: false + }); + threadId = response.thread.id; + resumedThread = true; + } catch (error) { + // A persisted thread can be pruned/expired by Codex. When the caller + // opts into fallback, start a fresh thread instead of failing the run. + // Resume fails before any turn starts, so no tokens are wasted. + if (!options.resumeFallback) { + throw error; + } + emitProgress( + options.onProgress, + `Could not resume thread ${options.resumeThreadId}; starting a fresh thread.`, + "starting" + ); + threadId = await startFreshThread(); + } + } else { + threadId = await startFreshThread(); } emitProgress(options.onProgress, `Thread ready (${threadId}).`, "starting", { threadId }); - const prompt = options.prompt?.trim() || options.defaultPrompt || ""; + const prompt = + typeof options.buildPrompt === "function" + ? options.buildPrompt({ resumed: resumedThread }) + : options.prompt?.trim() || options.defaultPrompt || ""; if (!prompt) { throw new Error("A prompt is required for this Codex run."); } @@ -1054,6 +1084,10 @@ export function buildPersistentTaskThreadName(prompt) { return buildTaskThreadName(prompt); } +export function buildPersistentReviewThreadName(label) { + return buildReviewThreadName(label); +} + export function parseStructuredOutput(rawOutput, fallback = {}) { if (!rawOutput) { return { @@ -1085,4 +1119,4 @@ export function readOutputSchema(schemaPath) { return readJsonFile(schemaPath); } -export { DEFAULT_CONTINUE_PROMPT, TASK_THREAD_PREFIX }; +export { DEFAULT_CONTINUE_PROMPT, TASK_THREAD_PREFIX, REVIEW_THREAD_PREFIX }; diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs index 3724ffa4..0e0f9296 100644 --- a/tests/commands.test.mjs +++ b/tests/commands.test.mjs @@ -37,6 +37,10 @@ test("review command uses AskUserQuestion and background Bash while staying revi assert.match(source, /When in doubt, run the review/i); assert.match(source, /\(Recommended\)/); assert.match(source, /does not support staged-only review, unstaged-only review, or extra focus text/i); + assert.match(source, /\[--resume\|--fresh\]/); + assert.match(source, /--within-hours/); + assert.match(source, /reuses this git worktree's recent review thread/i); + assert.match(source, /reuses-or-creates/i); }); test("adversarial review command uses AskUserQuestion and background Bash while staying review-only", () => { @@ -49,7 +53,11 @@ test("adversarial review command uses AskUserQuestion and background Bash while assert.match(source, /```bash/); assert.match(source, /```typescript/); assert.match(source, /adversarial-review "\$ARGUMENTS"/); - assert.match(source, /\[--scope auto\|working-tree\|branch\] \[focus \.\.\.\]/); + assert.match(source, /\[--scope auto\|working-tree\|branch\]/); + assert.match(source, /\[focus \.\.\.\]/); + assert.match(source, /\[--resume\|--fresh\]/); + assert.match(source, /--within-hours/); + assert.match(source, /reuses-or-creates/i); assert.match(source, /run_in_background:\s*true/); assert.match(source, /command:\s*`node "\$\{CLAUDE_PLUGIN_ROOT\}\/scripts\/codex-companion\.mjs" adversarial-review "\$ARGUMENTS"`/); assert.match(source, /description:\s*"Codex adversarial review"/); diff --git a/tests/runtime.test.mjs b/tests/runtime.test.mjs index 90408372..1278b4fb 100644 --- a/tests/runtime.test.mjs +++ b/tests/runtime.test.mjs @@ -300,6 +300,254 @@ test("adversarial review asks Codex to inspect larger diffs itself", () => { assert.doesNotMatch(state.lastTurnStart.prompt, /PROMPT_SELF_COLLECT_[ABC]/); }); +test("review --fresh starts a resumable thread and --resume reuses it for the worktree", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + const fresh = run("node", [SCRIPT, "review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(fresh.status, 0, fresh.stderr); + + let state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsAfterFresh = state.threads.length; + assert.equal( + state.threads.some((thread) => !thread.ephemeral && String(thread.name || "").startsWith("Codex Companion Review")), + true, + "fresh review should persist a named, non-ephemeral review thread" + ); + assert.doesNotMatch(state.lastTurnStart.prompt, //); + + const resume = run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(resume.status, 0, resume.stderr); + + state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(state.threads.length, threadsAfterFresh, "resume should reuse the thread, not create a new one"); + assert.match(state.lastTurnStart.prompt, //); +}); + +test("review --resume falls back to a fresh thread when the prior thread is too old", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + const fresh = run("node", [SCRIPT, "review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(fresh.status, 0, fresh.stderr); + let state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsAfterFresh = state.threads.length; + + // A ~36ms recency window is far shorter than the time to spawn the resume + // process, so the prior thread is always "too old" to reuse here. + const resume = run("node", [SCRIPT, "review", "--resume", "--within-hours", "0.00001"], { + cwd: repo, + env: buildEnv(binDir) + }); + assert.equal(resume.status, 0, resume.stderr); + + state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(state.threads.length, threadsAfterFresh + 1, "a stale prior thread should not be reused"); + assert.doesNotMatch(state.lastTurnStart.prompt, //); +}); + +test("review --resume does not reuse a plain native review thread", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + // Plain native review (no flag) persists a threadId but the thread is + // ephemeral and cannot be resumed. + const native = run("node", [SCRIPT, "review"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(native.status, 0, native.stderr); + let state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsAfterNative = state.threads.length; + + const resume = run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(resume.status, 0, resume.stderr); + + state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(state.threads.length, threadsAfterNative + 1, "resume must start a fresh thread, not reuse the native one"); + assert.doesNotMatch(state.lastTurnStart.prompt, //); +}); + +test("review --resume falls back to a fresh thread when the persisted thread is gone", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + const fresh = run("node", [SCRIPT, "review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(fresh.status, 0, fresh.stderr); + + // Simulate Codex pruning/expiring the persisted thread. + const statePath = path.join(binDir, "fake-codex-state.json"); + const state = JSON.parse(fs.readFileSync(statePath, "utf8")); + state.threads = []; + fs.writeFileSync(statePath, JSON.stringify(state, null, 2)); + + // resume should recover (start a fresh thread) rather than fail. + const resume = run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(resume.status, 0, resume.stderr); + + const after = JSON.parse(fs.readFileSync(statePath, "utf8")); + assert.equal(after.threads.length, 1, "fallback should create exactly one fresh thread"); + // The fresh fallback thread has no prior context, so the prompt must NOT + // claim continuation ("reuse what you already saw"). + assert.doesNotMatch(after.lastTurnStart.prompt, //); +}); + +test("review --resume does not reuse a thread whose review did not complete", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + const fresh = run("node", [SCRIPT, "review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(fresh.status, 0, fresh.stderr); + let state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsAfterFresh = state.threads.length; + + // Simulate the resumable review having failed: the thread id was recorded + // mid-run, but the review never completed. + const stateFile = path.join(resolveStateDir(repo), "state.json"); + const companionState = JSON.parse(fs.readFileSync(stateFile, "utf8")); + let mutated = false; + for (const job of companionState.jobs) { + if (job.jobClass === "review" && job.resumable && job.threadId) { + job.status = "failed"; + mutated = true; + } + } + assert.ok(mutated, "expected a resumable review job with a thread id to mutate"); + fs.writeFileSync(stateFile, JSON.stringify(companionState, null, 2)); + + // --resume must not reuse the non-completed thread; it starts a fresh one. + const resume = run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }); + assert.equal(resume.status, 0, resume.stderr); + state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(state.threads.length, threadsAfterFresh + 1, "a non-completed review must not be reused"); + assert.doesNotMatch(state.lastTurnStart.prompt, //); +}); + +test("review --resume starts fresh when the newest run did not complete, even if an older run did", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + // J1 (fresh) completes on a new thread; J2 (resume) completes on the same thread. + assert.equal(run("node", [SCRIPT, "review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + assert.equal(run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + + let codexState = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsBefore = codexState.threads.length; + + // Mark the NEWEST review run (the resume) as failed — it shares the thread id + // with the older completed run, which must NOT be selected as a fallback. + const stateFile = path.join(resolveStateDir(repo), "state.json"); + const companionState = JSON.parse(fs.readFileSync(stateFile, "utf8")); + const reviewJobs = companionState.jobs + .filter((j) => j.jobClass === "review" && j.resumable && j.threadId) + .sort((a, b) => String(b.updatedAt ?? "").localeCompare(String(a.updatedAt ?? ""))); + assert.ok(reviewJobs.length >= 2, "expected two resumable review jobs"); + assert.equal(reviewJobs[0].threadId, reviewJobs[1].threadId, "both runs should share the resumed thread id"); + for (const job of companionState.jobs) { + if (job.id === reviewJobs[0].id) job.status = "failed"; + } + fs.writeFileSync(stateFile, JSON.stringify(companionState, null, 2)); + + // --resume must start a fresh thread, not fall back to the older completed run. + assert.equal(run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + codexState = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(codexState.threads.length, threadsBefore + 1, "must start a fresh thread, not reuse the interrupted one"); + assert.doesNotMatch(codexState.lastTurnStart.prompt, //); +}); + +test("review --resume starts fresh when a newer resumable run is still starting (no thread id yet)", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "README.md"), "hello again\n"); + + // J0: completed resumable review on a thread. + assert.equal(run("node", [SCRIPT, "review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + let codexState = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsBefore = codexState.threads.length; + + // Inject a newer resumable review that is still running with no thread id yet + // (a concurrent --background run that has not emitted "Thread ready"). + const stateFile = path.join(resolveStateDir(repo), "state.json"); + const companionState = JSON.parse(fs.readFileSync(stateFile, "utf8")); + companionState.jobs.unshift({ + id: "review-starting-0001", + kind: "review", + jobClass: "review", + resumable: true, + status: "running", + createdAt: "2099-01-01T00:00:00.000Z", + updatedAt: "2099-01-01T00:00:00.000Z" + }); + fs.writeFileSync(stateFile, JSON.stringify(companionState, null, 2)); + + // --resume must not fall back past the still-starting run to the older thread. + assert.equal(run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + codexState = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(codexState.threads.length, threadsBefore + 1, "must start a fresh thread while a newer run is still starting"); + assert.doesNotMatch(codexState.lastTurnStart.prompt, //); +}); + +test("review --resume does not reuse an adversarial-review thread (kind-scoped)", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.mkdirSync(path.join(repo, "src")); + fs.writeFileSync(path.join(repo, "src", "app.js"), "export const value = items[0];\n"); + run("git", ["add", "src/app.js"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + fs.writeFileSync(path.join(repo, "src", "app.js"), "export const value = items[0].id;\n"); + + // An adversarial review starts a resumable adversarial thread. + assert.equal(run("node", [SCRIPT, "adversarial-review", "--fresh"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + let codexState = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + const threadsBefore = codexState.threads.length; + + // Plain `/codex:review --resume` must not reuse the adversarial thread. + assert.equal(run("node", [SCRIPT, "review", "--resume"], { cwd: repo, env: buildEnv(binDir) }).status, 0); + codexState = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); + assert.equal(codexState.threads.length, threadsBefore + 1, "plain review must not reuse an adversarial-review thread"); + assert.doesNotMatch(codexState.lastTurnStart.prompt, //); +}); + test("review includes reasoning output when the app server returns it", () => { const repo = makeTempDir(); const binDir = makeTempDir();