diff --git a/src/common/reasoning-effort-manager.ts b/src/common/reasoning-effort-manager.ts
new file mode 100644
index 00000000..93fc48ef
--- /dev/null
+++ b/src/common/reasoning-effort-manager.ts
@@ -0,0 +1,232 @@
+import type { ReasoningEffort } from "../settings";
+import type { ToolCall, ToolExecutionResult } from "../tools/executor";
+
+// ── Public types ──────────────────────────────────────────────
+
+/** Tool calls and execution results of one completed assistant turn. */
+export type TurnInput = {
+  /** Raw tool calls from the assistant response (before execution). */
+  toolCalls: ToolCall[];
+  /** Execution results after tool calls completed (same order as toolCalls). */
+  toolExecutions: ToolExecutionResult[];
+};
+
+// ── Internal state ───────────────────────────────────────────
+
+type ManagerState = {
+  currentEffort: ReasoningEffort;
+  consecutiveFailures: number;
+  consecutiveIdenticalCalls: number;
+  lastFingerprint: string | null;
+  turnsAtCurrentEffort: number;
+  cleanTurnStreak: number;
+  downgradeCooldownRemaining: number;
+  downgradeThreshold: number;
+  escalateCooldownRemaining: number;
+};
+
+// ── Constants ─────────────────────────────────────────────────
+
+/** Consecutive failed turns at "high" that trigger escalation to "max". */
+const FAILURE_ESCALATION_THRESHOLD = 2;
+/**
+ * Number of consecutive identical tool calls required to trigger escalation.
+ * Per spec: "≥3 consecutive tool calls with identical (name, arguments) pairs".
+ */
+const REPETITION_ESCALATION_THRESHOLD = 3;
+/** Clean turns at "max" required before the first downgrade to "high". */
+const DEFAULT_DOWNGRADE_THRESHOLD = 5;
+/** Upper bound the downgrade threshold grows to after repeated downgrades. */
+const MAX_DOWNGRADE_THRESHOLD = DEFAULT_DOWNGRADE_THRESHOLD * 4;
+/** Turns after an escalation during which a downgrade is not considered. */
+const DOWNGRADE_COOLDOWN_TURNS = 3;
+/** Turns after a downgrade during which an escalation is not considered. */
+const ESCALATE_COOLDOWN_TURNS = 2;
+
+// ── Helpers ───────────────────────────────────────────────────
+
+/** Builds the state a new or reset manager starts from. */
+function createInitialState(): ManagerState {
+  return {
+    currentEffort: "high",
+    consecutiveFailures: 0,
+    consecutiveIdenticalCalls: 0,
+    lastFingerprint: null,
+    turnsAtCurrentEffort: 0,
+    cleanTurnStreak: 0,
+    downgradeCooldownRemaining: 0,
+    downgradeThreshold: DEFAULT_DOWNGRADE_THRESHOLD,
+    escalateCooldownRemaining: 0,
+  };
+}
+
+/**
+ * Canonicalizes a raw tool-call argument string for fingerprinting.
+ *
+ * Valid JSON is re-serialized, which drops whitespace between tokens but keeps
+ * whitespace inside string values, so `{"cmd":"rm a b"}` and `{"cmd":"rmab"}`
+ * stay distinct. Input that does not parse is only trimmed.
+ */
+function normalizeArguments(rawArguments: string): string {
+  try {
+    return JSON.stringify(JSON.parse(rawArguments));
+  } catch {
+    return rawArguments.trim();
+  }
+}
+
+// ── Manager ───────────────────────────────────────────────────
+
+/**
+ * Tracks tool-call outcomes turn by turn and decides when the reasoning effort
+ * should move between "high" and "max".
+ *
+ * - At "high": escalates after FAILURE_ESCALATION_THRESHOLD consecutive failed
+ *   turns or REPETITION_ESCALATION_THRESHOLD consecutive identical turns.
+ * - Otherwise: downgrades to "high" after `downgradeThreshold` clean turns.
+ * - Each transition starts a cooldown for the opposite transition, and each
+ *   downgrade raises the threshold for the next one, to limit flapping.
+ */
+export class RuntimeReasoningEffortManager {
+  private state: ManagerState = createInitialState();
+
+  /**
+   * Builds a comparable fingerprint from the (name, arguments) pairs of a turn.
+   * Call ids are ignored; argument formatting differences are normalized away.
+   */
+  static computeFingerprint(toolCalls: ToolCall[]): string {
+    const normalized = toolCalls.map((tc) => ({
+      name: tc.function.name,
+      args: normalizeArguments(tc.function.arguments),
+    }));
+    return JSON.stringify(normalized);
+  }
+
+  /**
+   * Records one completed turn and reports any effort change it triggers.
+   *
+   * A turn counts as OK only when it has at least one execution and every
+   * execution succeeded, so a turn without executions is treated as a failure.
+   *
+   * @param input Tool calls and their execution results for the turn.
+   * @returns The new effort if this turn caused a transition, otherwise `null`.
+   */
+  evaluate(input: TurnInput): ReasoningEffort | null {
+    const fingerprint = RuntimeReasoningEffortManager.computeFingerprint(input.toolCalls);
+    const allOk = input.toolExecutions.length > 0 && input.toolExecutions.every((e) => e.ok);
+
+    this.state.turnsAtCurrentEffort += 1;
+
+    const result =
+      this.state.currentEffort === "high"
+        ? this.evaluateEscalation(fingerprint, allOk)
+        : this.evaluateDowngrade(allOk, fingerprint);
+
+    // Only decrement cooldowns when no state change occurred.
+    // If escalate()/downgrade() just fired, the new cooldown was set
+    // and should NOT be decremented in the same turn.
+    if (result === null) {
+      this.state.downgradeCooldownRemaining = Math.max(0, this.state.downgradeCooldownRemaining - 1);
+      this.state.escalateCooldownRemaining = Math.max(0, this.state.escalateCooldownRemaining - 1);
+    }
+
+    return result;
+  }
+
+  /** Returns the effort level the manager is currently at. */
+  getCurrentEffort(): ReasoningEffort {
+    return this.state.currentEffort;
+  }
+
+  /** Restores the initial state: "high" effort, zeroed counters, default threshold. */
+  reset(): void {
+    this.state = createInitialState();
+  }
+
+  /** Returns a shallow copy of the internal state; changing it does not affect the manager. */
+  getState(): Readonly<ManagerState> {
+    return { ...this.state };
+  }
+
+  // ── Private helpers ─────────────────────────────────────────
+
+  /** Handles a turn at "high"; returns "max" when an escalation rule fires. */
+  private evaluateEscalation(fingerprint: string, allOk: boolean): ReasoningEffort | null {
+    // While the post-downgrade cooldown runs, failures and repeats are not counted.
+    if (this.state.escalateCooldownRemaining > 0) {
+      return null;
+    }
+
+    if (!allOk) {
+      this.state.consecutiveFailures += 1;
+      // A failure breaks the "identical success" streak.
+      this.state.consecutiveIdenticalCalls = 0;
+      if (this.state.consecutiveFailures >= FAILURE_ESCALATION_THRESHOLD) {
+        return this.escalate();
+      }
+    } else {
+      this.state.consecutiveFailures = 0;
+    }
+
+    // lastFingerprint starts as null, so the very first turn never counts as a repeat.
+    if (fingerprint === this.state.lastFingerprint) {
+      this.state.consecutiveIdenticalCalls += 1;
+      if (this.state.consecutiveIdenticalCalls >= REPETITION_ESCALATION_THRESHOLD) {
+        return this.escalate();
+      }
+    } else {
+      // First occurrence of this fingerprint — start the streak at 1.
+      // (Per spec: escalation triggers on ≥3 identical calls; the 3rd triggers.)
+      this.state.consecutiveIdenticalCalls = 1;
+    }
+
+    this.state.lastFingerprint = fingerprint;
+    return null;
+  }
+
+  /** Handles a turn above "high"; returns "high" once enough clean turns accumulate. */
+  private evaluateDowngrade(allOk: boolean, fingerprint: string): ReasoningEffort | null {
+    if (this.state.downgradeCooldownRemaining > 0) {
+      this.state.lastFingerprint = fingerprint;
+      return null;
+    }
+
+    if (allOk && fingerprint !== this.state.lastFingerprint) {
+      // Clean turn: fully successful and not a repeat of the previous turn.
+      this.state.cleanTurnStreak += 1;
+      if (this.state.cleanTurnStreak >= this.state.downgradeThreshold) {
+        return this.downgrade();
+      }
+    } else if (!allOk) {
+      this.state.cleanTurnStreak = 0;
+    }
+    // A successful repeat neither extends nor resets the streak.
+
+    this.state.lastFingerprint = fingerprint;
+    return null;
+  }
+
+  /** Switches to "max", clears the counters, and starts the downgrade cooldown. */
+  private escalate(): ReasoningEffort {
+    this.state.currentEffort = "max";
+    this.state.consecutiveFailures = 0;
+    this.state.consecutiveIdenticalCalls = 0;
+    this.state.cleanTurnStreak = 0;
+    this.state.downgradeCooldownRemaining = DOWNGRADE_COOLDOWN_TURNS;
+    this.state.turnsAtCurrentEffort = 0;
+    return "max";
+  }
+
+  /** Switches to "high", starts the escalation cooldown, and raises the next downgrade threshold. */
+  private downgrade(): ReasoningEffort {
+    this.state.currentEffort = "high";
+    this.state.cleanTurnStreak = 0;
+    this.state.escalateCooldownRemaining = ESCALATE_COOLDOWN_TURNS;
+    this.state.consecutiveFailures = 0;
+    this.state.consecutiveIdenticalCalls = 0;
+    this.state.turnsAtCurrentEffort = 0;
+    // Anti-flapping: double the threshold on each downgrade (5, 10, 20), capped at the maximum.
+    this.state.downgradeThreshold = Math.min(this.state.downgradeThreshold * 2, MAX_DOWNGRADE_THRESHOLD);
+    return "high";
+  }
+}
diff --git a/src/session.ts b/src/session.ts index 9432a74d..d9266fb0 100644 --- a/src/session.ts +++ b/src/session.ts @@ -24,11 +24,13 @@ import { type CreateOpenAIClient, type ProcessTimeoutControl, type ProcessTimeoutInfo, + type ToolCall, type ToolCallExecution, type ToolExecutionHooks, } from "./tools/executor"; import { McpManager } from "./mcp/mcp-manager"; -import type { McpServerConfig, PermissionScope, PermissionSettings } from "./settings"; +import {
RuntimeReasoningEffortManager } from "./common/reasoning-effort-manager"; +import type { McpServerConfig, PermissionScope, PermissionSettings, ReasoningEffort } from "./settings"; import { logApiError } from "./common/error-logger"; import { logOpenAIChatCompletionDebug, normalizeDebugError } from "./common/debug-logger"; import { killProcessTree } from "./common/process-tree"; @@ -171,14 +173,6 @@ function accumulateUsagePerModel( return usagePerModel; } -function getTotalTokens(usage: ModelUsage | null | undefined): number { - if (!isUsageRecord(usage)) { - return 0; - } - const totalTokens = usage.total_tokens; - return typeof totalTokens === "number" ? totalTokens : 0; -} - export type SessionStatus = | "failed" | "pending" @@ -339,6 +333,7 @@ export class SessionManager { private readonly mcpManager = new McpManager(); private mcpToolDefinitions: ToolDefinition[] = []; private readonly messageConverter: OpenAIMessageConverter; + private static systemPromptCache = new Map(); constructor(options: SessionManagerOptions) { this.projectRoot = options.projectRoot; @@ -414,6 +409,18 @@ export class SessionManager { return tokens; } + private estimateContextTokens(messages: SessionMessage[]): number { + let total = 0; + for (const msg of messages) { + if (msg.compacted) continue; + total += msg.content?.length ?? 
0; + if (msg.messageParams) { + total += JSON.stringify(msg.messageParams).length; + } + } + return Math.ceil(total / 4); + } + private formatEstimatedTokens(tokens: number): string { if (tokens <= 0) { return "0"; @@ -1072,7 +1079,12 @@ ${agentInstructions} } const promptToolOptions = this.getPromptToolOptions(); - const systemPrompt = getSystemPrompt(this.projectRoot, promptToolOptions); + const cacheKey = `${promptToolOptions.model}`; + let systemPrompt = SessionManager.systemPromptCache.get(cacheKey); + if (!systemPrompt) { + systemPrompt = getSystemPrompt(this.projectRoot, promptToolOptions); + SessionManager.systemPromptCache.set(cacheKey, systemPrompt); + } const systemMessage = this.buildSystemMessage(sessionId, systemPrompt); this.appendSessionMessage(sessionId, systemMessage); @@ -1220,6 +1232,8 @@ ${agentInstructions} const startedAt = Date.now(); const { client, model, baseURL, temperature, thinkingEnabled, reasoningEffort, debugLogEnabled, notify, env } = this.createOpenAIClient(); + const effortManager = new RuntimeReasoningEffortManager(); + let currentReasoningEffort: ReasoningEffort = reasoningEffort ?? "high"; const now = new Date().toISOString(); rebuildSessionStateFromHistory(sessionId, this.listSessionMessages(sessionId)); @@ -1265,6 +1279,7 @@ ${agentInstructions} try { const maxIterations = 80000; // about 1K RMB cost let toolCalls: unknown[] | null = null; + const cachedTools = getTools(this.getPromptToolOptions(), this.mcpToolDefinitions); for (let iteration = 0; iteration < maxIterations; iteration++) { if (this.isInterrupted(sessionId)) { @@ -1318,14 +1333,14 @@ ${agentInstructions} thinkingEnabled, model ); - const thinkingOptions = buildThinkingRequestOptions(thinkingEnabled, baseURL, reasoningEffort); + const thinkingOptions = buildThinkingRequestOptions(thinkingEnabled, baseURL, currentReasoningEffort); const response = await this.createChatCompletionStream( client, { model, ...(temperature !== undefined ? 
{ temperature } : {}), messages, - tools: getTools(this.getPromptToolOptions(), this.mcpToolDefinitions), + tools: cachedTools, ...thinkingOptions, }, { signal: sessionController.signal }, @@ -1383,7 +1398,7 @@ ${agentInstructions} toolCalls, usage: accumulateUsage(entry.usage, responseUsage), usagePerModel: accumulateUsagePerModel(entry.usagePerModel, model, responseUsage), - activeTokens: getTotalTokens(responseUsage), + activeTokens: this.estimateContextTokens(this.listSessionMessages(sessionId)), status: "ask_permission", failReason: null, askPermissions: permissionPlan.askPermissions, @@ -1395,6 +1410,17 @@ ${agentInstructions} messagePermissions: permissionPlan?.permissions, }); waitingForUser = toolAppendResult.waitingForUser; + + if (toolCalls && toolCalls.length > 0 && toolAppendResult.executions.length > 0) { + const turnInput = { + toolCalls: toolCalls as ToolCall[], + toolExecutions: toolAppendResult.executions.map((e) => e.result), + }; + const nextEffort = effortManager.evaluate(turnInput); + if (nextEffort !== null && nextEffort !== currentReasoningEffort) { + currentReasoningEffort = nextEffort; + } + } } if (this.isInterrupted(sessionId)) { @@ -1409,7 +1435,7 @@ ${agentInstructions} toolCalls, usage: accumulateUsage(entry.usage, responseUsage), usagePerModel: accumulateUsagePerModel(entry.usagePerModel, model, responseUsage), - activeTokens: getTotalTokens(responseUsage), + activeTokens: this.estimateContextTokens(this.listSessionMessages(sessionId)), status: refusal ? "failed" : waitingForUser ? "waiting_for_user" : toolCalls ? "processing" : "completed", failReason: refusal ? 
refusal : entry.failReason, askPermissions: undefined, @@ -1417,14 +1443,17 @@ ${agentInstructions} })); if (refusal) { + effortManager.reset(); return; } if (waitingForUser) { + effortManager.reset(); return; } if (!toolCalls) { + effortManager.reset(); return; } } @@ -1514,7 +1543,16 @@ ${agentInstructions} this.throwIfAborted(signal); const rawLlmResponse = response.choices?.[0]?.message?.content; const llmResponse = typeof rawLlmResponse === "string" ? rawLlmResponse : ""; - const compactedSummary = llmResponse.replace(/[\s\S]*?<\/analysis>/gi, "").trim(); + let compactedSummary: string; + try { + const parsed = JSON.parse(llmResponse); + compactedSummary = + typeof parsed.summary === "string" && parsed.summary.trim() + ? `Summary: ${parsed.summary.trim()}\nKey files: ${Array.isArray(parsed.keyFiles) ? parsed.keyFiles.join(", ") : "none"}\nPending: ${Array.isArray(parsed.pendingActions) ? parsed.pendingActions.join("; ") : "none"}` + : llmResponse.trim(); + } catch { + compactedSummary = llmResponse.trim(); + } const now = new Date().toISOString(); const responseUsage = response.usage ?? 
null; @@ -1522,7 +1560,7 @@ ${agentInstructions} ...entry, usage: accumulateUsage(entry.usage, responseUsage), usagePerModel: accumulateUsagePerModel(entry.usagePerModel, model, responseUsage), - activeTokens: getTotalTokens(responseUsage), + activeTokens: this.estimateContextTokens(sessionMessages), updateTime: now, })); @@ -2233,7 +2271,7 @@ ${agentInstructions} permissionOverrides?: UserToolPermission[]; messagePermissions?: MessageToolPermission[]; } = {} - ): Promise<{ waitingForUser: boolean }> { + ): Promise<{ waitingForUser: boolean; executions: ToolCallExecution[] }> { const hooks: ToolExecutionHooks = { onProcessStart: (pid, command) => this.addSessionProcess(sessionId, pid, command), onProcessExit: (pid) => this.removeSessionProcess(sessionId, pid), @@ -2261,7 +2299,7 @@ ${agentInstructions} toolExecutions.push(...executions); } if (this.isInterrupted(sessionId)) { - return { waitingForUser: false }; + return { waitingForUser: false, executions: toolExecutions }; } let waitingForUser = false; const followUpMessages: SessionMessage[] = []; @@ -2287,7 +2325,7 @@ ${agentInstructions} for (const followUpMessage of followUpMessages) { this.appendSessionMessage(sessionId, followUpMessage); } - return { waitingForUser }; + return { waitingForUser, executions: toolExecutions }; } private cloneUserPromptForMeta(prompt: UserPromptContent): UserPromptContent {
diff --git a/src/tests/reasoning-effort-manager.test.ts b/src/tests/reasoning-effort-manager.test.ts
new file mode 100644
index 00000000..ac7cb37f
--- /dev/null
+++ b/src/tests/reasoning-effort-manager.test.ts
@@ -0,0 +1,146 @@
+import { describe, test } from "node:test";
+import assert from "node:assert/strict";
+import { RuntimeReasoningEffortManager, type TurnInput } from "../common/reasoning-effort-manager";
+
+type Execution = TurnInput["toolExecutions"][number];
+type Call = TurnInput["toolCalls"][number];
+
+/** Builds an execution result for a tool that either succeeded or failed. */
+function mkExec(ok: boolean, name = "bash"): Execution {
+  if (ok) {
+    return { ok, name, output: "success", error: undefined };
+  }
+  return { ok, name, output: undefined, error: "fail" };
+}
+
+/** Builds a function tool call with a fixed id. */
+function mkCall(name: string, args: string): Call {
+  return { id: "call-1", type: "function", function: { name, arguments: args } };
+}
+
+/** Builds a single-call turn whose only execution has the given outcome. */
+function turn(ok: boolean, name: string, args: string): TurnInput {
+  return { toolCalls: [mkCall(name, args)], toolExecutions: [mkExec(ok)] };
+}
+
+/** Feeds `count` successful turns whose arguments come from `argsFor(i)`. */
+function runOkTurns(
+  manager: RuntimeReasoningEffortManager,
+  name: string,
+  count: number,
+  argsFor: (i: number) => string
+): void {
+  for (let i = 0; i < count; i++) {
+    manager.evaluate(turn(true, name, argsFor(i)));
+  }
+}
+
+/** Drives the manager to "max" with two failing `bash {}` turns and asserts it got there. */
+function escalateViaFailures(manager: RuntimeReasoningEffortManager): void {
+  manager.evaluate(turn(false, "bash", "{}"));
+  manager.evaluate(turn(false, "bash", "{}"));
+  assert.equal(manager.getCurrentEffort(), "max");
+}
+
+describe("RuntimeReasoningEffortManager", () => {
+  test("starts at high", () => {
+    const m = new RuntimeReasoningEffortManager();
+    assert.equal(m.getCurrentEffort(), "high");
+  });
+
+  test("escalates on 2 consecutive failures", () => {
+    const m = new RuntimeReasoningEffortManager();
+    const first = m.evaluate(turn(false, "bash", '{"cmd":"x"}'));
+    assert.equal(first, null);
+    assert.equal(m.getCurrentEffort(), "high");
+    const second = m.evaluate(turn(false, "bash", '{"cmd":"y"}'));
+    assert.equal(second, "max");
+    assert.equal(m.getCurrentEffort(), "max");
+  });
+
+  test("resets failure counter on success", () => {
+    const m = new RuntimeReasoningEffortManager();
+    for (const ok of [false, true, false]) {
+      m.evaluate(turn(ok, "bash", "{}"));
+    }
+    assert.equal(m.getCurrentEffort(), "high");
+  });
+
+  test("escalates on 3 identical tool calls", () => {
+    const m = new RuntimeReasoningEffortManager();
+    const outcomes = [1, 2, 3].map(() => m.evaluate(turn(true, "read", '{"file_path":"/x"}')));
+    assert.deepEqual(outcomes, [null, null, "max"]);
+  });
+
+  test("downgrades after 5 clean turns (default threshold)", () => {
+    const m = new RuntimeReasoningEffortManager();
+    escalateViaFailures(m);
+
+    // Cooldown: first 3 turns at "max" cannot downgrade
+    for (let i = 0; i < 3; i++) {
+      assert.equal(m.evaluate(turn(true, "bash", `{"cmd":"cooldown${i}"}`)), null);
+    }
+    assert.equal(m.getCurrentEffort(), "max");
+
+    // Now 5 clean turns with different fingerprints
+    runOkTurns(m, "bash", 5, (i) => `{"cmd":"unique${i}"}`);
+    assert.equal(m.getCurrentEffort(), "high");
+  });
+
+  test("fingerprint is independent of argument whitespace", () => {
+    const fingerprintOf = (id: string, args: string): string =>
+      RuntimeReasoningEffortManager.computeFingerprint([
+        { id, type: "function", function: { name: "bash", arguments: args } },
+      ]);
+    assert.equal(fingerprintOf("a", '{"cmd": "x"}'), fingerprintOf("b", '{"cmd":"x"}'));
+  });
+
+  test("reset clears all state", () => {
+    const m = new RuntimeReasoningEffortManager();
+    escalateViaFailures(m);
+    m.reset();
+    assert.equal(m.getCurrentEffort(), "high");
+    const state = m.getState();
+    assert.equal(state.consecutiveFailures, 0);
+    assert.equal(state.cleanTurnStreak, 0);
+  });
+
+  test("cooldown prevents immediate re-escalation after downgrade", () => {
+    const m = new RuntimeReasoningEffortManager();
+    escalateViaFailures(m);
+    // Downgrade via 8 clean turns (3 cooldown + 5 threshold)
+    runOkTurns(m, "bash", 3, (i) => `{"c":"a${i}"}`);
+    runOkTurns(m, "bash", 5, (i) => `{"c":"b${i}"}`);
+    assert.equal(m.getCurrentEffort(), "high");
+    // Immediate failure should NOT re-escalate (cooldown active)
+    assert.equal(m.evaluate(turn(false, "bash", "{}")), null);
+    assert.equal(m.getCurrentEffort(), "high");
+  });
+
+  test("anti-flapping doubles downgrade threshold on repeated cycles", () => {
+    const m = new RuntimeReasoningEffortManager();
+    // First cycle: escalate, then downgrade with 3 cooldown + 5 clean turns
+    escalateViaFailures(m);
+    runOkTurns(m, "b", 3, (i) => `${i}`);
+    runOkTurns(m, "b", 5, (i) => `d${i}`);
+    assert.equal(m.getCurrentEffort(), "high");
+    // Second cycle: escalate again (cooldown absorbs first 2 failures)
+    for (let i = 0; i < 4; i++) {
+      m.evaluate(turn(false, "b", `e${i}`));
+    }
+    assert.equal(m.getCurrentEffort(), "max");
+    // Downgrade: 3 cooldown + now 10 clean turns needed (threshold doubled)
+    runOkTurns(m, "b", 3, (i) => `f${i}`);
+    runOkTurns(m, "b", 9, (i) => `g${i}`);
+    assert.equal(m.getCurrentEffort(), "max"); // still max, threshold not met
+    m.evaluate(turn(true, "b", "g9")); // 10th clean
+    assert.equal(m.getCurrentEffort(), "high"); // now downgraded
+  });
+
+  test("no escalation on first turn (empty executions)", () => {
+    const m = new RuntimeReasoningEffortManager();
+    const outcome = m.evaluate({ toolCalls: [mkCall("bash", "{}")], toolExecutions: [] });
+    assert.equal(outcome, null);
+    assert.equal(m.getCurrentEffort(), "high");
+  });
+});