diff --git a/src/services/api/openAICompatInferenceClient.test.ts b/src/services/api/openAICompatInferenceClient.test.ts index 09df1c7..a391d82 100644 --- a/src/services/api/openAICompatInferenceClient.test.ts +++ b/src/services/api/openAICompatInferenceClient.test.ts @@ -920,6 +920,101 @@ describe('mapOpenAIChatCompletionToAnthropicMessage', () => { expect(message.content).toEqual([{ type: 'text', text: 'final answer' }]) }) + + it('strips malformed tool_use blocks and matching tool_results before sending history', () => { + const request = buildOpenAICompatChatRequest({ + model: GLM_5_2_MODEL, + max_tokens: 64, + messages: [ + { role: 'user', content: 'do something' }, + { + role: 'assistant', + content: [ + { type: 'tool_use', id: 'blank_name', name: '', input: {} }, + { type: 'tool_use', id: 'missing_name', input: {} }, + { type: 'tool_use', id: 'non_string_name', name: 123, input: {} }, + ], + }, + { + role: 'user', + content: [ + { + type: 'tool_result', + tool_use_id: 'blank_name', + content: 'Tool not found', + }, + { + type: 'tool_result', + tool_use_id: 'missing_name', + content: 'Tool not found', + }, + { + type: 'tool_result', + tool_use_id: 'non_string_name', + content: 'Tool not found', + }, + { type: 'text', text: 'continue' }, + ], + }, + { role: 'user', content: 'next question' }, + ], + } as never) + + expect(request.messages).toEqual([ + { role: 'user', content: 'do something' }, + { role: 'user', content: 'continue' }, + { role: 'user', content: 'next question' }, + ]) + }) + + it('preserves valid tool_use blocks alongside malformed blocks', () => { + const request = buildOpenAICompatChatRequest({ + model: GLM_5_2_MODEL, + max_tokens: 64, + messages: [ + { role: 'user', content: 'hello' }, + { + role: 'assistant', + content: [ + { type: 'tool_use', id: 'bad', name: ' ', input: {} }, + { type: 'tool_use', id: 'good', name: 'Bash', input: { cmd: 'ls' } }, + ], + }, + { + role: 'user', + content: [ + { type: 'tool_result', tool_use_id: 'bad', content: 'Tool not found' }, + { type: 'tool_result', tool_use_id: 'good', content: 'file.ts' }, + ], + }, + ], + tools: [ + { + name: 'Bash', + description: 'Run shell commands', + input_schema: { + type: 'object', + properties: { cmd: { type: 'string' } }, + }, + }, + ], + } as never) + + const assistantMsg = request.messages.find(m => m.role === 'assistant') + expect(assistantMsg).toBeDefined() + expect(assistantMsg?.tool_calls).toEqual([ + { + id: 'good', + type: 'function', + function: { name: 'Bash', arguments: '{"cmd":"ls"}' }, + }, + ]) + + const toolMsg = request.messages.find(m => m.role === 'tool') + expect(toolMsg).toBeDefined() + expect(toolMsg?.tool_call_id).toBe('good') + expect(toolMsg?.content).toBe('file.ts') + }) }) describe('OpenAICompatInferenceClient', () => { diff --git a/src/services/api/openAICompatInferenceClient.ts b/src/services/api/openAICompatInferenceClient.ts index 3dbba55..2f9404b 100644 --- a/src/services/api/openAICompatInferenceClient.ts +++ b/src/services/api/openAICompatInferenceClient.ts @@ -681,6 +681,100 @@ function convertMessages( return converted } +function isMalformedToolUseBlock(block: unknown): block is { + id?: unknown + type: 'tool_use' +} { + if (!block || typeof block !== 'object') return false + if (!('type' in block) || (block as { type?: unknown }).type !== 'tool_use') { + return false + } + return !('name' in block) || typeof (block as { name?: unknown }).name !== 'string' || (block as { name: string }).name.trim() === '' +} + +/** + * Strips malformed assistant tool_use blocks before forwarding history to an + * OpenAI-compatible endpoint, and removes matching tool_result blocks. This + * prevents malformed model output (blank/missing/non-string tool names) from + * poisoning all later requests with history that OpenAI-compatible servers + * reject before the model can recover. + */ +function sanitizeMessagesForMalformedToolCalls(messages: unknown): unknown { + if (!Array.isArray(messages)) return messages + + const orphanedIds = new Set() + const result: unknown[] = [] + + for (const message of messages) { + if ( + !message || + typeof message !== 'object' || + !('role' in message) || + !('content' in message) + ) { + result.push(message) + continue + } + + const msg = message as { + role: unknown + content: unknown + [key: string]: unknown + } + + if (msg.role === 'assistant' && Array.isArray(msg.content)) { + const filteredContent = msg.content.filter((block: unknown) => { + if (isMalformedToolUseBlock(block)) { + if ('id' in block && typeof block.id === 'string') { + orphanedIds.add(block.id) + } + return false + } + return true + }) + + if (filteredContent.length === 0) { + continue + } + + result.push({ ...msg, content: filteredContent }) + continue + } + + if ( + msg.role === 'user' && + Array.isArray(msg.content) && + orphanedIds.size > 0 + ) { + const filteredContent = msg.content.filter((block: unknown) => { + if ( + block && + typeof block === 'object' && + 'type' in block && + (block as { type: unknown }).type === 'tool_result' && + 'tool_use_id' in block && + typeof (block as { tool_use_id: unknown }).tool_use_id === 'string' && + orphanedIds.has((block as { tool_use_id: string }).tool_use_id) + ) { + return false + } + return true + }) + + if (filteredContent.length === 0) { + continue + } + + result.push({ ...msg, content: filteredContent }) + continue + } + + result.push(message) + } + + return result +} + function convertTools(tools: unknown): Array> | undefined { if (!Array.isArray(tools) || tools.length === 0) { return undefined @@ -845,7 +939,10 @@ export function buildOpenAICompatChatRequest( : undefined const convertedTools = convertTools(params.tools) const convertedToolChoice = convertToolChoice(params.tool_choice) - const convertedMessages = convertMessages(params.system, params.messages) + const convertedMessages = convertMessages( + params.system, + sanitizeMessagesForMalformedToolCalls(params.messages), + ) const request: OpenAIChatCompletionRequest = { model: normalizeOpenAICompatModelForAPI(params.model, { diff --git a/src/services/tools/schemaConfusionHint.test.ts b/src/services/tools/schemaConfusionHint.test.ts new file mode 100644 index 0000000..cd1fe2b --- /dev/null +++ b/src/services/tools/schemaConfusionHint.test.ts @@ -0,0 +1,314 @@ +import { describe, expect, it } from 'bun:test' +import { buildSchemaConfusionHint } from './schemaConfusionHint.js' + +// Reproduces the 7 schema-confusion shapes confirmed during the +// 2026-06-20 live diagnostic exercise + null cases verifying the hint is +// only emitted when genuinely confused. +// +// Shape reference (from live reproduction): +// 1. Bash({ file_path }) → Read-shaped args on Bash +// 2. Bash({ file_path, limit }) → Read-shaped args on Bash (full) +// 3. Bash({ pattern, path, output_mode }) → Grep-shaped args on Bash +// 4. Bash({ pattern }) → Glob-shaped args on Bash +// 5. Grep({ command, description }) → Bash-shaped args on Grep +// 6. Read({ command, description }) → Bash-shaped args on Read +// 7. Read({ pattern }) → Glob-shaped args on Read +// Plus Grep({ pattern }) called with a glob-looking pattern → Glob +// +// Extension (2026-06-22): WebFetch and Write added to the cluster after +// log analysis showed GLM-5.2 repeatedly emitting WebFetch({file_path}) +// and Write({file_path, limit|offset}) without recovering, because the +// hint never fired for those called tools. The Write/Read `file_path` +// collision is resolved by way of `content` (present → Write, absent → +// possible Read leak) with Read-only optionals (limit/offset/pages) as +// the confirming signal for Read intent. + +const ALL_SEARCH_TOOLS = ['Bash', 'Grep', 'Glob', 'Read', 'WebFetch', 'Write'] + +describe('buildSchemaConfusionHint', () => { + describe('returns a hint naming the correct target tool for each confirmed shape', () => { + it('Bash with file_path → hint names Read', () => { + const hint = buildSchemaConfusionHint('Bash', { file_path: '/tmp/x' }, ALL_SEARCH_TOOLS) + expect(hint).not.toBeNull() + expect(hint!).toContain('Read') + expect(hint!).toContain('file_path') + }) + + it('Bash with file_path+limit → hint names Read', () => { + const hint = buildSchemaConfusionHint( + 'Bash', + { file_path: '/tmp/x', limit: 5 }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Read') + expect(hint!).toContain('file_path') + expect(hint!).toContain('limit') + }) + + it('Bash with pattern+path+output_mode → hint names Grep', () => { + const hint = buildSchemaConfusionHint( + 'Bash', + { pattern: 'foo', path: '/x', output_mode: 'content' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Grep') + expect(hint!).toContain('pattern') + expect(hint!).toContain('path') + }) + + it('Bash with bare pattern → hint names Glob', () => { + const hint = buildSchemaConfusionHint('Bash', { pattern: 'src/*.ts' }, ALL_SEARCH_TOOLS) + expect(hint).not.toBeNull() + expect(hint!).toContain('Glob') + expect(hint!).toContain('pattern') + }) + + it('Grep with command+description → hint names Bash', () => { + const hint = buildSchemaConfusionHint( + 'Grep', + { command: 'ls', description: 'list' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Bash') + expect(hint!).toContain('command') + }) + + it('Read with command+description → hint names Bash', () => { + const hint = buildSchemaConfusionHint( + 'Read', + { command: 'ls', description: 'list' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Bash') + expect(hint!).toContain('command') + }) + + it('Read with bare pattern → hint names Glob', () => { + const hint = buildSchemaConfusionHint( + 'Read', + { pattern: 'src/*.ts' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Glob') + expect(hint!).toContain('pattern') + }) + + it('Grep with a glob-looking pattern → hint names Glob', () => { + const hint = buildSchemaConfusionHint( + 'Grep', + { pattern: '/tmp/ncode-glm-sweep/*.txt' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Glob') + expect(hint!).toContain('pattern') + }) + }) + + describe('does not emit a hint when the call is well-formed for its tool', () => { + it('returns null for Bash with command', () => { + const hint = buildSchemaConfusionHint( + 'Bash', + { command: 'ls' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null for Read with file_path', () => { + const hint = buildSchemaConfusionHint( + 'Read', + { file_path: '/tmp/x' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null for Grep with pattern+path (real content regex)', () => { + const hint = buildSchemaConfusionHint( + 'Grep', + { pattern: 'foo|bar', path: '/x' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null for Glob with pattern', () => { + const hint = buildSchemaConfusionHint( + 'Glob', + { pattern: 'src/*.ts' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + }) + + describe('does not emit a hint when a sibling tool is missing from the registry', () => { + it('returns null when Read is not in siblingToolNames (Bash←file_path case)', () => { + const hint = buildSchemaConfusionHint( + 'Bash', + { file_path: '/tmp/x' }, + ['Bash', 'Grep', 'Glob'], + ) + expect(hint).toBeNull() + }) + + it('returns null when Glob is not in siblingToolNames (Grep←pattern case)', () => { + const hint = buildSchemaConfusionHint( + 'Grep', + { pattern: '*.ts' }, + ['Bash', 'Grep', 'Read'], + ) + expect(hint).toBeNull() + }) + }) + + describe('ignored tools outside the search cluster', () => { + it('returns null when toolName is TaskGet (not a search tool)', () => { + const hint = buildSchemaConfusionHint( + 'TaskGet', + { command: 'ls', file_path: '/tmp/x' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null when input is empty', () => { + const hint = buildSchemaConfusionHint('Bash', {}, ALL_SEARCH_TOOLS) + expect(hint).toBeNull() + }) + }) + + describe('ambiguous shapes', () => { + it('Bash with pattern+path but no other Grep-specific param → hint names Grep (path is the disambiguator)', () => { + const hint = buildSchemaConfusionHint( + 'Bash', + { pattern: 'foo', path: '/x' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Grep') + }) + + it('Bash with pattern that is a glob-looking path → hint names Glob (no path field)', () => { + const hint = buildSchemaConfusionHint( + 'Bash', + { pattern: 'src/*.ts' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Glob') + }) + }) + + describe('WebFetch extension (2026-06-22)', () => { + it('WebFetch with file_path → hint names Read', () => { + const hint = buildSchemaConfusionHint( + 'WebFetch', + { file_path: '/tmp/x' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Read') + expect(hint!).toContain('file_path') + }) + + it('WebFetch with file_path+limit → hint names Read', () => { + const hint = buildSchemaConfusionHint( + 'WebFetch', + { file_path: '/tmp/x', limit: 5 }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Read') + expect(hint!).toContain('file_path') + expect(hint!).toContain('limit') + }) + + it('returns null for well-formed WebFetch with url+prompt', () => { + const hint = buildSchemaConfusionHint( + 'WebFetch', + { url: 'https://example.com', prompt: 'summarize' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + }) + + describe('Write/Read file_path collision (2026-06-22)', () => { + it('Write with file_path+limit → hint names Read (content absent, Read-only optional present)', () => { + const hint = buildSchemaConfusionHint( + 'Write', + { file_path: '/tmp/x', limit: 5 }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Read') + expect(hint!).toContain('file_path') + expect(hint!).toContain('limit') + }) + + it('Write with file_path+offset → hint names Read', () => { + const hint = buildSchemaConfusionHint( + 'Write', + { file_path: '/tmp/x', offset: 10 }, + ALL_SEARCH_TOOLS, + ) + expect(hint).not.toBeNull() + expect(hint!).toContain('Read') + expect(hint!).toContain('offset') + }) + + it('returns null for Write with only file_path (ambiguous: could be Write missing content OR Read leaking file_path)', () => { + // When content is absent AND no Read-only optional is present, + // we cannot tell which tool the model meant. Emit no hint and let + // the bare Zod error surface the missing `content` requirement. + const hint = buildSchemaConfusionHint( + 'Write', + { file_path: '/tmp/x' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null for Write with file_path+content (well-formed Write)', () => { + const hint = buildSchemaConfusionHint( + 'Write', + { file_path: '/tmp/x', content: 'hello' }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null for Write with file_path+content+limit (well-formed Write with stray param)', () => { + // `content` present anchors Write intent. The validator will + // surface `limit` as an unexpected param via the bare Zod error. + // We do not emit a Read hint here because the called tool's own + // required keyword (file_path) + content fully express Write + // intent; the model just included a stray param that doesn't + // belong to either tool's accepted input shape. + const hint = buildSchemaConfusionHint( + 'Write', + { file_path: '/tmp/x', content: 'hello', limit: 5 }, + ALL_SEARCH_TOOLS, + ) + expect(hint).toBeNull() + }) + + it('returns null when Write is called and Read is missing from the registry', () => { + // Without Read available, the file_path+limit input on Write + // cannot route anywhere — emit no hint. + const hint = buildSchemaConfusionHint( + 'Write', + { file_path: '/tmp/x', limit: 5 }, + ['Bash', 'Grep', 'Glob', 'WebFetch', 'Write'], + ) + expect(hint).toBeNull() + }) + }) +}) \ No newline at end of file diff --git a/src/services/tools/schemaConfusionHint.ts b/src/services/tools/schemaConfusionHint.ts new file mode 100644 index 0000000..d375431 --- /dev/null +++ b/src/services/tools/schemaConfusionHint.ts @@ -0,0 +1,331 @@ +/** + * Detects tool-call schema confusion in the {Bash, Grep, Glob, Read, + * WebFetch, Write} cluster and returns a targeted, actionable hint + * appended to the standard Zod validation error. + * + * Background: GLM-5.2 (and likely other OpenAI-compatible models) emit + * calls to one tool with the parameter schema of a sibling tool. Example: + * `Bash({ file_path: '/x' })` — where `file_path` belongs to Read, not + * Bash. The raw Zod error ("required parameter `command` is missing; + * unexpected parameter `file_path` was provided") names the problem but + * does not name the fix, so the model retries by guessing. This hint + * names both. + * + * Confirmed shape catalog: + * schema-confusion: + * Bash({ file_path[, limit] }) → Read + * Bash({ pattern, path[, output_mode] }) → Grep + * Bash({ pattern }) → Glob + * Grep({ command[, description] }) → Bash + * Read({ command[, description] }) → Bash + * Read({ pattern }) → Glob + * Grep({ pattern = }) → Glob + * Extension: + * WebFetch({ file_path[, limit, offset, pages] }) → Read + * Write({ file_path, limit/offset/pages }) → Read + * Write({ file_path, content }) → null (well-formed) + * Write({ file_path }) → null (ambiguous) + * Bash({ file_path, content }) → Write (content anchors) + * + * The hint is appended to the standard `formatZodValidationError` + * output in toolExecution.ts; it does not silently reroute the call. + * Rerouting would create transcript inconsistency (a Bash call turning + * into a Read result under the same tool_use_id). The hint keeps the + * transcript clean and lets the model self-correct on the next turn. + * + * Returns null when no confusion pattern fires so callers can write + * `if (hint) errorContent += hint` without further condition. + */ + +// Set of tool names subject to schema-confusion hinting. Originally the +// search-tool cluster {Bash, Grep, Glob, Read} (schema-confusion, 2026-06-20). Extended +// to WebFetch and Write after GLM-5.2 reports showed the model leaking +// Read's `file_path` (and Read's `limit`/`offset`) onto WebFetch and Write +// repeatedly without recovering, because the hint never fired for those +// called tools. +const SEARCH_TOOL_NAMES = new Set(['Bash', 'Grep', 'Glob', 'Read', 'WebFetch', 'Write']) + +// Read-only optional params. Used to disambiguate the Write/Read file_path +// collision: only Read accepts these as input, so their presence alongside +// file_path on a Write call is a strong signal the model meant Read. +const READ_ONLY_OPTIONALS = new Set(['limit', 'offset', 'pages']) + +// Parameter-scorecard per tool in the cluster. The matcher uses these +// tables to (a) decide whether the model's input keys belong to a +// different sibling tool, and (b) hand the model the parameter name +// it should actually use on its originally-named tool. +// +// `keyword`: required/marketing parameter that uniquely identifies +// the tool (used for high-confidence detection). +// `optional`: optional params that, when present alongside the +// keyword, reinforce the diagnosis but don't diagnose alone. +type ToolParamSignature = { + keyword: string + optional: ReadonlySet +} + +const SIGNATURES: Record = { + Bash: { + keyword: 'command', + optional: new Set([ + 'description', + 'timeout', + 'run_in_background', + 'dangerouslyDisableSandbox', + ]), + }, + Read: { + keyword: 'file_path', + optional: new Set(['limit', 'offset', 'pages']), + }, + Grep: { + keyword: 'pattern', + optional: new Set([ + 'path', + 'output_mode', + 'glob', + 'type', + 'multiline', + 'head_limit', + 'offset', + 'context', + '-i', + '-n', + '-A', + '-B', + '-C', + ]), + }, + // Glob shares `pattern` with Grep. Disambiguation is content-based: + // a pattern with path-like or glob-shaped chars (no regex-isms, no + // pipe alternation) when emitted alone is treated as Glob intent. + Glob: { + keyword: 'pattern', + optional: new Set(['path', 'output_mode']), + }, + // WebFetch's url/prompt are unique — no other tool in the cluster + // accepts them. WebFetch calls that leak Read's file_path (with or + // without Read's optionals) therefore route cleanly to Read. + WebFetch: { + keyword: 'url', + optional: new Set(['prompt']), + }, + // Write: `content` is Write-unique — no other tool in the cluster + // accepts it. Making `content` Write's keyword (not `file_path`) + // resolves the Write/Read `file_path` collision at the target side: + // any input with `content` present cleanly anchors Write intent, + // regardless of whether Read is in the sibling registry. `file_path` + // becomes optional because Write does accept it as a required field + // in the actual Zod schema, but as a leak-detection signal it is + // shared with Read and cannot anchor Write intent alone. + Write: { + keyword: 'content', + optional: new Set(['file_path']), + }, +} + +// Returns true if a pattern string looks like a file glob rather than +// a regex (e.g. "*.ts", "src/**/\*.tsx", "foo/*.md") vs an actual +// regex (e.g. "foo|bar", "\\bword\\b", "^a.*z$"). +function looksLikeGlob(pattern: unknown): boolean { + if (typeof pattern !== 'string') return false + // Glob-only signatures: contains `*` or `?` or `[...]` but no regex + // alternation (`|`) or common anchors/quantifiers that Grep regexes + // rely on. We deliberately keep this conservative — when in doubt, + // we do not classify a pattern as glob (returns false) so the model + // keeps the Grep intent and the model-side retry isn't misdirected. + if (!/[*?]/.test(pattern)) return false + if (/[|]/.test(pattern)) return false + if (/\\\b|\\b|\\\d|\^\(|\$\)/.test(pattern)) return false + // Path-style glob: leading slash or ./ or contains directory separators + if (pattern.includes('/') || pattern.startsWith('.') ) return true + // Bare glob like "*.ts" with no path + return true +} + +/** + * Shared leak-detection core. Returns the sibling tool whose signature + * best matches the input, or null when no leak is detected. Called by + * both the model-facing hint string (buildSchemaConfusionHint) and the + * structured telemetry inference (inferSchemaLeakSource) so they cannot + * drift apart — a single source of truth for what counts as a schema + * leak and which sibling the input points at. + * + * Returns null when: + * - called tool is outside SEARCH_TOOL_NAMES + * - input is empty + * - called tool's own keyword is present (well-formed call) + * - Write is called without `content` and without Read-only optionals + * (ambiguous: could be Write missing `content`, could be Read + * leaking `file_path` alone) + * - no sibling signature matches the input keys cleanly + * + * Does NOT handle the Grep-with-glob-looking-pattern special case — + * that is a misuse warning (own keyword present), not a leak, and is + * handled by buildSchemaConfusionHint directly before calling this. + */ +type SchemaLeakMatch = { + target: string + matchedParams: string[] +} + +function matchSchemaLeakTarget( + toolName: string, + input: { [key: string]: unknown }, + siblingToolNames: readonly string[], +): SchemaLeakMatch | null { + if (!SEARCH_TOOL_NAMES.has(toolName)) return null + const inputKeys = Object.keys(input) + if (inputKeys.length === 0) return null + + const availableTools = new Set(siblingToolNames) + const calledSignature = SIGNATURES[toolName] + let ownKeywordPresent = inputKeys.includes(calledSignature.keyword) + + // Write/Read file_path collision override (called side). With Write's + // keyword set to `content`, `ownKeywordPresent` is true iff `content` + // is in the input. When `content` is absent, we must distinguish: + // - A Read-only optional (limit/offset/pages) present → the model + // is leaking Read. `ownKeywordPresent` is already false (no + // content), so the sibling search below considers Read. No state + // to mutate. + // - No Read-only optional present → ambiguous: could be Write + // missing `content`, could be Read leaking `file_path` alone. + // Cannot disambiguate; return null and let the bare Zod error + // surface the missing `content` requirement. + if (toolName === 'Write' && !inputKeys.includes('content')) { + const hasReadOptional = inputKeys.some(k => READ_ONLY_OPTIONALS.has(k)) + if (!hasReadOptional) { + return null + } + } + + // If the called tool's own keyword is present, the call is well-formed + // for that tool — no leak to report. + if (ownKeywordPresent) return null + + let bestMatch: SchemaLeakMatch | null = null + + for (const targetName of SEARCH_TOOL_NAMES) { + if (targetName === toolName) continue + if (!availableTools.has(targetName)) continue + + const signature = SIGNATURES[targetName] + if (!inputKeys.includes(signature.keyword)) continue + + // Disambiguate the Grep/Glob pattern-keyword collision. + if (targetName === 'Grep' || targetName === 'Glob') { + if (signature.keyword === 'pattern') { + const patternValue = input['pattern'] + // Skip a sibling whose keyword matches but whose value shape + // doesn't fit. We only emit a hint for the sibling that best + // explains the input. + if (targetName === 'Glob' && !looksLikeGlob(patternValue)) { + // If Glob's pattern looks like a regex, skip Glob. + continue + } + if ( + targetName === 'Grep' && + looksLikeGlob(patternValue) && + // If pattern looks like a glob and there are no Grep-only + // optionals present, treat as Glob intent. + !inputKeys.some(k => SIGNATURES.Grep.optional.has(k)) + ) { + continue + } + } + } + + // Verify all input keys are either the target's keyword or one + // of its optional params. Stray params weaken the match. + const knownTargetParams = new Set([signature.keyword, ...signature.optional]) + const allKnown = inputKeys.every(k => knownTargetParams.has(k)) + if (!allKnown) continue + + const matchedParams = inputKeys.filter( + k => k === signature.keyword || signature.optional.has(k), + ) + if (!bestMatch || matchedParams.length > bestMatch.matchedParams.length) { + bestMatch = { target: targetName, matchedParams } + } + } + + return bestMatch +} + +/** + * Constructs the confusion hint string. Returns null when no recognized + * pattern fires, when the tool is not in the search cluster, + * when the input is empty, or when the sibling tool the input points + * at is not actually available in the registry (the model can't be + * told to call a tool it doesn't have). + */ +export function buildSchemaConfusionHint( + toolName: string, + input: { [key: string]: unknown }, + siblingToolNames: readonly string[], +): string | null { + // Grep-as-Glob special case: own keyword (pattern) is present, but a + // bare glob-looking pattern with no Grep-specific optionals means the + // model almost certainly meant Glob. File globs are not valid Grep + // regex syntax and Grep would timeout or error. This is a misuse + // warning, not a schema leak, so it is checked before the shared + // leak matcher (which returns null when the called tool's own keyword + // is present). + if ( + toolName === 'Grep' && + siblingToolNames.includes('Glob') && + Object.keys(input).length === 1 && + typeof input['pattern'] === 'string' && + looksLikeGlob(input['pattern']) + ) { + return ( + ` Hint: \`${input['pattern']}\` looks like a file glob, not a ` + + `regex. Call \`Glob\` with \`pattern\` instead of \`Grep\`.` + ) + } + + const match = matchSchemaLeakTarget(toolName, input, siblingToolNames) + if (!match) return null + + const { target, matchedParams } = match + const ownKeyword = SIGNATURES[toolName].keyword + const paramList = matchedParams.map(p => `\`${p}\``).join(', ') + return ( + ` Hint: parameters ${paramList} belong to the \`${target}\` tool. ` + + `Call \`${target}\` with those parameters instead, or supply \`${toolName}\`'s required \`${ownKeyword}\` parameter and retry.` + ) +} + +/** + * Structured inference result for telemetry. Same matching logic as + * buildSchemaConfusionHint, but returns the structured fields instead of + * the model-facing hint string. Used by the ncode_schema_leak_rejected + * telemetry event so we can measure (a) leak frequency by tool family + * and model, (b) how often the hint fires, and (c) which sibling the + * input points at. + * + * Returns null when the called tool is outside SEARCH_TOOL_NAMES, when + * input is empty, or when no sibling signature matches the input keys. + * When the called tool's own keyword is present (well-formed call), also + * returns null — that is not a leak. + */ +export type SchemaLeakInference = { + /** Tool the input params actually belong to, e.g. "Read". */ + source_tool_inferred: string + /** Input keys that matched the inferred sibling's signature. */ + stray_params: string[] +} + +export function inferSchemaLeakSource( + toolName: string, + input: { [key: string]: unknown }, + siblingToolNames: readonly string[], +): SchemaLeakInference | null { + const match = matchSchemaLeakTarget(toolName, input, siblingToolNames) + if (!match) return null + return { + source_tool_inferred: match.target, + stray_params: match.matchedParams, + } +} \ No newline at end of file diff --git a/src/services/tools/schemaLeakTelemetry.test.ts b/src/services/tools/schemaLeakTelemetry.test.ts new file mode 100644 index 0000000..85f025e --- /dev/null +++ b/src/services/tools/schemaLeakTelemetry.test.ts @@ -0,0 +1,157 @@ +import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test' +import * as analytics from '../analytics/index.js' +import { inferSchemaLeakSource } from './schemaConfusionHint.js' + +// Tests for the structured telemetry helper that powers +// ncode_schema_leak_rejected. The helper must return exactly the same +// source_tool_inferred / stray_params the hint would name, plus a few +// null cases where we deliberately do not emit. + +const ALL_SEARCH_TOOLS = ['Bash', 'Grep', 'Glob', 'Read', 'WebFetch', 'Write'] + +describe('inferSchemaLeakSource', () => { + describe('returns source + stray params for each confirmed leak shape', () => { + it('Bash({ file_path }) → Read, [file_path]', () => { + const r = inferSchemaLeakSource('Bash', { file_path: '/tmp/x' }, ALL_SEARCH_TOOLS) + expect(r).toEqual({ source_tool_inferred: 'Read', stray_params: ['file_path'] }) + }) + + it('Bash({ file_path, limit }) → Read, [file_path, limit]', () => { + const r = inferSchemaLeakSource( + 'Bash', + { file_path: '/tmp/x', limit: 5 }, + ALL_SEARCH_TOOLS, + ) + expect(r).toEqual({ + source_tool_inferred: 'Read', + stray_params: ['file_path', 'limit'], + }) + }) + + it('Bash({ pattern, path, output_mode }) → Grep, [pattern, path, output_mode]', () => { + const r = inferSchemaLeakSource( + 'Bash', + { pattern: 'foo', path: '/x', output_mode: 'files_with_matches' }, + ALL_SEARCH_TOOLS, + ) + expect(r).toEqual({ + source_tool_inferred: 'Grep', + stray_params: ['pattern', 'path', 'output_mode'], + }) + }) + + it('Read({ command, description }) → Bash, [command, description]', () => { + const r = inferSchemaLeakSource( + 'Read', + { command: 'ls', description: 'list' }, + ALL_SEARCH_TOOLS, + ) + expect(r).toEqual({ + source_tool_inferred: 'Bash', + stray_params: ['command', 'description'], + }) + }) + + it('WebFetch({ file_path }) → Read, [file_path]', () => { + const r = inferSchemaLeakSource('WebFetch', { file_path: '/x' }, ALL_SEARCH_TOOLS) + expect(r).toEqual({ source_tool_inferred: 'Read', stray_params: ['file_path'] }) + }) + }) + + describe('returns null when no leak is detectable', () => { + it('called tool outside the search cluster → null', () => { + // TaskGet is not in SEARCH_TOOL_NAMES; even if its input is + // shaped like a sibling, we pass through without diagnosing. + const r = inferSchemaLeakSource('TaskGet', { file_path: '/x' }, ALL_SEARCH_TOOLS) + expect(r).toBeNull() + }) + + it('well-formed Bash call (own keyword present) → null', () => { + const r = inferSchemaLeakSource( + 'Bash', + { command: 'ls', description: 'list' }, + ALL_SEARCH_TOOLS, + ) + expect(r).toBeNull() + }) + + it('empty input → null', () => { + const r = inferSchemaLeakSource('Bash', {}, ALL_SEARCH_TOOLS) + expect(r).toBeNull() + }) + + it('bare Write({ file_path }) with no stray optionals → null', () => { + // Write's own keyword is file_path; when only file_path is present + // (no content, no Read-only optionals like limit/offset), we cannot + // confidently infer Read intent. The bare "required content missing" + // error from Zod is the best we can offer the model. + const r = inferSchemaLeakSource('Write', { file_path: '/x' }, ALL_SEARCH_TOOLS) + expect(r).toBeNull() + }) + + it('Write({ file_path, limit, offset }) → Read, [file_path, limit, offset]', () => { + // Read-only optionals (limit/offset/pages) disambiguate the Write/Read + // file_path collision. Their presence without Write's required + // `content` confirms Read intent. + const r = inferSchemaLeakSource( + 'Write', + { file_path: '/x', limit: 5, offset: 10 }, + ALL_SEARCH_TOOLS, + ) + expect(r).toEqual({ + source_tool_inferred: 'Read', + stray_params: ['file_path', 'limit', 'offset'], + }) + }) + }) +}) + +// Telemetry emission smoke test: verifies the right event name and +// field shape surface when a real leak is detected. We don't exercise +// the full rejection path here — that's covered by toolExecution's +// existing tests — we just confirm the helper hook composes correctly +// with the analytics sink. +describe('telemetry wiring', () => { + const events: Array<{ name: string; metadata: Record }> = [] + let restore: () => void + + beforeEach(() => { + events.length = 0 + const spy = spyOn(analytics, 'logEvent').mockImplementation( + (name: string, metadata: Record) => { + events.push({ name, metadata }) + }, + ) + restore = () => spy.mockRestore() + }) + + afterEach(() => restore()) + + it('helper result maps cleanly into the ncode_schema_leak_rejected event shape', () => { + const leak = inferSchemaLeakSource('Bash', { file_path: '/tmp/x' }, ALL_SEARCH_TOOLS) + expect(leak).not.toBeNull() + // Simulate the exact call shape toolExecution.ts makes. + if (leak) { + analytics.logEvent('ncode_schema_leak_rejected', { + model: 'glm-5.2', + target_tool: 'Bash', + source_tool_inferred: leak.source_tool_inferred, + stray_params: leak.stray_params, + missing_required_params: ['command'], + hint_emitted: true, + query_chain_id: 'chain-1', + query_depth: 0, + }) + } + expect(events).toHaveLength(1) + expect(events[0]!.name).toBe('ncode_schema_leak_rejected') + expect(events[0]!.metadata).toMatchObject({ + model: 'glm-5.2', + target_tool: 'Bash', + source_tool_inferred: 'Read', + stray_params: ['file_path'], + missing_required_params: ['command'], + hint_emitted: true, + }) + }) +}) \ No newline at end of file diff --git a/src/services/tools/toolExecution.ts b/src/services/tools/toolExecution.ts index 079664d..2bc6851 100644 --- a/src/services/tools/toolExecution.ts +++ b/src/services/tools/toolExecution.ts @@ -130,6 +130,10 @@ import { runPostToolUseHooks, runPreToolUseHooks, } from './toolHooks.js' +import { + buildSchemaConfusionHint, + inferSchemaLeakSource, +} from './schemaConfusionHint.js' /** Minimum total hook duration (ms) to show inline timing summary */ export const HOOK_TIMING_DISPLAY_THRESHOLD_MS = 500 @@ -630,6 +634,47 @@ async function checkPermissionsAndCallTool( errorContent += schemaHint } + const confusionHint = buildSchemaConfusionHint( + tool.name, + input, + toolUseContext.options.tools.map(t => t.name), + ) + if (confusionHint) { + logEvent('ncode_schema_confusion_hint', { + toolName: sanitizeToolNameForAnalytics(tool.name), + isMcp: tool.isMcp ?? false, + }) + errorContent += confusionHint + } + + const leak = inferSchemaLeakSource( + tool.name, + input, + toolUseContext.options.tools.map(t => t.name), + ) + const missingRequired = parsedInput.error.issues + .filter( + issue => + issue.code === 'invalid_type' && issue.received === 'missing', + ) + .map(issue => issue.path.join('.')) + .filter(name => name.length > 0) + if (leak || missingRequired.length > 0) { + logEvent('ncode_schema_leak_rejected', { + model: + (toolUseContext.options.mainLoopModel ?? '') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + target_tool: + sanitizeToolNameForAnalytics(tool.name) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + source_tool_inferred: (leak?.source_tool_inferred ?? null) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + stray_params: (leak?.stray_params ?? []) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + missing_required_params: missingRequired as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + hint_emitted: confusionHint !== null, + query_chain_id: + (toolUseContext.queryTracking?.chainId ?? '') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + query_depth: toolUseContext.queryTracking?.depth ?? 0, + }) + } + logForDebugging( `${tool.name} tool input error: ${errorContent.slice(0, 200)}`, )