diff --git a/scripts/reset_fmc_workorders.py b/scripts/reset_fmc_workorders.py
new file mode 100644
index 00000000..501bed4e
--- /dev/null
+++ b/scripts/reset_fmc_workorders.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+"""Reset the ``wo_fmc`` work orders in CouchDB back to their seed state.
+
+The failure-code write-back scenarios (S2/S4/S5) mutate ``wo_fmc`` records in
+the ``workorder`` database. This restores every record's ``failure_code`` to
+the value in the seed CSV (``src/couchdb/sample_data/work_order/wo_fmc.csv``):
+TRN- records keep their historical codes, TST- records go back to blank. Only
+records that have drifted from the seed are written.
+
+For a full rebuild of the entire ``workorder`` DB from all CSVs instead, use:
+    cd src && uv run python -m couchdb.init_wo --drop
+
+Usage:
+    uv run python scripts/reset_fmc_workorders.py
+    uv run python scripts/reset_fmc_workorders.py --dry-run
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import sys
+from pathlib import Path
+
+_ROOT = Path(__file__).resolve().parent.parent
+_SRC = _ROOT / "src"
+sys.path.insert(0, str(_SRC))
+
+_CSV = _SRC / "couchdb" / "sample_data" / "work_order" / "wo_fmc.csv"
+
+
+def _seed_codes() -> dict[str, str | None]:
+    """Map wo_id → seed failure_code (None for blank) from the CSV.
+
+    Blank or whitespace-only ``failure_code`` cells become ``None`` so they
+    compare equal to the normalised CouchDB values computed in ``main``.
+    """
+    seed: dict[str, str | None] = {}
+    with open(_CSV, newline="", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            code = (row.get("failure_code") or "").strip()
+            seed[row["wo_id"]] = code or None
+    return seed
+
+
+def main() -> None:
+    """Report drift from the seed CSV and, unless ``--dry-run``, repair it.
+
+    Exits with status 1 when ``wo_fmc`` cannot be loaded, when the write-back
+    returns ``None``, or when only part of the drifted records were restored.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true", help="Report drift without writing."
+    )
+    args = parser.parse_args()
+
+    from dotenv import load_dotenv
+
+    load_dotenv(_ROOT / ".env")
+
+    import pandas as pd
+
+    from servers.wo.data import load, write_failure_codes
+
+    seed = _seed_codes()
+    blanks = sum(1 for v in seed.values() if v is None)
+    print(f"seed: {len(seed)} wo_fmc records ({len(seed) - blanks} labelled, {blanks} blank)")
+
+    df = load("wo_fmc")
+    if df is None:
+        print("CouchDB unavailable or wo_fmc not loaded — nothing to reset.")
+        sys.exit(1)
+
+    def _norm(v) -> str | None:
+        """Normalise a cell: NaN or blank becomes None, else the stripped text."""
+        if pd.isna(v) or not str(v).strip():
+            return None
+        return str(v).strip()
+
+    current = {str(r["wo_id"]): _norm(r.get("failure_code")) for _, r in df.iterrows()}
+    # Only seed records whose current value differs are queued for writing.
+    drift = {wo_id: code for wo_id, code in seed.items() if current.get(wo_id) != code}
+
+    print(f"drifted from seed: {len(drift)} record(s)")
+    if not drift:
+        print("already at seed state — nothing to do.")
+        return
+    if args.dry_run:
+        for wo_id in list(drift)[:20]:
+            print(f" {wo_id}: {current.get(wo_id)!r} -> {drift[wo_id]!r}")
+        if len(drift) > 20:
+            print(f" … and {len(drift) - 20} more")
+        return
+
+    status = write_failure_codes(drift)
+    if status is None:
+        print("CouchDB unavailable — reset aborted.")
+        sys.exit(1)
+    restored = sum(1 for ok in status.values() if ok)
+    print(f"reset {restored}/{len(drift)} wo_fmc record(s) to seed state.")
+    if restored < len(drift):
+        # A partial reset leaves drifted records behind; signal it to callers.
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/test_fmc_claude_agent.py b/scripts/test_fmc_claude_agent.py
new file mode 100644
index 00000000..2868ba3f
--- /dev/null
+++ b/scripts/test_fmc_claude_agent.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+"""Quick smoke test: run the Claude agent over the FMC work-order scenarios.
+
+Loads the scenarios from ``src/scenarios/local/workorder_utterance.json``, runs
+each one through ``ClaudeAgentRunner`` (which connects the ``wo`` MCP server and
+its FMC tools), and prints the agent's answer next to the expected answer plus
+the tools it actually called.
+
+Write-back scenarios (S2/S4/S5) mutate CouchDB; by default this script
+re-blanks every ``TST-`` record afterwards so the evaluation dataset stays
+pristine. Pass ``--no-restore`` to leave the imputations in place (e.g. to
+inspect the write-back independently).
+
+Requires CouchDB up (``workorder`` DB loaded) and LITELLM_* env vars in .env.
+
+Usage:
+    uv run python scripts/test_fmc_claude_agent.py # all scenarios
+    uv run python scripts/test_fmc_claude_agent.py S1 S3 # only S1 and S3
+    uv run python scripts/test_fmc_claude_agent.py --no-restore
+    uv run python scripts/test_fmc_claude_agent.py --show-trajectory
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+import sys
+import uuid
+from pathlib import Path
+
+_ROOT = Path(__file__).resolve().parent.parent
+_SRC = _ROOT / "src"
+sys.path.insert(0, str(_SRC))
+
+_SCENARIOS = _SRC / "scenarios" / "local" / "workorder_utterance.json"
+_HR = "=" * 72
+
+
+def _load_scenarios(labels: set[str]) -> list[dict]:
+    """Return scenarios whose ``scenario_label`` is in *labels* (all when empty)."""
+    import json
+
+    data = json.loads(_SCENARIOS.read_text(encoding="utf-8"))
+    return [
+        s
+        for s in data
+        if not labels or s.get("metadata", {}).get("scenario_label", "") in labels
+    ]
+
+
+async def _run(args: argparse.Namespace) -> None:
+    """Run each selected scenario through the agent and print a comparison.
+
+    Write-back imputations are undone afterwards (unless ``--no-restore``), even
+    when a scenario run raises part-way through.
+    """
+    from agent.claude_agent.runner import ClaudeAgentRunner
+    from observability import set_run_context
+    from servers.wo.data import load, write_failure_codes
+
+    scenarios = _load_scenarios(set(args.labels))
+    if not scenarios:
+        print(f"No scenarios matched {args.labels!r}")
+        return
+
+    traj_dir = os.environ.get("AGENT_TRAJECTORY_DIR")
+    if traj_dir:
+        print(f"[trajectory] saving per-scenario JSON to {traj_dir}")
+
+    runner = ClaudeAgentRunner(model=args.model_id, max_turns=args.max_turns)
+    needs_restore = False
+
+    try:
+        for s in scenarios:
+            md = s.get("metadata", {})
+            label = md.get("scenario_label", "?")
+            # Flag before running so a scenario that fails mid-write is still undone.
+            needs_restore = needs_restore or bool(md.get("write_back"))
+
+            # Fresh run_id per scenario so persist_trajectory writes one file each
+            # (keyed by run_id, with scenario_id recorded inside).
+            run_id = str(uuid.uuid4())
+            set_run_context(run_id=run_id, scenario_id=label)
+
+            print(f"\n{_HR}\n{label} · {md.get('subtitle', '')}\n{_HR}")
+            print(f"Q: {s['text']}\n")
+
+            result = await runner.run(s["text"])
+
+            tools_used = [tc.name for tc in result.trajectory.all_tool_calls]
+            print(f"EXPECTED : {s.get('expected_answer')}")
+            print(f"AGENT ANSWER : {result.answer}")
+            print(f"EXPECTED TOOLS: {md.get('expected_tools')}")
+            print(f"TOOLS USED : {tools_used}")
+            print(
+                f"(turns={len(result.trajectory.turns)}, "
+                f"tool_calls={len(tools_used)}, "
+                f"out_tokens={result.trajectory.total_output_tokens})"
+            )
+            if traj_dir:
+                print(f"SAVED : {Path(traj_dir) / f'{run_id}.json'} (scenario_id={label})")
+            if args.show_trajectory:
+                for t in result.trajectory.turns:
+                    for tc in t.tool_calls:
+                        print(f" → {tc.name}({tc.input})")
+    finally:
+        # Restore in ``finally`` so a failed scenario cannot leave imputed codes
+        # behind in CouchDB.
+        if needs_restore and not args.no_restore:
+            df = load("wo_fmc")
+            if df is None:
+                print("\n[restore] wo_fmc unavailable — TST- records were NOT re-blanked.")
+            else:
+                # na=False: rows without a wo_id are skipped instead of breaking the mask.
+                is_tst = df["wo_id"].str.startswith("TST", na=False)
+                tst_ids = [str(w) for w in df.loc[is_tst, "wo_id"]]
+                status = write_failure_codes({wo_id: None for wo_id in tst_ids}) or {}
+                restored = sum(1 for ok in status.values() if ok)
+                print(f"\n[restore] re-blanked {restored} TST- record(s) to keep the dataset pristine.")
+        elif needs_restore:
+            print("\n[restore] skipped (--no-restore); TST- records keep their imputed codes.")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("labels", nargs="*", help="Scenario labels to run (e.g. S1 S3).
Default: all.") + parser.add_argument("--model-id", default="litellm_proxy/aws/claude-opus-4-6", help="Model string.") + parser.add_argument("--max-turns", type=int, default=30, help="Max agentic loop turns.") + parser.add_argument("--no-restore", action="store_true", help="Leave write-back imputations in CouchDB.") + parser.add_argument("--show-trajectory", action="store_true", help="Print each tool call.") + parser.add_argument( + "--trajectory-dir", + metavar="DIR", + default=None, + help="Save a {run_id}.json trajectory per scenario to DIR " + "(overrides/sets AGENT_TRAJECTORY_DIR).", + ) + args = parser.parse_args() + + from dotenv import load_dotenv + + load_dotenv(_ROOT / ".env") + if args.trajectory_dir: + os.environ["AGENT_TRAJECTORY_DIR"] = args.trajectory_dir + asyncio.run(_run(args)) + + +if __name__ == "__main__": + main() diff --git a/src/couchdb/init_wo.py b/src/couchdb/init_wo.py index 882aaf6c..dbcb3d1d 100644 --- a/src/couchdb/init_wo.py +++ b/src/couchdb/init_wo.py @@ -66,6 +66,7 @@ ("failure_codes.csv", "failure_codes", {}), ("primary_failure_codes.csv", "primary_failure_codes", {}), ("component.csv", "component", {}), + ("wo_fmc.csv", "wo_fmc", {}), ] # Mango indexes to create: list of field-lists diff --git a/src/couchdb/sample_data/work_order/wo_fmc.csv b/src/couchdb/sample_data/work_order/wo_fmc.csv new file mode 100644 index 00000000..f07daa74 --- /dev/null +++ b/src/couchdb/sample_data/work_order/wo_fmc.csv @@ -0,0 +1,565 @@ +wo_id,description,failure_code +TRN-WO00001,falure,Breakdown +TRN-WO00002,unserviceable dpr 1,Breakdown +TRN-WO00003,bogged,Plugged / choked +TRN-WO00004,spaying slurry,Leaking +TRN-WO00005,fell of,Minor in-service problems +TRN-WO00006,no pressure output,Plugged / choked +TRN-WO00007,damageblockage,Structural deficiency +TRN-WO00008,squeeling,Noise +TRN-WO00009,requires repacking,Minor in-service problems +TRN-WO00010,no starting issue,Failure to start on demand +TRN-WO00011,shaking,Vibration +TRN-WO00012,blow,Minor 
in-service problems +TRN-WO00013,need a descale,Minor in-service problems +TRN-WO00014,over heating,Overheating +TRN-WO00015,vibrationcheck,Vibration +TRN-WO00016,vibration issue,Vibration +TRN-WO00017,not open,Minor in-service problems +TRN-WO00018,sticking,Fail to function +TRN-WO00019,seized shut,Breakdown +TRN-WO00020,blocked,Plugged / choked +TRN-WO00021,not working,Breakdown +TRN-WO00022,not energising,Low output +TRN-WO00023,power issue,Electrical +TRN-WO00024,needs repairing,Structural deficiency +TRN-WO00025,not performing,Breakdown +TRN-WO00026,wont run and,Failure to start on demand +TRN-WO00027,boil out,Overheating +TRN-WO00028,not supplying,Plugged / choked +TRN-WO00029,spilled,Leaking +TRN-WO00030,stuck in,Fail to function +TRN-WO00031,leak exponentially,Leaking +TRN-WO00032,not keeping up,Low output +TRN-WO00033,pumping issue,Low output +TRN-WO00034,leak slurry,Leaking +TRN-WO00035,us,Breakdown +TRN-WO00036,requires attention,Minor in-service problems +TRN-WO00037,does not stop,Failure to stop on demand +TRN-WO00038,noisy sce,Noise +TRN-WO00039,twisting,Fail to function +TRN-WO00040,needs to be extended,Minor in-service problems +TRN-WO00041,lost its packing,Leaking +TRN-WO00042,rusted through,Structural deficiency +TRN-WO00043,needs to be held down,Minor in-service problems +TRN-WO00044,flow issue,Plugged / choked +TRN-WO00045,degraded,Structural deficiency +TRN-WO00046,wont pump floc,Breakdown +TRN-WO00047,leakspray,Leaking +TRN-WO00048,wont run in,Failure to start on demand +TRN-WO00049,needs repacked,Minor in-service problems +TRN-WO00050,passing excessively,Leaking +TRN-WO00051,blocking,Plugged / choked +TRN-WO00052,not engaging,Failure to start on demand +TRN-WO00053,shredded,Structural deficiency +TRN-WO00054,noise,Noise +TRN-WO00055,blown out,Minor in-service problems +TRN-WO00056,needs replacing,Breakdown +TRN-WO00057,not running,Breakdown +TRN-WO00058,leak when shut,Leaking +TRN-WO00059,fall off,Minor in-service problems +TRN-WO00060,not 
being pumped away,Plugged / choked +TRN-WO00061,scaled up,Minor in-service problems +TRN-WO00062,fallen off,Minor in-service problems +TRN-WO00063,broke away,Breakdown +TRN-WO00064,cannot run in,Failure to start on demand +TRN-WO00065,not pumping enough,Plugged / choked +TRN-WO00066,not stopping,Failure to stop on demand +TRN-WO00067,unable to shu,Failure to stop on demand +TRN-WO00068,failed internaly,Breakdown +TRN-WO00069,not there,Minor in-service problems +TRN-WO00070,inefficiant,Plugged / choked +TRN-WO00071,no reading,Abnormal instrument reading +TRN-WO00072,will not start in,Failure to start on demand +TRN-WO00073,been running hot,Overheating +TRN-WO00074,fallen out,Minor in-service problems +TRN-WO00075,leak on,Leaking +TRN-WO00076,to be removed,Breakdown +TRN-WO00077,unsupported,Minor in-service problems +TRN-WO00078,giving false reading,Abnormal instrument reading +TRN-WO00079,leak in two place,Leaking +TRN-WO00080,not reaching flow,Plugged / choked +TRN-WO00081,wont turn,Failure to start on demand +TRN-WO00082,not able to open,Minor in-service problems +TRN-WO00083,need straighting,Minor in-service problems +TRN-WO00084,hot liquid,Overheating +TRN-WO00085,fell apart,Minor in-service problems +TRN-WO00086,fail again,Breakdown +TRN-WO00087,needs to be replaced,Breakdown +TRN-WO00088,underperforming,Low output +TRN-WO00089,corroded close out,Structural deficiency +TRN-WO00090,not kicking in,Failure to start on demand +TRN-WO00091,needs to be descaled,Minor in-service problems +TRN-WO00092,over flowed,Leaking +TRN-WO00093,non operational,Breakdown +TRN-WO00094,running fault,Other +TRN-WO00095,unable to run,Failure to start on demand +TRN-WO00096,come adrift,Minor in-service problems +TRN-WO00097,stiff,Fail to function +TRN-WO00098,not high enough,Low output +TRN-WO00099,broken off,Breakdown +TRN-WO00100,snapped,Structural deficiency +TRN-WO00101,shorted out,Electrical +TRN-WO00102,running when sump empty,Other +TRN-WO00103,faul,Electrical +TRN-WO00104,has a 
hole on,Structural deficiency +TRN-WO00105,will not trip,Electrical +TRN-WO00106,failing to start,Failure to start on demand +TRN-WO00107,needs adjusting,Minor in-service problems +TRN-WO00108,sheared,Structural deficiency +TRN-WO00109,failed,Breakdown +TRN-WO00110,no guarding,Minor in-service problems +TRN-WO00111,unresponsive,Failure to start on demand +TRN-WO00112,no sightglass,Minor in-service problems +TRN-WO00113,not connected,Minor in-service problems +TRN-WO00114,unserviceable and line,Breakdown +TRN-WO00115,keeps triping,Electrical +TRN-WO00116,coming out,Minor in-service problems +TRN-WO00117,requires handle,Minor in-service problems +TRN-WO00118,low flow,Plugged / choked +TRN-WO00119,split,Structural deficiency +TRN-WO00120,fail to start,Failure to start on demand +TRN-WO00121,wont reset,Failure to start on demand +TRN-WO00122,hard to close,Fail to close +TRN-WO00123,hole in suction,Structural deficiency +TRN-WO00124,worn through,Structural deficiency +TRN-WO00125,passingleaking,Leaking +TRN-WO00126,not opening,Minor in-service problems +TRN-WO00127,jammedseized,Plugged / choked +TRN-WO00128,damage,Structural deficiency +TRN-WO00129,passing on,Leaking +TRN-WO00130,weak,Other +TRN-WO00131,viberating,Vibration +TRN-WO00132,static earth resistance,Electrical +TRN-WO00133,needs to be rebuilt,Minor in-service problems +TRN-WO00134,needs supporting,Minor in-service problems +TRN-WO00135,wont work,Failure to start on demand +TRN-WO00136,wornbelt,Structural deficiency +TRN-WO00137,not attached corroded,Minor in-service problems +TRN-WO00138,keeps tripping,Electrical +TRN-WO00139,weeping,Leaking +TRN-WO00140,badly rusted,Structural deficiency +TRN-WO00141,bad vibration,Vibration +TRN-WO00142,fire,Overheating +TRN-WO00143,faulting,Electrical +TRN-WO00144,contaminating condensate,Contamination +TRN-WO00145,too fast,High output +TRN-WO00146,low viscosity,Contamination +TRN-WO00147,not closing,Minor in-service problems +TRN-WO00148,require adjusting,Minor in-service 
problems +TRN-WO00149,sound,Noise +TRN-WO00150,bent,Minor in-service problems +TRN-WO00151,corrosion on,Structural deficiency +TRN-WO00152,doesnt operate,Breakdown +TRN-WO00153,seized open,Breakdown +TRN-WO00154,excessive stopstart,Erratic output +TRN-WO00155,needs replaceing,Breakdown +TRN-WO00156,hot thermography,Overheating +TRN-WO00157,erratic,Erratic output +TRN-WO00158,false high level,Abnormal instrument reading +TRN-WO00159,will not shut,Fail to function +TRN-WO00160,lagging,Low output +TRN-WO00161,blew off,Minor in-service problems +TRN-WO00162,no fill,Minor in-service problems +TRN-WO00163,needs packing,Minor in-service problems +TRN-WO00164,wont pump,Breakdown +TRN-WO00165,badly corrod,Structural deficiency +TRN-WO00166,surging,Electrical +TRN-WO00167,false reading,Abnormal instrument reading +TRN-WO00168,not pumping enough to get,Plugged / choked +TRN-WO00169,not draining,Plugged / choked +TRN-WO00170,not wo,Breakdown +TRN-WO00171,not charging,Electrical +TRN-WO00172,locked,Failure to rotate +TRN-WO00173,not moving,Plugged / choked +TRN-WO00174,milky,Contamination +TRN-WO00175,require tightening,Minor in-service problems +TRN-WO00176,siezed,Breakdown +TRN-WO00177,no pressure,Plugged / choked +TRN-WO00178,disconnected,Breakdown +TRN-WO00179,coming adrift,Minor in-service problems +TRN-WO00180,ruptured,Structural deficiency +TRN-WO00181,not pumpung,Plugged / choked +TRN-WO00182,leakhose,Leaking +TRN-WO00183,cracked,Structural deficiency +TRN-WO00184,broken loose,Breakdown +TRN-WO00185,failing,Breakdown +TRN-WO00186,looseworn,Minor in-service problems +TRN-WO00187,alarming eratically,Abnormal instrument reading +TRN-WO00188,are not operational,Breakdown +TRN-WO00189,no static,Abnormal instrument reading +TRN-WO00190,siezed shut,Breakdown +TRN-WO00191,running too slow,Low output +TRN-WO00192,rusty,Structural deficiency +TRN-WO00193,needs modifying,Minor in-service problems +TRN-WO00194,died,Breakdown +TRN-WO00195,smoking,Overheating +TRN-WO00196,pulling 
very high,High output +TRN-WO00197,blowin,Minor in-service problems +TRN-WO00198,not stopping at,Failure to stop on demand +TRN-WO00199,not operational,Breakdown +TRN-WO00200,seizedbogged,Breakdown +TRN-WO00201,usversicol,Breakdown +TRN-WO00202,static,Electrical +TRN-WO00203,does not trip,Electrical +TRN-WO00204,needs alignment,Minor in-service problems +TRN-WO00205,contaimination,Contamination +TRN-WO00206,burst again,Breakdown +TRN-WO00207,smoke,Overheating +TRN-WO00208,losing,Minor in-service problems +TRN-WO00209,no belt,Minor in-service problems +TRN-WO00210,needs tightened,Minor in-service problems +TRN-WO00211,not turning restriction,Plugged / choked +TRN-WO00212,no signal,Abnormal instrument reading +TRN-WO00213,emulsified,Contamination +TRN-WO00214,alarming vibration,Vibration +TRN-WO00215,needs patched up,Minor in-service problems +TRN-WO00216,blew off top of,Minor in-service problems +TRN-WO00217,badly corroded in,Structural deficiency +TRN-WO00218,perished,Breakdown +TRN-WO00219,need calibration,Abnormal instrument reading +TRN-WO00220,jammed in place,Plugged / choked +TRN-WO00221,collapsed,Breakdown +TRN-WO00222,requires slotting,Minor in-service problems +TRN-WO00223,fault finding,Electrical +TRN-WO00224,reading inaccurate,Abnormal instrument reading +TRN-WO00225,sucked in,Other +TRN-WO00226,come loose,Minor in-service problems +TRN-WO00227,no phase,Electrical +TRN-WO00228,bore tripping,Electrical +TRN-WO00229,spill to ground,Electrical +TRN-WO00230,reading high,Abnormal instrument reading +TRN-WO00231,rusted away,Structural deficiency +TRN-WO00232,needs a repack,Minor in-service problems +TRN-WO00233,fried,Breakdown +TRN-WO00234,no speed control,Abnormal instrument reading +TRN-WO00235,seized breakdown,Breakdown +TRN-WO00236,needs replacement,Breakdown +TRN-WO00237,not workin,Breakdown +TRN-WO00238,require investigating,Minor in-service problems +TRN-WO00239,passing when closed,Leaking +TRN-WO00240,not operating,Breakdown 
+TRN-WO00241,crack,Structural deficiency +TRN-WO00242,stopped pumping,Spurious stop +TRN-WO00243,not pumping issue,Plugged / choked +TRN-WO00244,vibration,Vibration +TRN-WO00245,no flow,Plugged / choked +TRN-WO00246,restricted,Low output +TRN-WO00247,high vibration,Vibration +TRN-WO00248,unserviceable no contol,Breakdown +TRN-WO00249,slippery,Minor in-service problems +TRN-WO00250,no seal,Minor in-service problems +TRN-WO00251,leak underflow,Leaking +TRN-WO00252,blockedseized,Plugged / choked +TRN-WO00253,broken near,Breakdown +TRN-WO00254,faulty earth,Electrical +TRN-WO00255,not maintaining,Plugged / choked +TRN-WO00256,wont rotate,Failure to start on demand +TRN-WO00257,not efficient,Low output +TRN-WO00258,will not start,Failure to start on demand +TRN-WO00259,contamination trip,Contamination +TRN-WO00260,not working on,Breakdown +TRN-WO00261,blown out at,Minor in-service problems +TRN-WO00262,trips out,Electrical +TRN-WO00263,struggling,Low output +TRN-WO00264,slow,Low output +TRN-WO00265,keeps on tripping,Electrical +TRN-WO00266,reading stuck,Abnormal instrument reading +TRN-WO00267,not pumping,Plugged / choked +TRN-WO00268,torn,Structural deficiency +TRN-WO00269,cutting in and out,Erratic output +TRN-WO00270,leakhole,Leaking +TRN-WO00271,come free of,Minor in-service problems +TRN-WO00272,contaminated,Contamination +TRN-WO00273,not pumpingeast,Plugged / choked +TRN-WO00274,no oil movement,Plugged / choked +TRN-WO00275,cold,Other +TRN-WO00276,problem,Other +TRN-WO00277,keeps tripping at start,Electrical +TRN-WO00278,severely corroded,Structural deficiency +TRN-WO00279,needs repack,Minor in-service problems +TRN-WO00280,not secure,Minor in-service problems +TRN-WO00281,missing on,Minor in-service problems +TRN-WO00282,require changing,Minor in-service problems +TRN-WO00283,have smoked up,Overheating +TRN-WO00284,rusted of,Structural deficiency +TRN-WO00285,relieving too early,Leaking +TRN-WO00286,missing blank,Minor in-service problems +TRN-WO00287,will not 
restart,Failure to start on demand +TRN-WO00288,needs repacking,Minor in-service problems +TRN-WO00289,high amp,Abnormal instrument reading +TRN-WO00290,weaping,Leaking +TRN-WO00291,burnt out,Overheating +TRN-WO00292,blowen,Minor in-service problems +TRN-WO00293,adrift,Minor in-service problems +TRN-WO00294,seizes at time,Breakdown +TRN-WO00295,may burst,Structural deficiency +TRN-WO00296,sparaying,Leaking +TRN-WO00297,tripping straight away,Electrical +TRN-WO00298,has a split in,Structural deficiency +TRN-WO00299,corrodedstripped,Structural deficiency +TRN-WO00300,buggered,Breakdown +TRN-WO00301,seized,Breakdown +TRN-WO00302,pot in backward,Minor in-service problems +TRN-WO00303,fault,Electrical +TRN-WO00304,will not pump,Breakdown +TRN-WO00305,tripping in,Electrical +TRN-WO00306,requires replacing,Minor in-service problems +TRN-WO00307,continually tripping,Electrical +TRN-WO00308,no run indication,Abnormal instrument reading +TRN-WO00309,running hot,Overheating +TRN-WO00310,poor,Fail to function +TRN-WO00311,keeps tripping out,Electrical +TRN-WO00312,playing up,Erratic output +TRN-WO00313,not providing pressure,Plugged / choked +TRN-WO00314,not starting,Failure to start on demand +TRN-WO00315,sheared off,Structural deficiency +TRN-WO00316,not puming,Plugged / choked +TRN-WO00317,corroding,Structural deficiency +TRN-WO00318,spraying out of,Leaking +TRN-WO00319,badly corroded,Structural deficiency +TRN-WO00320,requires clean,Minor in-service problems +TRN-WO00321,holedinspect,Structural deficiency +TRN-WO00322,no earth,Electrical +TRN-WO00323,missingloose,Minor in-service problems +TRN-WO00324,siezing on,Breakdown +TRN-WO00325,keeps kicking in,Failure to stop on demand +TRN-WO00326,not efficiant,Low output +TRN-WO00327,sheered,Structural deficiency +TRN-WO00328,incorrect orientation,Minor in-service problems +TRN-WO00329,worn out,Structural deficiency +TRN-WO00330,signal error,Abnormal instrument reading +TRN-WO00331,earth fault,Electrical 
+TRN-WO00332,unserviceable pump,Breakdown +TRN-WO00333,not attached,Minor in-service problems +TRN-WO00334,leaked out,Leaking +TRN-WO00335,shorting out,Electrical +TRN-WO00336,tripping on overload,Electrical +TRN-WO00337,broken,Breakdown +TRN-WO00338,heavy corrosion,Structural deficiency +TRN-WO00339,unable to reach,Low output +TRN-WO00340,wont open,Fail to function +TRN-WO00341,lealing,Leaking +TRN-WO00342,needs another row,Minor in-service problems +TRN-WO00343,down to earth,Electrical +TRN-WO00344,very corroded,Structural deficiency +TRN-WO00345,will not start after flushing,Failure to start on demand +TRN-WO00346,slurry spraying out,Leaking +TRN-WO00347,badly corr,Structural deficiency +TRN-WO00348,covered,Minor in-service problems +TRN-WO00349,requires a descale,Minor in-service problems +TRN-WO00350,starting to crack,Structural deficiency +TRN-WO00351,crack in transition,Structural deficiency +TRN-WO00352,constantly losing,Leaking +TRN-WO00353,continually trips out,Electrical +TRN-WO00354,dirty,Minor in-service problems +TRN-WO00355,worn again,Structural deficiency +TRN-WO00356,broke,Breakdown +TRN-WO00357,needs to be flushed,Minor in-service problems +TRN-WO00358,rusted off,Structural deficiency +TRN-WO00359,stopped working,Spurious stop +TRN-WO00360,stuck,Fail to function +TRN-WO00361,appears loose,Minor in-service problems +TRN-WO00362,very high vibration,Vibration +TRN-WO00363,cannot start,Failure to start on demand +TRN-WO00364,not secured,Minor in-service problems +TRN-WO00365,holed,Structural deficiency +TRN-WO00366,jammed closed,Plugged / choked +TRN-WO00367,tripping cause,Electrical +TRN-WO00368,coroded,Structural deficiency +TRN-WO00369,needs to be dismantled,Minor in-service problems +TRN-WO00370,missing off,Minor in-service problems +TRN-WO00371,knocking,Noise +TRN-WO00372,slipping,Minor in-service problems +TRN-WO00373,tripping on,Electrical +TRN-WO00374,caught in suction,Minor in-service problems +TRN-WO00375,defect notice,Structural deficiency 
+TRN-WO00376,will not stay running,Spurious stop +TRN-WO00377,come off,Minor in-service problems +TRN-WO00378,tripped out,Electrical +TRN-WO00379,not accurate,Abnormal instrument reading +TRN-WO00380,loose,Minor in-service problems +TRN-WO00381,has crack,Structural deficiency +TRN-WO00382,congested,Plugged / choked +TRN-WO00383,shows offline while running in field,Abnormal instrument reading +TRN-WO00384,showing signs of failure,Other +TRN-WO00385,passing,Leaking +TRN-WO00386,broken electrical,Breakdown +TRN-WO00387,spraying out,Leaking +TRN-WO00388,needs freeing up,Plugged / choked +TRN-WO00389,wont start,Failure to start on demand +TRN-WO00390,high earth resistance,Abnormal instrument reading +TRN-WO00391,pumping at high out put,High output +TRN-WO00392,pinholed,Minor in-service problems +TRN-WO00393,poor flow,Low output +TRN-WO00394,slipped off,Minor in-service problems +TRN-WO00395,faulty,Electrical +TRN-WO00396,needs scope,Minor in-service problems +TRN-WO00397,colapsed,Breakdown +TRN-WO00398,will not run,Breakdown +TRN-WO00399,exposed,Minor in-service problems +TRN-WO00400,losing signal,Electrical +TRN-WO00401,misaligning,Minor in-service problems +TRN-WO00402,out of position,Minor in-service problems +TRN-WO00403,need replacing,Breakdown +TRN-WO00404,not running in,Fail to function +TRN-WO00405,has split,Structural deficiency +TRN-WO00406,snapped off,Structural deficiency +TRN-WO00407,noisyvibrating,Noise +TRN-WO00408,filled,Leaking +TRN-WO00409,fail,Breakdown +TRN-WO00410,fails to open in sequence,Fail to open +TRN-WO00411,looseness,Minor in-service problems +TRN-WO00412,too high,High output +TRN-WO00413,blowing,Minor in-service problems +TRN-WO00414,running when empty,Other +TRN-WO00415,hot,Overheating +TRN-WO00416,fluctuates at high rate,Erratic output +TRN-WO00417,not reading right,Abnormal instrument reading +TRN-WO00418,sitting on hot,Overheating +TRN-WO00419,not able to maintain level,Plugged / choked +TRN-WO00420,dislodged,Minor in-service problems 
+TRN-WO00421,turning in reverse,Fail to function +TRN-WO00422,no out put,Plugged / choked +TRN-WO00423,leaky,Leaking +TRN-WO00424,tripping out on run up,Electrical +TRN-WO00425,need replaceing,Breakdown +TRN-WO00426,not pump,Plugged / choked +TRN-WO00427,will not run in,Breakdown +TRN-WO00428,needs rebuild,Breakdown +TRN-WO00429,leakingreplace,Leaking +TRN-WO00430,stopped abruptly,Spurious stop +TRN-WO00431,brittle,Structural deficiency +TRN-WO00432,needs securing,Minor in-service problems +TRN-WO00433,dented,Structural deficiency +TRN-WO00434,vibrationlaser,Vibration +TRN-WO00435,inconsistant flow,Plugged / choked +TRN-WO00436,needs re build,Breakdown +TRN-WO00437,unserviceable please replace,Breakdown +TRN-WO00438,not connectedblanked,Minor in-service problems +TRN-WO00439,earthing,Electrical +TRN-WO00440,no indication,Minor in-service problems +TRN-WO00441,stripped,Structural deficiency +TRN-WO00442,needs inspecting,Minor in-service problems +TRN-WO00443,snaped,Structural deficiency +TRN-WO00444,didnt trip,Electrical +TRN-WO00445,blow out,Minor in-service problems +TRN-WO00446,unserviceable leak sulphur,Breakdown +TRN-WO00447,tripping issue,Electrical +TRN-WO00448,unable to reset,Failure to stop on demand +TRN-WO00449,need modifying,Minor in-service problems +TRN-WO00450,tripping out,Electrical +TRN-WO00451,requires topupstop,Minor in-service problems +TRN-WO00452,dragged,Minor in-service problems +TRN-WO00453,not performingpumping,Breakdown +TRN-WO00454,getting jammed,Plugged / choked +TRN-WO00455,failed restest,Failure to start on demand +TRN-WO00456,starting issue,Failure to start on demand +TRN-WO00457,wrong range,Abnormal instrument reading +TRN-WO00458,may be failing,Fail to function +TRN-WO00459,stretched,Minor in-service problems +TRN-WO00460,does dont pump at,Plugged / choked +TRN-WO00461,stuck in open position,Fail to function +TRN-WO00462,seize,Breakdown +TRN-WO00463,hole in,Structural deficiency +TRN-WO00464,snappedplugged,Structural deficiency 
+TRN-WO00465,consumeing,Leaking +TRN-WO00466,keeps vibrating shut,Vibration +TRN-WO00467,not able to pump,Plugged / choked +TRN-WO00468,needs a descale,Minor in-service problems +TRN-WO00469,passing u,Leaking +TRN-WO00470,out of adjustment,Minor in-service problems +TRN-WO00471,failure,Breakdown +TRN-WO00472,needs reset,Electrical +TRN-WO00473,rupture,Structural deficiency +TRN-WO00474,vibrating,Vibration +TRN-WO00475,rubbing,Vibration +TRN-WO00476,leakingneed,Leaking +TRN-WO00477,playingup,Erratic output +TRN-WO00478,broken into,Breakdown +TRN-WO00479,doesnt pump to,Plugged / choked +TRN-WO00480,corrosion,Structural deficiency +TRN-WO00481,squealing,Noise +TRN-WO00482,noyt pumping,Breakdown +TRN-WO00483,fault tripping,Electrical +TRN-WO00484,needs to be repacked,Minor in-service problems +TRN-WO00485,will not turn,Fail to function +TRN-WO00486,grinding,Noise +TRN-WO00487,rusted,Structural deficiency +TRN-WO00488,trips on overload,Electrical +TRN-WO00489,stopping,Spurious stop +TRN-WO00490,split in two place,Structural deficiency +TRN-WO00491,hanging,Other +TRN-WO00492,needs fixing,Minor in-service problems +TRN-WO00493,detached,Minor in-service problems +TRN-WO00494,brokenmissing,Breakdown +TRN-WO00495,need repacking,Minor in-service problems +TRN-WO00496,wont pump to,Breakdown +TRN-WO00497,sce fault,Fail to function +TRN-WO00498,wont run,Failure to start on demand +TRN-WO00499,requires modifying,Minor in-service problems +TRN-WO00500,requires replacement,Minor in-service problems +TRN-WO00501,triped,Electrical +TRN-WO00502,stopping randomly,Spurious stop +TST-WO00001,ejected, +TST-WO00002,sticking shu, +TST-WO00003,not turning, +TST-WO00004,has failed, +TST-WO00005,needs fitting, +TST-WO00006,runs for a while and trip, +TST-WO00007,very stiff to operate, +TST-WO00008,requires rebuild, +TST-WO00009,has no equipment earth, +TST-WO00010,require tighteninginspecti, +TST-WO00011,does not work, +TST-WO00012,runs continuously, +TST-WO00013,severe corrosion, 
+TST-WO00014,contamination,
+TST-WO00015,doesnt trip,
+TST-WO00016,ripped,
+TST-WO00017,unserviceable not in,
+TST-WO00018,cant be adjusted,
+TST-WO00019,worn,
+TST-WO00020,misalignment,
+TST-WO00021,switched o,
+TST-WO00022,to be torqued,
+TST-WO00023,sped up,
+TST-WO00024,unable to pump,
+TST-WO00025,spillage,
+TST-WO00026,spraying out slurry,
+TST-WO00027,jammed open,
+TST-WO00028,no pump,
+TST-WO00029,not truning,
+TST-WO00030,unable to tighten,
+TST-WO00031,burnt,
+TST-WO00032,hot joint,
+TST-WO00033,burst,
+TST-WO00034,pumping fault,
+TST-WO00035,not controlling,
+TST-WO00036,surging cutting in and out,
+TST-WO00037,failed electrical,
+TST-WO00038,loose tighten,
+TST-WO00039,high earth reading,
+TST-WO00040,cavitating,
+TST-WO00041,requires repack,
+TST-WO00042,needs support,
+TST-WO00043,needs maintenance,
+TST-WO00044,not spinning,
+TST-WO00045,dont activate,
+TST-WO00046,unserviceable b feluwa,
+TST-WO00047,no power,
+TST-WO00048,requires cleaning,
+TST-WO00049,doesnt work,
+TST-WO00050,vibrationlubealignment,
+TST-WO00051,blockage,
+TST-WO00052,ongoing issue,
+TST-WO00053,poorly supported,
+TST-WO00054,rocking around,
+TST-WO00055,siezeddescale,
+TST-WO00056,dropped,
+TST-WO00057,spraying,
+TST-WO00058,not covered,
+TST-WO00059,hot jointfurn,
+TST-WO00060,running dry,
+TST-WO00061,smoking up,
+TST-WO00062,jammed,
diff --git a/src/evaluation/loader.py b/src/evaluation/loader.py
index 31b9c761..432c11fe 100644
--- a/src/evaluation/loader.py
+++ b/src/evaluation/loader.py
@@ -54,6 +54,21 @@ def load_scenarios(paths: Iterable[Path] | Path) -> list[Scenario]:
     return out
 
 
+def _coerce_scenario(item: dict, index: int) -> Scenario:
+    """Build a Scenario, assigning a positional fallback id when none is set.
+
+    Scenario files may omit ``id`` (e.g. draft utterance banks); a stable
+    1-based index keeps the join key populated without baking ids into the
+    on-disk file.
+
+    Only a missing, ``None`` or empty ``id`` triggers the fallback: an explicit
+    falsy id such as ``0`` is kept. The input mapping is copied, not mutated.
+    """
+    if item.get("id") in (None, ""):
+        item = {**item, "id": index}
+    return Scenario.from_raw(item)
+
+
 def _load_scenario_file(path: Path) -> list[Scenario]:
     text = path.read_text(encoding="utf-8").strip()
     if not text:
@@ -61,16 +76,17 @@ def _load_scenario_file(path: Path) -> list[Scenario]:
         return []
 
     if path.suffix == ".jsonl":
         return [
-            Scenario.from_raw(json.loads(line))
-            for line in text.splitlines()
-            if line.strip()
+            _coerce_scenario(json.loads(line), i)
+            for i, line in enumerate(
+                (ln for ln in text.splitlines() if ln.strip()), start=1
+            )
         ]
 
     raw = json.loads(text)
     if isinstance(raw, list):
-        return [Scenario.from_raw(item) for item in raw]
+        return [_coerce_scenario(item, i) for i, item in enumerate(raw, start=1)]
     if isinstance(raw, dict):
-        return [Scenario.from_raw(raw)]
+        return [_coerce_scenario(raw, 1)]
 
     raise ValueError(f"unexpected scenario JSON shape in {path}: {type(raw).__name__}")
diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py
index 24260b34..e40d6cf5 100644
--- a/src/evaluation/tests/test_loader.py
+++ b/src/evaluation/tests/test_loader.py
@@ -56,6 +56,29 @@ def test_load_scenarios_single_object(tmp_path: Path):
     assert [s.id for s in out] == ["7"]
 
 
+_SCENARIOS_LOCAL = Path(__file__).resolve().parents[2] / "scenarios" / "local"
+
+
+def test_workorder_scenarios_load_and_conform():
+    """The bundled work order scenarios parse and carry the expected schema."""
+    path = _SCENARIOS_LOCAL / "workorder_utterance.json"
+    scenarios = load_scenarios(path)
+
+    assert len(scenarios) >= 5
+    assert all(isinstance(s, Scenario) for s in scenarios)
+    # Every scenario is a work order scenario with a non-empty question and rubric.
+    for s in scenarios:
+        assert s.type == "WorkOrder"
+        assert s.text.strip()
+        assert s.category.strip()
+        assert s.characteristic_form and s.characteristic_form.strip()
+    # IDs are unique (positional fallback when omitted on disk) and at least
+    # one scenario targets failure-code imputation.
+ ids = [s.id for s in scenarios] + assert len(ids) == len(set(ids)) + assert any(s.category == "Failure Code Imputation" for s in scenarios) + + def test_join_drops_orphans(make_persisted_record): from evaluation.models import PersistedTrajectory diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json new file mode 100644 index 00000000..e3f47e08 --- /dev/null +++ b/src/scenarios/local/workorder_utterance.json @@ -0,0 +1,248 @@ +[ + { + "type": "WorkOrder", + "text": "Pull up work order TST-WO00032. If no failure code was recorded, suggest the single failure code from our 10-code list (Breakdown, Electrical, Fail to function, Leaking, Low output, Minor in-service problems, Overheating, Plugged / choked, Structural deficiency, Vibration). If a failure code is already recorded, return the existing code as-is.", + "category": "Failure Code Imputation", + "characteristic_form": "The expected response should retrieve work order TST-WO00032 (description 'hot joint'), observe that its failure_code is blank, and suggest a single code from the supplied 10-code list. The correct suggestion is 'Overheating'.", + "expected_answer": "Overheating", + "scoring_method": "exact_string_match", + "metadata": { + "scenario_label": "S1", + "task": "single-record-fill", + "subtitle": "failure-code suggestion", + "target_work_orders": [ + "TST-WO00032" + ], + "write_back": false, + "candidate_codes": [ + "Breakdown", + "Electrical", + "Fail to function", + "Leaking", + "Low output", + "Minor in-service problems", + "Overheating", + "Plugged / choked", + "Structural deficiency", + "Vibration" + ], + "data_sources": [ + "fmc_test_wo.csv" + ], + "gold": { + "wo_id": "TST-WO00032", + "failure_code": "Overheating" + }, + "gold_source": "fmc_test_answer_key.csv", + "expected_tools": [ + "get_work_order_failure_codes" + ] + } + }, + { + "type": "WorkOrder", + "text": "Pull up work order TST-WO00054. 
If no failure code was recorded, pick the single failure code from our 10-code list (Breakdown, Electrical, Fail to function, Leaking, Low output, Minor in-service problems, Overheating, Plugged / choked, Structural deficiency, Vibration) and write it back to the record. If a failure code is already recorded, leave the record as-is. Confirm the final value on the record.", + "category": "Failure Code Imputation", + "characteristic_form": "The expected response should retrieve work order TST-WO00054 (description 'rocking around'), observe that its failure_code is blank, pick the single best code from the supplied 10-code list, write it back to the record, and confirm the persisted value. The correct code is 'Vibration'.", + "expected_answer": "Vibration", + "scoring_method": "exact_string_match", + "metadata": { + "scenario_label": "S2", + "task": "single-record-fill + write-back", + "subtitle": "failure-code suggestion and imputation", + "target_work_orders": [ + "TST-WO00054" + ], + "write_back": true, + "candidate_codes": [ + "Breakdown", + "Electrical", + "Fail to function", + "Leaking", + "Low output", + "Minor in-service problems", + "Overheating", + "Plugged / choked", + "Structural deficiency", + "Vibration" + ], + "data_sources": [ + "fmc_test_wo.csv" + ], + "verifier": "Verify the DB record was updated to the imputed failure_code.", + "gold": { + "wo_id": "TST-WO00054", + "failure_code": "Vibration" + }, + "gold_source": "fmc_test_answer_key.csv", + "expected_tools": [ + "get_work_order_failure_codes", + "set_work_order_failure_codes" + ] + } + }, + { + "type": "WorkOrder", + "text": "Look across our historical work orders and rank the top three failure codes by record count. 
Reply with only a JSON array of exactly three objects, ordered 1, 2, 3, in this shape:\n[\n {\"rank\": 1, \"failure_code\": \"\", \"count\": },\n {\"rank\": 2, \"failure_code\": \"\", \"count\": },\n {\"rank\": 3, \"failure_code\": \"\", \"count\": }\n]\nUse the failure code spelling exactly as it appears in the data. If two codes are tied at the rank-3 cut-off, include both at \"rank\": 3 and add a fourth object {\"tie_at_rank_3\": true} at the end of the array. Output nothing outside the JSON array.", + "category": "Distribution Analysis", + "characteristic_form": "The expected response should call get_failure_code_distribution (which ranks every recorded/labelled failure code by record count) and return the top three as a JSON array of {rank, failure_code, count} objects, codes spelled exactly as in the data. In the baseline state the labelled records are the historical work orders, so the correct answer is Minor in-service problems (109), Breakdown (70), Structural deficiency (58).", + "expected_answer": "[{\"rank\": 1, \"failure_code\": \"Minor in-service problems\", \"count\": 109}, {\"rank\": 2, \"failure_code\": \"Breakdown\", \"count\": 70}, {\"rank\": 3, \"failure_code\": \"Structural deficiency\", \"count\": 58}]", + "scoring_method": "json_match", + "metadata": { + "scenario_label": "S3", + "task": "history-distribution", + "subtitle": "top-3 failure codes in the historical work orders", + "write_back": false, + "data_sources": [ + "fmc_train_wo.csv" + ], + "response_format": "json_array", + "gold": [ + { + "rank": 1, + "failure_code": "Minor in-service problems", + "count": 109 + }, + { + "rank": 2, + "failure_code": "Breakdown", + "count": 70 + }, + { + "rank": 3, + "failure_code": "Structural deficiency", + "count": 58 + } + ], + "gold_source": "fmc_train_wo.csv (verified by record count)", + "expected_tools": [ + "get_failure_code_distribution" + ], + "assumes_state": "test/target records still blank (only historical records labelled)" + } + }, + { 
+ "type": "WorkOrder", + "text": "Learn the description-to-failure-code patterns from our historical work orders. Then go through work orders TST-WO00001 through TST-WO00010 and, for each one with a blank failure code, pick a single failure code from the set of codes used in history and write it back to the record. If a failure code is already recorded, leave that record as-is. Reply with only a JSON array of exactly 10 objects in wo_id order, in this shape:\n[\n {\"wo_id\": \"TST-WO00001\", \"failure_code\": \"\"},\n {\"wo_id\": \"TST-WO00002\", \"failure_code\": \"\"},\n ...\n {\"wo_id\": \"TST-WO00010\", \"failure_code\": \"\"}\n]\nUse the failure code spelling exactly as it appears in the data. Output nothing outside the JSON array.", + "category": "Failure Code Imputation", + "characteristic_form": "The expected response should learn description-to-code patterns from the historical work orders, then impute and write back a single failure code for each of TST-WO00001 through TST-WO00010, returning a 10-object JSON array in wo_id order. 
Per-record gold is taken from the answer key.", + "expected_answer": "[{\"wo_id\": \"TST-WO00001\", \"failure_code\": \"Minor in-service problems\"}, {\"wo_id\": \"TST-WO00002\", \"failure_code\": \"Fail to function\"}, {\"wo_id\": \"TST-WO00003\", \"failure_code\": \"Plugged / choked\"}, {\"wo_id\": \"TST-WO00004\", \"failure_code\": \"Breakdown\"}, {\"wo_id\": \"TST-WO00005\", \"failure_code\": \"Minor in-service problems\"}, {\"wo_id\": \"TST-WO00006\", \"failure_code\": \"Electrical\"}, {\"wo_id\": \"TST-WO00007\", \"failure_code\": \"Fail to function\"}, {\"wo_id\": \"TST-WO00008\", \"failure_code\": \"Minor in-service problems\"}, {\"wo_id\": \"TST-WO00009\", \"failure_code\": \"Electrical\"}, {\"wo_id\": \"TST-WO00010\", \"failure_code\": \"Minor in-service problems\"}]", + "scoring_method": "json_match", + "metadata": { + "scenario_label": "S4", + "task": "batch-fill + write-back", + "subtitle": "failure-code imputation from history", + "target_work_orders": [ + "TST-WO00001", + "TST-WO00002", + "TST-WO00003", + "TST-WO00004", + "TST-WO00005", + "TST-WO00006", + "TST-WO00007", + "TST-WO00008", + "TST-WO00009", + "TST-WO00010" + ], + "write_back": true, + "learning_source": "fmc_train_wo.csv", + "data_sources": [ + "fmc_train_wo.csv", + "fmc_test_wo.csv" + ], + "response_format": "json_array", + "gold": [ + { + "wo_id": "TST-WO00001", + "failure_code": "Minor in-service problems" + }, + { + "wo_id": "TST-WO00002", + "failure_code": "Fail to function" + }, + { + "wo_id": "TST-WO00003", + "failure_code": "Plugged / choked" + }, + { + "wo_id": "TST-WO00004", + "failure_code": "Breakdown" + }, + { + "wo_id": "TST-WO00005", + "failure_code": "Minor in-service problems" + }, + { + "wo_id": "TST-WO00006", + "failure_code": "Electrical" + }, + { + "wo_id": "TST-WO00007", + "failure_code": "Fail to function" + }, + { + "wo_id": "TST-WO00008", + "failure_code": "Minor in-service problems" + }, + { + "wo_id": "TST-WO00009", + "failure_code": "Electrical" + }, + { + 
"wo_id": "TST-WO00010", + "failure_code": "Minor in-service problems" + } + ], + "gold_source": "fmc_test_answer_key.csv", + "expected_tools": [ + "list_work_order_failure_codes", + "get_work_order_failure_codes", + "set_work_order_failure_codes" + ], + "learning_via": "list_work_order_failure_codes(labeled=True)", + "write_hint": "write all 10 imputations in one set_work_order_failure_codes call" + } + }, + { + "type": "WorkOrder", + "text": "Learn the description-to-failure-code patterns from our historical work orders. Then go through every test work order with a blank failure code, pick a single failure code from the set of codes used in history, and write it back to the record. After every blank has been written back, rank the top three failure codes by record count across just the records you filled. Reply with only a JSON array of exactly 3 strings — the failure codes ranked 1, 2, 3 — using the failure code spelling exactly as it appears in the data. Output nothing outside the JSON array.\n[\"\", \"\", \"\"]", + "category": "Distribution Analysis", + "characteristic_form": "The expected response should list the blank records via list_work_order_failure_codes(labeled=False), impute and write back a single failure code for each via set_work_order_failure_codes, then rank the top three failure codes by count across the records it imputed (computed from its own write set — get_failure_code_distribution ranks ALL recorded codes, including history, so it cannot isolate the filled subset). Correct: Minor in-service problems (17), Breakdown (7), then a rank-3 tie between Electrical and Plugged / choked at count 6 — either is accepted at position 3. 
The write-back is verified independently against the answer key.", + "expected_answer": "[\"Minor in-service problems\", \"Breakdown\", \"Electrical\"]", + "scoring_method": "json_match", + "metadata": { + "scenario_label": "S5", + "task": "full-batch fill + top-3 distribution", + "subtitle": "impute all missing failure codes and rank", + "target_work_orders": "all blank-failure_code records in fmc_test_wo.csv (62 records, TST-WO00001 through TST-WO00062)", + "write_back": true, + "learning_source": "fmc_train_wo.csv", + "data_sources": [ + "fmc_train_wo.csv", + "fmc_test_wo.csv" + ], + "response_format": "json_array_of_strings", + "gold": [ + "Minor in-service problems", + "Breakdown", + "Electrical" + ], + "accepted_alternatives": [ + [ + "Minor in-service problems", + "Breakdown", + "Plugged / choked" + ] + ], + "tie_note": "Rank 3 is a tie between Electrical and Plugged / choked at count 6; either is accepted at position 3.", + "verification_note": "The 3-element list is scored, but the per-record write-back is verified independently against fmc_test_answer_key.csv (e.g. score(fill) in fmc_analytics_scenarios.py). 
def write_failure_codes(updates: Dict[str, Optional[str]]) -> Optional[Dict[str, bool]]:
    """Persist failure codes onto multiple ``wo_fmc`` records in one round-trip.

    *updates* maps ``wo_id`` → ``failure_code`` (``None`` blanks the field).
    Fetches all targets with a single ``$in`` query and writes them with one
    ``bulk_docs`` call.

    Args:
        updates: Mapping of work-order id to the failure code to store.

    Returns:
        ``{}`` when *updates* is empty (no database access is attempted);
        ``None`` when the database handle is unavailable or any step raises;
        otherwise a ``{wo_id: updated}`` map in which ``False`` marks an id
        with no matching record *or* a document whose write was rejected in
        the bulk response.

    The cached ``wo_fmc`` DataFrame is invalidated whenever a bulk write was
    issued.
    """
    if not updates:
        return {}
    db = _get_db()
    if db is None:
        return None
    try:
        result = db.find(
            selector={"dataset": {"$eq": "wo_fmc"}, "wo_id": {"$in": list(updates)}},
            limit=len(updates) + 1,
        )
        by_id = {doc["wo_id"]: doc for doc in result.get("docs", [])}
        status: Dict[str, bool] = {}
        to_save = []
        for wo_id, failure_code in updates.items():
            doc = by_id.get(wo_id)
            if doc is None:
                status[wo_id] = False
                continue
            doc["failure_code"] = failure_code
            to_save.append(doc)
            status[wo_id] = True
        if to_save:
            outcome = db.bulk_docs(to_save)
            # CouchDB's _bulk_docs reports per-document failures (e.g. a
            # revision conflict) as {"id": ..., "error": ...} entries in the
            # response body rather than by raising, so the response has to be
            # inspected before a record can be reported as updated.
            # NOTE(review): assumes the client returns that response list
            # unchanged; any other return shape is treated as "no rejections".
            rejected = {
                entry["id"]: entry["error"]
                for entry in (outcome if isinstance(outcome, list) else [])
                if isinstance(entry, dict)
                and entry.get("error")
                and entry.get("id") is not None
            }
            if rejected:
                for doc in to_save:
                    reason = rejected.get(doc.get("_id"))
                    if reason is not None:
                        status[doc["wo_id"]] = False
                        logger.error(
                            "CouchDB rejected failure-code write for %s: %s",
                            doc["wo_id"],
                            reason,
                        )
            # Some documents may have been written even if others were
            # rejected, so the cached frame is always dropped after a write.
            _dataset_cache.pop("wo_fmc", None)
        return status
    except Exception as exc:
        logger.error("Failed to write failure codes for %s: %s", list(updates), exc)
        return None


def write_failure_code(wo_id: str, failure_code: Optional[str]) -> Optional[bool]:
    """Persist a single *failure_code* onto the ``wo_fmc`` record *wo_id*.

    Thin wrapper over :func:`write_failure_codes`. Returns ``True`` on update,
    ``False`` when the record was not updated (no matching record, or the
    write was rejected), ``None`` when the batch call itself returned ``None``.
    """
    result = write_failure_codes({wo_id: failure_code})
    if result is None:
        return None
    return result.get(wo_id, False)
This is distinct from the +equipment-keyed ``wo_events`` dataset used by the other work-order tools, which +classifies on structured ``MTxxx`` primary/secondary codes. + +The workflow they support: read a work order, learn description-to-code +patterns from the already-labelled records, impute a failure code, write it +back, and rank failure codes by frequency. + +Records are filtered by whether a failure code has been recorded +(``labeled``) rather than by any train/test tag — the ``wo_fmc`` dataset +carries no such tag. +""" + +from collections import Counter +from typing import List, Optional, Union + +import pandas as pd + +from .data import load, write_failure_codes +from .models import ( + ErrorResult, + FmcBatchWriteResult, + FmcCodeAssignment, + FmcCodeCount, + FmcCodeDistributionResult, + FmcWorkOrder, + FmcWorkOrdersResult, + FmcWriteEntry, +) + +_FMC_DATASET = "wo_fmc" + + +def _code(value) -> Optional[str]: + """Normalise a ``failure_code`` cell to a non-empty string, or ``None``.""" + if value is None or (isinstance(value, float) and pd.isna(value)): + return None + text = str(value).strip() + return text or None + + +def _is_labeled(value) -> bool: + """True when the record has a recorded (non-blank) failure code.""" + return _code(value) is not None + + +def get_work_order_failure_codes(wo_ids: List[str]) -> Union[FmcWorkOrdersResult, ErrorResult]: + """Retrieve one or more failure-mode work orders by ``wo_id``. + + Returns each work order's free-text description and recorded failure code + (null when none has been recorded yet), in the requested order. Pass a + one-element list to fetch a single record, or many to pull a batch at once + (preferred over many single calls). Any ids with no matching record are + reported in ``missing``. + + Args: + wo_ids: Work order identifiers, e.g. ``["TST-WO00032"]`` or + ``["TST-WO00001", "TST-WO00002"]``. 
def list_work_order_failure_codes(
    labeled: Optional[bool] = None,
) -> Union[FmcWorkOrdersResult, ErrorResult]:
    """List failure-mode work orders with their descriptions and recorded codes.

    Use ``labeled=True`` for the records that already have a failure code (to
    learn description-to-code patterns), ``labeled=False`` for the blank
    records still to be classified, or omit it (default) for everything.

    Args:
        labeled: If ``True``, only records with a recorded failure code; if
            ``False``, only records with a blank failure code; if omitted, all.

    Returns:
        An ``FmcWorkOrdersResult`` with the selected records and
        labelled/unlabelled counts, or an ``ErrorResult`` when the dataset
        cannot be loaded or no record passes the filter.
    """
    frame = load(_FMC_DATASET)
    if frame is None:
        return ErrorResult(error="FMC work order data not available")

    selected: List[FmcWorkOrder] = []
    for _, record in frame.iterrows():
        recorded = _code(record.get("failure_code"))
        has_code = recorded is not None
        # Identity comparisons on purpose: only the literal True / False
        # values filter; anything else (including None) keeps every record.
        excluded = (labeled is True and not has_code) or (labeled is False and has_code)
        if excluded:
            continue
        selected.append(
            FmcWorkOrder(
                wo_id=str(record["wo_id"]),
                description=str(record.get("description", "") or ""),
                failure_code=recorded,
            )
        )

    if not selected:
        return ErrorResult(error="No matching work orders found")

    with_code = sum(1 for wo in selected if wo.failure_code is not None)
    without_code = len(selected) - with_code
    summary = (
        f"Found {len(selected)} work order(s) "
        f"({with_code} labelled, {without_code} unlabelled)."
    )
    return FmcWorkOrdersResult(
        total=len(selected),
        labeled=with_code,
        unlabeled=without_code,
        work_orders=selected,
        message=summary,
    )
def get_failure_code_distribution(
    top_n: Optional[int] = None,
) -> Union[FmcCodeDistributionResult, ErrorResult]:
    """Rank recorded failure codes by record count across the failure-mode dataset.

    Counts every record that has a recorded failure code, sorted by count
    descending. (Blank records are ignored, so this ranks the labelled
    population.)

    Args:
        top_n: If given, return only the top N codes. Must be at least 1.

    Returns:
        An ``FmcCodeDistributionResult`` with the ranked codes, the total
        number of records and the number of labelled records; or an
        ``ErrorResult`` when *top_n* is not positive, the dataset cannot be
        loaded, or no record carries a failure code.
    """
    # A zero or negative cut-off would otherwise yield an empty ranking with a
    # success message; reject it up front like the other tools reject empty
    # input.
    if top_n is not None and top_n < 1:
        return ErrorResult(error="top_n must be a positive integer")

    df = load(_FMC_DATASET)
    if df is None:
        return ErrorResult(error="FMC work order data not available")
    codes = [c for c in (_code(v) for v in df.get("failure_code", [])) if c is not None]
    if not codes:
        return ErrorResult(error="No recorded failure codes found")

    counts = Counter(codes)
    # most_common(None) returns every code; ties keep first-seen order.
    ranked = counts.most_common(top_n)
    distribution = [FmcCodeCount(failure_code=code, count=count) for code, count in ranked]
    return FmcCodeDistributionResult(
        total_records=int(len(df)),
        labeled_records=len(codes),
        distribution=distribution,
        message=(
            f"Ranked {len(distribution)} failure code(s) across "
            f"{len(codes)} labelled record(s)."
        ),
    )
import fmc_tools, tools # noqa: E402 _TOOLS = [ (tools.get_work_orders, "Get Work Orders"), @@ -30,6 +30,10 @@ (tools.get_work_order_distribution, "Get Work Order Distribution"), (tools.predict_next_work_order, "Predict Next Work Order"), (tools.analyze_alert_to_failure, "Analyze Alert to Failure"), + (fmc_tools.get_work_order_failure_codes, "Get Work Order Failure Codes"), + (fmc_tools.list_work_order_failure_codes, "List Work Order Failure Codes"), + (fmc_tools.set_work_order_failure_codes, "Set Work Order Failure Codes"), + (fmc_tools.get_failure_code_distribution, "Get Failure Code Distribution"), ] for _fn, _title in _TOOLS: mcp.tool(title=_title)(_fn) diff --git a/src/servers/wo/models.py b/src/servers/wo/models.py index e962282e..dfd341f6 100644 --- a/src/servers/wo/models.py +++ b/src/servers/wo/models.py @@ -1,7 +1,7 @@ """Pydantic result models for the Work Order MCP server.""" from typing import List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field class ErrorResult(BaseModel): @@ -146,3 +146,55 @@ class AlertToFailureResult(BaseModel): total_alerts_analyzed: int transitions: List[AlertToFailureEntry] message: str + + +# --------------------------------------------------------------------------- +# Failure-mode classification (wo_fmc dataset) +# --------------------------------------------------------------------------- + + +class FmcWorkOrder(BaseModel): + wo_id: str + description: str + failure_code: Optional[str] + + +class FmcWorkOrdersResult(BaseModel): + total: int + labeled: int + unlabeled: int + work_orders: List[FmcWorkOrder] + missing: List[str] = Field(default_factory=list) + message: str + + +class FmcCodeAssignment(BaseModel): + """A single (wo_id, failure_code) write request.""" + + wo_id: str + failure_code: str + + +class FmcWriteEntry(BaseModel): + wo_id: str + failure_code: str + updated: bool + + +class FmcBatchWriteResult(BaseModel): + total: int + updated: int + results: List[FmcWriteEntry] + message: 
str + + +class FmcCodeCount(BaseModel): + failure_code: str + count: int + + +class FmcCodeDistributionResult(BaseModel): + total_records: int + labeled_records: int + distribution: List[FmcCodeCount] + message: str diff --git a/src/servers/wo/tests/test_fmc_tools.py b/src/servers/wo/tests/test_fmc_tools.py new file mode 100644 index 00000000..55136dcf --- /dev/null +++ b/src/servers/wo/tests/test_fmc_tools.py @@ -0,0 +1,212 @@ +"""Unit tests for the failure-mode classification (wo_fmc) tools.""" + +from unittest.mock import patch + +import pandas as pd +import pytest + +from servers.wo import fmc_tools +from servers.wo.models import ( + ErrorResult, + FmcBatchWriteResult, + FmcCodeAssignment, + FmcCodeDistributionResult, + FmcWorkOrdersResult, +) + + +def _make_fmc_df() -> pd.DataFrame: + return pd.DataFrame( + { + "wo_id": [ + "TRN-WO00001", + "TRN-WO00002", + "TRN-WO00003", + "TRN-WO00004", + "TST-WO00001", + "TST-WO00002", + ], + "description": [ + "falure", + "unserviceable", + "bogged", + "leaking seal", + "ejected", + "hot joint", + ], + "failure_code": [ + "Breakdown", + "Breakdown", + "Plugged / choked", + "Leaking", + None, + None, + ], + } + ) + + +@pytest.fixture +def mock_load(): + with patch("servers.wo.fmc_tools.load", side_effect=lambda key: _make_fmc_df() if key == "wo_fmc" else None): + yield + + +# --- get_work_order_failure_codes ------------------------------------------ + + +def test_get_single_in_list(mock_load): + res = fmc_tools.get_work_order_failure_codes(["TRN-WO00001"]) + assert isinstance(res, FmcWorkOrdersResult) + assert res.total == 1 + wo = res.work_orders[0] + assert wo.wo_id == "TRN-WO00001" + assert wo.description == "falure" + assert wo.failure_code == "Breakdown" + assert res.missing == [] + + +def test_get_blank_record_has_null_code(mock_load): + res = fmc_tools.get_work_order_failure_codes(["TST-WO00001"]) + assert isinstance(res, FmcWorkOrdersResult) + assert res.work_orders[0].failure_code is None + + +def 
test_get_batch_preserves_order(mock_load): + res = fmc_tools.get_work_order_failure_codes(["TST-WO00001", "TRN-WO00003", "TRN-WO00001"]) + assert isinstance(res, FmcWorkOrdersResult) + assert [wo.wo_id for wo in res.work_orders] == ["TST-WO00001", "TRN-WO00003", "TRN-WO00001"] + assert res.labeled == 2 + assert res.unlabeled == 1 + + +def test_get_batch_reports_missing(mock_load): + res = fmc_tools.get_work_order_failure_codes(["TRN-WO00001", "TST-WO99999"]) + assert isinstance(res, FmcWorkOrdersResult) + assert res.total == 1 + assert res.missing == ["TST-WO99999"] + + +def test_get_all_missing(mock_load): + res = fmc_tools.get_work_order_failure_codes(["TST-WO99999"]) + assert isinstance(res, ErrorResult) + + +def test_get_empty_list_rejected(mock_load): + res = fmc_tools.get_work_order_failure_codes([]) + assert isinstance(res, ErrorResult) + + +def test_get_no_data(): + with patch("servers.wo.fmc_tools.load", return_value=None): + res = fmc_tools.get_work_order_failure_codes(["TRN-WO00001"]) + assert isinstance(res, ErrorResult) + + +# --- list_work_order_failure_codes ----------------------------------------- + + +def test_list_labeled_only(mock_load): + res = fmc_tools.list_work_order_failure_codes(labeled=True) + assert isinstance(res, FmcWorkOrdersResult) + assert res.total == 4 + assert res.labeled == 4 + assert res.unlabeled == 0 + assert all(wo.failure_code is not None for wo in res.work_orders) + + +def test_list_unlabeled_only(mock_load): + res = fmc_tools.list_work_order_failure_codes(labeled=False) + assert isinstance(res, FmcWorkOrdersResult) + assert res.total == 2 + assert res.labeled == 0 + assert res.unlabeled == 2 + assert all(wo.failure_code is None for wo in res.work_orders) + + +def test_list_all_default(mock_load): + res = fmc_tools.list_work_order_failure_codes() + assert isinstance(res, FmcWorkOrdersResult) + assert res.total == 6 + assert res.labeled == 4 + assert res.unlabeled == 2 + + +# --- set_work_order_failure_codes 
------------------------------------------ + + +def _asg(wo_id, code): + return FmcCodeAssignment(wo_id=wo_id, failure_code=code) + + +def test_set_single(): + with patch("servers.wo.fmc_tools.write_failure_codes", return_value={"TST-WO00001": True}) as mock_write: + res = fmc_tools.set_work_order_failure_codes([_asg("TST-WO00001", "Overheating")]) + assert isinstance(res, FmcBatchWriteResult) + assert res.total == 1 + assert res.updated == 1 + assert res.results[0].failure_code == "Overheating" + mock_write.assert_called_once_with({"TST-WO00001": "Overheating"}) + + +def test_set_batch_partial_missing(): + status = {"TST-WO00001": True, "TST-WO99999": False} + with patch("servers.wo.fmc_tools.write_failure_codes", return_value=status): + res = fmc_tools.set_work_order_failure_codes( + [_asg("TST-WO00001", "Electrical"), _asg("TST-WO99999", "Breakdown")] + ) + assert isinstance(res, FmcBatchWriteResult) + assert res.total == 2 + assert res.updated == 1 + missing = [r.wo_id for r in res.results if not r.updated] + assert missing == ["TST-WO99999"] + + +def test_set_no_db(): + with patch("servers.wo.fmc_tools.write_failure_codes", return_value=None): + res = fmc_tools.set_work_order_failure_codes([_asg("TST-WO00001", "Overheating")]) + assert isinstance(res, ErrorResult) + + +def test_set_empty_list_rejected(): + res = fmc_tools.set_work_order_failure_codes([]) + assert isinstance(res, ErrorResult) + + +def test_set_empty_code_rejected(): + res = fmc_tools.set_work_order_failure_codes([_asg("TST-WO00001", " ")]) + assert isinstance(res, ErrorResult) + + +def test_set_duplicate_wo_id_rejected(): + res = fmc_tools.set_work_order_failure_codes( + [_asg("TST-WO00001", "Electrical"), _asg("TST-WO00001", "Breakdown")] + ) + assert isinstance(res, ErrorResult) + + +# --- get_failure_code_distribution ----------------------------------------- + + +def test_distribution_ranked(mock_load): + res = fmc_tools.get_failure_code_distribution() + assert isinstance(res, 
FmcCodeDistributionResult) + assert res.total_records == 6 + assert res.labeled_records == 4 + # Breakdown (2) ranks first; remaining tied at 1 + assert res.distribution[0].failure_code == "Breakdown" + assert res.distribution[0].count == 2 + + +def test_distribution_top_n(mock_load): + res = fmc_tools.get_failure_code_distribution(top_n=1) + assert isinstance(res, FmcCodeDistributionResult) + assert len(res.distribution) == 1 + assert res.distribution[0].failure_code == "Breakdown" + + +def test_distribution_empty_when_no_codes(): + blank = pd.DataFrame({"wo_id": ["TST-WO00001"], "description": ["x"], "failure_code": [None]}) + with patch("servers.wo.fmc_tools.load", return_value=blank): + res = fmc_tools.get_failure_code_distribution() + assert isinstance(res, ErrorResult)