From 4e9dbc630d93b2583dec7abad8631ad13c77f0e9 Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Fri, 5 Jun 2026 11:17:37 -0400
Subject: [PATCH 01/10] feat: add work order failure-code categorization
 scenarios

Add src/scenarios/local/workorder_utterance.json with 6 focused
WorkOrder-type scenarios exercising the wo/ server's failure-code
categorization (get_failure_codes, get_work_order_distribution):
knowledge queries, code-to-category mapping, and per-asset category
distribution. Add an evaluation loader test validating the file parses
and conforms to the Scenario schema.

Closes #350

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 src/evaluation/tests/test_loader.py          | 22 ++++++++++
 src/scenarios/local/workorder_utterance.json | 44 ++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 src/scenarios/local/workorder_utterance.json

diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py
index 24260b34..3c31bf7d 100644
--- a/src/evaluation/tests/test_loader.py
+++ b/src/evaluation/tests/test_loader.py
@@ -56,6 +56,28 @@ def test_load_scenarios_single_object(tmp_path: Path):
     assert [s.id for s in out] == ["7"]
 
 
+_SCENARIOS_LOCAL = Path(__file__).resolve().parents[2] / "scenarios" / "local"
+
+
+def test_workorder_scenarios_load_and_conform():
+    """The bundled work order scenarios parse and carry the expected schema."""
+    path = _SCENARIOS_LOCAL / "workorder_utterance.json"
+    scenarios = load_scenarios(path)
+
+    assert len(scenarios) >= 5  # lower bound only; the exact count is not pinned
+    assert all(isinstance(s, Scenario) for s in scenarios)
+    # Each scenario has type "WorkOrder" and non-empty text, category, and rubric.
+    for s in scenarios:
+        assert s.type == "WorkOrder"
+        assert s.text.strip()
+        assert s.category.strip()
+        assert s.characteristic_form and s.characteristic_form.strip()
+    # IDs are unique and at least one scenario targets failure-code categorization.
+    ids = [s.id for s in scenarios]
+    assert len(ids) == len(set(ids))
+    assert any(s.category == "Categorization" for s in scenarios)
+
+
 def test_join_drops_orphans(make_persisted_record):
     from evaluation.models import PersistedTrajectory
 
diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json
new file mode 100644
index 00000000..abed0a32
--- /dev/null
+++ b/src/scenarios/local/workorder_utterance.json
@@ -0,0 +1,44 @@
+[
+    {
+      "id": 401,
+      "type": "WorkOrder",
+      "text": "What failure code categories are used to classify work orders?",
+      "category": "Knowledge Query",
+      "characteristic_form": "The expected response should call get_failure_codes and list the distinct top-level categories that work orders are grouped into (e.g., 'Maintenance and Routine Checks', 'Corrective'), describing what each category represents."
+    },
+    {
+      "id": 402,
+      "type": "WorkOrder",
+      "text": "List all failure codes that belong to the 'Maintenance and Routine Checks' category, with their descriptions.",
+      "category": "Knowledge Query",
+      "characteristic_form": "The expected response should call get_failure_codes, filter to the 'Maintenance and Routine Checks' category, and report the primary and secondary codes with descriptions (e.g., MT010 / Oil Analysis with secondary MT010b / Routine Oil Analysis, and MT001 / Routine Maintenance)."
+    },
+    {
+      "id": 403,
+      "type": "WorkOrder",
+      "text": "Which failure code category does the primary code MT010 belong to, and what does that code describe?",
+      "category": "Categorization",
+      "characteristic_form": "The expected response should look up MT010 via get_failure_codes and report that it falls under the 'Maintenance and Routine Checks' category with the description 'Oil Analysis', distinguishing the primary code from its secondary codes."
+    },
+    {
+      "id": 404,
+      "type": "WorkOrder",
+      "text": "A work order is described as 'Routine Oil Analysis'. Determine the most appropriate failure code and the category it should be filed under.",
+      "category": "Categorization",
+      "characteristic_form": "The expected response should map the description 'Routine Oil Analysis' to the secondary code MT010b (under primary code MT010, 'Oil Analysis') and assign it to the 'Maintenance and Routine Checks' category, using get_failure_codes to justify the mapping."
+    },
+    {
+      "id": 405,
+      "type": "WorkOrder",
+      "text": "Is failure code MT013 a preventive or a corrective maintenance type? Justify the answer using its category.",
+      "category": "Categorization",
+      "characteristic_form": "The expected response should identify MT013 via get_failure_codes as belonging to the 'Corrective' category (description 'Corrective'), and conclude it is a corrective rather than preventive maintenance type based on that category."
+    },
+    {
+      "id": 406,
+      "type": "WorkOrder",
+      "text": "For chiller CWC04013, summarize its work orders grouped by failure code category and state which category is most common.",
+      "category": "Distribution Analysis",
+      "characteristic_form": "The expected response should call get_work_order_distribution for equipment 'CWC04013' (optionally with get_failure_codes to resolve categories), aggregate the per-code counts up to the category level (e.g., 'Maintenance and Routine Checks' vs 'Corrective'), and identify the most frequent category for the asset."
+    }
+  ]

From 949d2ae14e69431817cdb4022a07916a053e617e Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 12:37:36 -0400
Subject: [PATCH 02/10] feat: add wo_fmc work-order failure-code dataset to
 CouchDB loader

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 src/couchdb/init_wo.py                        |   1 +
 src/couchdb/sample_data/work_order/wo_fmc.csv | 565 ++++++++++++++++++
 2 files changed, 566 insertions(+)
 create mode 100644 src/couchdb/sample_data/work_order/wo_fmc.csv

diff --git a/src/couchdb/init_wo.py b/src/couchdb/init_wo.py
index 882aaf6c..dbcb3d1d 100644
--- a/src/couchdb/init_wo.py
+++ b/src/couchdb/init_wo.py
@@ -66,6 +66,7 @@
     ("failure_codes.csv", "failure_codes", {}),
     ("primary_failure_codes.csv", "primary_failure_codes", {}),
     ("component.csv", "component", {}),
+    ("wo_fmc.csv", "wo_fmc", {}),
 ]
 
 # Mango indexes to create: list of field-lists
diff --git a/src/couchdb/sample_data/work_order/wo_fmc.csv b/src/couchdb/sample_data/work_order/wo_fmc.csv
new file mode 100644
index 00000000..f07daa74
--- /dev/null
+++ b/src/couchdb/sample_data/work_order/wo_fmc.csv
@@ -0,0 +1,565 @@
+wo_id,description,failure_code
+TRN-WO00001,falure,Breakdown
+TRN-WO00002,unserviceable dpr 1,Breakdown
+TRN-WO00003,bogged,Plugged / choked
+TRN-WO00004,spaying slurry,Leaking
+TRN-WO00005,fell of,Minor in-service problems
+TRN-WO00006,no pressure output,Plugged / choked
+TRN-WO00007,damageblockage,Structural deficiency
+TRN-WO00008,squeeling,Noise
+TRN-WO00009,requires repacking,Minor in-service problems
+TRN-WO00010,no starting issue,Failure to start on demand
+TRN-WO00011,shaking,Vibration
+TRN-WO00012,blow,Minor in-service problems
+TRN-WO00013,need a descale,Minor in-service problems
+TRN-WO00014,over heating,Overheating
+TRN-WO00015,vibrationcheck,Vibration
+TRN-WO00016,vibration issue,Vibration
+TRN-WO00017,not open,Minor in-service problems
+TRN-WO00018,sticking,Fail to function
+TRN-WO00019,seized shut,Breakdown
+TRN-WO00020,blocked,Plugged / choked
+TRN-WO00021,not working,Breakdown
+TRN-WO00022,not energising,Low output
+TRN-WO00023,power issue,Electrical
+TRN-WO00024,needs repairing,Structural deficiency
+TRN-WO00025,not performing,Breakdown
+TRN-WO00026,wont run and,Failure to start on demand
+TRN-WO00027,boil out,Overheating
+TRN-WO00028,not supplying,Plugged / choked
+TRN-WO00029,spilled,Leaking
+TRN-WO00030,stuck in,Fail to function
+TRN-WO00031,leak exponentially,Leaking
+TRN-WO00032,not keeping up,Low output
+TRN-WO00033,pumping issue,Low output
+TRN-WO00034,leak slurry,Leaking
+TRN-WO00035,us,Breakdown
+TRN-WO00036,requires attention,Minor in-service problems
+TRN-WO00037,does not stop,Failure to stop on demand
+TRN-WO00038,noisy sce,Noise
+TRN-WO00039,twisting,Fail to function
+TRN-WO00040,needs to be extended,Minor in-service problems
+TRN-WO00041,lost its packing,Leaking
+TRN-WO00042,rusted through,Structural deficiency
+TRN-WO00043,needs to be held down,Minor in-service problems
+TRN-WO00044,flow issue,Plugged / choked
+TRN-WO00045,degraded,Structural deficiency
+TRN-WO00046,wont pump floc,Breakdown
+TRN-WO00047,leakspray,Leaking
+TRN-WO00048,wont run in,Failure to start on demand
+TRN-WO00049,needs repacked,Minor in-service problems
+TRN-WO00050,passing excessively,Leaking
+TRN-WO00051,blocking,Plugged / choked
+TRN-WO00052,not engaging,Failure to start on demand
+TRN-WO00053,shredded,Structural deficiency
+TRN-WO00054,noise,Noise
+TRN-WO00055,blown out,Minor in-service problems
+TRN-WO00056,needs replacing,Breakdown
+TRN-WO00057,not running,Breakdown
+TRN-WO00058,leak when shut,Leaking
+TRN-WO00059,fall off,Minor in-service problems
+TRN-WO00060,not being pumped away,Plugged / choked
+TRN-WO00061,scaled up,Minor in-service problems
+TRN-WO00062,fallen off,Minor in-service problems
+TRN-WO00063,broke away,Breakdown
+TRN-WO00064,cannot run in,Failure to start on demand
+TRN-WO00065,not pumping enough,Plugged / choked
+TRN-WO00066,not stopping,Failure to stop on demand
+TRN-WO00067,unable to shu,Failure to stop on demand
+TRN-WO00068,failed internaly,Breakdown
+TRN-WO00069,not there,Minor in-service problems
+TRN-WO00070,inefficiant,Plugged / choked
+TRN-WO00071,no reading,Abnormal instrument reading
+TRN-WO00072,will not start in,Failure to start on demand
+TRN-WO00073,been running hot,Overheating
+TRN-WO00074,fallen out,Minor in-service problems
+TRN-WO00075,leak on,Leaking
+TRN-WO00076,to be removed,Breakdown
+TRN-WO00077,unsupported,Minor in-service problems
+TRN-WO00078,giving false reading,Abnormal instrument reading
+TRN-WO00079,leak in two place,Leaking
+TRN-WO00080,not reaching flow,Plugged / choked
+TRN-WO00081,wont turn,Failure to start on demand
+TRN-WO00082,not able to open,Minor in-service problems
+TRN-WO00083,need straighting,Minor in-service problems
+TRN-WO00084,hot liquid,Overheating
+TRN-WO00085,fell apart,Minor in-service problems
+TRN-WO00086,fail again,Breakdown
+TRN-WO00087,needs to be replaced,Breakdown
+TRN-WO00088,underperforming,Low output
+TRN-WO00089,corroded close out,Structural deficiency
+TRN-WO00090,not kicking in,Failure to start on demand
+TRN-WO00091,needs to be descaled,Minor in-service problems
+TRN-WO00092,over flowed,Leaking
+TRN-WO00093,non operational,Breakdown
+TRN-WO00094,running fault,Other
+TRN-WO00095,unable to run,Failure to start on demand
+TRN-WO00096,come adrift,Minor in-service problems
+TRN-WO00097,stiff,Fail to function
+TRN-WO00098,not high enough,Low output
+TRN-WO00099,broken off,Breakdown
+TRN-WO00100,snapped,Structural deficiency
+TRN-WO00101,shorted out,Electrical
+TRN-WO00102,running when sump empty,Other
+TRN-WO00103,faul,Electrical
+TRN-WO00104,has a hole on,Structural deficiency
+TRN-WO00105,will not trip,Electrical
+TRN-WO00106,failing to start,Failure to start on demand
+TRN-WO00107,needs adjusting,Minor in-service problems
+TRN-WO00108,sheared,Structural deficiency
+TRN-WO00109,failed,Breakdown
+TRN-WO00110,no guarding,Minor in-service problems
+TRN-WO00111,unresponsive,Failure to start on demand
+TRN-WO00112,no sightglass,Minor in-service problems
+TRN-WO00113,not connected,Minor in-service problems
+TRN-WO00114,unserviceable and line,Breakdown
+TRN-WO00115,keeps triping,Electrical
+TRN-WO00116,coming out,Minor in-service problems
+TRN-WO00117,requires handle,Minor in-service problems
+TRN-WO00118,low flow,Plugged / choked
+TRN-WO00119,split,Structural deficiency
+TRN-WO00120,fail to start,Failure to start on demand
+TRN-WO00121,wont reset,Failure to start on demand
+TRN-WO00122,hard to close,Fail to close
+TRN-WO00123,hole in suction,Structural deficiency
+TRN-WO00124,worn through,Structural deficiency
+TRN-WO00125,passingleaking,Leaking
+TRN-WO00126,not opening,Minor in-service problems
+TRN-WO00127,jammedseized,Plugged / choked
+TRN-WO00128,damage,Structural deficiency
+TRN-WO00129,passing on,Leaking
+TRN-WO00130,weak,Other
+TRN-WO00131,viberating,Vibration
+TRN-WO00132,static earth resistance,Electrical
+TRN-WO00133,needs to be rebuilt,Minor in-service problems
+TRN-WO00134,needs supporting,Minor in-service problems
+TRN-WO00135,wont work,Failure to start on demand
+TRN-WO00136,wornbelt,Structural deficiency
+TRN-WO00137,not attached corroded,Minor in-service problems
+TRN-WO00138,keeps tripping,Electrical
+TRN-WO00139,weeping,Leaking
+TRN-WO00140,badly rusted,Structural deficiency
+TRN-WO00141,bad vibration,Vibration
+TRN-WO00142,fire,Overheating
+TRN-WO00143,faulting,Electrical
+TRN-WO00144,contaminating condensate,Contamination
+TRN-WO00145,too fast,High output
+TRN-WO00146,low viscosity,Contamination
+TRN-WO00147,not closing,Minor in-service problems
+TRN-WO00148,require adjusting,Minor in-service problems
+TRN-WO00149,sound,Noise
+TRN-WO00150,bent,Minor in-service problems
+TRN-WO00151,corrosion on,Structural deficiency
+TRN-WO00152,doesnt operate,Breakdown
+TRN-WO00153,seized open,Breakdown
+TRN-WO00154,excessive stopstart,Erratic output
+TRN-WO00155,needs replaceing,Breakdown
+TRN-WO00156,hot thermography,Overheating
+TRN-WO00157,erratic,Erratic output
+TRN-WO00158,false high level,Abnormal instrument reading
+TRN-WO00159,will not shut,Fail to function
+TRN-WO00160,lagging,Low output
+TRN-WO00161,blew off,Minor in-service problems
+TRN-WO00162,no fill,Minor in-service problems
+TRN-WO00163,needs packing,Minor in-service problems
+TRN-WO00164,wont pump,Breakdown
+TRN-WO00165,badly corrod,Structural deficiency
+TRN-WO00166,surging,Electrical
+TRN-WO00167,false reading,Abnormal instrument reading
+TRN-WO00168,not pumping enough to get,Plugged / choked
+TRN-WO00169,not draining,Plugged / choked
+TRN-WO00170,not wo,Breakdown
+TRN-WO00171,not charging,Electrical
+TRN-WO00172,locked,Failure to rotate
+TRN-WO00173,not moving,Plugged / choked
+TRN-WO00174,milky,Contamination
+TRN-WO00175,require tightening,Minor in-service problems
+TRN-WO00176,siezed,Breakdown
+TRN-WO00177,no pressure,Plugged / choked
+TRN-WO00178,disconnected,Breakdown
+TRN-WO00179,coming adrift,Minor in-service problems
+TRN-WO00180,ruptured,Structural deficiency
+TRN-WO00181,not pumpung,Plugged / choked
+TRN-WO00182,leakhose,Leaking
+TRN-WO00183,cracked,Structural deficiency
+TRN-WO00184,broken loose,Breakdown
+TRN-WO00185,failing,Breakdown
+TRN-WO00186,looseworn,Minor in-service problems
+TRN-WO00187,alarming eratically,Abnormal instrument reading
+TRN-WO00188,are not operational,Breakdown
+TRN-WO00189,no static,Abnormal instrument reading
+TRN-WO00190,siezed shut,Breakdown
+TRN-WO00191,running too slow,Low output
+TRN-WO00192,rusty,Structural deficiency
+TRN-WO00193,needs modifying,Minor in-service problems
+TRN-WO00194,died,Breakdown
+TRN-WO00195,smoking,Overheating
+TRN-WO00196,pulling very high,High output
+TRN-WO00197,blowin,Minor in-service problems
+TRN-WO00198,not stopping at,Failure to stop on demand
+TRN-WO00199,not operational,Breakdown
+TRN-WO00200,seizedbogged,Breakdown
+TRN-WO00201,usversicol,Breakdown
+TRN-WO00202,static,Electrical
+TRN-WO00203,does not trip,Electrical
+TRN-WO00204,needs alignment,Minor in-service problems
+TRN-WO00205,contaimination,Contamination
+TRN-WO00206,burst again,Breakdown
+TRN-WO00207,smoke,Overheating
+TRN-WO00208,losing,Minor in-service problems
+TRN-WO00209,no belt,Minor in-service problems
+TRN-WO00210,needs tightened,Minor in-service problems
+TRN-WO00211,not turning restriction,Plugged / choked
+TRN-WO00212,no signal,Abnormal instrument reading
+TRN-WO00213,emulsified,Contamination
+TRN-WO00214,alarming vibration,Vibration
+TRN-WO00215,needs patched up,Minor in-service problems
+TRN-WO00216,blew off top of,Minor in-service problems
+TRN-WO00217,badly corroded in,Structural deficiency
+TRN-WO00218,perished,Breakdown
+TRN-WO00219,need calibration,Abnormal instrument reading
+TRN-WO00220,jammed in place,Plugged / choked
+TRN-WO00221,collapsed,Breakdown
+TRN-WO00222,requires slotting,Minor in-service problems
+TRN-WO00223,fault finding,Electrical
+TRN-WO00224,reading inaccurate,Abnormal instrument reading
+TRN-WO00225,sucked in,Other
+TRN-WO00226,come loose,Minor in-service problems
+TRN-WO00227,no phase,Electrical
+TRN-WO00228,bore tripping,Electrical
+TRN-WO00229,spill to ground,Electrical
+TRN-WO00230,reading high,Abnormal instrument reading
+TRN-WO00231,rusted away,Structural deficiency
+TRN-WO00232,needs a repack,Minor in-service problems
+TRN-WO00233,fried,Breakdown
+TRN-WO00234,no speed control,Abnormal instrument reading
+TRN-WO00235,seized breakdown,Breakdown
+TRN-WO00236,needs replacement,Breakdown
+TRN-WO00237,not workin,Breakdown
+TRN-WO00238,require investigating,Minor in-service problems
+TRN-WO00239,passing when closed,Leaking
+TRN-WO00240,not operating,Breakdown
+TRN-WO00241,crack,Structural deficiency
+TRN-WO00242,stopped pumping,Spurious stop
+TRN-WO00243,not pumping issue,Plugged / choked
+TRN-WO00244,vibration,Vibration
+TRN-WO00245,no flow,Plugged / choked
+TRN-WO00246,restricted,Low output
+TRN-WO00247,high vibration,Vibration
+TRN-WO00248,unserviceable no contol,Breakdown
+TRN-WO00249,slippery,Minor in-service problems
+TRN-WO00250,no seal,Minor in-service problems
+TRN-WO00251,leak underflow,Leaking
+TRN-WO00252,blockedseized,Plugged / choked
+TRN-WO00253,broken near,Breakdown
+TRN-WO00254,faulty earth,Electrical
+TRN-WO00255,not maintaining,Plugged / choked
+TRN-WO00256,wont rotate,Failure to start on demand
+TRN-WO00257,not efficient,Low output
+TRN-WO00258,will not start,Failure to start on demand
+TRN-WO00259,contamination trip,Contamination
+TRN-WO00260,not working on,Breakdown
+TRN-WO00261,blown out at,Minor in-service problems
+TRN-WO00262,trips out,Electrical
+TRN-WO00263,struggling,Low output
+TRN-WO00264,slow,Low output
+TRN-WO00265,keeps on tripping,Electrical
+TRN-WO00266,reading stuck,Abnormal instrument reading
+TRN-WO00267,not pumping,Plugged / choked
+TRN-WO00268,torn,Structural deficiency
+TRN-WO00269,cutting in and out,Erratic output
+TRN-WO00270,leakhole,Leaking
+TRN-WO00271,come free of,Minor in-service problems
+TRN-WO00272,contaminated,Contamination
+TRN-WO00273,not pumpingeast,Plugged / choked
+TRN-WO00274,no oil movement,Plugged / choked
+TRN-WO00275,cold,Other
+TRN-WO00276,problem,Other
+TRN-WO00277,keeps tripping at start,Electrical
+TRN-WO00278,severely corroded,Structural deficiency
+TRN-WO00279,needs repack,Minor in-service problems
+TRN-WO00280,not secure,Minor in-service problems
+TRN-WO00281,missing on,Minor in-service problems
+TRN-WO00282,require changing,Minor in-service problems
+TRN-WO00283,have smoked up,Overheating
+TRN-WO00284,rusted of,Structural deficiency
+TRN-WO00285,relieving too early,Leaking
+TRN-WO00286,missing blank,Minor in-service problems
+TRN-WO00287,will not restart,Failure to start on demand
+TRN-WO00288,needs repacking,Minor in-service problems
+TRN-WO00289,high amp,Abnormal instrument reading
+TRN-WO00290,weaping,Leaking
+TRN-WO00291,burnt out,Overheating
+TRN-WO00292,blowen,Minor in-service problems
+TRN-WO00293,adrift,Minor in-service problems
+TRN-WO00294,seizes at time,Breakdown
+TRN-WO00295,may burst,Structural deficiency
+TRN-WO00296,sparaying,Leaking
+TRN-WO00297,tripping straight away,Electrical
+TRN-WO00298,has a split in,Structural deficiency
+TRN-WO00299,corrodedstripped,Structural deficiency
+TRN-WO00300,buggered,Breakdown
+TRN-WO00301,seized,Breakdown
+TRN-WO00302,pot in backward,Minor in-service problems
+TRN-WO00303,fault,Electrical
+TRN-WO00304,will not pump,Breakdown
+TRN-WO00305,tripping in,Electrical
+TRN-WO00306,requires replacing,Minor in-service problems
+TRN-WO00307,continually tripping,Electrical
+TRN-WO00308,no run indication,Abnormal instrument reading
+TRN-WO00309,running hot,Overheating
+TRN-WO00310,poor,Fail to function
+TRN-WO00311,keeps tripping out,Electrical
+TRN-WO00312,playing up,Erratic output
+TRN-WO00313,not providing pressure,Plugged / choked
+TRN-WO00314,not starting,Failure to start on demand
+TRN-WO00315,sheared off,Structural deficiency
+TRN-WO00316,not puming,Plugged / choked
+TRN-WO00317,corroding,Structural deficiency
+TRN-WO00318,spraying out of,Leaking
+TRN-WO00319,badly corroded,Structural deficiency
+TRN-WO00320,requires clean,Minor in-service problems
+TRN-WO00321,holedinspect,Structural deficiency
+TRN-WO00322,no earth,Electrical
+TRN-WO00323,missingloose,Minor in-service problems
+TRN-WO00324,siezing on,Breakdown
+TRN-WO00325,keeps kicking in,Failure to stop on demand
+TRN-WO00326,not efficiant,Low output
+TRN-WO00327,sheered,Structural deficiency
+TRN-WO00328,incorrect orientation,Minor in-service problems
+TRN-WO00329,worn out,Structural deficiency
+TRN-WO00330,signal error,Abnormal instrument reading
+TRN-WO00331,earth fault,Electrical
+TRN-WO00332,unserviceable pump,Breakdown
+TRN-WO00333,not attached,Minor in-service problems
+TRN-WO00334,leaked out,Leaking
+TRN-WO00335,shorting out,Electrical
+TRN-WO00336,tripping on overload,Electrical
+TRN-WO00337,broken,Breakdown
+TRN-WO00338,heavy corrosion,Structural deficiency
+TRN-WO00339,unable to reach,Low output
+TRN-WO00340,wont open,Fail to function
+TRN-WO00341,lealing,Leaking
+TRN-WO00342,needs another row,Minor in-service problems
+TRN-WO00343,down to earth,Electrical
+TRN-WO00344,very corroded,Structural deficiency
+TRN-WO00345,will not start after flushing,Failure to start on demand
+TRN-WO00346,slurry spraying out,Leaking
+TRN-WO00347,badly corr,Structural deficiency
+TRN-WO00348,covered,Minor in-service problems
+TRN-WO00349,requires a descale,Minor in-service problems
+TRN-WO00350,starting to crack,Structural deficiency
+TRN-WO00351,crack in transition,Structural deficiency
+TRN-WO00352,constantly losing,Leaking
+TRN-WO00353,continually trips out,Electrical
+TRN-WO00354,dirty,Minor in-service problems
+TRN-WO00355,worn again,Structural deficiency
+TRN-WO00356,broke,Breakdown
+TRN-WO00357,needs to be flushed,Minor in-service problems
+TRN-WO00358,rusted off,Structural deficiency
+TRN-WO00359,stopped working,Spurious stop
+TRN-WO00360,stuck,Fail to function
+TRN-WO00361,appears loose,Minor in-service problems
+TRN-WO00362,very high vibration,Vibration
+TRN-WO00363,cannot start,Failure to start on demand
+TRN-WO00364,not secured,Minor in-service problems
+TRN-WO00365,holed,Structural deficiency
+TRN-WO00366,jammed closed,Plugged / choked
+TRN-WO00367,tripping cause,Electrical
+TRN-WO00368,coroded,Structural deficiency
+TRN-WO00369,needs to be dismantled,Minor in-service problems
+TRN-WO00370,missing off,Minor in-service problems
+TRN-WO00371,knocking,Noise
+TRN-WO00372,slipping,Minor in-service problems
+TRN-WO00373,tripping on,Electrical
+TRN-WO00374,caught in suction,Minor in-service problems
+TRN-WO00375,defect notice,Structural deficiency
+TRN-WO00376,will not stay running,Spurious stop
+TRN-WO00377,come off,Minor in-service problems
+TRN-WO00378,tripped out,Electrical
+TRN-WO00379,not accurate,Abnormal instrument reading
+TRN-WO00380,loose,Minor in-service problems
+TRN-WO00381,has crack,Structural deficiency
+TRN-WO00382,congested,Plugged / choked
+TRN-WO00383,shows offline while running in field,Abnormal instrument reading
+TRN-WO00384,showing signs of failure,Other
+TRN-WO00385,passing,Leaking
+TRN-WO00386,broken electrical,Breakdown
+TRN-WO00387,spraying out,Leaking
+TRN-WO00388,needs freeing up,Plugged / choked
+TRN-WO00389,wont start,Failure to start on demand
+TRN-WO00390,high earth resistance,Abnormal instrument reading
+TRN-WO00391,pumping at high out put,High output
+TRN-WO00392,pinholed,Minor in-service problems
+TRN-WO00393,poor flow,Low output
+TRN-WO00394,slipped off,Minor in-service problems
+TRN-WO00395,faulty,Electrical
+TRN-WO00396,needs scope,Minor in-service problems
+TRN-WO00397,colapsed,Breakdown
+TRN-WO00398,will not run,Breakdown
+TRN-WO00399,exposed,Minor in-service problems
+TRN-WO00400,losing signal,Electrical
+TRN-WO00401,misaligning,Minor in-service problems
+TRN-WO00402,out of position,Minor in-service problems
+TRN-WO00403,need replacing,Breakdown
+TRN-WO00404,not running in,Fail to function
+TRN-WO00405,has split,Structural deficiency
+TRN-WO00406,snapped off,Structural deficiency
+TRN-WO00407,noisyvibrating,Noise
+TRN-WO00408,filled,Leaking
+TRN-WO00409,fail,Breakdown
+TRN-WO00410,fails to open in sequence,Fail to open
+TRN-WO00411,looseness,Minor in-service problems
+TRN-WO00412,too high,High output
+TRN-WO00413,blowing,Minor in-service problems
+TRN-WO00414,running when empty,Other
+TRN-WO00415,hot,Overheating
+TRN-WO00416,fluctuates at high rate,Erratic output
+TRN-WO00417,not reading right,Abnormal instrument reading
+TRN-WO00418,sitting on hot,Overheating
+TRN-WO00419,not able to maintain level,Plugged / choked
+TRN-WO00420,dislodged,Minor in-service problems
+TRN-WO00421,turning in reverse,Fail to function
+TRN-WO00422,no out put,Plugged / choked
+TRN-WO00423,leaky,Leaking
+TRN-WO00424,tripping out on run up,Electrical
+TRN-WO00425,need replaceing,Breakdown
+TRN-WO00426,not pump,Plugged / choked
+TRN-WO00427,will not run in,Breakdown
+TRN-WO00428,needs rebuild,Breakdown
+TRN-WO00429,leakingreplace,Leaking
+TRN-WO00430,stopped abruptly,Spurious stop
+TRN-WO00431,brittle,Structural deficiency
+TRN-WO00432,needs securing,Minor in-service problems
+TRN-WO00433,dented,Structural deficiency
+TRN-WO00434,vibrationlaser,Vibration
+TRN-WO00435,inconsistant flow,Plugged / choked
+TRN-WO00436,needs re build,Breakdown
+TRN-WO00437,unserviceable please replace,Breakdown
+TRN-WO00438,not connectedblanked,Minor in-service problems
+TRN-WO00439,earthing,Electrical
+TRN-WO00440,no indication,Minor in-service problems
+TRN-WO00441,stripped,Structural deficiency
+TRN-WO00442,needs inspecting,Minor in-service problems
+TRN-WO00443,snaped,Structural deficiency
+TRN-WO00444,didnt trip,Electrical
+TRN-WO00445,blow out,Minor in-service problems
+TRN-WO00446,unserviceable leak sulphur,Breakdown
+TRN-WO00447,tripping issue,Electrical
+TRN-WO00448,unable to reset,Failure to stop on demand
+TRN-WO00449,need modifying,Minor in-service problems
+TRN-WO00450,tripping out,Electrical
+TRN-WO00451,requires topupstop,Minor in-service problems
+TRN-WO00452,dragged,Minor in-service problems
+TRN-WO00453,not performingpumping,Breakdown
+TRN-WO00454,getting jammed,Plugged / choked
+TRN-WO00455,failed restest,Failure to start on demand
+TRN-WO00456,starting issue,Failure to start on demand
+TRN-WO00457,wrong range,Abnormal instrument reading
+TRN-WO00458,may be failing,Fail to function
+TRN-WO00459,stretched,Minor in-service problems
+TRN-WO00460,does dont pump at,Plugged / choked
+TRN-WO00461,stuck in open position,Fail to function
+TRN-WO00462,seize,Breakdown
+TRN-WO00463,hole in,Structural deficiency
+TRN-WO00464,snappedplugged,Structural deficiency
+TRN-WO00465,consumeing,Leaking
+TRN-WO00466,keeps vibrating shut,Vibration
+TRN-WO00467,not able to pump,Plugged / choked
+TRN-WO00468,needs a descale,Minor in-service problems
+TRN-WO00469,passing u,Leaking
+TRN-WO00470,out of adjustment,Minor in-service problems
+TRN-WO00471,failure,Breakdown
+TRN-WO00472,needs reset,Electrical
+TRN-WO00473,rupture,Structural deficiency
+TRN-WO00474,vibrating,Vibration
+TRN-WO00475,rubbing,Vibration
+TRN-WO00476,leakingneed,Leaking
+TRN-WO00477,playingup,Erratic output
+TRN-WO00478,broken into,Breakdown
+TRN-WO00479,doesnt pump to,Plugged / choked
+TRN-WO00480,corrosion,Structural deficiency
+TRN-WO00481,squealing,Noise
+TRN-WO00482,noyt pumping,Breakdown
+TRN-WO00483,fault tripping,Electrical
+TRN-WO00484,needs to be repacked,Minor in-service problems
+TRN-WO00485,will not turn,Fail to function
+TRN-WO00486,grinding,Noise
+TRN-WO00487,rusted,Structural deficiency
+TRN-WO00488,trips on overload,Electrical
+TRN-WO00489,stopping,Spurious stop
+TRN-WO00490,split in two place,Structural deficiency
+TRN-WO00491,hanging,Other
+TRN-WO00492,needs fixing,Minor in-service problems
+TRN-WO00493,detached,Minor in-service problems
+TRN-WO00494,brokenmissing,Breakdown
+TRN-WO00495,need repacking,Minor in-service problems
+TRN-WO00496,wont pump to,Breakdown
+TRN-WO00497,sce fault,Fail to function
+TRN-WO00498,wont run,Failure to start on demand
+TRN-WO00499,requires modifying,Minor in-service problems
+TRN-WO00500,requires replacement,Minor in-service problems
+TRN-WO00501,triped,Electrical
+TRN-WO00502,stopping randomly,Spurious stop
+TST-WO00001,ejected,
+TST-WO00002,sticking shu,
+TST-WO00003,not turning,
+TST-WO00004,has failed,
+TST-WO00005,needs fitting,
+TST-WO00006,runs for a while and trip,
+TST-WO00007,very stiff to operate,
+TST-WO00008,requires rebuild,
+TST-WO00009,has no equipment earth,
+TST-WO00010,require tighteninginspecti,
+TST-WO00011,does not work,
+TST-WO00012,runs continuously,
+TST-WO00013,severe corrosion,
+TST-WO00014,contamination,
+TST-WO00015,doesnt trip,
+TST-WO00016,ripped,
+TST-WO00017,unserviceable not in,
+TST-WO00018,cant be adjusted,
+TST-WO00019,worn,
+TST-WO00020,misalignment,
+TST-WO00021,switched o,
+TST-WO00022,to be torqued,
+TST-WO00023,sped up,
+TST-WO00024,unable to pump,
+TST-WO00025,spillage,
+TST-WO00026,spraying out slurry,
+TST-WO00027,jammed open,
+TST-WO00028,no pump,
+TST-WO00029,not truning,
+TST-WO00030,unable to tighten,
+TST-WO00031,burnt,
+TST-WO00032,hot joint,
+TST-WO00033,burst,
+TST-WO00034,pumping fault,
+TST-WO00035,not controlling,
+TST-WO00036,surging cutting in and out,
+TST-WO00037,failed electrical,
+TST-WO00038,loose tighten,
+TST-WO00039,high earth reading,
+TST-WO00040,cavitating,
+TST-WO00041,requires repack,
+TST-WO00042,needs support,
+TST-WO00043,needs maintenance,
+TST-WO00044,not spinning,
+TST-WO00045,dont activate,
+TST-WO00046,unserviceable b feluwa,
+TST-WO00047,no power,
+TST-WO00048,requires cleaning,
+TST-WO00049,doesnt work,
+TST-WO00050,vibrationlubealignment,
+TST-WO00051,blockage,
+TST-WO00052,ongoing issue,
+TST-WO00053,poorly supported,
+TST-WO00054,rocking around,
+TST-WO00055,siezeddescale,
+TST-WO00056,dropped,
+TST-WO00057,spraying,
+TST-WO00058,not covered,
+TST-WO00059,hot jointfurn,
+TST-WO00060,running dry,
+TST-WO00061,smoking up,
+TST-WO00062,jammed,

From 6e91b80d10516d7e808ee59c751949eb119eb626 Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 12:46:08 -0400
Subject: [PATCH 03/10] feat: replace workorder utterances with FMC
 single-record scenarios

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 src/scenarios/local/workorder_utterance.json | 264 ++++++++++++++++---
 1 file changed, 222 insertions(+), 42 deletions(-)

diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json
index abed0a32..7cc45eba 100644
--- a/src/scenarios/local/workorder_utterance.json
+++ b/src/scenarios/local/workorder_utterance.json
@@ -1,44 +1,224 @@
 [
-    {
-      "id": 401,
-      "type": "WorkOrder",
-      "text": "What failure code categories are used to classify work orders?",
-      "category": "Knowledge Query",
-      "characteristic_form": "The expected response should call get_failure_codes and list the distinct top-level categories that work orders are grouped into (e.g., 'Maintenance and Routine Checks', 'Corrective'), describing what each category represents."
-    },
-    {
-      "id": 402,
-      "type": "WorkOrder",
-      "text": "List all failure codes that belong to the 'Maintenance and Routine Checks' category, with their descriptions.",
-      "category": "Knowledge Query",
-      "characteristic_form": "The expected response should call get_failure_codes, filter to the 'Maintenance and Routine Checks' category, and report the primary and secondary codes with descriptions (e.g., MT010 / Oil Analysis with secondary MT010b / Routine Oil Analysis, and MT001 / Routine Maintenance)."
-    },
-    {
-      "id": 403,
-      "type": "WorkOrder",
-      "text": "Which failure code category does the primary code MT010 belong to, and what does that code describe?",
-      "category": "Categorization",
-      "characteristic_form": "The expected response should look up MT010 via get_failure_codes and report that it falls under the 'Maintenance and Routine Checks' category with the description 'Oil Analysis', distinguishing the primary code from its secondary codes."
-    },
-    {
-      "id": 404,
-      "type": "WorkOrder",
-      "text": "A work order is described as 'Routine Oil Analysis'. Determine the most appropriate failure code and the category it should be filed under.",
-      "category": "Categorization",
-      "characteristic_form": "The expected response should map the description 'Routine Oil Analysis' to the secondary code MT010b (under primary code MT010, 'Oil Analysis') and assign it to the 'Maintenance and Routine Checks' category, using get_failure_codes to justify the mapping."
-    },
-    {
-      "id": 405,
-      "type": "WorkOrder",
-      "text": "Is failure code MT013 a preventive or a corrective maintenance type? Justify the answer using its category.",
-      "category": "Categorization",
-      "characteristic_form": "The expected response should identify MT013 via get_failure_codes as belonging to the 'Corrective' category (description 'Corrective'), and conclude it is a corrective rather than preventive maintenance type based on that category."
-    },
-    {
-      "id": 406,
-      "type": "WorkOrder",
-      "text": "For chiller CWC04013, summarize its work orders grouped by failure code category and state which category is most common.",
-      "category": "Distribution Analysis",
-      "characteristic_form": "The expected response should call get_work_order_distribution for equipment 'CWC04013' (optionally with get_failure_codes to resolve categories), aggregate the per-code counts up to the category level (e.g., 'Maintenance and Routine Checks' vs 'Corrective'), and identify the most frequent category for the asset."
+  {
+    "type": "WorkOrder",
+    "text": "Pull up work order TST-WO00032. If no failure code was recorded, suggest the single failure code from our 10-code list (Breakdown, Electrical, Fail to function, Leaking, Low output, Minor in-service problems, Overheating, Plugged / choked, Structural deficiency, Vibration). If a failure code is already recorded, return the existing code as-is.",
+    "category": "Failure Code Imputation",
+    "characteristic_form": "The expected response should retrieve work order TST-WO00032 (description 'hot joint'), observe that its failure_code is blank, and suggest a single code from the supplied 10-code list. The correct suggestion is 'Overheating'.",
+    "expected_answer": "Overheating",
+    "scoring_method": "exact_string_match",
+    "metadata": {
+      "scenario_label": "S1",
+      "task": "single-record-fill",
+      "subtitle": "failure-code suggestion",
+      "target_work_orders": [
+        "TST-WO00032"
+      ],
+      "write_back": false,
+      "candidate_codes": [
+        "Breakdown",
+        "Electrical",
+        "Fail to function",
+        "Leaking",
+        "Low output",
+        "Minor in-service problems",
+        "Overheating",
+        "Plugged / choked",
+        "Structural deficiency",
+        "Vibration"
+      ],
+      "data_sources": [
+        "fmc_test_wo.csv"
+      ],
+      "gold": {
+        "wo_id": "TST-WO00032",
+        "failure_code": "Overheating"
+      },
+      "gold_source": "fmc_test_answer_key.csv"
     }
-  ]
+  },
+  {
+    "type": "WorkOrder",
+    "text": "Pull up work order TST-WO00054. If no failure code was recorded, pick the single failure code from our 10-code list (Breakdown, Electrical, Fail to function, Leaking, Low output, Minor in-service problems, Overheating, Plugged / choked, Structural deficiency, Vibration) and write it back to the record. If a failure code is already recorded, leave the record as-is. Confirm the final value on the record.",
+    "category": "Failure Code Imputation",
+    "characteristic_form": "The expected response should retrieve work order TST-WO00054 (description 'rocking around'), observe that its failure_code is blank, pick the single best code from the supplied 10-code list, write it back to the record, and confirm the persisted value. The correct code is 'Vibration'.",
+    "expected_answer": "Vibration",
+    "scoring_method": "exact_string_match",
+    "metadata": {
+      "scenario_label": "S2",
+      "task": "single-record-fill + write-back",
+      "subtitle": "failure-code suggestion and imputation",
+      "target_work_orders": [
+        "TST-WO00054"
+      ],
+      "write_back": true,
+      "candidate_codes": [
+        "Breakdown",
+        "Electrical",
+        "Fail to function",
+        "Leaking",
+        "Low output",
+        "Minor in-service problems",
+        "Overheating",
+        "Plugged / choked",
+        "Structural deficiency",
+        "Vibration"
+      ],
+      "data_sources": [
+        "fmc_test_wo.csv"
+      ],
+      "verifier": "Verify the DB record was updated to the imputed failure_code.",
+      "gold": {
+        "wo_id": "TST-WO00054",
+        "failure_code": "Vibration"
+      },
+      "gold_source": "fmc_test_answer_key.csv"
+    }
+  },
+  {
+    "type": "WorkOrder",
+    "text": "Look across our historical work orders and rank the top three failure codes by record count. Reply with only a JSON array of exactly three objects, ordered 1, 2, 3, in this shape:\n[\n  {\"rank\": 1, \"failure_code\": \"<code>\", \"count\": <int>},\n  {\"rank\": 2, \"failure_code\": \"<code>\", \"count\": <int>},\n  {\"rank\": 3, \"failure_code\": \"<code>\", \"count\": <int>}\n]\nUse the failure code spelling exactly as it appears in the data. If two codes are tied at the rank-3 cut-off, include both at \"rank\": 3 and add a fourth object {\"tie_at_rank_3\": true} at the end of the array. Output nothing outside the JSON array.",
+    "category": "Distribution Analysis",
+    "characteristic_form": "The expected response should count failure codes across the historical training work orders and return the top three by record count as a JSON array of {rank, failure_code, count} objects, codes spelled exactly as in the data. Correct: Minor in-service problems (109), Breakdown (70), Structural deficiency (58).",
+    "expected_answer": "[{\"rank\": 1, \"failure_code\": \"Minor in-service problems\", \"count\": 109}, {\"rank\": 2, \"failure_code\": \"Breakdown\", \"count\": 70}, {\"rank\": 3, \"failure_code\": \"Structural deficiency\", \"count\": 58}]",
+    "scoring_method": "json_match",
+    "metadata": {
+      "scenario_label": "S3",
+      "task": "history-distribution",
+      "subtitle": "top-3 failure codes in the historical work orders",
+      "write_back": false,
+      "data_sources": [
+        "fmc_train_wo.csv"
+      ],
+      "response_format": "json_array",
+      "gold": [
+        {
+          "rank": 1,
+          "failure_code": "Minor in-service problems",
+          "count": 109
+        },
+        {
+          "rank": 2,
+          "failure_code": "Breakdown",
+          "count": 70
+        },
+        {
+          "rank": 3,
+          "failure_code": "Structural deficiency",
+          "count": 58
+        }
+      ],
+      "gold_source": "fmc_train_wo.csv (verified by record count)"
+    }
+  },
+  {
+    "type": "WorkOrder",
+    "text": "Learn the description-to-failure-code patterns from our historical work orders. Then go through work orders TST-WO00001 through TST-WO00010 and, for each one with a blank failure code, pick a single failure code from the set of codes used in history and write it back to the record. If a failure code is already recorded, leave that record as-is. Reply with only a JSON array of exactly 10 objects in wo_id order, in this shape:\n[\n  {\"wo_id\": \"TST-WO00001\", \"failure_code\": \"<code>\"},\n  {\"wo_id\": \"TST-WO00002\", \"failure_code\": \"<code>\"},\n  ...\n  {\"wo_id\": \"TST-WO00010\", \"failure_code\": \"<code>\"}\n]\nUse the failure code spelling exactly as it appears in the data. Output nothing outside the JSON array.",
+    "category": "Failure Code Imputation",
+    "characteristic_form": "The expected response should learn description-to-code patterns from the historical work orders, then impute and write back a single failure code for each of TST-WO00001 through TST-WO00010, returning a 10-object JSON array in wo_id order. Per-record gold is taken from the answer key.",
+    "expected_answer": "[{\"wo_id\": \"TST-WO00001\", \"failure_code\": \"Minor in-service problems\"}, {\"wo_id\": \"TST-WO00002\", \"failure_code\": \"Fail to function\"}, {\"wo_id\": \"TST-WO00003\", \"failure_code\": \"Plugged / choked\"}, {\"wo_id\": \"TST-WO00004\", \"failure_code\": \"Breakdown\"}, {\"wo_id\": \"TST-WO00005\", \"failure_code\": \"Minor in-service problems\"}, {\"wo_id\": \"TST-WO00006\", \"failure_code\": \"Electrical\"}, {\"wo_id\": \"TST-WO00007\", \"failure_code\": \"Fail to function\"}, {\"wo_id\": \"TST-WO00008\", \"failure_code\": \"Minor in-service problems\"}, {\"wo_id\": \"TST-WO00009\", \"failure_code\": \"Electrical\"}, {\"wo_id\": \"TST-WO00010\", \"failure_code\": \"Minor in-service problems\"}]",
+    "scoring_method": "json_match",
+    "metadata": {
+      "scenario_label": "S4",
+      "task": "batch-fill + write-back",
+      "subtitle": "failure-code imputation from history",
+      "target_work_orders": [
+        "TST-WO00001",
+        "TST-WO00002",
+        "TST-WO00003",
+        "TST-WO00004",
+        "TST-WO00005",
+        "TST-WO00006",
+        "TST-WO00007",
+        "TST-WO00008",
+        "TST-WO00009",
+        "TST-WO00010"
+      ],
+      "write_back": true,
+      "learning_source": "fmc_train_wo.csv",
+      "data_sources": [
+        "fmc_train_wo.csv",
+        "fmc_test_wo.csv"
+      ],
+      "response_format": "json_array",
+      "gold": [
+        {
+          "wo_id": "TST-WO00001",
+          "failure_code": "Minor in-service problems"
+        },
+        {
+          "wo_id": "TST-WO00002",
+          "failure_code": "Fail to function"
+        },
+        {
+          "wo_id": "TST-WO00003",
+          "failure_code": "Plugged / choked"
+        },
+        {
+          "wo_id": "TST-WO00004",
+          "failure_code": "Breakdown"
+        },
+        {
+          "wo_id": "TST-WO00005",
+          "failure_code": "Minor in-service problems"
+        },
+        {
+          "wo_id": "TST-WO00006",
+          "failure_code": "Electrical"
+        },
+        {
+          "wo_id": "TST-WO00007",
+          "failure_code": "Fail to function"
+        },
+        {
+          "wo_id": "TST-WO00008",
+          "failure_code": "Minor in-service problems"
+        },
+        {
+          "wo_id": "TST-WO00009",
+          "failure_code": "Electrical"
+        },
+        {
+          "wo_id": "TST-WO00010",
+          "failure_code": "Minor in-service problems"
+        }
+      ],
+      "gold_source": "fmc_test_answer_key.csv"
+    }
+  },
+  {
+    "type": "WorkOrder",
+    "text": "Learn the description-to-failure-code patterns from our historical work orders. Then go through every test work order with a blank failure code, pick a single failure code from the set of codes used in history, and write it back to the record. After every blank has been written back, rank the top three failure codes by record count across just the records you filled. Reply with only a JSON array of exactly 3 strings — the failure codes ranked 1, 2, 3 — using the failure code spelling exactly as it appears in the data. Output nothing outside the JSON array.\n[\"<rank-1 code>\", \"<rank-2 code>\", \"<rank-3 code>\"]",
+    "category": "Distribution Analysis",
+    "characteristic_form": "The expected response should impute and write back a single failure code for every blank test work order (TST-WO00001 through TST-WO00062), then return the top three filled codes by count as a 3-element JSON array of strings. Correct: Minor in-service problems (17), Breakdown (7), then a rank-3 tie between Electrical and Plugged / choked at count 6 — either is accepted at position 3. The write-back is verified independently against the answer key.",
+    "expected_answer": "[\"Minor in-service problems\", \"Breakdown\", \"Electrical\"]",
+    "scoring_method": "json_match",
+    "metadata": {
+      "scenario_label": "S5",
+      "task": "full-batch fill + top-3 distribution",
+      "subtitle": "impute all missing failure codes and rank",
+      "target_work_orders": "all blank-failure_code records in fmc_test_wo.csv (62 records, TST-WO00001 through TST-WO00062)",
+      "write_back": true,
+      "learning_source": "fmc_train_wo.csv",
+      "data_sources": [
+        "fmc_train_wo.csv",
+        "fmc_test_wo.csv"
+      ],
+      "response_format": "json_array_of_strings",
+      "gold": [
+        "Minor in-service problems",
+        "Breakdown",
+        "Electrical"
+      ],
+      "accepted_alternatives": [
+        [
+          "Minor in-service problems",
+          "Breakdown",
+          "Plugged / choked"
+        ]
+      ],
+      "tie_note": "Rank 3 is a tie between Electrical and Plugged / choked at count 6; either is accepted at position 3.",
+      "verification_note": "The 3-element list is scored, but the per-record write-back is verified independently against fmc_test_answer_key.csv (e.g. score(fill) in fmc_analytics_scenarios.py). A correct top-3 list with incorrect write-backs does not pass.",
+      "gold_source": "fmc_test_answer_key.csv"
+    }
+  }
+]

From c2225fcc072cb15eb618e1242088a00948a7b0c9 Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 12:56:13 -0400
Subject: [PATCH 04/10] feat: add FMC work-order failure-code tools to wo MCP
 server

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 src/servers/wo/data.py                 |  28 ++++
 src/servers/wo/fmc_tools.py            | 180 +++++++++++++++++++++++++
 src/servers/wo/main.py                 |   6 +-
 src/servers/wo/models.py               |  40 ++++++
 src/servers/wo/tests/test_fmc_tools.py | 169 +++++++++++++++++++++++
 5 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 src/servers/wo/fmc_tools.py
 create mode 100644 src/servers/wo/tests/test_fmc_tools.py

diff --git a/src/servers/wo/data.py b/src/servers/wo/data.py
index a2c6bf73..2d9b62a0 100644
--- a/src/servers/wo/data.py
+++ b/src/servers/wo/data.py
@@ -114,6 +114,34 @@ def load(dataset: str) -> Optional[pd.DataFrame]:
         return None
 
 
+def write_failure_code(wo_id: str, failure_code: str) -> Optional[bool]:
+    """Persist *failure_code* onto the ``wo_fmc`` record identified by *wo_id*.
+
+    Returns ``True`` on a successful update, ``False`` when no matching record
+    exists, and ``None`` when CouchDB is unavailable or the lookup/save raises.
+    A successful write drops the cached ``wo_fmc`` DataFrame so later reads see it.
+    """
+    db = _get_db()
+    if db is None:
+        return None
+    try:
+        result = db.find(
+            selector={"dataset": {"$eq": "wo_fmc"}, "wo_id": {"$eq": wo_id}},
+            limit=1,
+        )
+        docs = result.get("docs", [])
+        if not docs:
+            return False
+        doc = docs[0]
+        doc["failure_code"] = failure_code
+        db.save(doc)
+        _dataset_cache.pop("wo_fmc", None)  # invalidate only after the save succeeded
+        return True
+    except Exception as exc:  # boundary handler: keep the traceback so the cause is visible in logs
+        logger.error("Failed to write failure_code for '%s': %s", wo_id, exc, exc_info=True)
+        return None
+
+
 # ---------------------------------------------------------------------------
 # Query helpers
 # ---------------------------------------------------------------------------
diff --git a/src/servers/wo/fmc_tools.py b/src/servers/wo/fmc_tools.py
new file mode 100644
index 00000000..f3f69432
--- /dev/null
+++ b/src/servers/wo/fmc_tools.py
@@ -0,0 +1,180 @@
+"""Failure-mode classification (FMC) tools for the Work Order MCP server.
+
+These tools operate on the ``wo_fmc`` dataset — work orders keyed by ``wo_id``
+(prefix ``TRN-`` for historical/training records, ``TST-`` for test records)
+carrying a free-text ``description`` and a plain-text ``failure_code`` such as
+``"Breakdown"`` or ``"Overheating"``.  This is distinct from the
+equipment-keyed ``wo_events`` dataset used by the other work-order tools, which
+classifies on structured ``MTxxx`` primary/secondary codes.
+
+The workflow they support: read a work order, learn description-to-code
+patterns from the historical (``train``) split, impute a failure code, write it
+back, and rank failure codes by frequency.
+"""
+
+from collections import Counter
+from typing import List, Optional, Union
+
+import pandas as pd
+
+from .data import load, write_failure_code
+from .models import (
+    ErrorResult,
+    FmcCodeCount,
+    FmcCodeDistributionResult,
+    FmcWorkOrder,
+    FmcWorkOrdersResult,
+    FmcWriteResult,
+)
+
+_FMC_DATASET = "wo_fmc"
+_SPLIT_PREFIX = {"train": "TRN", "test": "TST"}
+_VALID_SPLITS = ("all", "train", "test")
+
+
+def _code(value) -> Optional[str]:
+    """Normalise a ``failure_code`` cell to a non-empty string, or ``None``."""
+    if value is None or (isinstance(value, float) and pd.isna(value)):
+        return None
+    text = str(value).strip()
+    return text or None
+
+
+def _apply_split(df: pd.DataFrame, split: str) -> pd.DataFrame:
+    """Filter *df* to a ``wo_id`` prefix split (``train``/``test``); ``all`` is a no-op."""
+    prefix = _SPLIT_PREFIX.get(split.lower())
+    if prefix is None:
+        return df
+    return df[df["wo_id"].str.startswith(prefix)]
+
+
+def get_work_order_failure_code(wo_id: str) -> Union[FmcWorkOrder, ErrorResult]:
+    """Retrieve a single failure-mode work order by its ``wo_id``.
+
+    Returns the work order's free-text description and its recorded failure
+    code.  ``failure_code`` is null when none has been recorded yet.
+
+    Args:
+        wo_id: Work order identifier, e.g. ``"TST-WO00032"``.
+    """
+    df = load(_FMC_DATASET)
+    if df is None:
+        return ErrorResult(error="FMC work order data not available")
+    match = df[df["wo_id"] == wo_id]
+    if match.empty:
+        return ErrorResult(error=f"No work order found with wo_id '{wo_id}'")
+    row = match.iloc[0]  # first match wins if several rows share the wo_id
+    return FmcWorkOrder(
+        wo_id=str(row["wo_id"]),
+        description=_code(row.get("description")) or "",  # NaN is truthy: str(NaN or "") gave "nan"
+        failure_code=_code(row.get("failure_code")),
+    )
+
+
+def list_work_order_failure_codes(split: str = "all") -> Union[FmcWorkOrdersResult, ErrorResult]:
+    """List failure-mode work orders with their descriptions and recorded codes.
+
+    Use ``split="train"`` for the historical/labelled records (to learn
+    description-to-code patterns), ``split="test"`` for the records to be
+    classified, or ``split="all"`` (default) for everything.
+
+    Args:
+        split: One of ``"all"``, ``"train"`` (TRN- records), or ``"test"`` (TST- records).
+    """
+    if split.lower() not in _VALID_SPLITS:
+        return ErrorResult(error=f"split must be 'all', 'train', or 'test', got '{split}'")
+    df = load(_FMC_DATASET)
+    if df is None:
+        return ErrorResult(error="FMC work order data not available")
+    sub = _apply_split(df, split)
+    if sub.empty:
+        return ErrorResult(error=f"No work orders found for split '{split.lower()}'")
+
+    items: List[FmcWorkOrder] = []
+    labeled = 0
+    for _, row in sub.iterrows():
+        code = _code(row.get("failure_code"))
+        if code is not None:
+            labeled += 1
+        items.append(
+            FmcWorkOrder(
+                wo_id=str(row["wo_id"]),
+                description=_code(row.get("description")) or "",  # NaN is truthy: str(NaN or "") gave "nan"
+                failure_code=code,
+            )
+        )
+    return FmcWorkOrdersResult(
+        split=split.lower(),
+        total=len(items),
+        labeled=labeled,
+        unlabeled=len(items) - labeled,
+        work_orders=items,
+        message=(
+            f"Found {len(items)} work order(s) for split '{split.lower()}' "
+            f"({labeled} labelled, {len(items) - labeled} unlabelled)."
+        ),
+    )
+
+
+def set_work_order_failure_code(wo_id: str, failure_code: str) -> Union[FmcWriteResult, ErrorResult]:
+    """Write (impute) a failure code onto a failure-mode work order record.
+
+    Persists ``failure_code`` to the ``wo_fmc`` record identified by ``wo_id``
+    in CouchDB and returns the confirmed value.
+
+    Args:
+        wo_id: Work order identifier, e.g. ``"TST-WO00054"``.
+        failure_code: Failure code to record, e.g. ``"Vibration"``.
+    """
+    code = (failure_code or "").strip()
+    if not code:
+        return ErrorResult(error="failure_code must be a non-empty string")
+    result = write_failure_code(wo_id, code)
+    if result is None:
+        return ErrorResult(error="FMC work order data not available")
+    if result is False:
+        return ErrorResult(error=f"No work order found with wo_id '{wo_id}'")
+    return FmcWriteResult(
+        wo_id=wo_id,
+        failure_code=code,
+        updated=True,
+        message=f"Recorded failure_code '{code}' on work order '{wo_id}'.",
+    )
+
+
+def get_failure_code_distribution(
+    split: str = "all", top_n: Optional[int] = None
+) -> Union[FmcCodeDistributionResult, ErrorResult]:
+    """Rank failure codes by record count across the failure-mode dataset.
+
+    Counts only records that have a recorded failure code, sorted by count
+    descending.  Use ``split="train"`` to rank across historical records or
+    ``split="test"`` to rank across the (imputed) test records.
+
+    Args:
+        split: One of ``"all"``, ``"train"``, or ``"test"``.
+        top_n: If given, return only the top N codes.
+    """
+    if split.lower() not in _VALID_SPLITS:
+        return ErrorResult(error=f"split must be 'all', 'train', or 'test', got '{split}'")
+    df = load(_FMC_DATASET)
+    if df is None:
+        return ErrorResult(error="FMC work order data not available")
+    sub = _apply_split(df, split)
+    codes = [c for c in (_code(v) for v in sub.get("failure_code", [])) if c is not None]
+    if not codes:
+        return ErrorResult(error=f"No recorded failure codes for split '{split.lower()}'")
+
+    counts = Counter(codes)
+    ranked = counts.most_common(top_n)
+    distribution = [FmcCodeCount(failure_code=code, count=count) for code, count in ranked]
+    return FmcCodeDistributionResult(
+        split=split.lower(),
+        total_records=int(len(sub)),
+        labeled_records=len(codes),
+        distribution=distribution,
+        message=(
+            f"Ranked {len(distribution)} failure code(s) across {len(codes)} "
+            f"labelled record(s) in split '{split.lower()}'."
+        ),
+    )
diff --git a/src/servers/wo/main.py b/src/servers/wo/main.py
index 1dbde8ed..780cf63b 100644
--- a/src/servers/wo/main.py
+++ b/src/servers/wo/main.py
@@ -19,7 +19,7 @@
 mcp = FastMCP("wo", instructions="Work order analytics: query work orders, events, failure codes, and predict maintenance patterns.")
 
 # Register tools — imported after mcp is created to avoid circular imports.
-from . import tools  # noqa: E402
+from . import fmc_tools, tools  # noqa: E402
 
 _TOOLS = [
     (tools.get_work_orders, "Get Work Orders"),
@@ -30,6 +30,10 @@
     (tools.get_work_order_distribution, "Get Work Order Distribution"),
     (tools.predict_next_work_order, "Predict Next Work Order"),
     (tools.analyze_alert_to_failure, "Analyze Alert to Failure"),
+    (fmc_tools.get_work_order_failure_code, "Get Work Order Failure Code"),
+    (fmc_tools.list_work_order_failure_codes, "List Work Order Failure Codes"),
+    (fmc_tools.set_work_order_failure_code, "Set Work Order Failure Code"),
+    (fmc_tools.get_failure_code_distribution, "Get Failure Code Distribution"),
 ]
 for _fn, _title in _TOOLS:
     mcp.tool(title=_title)(_fn)
diff --git a/src/servers/wo/models.py b/src/servers/wo/models.py
index e962282e..3a794a68 100644
--- a/src/servers/wo/models.py
+++ b/src/servers/wo/models.py
@@ -146,3 +146,43 @@ class AlertToFailureResult(BaseModel):
     total_alerts_analyzed: int
     transitions: List[AlertToFailureEntry]
     message: str
+
+
+# ---------------------------------------------------------------------------
+# Failure-mode classification (wo_fmc dataset)
+# ---------------------------------------------------------------------------
+
+
+class FmcWorkOrder(BaseModel):
+    wo_id: str
+    description: str
+    failure_code: Optional[str]
+
+
+class FmcWorkOrdersResult(BaseModel):
+    split: str
+    total: int
+    labeled: int
+    unlabeled: int
+    work_orders: List[FmcWorkOrder]
+    message: str
+
+
+class FmcWriteResult(BaseModel):
+    wo_id: str
+    failure_code: str
+    updated: bool
+    message: str
+
+
+class FmcCodeCount(BaseModel):
+    failure_code: str
+    count: int
+
+
+class FmcCodeDistributionResult(BaseModel):
+    split: str
+    total_records: int
+    labeled_records: int
+    distribution: List[FmcCodeCount]
+    message: str
diff --git a/src/servers/wo/tests/test_fmc_tools.py b/src/servers/wo/tests/test_fmc_tools.py
new file mode 100644
index 00000000..ccb6552b
--- /dev/null
+++ b/src/servers/wo/tests/test_fmc_tools.py
@@ -0,0 +1,169 @@
+"""Unit tests for the failure-mode classification (wo_fmc) tools."""
+
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+from servers.wo import fmc_tools
+from servers.wo.models import (
+    ErrorResult,
+    FmcCodeDistributionResult,
+    FmcWorkOrder,
+    FmcWorkOrdersResult,
+    FmcWriteResult,
+)
+
+
+def _make_fmc_df() -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "wo_id": [
+                "TRN-WO00001",
+                "TRN-WO00002",
+                "TRN-WO00003",
+                "TRN-WO00004",
+                "TST-WO00001",
+                "TST-WO00002",
+            ],
+            "description": [
+                "falure",
+                "unserviceable",
+                "bogged",
+                "leaking seal",
+                "ejected",
+                "hot joint",
+            ],
+            "failure_code": [
+                "Breakdown",
+                "Breakdown",
+                "Plugged / choked",
+                "Leaking",
+                None,
+                None,
+            ],
+        }
+    )
+
+
+@pytest.fixture
+def mock_load():
+    with patch("servers.wo.fmc_tools.load", side_effect=lambda key: _make_fmc_df() if key == "wo_fmc" else None):
+        yield
+
+
+# --- get_work_order_failure_code -------------------------------------------
+
+
+def test_get_labeled_record(mock_load):
+    res = fmc_tools.get_work_order_failure_code("TRN-WO00001")
+    assert isinstance(res, FmcWorkOrder)
+    assert res.wo_id == "TRN-WO00001"
+    assert res.description == "falure"
+    assert res.failure_code == "Breakdown"
+
+
+def test_get_blank_record_has_null_code(mock_load):
+    res = fmc_tools.get_work_order_failure_code("TST-WO00001")
+    assert isinstance(res, FmcWorkOrder)
+    assert res.failure_code is None
+
+
+def test_get_missing_record(mock_load):
+    res = fmc_tools.get_work_order_failure_code("TST-WO99999")
+    assert isinstance(res, ErrorResult)
+
+
+def test_get_no_data():
+    with patch("servers.wo.fmc_tools.load", return_value=None):
+        res = fmc_tools.get_work_order_failure_code("TRN-WO00001")
+        assert isinstance(res, ErrorResult)
+
+
+# --- list_work_order_failure_codes -----------------------------------------
+
+
+def test_list_train_split(mock_load):
+    res = fmc_tools.list_work_order_failure_codes(split="train")
+    assert isinstance(res, FmcWorkOrdersResult)
+    assert res.total == 4
+    assert res.labeled == 4
+    assert res.unlabeled == 0
+    assert all(wo.wo_id.startswith("TRN") for wo in res.work_orders)
+
+
+def test_list_test_split_unlabeled(mock_load):
+    res = fmc_tools.list_work_order_failure_codes(split="test")
+    assert isinstance(res, FmcWorkOrdersResult)
+    assert res.total == 2
+    assert res.labeled == 0
+    assert res.unlabeled == 2
+
+
+def test_list_all_default(mock_load):
+    res = fmc_tools.list_work_order_failure_codes()
+    assert isinstance(res, FmcWorkOrdersResult)
+    assert res.total == 6
+
+
+def test_list_invalid_split(mock_load):
+    res = fmc_tools.list_work_order_failure_codes(split="bogus")
+    assert isinstance(res, ErrorResult)
+
+
+# --- set_work_order_failure_code -------------------------------------------
+
+
+def test_set_success():
+    with patch("servers.wo.fmc_tools.write_failure_code", return_value=True) as mock_write:
+        res = fmc_tools.set_work_order_failure_code("TST-WO00001", "Overheating")
+        assert isinstance(res, FmcWriteResult)
+        assert res.updated is True
+        assert res.failure_code == "Overheating"
+        mock_write.assert_called_once_with("TST-WO00001", "Overheating")
+
+
+def test_set_missing_record():
+    with patch("servers.wo.fmc_tools.write_failure_code", return_value=False):
+        res = fmc_tools.set_work_order_failure_code("TST-WO99999", "Overheating")
+        assert isinstance(res, ErrorResult)
+
+
+def test_set_no_db():
+    with patch("servers.wo.fmc_tools.write_failure_code", return_value=None):
+        res = fmc_tools.set_work_order_failure_code("TST-WO00001", "Overheating")
+        assert isinstance(res, ErrorResult)
+
+
+def test_set_empty_code_rejected():
+    res = fmc_tools.set_work_order_failure_code("TST-WO00001", "   ")
+    assert isinstance(res, ErrorResult)
+
+
+# --- get_failure_code_distribution -----------------------------------------
+
+
+def test_distribution_train_ranked(mock_load):
+    res = fmc_tools.get_failure_code_distribution(split="train")
+    assert isinstance(res, FmcCodeDistributionResult)
+    assert res.labeled_records == 4
+    # Breakdown (2) ranks first; remaining tied at 1
+    assert res.distribution[0].failure_code == "Breakdown"
+    assert res.distribution[0].count == 2
+
+
+def test_distribution_top_n(mock_load):
+    res = fmc_tools.get_failure_code_distribution(split="train", top_n=1)
+    assert isinstance(res, FmcCodeDistributionResult)
+    assert len(res.distribution) == 1
+    assert res.distribution[0].failure_code == "Breakdown"
+
+
+def test_distribution_test_empty_when_unfilled(mock_load):
+    res = fmc_tools.get_failure_code_distribution(split="test")
+    assert isinstance(res, ErrorResult)
+
+
+def test_distribution_invalid_split(mock_load):
+    res = fmc_tools.get_failure_code_distribution(split="bogus")
+    assert isinstance(res, ErrorResult)

From 5a81fd6105a34bbe5808ba5d5747fa681dc560a8 Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 13:05:36 -0400
Subject: [PATCH 05/10] refactor: filter FMC work orders by labeled status
 instead of train/test split

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 src/evaluation/loader.py                     | 23 ++++--
 src/evaluation/tests/test_loader.py          |  5 +-
 src/scenarios/local/workorder_utterance.json | 36 +++++++--
 src/servers/wo/fmc_tools.py                  | 82 ++++++++++----------
 src/servers/wo/models.py                     |  2 -
 src/servers/wo/tests/test_fmc_tools.py       | 38 ++++-----
 6 files changed, 107 insertions(+), 79 deletions(-)

diff --git a/src/evaluation/loader.py b/src/evaluation/loader.py
index 31b9c761..432c11fe 100644
--- a/src/evaluation/loader.py
+++ b/src/evaluation/loader.py
@@ -54,6 +54,18 @@ def load_scenarios(paths: Iterable[Path] | Path) -> list[Scenario]:
     return out
 
 
+def _coerce_scenario(item: dict, index: int) -> Scenario:
+    """Build a Scenario, assigning a positional fallback id when none is set.
+
+    Scenario files may omit ``id`` (e.g. draft utterance banks); the 1-based
+    *index* then fills the join key without baking ids into the on-disk file.
+    Only a missing, ``None`` or empty ``id`` counts as unset; ``0`` is kept.
+    """
+    if item.get("id") in (None, ""):
+        item = {**item, "id": index}
+    return Scenario.from_raw(item)
+
+
 def _load_scenario_file(path: Path) -> list[Scenario]:
     text = path.read_text(encoding="utf-8").strip()
     if not text:
@@ -61,16 +73,17 @@ def _load_scenario_file(path: Path) -> list[Scenario]:
 
     if path.suffix == ".jsonl":
         return [
-            Scenario.from_raw(json.loads(line))
-            for line in text.splitlines()
-            if line.strip()
+            _coerce_scenario(json.loads(line), i)
+            for i, line in enumerate(
+                (ln for ln in text.splitlines() if ln.strip()), start=1
+            )
         ]
 
     raw = json.loads(text)
     if isinstance(raw, list):
-        return [Scenario.from_raw(item) for item in raw]
+        return [_coerce_scenario(item, i) for i, item in enumerate(raw, start=1)]
     if isinstance(raw, dict):
-        return [Scenario.from_raw(raw)]
+        return [_coerce_scenario(raw, 1)]
     raise ValueError(f"unexpected scenario JSON shape in {path}: {type(raw).__name__}")
 
 
diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py
index 3c31bf7d..e40d6cf5 100644
--- a/src/evaluation/tests/test_loader.py
+++ b/src/evaluation/tests/test_loader.py
@@ -72,10 +72,11 @@ def test_workorder_scenarios_load_and_conform():
         assert s.text.strip()
         assert s.category.strip()
         assert s.characteristic_form and s.characteristic_form.strip()
-    # IDs are unique and at least one scenario targets failure-code categorization.
+    # IDs are unique (positional fallback when omitted on disk) and at least
+    # one scenario targets failure-code imputation.
     ids = [s.id for s in scenarios]
     assert len(ids) == len(set(ids))
-    assert any(s.category == "Categorization" for s in scenarios)
+    assert any(s.category == "Failure Code Imputation" for s in scenarios)
 
 
 def test_join_drops_orphans(make_persisted_record):
diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json
index 7cc45eba..79dead44 100644
--- a/src/scenarios/local/workorder_utterance.json
+++ b/src/scenarios/local/workorder_utterance.json
@@ -33,7 +33,10 @@
         "wo_id": "TST-WO00032",
         "failure_code": "Overheating"
       },
-      "gold_source": "fmc_test_answer_key.csv"
+      "gold_source": "fmc_test_answer_key.csv",
+      "expected_tools": [
+        "get_work_order_failure_code"
+      ]
     }
   },
   {
@@ -71,14 +74,18 @@
         "wo_id": "TST-WO00054",
         "failure_code": "Vibration"
       },
-      "gold_source": "fmc_test_answer_key.csv"
+      "gold_source": "fmc_test_answer_key.csv",
+      "expected_tools": [
+        "get_work_order_failure_code",
+        "set_work_order_failure_code"
+      ]
     }
   },
   {
     "type": "WorkOrder",
     "text": "Look across our historical work orders and rank the top three failure codes by record count. Reply with only a JSON array of exactly three objects, ordered 1, 2, 3, in this shape:\n[\n  {\"rank\": 1, \"failure_code\": \"<code>\", \"count\": <int>},\n  {\"rank\": 2, \"failure_code\": \"<code>\", \"count\": <int>},\n  {\"rank\": 3, \"failure_code\": \"<code>\", \"count\": <int>}\n]\nUse the failure code spelling exactly as it appears in the data. If two codes are tied at the rank-3 cut-off, include both at \"rank\": 3 and add a fourth object {\"tie_at_rank_3\": true} at the end of the array. Output nothing outside the JSON array.",
     "category": "Distribution Analysis",
-    "characteristic_form": "The expected response should count failure codes across the historical training work orders and return the top three by record count as a JSON array of {rank, failure_code, count} objects, codes spelled exactly as in the data. Correct: Minor in-service problems (109), Breakdown (70), Structural deficiency (58).",
+    "characteristic_form": "The expected response should call get_failure_code_distribution (which ranks every recorded/labelled failure code by record count) and return the top three as a JSON array of {rank, failure_code, count} objects, codes spelled exactly as in the data. In the baseline state the labelled records are the historical work orders, so the correct answer is Minor in-service problems (109), Breakdown (70), Structural deficiency (58).",
     "expected_answer": "[{\"rank\": 1, \"failure_code\": \"Minor in-service problems\", \"count\": 109}, {\"rank\": 2, \"failure_code\": \"Breakdown\", \"count\": 70}, {\"rank\": 3, \"failure_code\": \"Structural deficiency\", \"count\": 58}]",
     "scoring_method": "json_match",
     "metadata": {
@@ -107,7 +114,11 @@
           "count": 58
         }
       ],
-      "gold_source": "fmc_train_wo.csv (verified by record count)"
+      "gold_source": "fmc_train_wo.csv (verified by record count)",
+      "expected_tools": [
+        "get_failure_code_distribution"
+      ],
+      "assumes_state": "test/target records still blank (only historical records labelled)"
     }
   },
   {
@@ -182,14 +193,20 @@
           "failure_code": "Minor in-service problems"
         }
       ],
-      "gold_source": "fmc_test_answer_key.csv"
+      "gold_source": "fmc_test_answer_key.csv",
+      "expected_tools": [
+        "list_work_order_failure_codes",
+        "get_work_order_failure_code",
+        "set_work_order_failure_code"
+      ],
+      "learning_via": "list_work_order_failure_codes(labeled=True)"
     }
   },
   {
     "type": "WorkOrder",
     "text": "Learn the description-to-failure-code patterns from our historical work orders. Then go through every test work order with a blank failure code, pick a single failure code from the set of codes used in history, and write it back to the record. After every blank has been written back, rank the top three failure codes by record count across just the records you filled. Reply with only a JSON array of exactly 3 strings — the failure codes ranked 1, 2, 3 — using the failure code spelling exactly as it appears in the data. Output nothing outside the JSON array.\n[\"<rank-1 code>\", \"<rank-2 code>\", \"<rank-3 code>\"]",
     "category": "Distribution Analysis",
-    "characteristic_form": "The expected response should impute and write back a single failure code for every blank test work order (TST-WO00001 through TST-WO00062), then return the top three filled codes by count as a 3-element JSON array of strings. Correct: Minor in-service problems (17), Breakdown (7), then a rank-3 tie between Electrical and Plugged / choked at count 6 — either is accepted at position 3. The write-back is verified independently against the answer key.",
+    "characteristic_form": "The expected response should list the blank records via list_work_order_failure_codes(labeled=False), impute and write back a single failure code for each via set_work_order_failure_code, then rank the top three failure codes by count across the records it imputed (computed from its own write set — get_failure_code_distribution ranks ALL recorded codes, including history, so it cannot isolate the filled subset). Correct: Minor in-service problems (17), Breakdown (7), then a rank-3 tie between Electrical and Plugged / choked at count 6 — either is accepted at position 3. The write-back is verified independently against the answer key.",
     "expected_answer": "[\"Minor in-service problems\", \"Breakdown\", \"Electrical\"]",
     "scoring_method": "json_match",
     "metadata": {
@@ -218,7 +235,12 @@
       ],
       "tie_note": "Rank 3 is a tie between Electrical and Plugged / choked at count 6; either is accepted at position 3.",
       "verification_note": "The 3-element list is scored, but the per-record write-back is verified independently against fmc_test_answer_key.csv (e.g. score(fill) in fmc_analytics_scenarios.py). A correct top-3 list with incorrect write-backs does not pass.",
-      "gold_source": "fmc_test_answer_key.csv"
+      "gold_source": "fmc_test_answer_key.csv",
+      "expected_tools": [
+        "list_work_order_failure_codes",
+        "set_work_order_failure_code"
+      ],
+      "distribution_note": "get_failure_code_distribution cannot scope to 'just the filled records'; the rank-3 list is computed by the agent from its own imputations."
     }
   }
 ]
diff --git a/src/servers/wo/fmc_tools.py b/src/servers/wo/fmc_tools.py
index f3f69432..ca24190a 100644
--- a/src/servers/wo/fmc_tools.py
+++ b/src/servers/wo/fmc_tools.py
@@ -8,8 +8,12 @@
 classifies on structured ``MTxxx`` primary/secondary codes.
 
 The workflow they support: read a work order, learn description-to-code
-patterns from the historical (``train``) split, impute a failure code, write it
+patterns from the already-labelled records, impute a failure code, write it
 back, and rank failure codes by frequency.
+
+Records are filtered by whether a failure code has been recorded
+(``labeled``) rather than by any train/test tag — the ``wo_fmc`` dataset
+carries no such tag.
 """
 
 from collections import Counter
@@ -28,8 +32,6 @@
 )
 
 _FMC_DATASET = "wo_fmc"
-_SPLIT_PREFIX = {"train": "TRN", "test": "TST"}
-_VALID_SPLITS = ("all", "train", "test")
 
 
 def _code(value) -> Optional[str]:
@@ -40,12 +42,9 @@ def _code(value) -> Optional[str]:
     return text or None
 
 
-def _apply_split(df: pd.DataFrame, split: str) -> pd.DataFrame:
-    """Filter *df* to a ``wo_id`` prefix split (``train``/``test``); ``all`` is a no-op."""
-    prefix = _SPLIT_PREFIX.get(split.lower())
-    if prefix is None:
-        return df
-    return df[df["wo_id"].str.startswith(prefix)]
+def _is_labeled(value) -> bool:
+    """Return True when ``_code(value)`` is not None, i.e. *value* is a non-blank failure code."""
+    return _code(value) is not None
 
 
 def get_work_order_failure_code(wo_id: str) -> Union[FmcWorkOrder, ErrorResult]:
@@ -71,31 +70,33 @@ def get_work_order_failure_code(wo_id: str) -> Union[FmcWorkOrder, ErrorResult]:
     )
 
 
-def list_work_order_failure_codes(split: str = "all") -> Union[FmcWorkOrdersResult, ErrorResult]:
+def list_work_order_failure_codes(
+    labeled: Optional[bool] = None,
+) -> Union[FmcWorkOrdersResult, ErrorResult]:
     """List failure-mode work orders with their descriptions and recorded codes.
 
-    Use ``split="train"`` for the historical/labelled records (to learn
-    description-to-code patterns), ``split="test"`` for the records to be
-    classified, or ``split="all"`` (default) for everything.
+    Use ``labeled=True`` for the records that already have a failure code (to
+    learn description-to-code patterns), ``labeled=False`` for the blank
+    records still to be classified, or omit it (default) for everything.
 
     Args:
-        split: One of ``"all"``, ``"train"`` (TRN- records), or ``"test"`` (TST- records).
+        labeled: If ``True``, only records with a recorded failure code; if
+            ``False``, only records with a blank failure code; if omitted, all.
     """
-    if split.lower() not in _VALID_SPLITS:
-        return ErrorResult(error=f"split must be 'all', 'train', or 'test', got '{split}'")
     df = load(_FMC_DATASET)
     if df is None:
         return ErrorResult(error="FMC work order data not available")
-    sub = _apply_split(df, split)
-    if sub.empty:
-        return ErrorResult(error=f"No work orders found for split '{split.lower()}'")
 
     items: List[FmcWorkOrder] = []
-    labeled = 0
-    for _, row in sub.iterrows():
+    labeled_count = 0
+    for _, row in df.iterrows():
         code = _code(row.get("failure_code"))
+        if labeled is True and code is None:
+            continue
+        if labeled is False and code is not None:
+            continue
         if code is not None:
-            labeled += 1
+            labeled_count += 1
         items.append(
             FmcWorkOrder(
                 wo_id=str(row["wo_id"]),
@@ -103,15 +104,17 @@ def list_work_order_failure_codes(split: str = "all") -> Union[FmcWorkOrdersResu
                 failure_code=code,
             )
         )
+    if not items:
+        return ErrorResult(error="No matching work orders found")
+
     return FmcWorkOrdersResult(
-        split=split.lower(),
         total=len(items),
-        labeled=labeled,
-        unlabeled=len(items) - labeled,
+        labeled=labeled_count,
+        unlabeled=len(items) - labeled_count,
         work_orders=items,
         message=(
-            f"Found {len(items)} work order(s) for split '{split.lower()}' "
-            f"({labeled} labelled, {len(items) - labeled} unlabelled)."
+            f"Found {len(items)} work order(s) "
+            f"({labeled_count} labelled, {len(items) - labeled_count} unlabelled)."
         ),
     )
 
@@ -143,38 +146,33 @@ def set_work_order_failure_code(wo_id: str, failure_code: str) -> Union[FmcWrite
 
 
 def get_failure_code_distribution(
-    split: str = "all", top_n: Optional[int] = None
+    top_n: Optional[int] = None,
 ) -> Union[FmcCodeDistributionResult, ErrorResult]:
-    """Rank failure codes by record count across the failure-mode dataset.
+    """Rank recorded failure codes by record count across the failure-mode dataset.
 
-    Counts only records that have a recorded failure code, sorted by count
-    descending.  Use ``split="train"`` to rank across historical records or
-    ``split="test"`` to rank across the (imputed) test records.
+    Counts every record that has a recorded failure code, sorted by count
+    descending.  (Blank records are ignored, so this ranks the labelled
+    population.)
 
     Args:
-        split: One of ``"all"``, ``"train"``, or ``"test"``.
         top_n: If given, return only the top N codes.
     """
-    if split.lower() not in _VALID_SPLITS:
-        return ErrorResult(error=f"split must be 'all', 'train', or 'test', got '{split}'")
     df = load(_FMC_DATASET)
     if df is None:
         return ErrorResult(error="FMC work order data not available")
-    sub = _apply_split(df, split)
-    codes = [c for c in (_code(v) for v in sub.get("failure_code", [])) if c is not None]
+    codes = [c for c in (_code(v) for v in df.get("failure_code", [])) if c is not None]
     if not codes:
-        return ErrorResult(error=f"No recorded failure codes for split '{split.lower()}'")
+        return ErrorResult(error="No recorded failure codes found")
 
     counts = Counter(codes)
     ranked = counts.most_common(top_n)
     distribution = [FmcCodeCount(failure_code=code, count=count) for code, count in ranked]
     return FmcCodeDistributionResult(
-        split=split.lower(),
-        total_records=int(len(sub)),
+        total_records=int(len(df)),
         labeled_records=len(codes),
         distribution=distribution,
         message=(
-            f"Ranked {len(distribution)} failure code(s) across {len(codes)} "
-            f"labelled record(s) in split '{split.lower()}'."
+            f"Ranked {len(distribution)} failure code(s) across "
+            f"{len(codes)} labelled record(s)."
         ),
     )
diff --git a/src/servers/wo/models.py b/src/servers/wo/models.py
index 3a794a68..0ec9901c 100644
--- a/src/servers/wo/models.py
+++ b/src/servers/wo/models.py
@@ -160,7 +160,6 @@ class FmcWorkOrder(BaseModel):
 
 
 class FmcWorkOrdersResult(BaseModel):
-    split: str
     total: int
     labeled: int
     unlabeled: int
@@ -181,7 +180,6 @@ class FmcCodeCount(BaseModel):
 
 
 class FmcCodeDistributionResult(BaseModel):
-    split: str
     total_records: int
     labeled_records: int
     distribution: List[FmcCodeCount]
diff --git a/src/servers/wo/tests/test_fmc_tools.py b/src/servers/wo/tests/test_fmc_tools.py
index ccb6552b..7b6f06be 100644
--- a/src/servers/wo/tests/test_fmc_tools.py
+++ b/src/servers/wo/tests/test_fmc_tools.py
@@ -83,32 +83,30 @@ def test_get_no_data():
 # --- list_work_order_failure_codes -----------------------------------------
 
 
-def test_list_train_split(mock_load):
-    res = fmc_tools.list_work_order_failure_codes(split="train")
+def test_list_labeled_only(mock_load):
+    res = fmc_tools.list_work_order_failure_codes(labeled=True)
     assert isinstance(res, FmcWorkOrdersResult)
     assert res.total == 4
     assert res.labeled == 4
     assert res.unlabeled == 0
-    assert all(wo.wo_id.startswith("TRN") for wo in res.work_orders)
+    assert all(wo.failure_code is not None for wo in res.work_orders)
 
 
-def test_list_test_split_unlabeled(mock_load):
-    res = fmc_tools.list_work_order_failure_codes(split="test")
+def test_list_unlabeled_only(mock_load):
+    res = fmc_tools.list_work_order_failure_codes(labeled=False)
     assert isinstance(res, FmcWorkOrdersResult)
     assert res.total == 2
     assert res.labeled == 0
     assert res.unlabeled == 2
+    assert all(wo.failure_code is None for wo in res.work_orders)
 
 
 def test_list_all_default(mock_load):
     res = fmc_tools.list_work_order_failure_codes()
     assert isinstance(res, FmcWorkOrdersResult)
     assert res.total == 6
-
-
-def test_list_invalid_split(mock_load):
-    res = fmc_tools.list_work_order_failure_codes(split="bogus")
-    assert isinstance(res, ErrorResult)
+    assert res.labeled == 4
+    assert res.unlabeled == 2
 
 
 # --- set_work_order_failure_code -------------------------------------------
@@ -143,9 +141,10 @@ def test_set_empty_code_rejected():
 # --- get_failure_code_distribution -----------------------------------------
 
 
-def test_distribution_train_ranked(mock_load):
-    res = fmc_tools.get_failure_code_distribution(split="train")
+def test_distribution_ranked(mock_load):
+    res = fmc_tools.get_failure_code_distribution()
     assert isinstance(res, FmcCodeDistributionResult)
+    assert res.total_records == 6
     assert res.labeled_records == 4
     # Breakdown (2) ranks first; remaining tied at 1
     assert res.distribution[0].failure_code == "Breakdown"
@@ -153,17 +152,14 @@ def test_distribution_train_ranked(mock_load):
 
 
 def test_distribution_top_n(mock_load):
-    res = fmc_tools.get_failure_code_distribution(split="train", top_n=1)
+    res = fmc_tools.get_failure_code_distribution(top_n=1)
     assert isinstance(res, FmcCodeDistributionResult)
     assert len(res.distribution) == 1
     assert res.distribution[0].failure_code == "Breakdown"
 
 
-def test_distribution_test_empty_when_unfilled(mock_load):
-    res = fmc_tools.get_failure_code_distribution(split="test")
-    assert isinstance(res, ErrorResult)
-
-
-def test_distribution_invalid_split(mock_load):
-    res = fmc_tools.get_failure_code_distribution(split="bogus")
-    assert isinstance(res, ErrorResult)
+def test_distribution_empty_when_no_codes():
+    blank = pd.DataFrame({"wo_id": ["TST-WO00001"], "description": ["x"], "failure_code": [None]})
+    with patch("servers.wo.fmc_tools.load", return_value=blank):
+        res = fmc_tools.get_failure_code_distribution()
+        assert isinstance(res, ErrorResult)

From 2c8e26f3a06818c9dcf49318e824509e9c23c504 Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 13:12:02 -0400
Subject: [PATCH 06/10] chore: add quick Claude-agent smoke-test script for FMC
 scenarios

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 scripts/test_fmc_claude_agent.py | 116 +++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 scripts/test_fmc_claude_agent.py

diff --git a/scripts/test_fmc_claude_agent.py b/scripts/test_fmc_claude_agent.py
new file mode 100644
index 00000000..116d77d2
--- /dev/null
+++ b/scripts/test_fmc_claude_agent.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+"""Quick smoke test: run the Claude agent over the FMC work-order scenarios.
+
+Loads the scenarios from ``src/scenarios/local/workorder_utterance.json``, runs
+each one through ``ClaudeAgentRunner`` (which connects the ``wo`` MCP server and
+its FMC tools), and prints the agent's answer next to the expected answer plus
+the tools it actually called.
+
+Write-back scenarios (S2/S4/S5) mutate CouchDB; by default this script
+re-blanks every ``TST-`` record afterwards so the evaluation dataset stays
+pristine. Pass ``--no-restore`` to leave the imputations in place (e.g. to
+inspect the write-back independently).
+
+Requires CouchDB up (``workorder`` DB loaded) and LITELLM_* env vars in .env.
+
+Usage:
+    uv run python scripts/test_fmc_claude_agent.py                 # all scenarios
+    uv run python scripts/test_fmc_claude_agent.py S1 S3           # only S1 and S3
+    uv run python scripts/test_fmc_claude_agent.py --no-restore
+    uv run python scripts/test_fmc_claude_agent.py --show-trajectory
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import sys
+from pathlib import Path
+
+_ROOT = Path(__file__).resolve().parent.parent
+_SRC = _ROOT / "src"
+sys.path.insert(0, str(_SRC))
+
+_SCENARIOS = _SRC / "scenarios" / "local" / "workorder_utterance.json"
+_HR = "=" * 72
+
+
+def _load_scenarios(labels: set[str]) -> list[dict]:
+    """Read the scenario bank, keeping entries whose label is in *labels* (all if empty)."""
+    import json
+
+    bank = json.loads(_SCENARIOS.read_text(encoding="utf-8"))
+
+    def _wanted(entry: dict) -> bool:
+        tag = entry.get("metadata", {}).get("scenario_label", "")
+        return not labels or tag in labels
+
+    return [entry for entry in bank if _wanted(entry)]
+
+
+async def _run(args: argparse.Namespace) -> None:
+    from agent.claude_agent.runner import ClaudeAgentRunner
+    from servers.wo.data import load, write_failure_code
+
+    scenarios = _load_scenarios(set(args.labels))
+    if not scenarios:
+        print(f"No scenarios matched {args.labels!r}")
+        return
+
+    runner = ClaudeAgentRunner(model=args.model_id, max_turns=args.max_turns)
+    needs_restore = False  # becomes True once any selected scenario has metadata "write_back" set
+
+    for s in scenarios:  # NOTE(review): an exception raised in this loop skips the restore step after it
+        md = s["metadata"]  # NOTE(review): KeyError if a scenario has no "metadata"; _load_scenarios tolerates that
+        label = md.get("scenario_label", "?")
+        needs_restore = needs_restore or bool(md.get("write_back"))
+
+        print(f"\n{_HR}\n{label} · {md.get('subtitle', '')}\n{_HR}")
+        print(f"Q: {s['text']}\n")
+
+        result = await runner.run(s["text"])
+
+        tools_used = [tc.name for tc in result.trajectory.all_tool_calls]  # names of all tool calls in the trajectory
+        print(f"EXPECTED      : {s.get('expected_answer')}")
+        print(f"AGENT ANSWER  : {result.answer}")
+        print(f"EXPECTED TOOLS: {md.get('expected_tools')}")
+        print(f"TOOLS USED    : {tools_used}")
+        print(
+            f"(turns={len(result.trajectory.turns)}, "
+            f"tool_calls={len(tools_used)}, "
+            f"out_tokens={result.trajectory.total_output_tokens})"
+        )
+        if args.show_trajectory:
+            for t in result.trajectory.turns:
+                for tc in t.tool_calls:
+                    print(f"    → {tc.name}({tc.input})")
+
+    if needs_restore and not args.no_restore:
+        df = load("wo_fmc")
+        restored = 0
+        if df is not None:
+            for wo_id in df.loc[df["wo_id"].str.startswith("TST"), "wo_id"]:
+                if write_failure_code(str(wo_id), None) is True:
+                    restored += 1
+        print(f"\n[restore] re-blanked {restored} TST- record(s) to keep the dataset pristine.")
+    elif needs_restore:
+        print("\n[restore] skipped (--no-restore); TST- records keep their imputed codes.")
+
+
+def main() -> None:
+    """Parse the CLI options, load ``_ROOT / ".env"``, then run ``_run`` under asyncio."""
+    cli = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    cli.add_argument("labels", nargs="*", help="Scenario labels to run (e.g. S1 S3). Default: all.")
+    cli.add_argument("--model-id", default="litellm_proxy/aws/claude-opus-4-6", help="Model string.")
+    cli.add_argument("--max-turns", type=int, default=30, help="Max agentic loop turns.")
+    cli.add_argument("--no-restore", action="store_true", help="Leave write-back imputations in CouchDB.")
+    cli.add_argument("--show-trajectory", action="store_true", help="Print each tool call.")
+    opts = cli.parse_args()
+
+    from dotenv import load_dotenv
+    load_dotenv(_ROOT / ".env")
+    asyncio.run(_run(opts))
+
+
+if __name__ == "__main__":
+    main()

From 91dbff217723537e6efe0e75adc0f18087390bdc Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 13:26:31 -0400
Subject: [PATCH 07/10] feat: make FMC failure-code write-back a batch
 operation

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 scripts/test_fmc_claude_agent.py             | 10 ++--
 src/scenarios/local/workorder_utterance.json | 12 ++--
 src/servers/wo/data.py                       | 57 ++++++++++++------
 src/servers/wo/fmc_tools.py                  | 62 +++++++++++++-------
 src/servers/wo/main.py                       |  2 +-
 src/servers/wo/models.py                     | 15 ++++-
 src/servers/wo/tests/test_fmc_tools.py       | 57 +++++++++++++-----
 7 files changed, 150 insertions(+), 65 deletions(-)

diff --git a/scripts/test_fmc_claude_agent.py b/scripts/test_fmc_claude_agent.py
index 116d77d2..2f03d0ba 100644
--- a/scripts/test_fmc_claude_agent.py
+++ b/scripts/test_fmc_claude_agent.py
@@ -50,7 +50,7 @@ def _load_scenarios(labels: set[str]) -> list[dict]:
 
 async def _run(args: argparse.Namespace) -> None:
     from agent.claude_agent.runner import ClaudeAgentRunner
-    from servers.wo.data import load, write_failure_code
+    from servers.wo.data import load, write_failure_codes
 
     scenarios = _load_scenarios(set(args.labels))
     if not scenarios:
@@ -87,11 +87,11 @@ async def _run(args: argparse.Namespace) -> None:
 
     if needs_restore and not args.no_restore:
         df = load("wo_fmc")
-        restored = 0
+        status = {}
         if df is not None:
-            for wo_id in df.loc[df["wo_id"].str.startswith("TST"), "wo_id"]:
-                if write_failure_code(str(wo_id), None) is True:
-                    restored += 1
+            tst_ids = [str(w) for w in df.loc[df["wo_id"].str.startswith("TST"), "wo_id"]]
+            status = write_failure_codes({wo_id: None for wo_id in tst_ids}) or {}
+        restored = sum(1 for ok in status.values() if ok)
         print(f"\n[restore] re-blanked {restored} TST- record(s) to keep the dataset pristine.")
     elif needs_restore:
         print("\n[restore] skipped (--no-restore); TST- records keep their imputed codes.")
diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json
index 79dead44..c9b1465b 100644
--- a/src/scenarios/local/workorder_utterance.json
+++ b/src/scenarios/local/workorder_utterance.json
@@ -77,7 +77,7 @@
       "gold_source": "fmc_test_answer_key.csv",
       "expected_tools": [
         "get_work_order_failure_code",
-        "set_work_order_failure_code"
+        "set_work_order_failure_codes"
       ]
     }
   },
@@ -197,9 +197,10 @@
       "expected_tools": [
         "list_work_order_failure_codes",
         "get_work_order_failure_code",
-        "set_work_order_failure_code"
+        "set_work_order_failure_codes"
       ],
-      "learning_via": "list_work_order_failure_codes(labeled=True)"
+      "learning_via": "list_work_order_failure_codes(labeled=True)",
+      "write_hint": "write all 10 imputations in one set_work_order_failure_codes call"
     }
   },
   {
@@ -238,9 +239,10 @@
       "gold_source": "fmc_test_answer_key.csv",
       "expected_tools": [
         "list_work_order_failure_codes",
-        "set_work_order_failure_code"
+        "set_work_order_failure_codes"
       ],
-      "distribution_note": "get_failure_code_distribution cannot scope to 'just the filled records'; the rank-3 list is computed by the agent from its own imputations."
+      "distribution_note": "get_failure_code_distribution cannot scope to 'just the filled records'; the rank-3 list is computed by the agent from its own imputations.",
+      "write_hint": "write all imputations in one set_work_order_failure_codes call"
     }
   }
 ]
diff --git a/src/servers/wo/data.py b/src/servers/wo/data.py
index 2d9b62a0..3f8ec696 100644
--- a/src/servers/wo/data.py
+++ b/src/servers/wo/data.py
@@ -114,32 +114,55 @@ def load(dataset: str) -> Optional[pd.DataFrame]:
         return None
 
 
-def write_failure_code(wo_id: str, failure_code: str) -> Optional[bool]:
-    """Persist *failure_code* onto the ``wo_fmc`` record identified by *wo_id*.
-
-    Returns ``True`` on a successful update, ``False`` when no matching record
-    exists, and ``None`` when CouchDB is unavailable.  Invalidates the cached
-    ``wo_fmc`` DataFrame so subsequent reads reflect the write.
+def write_failure_codes(updates: Dict[str, Optional[str]]) -> Optional[Dict[str, bool]]:
+    """Persist failure codes onto multiple ``wo_fmc`` records in one round-trip.
+
+    *updates* maps ``wo_id`` → ``failure_code`` (``None`` blanks the field).
+    Fetches all targets with a single ``$in`` query and writes them with one
+    ``_bulk_docs`` call.  Returns a ``{wo_id: updated}`` status map (``False``
+    for ids with no matching record), or ``None`` when CouchDB is unavailable.
+    Invalidates the cached ``wo_fmc`` DataFrame when anything was written.
     """
+    if not updates:
+        return {}
     db = _get_db()
     if db is None:
         return None
     try:
         result = db.find(
-            selector={"dataset": {"$eq": "wo_fmc"}, "wo_id": {"$eq": wo_id}},
-            limit=1,
+            selector={"dataset": {"$eq": "wo_fmc"}, "wo_id": {"$in": list(updates)}},
+            limit=len(updates) + 1,
         )
-        docs = result.get("docs", [])
-        if not docs:
-            return False
-        doc = docs[0]
-        doc["failure_code"] = failure_code
-        db.save(doc)
-        _dataset_cache.pop("wo_fmc", None)
-        return True
+        by_id = {doc["wo_id"]: doc for doc in result.get("docs", [])}
+        status: Dict[str, bool] = {}
+        to_save = []
+        for wo_id, failure_code in updates.items():
+            doc = by_id.get(wo_id)
+            if doc is None:
+                status[wo_id] = False
+                continue
+            doc["failure_code"] = failure_code
+            to_save.append(doc)
+            status[wo_id] = True
+        if to_save:
+            db.bulk_docs(to_save)
+            _dataset_cache.pop("wo_fmc", None)
+        return status
     except Exception as exc:
-        logger.error("Failed to write failure_code for '%s': %s", wo_id, exc)
+        logger.error("Failed to write failure codes for %s: %s", list(updates), exc)
+        return None
+
+
+def write_failure_code(wo_id: str, failure_code: Optional[str]) -> Optional[bool]:
+    """Persist a single *failure_code* onto the ``wo_fmc`` record *wo_id*.
+
+    Thin wrapper over :func:`write_failure_codes`.  Returns ``True`` on update,
+    ``False`` when no matching record exists, ``None`` when CouchDB is down.
+    """
+    result = write_failure_codes({wo_id: failure_code})
+    if result is None:
         return None
+    return result.get(wo_id, False)
 
 
 # ---------------------------------------------------------------------------
diff --git a/src/servers/wo/fmc_tools.py b/src/servers/wo/fmc_tools.py
index ca24190a..d815656c 100644
--- a/src/servers/wo/fmc_tools.py
+++ b/src/servers/wo/fmc_tools.py
@@ -21,14 +21,16 @@
 
 import pandas as pd
 
-from .data import load, write_failure_code
+from .data import load, write_failure_codes
 from .models import (
     ErrorResult,
+    FmcBatchWriteResult,
+    FmcCodeAssignment,
     FmcCodeCount,
     FmcCodeDistributionResult,
     FmcWorkOrder,
     FmcWorkOrdersResult,
-    FmcWriteResult,
+    FmcWriteEntry,
 )
 
 _FMC_DATASET = "wo_fmc"
@@ -119,29 +121,49 @@ def list_work_order_failure_codes(
     )
 
 
-def set_work_order_failure_code(wo_id: str, failure_code: str) -> Union[FmcWriteResult, ErrorResult]:
-    """Write (impute) a failure code onto a failure-mode work order record.
+def set_work_order_failure_codes(
+    assignments: List[FmcCodeAssignment],
+) -> Union[FmcBatchWriteResult, ErrorResult]:
+    """Write (impute) failure codes onto one or more work order records.
 
-    Persists ``failure_code`` to the ``wo_fmc`` record identified by ``wo_id``
-    in CouchDB and returns the confirmed value.
+    Persists each ``failure_code`` to its ``wo_fmc`` record in CouchDB in a
+    single batch.  Pass a one-element list to set a single record, or many to
+    impute a whole batch at once (preferred over many single calls).
 
     Args:
-        wo_id: Work order identifier, e.g. ``"TST-WO00054"``.
-        failure_code: Failure code to record, e.g. ``"Vibration"``.
+        assignments: List of ``{"wo_id": ..., "failure_code": ...}`` items,
+            e.g. ``[{"wo_id": "TST-WO00054", "failure_code": "Vibration"}]``.
     """
-    code = (failure_code or "").strip()
-    if not code:
-        return ErrorResult(error="failure_code must be a non-empty string")
-    result = write_failure_code(wo_id, code)
-    if result is None:
+    if not assignments:
+        return ErrorResult(error="assignments must be a non-empty list")
+
+    updates: dict = {}  # wo_id -> stripped failure_code, in request order
+    for a in assignments:  # validate everything first: one bad item rejects the batch before any write
+        code = (a.failure_code or "").strip()
+        if not code:
+            return ErrorResult(error=f"failure_code for '{a.wo_id}' must be a non-empty string")
+        if a.wo_id in updates:  # same record assigned twice is ambiguous, so refuse
+            return ErrorResult(error=f"duplicate wo_id in assignments: '{a.wo_id}'")
+        updates[a.wo_id] = code
+
+    status = write_failure_codes(updates)  # per-id True/False map, or None on failure
+    if status is None:
         return ErrorResult(error="FMC work order data not available")
-    if result is False:
-        return ErrorResult(error=f"No work order found with wo_id '{wo_id}'")
-    return FmcWriteResult(
-        wo_id=wo_id,
-        failure_code=code,
-        updated=True,
-        message=f"Recorded failure_code '{code}' on work order '{wo_id}'.",
+
+    results = [  # ids absent from status (or mapped to False) are reported as not updated
+        FmcWriteEntry(wo_id=wo_id, failure_code=code, updated=bool(status.get(wo_id)))
+        for wo_id, code in updates.items()
+    ]
+    updated = sum(1 for r in results if r.updated)
+    missing = [r.wo_id for r in results if not r.updated]
+    message = f"Recorded {updated}/{len(results)} failure code(s)."
+    if missing:  # partial success still returns a result; the misses are named in the message
+        message += f" No record found for: {', '.join(missing)}."
+    return FmcBatchWriteResult(
+        total=len(results),
+        updated=updated,
+        results=results,
+        message=message,
     )
 
 
diff --git a/src/servers/wo/main.py b/src/servers/wo/main.py
index 780cf63b..9f70083b 100644
--- a/src/servers/wo/main.py
+++ b/src/servers/wo/main.py
@@ -32,7 +32,7 @@
     (tools.analyze_alert_to_failure, "Analyze Alert to Failure"),
     (fmc_tools.get_work_order_failure_code, "Get Work Order Failure Code"),
     (fmc_tools.list_work_order_failure_codes, "List Work Order Failure Codes"),
-    (fmc_tools.set_work_order_failure_code, "Set Work Order Failure Code"),
+    (fmc_tools.set_work_order_failure_codes, "Set Work Order Failure Codes"),
     (fmc_tools.get_failure_code_distribution, "Get Failure Code Distribution"),
 ]
 for _fn, _title in _TOOLS:
diff --git a/src/servers/wo/models.py b/src/servers/wo/models.py
index 0ec9901c..cf9d90e0 100644
--- a/src/servers/wo/models.py
+++ b/src/servers/wo/models.py
@@ -167,10 +167,23 @@ class FmcWorkOrdersResult(BaseModel):
     message: str
 
 
-class FmcWriteResult(BaseModel):
+class FmcCodeAssignment(BaseModel):
+    """A single (wo_id, failure_code) write request."""
+
+    wo_id: str  # work order identifier, e.g. "TST-WO00054"
+    failure_code: str  # code to record, e.g. "Vibration"
+
+
+class FmcWriteEntry(BaseModel):  # per-record outcome listed in FmcBatchWriteResult.results
     wo_id: str
     failure_code: str
     updated: bool
+
+
+class FmcBatchWriteResult(BaseModel):  # summary returned by set_work_order_failure_codes
+    total: int  # number of assignments processed
+    updated: int  # how many of them were actually written
+    results: List[FmcWriteEntry]  # one entry per assignment
     message: str
 
 
diff --git a/src/servers/wo/tests/test_fmc_tools.py b/src/servers/wo/tests/test_fmc_tools.py
index 7b6f06be..bf89482e 100644
--- a/src/servers/wo/tests/test_fmc_tools.py
+++ b/src/servers/wo/tests/test_fmc_tools.py
@@ -8,10 +8,11 @@
 from servers.wo import fmc_tools
 from servers.wo.models import (
     ErrorResult,
+    FmcBatchWriteResult,
+    FmcCodeAssignment,
     FmcCodeDistributionResult,
     FmcWorkOrder,
     FmcWorkOrdersResult,
-    FmcWriteResult,
 )
 
 
@@ -109,32 +110,56 @@ def test_list_all_default(mock_load):
     assert res.unlabeled == 2
 
 
-# --- set_work_order_failure_code -------------------------------------------
+# --- set_work_order_failure_codes ------------------------------------------
 
 
-def test_set_success():
-    with patch("servers.wo.fmc_tools.write_failure_code", return_value=True) as mock_write:
-        res = fmc_tools.set_work_order_failure_code("TST-WO00001", "Overheating")
-        assert isinstance(res, FmcWriteResult)
-        assert res.updated is True
-        assert res.failure_code == "Overheating"
-        mock_write.assert_called_once_with("TST-WO00001", "Overheating")
+def _asg(wo_id: str, code: str) -> FmcCodeAssignment:  # shorthand for building one assignment
+    return FmcCodeAssignment(wo_id=wo_id, failure_code=code)
 
 
-def test_set_missing_record():
-    with patch("servers.wo.fmc_tools.write_failure_code", return_value=False):
-        res = fmc_tools.set_work_order_failure_code("TST-WO99999", "Overheating")
-        assert isinstance(res, ErrorResult)
+def test_set_single():  # a one-element list is the single-record path
+    with patch("servers.wo.fmc_tools.write_failure_codes", return_value={"TST-WO00001": True}) as mock_write:
+        res = fmc_tools.set_work_order_failure_codes([_asg("TST-WO00001", "Overheating")])
+        assert isinstance(res, FmcBatchWriteResult)
+        assert res.total == 1
+        assert res.updated == 1
+        assert res.results[0].failure_code == "Overheating"
+        mock_write.assert_called_once_with({"TST-WO00001": "Overheating"})  # list is flattened to {wo_id: code}
+
+
+def test_set_batch_partial_missing():
+    status = {"TST-WO00001": True, "TST-WO99999": False}  # False simulates "no matching record"
+    with patch("servers.wo.fmc_tools.write_failure_codes", return_value=status):
+        res = fmc_tools.set_work_order_failure_codes(
+            [_asg("TST-WO00001", "Electrical"), _asg("TST-WO99999", "Breakdown")]
+        )
+        assert isinstance(res, FmcBatchWriteResult)  # partial success is a result, not an ErrorResult
+        assert res.total == 2
+        assert res.updated == 1
+        missing = [r.wo_id for r in res.results if not r.updated]
+        assert missing == ["TST-WO99999"]
 
 
 def test_set_no_db():
-    with patch("servers.wo.fmc_tools.write_failure_code", return_value=None):
-        res = fmc_tools.set_work_order_failure_code("TST-WO00001", "Overheating")
+    with patch("servers.wo.fmc_tools.write_failure_codes", return_value=None):
+        res = fmc_tools.set_work_order_failure_codes([_asg("TST-WO00001", "Overheating")])
         assert isinstance(res, ErrorResult)
 
 
+def test_set_empty_list_rejected():
+    res = fmc_tools.set_work_order_failure_codes([])  # no patch needed: rejected before any write call
+    assert isinstance(res, ErrorResult)
+
+
 def test_set_empty_code_rejected():
-    res = fmc_tools.set_work_order_failure_code("TST-WO00001", "   ")
+    res = fmc_tools.set_work_order_failure_codes([_asg("TST-WO00001", "   ")])
+    assert isinstance(res, ErrorResult)
+
+
+def test_set_duplicate_wo_id_rejected():
+    res = fmc_tools.set_work_order_failure_codes(
+        [_asg("TST-WO00001", "Electrical"), _asg("TST-WO00001", "Breakdown")]  # same wo_id, conflicting codes
+    )
     assert isinstance(res, ErrorResult)
 
 

From b43d68e1f026a19d4d08f6db194d7f69f5e1ee0f Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 13:30:17 -0400
Subject: [PATCH 08/10] feat: make FMC failure-code read a batch operation

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 src/scenarios/local/workorder_utterance.json |  6 +-
 src/servers/wo/fmc_tools.py                  | 64 ++++++++++++++++----
 src/servers/wo/main.py                       |  2 +-
 src/servers/wo/models.py                     |  3 +-
 src/servers/wo/tests/test_fmc_tools.py       | 50 ++++++++++-----
 5 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json
index c9b1465b..e3f47e08 100644
--- a/src/scenarios/local/workorder_utterance.json
+++ b/src/scenarios/local/workorder_utterance.json
@@ -35,7 +35,7 @@
       },
       "gold_source": "fmc_test_answer_key.csv",
       "expected_tools": [
-        "get_work_order_failure_code"
+        "get_work_order_failure_codes"
       ]
     }
   },
@@ -76,7 +76,7 @@
       },
       "gold_source": "fmc_test_answer_key.csv",
       "expected_tools": [
-        "get_work_order_failure_code",
+        "get_work_order_failure_codes",
         "set_work_order_failure_codes"
       ]
     }
@@ -196,7 +196,7 @@
       "gold_source": "fmc_test_answer_key.csv",
       "expected_tools": [
         "list_work_order_failure_codes",
-        "get_work_order_failure_code",
+        "get_work_order_failure_codes",
         "set_work_order_failure_codes"
       ],
       "learning_via": "list_work_order_failure_codes(labeled=True)",
diff --git a/src/servers/wo/fmc_tools.py b/src/servers/wo/fmc_tools.py
index d815656c..6fa44c96 100644
--- a/src/servers/wo/fmc_tools.py
+++ b/src/servers/wo/fmc_tools.py
@@ -49,26 +49,64 @@ def _is_labeled(value) -> bool:
     return _code(value) is not None
 
 
-def get_work_order_failure_code(wo_id: str) -> Union[FmcWorkOrder, ErrorResult]:
-    """Retrieve a single failure-mode work order by its ``wo_id``.
+def get_work_order_failure_codes(wo_ids: List[str]) -> Union[FmcWorkOrdersResult, ErrorResult]:
+    """Retrieve one or more failure-mode work orders by ``wo_id``.
 
-    Returns the work order's free-text description and its recorded failure
-    code.  ``failure_code`` is null when none has been recorded yet.
+    Returns each work order's free-text description and recorded failure code
+    (null when none has been recorded yet), in the requested order.  Pass a
+    one-element list to fetch a single record, or many to pull a batch at once
+    (preferred over many single calls).  Any ids with no matching record are
+    reported in ``missing``.
 
     Args:
-        wo_id: Work order identifier, e.g. ``"TST-WO00032"``.
+        wo_ids: Work order identifiers, e.g. ``["TST-WO00032"]`` or
+            ``["TST-WO00001", "TST-WO00002"]``.
     """
+    if not wo_ids:  # checked before the dataset is loaded
+        return ErrorResult(error="wo_ids must be a non-empty list")
     df = load(_FMC_DATASET)
     if df is None:
         return ErrorResult(error="FMC work order data not available")
-    match = df[df["wo_id"] == wo_id]
-    if match.empty:
-        return ErrorResult(error=f"No work order found with wo_id '{wo_id}'")
-    row = match.iloc[0]
-    return FmcWorkOrder(
-        wo_id=str(row["wo_id"]),
-        description=str(row.get("description", "") or ""),
-        failure_code=_code(row.get("failure_code")),
+
+    requested = list(dict.fromkeys(wo_ids))  # de-duplicate, preserve order
+    by_id = {  # index the matching rows by id; if an id repeats in the data, its last row wins
+        str(row["wo_id"]): row
+        for _, row in df[df["wo_id"].isin(requested)].iterrows()
+    }
+    items: List[FmcWorkOrder] = []
+    missing: List[str] = []
+    labeled = 0
+    for wo_id in requested:  # walk the request (not the frame) so output follows request order
+        row = by_id.get(wo_id)
+        if row is None:
+            missing.append(wo_id)
+            continue
+        code = _code(row.get("failure_code"))
+        if code is not None:  # a non-None code counts as labelled
+            labeled += 1
+        items.append(
+            FmcWorkOrder(
+                wo_id=wo_id,
+                description=str(row.get("description", "") or ""),
+                failure_code=code,
+            )
+        )
+    if not items:  # every requested id was missing: report an error rather than an empty result
+        return ErrorResult(error=f"No work orders found for: {', '.join(requested)}")
+
+    message = (
+        f"Found {len(items)} of {len(requested)} requested work order(s) "
+        f"({labeled} labelled, {len(items) - labeled} unlabelled)."
+    )
+    if missing:
+        message += f" Not found: {', '.join(missing)}."
+    return FmcWorkOrdersResult(
+        total=len(items),
+        labeled=labeled,
+        unlabeled=len(items) - labeled,
+        work_orders=items,
+        missing=missing,
+        message=message,
     )
 
 
diff --git a/src/servers/wo/main.py b/src/servers/wo/main.py
index 9f70083b..8217aa08 100644
--- a/src/servers/wo/main.py
+++ b/src/servers/wo/main.py
@@ -30,7 +30,7 @@
     (tools.get_work_order_distribution, "Get Work Order Distribution"),
     (tools.predict_next_work_order, "Predict Next Work Order"),
     (tools.analyze_alert_to_failure, "Analyze Alert to Failure"),
-    (fmc_tools.get_work_order_failure_code, "Get Work Order Failure Code"),
+    (fmc_tools.get_work_order_failure_codes, "Get Work Order Failure Codes"),
     (fmc_tools.list_work_order_failure_codes, "List Work Order Failure Codes"),
     (fmc_tools.set_work_order_failure_codes, "Set Work Order Failure Codes"),
     (fmc_tools.get_failure_code_distribution, "Get Failure Code Distribution"),
diff --git a/src/servers/wo/models.py b/src/servers/wo/models.py
index cf9d90e0..dfd341f6 100644
--- a/src/servers/wo/models.py
+++ b/src/servers/wo/models.py
@@ -1,7 +1,7 @@
 """Pydantic result models for the Work Order MCP server."""
 
 from typing import List, Optional
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 
 class ErrorResult(BaseModel):
@@ -164,6 +164,7 @@ class FmcWorkOrdersResult(BaseModel):
     labeled: int
     unlabeled: int
     work_orders: List[FmcWorkOrder]
+    missing: List[str] = Field(default_factory=list)
     message: str
 
 
diff --git a/src/servers/wo/tests/test_fmc_tools.py b/src/servers/wo/tests/test_fmc_tools.py
index bf89482e..55136dcf 100644
--- a/src/servers/wo/tests/test_fmc_tools.py
+++ b/src/servers/wo/tests/test_fmc_tools.py
@@ -11,7 +11,6 @@
     FmcBatchWriteResult,
     FmcCodeAssignment,
     FmcCodeDistributionResult,
-    FmcWorkOrder,
     FmcWorkOrdersResult,
 )
 
@@ -53,31 +52,54 @@ def mock_load():
         yield
 
 
-# --- get_work_order_failure_code -------------------------------------------
+# --- get_work_order_failure_codes ------------------------------------------
 
 
-def test_get_labeled_record(mock_load):
-    res = fmc_tools.get_work_order_failure_code("TRN-WO00001")
-    assert isinstance(res, FmcWorkOrder)
-    assert res.wo_id == "TRN-WO00001"
-    assert res.description == "falure"
-    assert res.failure_code == "Breakdown"
+def test_get_single_in_list(mock_load):
+    res = fmc_tools.get_work_order_failure_codes(["TRN-WO00001"])
+    assert isinstance(res, FmcWorkOrdersResult)  # even a single id returns the batch result type
+    assert res.total == 1
+    wo = res.work_orders[0]
+    assert wo.wo_id == "TRN-WO00001"
+    assert wo.description == "falure"  # (sic) presumably mirrors the fixture's misspelled text; verify before "fixing"
+    assert wo.failure_code == "Breakdown"
+    assert res.missing == []
 
 
 def test_get_blank_record_has_null_code(mock_load):
-    res = fmc_tools.get_work_order_failure_code("TST-WO00001")
-    assert isinstance(res, FmcWorkOrder)
-    assert res.failure_code is None
+    res = fmc_tools.get_work_order_failure_codes(["TST-WO00001"])
+    assert isinstance(res, FmcWorkOrdersResult)
+    assert res.work_orders[0].failure_code is None
+
+
+def test_get_batch_preserves_order(mock_load):
+    res = fmc_tools.get_work_order_failure_codes(["TST-WO00001", "TRN-WO00003", "TRN-WO00001"])
+    assert isinstance(res, FmcWorkOrdersResult)
+    assert [wo.wo_id for wo in res.work_orders] == ["TST-WO00001", "TRN-WO00003", "TRN-WO00001"]  # follows request order
+    assert res.labeled == 2
+    assert res.unlabeled == 1
+
+
+def test_get_batch_reports_missing(mock_load):
+    res = fmc_tools.get_work_order_failure_codes(["TRN-WO00001", "TST-WO99999"])
+    assert isinstance(res, FmcWorkOrdersResult)
+    assert res.total == 1  # total counts only the records that were found
+    assert res.missing == ["TST-WO99999"]
+
+
+def test_get_all_missing(mock_load):
+    res = fmc_tools.get_work_order_failure_codes(["TST-WO99999"])
+    assert isinstance(res, ErrorResult)  # no hits at all is an error, unlike a partial miss
 
 
-def test_get_missing_record(mock_load):
-    res = fmc_tools.get_work_order_failure_code("TST-WO99999")
+def test_get_empty_list_rejected(mock_load):
+    res = fmc_tools.get_work_order_failure_codes([])  # rejected up front, before the dataset is consulted
     assert isinstance(res, ErrorResult)
 
 
 def test_get_no_data():
     with patch("servers.wo.fmc_tools.load", return_value=None):
-        res = fmc_tools.get_work_order_failure_code("TRN-WO00001")
+        res = fmc_tools.get_work_order_failure_codes(["TRN-WO00001"])
         assert isinstance(res, ErrorResult)
 
 

From d0a8295861e53dfe12986e52057739f8315e3010 Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 13:35:47 -0400
Subject: [PATCH 09/10] feat: save per-scenario trajectories in FMC agent
 smoke-test script

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 scripts/test_fmc_claude_agent.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/scripts/test_fmc_claude_agent.py b/scripts/test_fmc_claude_agent.py
index 2f03d0ba..2868ba3f 100644
--- a/scripts/test_fmc_claude_agent.py
+++ b/scripts/test_fmc_claude_agent.py
@@ -24,7 +24,9 @@
 
 import argparse
 import asyncio
+import os
 import sys
+import uuid
 from pathlib import Path
 
 _ROOT = Path(__file__).resolve().parent.parent
@@ -50,6 +52,7 @@ def _load_scenarios(labels: set[str]) -> list[dict]:
 
 async def _run(args: argparse.Namespace) -> None:
     from agent.claude_agent.runner import ClaudeAgentRunner
+    from observability import set_run_context
     from servers.wo.data import load, write_failure_codes
 
     scenarios = _load_scenarios(set(args.labels))
@@ -57,6 +60,10 @@ async def _run(args: argparse.Namespace) -> None:
         print(f"No scenarios matched {args.labels!r}")
         return
 
+    traj_dir = os.environ.get("AGENT_TRAJECTORY_DIR")
+    if traj_dir:
+        print(f"[trajectory] saving per-scenario JSON to {traj_dir}")
+
     runner = ClaudeAgentRunner(model=args.model_id, max_turns=args.max_turns)
     needs_restore = False
 
@@ -65,6 +72,11 @@ async def _run(args: argparse.Namespace) -> None:
         label = md.get("scenario_label", "?")
         needs_restore = needs_restore or bool(md.get("write_back"))
 
+        # Fresh run_id per scenario so persist_trajectory writes one file each
+        # (keyed by run_id, with scenario_id recorded inside).
+        run_id = str(uuid.uuid4())
+        set_run_context(run_id=run_id, scenario_id=label)
+
         print(f"\n{_HR}\n{label} · {md.get('subtitle', '')}\n{_HR}")
         print(f"Q: {s['text']}\n")
 
@@ -80,6 +92,8 @@ async def _run(args: argparse.Namespace) -> None:
             f"tool_calls={len(tools_used)}, "
             f"out_tokens={result.trajectory.total_output_tokens})"
         )
+        if traj_dir:
+            print(f"SAVED         : {Path(traj_dir) / f'{run_id}.json'}  (scenario_id={label})")
         if args.show_trajectory:
             for t in result.trajectory.turns:
                 for tc in t.tool_calls:
@@ -104,11 +118,20 @@ def main() -> None:
     parser.add_argument("--max-turns", type=int, default=30, help="Max agentic loop turns.")
     parser.add_argument("--no-restore", action="store_true", help="Leave write-back imputations in CouchDB.")
     parser.add_argument("--show-trajectory", action="store_true", help="Print each tool call.")
+    parser.add_argument(
+        "--trajectory-dir",
+        metavar="DIR",
+        default=None,
+        help="Save a {run_id}.json trajectory per scenario to DIR "
+        "(overrides/sets AGENT_TRAJECTORY_DIR).",
+    )
     args = parser.parse_args()
 
     from dotenv import load_dotenv
 
     load_dotenv(_ROOT / ".env")
+    if args.trajectory_dir:
+        os.environ["AGENT_TRAJECTORY_DIR"] = args.trajectory_dir
     asyncio.run(_run(args))
 
 

From ac9e3ff982e7886fe28b9f04134668acebb238af Mon Sep 17 00:00:00 2001
From: Shuxin Lin <linshuhsin@gmail.com>
Date: Mon, 8 Jun 2026 13:39:21 -0400
Subject: [PATCH 10/10] feat: add wo_fmc CouchDB reset script to restore seed
 state

Signed-off-by: Shuxin Lin <linshuhsin@gmail.com>
---
 scripts/reset_fmc_workorders.py | 96 +++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 scripts/reset_fmc_workorders.py

diff --git a/scripts/reset_fmc_workorders.py b/scripts/reset_fmc_workorders.py
new file mode 100644
index 00000000..501bed4e
--- /dev/null
+++ b/scripts/reset_fmc_workorders.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""Reset the ``wo_fmc`` work orders in CouchDB back to their seed state.
+
+The failure-code write-back scenarios (S2/S4/S5) mutate ``wo_fmc`` records in
+the ``workorder`` database.  This restores every record's ``failure_code`` to
+the value in the seed CSV (``src/couchdb/sample_data/work_order/wo_fmc.csv``):
+TRN- records keep their historical codes, TST- records go back to blank.  Only
+records that have drifted from the seed are written.
+
+For a full rebuild of the entire ``workorder`` DB from all CSVs instead, use:
+    cd src && uv run python -m couchdb.init_wo --drop
+
+Usage:
+    uv run python scripts/reset_fmc_workorders.py
+    uv run python scripts/reset_fmc_workorders.py --dry-run
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import sys
+from pathlib import Path
+
+_ROOT = Path(__file__).resolve().parent.parent
+_SRC = _ROOT / "src"
+sys.path.insert(0, str(_SRC))
+
+_CSV = _SRC / "couchdb" / "sample_data" / "work_order" / "wo_fmc.csv"
+
+
+def _seed_codes() -> dict[str, str | None]:
+    """Map wo_id → seed failure_code (None for blank) from the CSV."""
+    seed: dict[str, str | None] = {}
+    with open(_CSV, newline="", encoding="utf-8") as f:  # newline="" is what the csv module asks for on file objects
+        for row in csv.DictReader(f):
+            code = (row.get("failure_code") or "").strip()  # a missing column/value is treated as blank
+            seed[row["wo_id"]] = code or None  # a repeated wo_id keeps its last row
+    return seed
+
+
+def main() -> None:  # CLI entry point: compare CouchDB with the seed CSV and write back only the drift
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true", help="Report drift without writing."
+    )
+    args = parser.parse_args()  # runs before the imports below, so --help exits without needing them
+
+    from dotenv import load_dotenv
+
+    load_dotenv(_ROOT / ".env")  # presumably so CouchDB settings are in the environment before the data import; confirm
+
+    import pandas as pd
+
+    from servers.wo.data import load, write_failure_codes
+
+    seed = _seed_codes()
+    blanks = sum(1 for v in seed.values() if v is None)
+    print(f"seed: {len(seed)} wo_fmc records ({len(seed) - blanks} labelled, {blanks} blank)")
+
+    df = load("wo_fmc")
+    if df is None:
+        print("CouchDB unavailable or wo_fmc not loaded — nothing to reset.")
+        sys.exit(1)
+
+    def _norm(v) -> str | None:  # same blank -> None normalisation that _seed_codes applies to the CSV
+        if pd.isna(v) or not str(v).strip():
+            return None
+        return str(v).strip()
+
+    current = {str(r["wo_id"]): _norm(r.get("failure_code")) for _, r in df.iterrows()}
+    drift = {wo_id: code for wo_id, code in seed.items() if current.get(wo_id) != code}  # seed ids only; ids absent from CouchDB compare as None
+
+    print(f"drifted from seed: {len(drift)} record(s)")
+    if not drift:
+        print("already at seed state — nothing to do.")
+        return
+    if args.dry_run:
+        for wo_id in list(drift)[:20]:  # preview at most 20 entries
+            print(f"  {wo_id}: {current.get(wo_id)!r} -> {drift[wo_id]!r}")
+        if len(drift) > 20:
+            print(f"  … and {len(drift) - 20} more")
+        return
+
+    status = write_failure_codes(drift)  # values may be None, which writes a blank code back
+    if status is None:
+        print("CouchDB unavailable — reset aborted.")
+        sys.exit(1)
+    restored = sum(1 for ok in status.values() if ok)  # a False status means no matching record was found
+    print(f"reset {restored}/{len(drift)} wo_fmc record(s) to seed state.")
+
+if __name__ == "__main__":
+    main()