Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions scripts/reset_fmc_workorders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python
"""Reset the ``wo_fmc`` work orders in CouchDB back to their seed state.

The failure-code write-back scenarios (S2/S4/S5) mutate ``wo_fmc`` records in
the ``workorder`` database. This restores every record's ``failure_code`` to
the value in the seed CSV (``src/couchdb/sample_data/work_order/wo_fmc.csv``):
TRN- records keep their historical codes, TST- records go back to blank. Only
records that have drifted from the seed are written.

For a full rebuild of the entire ``workorder`` DB from all CSVs instead, use:
cd src && uv run python -m couchdb.init_wo --drop

Usage:
uv run python scripts/reset_fmc_workorders.py
uv run python scripts/reset_fmc_workorders.py --dry-run
"""

from __future__ import annotations

import argparse
import csv
import sys
from pathlib import Path

# Repository root: two levels up from this file (scripts/<this file>).
_ROOT = Path(__file__).resolve().parent.parent
_SRC = _ROOT / "src"
# Put <root>/src at the front of the import path so the project imports done
# inside main() (e.g. ``servers.wo.data``) resolve when run as a script.
sys.path.insert(0, str(_SRC))

# Seed CSV holding the reference ``failure_code`` for every wo_fmc record.
_CSV = _SRC / "couchdb" / "sample_data" / "work_order" / "wo_fmc.csv"


def _seed_codes() -> dict[str, str | None]:
    """Read the seed CSV and return each work order's reference failure code.

    Returns:
        A dict keyed by the row's ``wo_id``. Each value is the stripped
        ``failure_code`` cell, or ``None`` when that cell is absent, empty or
        whitespace-only. If a ``wo_id`` appears more than once, the last row
        wins.
    """
    with open(_CSV, newline="", encoding="utf-8") as handle:
        return {
            record["wo_id"]: (record.get("failure_code") or "").strip() or None
            for record in csv.DictReader(handle)
        }


def main() -> None:
    """Restore drifted ``wo_fmc`` failure codes in CouchDB to their seed values.

    Parses the command line, compares every seed record against what ``load``
    currently returns, and writes back only the records whose failure code
    differs from the seed. With ``--dry-run`` the drift is reported (at most
    the first 20 records are listed) and nothing is written.

    Exits with status 1 when the ``wo_fmc`` data cannot be loaded, when
    ``write_failure_codes`` returns ``None``, or when fewer records were
    restored than had drifted, so callers can tell a clean reset from a
    partial one.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="Report drift without writing."
    )
    args = parser.parse_args()

    # Imports are deferred until after argument parsing so ``--help`` works
    # without the project dependencies. NOTE(review): .env is presumably loaded
    # first so the data layer sees its connection settings — confirm.
    from dotenv import load_dotenv

    load_dotenv(_ROOT / ".env")

    import pandas as pd

    from servers.wo.data import load, write_failure_codes

    seed = _seed_codes()
    blanks = sum(1 for v in seed.values() if v is None)
    print(f"seed: {len(seed)} wo_fmc records ({len(seed) - blanks} labelled, {blanks} blank)")

    df = load("wo_fmc")
    if df is None:
        print("CouchDB unavailable or wo_fmc not loaded — nothing to reset.")
        sys.exit(1)

    def _norm(v) -> str | None:
        """Map NaN/None/blank values to None; otherwise return stripped text."""
        if pd.isna(v):
            return None
        return str(v).strip() or None

    # Current state keyed by wo_id, normalised the same way as the seed so the
    # comparison below is blank-vs-blank safe.
    current = {str(r["wo_id"]): _norm(r.get("failure_code")) for _, r in df.iterrows()}
    # Only seed records are considered; a seed record missing from ``current``
    # compares as blank (None).
    drift = {wo_id: code for wo_id, code in seed.items() if current.get(wo_id) != code}

    print(f"drifted from seed: {len(drift)} record(s)")
    if not drift:
        print("already at seed state — nothing to do.")
        return
    if args.dry_run:
        for wo_id in list(drift)[:20]:
            print(f"  {wo_id}: {current.get(wo_id)!r} -> {drift[wo_id]!r}")
        if len(drift) > 20:
            print(f"  … and {len(drift) - 20} more")
        return

    status = write_failure_codes(drift)
    if status is None:
        print("CouchDB unavailable — reset aborted.")
        sys.exit(1)
    restored = sum(1 for ok in status.values() if ok)
    print(f"reset {restored}/{len(drift)} wo_fmc record(s) to seed state.")
    if restored < len(drift):
        # A partial reset leaves the dataset dirty; surface it in the exit
        # status instead of only in the printed ratio.
        print(
            f"{len(drift) - restored} wo_fmc record(s) could not be reset.",
            file=sys.stderr,
        )
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
139 changes: 139 additions & 0 deletions scripts/test_fmc_claude_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#!/usr/bin/env python
"""Quick smoke test: run the Claude agent over the FMC work-order scenarios.

Loads the scenarios from ``src/scenarios/local/workorder_utterance.json``, runs
each one through ``ClaudeAgentRunner`` (which connects the ``wo`` MCP server and
its FMC tools), and prints the agent's answer next to the expected answer plus
the tools it actually called.

Write-back scenarios (S2/S4/S5) mutate CouchDB; by default this script
re-blanks every ``TST-`` record afterwards so the evaluation dataset stays
pristine. Pass ``--no-restore`` to leave the imputations in place (e.g. to
inspect the write-back independently).

Requires CouchDB up (``workorder`` DB loaded) and LITELLM_* env vars in .env.

Usage:
uv run python scripts/test_fmc_claude_agent.py # all scenarios
uv run python scripts/test_fmc_claude_agent.py S1 S3 # only S1 and S3
uv run python scripts/test_fmc_claude_agent.py --no-restore
uv run python scripts/test_fmc_claude_agent.py --show-trajectory
"""

from __future__ import annotations

import argparse
import asyncio
import os
import sys
import uuid
from pathlib import Path

# Repository root: two levels up from this file (scripts/<this file>).
_ROOT = Path(__file__).resolve().parent.parent
_SRC = _ROOT / "src"
# Put <root>/src at the front of the import path so the project imports done
# inside _run() (agent, observability, servers) resolve when run as a script.
sys.path.insert(0, str(_SRC))

# JSON file listing the work-order scenarios to run.
_SCENARIOS = _SRC / "scenarios" / "local" / "workorder_utterance.json"
# 72-character rule printed above and below each scenario header.
_HR = "=" * 72


def _load_scenarios(labels: set[str]) -> list[dict]:
    """Load the scenario file and keep the entries matching *labels*.

    Args:
        labels: Scenario labels to keep. An empty set keeps every scenario.

    Returns:
        The scenarios, in file order, whose ``metadata.scenario_label`` is in
        *labels* (a missing label is treated as the empty string).
    """
    import json

    def _selected(entry: dict) -> bool:
        # The label is looked up for every entry, even when no filter is given.
        tag = entry.get("metadata", {}).get("scenario_label", "")
        return not labels or tag in labels

    entries = json.loads(_SCENARIOS.read_text(encoding="utf-8"))
    return [entry for entry in entries if _selected(entry)]


async def _run(args: argparse.Namespace) -> None:
    """Run the selected scenarios through the Claude agent and print a report.

    For each scenario this sets a fresh run context, awaits the agent, and
    prints the expected answer and tools next to what the agent produced.

    If any scenario that was started is flagged ``write_back`` in its
    metadata, every ``wo_fmc`` record whose ``wo_id`` starts with ``TST`` is
    re-blanked afterwards unless ``args.no_restore`` is set. The restore runs
    in a ``finally`` block, so it still happens when a scenario raises
    part-way through the run.

    Args:
        args: Parsed CLI options; reads ``labels``, ``model_id``, ``max_turns``,
            ``show_trajectory`` and ``no_restore``.
    """
    from agent.claude_agent.runner import ClaudeAgentRunner
    from observability import set_run_context
    from servers.wo.data import load, write_failure_codes

    scenarios = _load_scenarios(set(args.labels))
    if not scenarios:
        print(f"No scenarios matched {args.labels!r}")
        return

    traj_dir = os.environ.get("AGENT_TRAJECTORY_DIR")
    if traj_dir:
        print(f"[trajectory] saving per-scenario JSON to {traj_dir}")

    runner = ClaudeAgentRunner(model=args.model_id, max_turns=args.max_turns)
    needs_restore = False

    try:
        for s in scenarios:
            # Tolerate a missing "metadata" key, matching _load_scenarios.
            md = s.get("metadata", {})
            label = md.get("scenario_label", "?")
            # Flag before running: a scenario that fails midway may already
            # have written to CouchDB.
            needs_restore = needs_restore or bool(md.get("write_back"))

            # Fresh run_id per scenario so persist_trajectory writes one file each
            # (keyed by run_id, with scenario_id recorded inside).
            run_id = str(uuid.uuid4())
            set_run_context(run_id=run_id, scenario_id=label)

            print(f"\n{_HR}\n{label}  ·  {md.get('subtitle', '')}\n{_HR}")
            print(f"Q: {s['text']}\n")

            result = await runner.run(s["text"])

            tools_used = [tc.name for tc in result.trajectory.all_tool_calls]
            print(f"EXPECTED      : {s.get('expected_answer')}")
            print(f"AGENT ANSWER  : {result.answer}")
            print(f"EXPECTED TOOLS: {md.get('expected_tools')}")
            print(f"TOOLS USED    : {tools_used}")
            print(
                f"(turns={len(result.trajectory.turns)}, "
                f"tool_calls={len(tools_used)}, "
                f"out_tokens={result.trajectory.total_output_tokens})"
            )
            if traj_dir:
                print(f"SAVED         : {Path(traj_dir) / f'{run_id}.json'}  (scenario_id={label})")
            if args.show_trajectory:
                for t in result.trajectory.turns:
                    for tc in t.tool_calls:
                        print(f"    → {tc.name}({tc.input})")
    finally:
        if needs_restore and not args.no_restore:
            df = load("wo_fmc")
            if df is None:
                # Do not claim success when nothing could be restored.
                print(
                    "\n[restore] FAILED: CouchDB unavailable or wo_fmc not loaded; "
                    "TST- records may still hold imputed codes."
                )
            else:
                # astype(str) keeps the mask strictly boolean even if wo_id
                # contains missing or non-string values.
                is_tst = df["wo_id"].astype(str).str.startswith("TST")
                tst_ids = [str(w) for w in df.loc[is_tst, "wo_id"]]
                status = write_failure_codes({wo_id: None for wo_id in tst_ids})
                if status is None:
                    print(
                        "\n[restore] FAILED: write to CouchDB was not performed; "
                        "TST- records may still hold imputed codes."
                    )
                else:
                    restored = sum(1 for ok in status.values() if ok)
                    print(f"\n[restore] re-blanked {restored} TST- record(s) to keep the dataset pristine.")
                    if restored < len(tst_ids):
                        print(
                            f"[restore] WARNING: {len(tst_ids) - restored} TST- record(s) "
                            "could not be re-blanked."
                        )
        elif needs_restore:
            print("\n[restore] skipped (--no-restore); TST- records keep their imputed codes.")


def main() -> None:
    """Parse the command line, prepare the environment and run the scenarios.

    Loads ``.env`` from the repository root, then, if ``--trajectory-dir`` was
    given a non-empty value, exports it as ``AGENT_TRAJECTORY_DIR`` before
    handing the parsed options to ``_run`` via ``asyncio.run``.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "labels",
        nargs="*",
        help="Scenario labels to run (e.g. S1 S3). Default: all.",
    )
    cli.add_argument(
        "--model-id",
        default="litellm_proxy/aws/claude-opus-4-6",
        help="Model string.",
    )
    cli.add_argument(
        "--max-turns",
        type=int,
        default=30,
        help="Max agentic loop turns.",
    )
    cli.add_argument(
        "--no-restore",
        action="store_true",
        help="Leave write-back imputations in CouchDB.",
    )
    cli.add_argument(
        "--show-trajectory",
        action="store_true",
        help="Print each tool call.",
    )
    cli.add_argument(
        "--trajectory-dir",
        metavar="DIR",
        default=None,
        help="Save a {run_id}.json trajectory per scenario to DIR "
        "(overrides/sets AGENT_TRAJECTORY_DIR).",
    )
    options = cli.parse_args()

    from dotenv import load_dotenv

    load_dotenv(_ROOT / ".env")

    # Applied after .env is loaded, so the command-line flag takes precedence.
    trajectory_override = options.trajectory_dir
    if trajectory_override:
        os.environ["AGENT_TRAJECTORY_DIR"] = trajectory_override

    asyncio.run(_run(options))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions src/couchdb/init_wo.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
("failure_codes.csv", "failure_codes", {}),
("primary_failure_codes.csv", "primary_failure_codes", {}),
("component.csv", "component", {}),
("wo_fmc.csv", "wo_fmc", {}),
]

# Mango indexes to create: list of field-lists
Expand Down
Loading