From 3326cc1fa19a082036998c1aebcea69aae9d5d54 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:11:04 -0500
Subject: [PATCH 01/40] chore: deps and toolchain config for the agentic stack
 (stream-json, adm-zip, audit overrides)

---
 .eslintignore            |  3 +++
 .oxlintrc.json           |  1 +
 packages/db/package.json | 12 +++++++++++-
 pnpm-lock.yaml           | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 .eslintignore

diff --git a/.eslintignore b/.eslintignore
new file mode 100644
index 00000000..513a873e
--- /dev/null
+++ b/.eslintignore
@@ -0,0 +1,3 @@
+# Stale agent worktrees produced by parallel Claude Code sessions — they
+# hold their own branches and are linted as part of their own runs.
+.claude/worktrees/
diff --git a/.oxlintrc.json b/.oxlintrc.json
index ff610e51..5a03a5a0 100644
--- a/.oxlintrc.json
+++ b/.oxlintrc.json
@@ -28,6 +28,7 @@
     "no-undef": "off",
     "no-underscore-dangle": "off",
     "no-useless-undefined": "off",
+    "require-unicode-regexp": "off",
     "no-warning-comments": "off",
     "prefer-destructuring": "off",
     "sort-imports": "off",
diff --git a/packages/db/package.json b/packages/db/package.json
index 8789f48b..c7836df4 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,6 +19,13 @@
     "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+    "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts",
+    "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
+    "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+    "db:backfill-agentic-server-logs": "dotenv -e ../../.env -- tsx src/backfill-agentic-server-logs.ts",
+    "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts",
+    "db:backfill-kv-pool": "dotenv -e ../../.env -- tsx src/backfill-kv-pool.ts",
+    "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
@@ -30,11 +37,14 @@
     "@neondatabase/serverless": "^1.1.0",
     "@noble/ciphers": "^2.2.0",
     "@semianalysisai/inferencex-constants": "workspace:*",
-    "postgres": "^3.4.9"
+    "postgres": "^3.4.9",
+    "stream-chain": "^3.4.0",
+    "stream-json": "^2.1.0"
   },
   "devDependencies": {
     "@types/adm-zip": "^0.5.8",
     "@types/node": "^26.0.1",
+    "@types/stream-json": "^1.7.8",
     "@vitest/coverage-v8": "^4.1.9",
     "adm-zip": "^0.5.18",
     "dotenv-cli": "^11.0.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 084c2485..58cdbba9 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -256,6 +256,12 @@ importers:
       postgres:
         specifier: ^3.4.9
         version: 3.4.9
+      stream-chain:
+        specifier: ^3.4.0
+        version: 3.6.3
+      stream-json:
+        specifier: ^2.1.0
+        version: 2.1.0
     devDependencies:
       '@types/adm-zip':
         specifier: ^0.5.8
@@ -263,6 +269,9 @@ importers:
       '@types/node':
         specifier: ^26.0.1
         version: 26.0.1
+      '@types/stream-json':
+        specifier: ^1.7.8
+        version: 1.7.8
       '@vitest/coverage-v8':
         specifier: ^4.1.9
         version: 4.1.9(vitest@4.1.9)
@@ -2096,6 +2105,12 @@ packages:
   '@types/stats.js@0.17.4':
     resolution: {integrity: sha512-jIBvWWShCvlBqBNIZt0KAshWpvSjhkwkEu4ZUcASoAvhmrgAUI2t1dXrjSL4xXVLB4FznPrIsX3nKXFl/Dt4vA==}
 
+  '@types/stream-chain@2.1.0':
+    resolution: {integrity: sha512-guDyAl6s/CAzXUOWpGK2bHvdiopLIwpGu8v10+lb9hnQOyo4oj/ZUQFOvqFjKGsE3wJP1fpIesCcMvbXuWsqOg==}
+
+  '@types/stream-json@1.7.8':
+    resolution: {integrity: sha512-MU1OB1eFLcYWd1LjwKXrxdoPtXSRzRmAnnxs4Js/ayB5O/NvHraWwuOaqMWIebpYwM6khFlsJOHEhI9xK/ab4Q==}
+
   '@types/three@0.185.0':
     resolution: {integrity: sha512-O2Uy8Cj4Nonr8dWUUbifMdPe8B0Mq7EdOHb89S4+kjUw/KhbjTZrUuYlrQ1bpUKG+EP9QJnN7qNxbHGlGoLHMA==}
 
@@ -4812,9 +4827,15 @@ packages:
     resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==}
     engines: {node: '>= 0.4'}
 
+  stream-chain@3.6.3:
+    resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==}
+
   stream-combiner@0.2.2:
     resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==}
 
+  stream-json@2.1.0:
+    resolution: {integrity: sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==}
+
   string-width@4.2.3:
     resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
     engines: {node: '>=8'}
@@ -6950,6 +6971,15 @@ snapshots:
 
   '@types/stats.js@0.17.4': {}
 
+  '@types/stream-chain@2.1.0':
+    dependencies:
+      '@types/node': 26.0.1
+
+  '@types/stream-json@1.7.8':
+    dependencies:
+      '@types/node': 26.0.1
+      '@types/stream-chain': 2.1.0
+
   '@types/three@0.185.0':
     dependencies:
       '@dimforge/rapier3d-compat': 0.12.0
@@ -10234,11 +10264,17 @@ snapshots:
       es-errors: 1.3.0
       internal-slot: 1.1.0
 
+  stream-chain@3.6.3: {}
+
   stream-combiner@0.2.2:
     dependencies:
       duplexer: 0.1.2
       through: 2.3.8
 
+  stream-json@2.1.0:
+    dependencies:
+      stream-chain: 3.6.3
+
   string-width@4.2.3:
     dependencies:
       emoji-regex: 8.0.0

From 580adfddcbfa415891136c3b571ccc0f8b14efc6 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:11:11 -0500
Subject: [PATCH 02/40] =?UTF-8?q?feat(db):=20agentic=20benchmark=20schema?=
 =?UTF-8?q?=20=E2=80=94=20agentic=5Ftraces=20results,=20trace-replay=20sid?=
 =?UTF-8?q?ecar,=20datasets=20tables?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/db/migrations/007_agentic.sql        | 326 ++++++++++++++++++
 ..._latest_benchmarks_single_run_per_line.sql |  49 +++
 .../migrations/009_dataset_request_stats.sql  |  55 +++
 3 files changed, 430 insertions(+)
 create mode 100644 packages/db/migrations/007_agentic.sql
 create mode 100644 packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
 create mode 100644 packages/db/migrations/009_dataset_request_stats.sql

diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/007_agentic.sql
new file mode 100644
index 00000000..eceea82e
--- /dev/null
+++ b/packages/db/migrations/007_agentic.sql
@@ -0,0 +1,326 @@
+-- 007_agentic.sql
+--
+-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx
+-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts
+-- after master's highest migration (006_benchmark_results_workers), so the
+-- branch's numbering no longer collides with master's 002-006. None of the
+-- collapsed migrations had been applied to any deployed database.
+--
+-- Statement order is preserved exactly. The latest_benchmarks recreate uses
+-- 'select br.*', so it retains every benchmark_results column added earlier
+-- (including master's 'workers' from 006) and re-keys the view on offload_mode.
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 002_agentic_scenario.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+--   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+--   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+  alter column isl drop not null,
+  alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+  drop constraint benchmark_results_isl_positive,
+  drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
+--    can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 003_agentic_availability.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
+-- equivalent except it allows isl/osl to be NULL for agentic rows.
+
+alter table availability
+  drop constraint availability_pkey;
+
+alter table availability
+  alter column isl drop not null,
+  alter column osl drop not null,
+  add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+  add constraint availability_natural_key unique nulls not distinct
+    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 004_offload_mode.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+  add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value already living in metrics
+-- JSONB (set during the earlier agentic ingest backfill).
+update benchmark_results
+   set offload_mode = metrics->>'offload_mode'
+ where benchmark_type = 'agentic_traces'
+   and metrics ? 'offload_mode';
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+  br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 006_agentic_trace_replay.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_<suffix>` artifact:
+--   - profile_export.jsonl         (~2 MB raw, per-request data)
+--   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+  id                                bigserial   primary key,
+  -- gzip(profile_export.jsonl); null when only the server metrics file existed
+  profile_export_jsonl_gz           bytea,
+  profile_export_uncompressed_size  bigint,
+  -- raw csv bytes; null when only the profile file existed
+  server_metrics_csv                bytea,
+  server_metrics_csv_size           bigint,
+  created_at                        timestamptz not null default now()
+);
+
+alter table benchmark_results
+  add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+  on benchmark_results (trace_replay_id)
+  where trace_replay_id is not null;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 007_agentic_trace_server_metrics_json.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+  add column server_metrics_json_gz bytea,
+  add column server_metrics_json_uncompressed_size bigint;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 008_agentic_aggregate_stats.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+  add column aggregate_stats jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 009_agentic_chart_series.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (migration 008): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+  add column chart_series jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 010_agentic_request_timeline.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+  add column request_timeline jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 011_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
+-- the agentic benchmarks replay) + their per-conversation trace structure.
+--
+-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
+-- not the source traces. These two tables back the new /datasets area: a
+-- registry of ingested dataset versions with precomputed summary + chart data,
+-- and one row per conversation holding a flamegraph-ready `structure` (turns +
+-- subagent groups with input split into cached-prefix vs uncached-suffix). The
+-- raw hash_ids are NOT stored — they're only needed at ingest to derive the
+-- cached/uncached split, so the runtime read is a single small JSONB.
+--
+-- Additive only. To revert this migration:
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   (and see the run_datasets revert below; this is all one migration now:
+--    delete from schema_migrations where filename = '007_agentic.sql';)
+
+create table datasets (
+  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
+  id          text primary key,
+  -- URL key, e.g. 'cc-traces-weka-062126'.
+  slug        text not null unique,
+  label       text not null,
+  -- 'full' | '256k' | 'no-subagents' (the published variants).
+  variant     text not null default 'full',
+  description text,
+  hf_url      text,
+  license     text,
+  conversation_count integer not null default 0,
+  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
+  summary     jsonb not null default '{}'::jsonb,
+  -- Precomputed distributions for the dataset-detail cards (input/output length,
+  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
+  chart_data  jsonb not null default '{}'::jsonb,
+  dataset_version integer not null default 1,
+  ingested_at timestamptz not null default now()
+);
+
+create table dataset_conversations (
+  id          bigserial primary key,
+  dataset_id  text not null references datasets(id) on delete cascade,
+  -- The conversation id from the dataset record (trace id).
+  conv_id     text not null,
+  models      text[] not null default '{}',
+  num_turns           integer not null default 0,
+  num_subagent_groups integer not null default 0,
+  total_in    bigint not null default 0,
+  total_out   bigint not null default 0,
+  total_cached bigint not null default 0,
+  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
+  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
+  structure   jsonb not null,
+  unique (dataset_id, conv_id)
+);
+
+create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 012_run_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Maps a benchmark workflow_run to the source dataset it replayed, so the
+-- agentic detail page can deep-link each request in the timeline to the exact
+-- conversation in the /datasets viewer (the request's conversation_id, with any
+-- ::sa:/::fa: suffix stripped, is the dataset conv_id).
+--
+-- One row per workflow_run (every benchmark in a run replays the same dataset).
+-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
+-- URL) rather than an FK, so the mapping can be recorded before/independent of
+-- the dataset being ingested; the UI degrades gracefully if the slug is absent.
+--
+-- Additive only. To revert this whole squashed migration:
+--   drop table if exists run_datasets;
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   drop table if exists agentic_trace_replay cascade;
+--   (plus the benchmark_results/availability column + constraint changes above)
+--   delete from schema_migrations where filename = '007_agentic.sql';
+
+create table run_datasets (
+  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
+  dataset_slug    text not null,
+  created_at      timestamptz not null default now()
+);
diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
new file mode 100644
index 00000000..039dfe09
--- /dev/null
+++ b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
@@ -0,0 +1,49 @@
+-- ============================================================
+-- LATEST_BENCHMARKS — one run per line (no cross-run stitching)
+-- ============================================================
+--
+-- Previously the view did `distinct on (config_id, conc, isl, osl)` ordered by
+-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run
+-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it
+-- skipped fell back to an older run that did measure them, and a single chart line
+-- ended up stitched from points produced by different runs on different dates.
+--
+-- A line is one config + sequence + offload mode
+-- (config_id, benchmark_type, isl, osl, offload_mode) plotted
+-- across concurrencies, and it must come from a SINGLE workflow run. We pick the
+-- newest run per line (newest date, then latest sweep by run_started_at, then
+-- highest workflow_run_id so exactly one run wins even on a same-day / null tie),
+-- then keep EVERY concurrency that one run measured. A partial re-sweep therefore
+-- truncates the line to its own concurrencies rather than borrowing an older run's.
+
+drop materialized view if exists latest_benchmarks;
+
+create materialized view latest_benchmarks as
+with winners as (
+  select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+         br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+         br.workflow_run_id as winning_run_id
+  from benchmark_results br
+  join latest_workflow_runs wr on wr.id = br.workflow_run_id
+  where br.error is null
+  order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+           br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc
+)
+select br.*
+from benchmark_results br
+join winners w
+  on  w.config_id      = br.config_id
+  and w.benchmark_type = br.benchmark_type
+  and w.isl is not distinct from br.isl
+  and w.osl is not distinct from br.osl
+  and w.offload_mode = br.offload_mode
+  and w.winning_run_id = br.workflow_run_id
+where br.error is null;
+
+-- Unique key now includes benchmark_type (part of the line key). One run per line
+-- guarantees one row per concurrency, so this stays unique and keeps
+-- REFRESH MATERIALIZED VIEW CONCURRENTLY working.
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode)
+  nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/migrations/009_dataset_request_stats.sql b/packages/db/migrations/009_dataset_request_stats.sql
new file mode 100644
index 00000000..0b7c11bb
--- /dev/null
+++ b/packages/db/migrations/009_dataset_request_stats.sql
@@ -0,0 +1,55 @@
+-- Backfill dataset-level requests/conversation statistics.
+-- A request is one actual model call: each top-level turn plus each child turn
+-- inside a subagent group. The group container itself is not a request.
+
+with per_conversation as (
+  select
+    dc.dataset_id,
+    dc.num_subagent_groups,
+    (
+      dc.num_turns + coalesce((
+        select sum(jsonb_array_length(node.value->'children'))
+        from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) as node(value)
+        where node.value->>'kind' = 'subagent'
+      ), 0)
+    )::double precision as request_count
+  from dataset_conversations dc
+), request_stats as (
+  select
+    dataset_id,
+    avg(request_count) as mean_requests,
+    percentile_cont(0.5) within group (order by request_count) as median_requests,
+    avg(num_subagent_groups::double precision) as mean_subagents,
+    percentile_cont(0.5) within group (order by num_subagent_groups) as median_subagents
+  from per_conversation
+  group by dataset_id
+)
+update datasets d
+set summary = jsonb_set(
+  jsonb_set(
+    jsonb_set(
+      jsonb_set(
+        jsonb_set(
+          d.summary,
+          '{meanRequestsPerConversation}',
+          to_jsonb(request_stats.mean_requests),
+          true
+        ),
+        '{medianRequestsPerConversation}',
+        to_jsonb(request_stats.median_requests),
+        true
+      ),
+      '{meanSubagentsPerTrace}',
+      to_jsonb(request_stats.mean_subagents),
+      true
+    ),
+    '{medianSubagentsPerTrace}',
+    to_jsonb(request_stats.median_subagents),
+    true
+  ),
+  '{version}',
+  '3'::jsonb,
+  true
+)
+from request_stats
+where d.id = request_stats.dataset_id;

From 3870e2b9494c4cca8e96cd0590de9949c8b5e04c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:11:29 -0500
Subject: [PATCH 03/40] =?UTF-8?q?feat(db):=20agentic=20ETL=20=E2=80=94=20t?=
 =?UTF-8?q?race-replay=20ingest,=20chart-series/timeline/aggregate=20compu?=
 =?UTF-8?q?tation,=20v3=20agg=20schema?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/constants/src/agentic.ts             |   2 +
 packages/constants/src/framework-aliases.ts   |   1 +
 packages/constants/src/index.ts               |   1 +
 packages/constants/src/metric-keys.ts         |  71 ++-
 packages/constants/src/models.ts              |  17 +
 packages/db/src/etl/agentic-v3-flatten.ts     | 131 ++++
 packages/db/src/etl/benchmark-ingest.ts       |  40 +-
 packages/db/src/etl/benchmark-mapper.test.ts  | 291 +++++++++
 packages/db/src/etl/benchmark-mapper.ts       | 260 +++++---
 .../src/etl/compute-aggregate-stats.test.ts   | 152 +++++
 .../db/src/etl/compute-aggregate-stats.ts     | 149 +++++
 .../db/src/etl/compute-chart-series.test.ts   | 341 +++++++++++
 packages/db/src/etl/compute-chart-series.ts   | 576 ++++++++++++++++++
 .../src/etl/compute-request-timeline.test.ts  | 210 +++++++
 .../db/src/etl/compute-request-timeline.ts    | 208 +++++++
 .../db/src/etl/dataset-provenance.test.ts     |  40 ++
 packages/db/src/etl/dataset-provenance.ts     |  32 +
 packages/db/src/etl/distribution-stats.ts     |  98 +++
 packages/db/src/etl/gzip-json-stream.test.ts  |  66 ++
 packages/db/src/etl/gzip-json-stream.ts       |  58 ++
 packages/db/src/etl/normalizers.test.ts       |   5 +
 packages/db/src/etl/normalizers.ts            |  20 +-
 .../db/src/etl/server-log-metrics.test.ts     |  43 ++
 packages/db/src/etl/server-log-metrics.ts     |  65 ++
 .../db/src/etl/server-metrics-adapters.ts     | 100 +++
 packages/db/src/etl/skip-tracker.test.ts      |   1 +
 packages/db/src/etl/skip-tracker.ts           |  13 +-
 .../src/etl/trace-artifact-discovery.test.ts  |  66 ++
 .../db/src/etl/trace-artifact-discovery.ts    |  93 +++
 packages/db/src/etl/trace-replay-ingest.ts    | 151 +++++
 packages/db/src/etl/weka-structure.test.ts    | 259 ++++++++
 packages/db/src/etl/weka-structure.ts         | 327 ++++++++++
 32 files changed, 3799 insertions(+), 88 deletions(-)
 create mode 100644 packages/constants/src/agentic.ts
 create mode 100644 packages/db/src/etl/agentic-v3-flatten.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.test.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.test.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.test.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.ts
 create mode 100644 packages/db/src/etl/dataset-provenance.test.ts
 create mode 100644 packages/db/src/etl/dataset-provenance.ts
 create mode 100644 packages/db/src/etl/distribution-stats.ts
 create mode 100644 packages/db/src/etl/gzip-json-stream.test.ts
 create mode 100644 packages/db/src/etl/gzip-json-stream.ts
 create mode 100644 packages/db/src/etl/server-log-metrics.test.ts
 create mode 100644 packages/db/src/etl/server-log-metrics.ts
 create mode 100644 packages/db/src/etl/server-metrics-adapters.ts
 create mode 100644 packages/db/src/etl/trace-artifact-discovery.test.ts
 create mode 100644 packages/db/src/etl/trace-artifact-discovery.ts
 create mode 100644 packages/db/src/etl/trace-replay-ingest.ts
 create mode 100644 packages/db/src/etl/weka-structure.test.ts
 create mode 100644 packages/db/src/etl/weka-structure.ts

diff --git a/packages/constants/src/agentic.ts b/packages/constants/src/agentic.ts
new file mode 100644
index 00000000..42eab306
--- /dev/null
+++ b/packages/constants/src/agentic.ts
@@ -0,0 +1,2 @@
+/** Fixed output length used by the experimental normalized-E2E chart metric. */
+export const NORMALIZED_E2E_OUTPUT_TOKENS = 400;
diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts
index 6c775be4..74cbce3f 100644
--- a/packages/constants/src/framework-aliases.ts
+++ b/packages/constants/src/framework-aliases.ts
@@ -46,6 +46,7 @@ export const FRAMEWORK_LABELS: Record<string, string> = {
     ]),
   ),
   mtp: 'MTP',
+  aiperf: 'AIPerf',
 };
 
 /**
diff --git a/packages/constants/src/index.ts b/packages/constants/src/index.ts
index e767e500..7d3d6783 100644
--- a/packages/constants/src/index.ts
+++ b/packages/constants/src/index.ts
@@ -1,3 +1,4 @@
+export * from './agentic';
 export * from './framework-aliases';
 export * from './github';
 export * from './gpu-keys';
diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index 7fa88c97..914eed4b 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -1,48 +1,117 @@
 /**
  * Canonical set of metric keys stored in the benchmark_results.metrics JSONB column.
  *
- * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU.
+ * Latency values (ttft/tpot/itl/e2el/intvty) are in seconds. Throughput values are
+ * tokens/sec — `_per_gpu` is per-GPU, `_tps` is total tokens/sec across the deployment.
+ *
+ * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency,
+ * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs
+ * carry median/mean/p99/std for latency only.
  */
 export const METRIC_KEYS = new Set([
   // throughput (tokens/sec/GPU)
   'tput_per_gpu',
   'output_tput_per_gpu',
   'input_tput_per_gpu',
+  // throughput (tokens/sec, deployment total) — agentic aiperf reports both
+  'total_tput_tps',
+  'output_tput_tps',
+  'input_tput_tps',
   // TTFT — time to first token
   'median_ttft',
   'mean_ttft',
+  'p75_ttft',
   'p90_ttft',
+  'p95_ttft',
   'p99_ttft',
   'p99.9_ttft',
   'std_ttft',
   // TPOT — time per output token
   'median_tpot',
   'mean_tpot',
+  'p75_tpot',
   'p90_tpot',
+  'p95_tpot',
   'p99_tpot',
   'p99.9_tpot',
   'std_tpot',
   // ITL — inter-token latency
   'median_itl',
   'mean_itl',
+  'p75_itl',
   'p90_itl',
+  'p95_itl',
   'p99_itl',
   'p99.9_itl',
   'std_itl',
   // E2EL — end-to-end latency
   'median_e2el',
   'mean_e2el',
+  'p75_e2el',
   'p90_e2el',
+  'p95_e2el',
   'p99_e2el',
   'p99.9_e2el',
   'std_e2el',
   // interactivity
   'median_intvty',
   'mean_intvty',
+  'p75_intvty',
   'p90_intvty',
+  'p95_intvty',
   'p99_intvty',
   'p99.9_intvty',
   'std_intvty',
+  // QPS — queries per second (agentic aiperf)
+  'median_qps',
+  'mean_qps',
+  'p75_qps',
+  'p90_qps',
+  'p95_qps',
+  'p99_qps',
+  'p99.9_qps',
+  'std_qps',
+  // per-request input token count distribution
+  'median_input_tokens',
+  'mean_input_tokens',
+  'p75_input_tokens',
+  'p90_input_tokens',
+  'p95_input_tokens',
+  'p99_input_tokens',
+  'p99.9_input_tokens',
+  'std_input_tokens',
+  // per-request output token count distribution — actual served
+  'median_output_tokens_actual',
+  'mean_output_tokens_actual',
+  'p75_output_tokens_actual',
+  'p90_output_tokens_actual',
+  'p95_output_tokens_actual',
+  'p99_output_tokens_actual',
+  'p99.9_output_tokens_actual',
+  'std_output_tokens_actual',
+  // per-request output token count distribution — expected from trace
+  'median_output_tokens_expected',
+  'mean_output_tokens_expected',
+  'p75_output_tokens_expected',
+  'p90_output_tokens_expected',
+  'p95_output_tokens_expected',
+  'p99_output_tokens_expected',
+  'p99.9_output_tokens_expected',
+  'std_output_tokens_expected',
+  // run totals (agentic aiperf)
+  'duration_seconds',
+  'total_requests_completed',
+  'total_prompt_tokens',
+  'total_generation_tokens',
+  // server prefix-cache observability (agentic aiperf)
+  'server_gpu_cache_hit_rate',
+  'server_cpu_cache_hit_rate',
+  'server_external_cache_hit_rate',
+  'theoretical_cache_hit_rate',
+  // server KV-cache occupancy — mean GPU KV-cache usage fraction (0-1) over the
+  // profiling window (agentic aiperf; flat in v2 artifacts, mapped from
+  // server_metrics.kv_cache.gpu_usage_pct in v3)
+  'gpu_kv_cache_usage_pct',
   // measured power / energy (emitted by runner's aggregate_power.py)
   // avg_power_w:             mean per-GPU draw (W) during the load window
   // joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE on
diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts
index 06dfa09b..9622fe8c 100644
--- a/packages/constants/src/models.ts
+++ b/packages/constants/src/models.ts
@@ -56,3 +56,20 @@ export function islOslToSequence(isl: number, osl: number): string | null {
   };
   return map[`${isl}_${osl}`] ?? null;
 }
+
+/**
+ * Map a benchmark/availability row to its sequence (scenario) string.
+ * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl.
+ * - Other rows (today: `single_turn`) fall back to `islOslToSequence`.
+ * Returns `null` for rows that can't be classified (e.g. `single_turn` with
+ * unmapped isl/osl values).
+ */
+export function rowToSequence(row: {
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+}): string | null {
+  if (row.benchmark_type === 'agentic_traces') return 'agentic-traces';
+  if (row.isl === null || row.osl === null) return null;
+  return islOslToSequence(row.isl, row.osl);
+}
diff --git a/packages/db/src/etl/agentic-v3-flatten.ts b/packages/db/src/etl/agentic-v3-flatten.ts
new file mode 100644
index 00000000..a3c223af
--- /dev/null
+++ b/packages/db/src/etl/agentic-v3-flatten.ts
@@ -0,0 +1,131 @@
+/**
+ * v3 agentic agg schema (2026-07-02+): nested containers → canonical flat keys.
+ *
+ * v3 artifacts nest their metrics under `request_metrics` / `server_metrics`
+ * containers; v1/v2 emitted the same information as flat top-level fields.
+ * `flattenAgenticAggRow` maps the nested shape onto the flat schema the DB /
+ * API / frontend consume, so the rest of the mapper stays version-agnostic.
+ */
+
+import { parseNum } from './normalizers';
+
+/**
+ * Distribution stat names accepted from v3 nested stat blocks, with the rename
+ * applied when flattening. `p50` is stored as `median_*` to match the
+ * established METRIC_KEYS naming (fixed-seq runs and the frontend both use
+ * `median_*`; no `p50_*` key exists anywhere downstream).
+ */
+const V3_STAT_KEYS: Record<string, string> = {
+  mean: 'mean',
+  p50: 'median',
+  median: 'median',
+  p75: 'p75',
+  p90: 'p90',
+  p95: 'p95',
+  p99: 'p99',
+  'p99.9': 'p99.9',
+  std: 'std',
+};
+
+/** v3 `request_metrics.latency` sub-blocks → flat metric suffix (same name). */
+const V3_LATENCY_METRICS = ['ttft', 'e2el', 'itl', 'tpot', 'intvty'] as const;
+
+/** v3 `request_metrics.tokens` sub-blocks → flat metric suffix. */
+const V3_TOKEN_METRICS: Record<string, string> = {
+  input: 'input_tokens',
+  output_actual: 'output_tokens_actual',
+  output_expected: 'output_tokens_expected',
+};
+
+/**
+ * Scalar paths in the v3 nested containers → canonical flat metric key. Keys
+ * reuse the flat v2-agentic names wherever one existed so already-ingested runs
+ * and the frontend see one consistent schema; genuinely new information gets a
+ * new key (registered in METRIC_KEYS).
+ */
+const V3_SCALAR_PATHS: [string[], string][] = [
+  // client-side throughput
+  [['request_metrics', 'throughput', 'input', 'tokens_per_second'], 'input_tput_tps'],
+  [['request_metrics', 'throughput', 'output', 'tokens_per_second'], 'output_tput_tps'],
+  [['request_metrics', 'throughput', 'total', 'tokens_per_second'], 'total_tput_tps'],
+  [['request_metrics', 'throughput', 'duration_seconds'], 'duration_seconds'],
+  [['request_metrics', 'throughput', 'per_gpu', 'total_tput_tps'], 'tput_per_gpu'],
+  [['request_metrics', 'throughput', 'per_gpu', 'output_tput_tps'], 'output_tput_per_gpu'],
+  [['request_metrics', 'throughput', 'per_gpu', 'input_tput_tps'], 'input_tput_per_gpu'],
+  [['request_metrics', 'cache', 'theoretical_cache_hit_rate'], 'theoretical_cache_hit_rate'],
+  // server-side prefix-cache observability (same fields v2 emitted flat)
+  [['server_metrics', 'cache', 'gpu_cache_hit_rate'], 'server_gpu_cache_hit_rate'],
+  [['server_metrics', 'cache', 'cpu_cache_hit_rate'], 'server_cpu_cache_hit_rate'],
+  [['server_metrics', 'cache', 'external_cache_hit_rate'], 'server_external_cache_hit_rate'],
+  // KV-cache occupancy (gpu key predates v3 as a flat auto-captured field)
+  [['server_metrics', 'kv_cache', 'gpu_usage_pct'], 'gpu_kv_cache_usage_pct'],
+  // server token totals
+  [['server_metrics', 'tokens', 'prompt_total'], 'total_prompt_tokens'],
+  [['server_metrics', 'tokens', 'generation_total'], 'total_generation_tokens'],
+  [['server_metrics', 'tokens', 'requests_completed'], 'total_requests_completed'],
+  // Deliberately NOT mapped (yet): cache.overall/prefix_cache_hits/queries,
+  // kv_cache.cpu_*, tokens.prompt_by_source, sources[] — new v3 detail we don't
+  // consume anywhere; add here + METRIC_KEYS when a view needs them.
+];
+
+/** Walk a nested object path; returns undefined on any non-object hop. */
+function atPath(obj: Record<string, any>, path: string[]): unknown {
+  let cur: unknown = obj;
+  for (const seg of path) {
+    if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined;
+    cur = (cur as Record<string, unknown>)[seg];
+  }
+  return cur;
+}
+
+/** Flatten one v3 stat block ({mean, p50, …}) into `out` as `{stat}_{suffix}`. */
+function flattenStatBlock(block: unknown, suffix: string, out: Record<string, number>): void {
+  if (!block || typeof block !== 'object' || Array.isArray(block)) return;
+  for (const [stat, canonical] of Object.entries(V3_STAT_KEYS)) {
+    const n = parseNum((block as Record<string, unknown>)[stat]);
+    if (n !== undefined) out[`${canonical}_${suffix}`] = n;
+  }
+}
+
+/**
+ * Flatten a v3 agentic agg row (nested `request_metrics` / `server_metrics`
+ * containers, 2026-07-02+) into the canonical flat metric schema that v1/v2
+ * artifacts emitted directly and that the DB / API / frontend consume.
+ *
+ * Returns the row unchanged when `request_metrics` is absent (v1/v2 rows pass
+ * through untouched). Otherwise returns a copy with the flattened metrics
+ * merged in; the nested containers stay on the row (they're in NON_METRIC_KEYS
+ * so the auto-capture loop ignores them).
+ *
+ * Notes on the v3 source data:
+ * - `p50` percentiles are new (v2 had no median for agentic); stored as
+ *   `median_*` to match the frontend's naming.
+ * - `latency.intvty` arrives already slow-tail inverted (pXX_intvty =
+ *   1/pXX_itl). It's flattened here for completeness, but mapBenchmarkRow's
+ *   derive-from-itl invariant still overwrites it, keeping one definition
+ *   across all harness versions.
+ */
+export function flattenAgenticAggRow(row: Record<string, any>): Record<string, any> {
+  const rm = row.request_metrics;
+  if (!rm || typeof rm !== 'object' || Array.isArray(rm)) return row;
+
+  const flat: Record<string, number> = {};
+
+  // latency distributions
+  for (const metric of V3_LATENCY_METRICS) {
+    flattenStatBlock(atPath(row, ['request_metrics', 'latency', metric]), metric, flat);
+  }
+  // qps distribution (window_seconds / samples are intentionally not stats)
+  flattenStatBlock(atPath(row, ['request_metrics', 'qps']), 'qps', flat);
+  // per-request token-count distributions
+  for (const [src, suffix] of Object.entries(V3_TOKEN_METRICS)) {
+    flattenStatBlock(atPath(row, ['request_metrics', 'tokens', src]), suffix, flat);
+  }
+  // scalars
+  for (const [path, key] of V3_SCALAR_PATHS) {
+    const n = parseNum(atPath(row, path));
+    if (n !== undefined) flat[key] = n;
+  }
+
+  return { ...row, ...flat };
+}
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index a5493629..a405789d 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -4,6 +4,7 @@
 
 import type postgres from 'postgres';
 import type { BenchmarkParams } from './benchmark-mapper';
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -29,12 +30,19 @@ export async function bulkIngestBenchmarkRows(
 
   // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears
   // more than once in a single batch. Deduplicate within the batch, keeping
-  // the last occurrence (last metrics for each unique config/isl/osl/conc).
+  // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode).
   const seen = new Map<string, BenchmarkParams & { configId: number }>();
-  for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r);
+  for (const r of rows) {
+    seen.set(
+      `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? ''}-${r.conc}-${r.offloadMode}`,
+      r,
+    );
+  }
   const deduped = [...seen.values()];
 
   const configIds = deduped.map((r) => r.configId);
+  const benchmarkTypes = deduped.map((r) => r.benchmarkType);
+  const offloadModes = deduped.map((r) => r.offloadMode);
   const isls = deduped.map((r) => r.isl);
   const osls = deduped.map((r) => r.osl);
   const concs = deduped.map((r) => r.conc);
@@ -49,13 +57,14 @@ export async function bulkIngestBenchmarkRows(
 
   const result = await sql<{ inserted: boolean; id: number }[]>`
     insert into benchmark_results (
-      workflow_run_id, config_id, benchmark_type, date,
+      workflow_run_id, config_id, benchmark_type, offload_mode, date,
       isl, osl, conc, image, metrics, workers
     )
     select
       ${workflowRunId},
       unnest(${sql.array(configIds)}::int[]),
-      'single_turn',
+      unnest(${sql.array(benchmarkTypes)}::text[]),
+      unnest(${sql.array(offloadModes)}::text[]),
       ${date}::date,
       unnest(${sql.array(isls)}::int[]),
       unnest(${sql.array(osls)}::int[]),
@@ -63,7 +72,7 @@ export async function bulkIngestBenchmarkRows(
       unnest(${sql.array(images)}),
       unnest(${sql.array(metricsJsons)}::jsonb[]),
       unnest(${sql.array(workersJsons)}::jsonb[])
-    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc)
+    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
     do update set
       metrics = excluded.metrics,
       image = excluded.image,
@@ -98,9 +107,18 @@ export async function insertServerLog(
     insert into server_logs (server_log) values (${serverLog})
     returning id
   `;
+  // Derive the KV-cache pool size (tokens) from the log's authoritative
+  // "GPU KV cache size: N tokens" line(s) and stash it on the result's metrics
+  // JSON, mirroring how trace-replay-ingest derives cache-hit rates. The
+  // scraped vllm:cache_config_info metric can't reconstruct this for MLA models.
+  const kvCachePoolTokens = kvCachePoolTokensFromServerLog(serverLog);
   await sql`
     update benchmark_results
-    set server_log_id = ${logId}
+    set server_log_id = ${logId}${
+      kvCachePoolTokens === null
+        ? sql``
+        : sql`, metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${kvCachePoolTokens}::bigint))`
+    }
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
 }
@@ -155,13 +173,14 @@ export async function bulkUpsertAvailability(
   sql: Sql,
   rows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[],
   date: string,
 ): Promise<void> {
@@ -170,7 +189,7 @@ export async function bulkUpsertAvailability(
   const seen = new Set<string>();
   const unique: typeof rows = [];
   for (const r of rows) {
-    const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`;
+    const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? ''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`;
     if (!seen.has(key)) {
       seen.add(key);
       unique.push(r);
@@ -178,7 +197,7 @@ export async function bulkUpsertAvailability(
   }
 
   await sql`
-    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date)
+    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date)
     select
       unnest(${sql.array(unique.map((r) => r.model))}::text[]),
       unnest(${sql.array(unique.map((r) => r.isl))}::int[]),
@@ -188,6 +207,7 @@ export async function bulkUpsertAvailability(
       unnest(${sql.array(unique.map((r) => r.framework))}::text[]),
       unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]),
       unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]),
+      unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]),
       ${date}::date
     on conflict do nothing
   `;
diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts
index 65fb3e39..cde2f74b 100644
--- a/packages/db/src/etl/benchmark-mapper.test.ts
+++ b/packages/db/src/etl/benchmark-mapper.test.ts
@@ -22,6 +22,20 @@ function makeV1Row(overrides: Record<string, any> = {}): Record<string, any> {
   };
 }
 
+/** Minimal valid agentic row: scenario_type triggers the agentic path; `users` → conc. */
+function makeAgenticRow(overrides: Record<string, any> = {}): Record<string, any> {
+  return {
+    infmax_model_prefix: 'dsv4',
+    hw: 'b200-nv',
+    framework: 'vllm',
+    precision: 'fp4',
+    scenario_type: 'agentic-coding',
+    users: 72,
+    tput_per_gpu: 20000,
+    ...overrides,
+  };
+}
+
 /** Minimal valid v2 benchmark row (disaggregated prefill/decode parallelism). */
 function makeV2Row(overrides: Record<string, any> = {}): Record<string, any> {
   return {
@@ -570,3 +584,280 @@ describe('extractWorkers', () => {
     expect(extractWorkers([null, 'bad', 0, undefined])).toBeUndefined();
   });
 });
+
+describe('mapBenchmarkRow — agentic interactivity normalization', () => {
+  it('derives *_intvty from 1/*_itl, discarding the artifact value', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeAgenticRow({
+        p90_itl: 0.0893,
+        p90_intvty: 23.91, // fast-tail contamination — must be overwritten
+        p75_itl: 0.0692,
+        p75_intvty: 19,
+      }),
+      tracker,
+    );
+    expect(result!.benchmarkType).toBe('agentic_traces');
+    expect(result!.metrics.p90_intvty).toBeCloseTo(1 / 0.0893, 6);
+    expect(result!.metrics.p75_intvty).toBeCloseTo(1 / 0.0692, 6);
+  });
+
+  it('derives *_intvty even when the artifact omits it', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0.1 }), tracker);
+    expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6);
+  });
+
+  it('does not touch *_intvty for single_turn rows', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker);
+    expect(result!.metrics.p90_intvty).toBe(999);
+  });
+});
+
+/**
+ * Minimal v3 agentic row (2026-07-02+): nested request_metrics/server_metrics,
+ * p50 percentiles, pre-inverted intvty, kv_offloading descriptors. Mirrors the
+ * real artifact from GH run 28553943579 (trimmed).
+ */
+function makeV3AgenticRow(overrides: Record<string, any> = {}): Record<string, any> {
+  return {
+    infmax_model_prefix: 'dsv4',
+    hw: 'cluster:b300-nv',
+    framework: 'vllm',
+    precision: 'fp4',
+    spec_decoding: 'none',
+    disagg: false,
+    scenario_type: 'agentic-coding',
+    is_multinode: false,
+    tp: 4,
+    ep: 1,
+    dp_attention: 'false',
+    conc: 16,
+    image: 'vllm/vllm-openai:v0.23.0',
+    kv_offloading: 'none',
+    kv_offload_backend: '',
+    num_requests_total: 1648,
+    num_requests_successful: 1648,
+    dataset: {
+      source_type: 'public_dataset',
+      hf_dataset_name: 'semianalysisai/cc-traces-weka-062126',
+    },
+    request_metrics: {
+      qps: {
+        window_seconds: 1,
+        samples: 7209,
+        mean: 0.22846,
+        p50: 0,
+        p75: 0,
+        p90: 1,
+        p95: 1,
+        std: 0.60707,
+      },
+      latency: {
+        ttft: {
+          mean: 12.90033,
+          p50: 1.49712,
+          p75: 12.09501,
+          p90: 56.22194,
+          p95: 68.03156,
+          std: 22.68353,
+        },
+        e2el: {
+          mean: 81.05644,
+          p50: 26.18817,
+          p75: 84.93601,
+          p90: 199.85996,
+          p95: 360.31579,
+          std: 149.59205,
+        },
+        itl: {
+          mean: 0.07548,
+          p50: 0.03677,
+          p75: 0.10253,
+          p90: 0.16652,
+          p95: 0.22255,
+          std: 0.08327,
+        },
+        tpot: {
+          mean: 0.07548,
+          p50: 0.03677,
+          p75: 0.10253,
+          p90: 0.16652,
+          p95: 0.22255,
+          std: 0.08327,
+        },
+        // already slow-tail inverted upstream (pXX_intvty = 1/pXX_itl)
+        intvty: {
+          mean: 13.2482,
+          p50: 27.19411,
+          p75: 9.75304,
+          p90: 6.00526,
+          p95: 4.49335,
+          std: 24.77636,
+        },
+      },
+      tokens: {
+        input: {
+          mean: 157676.054,
+          p50: 96047,
+          p75: 197684.25,
+          p90: 404935.9,
+          p95: 547502.85,
+          std: 152480.17653,
+        },
+        output_actual: {
+          mean: 849.06735,
+          p50: 290.5,
+          p75: 783.5,
+          p90: 2231.8,
+          p95: 3915.45,
+          std: 1568.90823,
+        },
+        output_expected: {
+          mean: 1432.32728,
+          p50: 571.5,
+          p75: 1820,
+          p90: 3927,
+          p95: 5312.9,
+          std: 2067.19215,
+        },
+      },
+      throughput: {
+        input: { tokens_per_second: 35980.14001 },
+        output: { tokens_per_second: 193.7489 },
+        total: { tokens_per_second: 36173.88892 },
+        duration_seconds: 7222.04352,
+        per_gpu: {
+          total_tput_tps: 9043.47223,
+          output_tput_tps: 48.43723,
+          input_tput_tps: 8995.035,
+        },
+      },
+      cache: { theoretical_cache_hit_rate: 0.97509 },
+    },
+    server_metrics: {
+      present: true,
+      adapter: 'vllm',
+      metric_count: 49,
+      cache: {
+        gpu_cache_hit_rate: 0.78539,
+        cpu_cache_hit_rate: 0,
+        external_cache_hit_rate: 0,
+        overall_cache_hit_rate: 0.78539,
+        prefix_cache_hits: 205576960,
+        prefix_cache_queries: 261750519,
+        frontend_cache_hit_rate: null,
+      },
+      kv_cache: { gpu_usage_pct: 0.82134, cpu_usage_pct: null, cpu_used_tokens: null },
+      tokens: {
+        prompt_total: 261750519,
+        generation_total: 1422696,
+        requests_completed: 1648,
+        prompt_by_source: {
+          gpu_cache_hit: 205576960,
+          cpu_or_external_cache_hit: 0,
+          computed: 56173559,
+        },
+      },
+      sources: [{ id: 'combined|http://localhost:8888/metrics|engine=0', role: 'combined' }],
+    },
+    ...overrides,
+  };
+}
+
+describe('mapBenchmarkRow — v3 agentic nested agg schema', () => {
+  it('maps identity/routing and flattens the nested containers', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV3AgenticRow(), tracker);
+
+    expect(result).not.toBeNull();
+    expect(result!.benchmarkType).toBe('agentic_traces');
+    expect(result!.config.hardware).toBe('b300');
+    expect(result!.conc).toBe(16);
+    expect(result!.isl).toBeNull();
+    expect(result!.osl).toBeNull();
+
+    const m = result!.metrics;
+    // latency distributions, p50 stored under the canonical median_* name
+    expect(m.median_ttft).toBeCloseTo(1.49712, 6);
+    expect(m.p90_ttft).toBeCloseTo(56.22194, 6);
+    expect(m.std_e2el).toBeCloseTo(149.59205, 6);
+    expect(m.p95_itl).toBeCloseTo(0.22255, 6);
+    expect(m.mean_tpot).toBeCloseTo(0.07548, 6);
+    // qps + token distributions
+    expect(m.median_qps).toBe(0);
+    expect(m.p90_input_tokens).toBeCloseTo(404935.9, 3);
+    expect(m.median_output_tokens_actual).toBeCloseTo(290.5, 3);
+    expect(m.p95_output_tokens_expected).toBeCloseTo(5312.9, 3);
+    // throughput scalars under the v2 flat names
+    expect(m.tput_per_gpu).toBeCloseTo(9043.47223, 3);
+    expect(m.output_tput_per_gpu).toBeCloseTo(48.43723, 3);
+    expect(m.input_tput_per_gpu).toBeCloseTo(8995.035, 3);
+    expect(m.total_tput_tps).toBeCloseTo(36173.88892, 3);
+    expect(m.duration_seconds).toBeCloseTo(7222.04352, 3);
+    // cache / kv / totals
+    expect(m.theoretical_cache_hit_rate).toBeCloseTo(0.97509, 6);
+    expect(m.server_gpu_cache_hit_rate).toBeCloseTo(0.78539, 6);
+    expect(m.server_external_cache_hit_rate).toBe(0);
+    expect(m.gpu_kv_cache_usage_pct).toBeCloseTo(0.82134, 6);
+    expect(m.total_prompt_tokens).toBe(261750519);
+    expect(m.total_generation_tokens).toBe(1422696);
+    expect(m.total_requests_completed).toBe(1648);
+    // nested containers must not leak into metrics
+    expect(m).not.toHaveProperty('request_metrics');
+    expect(m).not.toHaveProperty('server_metrics');
+  });
+
+  it('re-derives *_intvty from *_itl (matching the pre-inverted artifact values)', () => {
+    const tracker = createSkipTracker();
+    const m = mapBenchmarkRow(makeV3AgenticRow(), tracker)!.metrics;
+    // The artifact already ships slow-tail intvty; the derive invariant keeps
+    // one definition and must agree with it (up to the artifact's rounding).
+    expect(m.median_intvty).toBeCloseTo(1 / 0.03677, 6);
+    expect(m.p90_intvty).toBeCloseTo(1 / 0.16652, 6);
+    expect(m.median_intvty).toBeCloseTo(27.19411, 2);
+    expect(m.p90_intvty).toBeCloseTo(6.00526, 2);
+    // std is never inverted — passes through from the artifact
+    expect(m.std_intvty).toBeCloseTo(24.77636, 6);
+  });
+
+  it("maps kv_offloading 'none' to offload off and skips the empty backend", () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV3AgenticRow(), tracker);
+    expect(result!.offloadMode).toBe('off');
+    expect(result!.metrics).not.toHaveProperty('kv_offload_backend');
+  });
+
+  it("maps kv_offloading 'dram' + backend to offload on with the backend preserved", () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ kv_offloading: 'dram', kv_offload_backend: 'mooncake', conc: 32 }),
+      tracker,
+    );
+    expect(result!.offloadMode).toBe('on');
+    expect((result!.metrics as Record<string, unknown>).kv_offloading).toBe('dram');
+    expect((result!.metrics as Record<string, unknown>).kv_offload_backend).toBe('mooncake');
+  });
+
+  it('still applies the failed-run guard to v3 rows', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 100 }),
+      tracker,
+    );
+    expect(result).toBeNull();
+    expect(tracker.skips.failedRun).toBe(1);
+  });
+
+  it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeAgenticRow({ p90_itl: 0.1, mean_ttft: 1.5, offload_mode: 'on' }),
+      tracker,
+    );
+    expect(result!.metrics.mean_ttft).toBe(1.5);
+    expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6);
+    expect(result!.offloadMode).toBe('on');
+  });
+});
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index b25baf60..90c23ef0 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -1,11 +1,13 @@
 /**
  * Benchmark row mapper: raw JSON dict → typed `BenchmarkParams`.
- * Handles both v1 (single tp/ep) and v2 (separate prefill/decode fields).
+ * Handles v1 (single tp/ep), v2 (separate prefill/decode fields), and v3
+ * (nested agentic containers, flattened via {@link flattenAgenticAggRow}).
  */
 
 import type { ConfigParams } from './config-cache';
 import type { SkipTracker } from './skip-tracker';
 import { METRIC_KEYS, PRECISION_KEYS } from '@semianalysisai/inferencex-constants';
+import { flattenAgenticAggRow } from './agentic-v3-flatten';
 import {
   resolveModelKey,
   hwToGpuKey,
@@ -17,11 +19,7 @@ import {
   parseInt2,
 } from './normalizers';
 
-/**
- * Raw artifact field names that are renamed when stored as metrics.
- * All other numeric fields not in `NON_METRIC_KEYS` are stored under their raw name.
- */
-const METRIC_RENAMES: Record<string, string> = {};
+export { flattenAgenticAggRow };
 
 /**
  * Raw artifact fields that are config/routing dimensions, not metrics.
@@ -57,12 +55,41 @@ const NON_METRIC_KEYS = new Set([
   'decode_num_workers',
   'num_prefill_gpu',
   'num_decode_gpu',
+  // agentic scenario
+  'scenario_type',
+  'users',
+  'offload_mode',
+  'num_requests_total',
+  'num_requests_successful',
+  // v3 agentic KV-offload descriptors ('none'|'dram'|… + backend name). Mapped
+  // to offloadMode / stringified metrics explicitly in mapBenchmarkRow.
+  'kv_offloading',
+  'kv_offload_backend',
+  // v3 agentic nested containers — flattened by flattenAgenticAggRow before
+  // the auto-capture loop runs; the raw objects themselves are not metrics.
+  'request_metrics',
+  'server_metrics',
+  // Public-dataset provenance emitted by aiperf. The ingest runner uses this
+  // object to populate run_datasets; it is not a benchmark metric.
+  'dataset',
   // per-worker measured-power array (not a numeric scalar). Surfaced as a
   // sibling of the metrics JSONB by mapBenchmarkRow so the metrics column
   // stays Record<string, number> for the index signature on BenchmarkRow.
   'workers',
 ]);
 
+/**
+ * `benchmark_type` values understood by the ingest.
+ * - `single_turn`    — fixed sequence-length runs (isl/osl set).
+ * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc).
+ */
+export type BenchmarkType = 'single_turn' | 'agentic_traces';
+
+/** Reduce an offload descriptor ('none'|'dram'|…) to the binary on/off. */
+function descriptorToOnOff(v: unknown): string | null {
+  return typeof v === 'string' && v.length > 0 ? (v === 'none' ? 'off' : 'on') : null;
+}
+
 /**
  * METRIC_KEYS from constants is the canonical set of known metric keys.
  * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -91,9 +118,13 @@ export interface WorkerPower {
 
 export interface BenchmarkParams {
   config: ConfigParams;
-  isl: number;
-  osl: number;
+  benchmarkType: BenchmarkType;
+  // Null for agentic_traces; present for single_turn.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */
+  offloadMode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -110,9 +141,11 @@ export interface BenchmarkParams {
 /**
  * Map a raw benchmark result dict to typed `BenchmarkParams`.
  *
- * Supports two artifact schemas:
+ * Supports three artifact schemas:
  * - **v1** (pre-2025-12-19): single `tp`/`ep` for both prefill and decode.
  * - **v2** (2025-12-19+): separate `prefill_tp`/`decode_tp` etc. for disaggregated configs.
+ * - **v3** (2026-07-02+, agentic only): nested `request_metrics`/`server_metrics`
+ *   containers, flattened to the v2 flat schema up front by `flattenAgenticAggRow`.
  *
  * When mapping fails (unknown model, unknown hardware, or missing ISL/OSL/conc),
  * the appropriate skip counter on `tracker` is incremented and `null` is returned.
@@ -128,6 +161,11 @@ export function mapBenchmarkRow(
   tracker: SkipTracker,
   islOslFallback?: { isl: number; osl: number } | null,
 ): BenchmarkParams | null {
+  // v3 agentic rows nest their metrics; flatten to the canonical flat schema
+  // first so the rest of the mapper (auto-capture, intvty invariant, guards)
+  // is version-agnostic. No-op for v1/v2 rows.
+  row = flattenAgenticAggRow(row);
+
   const modelKey = resolveModelKey(row);
   if (!modelKey) {
     tracker.skips.unmappedModel++;
@@ -144,14 +182,44 @@ export function mapBenchmarkRow(
     return null;
   }
 
-  const isl = parseInt2(row.isl) ?? islOslFallback?.isl;
-  const osl = parseInt2(row.osl) ?? islOslFallback?.osl;
-  const conc = parseInt2(row.conc);
-  if (!isl || !osl || !conc) {
+  // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants),
+  // no isl/osl, and `users` instead of `conc`. Everything else stays as-is.
+  const isAgentic = String(row.scenario_type ?? '').startsWith('agentic');
+  const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn';
+
+  const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
+  const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
+  // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones.
+  const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc);
+  if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
 
+  // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
+  // every runner, including ones with 0 successful requests and null metrics.
+  // Without this skip, the empty row's nulls overwrite a good row via
+  // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
+  if (
+    typeof row.num_requests_successful === 'number' &&
+    row.num_requests_successful === 0 &&
+    typeof row.num_requests_total === 'number' &&
+    row.num_requests_total > 0
+  ) {
+    tracker.skips.failedRun++;
+    return null;
+  }
+
+  // Agentic offload signal: prefer `offload_mode` ('on'|'off'), then the v3
+  // `kv_offloading` descriptor ('none'|'dram'|…), then legacy `offloading`.
+  // Descriptors reduce to the binary on/off used for row identity ('none' →
+  // 'off', anything else → 'on') so v3 offload points keep colliding-key parity
+  // with their v2 predecessors instead of forking a third offload_mode value.
+  const offloadModeRaw =
+    typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+      ? row.offload_mode
+      : (descriptorToOnOff(row.kv_offloading) ?? descriptorToOnOff(row.offloading) ?? 'off');
+
   const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
   const isMultinode = parseBool(row.is_multinode);
   const precision = normalizePrecision(String(row.precision ?? ''));
@@ -160,55 +228,36 @@ export function mapBenchmarkRow(
   }
   const specMethod = normalizeSpecMethod(row.spec_decoding);
 
-  let prefillTp: number, prefillEp: number, prefillDpAttn: boolean, prefillNumWorkers: number;
-  let decodeTp: number, decodeEp: number, decodeDpAttn: boolean, decodeNumWorkers: number;
-  let numPrefillGpu: number, numDecodeGpu: number;
+  const parallelism = resolveParallelism(row);
+  const metrics = captureNumericMetrics(row);
 
-  if ('prefill_tp' in row) {
-    // v2 schema: full disagg parallelism fields
-    prefillTp = parseInt2(row.prefill_tp) ?? 1;
-    prefillEp = parseInt2(row.prefill_ep) ?? 1;
-    prefillDpAttn = parseBool(row.prefill_dp_attention);
-    prefillNumWorkers = parseInt2(row.prefill_num_workers) ?? 0;
-    decodeTp = parseInt2(row.decode_tp) ?? 1;
-    decodeEp = parseInt2(row.decode_ep) ?? 1;
-    decodeDpAttn = parseBool(row.decode_dp_attention);
-    decodeNumWorkers = parseInt2(row.decode_num_workers) ?? 0;
-    numPrefillGpu = parseInt2(row.num_prefill_gpu) ?? prefillTp * prefillEp;
-    numDecodeGpu = parseInt2(row.num_decode_gpu) ?? decodeTp * decodeEp;
-  } else {
-    // v1 schema: single tp/ep, prefill = decode
-    const tp = parseInt2(row.tp) ?? 1;
-    const ep = parseInt2(row.ep) ?? 1;
-    const dpAttn = parseBool(row.dp_attention);
-    prefillTp = tp;
-    decodeTp = tp;
-    prefillEp = ep;
-    decodeEp = ep;
-    prefillDpAttn = dpAttn;
-    decodeDpAttn = dpAttn;
-    prefillNumWorkers = 0;
-    decodeNumWorkers = 0;
-    numPrefillGpu = tp * ep;
-    numDecodeGpu = tp * ep;
+  // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
+  // — preserve as a stringified metric so the frontend can expose it in tooltips.
+  // v3 rows additionally carry the offload tier + backend ('dram'/'mooncake');
+  // keep them so the UI can say *what kind* of offload, not just on/off.
+  if (isAgentic) {
+    (metrics as Record<string, unknown>).offload_mode = offloadModeRaw;
+    if (typeof row.kv_offloading === 'string' && row.kv_offloading.length > 0) {
+      (metrics as Record<string, unknown>).kv_offloading = row.kv_offloading;
+    }
+    if (typeof row.kv_offload_backend === 'string' && row.kv_offload_backend.length > 0) {
+      (metrics as Record<string, unknown>).kv_offload_backend = row.kv_offload_backend;
+    }
   }
 
-  // Auto-capture all numeric fields not reserved for config/routing dimensions.
-  // Fields in METRIC_RENAMES are stored under their canonical name; all others
-  // use the raw key. Any key outside METRIC_KEYS triggers a one-time
-  // warning so new schema additions don't go silently unnoticed.
-  const metrics: Record<string, number> = {};
-  for (const [rawKey, val] of Object.entries(row)) {
-    if (NON_METRIC_KEYS.has(rawKey)) continue;
-    const n = parseNum(val);
-    if (n === undefined) continue;
-    const storedKey = METRIC_RENAMES[rawKey] ?? rawKey;
-    metrics[storedKey] = n;
-    if (!METRIC_KEYS.has(rawKey) && !_warnedMetricKeys.has(rawKey)) {
-      _warnedMetricKeys.add(rawKey);
-      console.warn(
-        `  [WARN] auto-captured unexpected metric '${rawKey}' — add to METRIC_KEYS in constants/src/metric-keys.ts or NON_METRIC_KEYS in benchmark-mapper.ts`,
-      );
+  // Slow-tail interactivity invariant. Agentic artifacts ship `*_intvty`, but the
+  // definition has drifted across harness versions: some emit `1/p(ITL)`
+  // (slow-tail), others `p(1/ITL)` — which inverts percentile order, so p90 comes
+  // out as ~1/p10(ITL) instead. The inference chart's interactivity selector and
+  // the detail time-series both treat interactivity as the reciprocal of the ITL
+  // percentile, so we derive it from `*_itl` here rather than trust the artifact,
+  // keeping every agentic row on one definition. `std` is excluded — the
+  // reciprocal of a standard deviation is meaningless. Mirrored in the frontend
+  // overlay path (agenticAliases) and the one-time backfill-agentic-intvty script.
+  if (isAgentic) {
+    for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
+      const itl = metrics[`${k}_itl`];
+      if (typeof itl === 'number' && itl > 0) metrics[`${k}_intvty`] = 1 / itl;
     }
   }
 
@@ -231,26 +280,99 @@ export function mapBenchmarkRow(
       specMethod,
       disagg,
       isMultinode,
-      prefillTp,
-      prefillEp,
-      prefillDpAttn,
-      prefillNumWorkers,
-      decodeTp,
-      decodeEp,
-      decodeDpAttn,
-      decodeNumWorkers,
-      numPrefillGpu,
-      numDecodeGpu,
+      ...parallelism,
     },
+    benchmarkType,
     isl,
     osl,
     conc,
+    offloadMode: offloadModeRaw,
     image,
     metrics,
     workers,
   };
 }
 
+/** The parallelism slice of `ConfigParams`, resolved from either artifact schema. */
+type ParallelismParams = Pick<
+  ConfigParams,
+  | 'prefillTp'
+  | 'prefillEp'
+  | 'prefillDpAttn'
+  | 'prefillNumWorkers'
+  | 'decodeTp'
+  | 'decodeEp'
+  | 'decodeDpAttn'
+  | 'decodeNumWorkers'
+  | 'numPrefillGpu'
+  | 'numDecodeGpu'
+>;
+
+/**
+ * Resolve prefill/decode parallelism from a raw row. v2 rows (2025-12-19+)
+ * carry full disagg fields keyed by the presence of `prefill_tp`; v1 rows have
+ * a single `tp`/`ep` that applies to both phases.
+ */
+function resolveParallelism(row: Record<string, any>): ParallelismParams {
+  if ('prefill_tp' in row) {
+    // v2 schema: full disagg parallelism fields
+    const prefillTp = parseInt2(row.prefill_tp) ?? 1;
+    const prefillEp = parseInt2(row.prefill_ep) ?? 1;
+    const decodeTp = parseInt2(row.decode_tp) ?? 1;
+    const decodeEp = parseInt2(row.decode_ep) ?? 1;
+    return {
+      prefillTp,
+      prefillEp,
+      prefillDpAttn: parseBool(row.prefill_dp_attention),
+      prefillNumWorkers: parseInt2(row.prefill_num_workers) ?? 0,
+      decodeTp,
+      decodeEp,
+      decodeDpAttn: parseBool(row.decode_dp_attention),
+      decodeNumWorkers: parseInt2(row.decode_num_workers) ?? 0,
+      numPrefillGpu: parseInt2(row.num_prefill_gpu) ?? prefillTp * prefillEp,
+      numDecodeGpu: parseInt2(row.num_decode_gpu) ?? decodeTp * decodeEp,
+    };
+  }
+  // v1 schema: single tp/ep, prefill = decode
+  const tp = parseInt2(row.tp) ?? 1;
+  const ep = parseInt2(row.ep) ?? 1;
+  const dpAttn = parseBool(row.dp_attention);
+  return {
+    prefillTp: tp,
+    prefillEp: ep,
+    prefillDpAttn: dpAttn,
+    prefillNumWorkers: 0,
+    decodeTp: tp,
+    decodeEp: ep,
+    decodeDpAttn: dpAttn,
+    decodeNumWorkers: 0,
+    numPrefillGpu: tp * ep,
+    numDecodeGpu: tp * ep,
+  };
+}
+
+/**
+ * Auto-capture all numeric fields not reserved for config/routing dimensions,
+ * stored under their raw key. Any key outside METRIC_KEYS triggers a one-time
+ * warning so new schema additions don't go silently unnoticed.
+ */
+function captureNumericMetrics(row: Record<string, any>): Record<string, number> {
+  const metrics: Record<string, number> = {};
+  for (const [rawKey, val] of Object.entries(row)) {
+    if (NON_METRIC_KEYS.has(rawKey)) continue;
+    const n = parseNum(val);
+    if (n === undefined) continue;
+    metrics[rawKey] = n;
+    if (!METRIC_KEYS.has(rawKey) && !_warnedMetricKeys.has(rawKey)) {
+      _warnedMetricKeys.add(rawKey);
+      console.warn(
+        `  [WARN] auto-captured unexpected metric '${rawKey}' — add to METRIC_KEYS in constants/src/metric-keys.ts or NON_METRIC_KEYS in benchmark-mapper.ts`,
+      );
+    }
+  }
+  return metrics;
+}
+
 /**
  * Narrow a raw `workers` value from the artifact JSON to `WorkerPower[]` or
  * undefined. Each entry must have a string `role`, a numeric `worker_idx`,
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
new file mode 100644
index 00000000..7b745c09
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -0,0 +1,152 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import {
+  STATS_VERSION,
+  computeAggregateStats,
+  mergeProfileStatsUpgrade,
+} from './compute-aggregate-stats.js';
+
+/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
+function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
+  const lines = requests.map((r, i) =>
+    JSON.stringify({
+      metadata: {
+        benchmark_phase: 'profiling',
+        conversation_id: `conv-${i}`,
+        turn_index: 0,
+      },
+      metrics: {
+        input_sequence_length: { value: r.isl, unit: 'tokens' },
+        output_sequence_length: { value: r.osl, unit: 'tokens' },
+        request_latency: { value: r.rl ?? 1000, unit: 'ms' },
+        time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' },
+      },
+    }),
+  );
+  return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */
+function makeServerBlob() {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1, avg: 0.2 },
+              { start_ns: 1, end_ns: 2, avg: 0.5 },
+              { start_ns: 2, end_ns: 3, avg: 0.8 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeAggregateStats', () => {
+  it('returns the current STATS_VERSION in the bundle', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+
+  it('leaves every metric null when both blobs are null', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+    expect(stats.normalizedE2e400).toBeNull();
+  });
+
+  it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
+    const profileBlob = makeProfileBlob([
+      { isl: 100, osl: 50, rl: 1000, ttft: 100 },
+      { isl: 200, osl: 75, rl: 2000, ttft: 200 },
+      { isl: 300, osl: 100, rl: 3000, ttft: 300 },
+    ]);
+    const stats = await computeAggregateStats({ profileBlob, serverBlob: null });
+
+    expect(stats.isl?.n).toBe(3);
+    expect(stats.isl?.mean).toBeCloseTo(200, 6);
+    expect(stats.osl?.n).toBe(3);
+    expect(stats.osl?.mean).toBeCloseTo(75, 6);
+
+    // Server-side metrics still null when there's no server blob.
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+
+    // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000.
+    expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6);
+    // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+    //   loads = [150, 275, 400], mean_load = 275
+    //   scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
+    //   mean ≈ 1.9653
+    expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+    expect(stats.normalizedE2e400?.n).toBe(3);
+    expect(stats.normalizedE2e400?.p90).toBeGreaterThan(0);
+  });
+
+  it('computes KV util + prefix hit rate from the server blob alone', async () => {
+    const stats = await computeAggregateStats({
+      profileBlob: null,
+      serverBlob: makeServerBlob(),
+    });
+    expect(stats.kvCacheUtil?.n).toBe(3);
+    expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6);
+    expect(stats.prefixCacheHitRate?.n).toBe(1);
+    expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6);
+
+    // Profile-derived metrics absent.
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+    expect(stats.normalizedE2e400).toBeNull();
+  });
+
+  it('tolerates a malformed profile blob by leaving its metrics null', async () => {
+    // A random non-gzip buffer triggers a gunzip error — code path swallows it.
+    const garbage = Buffer.from('not-gzip-data');
+    const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+    expect(stats.normalizedE2e400).toBeNull();
+    // Version still set so the row is considered "computed".
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+});
+
+describe('mergeProfileStatsUpgrade', () => {
+  it('updates profile metrics while preserving existing server distributions', async () => {
+    const existing = await computeAggregateStats({
+      profileBlob: null,
+      serverBlob: makeServerBlob(),
+    });
+    const profile = await computeAggregateStats({
+      profileBlob: makeProfileBlob([{ isl: 100, osl: 100, rl: 2080, ttft: 100 }]),
+      serverBlob: null,
+    });
+
+    const merged = mergeProfileStatsUpgrade(existing, profile);
+    expect(merged.version).toBe(STATS_VERSION);
+    expect(merged.isl?.mean).toBe(100);
+    expect(merged.normalizedE2e400?.p90).toBeGreaterThan(0);
+    expect(merged.kvCacheUtil).toEqual(existing.kvCacheUtil);
+    expect(merged.prefixCacheHitRate).toEqual(existing.prefixCacheHitRate);
+  });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
new file mode 100644
index 00000000..cea9361c
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -0,0 +1,149 @@
+/**
+ * Pre-compute the per-row aggregate stats for an `agentic_trace_replay`
+ * blob pair. The output lands in the `aggregate_stats` JSONB column so the
+ * detail page can serve the "Aggregates across configs" view and the
+ * derived chart x-axis modes from a single SQL row read, instead of
+ * parsing the raw blobs on demand.
+ *
+ * Shape is intentionally versioned — bump `STATS_VERSION` whenever the
+ * computation changes so the backfill script knows which rows to recompute.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream';
+import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics';
+import {
+  STATS_VERSION,
+  extractIslOsl,
+  extractServerMetricSamples,
+  percentilesOf,
+  type MetricPercentiles,
+} from '../queries/agentic-aggregates';
+
+export { STATS_VERSION };
+
+export interface AggregateStats {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */
+  normalizedSessionTimeS: number | null;
+  /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
+  p90PrefillTpsPerUser: number | null;
+  /** Per-request normalized E2E distribution at a fixed 400-token OSL. */
+  normalizedE2e400: MetricPercentiles | null;
+}
+
+/**
+ * Upgrade an existing stats bundle when only profile-derived fields changed.
+ * This avoids re-reading and decompressing the much larger server-metrics blob
+ * while preserving its already-computed KV/cache distributions.
+ */
+export function mergeProfileStatsUpgrade(
+  existing: Omit<AggregateStats, 'normalizedE2e400'> & {
+    normalizedE2e400?: MetricPercentiles | null;
+  },
+  profile: AggregateStats,
+): AggregateStats {
+  return {
+    ...profile,
+    isl: profile.isl ?? existing.isl,
+    osl: profile.osl ?? existing.osl,
+    normalizedSessionTimeS: profile.normalizedSessionTimeS ?? existing.normalizedSessionTimeS,
+    p90PrefillTpsPerUser: profile.p90PrefillTpsPerUser ?? existing.p90PrefillTpsPerUser,
+    kvCacheUtil: existing.kvCacheUtil,
+    prefixCacheHitRate: existing.prefixCacheHitRate,
+  };
+}
+
+/** Metric subtrees we extract via stream-parse on oversized server blobs. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits',
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect just the metric
+ * subtrees we care about. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows.
+ */
+async function streamExtractServer(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  const collected = await streamCollectKeys<unknown>(buffer, 'metrics', TARGET_METRIC_KEYS);
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+/**
+ * Compute the full versioned stats bundle from a (profile, server-metrics)
+ * blob pair. Either blob may be null (e.g. only the server file existed) —
+ * the corresponding stats just come back null.
+ */
+export async function computeAggregateStats(args: {
+  profileBlob: Buffer | null;
+  serverBlob: Buffer | null;
+}): Promise<AggregateStats> {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let normalized: number | null = null;
+  let prefillP90: number | null = null;
+  let normalizedE2e400: MetricPercentiles | null = null;
+
+  if (args.profileBlob) {
+    try {
+      const jsonl = gunzipSync(args.profileBlob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+      const derived = computeDerivedFromBlob(jsonl);
+      normalized = derived.normalized_session_time_s;
+      prefillP90 = derived.p90_prefill_tps_per_user;
+      normalizedE2e400 = derived.normalized_e2e_400;
+    } catch {
+      // ignore malformed blob — leave nulls
+    }
+  }
+
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+  if (args.serverBlob) {
+    let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+    try {
+      const json = gunzipSync(args.serverBlob).toString('utf8');
+      server = extractServerMetricSamples(json);
+    } catch (error) {
+      // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to
+      // pull just the metric subtrees we need without materializing the
+      // full 500+ MB JSON string.
+      if (isStringTooLongError(error)) {
+        try {
+          server = await streamExtractServer(args.serverBlob);
+        } catch {
+          // stream fallback failed too — leave nulls
+        }
+      }
+    }
+    if (server) {
+      kvPct = percentilesOf(server.kvCacheUtil);
+      prefixPct = percentilesOf(server.prefixCacheHitRate);
+    }
+  }
+
+  return {
+    version: STATS_VERSION,
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+    normalizedSessionTimeS: normalized,
+    p90PrefillTpsPerUser: prefillP90,
+    normalizedE2e400,
+  };
+}
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
new file mode 100644
index 00000000..3f088cd6
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -0,0 +1,341 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js';
+
+/**
+ * Build a minimal server_metrics_json blob covering the metrics the chart
+ * consumes. Each timeslice is one second long starting at t=0.
+ */
+function makeBlob(opts?: {
+  prefixHits?: number;
+  prefixQueries?: number;
+  promptTokensRate?: number;
+}) {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1e9, avg: 0.1 },
+              { start_ns: 1e9, end_ns: 2e9, avg: 0.4 },
+              { start_ns: 2e9, end_ns: 3e9, avg: 0.7 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }],
+      },
+      'vllm:num_requests_running': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }],
+      },
+      'vllm:num_requests_waiting': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }],
+      },
+      'vllm:prompt_tokens': {
+        series: [
+          { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] },
+        ],
+      },
+      'vllm:generation_tokens': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }],
+      },
+      'vllm:prompt_tokens_by_source': {
+        series: [
+          {
+            labels: { source: 'local_cache_hit' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }],
+          },
+          {
+            labels: { source: 'miss' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }],
+          },
+        ],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+/** Build a synthetic per-engine vLLM metric series for the multi-engine test. */
+function buildEngineSeries(engineId: number, baseRunning: number) {
+  const labels = { engine: String(engineId) };
+  return {
+    runningSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: baseRunning },
+        { start_ns: 1e9, avg: baseRunning + 1 },
+      ],
+    },
+    waitingSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0 },
+        { start_ns: 1e9, avg: 0 },
+      ],
+    },
+    kvSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0.25 },
+        { start_ns: 1e9, avg: 0.5 },
+      ],
+    },
+    promptSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 100 },
+        { start_ns: 1e9, rate: 200 },
+      ],
+    },
+    genSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 50 },
+        { start_ns: 1e9, rate: 75 },
+      ],
+    },
+  };
+}
+
+function buildDynamoSeries(
+  endpoint_url: string,
+  dynamo_component: 'prefill' | 'backend',
+  worker_id: string,
+  value: number,
+  field: 'rate' | 'avg' = 'rate',
+) {
+  return {
+    endpoint_url,
+    labels: { dynamo_component, worker_id, dp_rank: '0', engine: '0' },
+    timeslices: [{ start_ns: 0, end_ns: 1e9, [field]: value }],
+  };
+}
+
+describe('computeChartSeries', () => {
+  it('returns null when the blob is null', async () => {
+    expect(await computeChartSeries(null)).toBeNull();
+  });
+
+  it('returns the current CHART_SERIES_VERSION in the bundle', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.version).toBe(CHART_SERIES_VERSION);
+  });
+
+  it('extracts kvCacheUsage points with t=seconds-from-start', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.kvCacheUsage).toEqual([
+      { t: 0, value: 0.1 },
+      { t: 1, value: 0.4 },
+      { t: 2, value: 0.7 },
+    ]);
+  });
+
+  it('merges warmup_metrics before profiling into one continuous series (v11)', async () => {
+    // warmup scrapes at t=0,1s; profiling scrapes at t=10,11s (own start_ns).
+    const blob = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          warmup_metrics: {
+            'vllm:kv_cache_usage_perc': {
+              series: [
+                {
+                  timeslices: [
+                    { start_ns: 0, end_ns: 1e9, avg: 0.2 },
+                    { start_ns: 1e9, end_ns: 2e9, avg: 0.3 },
+                  ],
+                },
+              ],
+            },
+          },
+          metrics: {
+            'vllm:kv_cache_usage_perc': {
+              series: [
+                {
+                  timeslices: [
+                    { start_ns: 10e9, end_ns: 11e9, avg: 0.8 },
+                    { start_ns: 11e9, end_ns: 12e9, avg: 0.9 },
+                  ],
+                },
+              ],
+            },
+          },
+        }),
+      ),
+    );
+    const series = await computeChartSeries(blob);
+    // Origin is the earliest (warmup) start_ns, so warmup sits at low t and
+    // profiling follows on the same axis — the frontend slices at the boundary.
+    expect(series?.kvCacheUsage).toEqual([
+      { t: 0, value: 0.2 },
+      { t: 1, value: 0.3 },
+      { t: 10, value: 0.8 },
+      { t: 11, value: 0.9 },
+    ]);
+  });
+
+  it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
+    expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
+  });
+
+  it('drops prefixCacheHitRate windows where queries.rate is 0', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 }));
+    expect(series?.prefixCacheHitRate).toEqual([]);
+  });
+
+  it('pairs running + waiting into queueDepth points', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]);
+  });
+
+  it('extracts prefillTps + decodeTps from counter rates', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]);
+    expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]);
+  });
+
+  it('splits promptTokensBySource by label and skips empty series', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([
+      'local_cache_hit',
+      'miss',
+    ]);
+    expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]);
+    expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]);
+  });
+
+  it('computes timing metadata from the widest metric window', async () => {
+    const series = await computeChartSeries(makeBlob());
+    // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9.
+    expect(series?.startNs).toBe(0);
+    expect(series?.endNs).toBe(3e9);
+    expect(series?.durationS).toBeCloseTo(3, 6);
+    expect(series?.timeslicesCount).toBe(3);
+  });
+
+  it('returns null on a malformed (non-gzip) blob', async () => {
+    const result = await computeChartSeries(Buffer.from('not-gzip-data'));
+    expect(result).toBeNull();
+  });
+
+  it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => {
+    // Simulate a 4-engine deployment: each engine reports its own series for
+    // every metric. Cluster-wide value should be SUM for running/waiting and
+    // counter rates, AVG for kv_cache_usage_perc (per-engine fraction).
+    const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) },
+        'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) },
+        'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) },
+        'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) },
+        'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) },
+      },
+    });
+    const blob = gzipSync(Buffer.from(json));
+    const cs = await computeChartSeries(blob);
+    expect(cs).not.toBeNull();
+    // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1
+    expect(cs!.queueDepth).toEqual([
+      { t: 0, running: 12, waiting: 0, total: 12 },
+      { t: 1, running: 16, waiting: 0, total: 16 },
+    ]);
+    // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value)
+    expect(cs!.kvCacheUsage).toEqual([
+      { t: 0, value: 0.25 },
+      { t: 1, value: 0.5 },
+    ]);
+    // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800
+    expect(cs!.prefillTps).toEqual([
+      { t: 0, value: 400 },
+      { t: 1, value: 800 },
+    ]);
+    expect(cs!.decodeTps).toEqual([
+      { t: 0, value: 200 },
+      { t: 1, value: 300 },
+    ]);
+  });
+
+  it('uses the Dynamo adapter to preserve workers and canonical prefill/decode roles', async () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:prompt_tokens': {
+          series: [
+            buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 100),
+            buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 200),
+            buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 300),
+          ],
+        },
+        'vllm:generation_tokens': {
+          series: [
+            buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 1),
+            buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 2),
+            buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 400),
+          ],
+        },
+        'vllm:num_requests_running': {
+          series: [
+            buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 3, 'avg'),
+            buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 4, 'avg'),
+            buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 5, 'avg'),
+          ],
+        },
+      },
+    });
+
+    const blob = gzipSync(Buffer.from(json));
+    const result = await computeChartSeries(blob, {
+      framework: 'dynamo-vllm',
+      disagg: true,
+    });
+
+    expect(result?.metricSources).toHaveLength(3);
+    expect(result?.metricSources.map(({ source: s }) => [s.role, s.workerId, s.engine])).toEqual([
+      ['prefill', 'prefill-b', '0'],
+      ['prefill', 'prefill-a', '0'],
+      ['decode', 'decode-a', '0'],
+    ]);
+    const prefillA = result?.metricSources.find(({ source: s }) => s.workerId === 'prefill-a');
+    const decode = result?.metricSources.find(({ source: s }) => s.role === 'decode');
+    expect(prefillA?.promptTps).toEqual([{ t: 0, value: 100 }]);
+    expect(prefillA?.queueDepth).toEqual([{ t: 0, running: 3, waiting: 0, total: 3 }]);
+    expect(decode?.generationTps).toEqual([{ t: 0, value: 400 }]);
+
+    const nonDisagg = await computeChartSeries(blob, {
+      framework: 'dynamo-vllm',
+      disagg: false,
+    });
+    expect(nonDisagg?.metricSources).toEqual([]);
+  });
+
+  it('does not interpret Dynamo-native labels without selecting the Dynamo adapter', async () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:prompt_tokens': {
+          series: [
+            {
+              endpoint_url: '10.30.1.56:7500',
+              labels: { dynamo_component: 'prefill', worker_id: 'prefill-a', engine: '0' },
+              timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 100 }],
+            },
+          ],
+        },
+      },
+    });
+
+    const result = await computeChartSeries(gzipSync(Buffer.from(json)), {
+      framework: 'vllm',
+      disagg: true,
+    });
+
+    expect(result?.metricSources).toEqual([]);
+  });
+});
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
new file mode 100644
index 00000000..d140306f
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -0,0 +1,576 @@
+/**
+ * Pre-compute the time-series for the agentic detail page chart, so the
+ * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every
+ * request. The output lands in `agentic_trace_replay.chart_series` and is
+ * read directly by `getTraceServerMetrics`.
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `CHART_SERIES_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream';
+import {
+  selectServerMetricsAdapter,
+  type MetricSource,
+  type ServerMetricsContext,
+} from './server-metrics-adapters';
+
+/**
+ * Bump when the extraction algorithm changes — backfill recomputes anything
+ * older.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP
+ * deployments — most visible as a request-queue-depth chart that maxed out
+ * at ~3 when the timeline clearly showed 20+ in-flight).
+ *
+ * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
+ * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
+ *
+ * v4: extract sglang:* metrics too (fallback chain in each picker), so
+ * SGLang runs populate the chart_series the same way vllm runs do.
+ *
+ * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
+ * into promptTokensBySource so the cumulative prompt-token-source-breakdown
+ * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
+ *
+ * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source
+ * breakdown from sglang:cached_tokens — current runs always have one
+ * cache_source ("device" / HBM) but hicache (CPU offload) runs would
+ * split into "device" + "host" automatically once ingested.
+ *
+ * v7: extract sglang:hicache_host_{used,total}_tokens into a new
+ * hostKvCacheUsage series so the KV cache utilization chart can plot
+ * the CPU offload pool's usage alongside the on-GPU HBM line.
+ *
+ * v8: keep the per-engine dimension on kv_cache_usage_perc as
+ * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average
+ * line hides load skew on DEP configs; the detail page overlays the
+ * per-rank lines so a hot rank is visible at a glance.
+ *
+ * v9: retain orchestrator-normalized per-source series. Dynamo labels are
+ * mapped to canonical router/prefill/decode roles, allowing the frontend to
+ * inspect individual workers without interpreting Dynamo-native labels.
+ *
+ * v10: only emit per-source series for disaggregated configs with a recognized
+ * orchestrator adapter. Non-disaggregated and unsupported configs retain the
+ * existing aggregate-only behavior.
+ *
+ * v12: also consume the `warmup_metrics` block from the server-metrics blob and
+ * merge its scrapes into the same series as the profiling `metrics` block.
+ * Warmup and profiling timeslices carry their own absolute `start_ns` and never
+ * overlap in time, so the merged series is continuous (warmup at lower t,
+ * profiling after). This lets the agentic detail page slice `chart_series` into
+ * warmup vs profiling at the request-derived boundary; older blobs without a
+ * warmup block are unaffected. (v11 was a short-lived, since-reverted attempt to
+ * carry kvCachePoolTokens in chart_series; that value now lives in
+ * benchmark_results.metrics, derived from the server log — unrelated to this.)
+ */
+export const CHART_SERIES_VERSION = 12;
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+
+export interface ChartSeries {
+  version: number;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+  /**
+   * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across
+   * engines. Detail page derives "cumulative unique input tokens" as
+   * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually
+   * saved vs the raw queries that came in.
+   */
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  /**
+   * Host (CPU offload) KV cache utilization, 0..1. Only populated for
+   * SGLang hicache runs (derived as hicache_host_used / hicache_host_total).
+   * Frontend overlays this on the KV cache util chart as a second line.
+   */
+  hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization (0..1 each). One entry per engine
+   * series found in the raw metric, ordered by the `engine` label when
+   * present and by series-array index otherwise. Empty for single-engine
+   * deployments — the average `kvCacheUsage` line covers that case alone.
+   * The detail page overlays these on the same chart so DEP load skew is
+   * visible without changing the headline number.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * The same metrics grouped by normalized server source. Existing aggregate
+   * fields above remain the default and preserve compatibility with old rows.
+   */
+  metricSources: MetricSourceSeries[];
+}
+
+export interface MetricSourceSeries {
+  source: MetricSource;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  /** Raw prompt-token counter rate for this source. */
+  promptTps: TimeSeriesPoint[];
+  /** Raw generation-token counter rate for this source. */
+  generationTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+// ── Raw blob shapes (subset we read) ────────────────────────────────────
+
+interface RawSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+}
+
+interface RawSeries {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+  timeslices?: RawSlice[];
+}
+
+interface RawMetric {
+  series?: RawSeries[];
+}
+
+type MetricsMap = Record<string, RawMetric>;
+
+/**
+ * The set of metric subtrees the chart consumes. Includes both vllm:* and
+ * sglang:* names so the stream-parse fallback collects whichever framework
+ * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric.
+ */
+const CHART_METRIC_KEYS = new Set([
+  // vLLM
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:num_requests_running',
+  'vllm:num_requests_waiting',
+  'vllm:prompt_tokens',
+  'vllm:generation_tokens',
+  'vllm:prompt_tokens_by_source',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
+  'sglang:generation_tokens',
+  'sglang:num_running_reqs',
+  'sglang:num_queue_reqs',
+  'sglang:realtime_tokens',
+  'sglang:hicache_host_used_tokens',
+  'sglang:hicache_host_total_tokens',
+]);
+
+/**
+ * Merge a warmup phase metric map into the profiling one by concatenating each
+ * metric's `series`. The two phases' timeslices carry their own absolute
+ * `start_ns` and never overlap in time, so `buildSeriesFromMetrics` (which keys
+ * by `start_ns`) yields one continuous series — warmup scrapes at lower t,
+ * profiling after. No-ops when either side is empty (older blobs have no warmup).
+ */
+function mergePhaseMetrics(profiling: MetricsMap, warmup: MetricsMap): MetricsMap {
+  if (Object.keys(warmup).length === 0) return profiling;
+  if (Object.keys(profiling).length === 0) return warmup;
+  const out: MetricsMap = {};
+  for (const name of new Set([...Object.keys(profiling), ...Object.keys(warmup)])) {
+    out[name] = {
+      series: [...(profiling[name]?.series ?? []), ...(warmup[name]?.series ?? [])],
+    };
+  }
+  return out;
+}
+
+/**
+ * Stream-parse fallback: collect the chart's metric subtrees from both phase
+ * blocks and merge (see v11). Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows.
+ */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  const [profiling, warmup] = await Promise.all([
+    streamCollectKeys<RawMetric>(buffer, 'metrics', CHART_METRIC_KEYS),
+    streamCollectKeys<RawMetric>(buffer, 'warmup_metrics', CHART_METRIC_KEYS),
+  ]);
+  return mergePhaseMetrics(profiling, warmup);
+}
+
+/**
+ * Parse the gzipped server_metrics blob into the metric map. Tries the
+ * synchronous fast path first; falls back to stream-parse on
+ * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. Merges the warmup block
+ * into the profiling one (v11) so the series span both phases.
+ */
+async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
+  try {
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as {
+      metrics?: MetricsMap;
+      warmup_metrics?: MetricsMap;
+    };
+    return mergePhaseMetrics(obj.metrics ?? {}, obj.warmup_metrics ?? {});
+  } catch (error) {
+    if (isStringTooLongError(error)) return await streamCollectMetrics(buffer);
+    throw error;
+  }
+}
+
+/**
+ * Build chart-ready time-series arrays from a gzipped server_metrics blob.
+ * The math mirrors `getTraceServerMetrics` — this helper exists so ingest,
+ * backfill, and the API path produce byte-identical results.
+ */
+export async function computeChartSeries(
+  blob: Buffer | null,
+  context: ServerMetricsContext = {},
+): Promise<ChartSeries | null> {
+  if (!blob) return null;
+  let metrics: MetricsMap;
+  try {
+    metrics = await parseMetrics(blob);
+  } catch {
+    // Malformed blob → no series (caller treats null as "no data").
+    return null;
+  }
+  return buildSeriesFromMetrics(metrics, context);
+}
+
+/**
+ * Aggregate one timeslice field across all series of a metric, indexed by
+ * `start_ns`. Multi-engine vllm deployments report one series per engine —
+ * the cluster value is the sum (for running/waiting/throughput counters)
+ * or the average (for kv_cache_usage_perc, a per-engine fraction).
+ */
+function aggregateByStart(
+  series: readonly RawSeries[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of series ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
+/** Stable order: emit one point per unique start_ns, chronologically. */
+function sortedEntries(m: Map<number, number>): [number, number][] {
+  return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
+}
+
+function buildSeriesFromMetrics(
+  metrics: MetricsMap,
+  context: ServerMetricsContext,
+  includeMetricSources = true,
+  originStartNs?: number,
+): ChartSeries {
+  // Timing reference: smallest start_ns and largest end_ns across every
+  // timeslice we extracted. timeslicesCount is the length of any single
+  // series (engines are scraped on the same cadence), so picking the max
+  // length across all series of all metrics is safe.
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - (originStartNs ?? startNs)) / 1e9;
+
+  // Pick the first metric name whose series array has any data; fallback
+  // chain lets the same code path serve both vllm:* and sglang:* blobs.
+  const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => {
+    for (const name of names) {
+      const s = metrics[name]?.series;
+      if (s && s.length > 0) return s;
+    }
+    return undefined;
+  };
+
+  // KV cache usage (gauge, 0..1) — average across engines so the value
+  // stays a fraction (each engine has its own KV pool).
+  const kvSeries = pickSeries(
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
+  const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
+    aggregateByStart(kvSeries, 'avg', 'avg'),
+  ).map(([t, v]) => ({ t: tOf(t), value: v }));
+  // Per-engine breakdown of the same metric. We only emit it when there's
+  // more than one series — single-engine deployments would just duplicate
+  // the cluster-average line.
+  const kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[] = [];
+  if (kvSeries && kvSeries.length > 1) {
+    // Sort by numeric engine label when present so rank 0..N renders in
+    // order; fall back to series-array index otherwise.
+    const decorated = kvSeries.map((s, idx) => {
+      const raw =
+        s.labels?.['engine'] ?? s.labels?.['engine_idx'] ?? s.labels?.['dp_rank'] ?? String(idx);
+      const numeric = Number(raw);
+      return { series: s, idx, label: raw, sortKey: Number.isFinite(numeric) ? numeric : idx };
+    });
+    decorated.sort((a, b) => a.sortKey - b.sortKey);
+    for (const { series, label } of decorated) {
+      const pts: TimeSeriesPoint[] = [];
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.start_ns !== 'number' || typeof ts.avg !== 'number') continue;
+        if (!Number.isFinite(ts.avg)) continue;
+        pts.push({ t: tOf(ts.start_ns), value: ts.avg });
+      }
+      if (pts.length > 0) kvCacheUsageByEngine.push({ engineLabel: label, points: pts });
+    }
+  }
+
+  // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
+  // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
+  const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+  const qsSeries = pickSeries(
+    'vllm:prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
+  const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum');
+  const qsByT = aggregateByStart(qsSeries, 'rate', 'sum');
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  for (const [t, h] of sortedEntries(hitsByT)) {
+    const q = qsByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q });
+  }
+
+  // Queue depth: sum running + waiting across engines per timeslice.
+  const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs');
+  const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs');
+  const runByT = aggregateByStart(runSeries, 'avg', 'sum');
+  const waitByT = aggregateByStart(waitSeries, 'avg', 'sum');
+  const queueDepth: QueueDepthPoint[] = [];
+  // Union of timestamps so we surface activity even if one of the gauges
+  // didn't report a sample on a given tick.
+  const allTimes = new Set<number>([...runByT.keys(), ...waitByT.keys()]);
+  for (const t of [...allTimes].toSorted((a, b) => a - b)) {
+    const running = runByT.get(t) ?? 0;
+    const waiting = waitByT.get(t) ?? 0;
+    queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting });
+  }
+
+  // Throughput: sum the counter `rate` (already per-second) across engines.
+  // Takes a fallback chain so vllm:* and sglang:* both work.
+  const counterRate = (...names: string[]): TimeSeriesPoint[] => {
+    const s = pickSeries(...names);
+    return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({
+      t: tOf(t),
+      value: v,
+    }));
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens');
+  // Tokens served from prefix cache per scrape. Lets the frontend derive
+  // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+
+  // SGLang hicache: host-pool KV cache utilization as used/total per
+  // timeslice. Both metrics are gauges in absolute tokens. Total stays
+  // constant (it's the pool size), used fluctuates.
+  const hostUsedByT = aggregateByStart(
+    metrics['sglang:hicache_host_used_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostTotalByT = aggregateByStart(
+    metrics['sglang:hicache_host_total_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostKvCacheUsage: TimeSeriesPoint[] = [];
+  for (const [t, used] of sortedEntries(hostUsedByT)) {
+    const total = hostTotalByT.get(t);
+    if (total !== undefined && total > 0) {
+      hostKvCacheUsage.push({ t: tOf(t), value: used / total });
+    }
+  }
+
+  // Per-source prompt tokens — sum across engines per source label.
+  //   vllm: vllm:prompt_tokens_by_source has one series per source label
+  //         (local_cache_hit, external_cache_hit, miss, ...). Use the
+  //         `source`/`reason`/`kind` label as the breakdown key.
+  //   sglang: sglang:realtime_tokens uses a `mode` label with values
+  //         {prefill_cache, prefill_compute, decode}. Filter to prefill_*
+  //         since decode isn't prompt-token volume.
+  const promptBySrcByT = new Map<string, Map<number, number>>();
+  // Sum a series' per-scrape rates into the bucket for `label`. The bucket is
+  // created even when the series has no valid timeslices — the SGLang fallback
+  // below is gated on `promptBySrcByT.size === 0`, so an empty vllm breakdown
+  // must still suppress it.
+  const addSeriesRates = (label: string, series: RawSeries): void => {
+    let byT = promptBySrcByT.get(label);
+    if (!byT) {
+      byT = new Map<number, number>();
+      promptBySrcByT.set(label, byT);
+    }
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+      }
+    }
+  };
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    addSeriesRates(source, series);
+  }
+  // SGLang fallback: only consider when the vllm metric wasn't found.
+  //   - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]`
+  //   - Cache hits, split by tier: per-series `sglang:cached_tokens` where each
+  //     series carries a `cache_source` label ("device" = HBM, "host" = CPU
+  //     offload via hicache). Current runs have only `device`; when hicache
+  //     runs land, additional series will appear and the chart will split.
+  if (promptBySrcByT.size === 0) {
+    for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const mode = labels['mode'] ?? 'unknown';
+      // Only carry the cache-miss line over — cache hits come from
+      // sglang:cached_tokens broken out by cache_source below, so we'd
+      // double-count if we kept `prefill_cache` here too.
+      if (mode !== 'prefill_compute') continue;
+      addSeriesRates('compute (miss)', series);
+    }
+    // Cache hits broken out per cache_source. Strip the noisy "total" label
+    // (older sglang versions emit a single un-broken-out series labelled
+    // total — show that as just "cache hit").
+    for (const series of metrics['sglang:cached_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const src = labels['cache_source'] ?? 'cache hit';
+      const label =
+        src === 'device'
+          ? 'cache hit (HBM)'
+          : src === 'host'
+            ? 'cache hit (CPU offload)'
+            : src === 'total'
+              ? 'cache hit'
+              : `cache hit (${src})`;
+      addSeriesRates(label, series);
+    }
+  }
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const [source, byT] of promptBySrcByT) {
+    const arr: TimeSeriesPoint[] = [];
+    for (const [t, v] of sortedEntries(byT)) {
+      if (v > 0) arr.push({ t: tOf(t), value: v });
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  const metricSources: MetricSourceSeries[] = [];
+  const adapter = selectServerMetricsAdapter(context);
+  if (includeMetricSources && context.disagg && adapter.id !== 'generic') {
+    const grouped = new Map<string, { source: MetricSource; metrics: MetricsMap }>();
+    for (const [metricName, metric] of Object.entries(metrics)) {
+      for (const series of metric.series ?? []) {
+        const source = adapter.identifySource(series);
+        let group = grouped.get(source.id);
+        if (!group) {
+          group = { source, metrics: {} };
+          grouped.set(source.id, group);
+        }
+        const groupedMetric = (group.metrics[metricName] ??= { series: [] });
+        groupedMetric.series!.push(series);
+      }
+    }
+    for (const { source, metrics: sourceMetrics } of grouped.values()) {
+      const sourceSeries = buildSeriesFromMetrics(
+        sourceMetrics,
+        context,
+        false,
+        originStartNs ?? startNs,
+      );
+      metricSources.push({
+        source,
+        kvCacheUsage: sourceSeries.kvCacheUsage,
+        prefixCacheHitRate: sourceSeries.prefixCacheHitRate,
+        queueDepth: sourceSeries.queueDepth,
+        promptTokensBySource: sourceSeries.promptTokensBySource,
+        promptTps: sourceSeries.prefillTps,
+        generationTps: sourceSeries.decodeTps,
+        prefixCacheHitsTps: sourceSeries.prefixCacheHitsTps,
+        hostKvCacheUsage: sourceSeries.hostKvCacheUsage,
+        kvCacheUsageByEngine: sourceSeries.kvCacheUsageByEngine,
+      });
+    }
+    const roleOrder: Record<MetricSource['role'], number> = {
+      router: 0,
+      prefill: 1,
+      decode: 2,
+      combined: 3,
+      unknown: 4,
+    };
+    metricSources.sort(
+      (a, b) =>
+        roleOrder[a.source.role] - roleOrder[b.source.role] ||
+        (a.source.endpointUrl ?? '').localeCompare(b.source.endpointUrl ?? '') ||
+        a.source.id.localeCompare(b.source.id),
+    );
+  }
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+    prefixCacheHitsTps,
+    hostKvCacheUsage,
+    kvCacheUsageByEngine,
+    metricSources,
+  };
+}
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
new file mode 100644
index 00000000..1ad9e63b
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -0,0 +1,210 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js';
+
+interface SyntheticRequest {
+  cid: string;
+  ti: number;
+  srcTrace?: string;
+  srcOuter?: number;
+  srcInner?: number;
+  srcKind?: string;
+  wid?: string;
+  ad?: number;
+  phase?: string;
+  credit: number;
+  start: number;
+  end: number;
+  ack?: number | null;
+  ttftMs?: number | null;
+  tpotMs?: number | null;
+  tpotKey?: 'inter_token_latency' | 'time_per_output_token';
+  isl?: number | null;
+  osl?: number | null;
+  cancelled?: boolean;
+}
+
+function makeBlob(requests: SyntheticRequest[]) {
+  const lines = requests.map((r) =>
+    JSON.stringify({
+      metadata: {
+        conversation_id: r.cid,
+        turn_index: r.ti,
+        ...(r.srcTrace === undefined ? {} : { source_trace_id: r.srcTrace }),
+        ...(r.srcOuter === undefined ? {} : { source_outer_idx: r.srcOuter }),
+        ...(r.srcInner === undefined ? {} : { source_inner_idx: r.srcInner }),
+        ...(r.srcKind === undefined ? {} : { source_kind: r.srcKind }),
+        worker_id: r.wid ?? 'worker_default',
+        agent_depth: r.ad ?? 0,
+        benchmark_phase: r.phase ?? 'profiling',
+        credit_issued_ns: r.credit,
+        request_start_ns: r.start,
+        ...(r.ack === undefined ? {} : { request_ack_ns: r.ack }),
+        request_end_ns: r.end,
+        was_cancelled: r.cancelled ?? false,
+      },
+      metrics: {
+        time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' },
+        [r.tpotKey ?? 'inter_token_latency']:
+          r.tpotMs === null ? null : { value: r.tpotMs ?? 10, unit: 'ms' },
+        input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' },
+        output_sequence_length: { value: r.osl ?? 10, unit: 'tokens' },
+      },
+    }),
+  );
+  return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+describe('computeRequestTimeline', () => {
+  it('returns null when the blob is null', () => {
+    expect(computeRequestTimeline(null)).toBeNull();
+  });
+
+  it('returns null on a malformed (non-gzip) blob', () => {
+    expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull();
+  });
+
+  it('returns null when the blob has no parseable records', () => {
+    expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull();
+  });
+
+  it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]),
+    );
+    expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION);
+  });
+
+  it('shifts ns timestamps to be relative to the earliest credit_issued', () => {
+    // Two requests with absolute ns starting at 1_000_000_000.
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 },
+        { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 },
+      ]),
+    );
+    expect(tl?.startNs).toBe(1_000_000_000);
+    expect(tl?.endNs).toBe(1_030_000_000);
+    expect(tl?.durationS).toBeCloseTo(0.03, 6);
+    expect(tl?.requests[0]?.credit).toBe(0);
+    expect(tl?.requests[0]?.end).toBe(10_000_000);
+    expect(tl?.requests[1]?.start).toBe(21_000_000);
+  });
+
+  it('sorts requests by start time, regardless of input order', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 },
+        { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 },
+        { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 },
+      ]),
+    );
+    expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]);
+  });
+
+  it('preserves conversation/worker grouping fields', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'conv-A',
+          ti: 5,
+          wid: 'worker_abcd1234',
+          ad: 2,
+          phase: 'profiling',
+          credit: 0,
+          start: 10,
+          end: 100,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cid).toBe('conv-A');
+    expect(r.ti).toBe(5);
+    expect(r.wid).toBe('worker_abcd1234');
+    expect(r.ad).toBe(2);
+    expect(r.phase).toBe('profiling');
+  });
+
+  it('preserves raw source provenance fields when present', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'trace::fa:003',
+          ti: 3,
+          srcTrace: 'trace',
+          srcOuter: 204,
+          srcInner: 16,
+          srcKind: 'weka_flat',
+          credit: 0,
+          start: 10,
+          end: 100,
+        },
+      ]),
+    );
+    expect(tl?.requests[0]).toMatchObject({
+      cid: 'trace::fa:003',
+      ti: 3,
+      srcTrace: 'trace',
+      srcOuter: 204,
+      srcInner: 16,
+      srcKind: 'weka_flat',
+    });
+  });
+
+  it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          ttftMs: 25.5,
+          tpotMs: 12.5,
+          isl: 1024,
+          osl: 256,
+          cancelled: true,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cancelled).toBe(true);
+    expect(r.ttftMs).toBeCloseTo(25.5, 6);
+    expect(r.tpotMs).toBeCloseTo(12.5, 6);
+    expect(r.isl).toBe(1024);
+    expect(r.osl).toBe(256);
+  });
+
+  it('accepts time_per_output_token as a TPOT alias', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          tpotMs: 8.25,
+          tpotKey: 'time_per_output_token',
+        },
+      ]),
+    );
+    expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6);
+  });
+
+  it('skips records missing both credit_issued_ns and request_start_ns', () => {
+    // Build a record with only request_end_ns — the helper rejects it.
+    const broken = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 },
+          metrics: {},
+        }),
+      ),
+    );
+    expect(computeRequestTimeline(broken)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
new file mode 100644
index 00000000..2cbe5174
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -0,0 +1,208 @@
+/**
+ * Pre-compute the per-request timeline for the agentic detail page's
+ * Gantt view. Output lands in `agentic_trace_replay.request_timeline`
+ * and is read directly by the timeline API route.
+ *
+ * Shape is a thin array — ~150 bytes per request × ~200 requests per
+ * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw
+ * gzipped JSONL blob (~1-3 MB).
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const REQUEST_TIMELINE_VERSION = 5;
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session). */
+  cid: string;
+  /** Zero-based turn index within the conversation. */
+  ti: number;
+  /** Source trace id from the original raw dataset, when distinct from replay cid. */
+  srcTrace?: string;
+  /** Original raw top-level request index within srcTrace. */
+  srcOuter?: number;
+  /** Original nested request index within srcOuter, for subagent children. */
+  srcInner?: number;
+  /** Loader-specific source kind, e.g. weka_main or weka_flat. */
+  srcKind?: string;
+  /** Worker id (concurrency slot that handled this request). */
+  wid: string;
+  /** Sub-agent depth (0 = top-level). */
+  ad: number;
+  /** `warmup` or `profiling`. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  /** Time-to-first-token in ms. */
+  ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
+  /** Input sequence length (tokens). */
+  isl: number | null;
+  /** Output sequence length (tokens). */
+  osl: number | null;
+  cancelled: boolean;
+}
+
+export interface RequestTimeline {
+  version: number;
+  /** Wall-clock ns of the earliest event (used as the relative-time origin). */
+  startNs: number;
+  /** Wall-clock ns of the latest `request_end_ns`. */
+  endNs: number;
+  /** Total span in seconds. */
+  durationS: number;
+  requests: RequestRecord[];
+}
+
+interface RawMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  source_trace_id?: string;
+  source_outer_idx?: number;
+  source_inner_idx?: number;
+  source_kind?: string;
+  worker_id?: string;
+  agent_depth?: number;
+  benchmark_phase?: string;
+  credit_issued_ns?: number;
+  request_start_ns?: number;
+  request_ack_ns?: number;
+  request_end_ns?: number;
+  was_cancelled?: boolean;
+}
+
+interface RawMetricValue {
+  value?: number;
+}
+
+interface RawRecord {
+  metadata?: RawMetadata;
+  metrics?: {
+    time_to_first_token?: RawMetricValue | number;
+    time_per_output_token?: RawMetricValue | number;
+    inter_token_latency?: RawMetricValue | number;
+    input_sequence_length?: RawMetricValue | number;
+    output_sequence_length?: RawMetricValue | number;
+  };
+}
+
+/** Pull a numeric metric out of the `{value, unit}` envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return Number.isFinite(v) ? v : undefined;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+/**
+ * Parse the gzipped `profile_export.jsonl` blob into a chart-ready
+ * timeline. Returns null on a missing or malformed blob.
+ */
+export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null {
+  if (!blob) return null;
+  let text: string;
+  try {
+    text = gunzipSync(blob).toString('utf8');
+  } catch {
+    return null;
+  }
+
+  // First pass: parse + collect raw turns; find timeline origin.
+  const raw: {
+    meta: RawMetadata;
+    ttftMs: number | null;
+    tpotMs: number | null;
+    isl: number | null;
+    osl: number | null;
+  }[] = [];
+  let originNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+
+  for (const line of text.split('\n')) {
+    if (!line) continue;
+    let rec: RawRecord;
+    try {
+      rec = JSON.parse(line) as RawRecord;
+    } catch {
+      continue;
+    }
+    const meta = rec.metadata ?? {};
+    // Use credit_issued_ns when available (the true start of the request's
+    // lifecycle), falling back to request_start_ns. Skip rows missing both.
+    const cStart = meta.credit_issued_ns ?? meta.request_start_ns;
+    const cEnd = meta.request_end_ns;
+    if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue;
+
+    if (cStart < originNs) originNs = cStart;
+    if (cEnd > endNs) endNs = cEnd;
+
+    raw.push({
+      meta,
+      ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null,
+      tpotMs:
+        readNum(rec.metrics?.time_per_output_token) ??
+        readNum(rec.metrics?.inter_token_latency) ??
+        null,
+      isl: readNum(rec.metrics?.input_sequence_length) ?? null,
+      osl: readNum(rec.metrics?.output_sequence_length) ?? null,
+    });
+  }
+
+  if (raw.length === 0) return null;
+  if (!Number.isFinite(originNs)) originNs = 0;
+
+  // Second pass: shift timestamps to be relative to originNs (smaller
+  // numbers fit in JSON nicely and the frontend doesn't need bigint math).
+  const requests: RequestRecord[] = [];
+  for (const r of raw) {
+    const m = r.meta;
+    const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs;
+    const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs;
+    const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null;
+    const end = (m.request_end_ns ?? originNs) - originNs;
+    requests.push({
+      cid: m.conversation_id ?? 'unknown',
+      ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined,
+      srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined,
+      srcInner: typeof m.source_inner_idx === 'number' ? m.source_inner_idx : undefined,
+      srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined,
+      wid: m.worker_id ?? 'unknown',
+      ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
+      phase: m.benchmark_phase ?? 'unknown',
+      credit,
+      start,
+      ack,
+      end,
+      ttftMs: r.ttftMs,
+      tpotMs: r.tpotMs,
+      isl: r.isl,
+      osl: r.osl,
+      cancelled: m.was_cancelled === true,
+    });
+  }
+
+  // Stable order so backfill output is deterministic.
+  requests.sort((a, b) => a.start - b.start);
+
+  return {
+    version: REQUEST_TIMELINE_VERSION,
+    startNs: originNs,
+    endNs,
+    durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0,
+    requests,
+  };
+}
diff --git a/packages/db/src/etl/dataset-provenance.test.ts b/packages/db/src/etl/dataset-provenance.test.ts
new file mode 100644
index 00000000..4022546e
--- /dev/null
+++ b/packages/db/src/etl/dataset-provenance.test.ts
@@ -0,0 +1,40 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetSlugFromBenchmarkRow } from './dataset-provenance';
+
+describe('datasetSlugFromBenchmarkRow', () => {
+  it('maps aiperf public-dataset provenance to the dashboard dataset slug', () => {
+    expect(
+      datasetSlugFromBenchmarkRow({
+        dataset: {
+          source_type: 'public_dataset',
+          loader: 'semianalysis_cc_traces_weka_with_subagents',
+          hf_dataset_name: 'semianalysisai/cc-traces-weka-062126',
+          hf_split: 'train',
+          num_dataset_entries: 393,
+        },
+      }),
+    ).toBe('cc-traces-weka-062126');
+  });
+
+  it('supports an unnamespaced Hugging Face dataset id', () => {
+    expect(
+      datasetSlugFromBenchmarkRow({
+        dataset: {
+          source_type: 'public_dataset',
+          hf_dataset_name: 'cc-traces-weka-062126',
+        },
+      }),
+    ).toBe('cc-traces-weka-062126');
+  });
+
+  it.each([
+    {},
+    { dataset: null },
+    { dataset: { source_type: 'synthetic', hf_dataset_name: 'owner/data' } },
+    { dataset: { source_type: 'public_dataset', hf_dataset_name: '' } },
+    { dataset: { source_type: 'public_dataset' } },
+  ])('ignores rows without usable public-dataset provenance: %j', (row) => {
+    expect(datasetSlugFromBenchmarkRow(row)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/dataset-provenance.ts b/packages/db/src/etl/dataset-provenance.ts
new file mode 100644
index 00000000..f0d7cd0d
--- /dev/null
+++ b/packages/db/src/etl/dataset-provenance.ts
@@ -0,0 +1,32 @@
+const TRAILING_SLASHES = /\/+$/u;
+
+/** Dataset provenance emitted by aiperf and preserved in agentic benchmark rows. */
+export interface DatasetProvenance {
+  source_type?: unknown;
+  loader?: unknown;
+  hf_dataset_name?: unknown;
+  hf_split?: unknown;
+  hf_subset?: unknown;
+  num_dataset_entries?: unknown;
+}
+
+/**
+ * Resolve the dashboard dataset slug from a benchmark row's provenance.
+ *
+ * Dataset ingest uses the final path component of the Hugging Face dataset id
+ * as `datasets.slug`, so `semianalysisai/cc-traces-weka-062126` maps to
+ * `cc-traces-weka-062126` here as well.
+ */
+export function datasetSlugFromBenchmarkRow(row: Record<string, unknown>): string | null {
+  const dataset = row.dataset;
+  if (!dataset || typeof dataset !== 'object' || Array.isArray(dataset)) return null;
+
+  const provenance = dataset as DatasetProvenance;
+  if (provenance.source_type !== 'public_dataset') return null;
+  if (typeof provenance.hf_dataset_name !== 'string') return null;
+
+  const datasetId = provenance.hf_dataset_name.trim().replace(TRAILING_SLASHES, '');
+  if (!datasetId) return null;
+  const slug = datasetId.slice(datasetId.lastIndexOf('/') + 1);
+  return slug || null;
+}
diff --git a/packages/db/src/etl/distribution-stats.ts b/packages/db/src/etl/distribution-stats.ts
new file mode 100644
index 00000000..da3603ab
--- /dev/null
+++ b/packages/db/src/etl/distribution-stats.ts
@@ -0,0 +1,98 @@
+/**
+ * Generic distribution math shared by the dataset ETL: percentile summaries
+ * and histogram binning for the dataset-detail cards. Pure functions, no DB
+ * access. (The per-benchmark-row percentile bundle uses `percentilesOf` in
+ * `queries/agentic-aggregates` — a different shape with its own version key.)
+ */
+
+export interface HistogramBin {
+  x0: number;
+  x1: number;
+  count: number;
+}
+
+export interface NumberSummary {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+}
+
+/** Distribution summary with linear-interpolated percentiles. */
+export function summarizeValues(values: readonly number[]): NumberSummary {
+  if (values.length === 0) {
+    return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 };
+  }
+  const sorted = [...values].toSorted((a, b) => a - b);
+  const quantile = (q: number): number => {
+    const pos = (sorted.length - 1) * q;
+    const lo = Math.floor(pos);
+    const hi = Math.ceil(pos);
+    if (lo === hi) return sorted[lo]!;
+    return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+  };
+  return {
+    count: sorted.length,
+    min: sorted[0]!,
+    max: sorted.at(-1)!,
+    mean: sorted.reduce((sum, value) => sum + value, 0) / sorted.length,
+    median: quantile(0.5),
+    p75: quantile(0.75),
+    p90: quantile(0.9),
+    p95: quantile(0.95),
+  };
+}
+
+/** Linear-width histogram over [0, max]. Empty input → []. */
+export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  if (values.length === 0) return [];
+  const max = Math.max(...values);
+  if (max <= 0) return [{ x0: 0, x1: 1, count: values.length }];
+  const width = max / bins;
+  const out: HistogramBin[] = Array.from({ length: bins }, (_, i) => ({
+    x0: i * width,
+    x1: (i + 1) * width,
+    count: 0,
+  }));
+  for (const v of values) {
+    const idx = Math.min(bins - 1, Math.max(0, Math.floor(v / width)));
+    out[idx].count += 1;
+  }
+  return out;
+}
+
+/** Log-width histogram over positive values (values ≤ 0 are dropped). */
+export function logHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  const pos = values.filter((v) => v > 0);
+  if (pos.length === 0) return [];
+  const min = Math.min(...pos);
+  const max = Math.max(...pos);
+  const lo = Math.log10(min);
+  const hi = Math.log10(max);
+  if (hi <= lo) return [{ x0: min, x1: max <= min ? min * 10 : max, count: pos.length }];
+  const width = (hi - lo) / bins;
+  const out: HistogramBin[] = Array.from({ length: bins }, (_, i) => ({
+    x0: 10 ** (lo + i * width),
+    x1: 10 ** (lo + (i + 1) * width),
+    count: 0,
+  }));
+  for (const v of pos) {
+    const idx = Math.min(bins - 1, Math.max(0, Math.floor((Math.log10(v) - lo) / width)));
+    out[idx].count += 1;
+  }
+  return out;
+}
+
+/** Log-width histogram that preserves zero as a dedicated first bin. */
+export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] {
+  const zeroCount = values.filter((value) => value === 0).length;
+  const positive = values.filter((value) => value > 0);
+  if (zeroCount === 0) return logHistogram(positive, bins);
+  if (positive.length === 0) return [{ x0: 0, x1: 1, count: zeroCount }];
+  const positiveBins = logHistogram(positive, Math.max(1, bins - 1));
+  return [{ x0: 0, x1: positiveBins[0]?.x0 ?? 1, count: zeroCount }, ...positiveBins];
+}
diff --git a/packages/db/src/etl/gzip-json-stream.test.ts b/packages/db/src/etl/gzip-json-stream.test.ts
new file mode 100644
index 00000000..9051ee82
--- /dev/null
+++ b/packages/db/src/etl/gzip-json-stream.test.ts
@@ -0,0 +1,66 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream.js';
+
+describe('isStringTooLongError', () => {
+  it('matches the ERR_STRING_TOO_LONG code', () => {
+    const err = new Error('Cannot create a string longer than ...') as NodeJS.ErrnoException;
+    err.code = 'ERR_STRING_TOO_LONG';
+    expect(isStringTooLongError(err)).toBe(true);
+  });
+
+  it('matches the message-only variant', () => {
+    expect(isStringTooLongError(new Error('Cannot create a string longer than 0x1fffffe8'))).toBe(
+      true,
+    );
+  });
+
+  it('rejects unrelated errors and non-errors', () => {
+    expect(isStringTooLongError(new Error('unexpected token'))).toBe(false);
+    expect(isStringTooLongError(null)).toBe(false);
+    expect(isStringTooLongError('ERR_STRING_TOO_LONG-ish string')).toBe(false);
+  });
+});
+
+describe('streamCollectKeys', () => {
+  const blob = gzipSync(
+    JSON.stringify({
+      metrics: {
+        'vllm:prompt_tokens': { series: [{ timeslices: [{ start_ns: 1, rate: 2 }] }] },
+        'vllm:ignored_metric': { series: [] },
+      },
+      warmup_metrics: {
+        'vllm:prompt_tokens': { series: [] },
+      },
+    }),
+  );
+
+  it('collects only wanted keys under the filtered top-level block', async () => {
+    const out = await streamCollectKeys<{ series: unknown[] }>(
+      blob,
+      'metrics',
+      new Set(['vllm:prompt_tokens']),
+    );
+    expect(Object.keys(out)).toEqual(['vllm:prompt_tokens']);
+    expect(out['vllm:prompt_tokens']).toEqual({
+      series: [{ timeslices: [{ start_ns: 1, rate: 2 }] }],
+    });
+  });
+
+  it('reads a different top-level phase block via filter', async () => {
+    const out = await streamCollectKeys<{ series: unknown[] }>(
+      blob,
+      'warmup_metrics',
+      new Set(['vllm:prompt_tokens']),
+    );
+    expect(out).toEqual({ 'vllm:prompt_tokens': { series: [] } });
+  });
+
+  it('rejects on a non-gzip buffer', async () => {
+    await expect(
+      streamCollectKeys(Buffer.from('not gzip'), 'metrics', new Set(['x'])),
+    ).rejects.toThrow();
+  });
+});
diff --git a/packages/db/src/etl/gzip-json-stream.ts b/packages/db/src/etl/gzip-json-stream.ts
new file mode 100644
index 00000000..cb299a8d
--- /dev/null
+++ b/packages/db/src/etl/gzip-json-stream.ts
@@ -0,0 +1,58 @@
+/**
+ * Shared stream-parse helpers for gzipped server-metrics blobs.
+ *
+ * `gunzipSync(buffer).toString('utf8')` trips Node's 512 MB max-string-length
+ * cap on high-conc TP+EP rows, so the compute-* ETL helpers fall back to a
+ * stream-json pipeline that collects only the top-level subtrees they need.
+ * Both the fast-path error detection and the pipeline itself live here so
+ * chart-series and aggregate-stats stay byte-identical in how they parse.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/**
+ * True when `error` is Node's max-string-length failure (`ERR_STRING_TOO_LONG`
+ * or the older message-only variant) — the signal to switch from
+ * `gunzipSync().toString()` to the streaming parser.
+ */
+export function isStringTooLongError(error: unknown): boolean {
+  const code = error && (error as NodeJS.ErrnoException).code;
+  const msg = error instanceof Error ? error.message : String(error);
+  return code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8');
+}
+
+/**
+ * Gunzip + stream-parse `buffer`, descending into the top-level `filter` key
+ * (e.g. `metrics` / `warmup_metrics`) and collecting only the child entries
+ * whose key is in `wanted`. Never materializes the full JSON string.
+ */
+export async function streamCollectKeys<T>(
+  buffer: Buffer,
+  filter: string,
+  wanted: ReadonlySet<string>,
+): Promise<Record<string, T>> {
+  const collected: Record<string, T> = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    pipeline.on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: T };
+      if (wanted.has(key)) collected[key] = value;
+    });
+    pipeline.on('end', resolve);
+    pipeline.on('error', reject);
+  });
+  return collected;
+}
diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts
index e569143a..82aaf67c 100644
--- a/packages/db/src/etl/normalizers.test.ts
+++ b/packages/db/src/etl/normalizers.test.ts
@@ -25,6 +25,11 @@ describe('hwToGpuKey', () => {
     expect(hwToGpuKey('mi300x-amd')).toBe('mi300x');
   });
 
+  it('strips a v3 scope prefix (cluster:…)', () => {
+    expect(hwToGpuKey('cluster:b300-nv')).toBe('b300');
+    expect(hwToGpuKey('cluster:h200')).toBe('h200');
+  });
+
   it('strips -amds suffix', () => {
     expect(hwToGpuKey('mi355x-amds')).toBe('mi355x');
   });
diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index 1d6a95c1..07793dee 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -22,7 +22,11 @@ export { GPU_KEYS };
  *   stripped base is not in `GPU_KEYS`.
  */
 export function hwToGpuKey(hw: string): string | null {
-  const base = hw.toLowerCase().split('-')[0];
+  // v3 agentic artifacts scope the hw id (`cluster:b300-nv`) — drop everything
+  // up to the last `:` first. Then take the first segment before `-` as the
+  // canonical key; that subsumes all the prior explicit suffix strips
+  // (-nv, -amds, -dgxc-slurm, -p1, -cw, …).
+  const base = hw.toLowerCase().split(':').pop()!.split('-')[0];
   return GPU_KEYS.has(base) ? base : null;
 }
 
@@ -138,7 +142,7 @@ export function resolveModelKey(row: Record<string, any>): string | null {
  */
 export function normalizeFramework(
   fw: string,
-  disaggField: any,
+  disaggField: unknown,
 ): { framework: string; disagg: boolean } {
   const lower = fw.toLowerCase();
   const alias = FRAMEWORK_ALIASES[lower];
@@ -171,7 +175,7 @@ export function normalizePrecision(raw: string): string {
  * @param spec - Raw `spec_decoding` value from the artifact.
  * @returns Lowercase method name, or `'none'` if absent/empty.
  */
-export function normalizeSpecMethod(spec: any): string {
+export function normalizeSpecMethod(spec: unknown): string {
   if (!spec || spec === '') return 'none';
   return String(spec).toLowerCase();
 }
@@ -183,7 +187,7 @@ export function normalizeSpecMethod(spec: any): string {
  * @param v - Value to coerce (any type).
  * @returns `true` if the value is one of the recognized truthy forms, `false` otherwise.
  */
-export function parseBool(v: any): boolean {
+export function parseBool(v: unknown): boolean {
   return v === true || v === 'true' || v === 'True';
 }
 
@@ -194,7 +198,7 @@ export function parseBool(v: any): boolean {
  * @param v - Value to parse (number, string, null, or undefined).
  * @returns The parsed number, or `undefined` if the input is null/undefined/NaN.
  */
-export function parseNum(v: any): number | undefined {
+export function parseNum(v: unknown): number | undefined {
   if (v === null || v === undefined) return undefined;
   const n = typeof v === 'string' ? parseFloat(v) : Number(v);
   return isNaN(n) ? undefined : n;
@@ -207,12 +211,14 @@ export function parseNum(v: any): number | undefined {
  * @param v - Value to parse (number, string, null, or undefined).
  * @returns The parsed integer, or `undefined` if the input is null/undefined/NaN.
  */
-export function parseInt2(v: any): number | undefined {
+export function parseInt2(v: unknown): number | undefined {
   if (v === null || v === undefined) return undefined;
   const n = typeof v === 'string' ? parseInt(v, 10) : Math.round(Number(v));
   return isNaN(n) ? undefined : n;
 }
 
+const ISL_OSL_PATTERN = /[_-](?<isl>\d+)k(?<osl>\d+)k[_\-.]/iu;
+
 /**
  * Extract ISL (input sequence length) and OSL (output sequence length) in tokens
  * from a file/directory name that encodes them as `{n}k{m}k`.
@@ -225,7 +231,7 @@ export function parseInt2(v: any): number | undefined {
  * @returns An object with `isl` and `osl` in tokens, or `null` if no match is found.
  */
 export function parseIslOsl(name: string): { isl: number; osl: number } | null {
-  const m = name.match(/[_-](?<isl>\d+)k(?<osl>\d+)k[_\-.]/iu);
+  const m = name.match(ISL_OSL_PATTERN);
   if (!m) return null;
   return { isl: parseInt(m[1], 10) * 1024, osl: parseInt(m[2], 10) * 1024 };
 }
diff --git a/packages/db/src/etl/server-log-metrics.test.ts b/packages/db/src/etl/server-log-metrics.test.ts
new file mode 100644
index 00000000..9e0fa852
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from 'vitest';
+
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
+
+describe('kvCachePoolTokensFromServerLog', () => {
+  it('returns null for empty / missing logs', () => {
+    expect(kvCachePoolTokensFromServerLog(null)).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('')).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('no kv cache line here')).toBeNull();
+  });
+
+  it('reads a single-engine (ep1) pool size', () => {
+    const log = `
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1744] GPU KV cache size: 11,294,463 tokens
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1745] Maximum concurrency for 1,048,576 tokens per request: 10.77x
+`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(11_294_463);
+  });
+
+  it('sums across data-parallel engine cores (ep8)', () => {
+    const lines = Array.from(
+      { length: 8 },
+      (_, i) =>
+        `(EngineCore_DP${i} pid=${2337827 + i}) INFO [kv_cache_utils.py:1744] GPU KV cache size: 11,577,333 tokens`,
+    ).join('\n');
+    expect(kvCachePoolTokensFromServerLog(lines)).toBe(11_577_333 * 8);
+  });
+
+  it('dedups reprinted lines for the same engine core', () => {
+    const log = `
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP1 pid=2) GPU KV cache size: 5,000,000 tokens
+`;
+    // DP0 counted once + DP1 once = 10M, not 15M.
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(10_000_000);
+  });
+
+  it('falls back to bare lines when no engine-core prefix is present', () => {
+    const log = `INFO GPU KV cache size: 1,234,567 tokens`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(1_234_567);
+  });
+});
diff --git a/packages/db/src/etl/server-log-metrics.ts b/packages/db/src/etl/server-log-metrics.ts
new file mode 100644
index 00000000..b8b26dd1
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.ts
@@ -0,0 +1,65 @@
+/**
+ * Derive server-side scalars from the captured vLLM server log
+ * (`server_logs.server_log`). These come from startup log lines rather than the
+ * scraped Prometheus `/metrics`, because for MLA / sparse-attention models the
+ * `vllm:cache_config_info` labels (num_gpu_blocks × block_size) do NOT
+ * reconstruct the real KV-cache token capacity — they undercount by a
+ * non-constant factor. vLLM's own `GPU KV cache size: N tokens` line is the
+ * authoritative number.
+ */
+
+/**
+ * Total KV-cache pool size in tokens.
+ *
+ * vLLM prints one `GPU KV cache size: N tokens` line per engine core (one per
+ * data-parallel rank; tensor-parallel is already aggregated into that single
+ * per-engine number). We sum across distinct engine cores so the result is the
+ * deployment-wide total:
+ *
+ *   (EngineCore pid=…)      GPU KV cache size: 11,294,463 tokens   → ep1 total
+ *   (EngineCore_DP0 pid=…)  GPU KV cache size: 11,577,333 tokens   ┐
+ *   (EngineCore_DP1 pid=…)  GPU KV cache size: 11,577,333 tokens   ┘ → ×8 = total
+ *
+ * Returns null when the log has no such line (non-vLLM frameworks, or a log
+ * that didn't capture engine startup).
+ */
+export function kvCachePoolTokensFromServerLog(serverLog: string | null): number | null {
+  if (!serverLog) return null;
+
+  // Scan line-by-line. We deliberately avoid a global regex over the whole blob
+  // with a lazy `[^\n]*?` bridge between the engine tag and the size: some logs
+  // contain multi-megabyte single lines (progress bars, tracebacks) that make
+  // such a regex recurse and blow the stack. A per-line substring pre-filter
+  // means the (cheap) regexes only ever run on the short KV-size lines.
+  //
+  // Each engine core prints one line; the tag (e.g. `EngineCore_DP3`) is stable
+  // across a run while the pid is not, so key on the tag to dedup reprints and
+  // sum across data-parallel ranks.
+  const tagRe = /\((?<tag>EngineCore(?:_DP\d+)?)\s+pid=\d+\)/u;
+  const sizeRe = /GPU KV cache size:\s*(?<tokens>[\d,]+)\s*tokens/u;
+  const perEngine = new Map<string, number>();
+  let bareTotal = 0;
+  let bareFound = false;
+  for (const line of serverLog.split('\n')) {
+    if (!line.includes('GPU KV cache size')) continue;
+    const sizeMatch = sizeRe.exec(line);
+    if (!sizeMatch) continue;
+    const tokens = Number(sizeMatch.groups!.tokens!.replaceAll(',', ''));
+    if (!Number.isFinite(tokens) || tokens <= 0) continue;
+    const tagMatch = tagRe.exec(line);
+    if (tagMatch) {
+      perEngine.set(tagMatch.groups!.tag!, tokens);
+    } else {
+      // Fallback for logs without the engine-core prefix: count each occurrence
+      // (one per engine when there are no reprints). Best-effort only.
+      bareTotal += tokens;
+      bareFound = true;
+    }
+  }
+  if (perEngine.size > 0) {
+    let total = 0;
+    for (const v of perEngine.values()) total += v;
+    return total;
+  }
+  return bareFound ? bareTotal : null;
+}
diff --git a/packages/db/src/etl/server-metrics-adapters.ts b/packages/db/src/etl/server-metrics-adapters.ts
new file mode 100644
index 00000000..f123d9f8
--- /dev/null
+++ b/packages/db/src/etl/server-metrics-adapters.ts
@@ -0,0 +1,100 @@
+/**
+ * Normalize orchestrator-specific server-metric labels into a stable source
+ * identity consumed by the API and frontend. AIPerf owns the export envelope;
+ * the serving orchestrator owns the meaning of labels inside each series.
+ */
+
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+
+export interface RawMetricSourceSeries {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+}
+
+export interface ServerMetricsContext {
+  /** Canonical framework stored in configs, for example `dynamo-vllm`. */
+  framework?: string | null;
+  /** Per-worker role series are only meaningful for disaggregated configs. */
+  disagg?: boolean;
+}
+
+export interface MetricSource {
+  /** Stable key used to join this source across different metric names. */
+  id: string;
+  adapter: string;
+  role: MetricSourceRole;
+  endpointUrl: string | null;
+  nativeRole: string | null;
+  workerId: string | null;
+  dpRank: string | null;
+  engine: string | null;
+}
+
+interface ServerMetricsAdapter {
+  id: string;
+  matches: (context: ServerMetricsContext) => boolean;
+  identifySource: (series: RawMetricSourceSeries) => MetricSource;
+}
+
+function stableId(adapter: string, parts: (string | null | undefined)[]): string {
+  return [adapter, ...parts.map((part) => part ?? '')].join('|');
+}
+
+const dynamoAdapter: ServerMetricsAdapter = {
+  id: 'dynamo',
+  matches: ({ framework }) => framework?.startsWith('dynamo-') ?? false,
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const nativeRole = labels['dynamo_component'] ?? null;
+    const role: MetricSourceRole =
+      nativeRole === 'prefill'
+        ? 'prefill'
+        : nativeRole === 'backend'
+          ? 'decode'
+          : nativeRole === 'frontend' || nativeRole === 'router'
+            ? 'router'
+            : 'unknown';
+    const endpointUrl = series.endpoint_url ?? labels['dynamo_endpoint'] ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+    return {
+      id: stableId('dynamo', [role, endpointUrl, workerId, dpRank, engine]),
+      adapter: 'dynamo',
+      role,
+      endpointUrl,
+      nativeRole,
+      workerId,
+      dpRank,
+      engine,
+    };
+  },
+};
+
+const genericAdapter: ServerMetricsAdapter = {
+  id: 'generic',
+  matches: () => true,
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const endpointUrl = series.endpoint_url ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+    return {
+      id: stableId('generic', [endpointUrl, workerId, dpRank, engine]),
+      adapter: 'generic',
+      role: endpointUrl || workerId || dpRank || engine ? 'unknown' : 'combined',
+      endpointUrl,
+      nativeRole: null,
+      workerId,
+      dpRank,
+      engine,
+    };
+  },
+};
+
+const ADAPTERS: readonly ServerMetricsAdapter[] = [dynamoAdapter, genericAdapter];
+
+export function selectServerMetricsAdapter(context: ServerMetricsContext): ServerMetricsAdapter {
+  return ADAPTERS.find((adapter) => adapter.matches(context)) ?? genericAdapter;
+}
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
     expect(tracker.skips.unmappedHw).toBe(0);
     expect(tracker.skips.noIslOsl).toBe(0);
     expect(tracker.skips.dbError).toBe(0);
+    expect(tracker.skips.traceReplayMissing).toBe(0);
   });
 
   it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 134b5299..5d485bf2 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,7 +8,10 @@ export interface Skips {
   unmappedModel: number;
   unmappedHw: number;
   noIslOsl: number;
+  failedRun: number;
   dbError: number;
+  /** Agentic point whose sibling `agentic_<suffix>` artifact had no trace_replay files. */
+  traceReplayMissing: number;
 }
 
 export interface SkipSnapshot {
@@ -66,7 +69,15 @@ const MAX_DB_ERRORS = 10;
  * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
  */
 export function createSkipTracker(): SkipTracker {
-  const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 };
+  const skips: Skips = {
+    badZip: 0,
+    unmappedModel: 0,
+    unmappedHw: 0,
+    noIslOsl: 0,
+    failedRun: 0,
+    dbError: 0,
+    traceReplayMissing: 0,
+  };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
   const unmappedPrecisions = new Set<string>();
diff --git a/packages/db/src/etl/trace-artifact-discovery.test.ts b/packages/db/src/etl/trace-artifact-discovery.test.ts
new file mode 100644
index 00000000..2bb1d51b
--- /dev/null
+++ b/packages/db/src/etl/trace-artifact-discovery.test.ts
@@ -0,0 +1,66 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { afterEach, describe, expect, it } from 'vitest';
+
+import { discoverTraceReplayArtifacts } from './trace-artifact-discovery';
+
+const tempDirs: string[] = [];
+
+function tempDir(): string {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'trace-artifacts-test-'));
+  tempDirs.push(dir);
+  return dir;
+}
+
+function writeTraceFiles(dir: string): void {
+  fs.mkdirSync(path.join(dir, 'aiperf_artifacts'), { recursive: true });
+  fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'profile_export.jsonl'), '{}\n');
+  fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.csv'), 'x,y\n');
+  fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.json'), '{}');
+}
+
+afterEach(() => {
+  for (const dir of tempDirs.splice(0)) fs.rmSync(dir, { recursive: true, force: true });
+});
+
+describe('discoverTraceReplayArtifacts', () => {
+  it('discovers the existing single-node sibling layout', () => {
+    const root = tempDir();
+    writeTraceFiles(path.join(root, 'agentic_config-a'));
+
+    const found = discoverTraceReplayArtifacts(root);
+
+    expect(found.get('config-a')).toMatchObject({
+      profileJsonl: expect.stringContaining('profile_export.jsonl'),
+      serverMetricsCsv: expect.stringContaining('server_metrics_export.csv'),
+      serverMetricsJson: expect.stringContaining('server_metrics_export.json'),
+    });
+  });
+
+  it('extracts and indexes multinode traces by concurrency', () => {
+    const root = tempDir();
+    const artifactDir = path.join(root, 'multinode_server_logs_config-b');
+    const archiveSource = path.join(root, 'archive-source');
+    writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_96'));
+    writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_128'));
+    fs.mkdirSync(artifactDir, { recursive: true });
+    execFileSync('tar', [
+      '-czf',
+      path.join(artifactDir, 'multinode_server_logs.tar.gz'),
+      '-C',
+      archiveSource,
+      '.',
+    ]);
+    fs.rmSync(archiveSource, { recursive: true, force: true });
+
+    const found = discoverTraceReplayArtifacts(root);
+
+    expect([...found.keys()].toSorted()).toEqual(['config-b|128', 'config-b|96']);
+    expect(found.get('config-b|96')?.profileJsonl).toContain(
+      'multinode_server_logs/agentic/conc_96/aiperf_artifacts/profile_export.jsonl',
+    );
+  });
+});
diff --git a/packages/db/src/etl/trace-artifact-discovery.ts b/packages/db/src/etl/trace-artifact-discovery.ts
new file mode 100644
index 00000000..71ee74df
--- /dev/null
+++ b/packages/db/src/etl/trace-artifact-discovery.ts
@@ -0,0 +1,93 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+
+export interface TraceReplayArtifactPaths {
+  profileJsonl: string | null;
+  serverMetricsCsv: string | null;
+  serverMetricsJson: string | null;
+}
+
+const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+
+const AGENTIC_PREFIX = /^agentic_/u;
+const MULTINODE_PREFIX = /^multinode_server_logs_/u;
+const CONC_DIR_PATTERN = /^conc_(?<conc>\d+)$/u;
+
+function traceFilesIn(dir: string): TraceReplayArtifactPaths | null {
+  let profileJsonl: string | null = null;
+  let serverMetricsCsv: string | null = null;
+  let serverMetricsJson: string | null = null;
+
+  for (const subdir of TRACE_SUBDIRS) {
+    const traceDir = path.join(dir, subdir);
+    if (!fs.existsSync(traceDir) || !fs.statSync(traceDir).isDirectory()) continue;
+
+    const profilePath = path.join(traceDir, 'profile_export.jsonl');
+    const csvPath = path.join(traceDir, 'server_metrics_export.csv');
+    const jsonPath = path.join(traceDir, 'server_metrics_export.json');
+    if (!profileJsonl && fs.existsSync(profilePath)) profileJsonl = profilePath;
+    if (!serverMetricsCsv && fs.existsSync(csvPath)) serverMetricsCsv = csvPath;
+    if (!serverMetricsJson && fs.existsSync(jsonPath)) serverMetricsJson = jsonPath;
+  }
+
+  if (!profileJsonl && !serverMetricsCsv && !serverMetricsJson) return null;
+  return { profileJsonl, serverMetricsCsv, serverMetricsJson };
+}
+
+function extractMultinodeArchive(artifactDir: string): string | null {
+  const archivePath = path.join(artifactDir, 'multinode_server_logs.tar.gz');
+  const extractedDir = path.join(artifactDir, 'multinode_server_logs');
+
+  if (!fs.existsSync(extractedDir) && fs.existsSync(archivePath)) {
+    fs.mkdirSync(extractedDir, { recursive: true });
+    execFileSync('tar', ['-xzf', archivePath, '-C', extractedDir], { stdio: 'ignore' });
+  }
+
+  return fs.existsSync(extractedDir) ? extractedDir : null;
+}
+
+/**
+ * Discover trace-replay siblings in both artifact layouts:
+ *
+ * - Single-node: `agentic_<suffix>/aiperf_artifacts/*`
+ * - Multinode: `multinode_server_logs_<suffix>/multinode_server_logs.tar.gz`,
+ *   containing `agentic/conc_<N>/aiperf_artifacts/*`
+ *
+ * Multinode keys include concurrency (`<suffix>|<N>`) because one artifact
+ * contains several points, each with a distinct trace payload.
+ */
+export function discoverTraceReplayArtifacts(
+  artifactsDir: string,
+): Map<string, TraceReplayArtifactPaths> {
+  const discovered = new Map<string, TraceReplayArtifactPaths>();
+  if (!fs.existsSync(artifactsDir)) return discovered;
+
+  for (const entry of fs.readdirSync(artifactsDir)) {
+    const artifactDir = path.join(artifactsDir, entry);
+    if (!fs.statSync(artifactDir).isDirectory()) continue;
+
+    if (entry.startsWith('agentic_')) {
+      const trace = traceFilesIn(artifactDir);
+      if (trace) discovered.set(entry.replace(AGENTIC_PREFIX, ''), trace);
+      continue;
+    }
+
+    if (!entry.startsWith('multinode_server_logs_')) continue;
+    const extractedDir = extractMultinodeArchive(artifactDir);
+    if (!extractedDir) continue;
+
+    const agenticDir = path.join(extractedDir, 'agentic');
+    if (!fs.existsSync(agenticDir) || !fs.statSync(agenticDir).isDirectory()) continue;
+
+    const suffix = entry.replace(MULTINODE_PREFIX, '');
+    for (const concEntry of fs.readdirSync(agenticDir)) {
+      const match = concEntry.match(CONC_DIR_PATTERN);
+      if (!match?.groups?.conc) continue;
+      const trace = traceFilesIn(path.join(agenticDir, concEntry));
+      if (trace) discovered.set(`${suffix}|${match.groups.conc}`, trace);
+    }
+  }
+
+  return discovered;
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..b50168db
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,151 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl` +
+ * `server_metrics_export.csv`) into `agentic_trace_replay` and link the new row
+ * to each provided benchmark_results row via `trace_replay_id`.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */
+
+import { gzipSync } from 'node:zlib';
+
+import type postgres from 'postgres';
+
+import { computeAggregateStats } from './compute-aggregate-stats.js';
+import { computeChartSeries } from './compute-chart-series.js';
+import { computeRequestTimeline } from './compute-request-timeline.js';
+import type { ServerMetricsContext } from './server-metrics-adapters';
+
+type Sql = ReturnType<typeof postgres>;
+
+/**
+ * Persist the per-point trace files and link them to `benchmarkResultIds`.
+ *
+ * @param sql                 Active `postgres` connection.
+ * @param benchmarkResultIds  DB ids of the benchmark_results rows produced by
+ *                            the same `bmk_agentic_<suffix>` artifact whose
+ *                            sibling `agentic_<suffix>` directory holds these
+ *                            trace files.
+ * @param profileExportJsonl  Raw bytes of `profile_export.jsonl`, or null.
+ *                            Gzipped before storage.
+ * @param serverMetricsCsv    Raw bytes of `server_metrics_export.csv`, or null.
+ *                            Stored as-is.
+ * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` —
+ *                            per-scrape time-series of every Prometheus metric.
+ *                            Optional, gzipped before storage (~42x ratio).
+ * @param metricsContext      Canonical framework used to select the
+ *                            orchestrator-specific metric-label adapter.
+ */
+export async function insertTraceReplay(
+  sql: Sql,
+  benchmarkResultIds: number[],
+  profileExportJsonl: Buffer | null,
+  serverMetricsCsv: Buffer | null,
+  serverMetricsJson: Buffer | null = null,
+  metricsContext: ServerMetricsContext = {},
+): Promise<void> {
+  if (benchmarkResultIds.length === 0) return;
+  if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
+
+  // Only link rows that don't already point at a trace_replay row — keeps
+  // re-ingest from inserting duplicate sibling blobs.
+  const unlinked = await sql<{ id: number }[]>`
+    select id from benchmark_results
+    where id = any(${sql.array(benchmarkResultIds)}::bigint[])
+      and trace_replay_id is null
+  `;
+  if (unlinked.length === 0) return;
+
+  const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
+  const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
+  const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
+  const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
+  const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+
+  // Pre-compute aggregate stats + chart-ready time-series + per-request
+  // timeline so the detail page doesn't have to re-parse these blobs on
+  // every request. Each helper tolerates a null blob and falls back to
+  // a streaming parser for oversized server_metrics blobs.
+  const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
+    computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
+    computeChartSeries(metricsJsonGz, metricsContext),
+    Promise.resolve(computeRequestTimeline(profileGz)),
+  ]);
+
+  const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
+    insert into agentic_trace_replay (
+      profile_export_jsonl_gz,
+      profile_export_uncompressed_size,
+      server_metrics_csv,
+      server_metrics_csv_size,
+      server_metrics_json_gz,
+      server_metrics_json_uncompressed_size,
+      aggregate_stats,
+      chart_series,
+      request_timeline
+    )
+    values (
+      ${profileGz},
+      ${profileSize},
+      ${serverMetricsCsv},
+      ${csvSize},
+      ${metricsJsonGz},
+      ${metricsJsonSize},
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])},
+      ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters<typeof sql.json>[0])}
+    )
+    returning id
+  `;
+
+  await sql`
+    update benchmark_results
+    set trace_replay_id = ${traceReplayId}
+    where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+  `;
+
+  // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang
+  // runs don't populate these in the harness JSON; vLLM runs do but only
+  // for GPU. We always recompute to keep the derivation consistent with
+  // what the detail-page charts plot — overwriting any pre-existing value.
+  //
+  // Source label naming differs by framework / cache topology:
+  //   SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)'
+  //   SGLang older:   'cache hit'      (no tier breakdown)
+  //   vLLM LMCache:   'local_cache_hit' + 'external_kv_transfer'  (+ 'local_compute' for miss)
+  //   vLLM single:    falls back to prefixCacheHitsTps total (= local cache only)
+  if (chartSeries && chartSeries.prefillTps.length > 0) {
+    const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
+    if (sumPrompts > 0) {
+      const sumOf = (name: string): number =>
+        (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
+      // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer.
+      const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer');
+      // GPU/HBM hits from source breakdown, summed across known aliases.
+      const hbmFromBreakdown =
+        sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit');
+      // If the source breakdown has any GPU entry, use it. Otherwise fall back
+      // to total prefixCacheHitsTps sum (single-source vLLM path with no
+      // by_source metric — equals the lone cache counter's lifetime).
+      const gpuHits =
+        hbmFromBreakdown > 0
+          ? hbmFromBreakdown
+          : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+      const gpuRate = gpuHits / sumPrompts;
+      const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null;
+      await sql`
+        update benchmark_results
+        set metrics = jsonb_set(
+          case when ${cpuRate}::numeric is not null
+            then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric))
+            else metrics
+          end,
+          '{server_gpu_cache_hit_rate}',
+          to_jsonb(${gpuRate}::numeric)
+        )
+        where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+      `;
+    }
+  }
+}
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
new file mode 100644
index 00000000..444236ab
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -0,0 +1,259 @@
+import { describe, it, expect } from 'vitest';
+import {
+  countSeenPrefixBlocks,
+  buildConversationStructure,
+  countConversationRequests,
+  linearHistogram,
+  logHistogram,
+  logHistogramWithZero,
+  subagentRequestTurns,
+  summarizeValues,
+  type RawWekaConversation,
+  type SubagentNode,
+  type TurnNode,
+} from './weka-structure';
+
+describe('countSeenPrefixBlocks', () => {
+  it('counts only the contiguous leading run already seen', () => {
+    const seen = new Set([1, 2, 3, 9]);
+    // 1,2,3 seen contiguously; 4 breaks the run even though 9 is seen later.
+    expect(countSeenPrefixBlocks([1, 2, 3, 4, 9], seen)).toBe(3);
+  });
+
+  it('returns 0 when the first block is unseen', () => {
+    expect(countSeenPrefixBlocks([7, 1, 2], new Set([1, 2]))).toBe(0);
+  });
+
+  it('returns the full length when every block is seen', () => {
+    expect(countSeenPrefixBlocks([1, 2], new Set([1, 2, 3]))).toBe(2);
+  });
+
+  it('handles empty hash list', () => {
+    expect(countSeenPrefixBlocks([], new Set([1]))).toBe(0);
+  });
+});
+
+describe('buildConversationStructure', () => {
+  it('splits input into cached-prefix vs uncached as the prefix cache warms', () => {
+    const conv: RawWekaConversation = {
+      id: 'c1',
+      block_size: 64,
+      requests: [
+        // Turn 0: nothing seen yet → all uncached.
+        { type: 'n', model: 'm', in: 128, out: 10, hash_ids: [1, 2] },
+        // Turn 1: blocks 1,2 already seen, 3 is new → 2 blocks cached.
+        { type: 'n', model: 'm', in: 192, out: 20, hash_ids: [1, 2, 3] },
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    const t0 = s.nodes[0] as TurnNode;
+    const t1 = s.nodes[1] as TurnNode;
+    expect(t0).toMatchObject({ kind: 'turn', in: 128, cached: 0, uncached: 128, out: 10 });
+    expect(t1.cached).toBe(128); // 2 blocks × 64
+    expect(t1.uncached).toBe(64); // 192 - 128
+    expect(s.totals).toMatchObject({
+      in: 320,
+      out: 30,
+      cached: 128,
+      uncached: 192,
+      numTurns: 2,
+      numSubagentGroups: 0,
+    });
+  });
+
+  it('stamps top-level turns with their raw Weka request index', () => {
+    const structure = buildConversationStructure({
+      id: 'raw-index',
+      requests: [
+        { type: 'n', in: 1, out: 1 },
+        { type: 'subagent', requests: [{ type: 'n', in: 1, out: 1 }] },
+        { type: 'n', in: 1, out: 1 },
+      ],
+    });
+
+    expect((structure.nodes[0] as TurnNode).rawIndex).toBe(0);
+    expect((structure.nodes[2] as TurnNode).rawIndex).toBe(2);
+  });
+
+  it('clamps cached to the effective input on a partial last block', () => {
+    const conv: RawWekaConversation = {
+      id: 'c2',
+      block_size: 64,
+      requests: [
+        { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // 2 blocks but in=100 (partial)
+        { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // both seen → cached clamped to 100
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    const t1 = s.nodes[1] as TurnNode;
+    expect(t1.cached).toBe(100);
+    expect(t1.uncached).toBe(0);
+  });
+
+  it('treats turns with no hash_ids as fully uncached', () => {
+    const conv: RawWekaConversation = {
+      id: 'c3',
+      requests: [{ type: 'n', in: 50, out: 5 }],
+    };
+    const t0 = buildConversationStructure(conv).nodes[0] as TurnNode;
+    expect(t0).toMatchObject({ cached: 0, uncached: 50 });
+  });
+
+  it('nests subagent groups with aggregated children and runs them against a spawn-time snapshot', () => {
+    const conv: RawWekaConversation = {
+      id: 'c4',
+      block_size: 64,
+      requests: [
+        { type: 'n', model: 'main', t: 0, api_time: 1, in: 64, out: 10, hash_ids: [1] },
+        {
+          type: 'subagent',
+          agent_id: 'a1',
+          subagent_type: 'Explore',
+          t: 12.5,
+          duration_ms: 1234,
+          requests: [
+            // sees parent block 1 (snapshot at spawn) → 1 block cached
+            { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] },
+            // now block 5 is also seen within the subagent → 2 cached
+            { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] },
+          ],
+        },
+        // Parent turn after subagent: block 5 must NOT be cached (subagent
+        // context not folded back); only block 1 is in the parent seen set.
+        { type: 'n', model: 'main', in: 128, out: 1, hash_ids: [1, 5] },
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    expect(s.totals.numTurns).toBe(2); // two top-level normal turns
+    expect(s.totals.numSubagentGroups).toBe(1);
+
+    const sub = s.nodes[1] as SubagentNode;
+    expect(sub.kind).toBe('subagent');
+    expect(sub.label).toBe('Explore');
+    expect(sub.agentId).toBe('a1');
+    expect(sub.rawIndex).toBe(1);
+    expect(sub.durationMs).toBe(1234);
+    expect(sub.startS).toBe(12.5);
+    expect(sub.endS).toBeCloseTo(13.734, 6);
+    expect(sub.children).toHaveLength(2);
+    expect(countConversationRequests(s)).toBe(4);
+    expect(subagentRequestTurns(s).map((turn) => turn.model)).toEqual(['sub', 'sub']);
+    expect(sub.children.map((child) => [child.startS, child.endS])).toEqual([
+      [12.5, 12.5],
+      [13.1, 13.1],
+    ]);
+    expect(sub.children.map((child) => [child.rawIndex, child.innerIndex])).toEqual([
+      [1, 0],
+      [1, 1],
+    ]);
+    expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
+    expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
+    expect(sub.in).toBe(256);
+    expect(sub.out).toBe(10);
+
+    const afterSub = s.nodes[2] as TurnNode;
+    expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back
+    expect((s.nodes[0] as TurnNode).endS).toBe(1);
+  });
+
+  it('counts top-level and subagent child turns as requests, but not subagent groups', () => {
+    const structure = buildConversationStructure({
+      id: 'request-count',
+      requests: [
+        { type: 'n', in: 1, out: 1 },
+        {
+          type: 'subagent',
+          requests: [
+            { type: 'n', in: 1, out: 1 },
+            { type: 'n', in: 1, out: 1 },
+          ],
+        },
+      ],
+    });
+
+    expect(countConversationRequests(structure)).toBe(3);
+    expect(subagentRequestTurns(structure)).toHaveLength(2);
+  });
+
+  it('falls back to the default block size and a generic subagent label', () => {
+    const conv: RawWekaConversation = {
+      id: 'c5',
+      requests: [{ type: 'subagent', requests: [{ type: 'n', in: 10, out: 1, hash_ids: [1] }] }],
+    };
+    const s = buildConversationStructure(conv);
+    expect(s.blockSize).toBe(64);
+    expect((s.nodes[0] as SubagentNode).label).toBe('Subagent');
+  });
+
+  it('derives a subagent time range from child timings when group timing is absent', () => {
+    const conv: RawWekaConversation = {
+      id: 'c6',
+      requests: [
+        {
+          type: 'subagent',
+          requests: [
+            { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 },
+            { type: 'n', t: 9, api_time: 3, in: 10, out: 1 },
+          ],
+        },
+      ],
+    };
+    const sub = buildConversationStructure(conv).nodes[0] as SubagentNode;
+    expect(sub.startS).toBe(5);
+    expect(sub.endS).toBe(12);
+  });
+
+  it('normalizes legacy subagent-relative request intervals', () => {
+    const structure = buildConversationStructure({
+      id: 'legacy-relative',
+      requests: [
+        {
+          type: 'subagent',
+          t: 100,
+          requests: [{ type: 'n', t: 2, api_time: 3, in: 10, out: 1 }],
+        },
+      ],
+    });
+    const child = (structure.nodes[0] as SubagentNode).children[0]!;
+    expect(child).toMatchObject({ startS: 102, endS: 105 });
+  });
+});
+
+describe('histograms', () => {
+  it('linearHistogram buckets across [0, max] and totals the count', () => {
+    const bins = linearHistogram([0, 1, 2, 3, 4], 4);
+    expect(bins).toHaveLength(4);
+    expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(5);
+    expect(bins[0].x0).toBe(0);
+  });
+
+  it('linearHistogram handles all-zero input', () => {
+    expect(linearHistogram([0, 0])).toEqual([{ x0: 0, x1: 1, count: 2 }]);
+  });
+
+  it('logHistogram drops non-positive values and preserves the positive total', () => {
+    const bins = logHistogram([1, 10, 100, 1000, 0, -5], 3);
+    expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(4);
+  });
+
+  it('both return [] for empty input', () => {
+    expect(linearHistogram([])).toEqual([]);
+    expect(logHistogram([])).toEqual([]);
+  });
+
+  it('preserves zero-valued samples in a dedicated log histogram bin', () => {
+    const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4);
+    expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 });
+    expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5);
+  });
+});
+
+describe('summarizeValues', () => {
+  it('computes the same linearly-interpolated percentile set as request distributions', () => {
+    const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1));
+    expect(summary.median).toBeCloseTo(50.5, 6);
+    expect(summary.p75).toBeCloseTo(75.25, 6);
+    expect(summary.p90).toBeCloseTo(90.1, 6);
+    expect(summary.p95).toBeCloseTo(95.05, 6);
+  });
+});
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
new file mode 100644
index 00000000..ccfb6ec7
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.ts
@@ -0,0 +1,327 @@
+/**
+ * Pure transforms for the HuggingFace cc-traces-weka datasets.
+ *
+ * Turns a raw conversation record (`{ id, block_size, requests[] }`, where each
+ * request is a normal turn or a subagent group) into a compact, flamegraph-ready
+ * `structure`: ordered nodes with input split into cached-prefix vs
+ * uncached-suffix. The cached split ports `_count_seen_prefix_blocks` from the
+ * aiperf weka loader (contiguous leading hash_ids already seen under an infinite
+ * KV cache). No DB access — safe to import anywhere and unit-test directly.
+ */
+
+export const DEFAULT_BLOCK_SIZE = 64;
+
+// ── Raw record shapes (subset we read) ──────────────────────────────────────
+
+export interface RawWekaRequest {
+  t?: number;
+  type?: string; // 'n' | 's'
+  model?: string;
+  in?: number;
+  out?: number;
+  hash_ids?: number[];
+  api_time?: number;
+}
+
+export interface RawWekaSubagent {
+  t?: number;
+  type: 'subagent';
+  agent_id?: string;
+  subagent_type?: string;
+  duration_ms?: number;
+  requests?: RawWekaRequest[];
+  models?: string[];
+}
+
+export type RawWekaEntry = RawWekaRequest | RawWekaSubagent;
+
+export interface RawWekaConversation {
+  id: string;
+  models?: string[];
+  block_size?: number;
+  hash_id_scope?: string;
+  requests?: RawWekaEntry[];
+}
+
+// ── Output structure (stored in dataset_conversations.structure) ─────────────
+
+export interface TurnNode {
+  kind: 'turn';
+  turnIndex: number;
+  /** Zero-based index in the raw Weka requests array, when this row maps to one. */
+  rawIndex?: number;
+  /** Zero-based index within a raw nested request array, when this row maps to one. */
+  innerIndex?: number;
+  /** Seconds from the start of the conversation. */
+  startS?: number;
+  /** End of the original request interval (`startS + api_time`). */
+  endS?: number;
+  model?: string;
+  in: number;
+  out: number;
+  /** Input tokens served from the prefix cache (≤ in). */
+  cached: number;
+  /** Input tokens that must be (re)computed (in - cached). */
+  uncached: number;
+}
+
+export interface SubagentNode {
+  kind: 'subagent';
+  label: string;
+  agentId?: string;
+  /** Zero-based index of the raw top-level subagent marker. */
+  rawIndex?: number;
+  /** Seconds from the start of the conversation. */
+  startS?: number;
+  /** Seconds from the start of the conversation. */
+  endS?: number;
+  durationMs?: number;
+  in: number;
+  out: number;
+  cached: number;
+  uncached: number;
+  children: TurnNode[];
+}
+
+export type StructureNode = TurnNode | SubagentNode;
+
+export interface ConversationStructure {
+  blockSize: number;
+  nodes: StructureNode[];
+  totals: {
+    in: number;
+    out: number;
+    cached: number;
+    uncached: number;
+    numTurns: number;
+    numSubagentGroups: number;
+  };
+}
+
+/** Actual model requests in a conversation: main turns plus subagent child turns. */
+export function countConversationRequests(structure: ConversationStructure): number {
+  return structure.totals.numTurns + subagentRequestTurns(structure).length;
+}
+
+/** Model requests issued by inner subagents, excluding all parent-agent turns. */
+export function subagentRequestTurns(structure: ConversationStructure): TurnNode[] {
+  return structure.nodes.flatMap((node) => (node.kind === 'subagent' ? node.children : []));
+}
+
+const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent =>
+  (e as RawWekaSubagent).type === 'subagent';
+
+/**
+ * Count contiguous leading hash_ids already present in `seen`
+ * (port of aiperf `_count_seen_prefix_blocks`).
+ */
+export function countSeenPrefixBlocks(
+  hashIds: readonly number[],
+  seen: ReadonlySet<number>,
+): number {
+  let hits = 0;
+  for (const h of hashIds) {
+    if (!seen.has(h)) break;
+    hits += 1;
+  }
+  return hits;
+}
+
+/**
+ * Compute the {cached, uncached} input-token split for one request and fold its
+ * blocks into `seen`. `cached` is derived from blocks but clamped to the
+ * request's effective `in` so cached+uncached === in even when the last block is
+ * partial (in = hash_token_count, not always a multiple of blockSize).
+ */
+function splitInput(
+  req: RawWekaRequest,
+  seen: Set<number>,
+  blockSize: number,
+): { in: number; cached: number; uncached: number } {
+  const input = Math.max(0, Math.round(req.in ?? 0));
+  const hashIds = req.hash_ids ?? [];
+  if (hashIds.length === 0) {
+    return { in: input, cached: 0, uncached: input };
+  }
+  const cachedBlocks = countSeenPrefixBlocks(hashIds, seen);
+  for (const h of hashIds) seen.add(h);
+  const cached = Math.min(input, cachedBlocks * blockSize);
+  return { in: input, cached, uncached: input - cached };
+}
+
+function subagentLabel(s: RawWekaSubagent): string {
+  const base = s.subagent_type?.trim();
+  return base && base.length > 0 ? base : 'Subagent';
+}
+
+function finiteTime(value: number | undefined): number | undefined {
+  return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined;
+}
+
+function requestEndS(startS: number | undefined, apiTime: number | undefined): number | undefined {
+  if (startS === undefined) return undefined;
+  const duration = finiteTime(apiTime) ?? 0;
+  return startS + duration;
+}
+
+/** Mirror aiperf's legacy-relative/current-absolute subagent timestamp handling. */
+function subagentRequestStartS(
+  entry: RawWekaSubagent,
+  request: RawWekaRequest,
+): number | undefined {
+  const requestStart = finiteTime(request.t);
+  if (requestStart === undefined) return undefined;
+  const groupStart = finiteTime(entry.t);
+  if (groupStart !== undefined && requestStart + 1e-6 < groupStart) {
+    return groupStart + requestStart;
+  }
+  return requestStart;
+}
+
+function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } {
+  const children = entry.requests ?? [];
+  const childStarts = children
+    .map((child) => subagentRequestStartS(entry, child))
+    .filter((value): value is number => value !== undefined);
+  const startS =
+    finiteTime(entry.t) ?? (childStarts.length > 0 ? Math.min(...childStarts) : undefined);
+  const durationMs = finiteTime(entry.duration_ms);
+  if (startS !== undefined && durationMs !== undefined) {
+    return { startS, endS: startS + durationMs / 1000 };
+  }
+
+  const childEnds = children
+    .map((child) => {
+      const childStart = subagentRequestStartS(entry, child);
+      if (childStart === undefined) return undefined;
+      return childStart + (finiteTime(child.api_time) ?? 0);
+    })
+    .filter((value): value is number => value !== undefined);
+  return {
+    startS,
+    endS: childEnds.length > 0 ? Math.max(...childEnds) : startS,
+  };
+}
+
+/**
+ * Build the flamegraph structure for one conversation. Main turns share a single
+ * accumulating prefix-cache `seen` set; each subagent group runs against a
+ * *copy* of the parent `seen` at spawn (its context is separate and is not
+ * folded back into the parent), mirroring the weka loader's parent/child split.
+ */
+export function buildConversationStructure(
+  conv: RawWekaConversation,
+  blockSizeOverride?: number,
+): ConversationStructure {
+  const blockSize = blockSizeOverride ?? conv.block_size ?? DEFAULT_BLOCK_SIZE;
+  const seen = new Set<number>();
+  const nodes: StructureNode[] = [];
+  let totalIn = 0;
+  let totalOut = 0;
+  let totalCached = 0;
+  let totalUncached = 0;
+  let numTurns = 0;
+  let numSubagentGroups = 0;
+  let turnIndex = 0;
+
+  for (const [idx, entry] of (conv.requests ?? []).entries()) {
+    if (isSubagent(entry)) {
+      const { startS, endS } = subagentTimeRange(entry);
+      const childSeen = new Set(seen); // snapshot at spawn; not merged back
+      const children: TurnNode[] = [];
+      let gin = 0;
+      let gout = 0;
+      let gcached = 0;
+      let guncached = 0;
+      for (const [innerIdx, inner] of (entry.requests ?? []).entries()) {
+        const split = splitInput(inner, childSeen, blockSize);
+        const out = Math.max(0, Math.round(inner.out ?? 0));
+        const childStartS = subagentRequestStartS(entry, inner);
+        children.push({
+          kind: 'turn',
+          turnIndex: turnIndex++,
+          rawIndex: idx,
+          innerIndex: innerIdx,
+          startS: childStartS,
+          endS: requestEndS(childStartS, inner.api_time),
+          model: inner.model,
+          in: split.in,
+          out,
+          cached: split.cached,
+          uncached: split.uncached,
+        });
+        gin += split.in;
+        gout += out;
+        gcached += split.cached;
+        guncached += split.uncached;
+      }
+      nodes.push({
+        kind: 'subagent',
+        label: subagentLabel(entry),
+        agentId: entry.agent_id,
+        rawIndex: idx,
+        startS,
+        endS,
+        durationMs: entry.duration_ms,
+        in: gin,
+        out: gout,
+        cached: gcached,
+        uncached: guncached,
+        children,
+      });
+      numSubagentGroups += 1;
+      totalIn += gin;
+      totalOut += gout;
+      totalCached += gcached;
+      totalUncached += guncached;
+    } else {
+      const split = splitInput(entry, seen, blockSize);
+      const out = Math.max(0, Math.round(entry.out ?? 0));
+      const startS = finiteTime(entry.t);
+      nodes.push({
+        kind: 'turn',
+        turnIndex: turnIndex++,
+        rawIndex: idx,
+        startS,
+        endS: requestEndS(startS, entry.api_time),
+        model: entry.model,
+        in: split.in,
+        out,
+        cached: split.cached,
+        uncached: split.uncached,
+      });
+      numTurns += 1;
+      totalIn += split.in;
+      totalOut += out;
+      totalCached += split.cached;
+      totalUncached += split.uncached;
+    }
+  }
+
+  return {
+    blockSize,
+    nodes,
+    totals: {
+      in: totalIn,
+      out: totalOut,
+      cached: totalCached,
+      uncached: totalUncached,
+      numTurns,
+      numSubagentGroups,
+    },
+  };
+}
+
+// ── Distribution binning (for the dataset-detail cards) ──────────────────────
+// The implementations moved to distribution-stats.ts (generic, dataset-agnostic
+// math); re-exported here because this module is the established import path
+// for the dataset ingest/backfill scripts and the frontend.
+
+export {
+  linearHistogram,
+  logHistogram,
+  logHistogramWithZero,
+  summarizeValues,
+  type HistogramBin,
+  type NumberSummary,
+} from './distribution-stats';

From 28b78cdb22ed64ca385401003271b5ed310857ab Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:11:35 -0500
Subject: [PATCH 04/40] feat(db): agentic query layer, backfill CLIs, weka
 dataset ingest

---
 packages/db/src/backfill-agentic-intvty.ts    | 107 +++++
 .../db/src/backfill-agentic-server-logs.ts    | 215 +++++++++
 packages/db/src/backfill-aggregate-stats.ts   | 126 ++++++
 packages/db/src/backfill-chart-series.ts      | 124 ++++++
 packages/db/src/backfill-dataset-stats.ts     | 111 +++++
 packages/db/src/backfill-kv-pool.ts           | 103 +++++
 packages/db/src/backfill-request-timeline.ts  |  97 ++++
 packages/db/src/ingest-ci-run.ts              | 187 ++++++--
 packages/db/src/ingest-gcs-backup.ts          |   9 +-
 packages/db/src/ingest-supplemental.ts        |  14 +-
 packages/db/src/ingest-weka-dataset.ts        | 416 ++++++++++++++++++
 .../src/json-provider.line-single-run.test.ts |  26 +-
 packages/db/src/json-provider.ts              |  17 +-
 packages/db/src/lib/backfill-runner.test.ts   |  55 +++
 packages/db/src/lib/backfill-runner.ts        |  98 +++++
 packages/db/src/lib/github-artifacts.test.ts  |  42 ++
 packages/db/src/lib/github-artifacts.ts       |  86 ++++
 .../db/src/queries/agentic-aggregates.test.ts | 113 +++++
 packages/db/src/queries/agentic-aggregates.ts | 406 +++++++++++++++++
 packages/db/src/queries/agentic-shared.ts     |  81 ++++
 packages/db/src/queries/benchmark-siblings.ts | 169 +++++++
 packages/db/src/queries/benchmarks.ts         |  33 +-
 packages/db/src/queries/datasets.test.ts      | 102 +++++
 packages/db/src/queries/datasets.ts           | 213 +++++++++
 .../queries/derived-agentic-metrics.test.ts   | 111 +++++
 .../db/src/queries/derived-agentic-metrics.ts | 268 +++++++++++
 .../db/src/queries/request-timeline.test.ts   |  45 ++
 packages/db/src/queries/request-timeline.ts   |  64 +++
 packages/db/src/queries/trace-availability.ts |  34 ++
 .../db/src/queries/trace-histograms.test.ts   |  78 ++++
 packages/db/src/queries/trace-histograms.ts   | 134 ++++++
 .../src/queries/trace-server-metrics.test.ts  | 105 +++++
 .../db/src/queries/trace-server-metrics.ts    | 211 +++++++++
 packages/db/src/queries/workflow-info.ts      |  15 +-
 34 files changed, 3944 insertions(+), 71 deletions(-)
 create mode 100644 packages/db/src/backfill-agentic-intvty.ts
 create mode 100644 packages/db/src/backfill-agentic-server-logs.ts
 create mode 100644 packages/db/src/backfill-aggregate-stats.ts
 create mode 100644 packages/db/src/backfill-chart-series.ts
 create mode 100644 packages/db/src/backfill-dataset-stats.ts
 create mode 100644 packages/db/src/backfill-kv-pool.ts
 create mode 100644 packages/db/src/backfill-request-timeline.ts
 create mode 100644 packages/db/src/ingest-weka-dataset.ts
 create mode 100644 packages/db/src/lib/backfill-runner.test.ts
 create mode 100644 packages/db/src/lib/backfill-runner.ts
 create mode 100644 packages/db/src/lib/github-artifacts.test.ts
 create mode 100644 packages/db/src/lib/github-artifacts.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.test.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.ts
 create mode 100644 packages/db/src/queries/agentic-shared.ts
 create mode 100644 packages/db/src/queries/benchmark-siblings.ts
 create mode 100644 packages/db/src/queries/datasets.test.ts
 create mode 100644 packages/db/src/queries/datasets.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.test.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.ts
 create mode 100644 packages/db/src/queries/request-timeline.test.ts
 create mode 100644 packages/db/src/queries/request-timeline.ts
 create mode 100644 packages/db/src/queries/trace-availability.ts
 create mode 100644 packages/db/src/queries/trace-histograms.test.ts
 create mode 100644 packages/db/src/queries/trace-histograms.ts
 create mode 100644 packages/db/src/queries/trace-server-metrics.test.ts
 create mode 100644 packages/db/src/queries/trace-server-metrics.ts

diff --git a/packages/db/src/backfill-agentic-intvty.ts b/packages/db/src/backfill-agentic-intvty.ts
new file mode 100644
index 00000000..a8eebdba
--- /dev/null
+++ b/packages/db/src/backfill-agentic-intvty.ts
@@ -0,0 +1,107 @@
+/**
+ * Backfill: enforce the slow-tail interactivity invariant on agentic rows.
+ *
+ * Agentic trace-replay artifacts emit both `*_itl` and `*_intvty`. Historically
+ * the harness wrote `*_intvty = 1/p(ITL)` (slow-tail — "interactivity at the
+ * p-th latency"), which is what the inference chart's interactivity selector
+ * and the detail time-series both assume. A later "timing fix" harness started
+ * emitting `*_intvty = p(1/ITL)` instead (fast-tail — equivalent to
+ * `1/p(100-x)(ITL)`), because taking the reciprocal reverses percentile order.
+ * Ingest stores every metric verbatim, so those runs landed in the DB with the
+ * opposite definition — e.g. p90 reading 23.9 instead of 11.2 for the same
+ * point — contaminating cross-run Pareto comparisons.
+ *
+ * This rewrites `mean/p75/p90/p95 _intvty = 1/_itl` for every agentic row so the
+ * stored value always matches the slow-tail definition the charts use. It is
+ * idempotent: rows already on the correct definition are left untouched (guarded
+ * by a relative-deviation check). `std_intvty` is intentionally NOT touched —
+ * the reciprocal of a standard deviation is meaningless, and the API strips it.
+ * The prior fast-tail value is discarded on purpose (p10_itl isn't stored, so it
+ * isn't recoverable anyway, and per project policy fast-tail must not back a
+ * slow-tail selector).
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js';
+
+// Percentile-style keys whose interactivity is the reciprocal of the matching
+// ITL percentile. `std` is excluded by design (not a reciprocal); `median`/`p99`
+// are absent from agentic artifacts so they never appear here.
+const KEYS = ['mean', 'p75', 'p90', 'p95'] as const;
+
+// Relative tolerance: skip rows already within 1e-6 of 1/itl so correct rows
+// keep their original full-precision value and the change counts are accurate.
+const REL_TOL = 1e-6;
+
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+async function contaminationCounts(): Promise<Record<string, number>> {
+  const out: Record<string, number> = {};
+  for (const k of KEYS) {
+    const rows = await sql.unsafe(`
+      SELECT count(*)::int AS n
+      FROM benchmark_results
+      WHERE benchmark_type = 'agentic_traces'
+        AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
+        AND metrics ? '${k}_intvty'
+        AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
+            > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
+    `);
+    out[k] = (rows[0] as unknown as { n: number }).n;
+  }
+  return out;
+}
+
+async function main(): Promise<void> {
+  const total = await sql<{ n: number }[]>`
+    SELECT count(*)::int AS n FROM benchmark_results WHERE benchmark_type = 'agentic_traces'
+  `;
+  console.log(`Agentic rows: ${total[0]!.n}`);
+
+  const before = await contaminationCounts();
+  console.log('Contaminated (intvty != 1/itl) before:', JSON.stringify(before));
+  if (KEYS.every((k) => before[k] === 0)) {
+    console.log('Nothing to backfill — all agentic rows already satisfy intvty = 1/itl.');
+    await sql.end();
+    return;
+  }
+
+  if (!hasYesFlag() && !(await confirm('Rewrite *_intvty = 1/*_itl for these rows? (y/N) '))) {
+    await sql.end();
+    return;
+  }
+
+  let totalUpdated = 0;
+  for (const k of KEYS) {
+    // keys are from a fixed trusted const — safe to interpolate.
+    const res = await sql.unsafe(`
+      UPDATE benchmark_results
+      SET metrics = jsonb_set(metrics, '{${k}_intvty}', to_jsonb(1.0 / (metrics->>'${k}_itl')::numeric))
+      WHERE benchmark_type = 'agentic_traces'
+        AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
+        AND metrics ? '${k}_intvty'
+        AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
+            > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
+    `);
+    console.log(`  ${k}_intvty: updated ${res.count} row(s)`);
+    totalUpdated += res.count;
+  }
+
+  const after = await contaminationCounts();
+  console.log('Contaminated after:', JSON.stringify(after));
+  if (!KEYS.every((k) => after[k] === 0)) {
+    throw new Error('Backfill incomplete — some rows still deviate. Aborting before MV refresh.');
+  }
+
+  await refreshLatestBenchmarks(sql);
+  console.log(`Done. Rewrote ${totalUpdated} metric value(s) across agentic rows.`);
+  await sql.end();
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
diff --git a/packages/db/src/backfill-agentic-server-logs.ts b/packages/db/src/backfill-agentic-server-logs.ts
new file mode 100644
index 00000000..37157861
--- /dev/null
+++ b/packages/db/src/backfill-agentic-server-logs.ts
@@ -0,0 +1,215 @@
+/**
+ * Backfill server logs (and the derived KV-cache pool size) for AGENTIC
+ * benchmark points.
+ *
+ * Agentic runs upload their vLLM server log as a `server_logs_<key>` artifact,
+ * but the ingest path historically failed to link it to agentic rows (the
+ * `bmk_agentic_<key>` → `server_logs_<key>` key mismatch, now fixed in
+ * ingest-ci-run). As a result the agentic server log text was never stored, so
+ * `kv_cache_pool_tokens` cannot be derived from the DB — we must re-fetch the
+ * artifacts from GitHub.
+ *
+ * For each agentic workflow run this:
+ *   1. lists the run's artifacts and keeps only `server_logs_*` + `bmk_agentic_*`
+ *      (dedup by logical name, mirroring ingest's runner-suffix collapse),
+ *   2. downloads + unzips just those (small — skips the multi-MB trace dirs),
+ *   3. maps each `bmk_agentic_<key>` JSON → config → benchmark_results rows via
+ *      the same mapBenchmarkRow/config-cache logic ingest uses,
+ *   4. calls insertServerLog(), which stores+links the log AND derives
+ *      `kv_cache_pool_tokens` into benchmark_results.metrics.
+ *
+ * Idempotent: insertServerLog only links rows whose server_log_id is null.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-server-logs
+ *     [--limit N]   only process the first N workflow runs
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { hasNoSslFlag } from './cli-utils';
+import { insertServerLog } from './etl/benchmark-ingest';
+import { mapBenchmarkRow } from './etl/benchmark-mapper';
+import { createConfigCache } from './etl/config-cache';
+import { createAdminSql } from './etl/db-utils';
+import { createSkipTracker } from './etl/skip-tracker';
+import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner';
+import {
+  RUNNER_SUFFIX_RE,
+  dedupeArtifactsByLogicalName,
+  downloadArtifact,
+  listRunArtifacts,
+  type ArtifactMeta,
+} from './lib/github-artifacts';
+
+const REPO = 'SemiAnalysisAI/InferenceX';
+
+const flags = parseLimitForceFlags();
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+/**
+ * List the run's `server_logs_*` / `bmk_agentic_*` artifacts, deduped by
+ * runner-suffix-stripped logical name (matches ingest's collapse).
+ */
+function listArtifacts(githubRunId: string): Map<string, ArtifactMeta> {
+  return dedupeArtifactsByLogicalName(
+    listRunArtifacts(REPO, githubRunId).filter(
+      (a) => a.name.startsWith('server_logs_') || a.name.startsWith('bmk_agentic_'),
+    ),
+  );
+}
+
+/** Logical key shared by a server_logs_/bmk_agentic_ artifact pair. */
+function logicalKey(name: string): string {
+  return name
+    .replace(/^server_logs_/u, '')
+    .replace(/^bmk_agentic_/u, '')
+    .replace(RUNNER_SUFFIX_RE, '');
+}
+
+/**
+ * Read up to `maxBytes` of a (possibly huge) server log as UTF-8, stripping NUL
+ * bytes. vLLM's "GPU KV cache size" startup lines are near the top, so a head
+ * read is enough to derive the KV pool — and it caps storage for the rare
+ * multi-hundred-MB logs that exceed V8's ~512 MB string limit.
+ */
+const stripNul = (s: string): string => s.replaceAll(String.fromCodePoint(0), '');
+
+function readServerLogCapped(p: string, maxBytes = 64 * 1024 * 1024): string {
+  if (fs.statSync(p).size <= maxBytes) return stripNul(fs.readFileSync(p, 'utf8'));
+  const fd = fs.openSync(p, 'r');
+  try {
+    const buf = Buffer.allocUnsafe(maxBytes);
+    const n = fs.readSync(fd, buf, 0, maxBytes, 0);
+    return stripNul(buf.subarray(0, n).toString('utf8'));
+  } finally {
+    fs.closeSync(fd);
+  }
+}
+
+function findJsonFiles(dir: string): string[] {
+  const out: string[] = [];
+  const walk = (d: string) => {
+    for (const e of fs.readdirSync(d, { withFileTypes: true })) {
+      const p = path.join(d, e.name);
+      if (e.isDirectory()) walk(p);
+      else if (e.name.endsWith('.json')) out.push(p);
+    }
+  };
+  walk(dir);
+  return out;
+}
+
+async function main(): Promise<void> {
+  console.log('=== backfill-agentic-server-logs ===');
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Agentic workflow runs that still have unlinked server logs.
+  const runs = await sql<{ github_run_id: string; workflow_run_id: number }[]>`
+    select distinct wr.github_run_id::text as github_run_id, wr.id as workflow_run_id
+    from benchmark_results br
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    where br.benchmark_type = 'agentic_traces'
+      and br.server_log_id is null
+    order by wr.id
+    ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+  `;
+
+  if (runs.length === 0) {
+    console.log('\n  Nothing to do — all agentic rows already have a server log.');
+    return;
+  }
+  if (!(await confirmProceed(`${runs.length} agentic workflow run(s) to process.`))) return;
+
+  const cache = createConfigCache(sql);
+  await cache.preloadConfigs();
+  const tracker = createSkipTracker();
+
+  let linkedRows = 0;
+  let runsOk = 0;
+  let runsFailed = 0;
+  const t0 = Date.now();
+
+  for (const { github_run_id: githubRunId, workflow_run_id: wrId } of runs) {
+    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `kvpool-${githubRunId}-`));
+    try {
+      const artifacts = listArtifacts(githubRunId);
+      // server log path by logical key
+      const serverLogByKey = new Map<string, string>();
+      const bmkDirs: string[] = [];
+      for (const art of artifacts.values()) {
+        const dir = downloadArtifact(art, tmp);
+        if (art.name.startsWith('server_logs_')) {
+          const logPath = path.join(dir, 'server.log');
+          if (fs.existsSync(logPath)) serverLogByKey.set(logicalKey(art.name), logPath);
+        } else {
+          bmkDirs.push(dir);
+        }
+      }
+
+      let runLinked = 0;
+      for (const bmkDir of bmkDirs) {
+        const key = logicalKey(path.basename(bmkDir));
+        const logPath = serverLogByKey.get(key);
+        if (!logPath) continue;
+        for (const file of findJsonFiles(bmkDir)) {
+          let raw: unknown;
+          try {
+            raw = JSON.parse(fs.readFileSync(file, 'utf8'));
+          } catch {
+            continue;
+          }
+          const rows = Array.isArray(raw) ? raw : [raw];
+          for (const row of rows) {
+            if (!row || typeof row !== 'object') continue;
+            const mapped = mapBenchmarkRow(row as Record<string, unknown>, tracker);
+            if (!mapped || mapped.benchmarkType !== 'agentic_traces') continue;
+            const configId = await cache.getOrCreateConfig(mapped.config);
+            const ids = await sql<{ id: number }[]>`
+              select id from benchmark_results
+              where workflow_run_id = ${wrId}
+                and config_id = ${configId}
+                and conc = ${mapped.conc}
+                and benchmark_type = 'agentic_traces'
+                and server_log_id is null
+            `;
+            if (ids.length === 0) continue;
+            const serverLog = readServerLogCapped(logPath);
+            await insertServerLog(
+              sql,
+              ids.map((r) => r.id),
+              serverLog,
+            );
+            runLinked += ids.length;
+          }
+        }
+      }
+      linkedRows += runLinked;
+      runsOk++;
+      const elapsed = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ run ${githubRunId}: ${serverLogByKey.size} log(s), linked ${runLinked} row(s) ` +
+          `(${runsOk}/${runs.length}, ${elapsed}s total)`,
+      );
+    } catch (error) {
+      runsFailed++;
+      console.error(
+        `  ✗ run ${githubRunId}: ${error instanceof Error ? (error.stack ?? error.message) : String(error)}`,
+      );
+    } finally {
+      fs.rmSync(tmp, { recursive: true, force: true });
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(
+    `\n=== complete: ${linkedRows} row(s) linked across ${runsOk} run(s) ` +
+      `(${runsFailed} failed) in ${totalSec}s ===`,
+  );
+  if (runsFailed > 0) process.exitCode = 1;
+}
+
+runBackfillMain('backfill-agentic-server-logs', sql, main);
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
new file mode 100644
index 00000000..2e3a4038
--- /dev/null
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -0,0 +1,126 @@
+/**
+ * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it
+ * or were computed by an older `STATS_VERSION`.
+ *
+ * The ingest path now computes stats inline, but existing rows (and rows
+ * whose computation logic has since changed) still need this pass. Run after
+ * applying migration 008 and any time `STATS_VERSION` bumps.
+ *
+ * Strategy:
+ *   - Stream rows one at a time (server_metrics_json_gz can be hundreds of
+ *     MB decompressed for TP+EP / high-conc points — keeping one in memory
+ *     at a time avoids OOM).
+ *   - Skip rows whose stored `aggregate_stats.version` already matches.
+ *   - Recompute via the same `computeAggregateStats()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats
+ *     [--limit N]   only process the first N candidate rows (useful for
+ *                   smoke-tests on a fresh deploy)
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import {
+  computeAggregateStats,
+  mergeProfileStatsUpgrade,
+  STATS_VERSION,
+  type AggregateStats,
+} from './etl/compute-aggregate-stats.js';
+import { createAdminSql } from './etl/db-utils.js';
+import {
+  confirmProceed,
+  jsonbParam,
+  parseLimitForceFlags,
+  runBackfillMain,
+  runPerIdBackfill,
+} from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-aggregate-stats ===');
+  console.log(`  STATS_VERSION = ${STATS_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Find candidates: rows missing stats, or whose stored version is stale.
+  // Using >>'version'::int comparison would error on null; coalesce to -1 so
+  // null-stats rows always count as stale.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where aggregate_stats is null
+           or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION}
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return;
+
+  await runPerIdBackfill(
+    candidates.map((c) => c.id),
+    async (id) => {
+      // Fetch one row at a time — the json_gz blob is the heavy field.
+      const [row] = await sql<
+        { profile_export_jsonl_gz: Buffer | null; aggregate_stats: AggregateStats | null }[]
+      >`
+        select profile_export_jsonl_gz, aggregate_stats
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        return 'skipped';
+      }
+
+      let stats: AggregateStats;
+      if (row.aggregate_stats?.version === 3) {
+        const profileStats = await computeAggregateStats({
+          profileBlob: row.profile_export_jsonl_gz,
+          serverBlob: null,
+        });
+        stats = mergeProfileStatsUpgrade(row.aggregate_stats, profileStats);
+      } else {
+        const [serverRow] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+          select server_metrics_json_gz
+          from agentic_trace_replay
+          where id = ${id}
+        `;
+        stats = await computeAggregateStats({
+          profileBlob: row.profile_export_jsonl_gz,
+          serverBlob: serverRow?.server_metrics_json_gz ?? null,
+        });
+      }
+
+      await sql`
+        update agentic_trace_replay
+        set aggregate_stats = ${jsonbParam(sql, stats)}
+        where id = ${id}
+      `;
+      return 'ok';
+    },
+  );
+}
+
+runBackfillMain('backfill-aggregate-stats', sql, main);
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
new file mode 100644
index 00000000..94e009cf
--- /dev/null
+++ b/packages/db/src/backfill-chart-series.ts
@@ -0,0 +1,124 @@
+/**
+ * Backfill `agentic_trace_replay.chart_series` for rows that are missing it
+ * or were computed by an older `CHART_SERIES_VERSION`.
+ *
+ * The ingest path now computes the time-series inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION`
+ * bumps.
+ *
+ * Strategy:
+ *   - Stream rows one at a time (server_metrics_json_gz can decompress
+ *     past 500 MB on high-conc TP+EP points — one in memory at a time
+ *     avoids OOM).
+ *   - Skip rows whose stored version already matches.
+ *   - Recompute via the same `computeChartSeries()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js';
+import { createAdminSql } from './etl/db-utils.js';
+import {
+  confirmProceed,
+  jsonbParam,
+  parseLimitForceFlags,
+  runBackfillMain,
+  runPerIdBackfill,
+} from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-chart-series ===');
+  console.log(`  CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows that actually have a server_metrics blob can produce a
+  // chart_series. Rows without the blob legitimately keep `chart_series`
+  // null and the API serves them via the slow path (which also returns
+  // null because there's no blob to parse — so the page falls into the
+  // "no stored trace_replay blob" branch).
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+          and (
+            chart_series is null
+            or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return;
+
+  await runPerIdBackfill(
+    candidates.map((c) => c.id),
+    async (id) => {
+      const [row] = await sql<
+        {
+          server_metrics_json_gz: Buffer | null;
+          framework: string | null;
+          disagg: boolean | null;
+        }[]
+      >`
+        select atr.server_metrics_json_gz, source.framework, source.disagg
+        from agentic_trace_replay atr
+        left join lateral (
+          select c.framework, c.disagg
+          from benchmark_results br
+          join configs c on c.id = br.config_id
+          where br.trace_replay_id = atr.id
+          order by br.id
+          limit 1
+        ) source on true
+        where atr.id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        return 'skipped';
+      }
+
+      const series = await computeChartSeries(row.server_metrics_json_gz, {
+        framework: row.framework,
+        disagg: row.disagg ?? false,
+      });
+
+      await sql`
+        update agentic_trace_replay
+        set chart_series = ${series === null ? null : jsonbParam(sql, series)}
+        where id = ${id}
+      `;
+      return 'ok';
+    },
+  );
+}
+
+runBackfillMain('backfill-chart-series', sql, main);
diff --git a/packages/db/src/backfill-dataset-stats.ts b/packages/db/src/backfill-dataset-stats.ts
new file mode 100644
index 00000000..e9c6916d
--- /dev/null
+++ b/packages/db/src/backfill-dataset-stats.ts
@@ -0,0 +1,111 @@
+/**
+ * Backfill dataset summary stats and subagent-only ISL/OSL distributions from
+ * the compact structures already stored in `dataset_conversations`.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-dataset-stats --yes
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils';
+import { createAdminSql } from './etl/db-utils';
+import { logHistogram, summarizeValues } from './etl/weka-structure';
+import { jsonbParam, runBackfillMain } from './lib/backfill-runner';
+
+interface DatasetRow {
+  id: string;
+  slug: string;
+  summary: Record<string, unknown>;
+  chart_data: Record<string, unknown>;
+}
+
+interface ConversationRow {
+  num_subagent_groups: number | string;
+  request_count: number | string;
+}
+
+interface SubagentRequestRow {
+  input_tokens: number | string;
+  output_tokens: number | string;
+}
+
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+async function main(): Promise<void> {
+  const datasets = await sql<DatasetRow[]>`
+    select id, slug, summary, chart_data
+    from datasets
+    order by slug
+  `;
+  if (datasets.length === 0) {
+    console.log('No datasets found.');
+    return;
+  }
+
+  console.log(`Backfill subagent dataset stats for ${datasets.length} dataset(s).`);
+  if (!hasYesFlag() && !(await confirm('Continue? (y/N) '))) return;
+
+  for (const dataset of datasets) {
+    const conversations = await sql<ConversationRow[]>`
+      select
+        num_subagent_groups,
+        (
+          num_turns + coalesce((
+            select sum(jsonb_array_length(node.value->'children'))
+            from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value)
+            where node.value->>'kind' = 'subagent'
+          ), 0)
+        ) as request_count
+      from dataset_conversations dc
+      where dataset_id = ${dataset.id}
+    `;
+    const requests = await sql<SubagentRequestRow[]>`
+      select
+        (child.value->>'in')::double precision as input_tokens,
+        (child.value->>'out')::double precision as output_tokens
+      from dataset_conversations dc
+      cross join lateral jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value)
+      cross join lateral jsonb_array_elements(coalesce(node.value->'children', '[]'::jsonb)) child(value)
+      where dc.dataset_id = ${dataset.id}
+        and node.value->>'kind' = 'subagent'
+    `;
+
+    const subagentsPerTrace = conversations.map((row) => Number(row.num_subagent_groups));
+    const requestsPerConversation = conversations.map((row) => Number(row.request_count));
+    const inputTokens = requests.map((row) => Number(row.input_tokens));
+    const outputTokens = requests.map((row) => Number(row.output_tokens));
+    const subagentStats = summarizeValues(subagentsPerTrace);
+    const requestStats = summarizeValues(requestsPerConversation);
+    const summary = {
+      ...dataset.summary,
+      version: 3,
+      meanSubagentsPerTrace: subagentStats.mean,
+      medianSubagentsPerTrace: subagentStats.median,
+      meanRequestsPerConversation: requestStats.mean,
+      medianRequestsPerConversation: requestStats.median,
+    };
+    const chartData = {
+      ...dataset.chart_data,
+      version: 3,
+      subagentInputTokensPerRequest: {
+        bins: logHistogram(inputTokens),
+        stats: summarizeValues(inputTokens),
+      },
+      subagentOutputTokensPerRequest: {
+        bins: logHistogram(outputTokens),
+        stats: summarizeValues(outputTokens),
+      },
+    };
+
+    await sql`
+      update datasets
+      set summary = ${sql.json(summary)},
+          chart_data = ${jsonbParam(sql, chartData)}
+      where id = ${dataset.id}
+    `;
+    console.log(
+      `  ${dataset.slug}: ${requests.length.toLocaleString()} inner requests, median ${subagentStats.median}, mean ${subagentStats.mean.toFixed(1)} subagents/trace`,
+    );
+  }
+}
+
+runBackfillMain('backfill-dataset-stats', sql, main);
diff --git a/packages/db/src/backfill-kv-pool.ts b/packages/db/src/backfill-kv-pool.ts
new file mode 100644
index 00000000..efa04c81
--- /dev/null
+++ b/packages/db/src/backfill-kv-pool.ts
@@ -0,0 +1,103 @@
+/**
+ * Backfill `benchmark_results.metrics->kv_cache_pool_tokens` from the captured
+ * server logs. The value is parsed from vLLM's authoritative
+ * "GPU KV cache size: N tokens" startup line(s), summed across data-parallel
+ * engine cores (see {@link kvCachePoolTokensFromServerLog}).
+ *
+ * The ingest path now derives this inline in `insertServerLog`, but existing
+ * rows need this one-time pass. Idempotent: re-running only touches rows that
+ * still lack the value (unless --force).
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-kv-pool
+ *     [--limit N]   only process the first N candidate server logs
+ *     [--force]     recompute even when the value is already set
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import { createAdminSql } from './etl/db-utils.js';
+import { kvCachePoolTokensFromServerLog } from './etl/server-log-metrics.js';
+import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-kv-pool ===');
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // One server log can be linked to several benchmark_results (multiple
+  // concurrency points share a server). Group by log id so we parse each log
+  // once and fan the value out to all its rows.
+  const candidates = flags.force
+    ? await sql<{ server_log_id: number }[]>`
+        select distinct server_log_id
+        from benchmark_results
+        where server_log_id is not null
+        order by server_log_id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ server_log_id: number }[]>`
+        select distinct server_log_id
+        from benchmark_results
+        where server_log_id is not null
+          and metrics->>'kv_cache_pool_tokens' is null
+        order by server_log_id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate server log(s).`))) return;
+
+  let updated = 0;
+  let logsWithValue = 0;
+  let logsNoValue = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { server_log_id: logId } of candidates) {
+    try {
+      const [row] = await sql<{ server_log: string | null }[]>`
+        select server_log from server_logs where id = ${logId}
+      `;
+      const tokens = kvCachePoolTokensFromServerLog(row?.server_log ?? null);
+      if (tokens === null) {
+        logsNoValue++;
+        continue; // non-vLLM or no startup line — leave unset
+      }
+      logsWithValue++;
+      const targets = flags.force
+        ? sql`server_log_id = ${logId}`
+        : sql`server_log_id = ${logId} and metrics->>'kv_cache_pool_tokens' is null`;
+      const result = await sql`
+        update benchmark_results
+        set metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${tokens}::bigint))
+        where ${targets}
+      `;
+      updated += result.count;
+      console.log(`  ✓ log=${logId}: ${tokens.toLocaleString()} tok → ${result.count} row(s)`);
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ log=${logId}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(
+    `\n=== backfill complete: ${updated} row(s) updated from ${logsWithValue} log(s) ` +
+      `(${logsNoValue} log(s) had no KV-pool line, ${failed} failed) in ${totalSec}s ===`,
+  );
+  if (failed > 0) process.exitCode = 1;
+}
+
+runBackfillMain('backfill-kv-pool', sql, main);
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
new file mode 100644
index 00000000..09126654
--- /dev/null
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -0,0 +1,97 @@
+/**
+ * Backfill `agentic_trace_replay.request_timeline` for rows that are
+ * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`.
+ *
+ * The ingest path now computes the timeline inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 010 and any time the version bumps.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+} from './etl/compute-request-timeline.js';
+import { createAdminSql } from './etl/db-utils.js';
+import {
+  confirmProceed,
+  jsonbParam,
+  parseLimitForceFlags,
+  runBackfillMain,
+  runPerIdBackfill,
+} from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-request-timeline ===');
+  console.log(`  REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows with a profile_export blob can produce a timeline. Rows
+  // without the blob keep `request_timeline` null and the API serves them
+  // as "no timeline data".
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+          and (
+            request_timeline is null
+            or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return;
+
+  await runPerIdBackfill(
+    candidates.map((c) => c.id),
+    async (id) => {
+      const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>`
+        select profile_export_jsonl_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        return 'skipped';
+      }
+      const timeline = computeRequestTimeline(row.profile_export_jsonl_gz);
+      await sql`
+        update agentic_trace_replay
+        set request_timeline = ${timeline === null ? null : jsonbParam(sql, timeline)}
+        where id = ${id}
+      `;
+      return 'ok';
+    },
+  );
+}
+
+runBackfillMain('backfill-request-timeline', sql, main);
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index cb222a86..d23a8f63 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -21,7 +21,6 @@
  *     original source sweep run, so public links point at the real benchmark run.
  */
 
-import { execSync } from 'child_process';
 import fs from 'fs';
 import os from 'os';
 import path from 'path';
@@ -29,6 +28,12 @@ import path from 'path';
 import { GPU_KEYS } from '@semianalysisai/inferencex-constants';
 
 import { hasNoSslFlag } from './cli-utils';
+import {
+  dedupeArtifactsByLogicalName,
+  downloadArtifact,
+  fetchRunAttempt,
+  listRunArtifacts,
+} from './lib/github-artifacts';
 import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils';
 import { isRunAttemptPurged } from './etl/run-overrides';
 import { createSkipTracker } from './etl/skip-tracker';
@@ -45,6 +50,9 @@ import {
   bulkUpsertAvailability,
   insertServerLog,
 } from './etl/benchmark-ingest';
+import { insertTraceReplay } from './etl/trace-replay-ingest';
+import { discoverTraceReplayArtifacts } from './etl/trace-artifact-discovery';
+import { datasetSlugFromBenchmarkRow } from './etl/dataset-provenance';
 import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
 import { ingestEvalRow } from './etl/eval-ingest';
 import { mapEvalSamples } from './etl/eval-samples-mapper';
@@ -95,48 +103,20 @@ if (isDownloadMode) {
   console.log(`  Repo:   ${REPO}`);
   console.log(`\n--- Downloading artifacts to ${artifactsDir} ---`);
 
-  const artifactListJson = execSync(
-    `gh api "repos/${REPO}/actions/runs/${runIdStr}/artifacts" --paginate --jq '.artifacts[]'`,
-    { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 },
-  );
-
-  const allArtifacts: { name: string; archive_download_url: string; created_at: string }[] = [];
-  for (const line of artifactListJson.trim().split('\n')) {
-    if (!line) continue;
-    try {
-      const parsed = JSON.parse(line);
-      allArtifacts.push(parsed);
-    } catch {}
-  }
-
-  const byName = new Map<string, (typeof allArtifacts)[0]>();
-  for (const a of allArtifacts) {
-    const existing = byName.get(a.name);
-    if (!existing || a.created_at > existing.created_at) {
-      byName.set(a.name, a);
-    }
-  }
+  // Retried configs produce artifacts on multiple runners — keep only the
+  // most recent per logical name (see RUNNER_SUFFIX_RE in github-artifacts)
+  // so a failed attempt's empty metrics can't overwrite the good one via
+  // ON CONFLICT DO UPDATE.
+  const byLogical = dedupeArtifactsByLogicalName(listRunArtifacts(REPO, runIdStr));
 
-  for (const [name, artifact] of byName) {
-    console.log(`  ${name}`);
-    const zipPath = path.join(artifactsDir, 'artifact.zip');
-    execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
-      stdio: ['pipe', 'pipe', 'inherit'],
-    });
-    const destDir = path.join(artifactsDir, name);
-    fs.mkdirSync(destDir, { recursive: true });
-    execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' });
-    fs.unlinkSync(zipPath);
+  for (const artifact of byLogical.values()) {
+    console.log(`  ${artifact.name}`);
+    downloadArtifact(artifact, artifactsDir);
   }
 
-  console.log(`\n  Downloaded ${byName.size} artifact(s)`);
+  console.log(`\n  Downloaded ${byLogical.size} artifact(s)`);
 
-  // Fetch run attempt from API
-  const attemptStr = execSync(
-    `gh api "repos/${REPO}/actions/runs/${runIdStr}" --jq '.run_attempt'`,
-    { encoding: 'utf8' },
-  ).trim();
-  runAttemptNum = parseInt(attemptStr || '1', 10);
+  runAttemptNum = fetchRunAttempt(REPO, runIdStr);
 } else {
   // CI mode — read from env vars
   for (const key of [
@@ -194,6 +174,14 @@ const ARTIFACT_NAMES = {
   changelog: 'changelog-metadata',
 } as const;
 
+/**
+ * Strip the `bmk_` and/or `agentic_` prefixes from an artifact directory name
+ * so the bare suffix becomes a shared key between `bmk_agentic_<suffix>` and
+ * its sibling `agentic_<suffix>` artifact.
+ */
+const stripBmkAndAgenticPrefix = (s: string): string =>
+  s.replace(/^bmk_/u, '').replace(/^agentic_/u, '');
+
 function readJson(filePath: string): unknown {
   try {
     return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -294,13 +282,14 @@ async function main(): Promise<void> {
 
   const availRows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[] = [];
 
   let totalNewBmk = 0,
@@ -311,6 +300,11 @@ async function main(): Promise<void> {
   let totalSamples = 0;
   let totalSampleFiles = 0;
   let totalChangelogs = 0;
+  let totalTraceReplayLinked = 0;
+  const datasetSlugs = new Set<string>();
+  // Dataset slugs referenced by this run's agentic rows but absent from the
+  // `datasets` table — timeline→dataset deep links 404 until they're ingested.
+  const missingDatasets = new Set<string>();
 
   // ── Check for evals-only flag in changelog ────────────────────────────
   const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -355,8 +349,13 @@ async function main(): Promise<void> {
     if (fs.existsSync(artifactsDir)) {
       for (const d of fs.readdirSync(artifactsDir)) {
         if (!d.startsWith('server_logs_')) continue;
-        const logPath = path.join(artifactsDir, d, 'server.log');
-        if (!fs.existsSync(logPath)) continue;
+        // feat-agentx-v1.0 harness nests the log under `results/server.log`;
+        // older runs keep it at the artifact root. Check both.
+        const logPath = [
+          path.join(artifactsDir, d, 'server.log'),
+          path.join(artifactsDir, d, 'results', 'server.log'),
+        ].find((p) => fs.existsSync(p));
+        if (!logPath) continue;
         const configKey = d.replace(/^server_logs_/u, '');
         serverLogPaths.set(configKey, logPath);
       }
@@ -365,6 +364,17 @@ async function main(): Promise<void> {
       console.log(`  Found ${serverLogPaths.size} server log artifact(s)`);
     }
 
+    // Sibling aiperf artifacts: each `bmk_agentic_<suffix>` is paired with an
+    // `agentic_<suffix>` dir holding `profile_export.jsonl` and
+    // `server_metrics_export.csv`. The harness emits these under either a
+    // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current).
+    // Older non-aiperf agentic runs don't ship this sibling. Key on the bare
+    // suffix so both names map to the same Map entry.
+    const traceReplayPaths = discoverTraceReplayArtifacts(artifactsDir);
+    if (traceReplayPaths.size > 0) {
+      console.log(`  Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
+    }
+
     const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
     console.log(`  Found ${allBmkFiles.length} benchmark JSON file(s)`);
 
@@ -376,6 +386,12 @@ async function main(): Promise<void> {
         ? data
         : [data as Record<string, any>];
 
+      for (const rawRow of rawRows) {
+        if (!rawRow || typeof rawRow !== 'object') continue;
+        const datasetSlug = datasetSlugFromBenchmarkRow(rawRow);
+        if (datasetSlug) datasetSlugs.add(datasetSlug);
+      }
+
       const rows = rawRows
         .filter((r) => typeof r === 'object' && r !== null)
         .map((r) => mapBenchmarkRow(r, tracker))
@@ -415,13 +431,21 @@ async function main(): Promise<void> {
               framework: r.config.framework,
               specMethod: r.config.specMethod,
               disagg: r.config.disagg,
+              benchmarkType: r.benchmarkType,
             });
           }
 
           const parentDir = path.basename(path.dirname(file));
           if (parentDir.startsWith('bmk_') && insertedIds.length > 0) {
+            // Single-turn artifacts are `bmk_<key>` paired with
+            // `server_logs_<key>`. Agentic artifacts are `bmk_agentic_<key>`
+            // but the server log is still `server_logs_<key>` (no `agentic_`
+            // prefix), so fall back to the fully-stripped suffix — otherwise
+            // agentic rows never get their server log (and KV-pool size) linked.
             const configKey = parentDir.replace(/^bmk_/u, '');
-            const logPath = serverLogPaths.get(configKey);
+            const logPath =
+              serverLogPaths.get(configKey) ??
+              serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir));
             if (logPath) {
               try {
                 const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', '');
@@ -431,12 +455,49 @@ async function main(): Promise<void> {
               }
             }
           }
+
+          // Trace-replay sibling lookup for agentic points only. The aiperf
+          // harness emits `agentic_<suffix>/trace_replay/...` next to the
+          // `bmk_agentic_<suffix>` artifact we just ingested.
+          if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
+            const suffix = stripBmkAndAgenticPrefix(parentDir);
+            const concMatch = path.basename(file).match(/_conc(?<conc>\d+)\.json$/u);
+            const trace =
+              (concMatch?.groups?.conc
+                ? traceReplayPaths.get(`${suffix}|${concMatch.groups.conc}`)
+                : undefined) ?? traceReplayPaths.get(suffix);
+            if (trace) {
+              try {
+                const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
+                const metrics = trace.serverMetricsCsv
+                  ? fs.readFileSync(trace.serverMetricsCsv)
+                  : null;
+                const metricsJson = trace.serverMetricsJson
+                  ? fs.readFileSync(trace.serverMetricsJson)
+                  : null;
+                await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, {
+                  framework: toInsert[0]?.config.framework,
+                  disagg: toInsert[0]?.config.disagg,
+                });
+                totalTraceReplayLinked += insertedIds.length;
+              } catch (error: any) {
+                tracker.recordDbError(`trace_replay for ${suffix}`, error);
+              }
+            } else {
+              tracker.skips.traceReplayMissing++;
+            }
+          }
         } catch (error: any) {
           tracker.recordDbError(path.basename(file), error);
         }
       }
     }
     console.log(`  Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
+    if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {
+      console.log(
+        `  Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`,
+      );
+    }
 
     if (availRows.length > 0) {
       try {
@@ -446,6 +507,30 @@ async function main(): Promise<void> {
         tracker.recordDbError('availability', error);
       }
     }
+
+    if (datasetSlugs.size > 1) {
+      throw new Error(
+        `Conflicting dataset provenance in workflow run ${runId}: ${[...datasetSlugs].toSorted().join(', ')}`,
+      );
+    }
+    const [datasetSlug] = datasetSlugs;
+    if (datasetSlug) {
+      await sql`
+        insert into run_datasets (workflow_run_id, dataset_slug)
+        values (${workflowRunId}, ${datasetSlug})
+        on conflict (workflow_run_id) do update
+        set dataset_slug = excluded.dataset_slug
+      `;
+      console.log(`  Dataset: linked workflow run to ${datasetSlug}`);
+      const [known] = await sql`select 1 as ok from datasets where slug = ${datasetSlug}`;
+      if (!known) {
+        missingDatasets.add(datasetSlug);
+        console.warn(
+          `  ⚠ Dataset ${datasetSlug} is not in the datasets table — request-timeline deep links ` +
+            `will 404 until it is ingested (packages/db/src/ingest-weka-dataset.ts)`,
+        );
+      }
+    }
   }
 
   // ── Ingest run stats ──────────────────────────────────────────────────
@@ -654,11 +739,17 @@ async function main(): Promise<void> {
 
   const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker;
   const totalSkips =
-    skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError;
+    skips.badZip +
+    skips.unmappedModel +
+    skips.unmappedHw +
+    skips.noIslOsl +
+    skips.failedRun +
+    skips.dbError;
   if (totalSkips > 0) {
     console.log(`\n  Skipped: ${totalSkips} rows`);
     const skipLines: [string, number][] = [
       ['no isl/osl (old format)', skips.noIslOsl],
+      ['failed run (0 successful)', skips.failedRun],
       ['unmapped model', skips.unmappedModel],
       ['unmapped hw', skips.unmappedHw],
       ['bad/empty zip', skips.badZip],
@@ -690,7 +781,10 @@ async function main(): Promise<void> {
   const unmappedOutPath = process.env.UNMAPPED_ENTITIES_OUTPUT;
   if (
     unmappedOutPath &&
-    (unmappedModels.size > 0 || unmappedHws.size > 0 || unmappedPrecisions.size > 0)
+    (unmappedModels.size > 0 ||
+      unmappedHws.size > 0 ||
+      unmappedPrecisions.size > 0 ||
+      missingDatasets.size > 0)
   ) {
     fs.writeFileSync(
       unmappedOutPath,
@@ -698,6 +792,7 @@ async function main(): Promise<void> {
         models: [...unmappedModels],
         hardware: [...unmappedHws],
         precisions: [...unmappedPrecisions],
+        datasets: [...missingDatasets],
       }),
     );
   }
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index b9f2b3b5..faa093e3 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -457,6 +457,9 @@ async function mapWorkflowDir(
       unmappedModel: local.skips.unmappedModel,
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
+      failedRun: local.skips.failedRun,
+      // GCS backup doesn't ingest aiperf trace files; counter stays 0.
+      traceReplayMissing: local.skips.traceReplayMissing,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),
@@ -621,13 +624,14 @@ async function main(): Promise<void> {
     // Upsert availability rows only for successfully resolved configs
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const r of allInserted) {
       availRows.push({
@@ -639,6 +643,7 @@ async function main(): Promise<void> {
         framework: r.config.framework,
         specMethod: r.config.specMethod,
         disagg: r.config.disagg,
+        benchmarkType: r.benchmarkType,
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts
index a3b62fe0..f868767e 100644
--- a/packages/db/src/ingest-supplemental.ts
+++ b/packages/db/src/ingest-supplemental.ts
@@ -219,8 +219,10 @@ async function ingestSupplementalBmk(
 
     const rows: {
       configId: number;
-      isl: number;
-      osl: number;
+      benchmarkType: 'single_turn' | 'agentic_traces';
+      offloadMode: string;
+      isl: number | null;
+      osl: number | null;
       conc: number;
       image: string | null;
       metrics: Record<string, number>;
@@ -271,6 +273,8 @@ async function ingestSupplementalBmk(
 
       rows.push({
         configId,
+        benchmarkType: 'single_turn',
+        offloadMode: 'off',
         isl: entry.isl,
         osl: entry.osl,
         conc: entry.conc,
@@ -294,13 +298,14 @@ async function ingestSupplementalBmk(
     // to `rows` are exactly the valid ones.
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const entry of entries) {
       const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined });
@@ -317,6 +322,7 @@ async function ingestSupplementalBmk(
         framework,
         specMethod,
         disagg,
+        benchmarkType: 'single_turn',
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
new file mode 100644
index 00000000..ed6774c0
--- /dev/null
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -0,0 +1,416 @@
+/**
+ * Ingest a HuggingFace cc-traces-weka dataset into the `datasets` +
+ * `dataset_conversations` tables that back the /datasets area.
+ *
+ * Public dataset, no token needed — fetched via the HF datasets-server rows API
+ * (rows are large, ~3.5 MB each, so we page in small chunks with adaptive
+ * backoff). Per conversation we build a flamegraph-ready `structure` (turns +
+ * subagent groups, input split into cached-prefix vs uncached) and accumulate
+ * dataset-level distributions for the detail cards. Raw hash_ids are discarded
+ * after the cached/uncached split is computed.
+ *
+ * Usage (DATABASE_WRITE_URL must be provided — never hardcoded):
+ *   DATABASE_WRITE_URL='postgres://…' pnpm exec tsx src/ingest-weka-dataset.ts \
+ *     semianalysisai/cc-traces-weka-062126 [--label "…"] [--variant full|256k] \
+ *     [--description "…"] [--limit N]
+ *
+ * Upsert: re-running replaces the dataset's rows (delete + re-insert).
+ * Remember to purge the API cache afterwards (POST /api/v1/invalidate).
+ */
+
+import { createAdminSql } from './etl/db-utils';
+import { hasNoSslFlag } from './cli-utils';
+import {
+  buildConversationStructure,
+  countConversationRequests,
+  linearHistogram,
+  logHistogram,
+  logHistogramWithZero,
+  subagentRequestTurns,
+  summarizeValues,
+  type ConversationStructure,
+  type RawWekaConversation,
+  type TurnNode,
+} from './etl/weka-structure';
+
+const ROWS_API = 'https://datasets-server.huggingface.co/rows';
+const INFO_API = 'https://datasets-server.huggingface.co/info';
+
+interface CliArgs {
+  dataset: string;
+  label?: string;
+  variant?: string;
+  description?: string;
+  limit?: number;
+}
+
+function parseArgs(): CliArgs {
+  const argv = process.argv.slice(2);
+  const positional = argv.filter((a) => !a.startsWith('--'));
+  const dataset = positional[0];
+  if (!dataset) {
+    console.error(
+      'Usage: tsx src/ingest-weka-dataset.ts <hf-dataset-id> [--label …] [--variant full|256k] [--description …] [--limit N]',
+    );
+    process.exit(1);
+  }
+  const getFlag = (name: string): string | undefined => {
+    const i = argv.indexOf(`--${name}`);
+    return i !== -1 && i + 1 < argv.length ? argv[i + 1] : undefined;
+  };
+  const limitRaw = getFlag('limit');
+  return {
+    dataset,
+    label: getFlag('label'),
+    variant: getFlag('variant'),
+    description: getFlag('description'),
+    limit: limitRaw ? Number(limitRaw) : undefined,
+  };
+}
+
+const sleep = (ms: number) =>
+  new Promise<void>((resolve) => {
+    setTimeout(resolve, ms);
+  });
+
+/**
+ * Fetch JSON, transparently retrying on HF rate-limiting (429) and transient
+ * 5xx with exponential backoff. Honors a Retry-After header when present.
+ */
+async function fetchJson(url: string, attempt = 0): Promise<unknown> {
+  const res = await fetch(url);
+  if (res.status === 429 || res.status >= 500) {
+    if (attempt >= 6) {
+      throw new Error(`${res.status} ${res.statusText} after ${attempt} retries for ${url}`);
+    }
+    const retryAfter = Number(res.headers.get('retry-after'));
+    const waitMs =
+      Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1000 : 2000 * 2 ** attempt;
+    console.warn(
+      `  ${res.status} ${res.statusText}; waiting ${Math.round(waitMs / 1000)}s (attempt ${attempt + 1})`,
+    );
+    await sleep(waitMs);
+    return fetchJson(url, attempt + 1);
+  }
+  if (!res.ok) {
+    throw new Error(`${res.status} ${res.statusText} for ${url}`);
+  }
+  return res.json();
+}
+
+async function getRowCount(dataset: string): Promise<number> {
+  const info = (await fetchJson(`${INFO_API}?dataset=${encodeURIComponent(dataset)}`)) as {
+    dataset_info?: Record<string, { splits?: Record<string, { num_examples?: number }> }>;
+  };
+  const cfg = info.dataset_info?.['default'];
+  const num = cfg?.splits?.['train']?.num_examples;
+  return typeof num === 'number' ? num : 0;
+}
+
+/** Page through rows with adaptive length (halve on "too big"/error). */
+async function* iterRows(
+  dataset: string,
+  total: number,
+  limit?: number,
+): AsyncGenerator<RawWekaConversation> {
+  const cap = limit ? Math.min(limit, total) : total;
+  let offset = 0;
+  let length = 5; // ~18 MB/page at ~3.5 MB/row; backs off on failure
+  while (offset < cap) {
+    const want = Math.min(length, cap - offset);
+    const url = `${ROWS_API}?dataset=${encodeURIComponent(dataset)}&config=default&split=train&offset=${offset}&length=${want}`;
+    let payload: { rows?: { row: RawWekaConversation }[] };
+    try {
+      payload = (await fetchJson(url)) as { rows?: { row: RawWekaConversation }[] };
+    } catch (error) {
+      if (want > 1) {
+        length = Math.max(1, Math.floor(want / 2));
+        console.warn(
+          `  page @${offset} (len ${want}) failed (${String(error)}); retrying with len ${length}`,
+        );
+        continue;
+      }
+      throw error;
+    }
+    const rows = payload.rows ?? [];
+    if (rows.length === 0) break;
+    for (const r of rows) yield r.row;
+    offset += rows.length;
+    process.stdout.write(`\r  fetched ${Math.min(offset, cap)}/${cap} conversations`);
+    if (offset < cap) await sleep(400); // be polite to the HF datasets-server
+  }
+  process.stdout.write('\n');
+}
+
+interface Accumulator {
+  inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children)
+  uncachedInputPerTurn: number[];
+  outputPerTurn: number[];
+  cachedFractionPerTurn: number[]; // cached/in, for turns with in>0
+  turnsPerConv: number[]; // main (top-level) turns
+  requestsPerConv: number[]; // main turns + subagent child turns
+  subagentInputPerRequest: number[];
+  subagentOutputPerRequest: number[];
+  subagentGroupsPerConv: number[];
+  subagentTurnsPerGroup: number[];
+  totalIn: number;
+  totalOut: number;
+  totalCached: number;
+  mainTurns: number;
+  subagentGroups: number;
+  subagentTurns: number;
+  modelCounts: Record<string, number>;
+}
+
+function newAccumulator(): Accumulator {
+  return {
+    inputPerTurn: [],
+    uncachedInputPerTurn: [],
+    outputPerTurn: [],
+    cachedFractionPerTurn: [],
+    turnsPerConv: [],
+    requestsPerConv: [],
+    subagentInputPerRequest: [],
+    subagentOutputPerRequest: [],
+    subagentGroupsPerConv: [],
+    subagentTurnsPerGroup: [],
+    totalIn: 0,
+    totalOut: 0,
+    totalCached: 0,
+    mainTurns: 0,
+    subagentGroups: 0,
+    subagentTurns: 0,
+    modelCounts: {},
+  };
+}
+
+function recordTurn(acc: Accumulator, t: TurnNode): void {
+  acc.inputPerTurn.push(t.in);
+  acc.uncachedInputPerTurn.push(t.uncached);
+  acc.outputPerTurn.push(t.out);
+  if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in);
+  if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 0) + 1;
+}
+
+function accumulate(acc: Accumulator, s: ConversationStructure): void {
+  acc.totalIn += s.totals.in;
+  acc.totalOut += s.totals.out;
+  acc.totalCached += s.totals.cached;
+  acc.mainTurns += s.totals.numTurns;
+  acc.subagentGroups += s.totals.numSubagentGroups;
+  acc.turnsPerConv.push(s.totals.numTurns);
+  acc.requestsPerConv.push(countConversationRequests(s));
+  for (const turn of subagentRequestTurns(s)) {
+    acc.subagentInputPerRequest.push(turn.in);
+    acc.subagentOutputPerRequest.push(turn.out);
+  }
+  acc.subagentGroupsPerConv.push(s.totals.numSubagentGroups);
+  for (const node of s.nodes) {
+    if (node.kind === 'turn') {
+      recordTurn(acc, node);
+    } else {
+      acc.subagentTurnsPerGroup.push(node.children.length);
+      acc.subagentTurns += node.children.length;
+      for (const child of node.children) recordTurn(acc, child);
+    }
+  }
+}
+
+function buildChartData(acc: Accumulator) {
+  return {
+    version: 3,
+    inputTokensPerTurn: {
+      bins: logHistogram(acc.inputPerTurn),
+      stats: summarizeValues(acc.inputPerTurn),
+    },
+    uncachedInputTokensPerTurn: {
+      bins: logHistogramWithZero(acc.uncachedInputPerTurn),
+      stats: summarizeValues(acc.uncachedInputPerTurn),
+    },
+    outputTokensPerTurn: {
+      bins: logHistogram(acc.outputPerTurn),
+      stats: summarizeValues(acc.outputPerTurn),
+    },
+    subagentInputTokensPerRequest: {
+      bins: logHistogram(acc.subagentInputPerRequest),
+      stats: summarizeValues(acc.subagentInputPerRequest),
+    },
+    subagentOutputTokensPerRequest: {
+      bins: logHistogram(acc.subagentOutputPerRequest),
+      stats: summarizeValues(acc.subagentOutputPerRequest),
+    },
+    turnsPerConversation: {
+      bins: linearHistogram(acc.turnsPerConv),
+      stats: summarizeValues(acc.turnsPerConv),
+    },
+    subagentGroupsPerConversation: {
+      bins: linearHistogram(acc.subagentGroupsPerConv),
+      stats: summarizeValues(acc.subagentGroupsPerConv),
+    },
+    cachedFractionPerTurn: {
+      bins: linearHistogram(acc.cachedFractionPerTurn, 20),
+      stats: summarizeValues(acc.cachedFractionPerTurn),
+    },
+  };
+}
+
+function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | null) {
+  const cachedPct = acc.totalIn > 0 ? acc.totalCached / acc.totalIn : 0;
+  const requestsPerConversation = summarizeValues(acc.requestsPerConv);
+  const subagentsPerTrace = summarizeValues(acc.subagentGroupsPerConv);
+  return {
+    version: 3,
+    blockSize,
+    hashIdScope,
+    totalIn: acc.totalIn,
+    totalOut: acc.totalOut,
+    totalCached: acc.totalCached,
+    cachedPct,
+    mainTurns: acc.mainTurns,
+    subagentGroups: acc.subagentGroups,
+    subagentTurns: acc.subagentTurns,
+    meanRequestsPerConversation: requestsPerConversation.mean,
+    medianRequestsPerConversation: requestsPerConversation.median,
+    meanSubagentsPerTrace: subagentsPerTrace.mean,
+    medianSubagentsPerTrace: subagentsPerTrace.median,
+    modelMix: acc.modelCounts,
+  };
+}
+
+function slugFromDataset(dataset: string): string {
+  return dataset.includes('/') ? dataset.slice(dataset.indexOf('/') + 1) : dataset;
+}
+
+function inferVariant(slug: string): string {
+  if (slug.endsWith('-256k')) return '256k';
+  if (slug.includes('no-subagent')) return 'no-subagents';
+  return 'full';
+}
+
+function defaultLabel(slug: string): string {
+  // cc-traces-weka-062126 → "CC Traces Weka 062126"
+  return slug
+    .split('-')
+    .map((p) => (/^\d+$/u.test(p) ? p : p.toUpperCase()))
+    .join(' ')
+    .replace(/^CC TRACES WEKA/u, 'CC Traces Weka');
+}
+
+async function main(): Promise<void> {
+  const args = parseArgs();
+  const slug = slugFromDataset(args.dataset);
+  const variant = args.variant ?? inferVariant(slug);
+  const label = args.label ?? defaultLabel(slug);
+  const hfUrl = `https://huggingface.co/datasets/${args.dataset}`;
+
+  console.log(`=== ingest-weka-dataset: ${args.dataset} ===`);
+  console.log(`  slug=${slug} variant=${variant} label="${label}"`);
+
+  const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1 });
+
+  const total = await getRowCount(args.dataset);
+  console.log(`  ${total} conversations on HF`);
+
+  const acc = newAccumulator();
+  let blockSize = 64;
+  let hashIdScope: string | null = null;
+
+  // Buffer the per-conversation rows; flush in batches to keep memory bounded.
+  interface ConvRow {
+    dataset_id: string;
+    conv_id: string;
+    models: string[];
+    num_turns: number;
+    num_subagent_groups: number;
+    total_in: number;
+    total_out: number;
+    total_cached: number;
+    structure: ConversationStructure;
+  }
+  const pending: ConvRow[] = [];
+
+  try {
+    // Upsert the dataset shell first (FK target). Counts/summary filled at the end.
+    await sql`
+      insert into datasets (id, slug, label, variant, description, hf_url, license)
+      values (${args.dataset}, ${slug}, ${label}, ${variant}, ${args.description ?? null}, ${hfUrl}, 'apache-2.0')
+      on conflict (id) do update set
+        slug = excluded.slug, label = excluded.label, variant = excluded.variant,
+        description = coalesce(excluded.description, datasets.description),
+        hf_url = excluded.hf_url, license = excluded.license, ingested_at = now()
+    `;
+    // Clear prior conversations for a clean re-ingest.
+    await sql`delete from dataset_conversations where dataset_id = ${args.dataset}`;
+
+    const flush = async () => {
+      if (pending.length === 0) return;
+      // postgres.js row-helper insert: serializes `structure` to jsonb and
+      // `models` to text[] per row (unnest can't carry a text[] column — a 2D
+      // array would flatten into scalar rows).
+      const rows = pending.map((p) => ({
+        dataset_id: args.dataset,
+        conv_id: p.conv_id,
+        models: p.models,
+        num_turns: p.num_turns,
+        num_subagent_groups: p.num_subagent_groups,
+        total_in: p.total_in,
+        total_out: p.total_out,
+        total_cached: p.total_cached,
+        structure: sql.json(p.structure as unknown as Parameters<typeof sql.json>[0]),
+      }));
+      await sql`insert into dataset_conversations ${sql(rows)}`;
+      pending.length = 0;
+    };
+
+    let count = 0;
+    for await (const conv of iterRows(args.dataset, total, args.limit)) {
+      blockSize = conv.block_size ?? blockSize;
+      hashIdScope = conv.hash_id_scope ?? hashIdScope;
+      const structure = buildConversationStructure(conv);
+      accumulate(acc, structure);
+      pending.push({
+        dataset_id: args.dataset,
+        conv_id: conv.id,
+        models: Array.isArray(conv.models) ? conv.models : [],
+        num_turns: structure.totals.numTurns,
+        num_subagent_groups: structure.totals.numSubagentGroups,
+        total_in: structure.totals.in,
+        total_out: structure.totals.out,
+        total_cached: structure.totals.cached,
+        structure,
+      });
+      count += 1;
+      if (pending.length >= 25) await flush();
+    }
+    await flush();
+
+    const summary = buildSummary(acc, blockSize, hashIdScope);
+    const chartData = buildChartData(acc);
+    await sql`
+      update datasets set
+        conversation_count = ${count},
+        summary = ${sql.json(summary as unknown as Parameters<typeof sql.json>[0])},
+        chart_data = ${sql.json(chartData as unknown as Parameters<typeof sql.json>[0])},
+        ingested_at = now()
+      where id = ${args.dataset}
+    `;
+
+    console.log(`\n  ingested ${count} conversations`);
+    console.log(
+      `  main turns=${acc.mainTurns} subagent groups=${acc.subagentGroups} subagent turns=${acc.subagentTurns}`,
+    );
+    console.log(
+      `  totals: in=${acc.totalIn.toLocaleString()} out=${acc.totalOut.toLocaleString()} ` +
+        `cached=${acc.totalCached.toLocaleString()} (${(summary.cachedPct * 100).toFixed(1)}% of input)`,
+    );
+    console.log('\n=== done ===');
+    console.log('  Purge the API cache: POST /api/v1/invalidate');
+  } finally {
+    await sql.end({ timeout: 5 });
+  }
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
diff --git a/packages/db/src/json-provider.line-single-run.test.ts b/packages/db/src/json-provider.line-single-run.test.ts
index 643b8896..b75fa26a 100644
--- a/packages/db/src/json-provider.line-single-run.test.ts
+++ b/packages/db/src/json-provider.line-single-run.test.ts
@@ -7,8 +7,9 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 import type { getLatestBenchmarks as GetLatestBenchmarks } from './json-provider.js';
 
 /**
- * A chart line is one config + sequence (config_id, benchmark_type, isl, osl) plotted across
- * concurrencies, and it must come from a SINGLE workflow run. getLatestBenchmarks picks the
+ * A chart line is one config + sequence + offload mode
+ * (config_id, benchmark_type, isl, osl, offload_mode) plotted across concurrencies, and it must
+ * come from a SINGLE workflow run. getLatestBenchmarks picks the
  * newest run per line (date, then run_started_at, then workflow_run_id) and returns EVERY
  * concurrency that one run measured — never stitching skipped concurrencies from an older run.
  *
@@ -62,6 +63,7 @@ const result = (
   tpot: number,
   isl = 1024,
   osl = 1024,
+  offloadMode = 'off',
 ) => ({
   id: nextResultId++,
   workflow_run_id: runDbId,
@@ -71,6 +73,7 @@ const result = (
   isl,
   osl,
   conc,
+  offload_mode: offloadMode,
   image: null,
   metrics: { median_tpot: tpot },
   error: null,
@@ -105,6 +108,10 @@ beforeAll(async () => {
       // config 1, seq (8192,1024): only run A measured it (run B skipped this sequence).
       result(10, 1, OLD, 1, 0.2, 8192, 1024),
       result(10, 1, OLD, 8, 0.3, 8192, 1024),
+      // Offload mode is an independent line dimension. A newer off-mode run must not hide
+      // the older on-mode line for the same config and sequence.
+      result(10, 1, OLD, 4, 0.25, 4096, 4096, 'on'),
+      result(11, 1, NEW, 4, 0.2, 4096, 4096, 'off'),
       // config 2, seq (1024,1024): two same-day runs with identical run_started_at.
       result(20, 2, NEW, 1, 0.5),
       result(20, 2, NEW, 8, 0.6),
@@ -157,6 +164,21 @@ describe('getLatestBenchmarks — one run per line', () => {
     ]);
   });
 
+  it('selects winning runs independently for each offload mode', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false).filter(
+      (r) => r.isl === 4096 && r.osl === 4096,
+    );
+
+    expect(
+      rows
+        .map((r) => ({ offloadMode: r.offload_mode, runUrl: r.run_url }))
+        .toSorted((a, b) => a.offloadMode.localeCompare(b.offloadMode)),
+    ).toEqual([
+      { offloadMode: 'off', runUrl: 'https://github.com/x/runs/101/attempts/1' },
+      { offloadMode: 'on', runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+
   it('breaks a same-day, same-timestamp tie by workflow_run_id (higher id wins the whole line)', () => {
     const rows = getLatestBenchmarks('testm', NEW, false);
     // config 2: run E (200, id 20) and run F (201, id 21) share run_started_at; F wins by id.
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index dfb03e98..b502b243 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -72,6 +72,8 @@ interface RawBenchmarkResult {
   isl: number;
   osl: number;
   conc: number;
+  /** Added by the AgentX schema; older dumps omit it and are treated as off. */
+  offload_mode?: string;
   image: string | null;
   metrics: Record<string, number>;
   /** Added in migration 006; older dumps omit this field — surfaced as undefined. */
@@ -281,6 +283,7 @@ function toBenchmarkRow(
   metrics?: Record<string, number>,
 ): BenchmarkRow {
   return {
+    id: br.id,
     hardware: c.hardware,
     framework: c.framework,
     model: c.model,
@@ -298,6 +301,8 @@ function toBenchmarkRow(
     decode_num_workers: c.decode_num_workers,
     num_prefill_gpu: c.num_prefill_gpu,
     num_decode_gpu: c.num_decode_gpu,
+    benchmark_type: br.benchmark_type ?? 'single_turn',
+    offload_mode: (br as { offload_mode?: string }).offload_mode ?? 'off',
     isl: br.isl,
     osl: br.osl,
     conc: br.conc,
@@ -351,9 +356,9 @@ export function compareBenchmarkRecency(
   return bStarted.localeCompare(aStarted);
 }
 
-/** Chart-line identity: one config + sequence. All concurrencies of a line come from one run. */
+/** Chart-line identity: one config + sequence + offload mode. All concurrencies of a line come from one run. */
 const lineKey = (br: RawBenchmarkResult): string =>
-  `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}`;
+  `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`;
 
 export function getLatestBenchmarks(
   modelKey: string | string[],
@@ -390,7 +395,7 @@ export function getLatestBenchmarks(
     return true;
   });
 
-  // Single run per LINE (config_id, benchmark_type, isl, osl): pick the newest run that
+  // Single run per LINE (config_id, benchmark_type, isl, osl, offload_mode): pick the newest run that
   // produced data for the line, then keep EVERY concurrency that one run measured. Sort by
   // recency (date, then run_started_at) with a final workflow_run_id DESC tiebreak so exactly
   // one run wins even when run_started_at is equal/null — matching the SQL ORDER BY.
@@ -499,7 +504,11 @@ export function getAvailabilityData(): AvailabilityRow[] {
   for (const a of s.availability) {
     const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`;
     if (validKeys.has(key)) {
-      rows.push({ ...a, date: toDateString(a.date) });
+      rows.push({
+        ...a,
+        benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn',
+        date: toDateString(a.date),
+      });
     }
   }
 
diff --git a/packages/db/src/lib/backfill-runner.test.ts b/packages/db/src/lib/backfill-runner.test.ts
new file mode 100644
index 00000000..6da9071f
--- /dev/null
+++ b/packages/db/src/lib/backfill-runner.test.ts
@@ -0,0 +1,55 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { parseLimitForceFlags, runPerIdBackfill } from './backfill-runner.js';
+
+describe('parseLimitForceFlags', () => {
+  const originalArgv = process.argv;
+  afterEach(() => {
+    process.argv = originalArgv;
+  });
+
+  it('defaults to no limit and force off', () => {
+    process.argv = ['node', 'script.ts'];
+    expect(parseLimitForceFlags()).toEqual({ limit: null, force: false });
+  });
+
+  it('parses --limit N and --force', () => {
+    process.argv = ['node', 'script.ts', '--limit', '25', '--force', '--yes'];
+    expect(parseLimitForceFlags()).toEqual({ limit: 25, force: true });
+  });
+});
+
+describe('runPerIdBackfill', () => {
+  beforeEach(() => {
+    vi.spyOn(console, 'log').mockImplementation(() => {});
+    vi.spyOn(console, 'error').mockImplementation(() => {});
+  });
+  afterEach(() => {
+    vi.restoreAllMocks();
+    process.exitCode = undefined;
+  });
+
+  it('processes ids serially and leaves exitCode unset on success', async () => {
+    const seen: number[] = [];
+    await runPerIdBackfill([1, 2, 3], (id) => {
+      seen.push(id);
+      return Promise.resolve(id === 2 ? 'skipped' : 'ok');
+    });
+    expect(seen).toEqual([1, 2, 3]);
+    expect(process.exitCode).toBeUndefined();
+    // Two ✓ lines (skipped rows do not log) plus the summary line.
+    const logged = vi.mocked(console.log).mock.calls.map((c) => String(c[0]));
+    expect(logged.filter((l) => l.includes('✓')).length).toBe(2);
+    expect(logged.at(-1)).toContain('=== backfill complete: 2 ok, 0 failed');
+  });
+
+  it('counts throws as failures and sets exitCode = 1', async () => {
+    await runPerIdBackfill([1, 2], (id) =>
+      id === 1 ? Promise.reject(new Error('boom')) : Promise.resolve('ok'),
+    );
+    expect(process.exitCode).toBe(1);
+    const logged = vi.mocked(console.log).mock.calls.map((c) => String(c[0]));
+    expect(logged.at(-1)).toContain('=== backfill complete: 1 ok, 1 failed');
+    expect(vi.mocked(console.error).mock.calls[0]?.[0]).toContain('✗ id=1: boom');
+  });
+});
diff --git a/packages/db/src/lib/backfill-runner.ts b/packages/db/src/lib/backfill-runner.ts
new file mode 100644
index 00000000..de00bee1
--- /dev/null
+++ b/packages/db/src/lib/backfill-runner.ts
@@ -0,0 +1,98 @@
+/**
+ * Shared scaffolding for the one-shot `backfill-*.ts` CLI scripts (invoked
+ * via the `db:backfill-*` package scripts). Each script keeps only its
+ * candidate query and per-row recompute; flag parsing, the `--yes`
+ * confirmation gate, per-row progress logging, and the exit-code summary
+ * live here so every backfill behaves identically on the command line.
+ */
+
+import { confirm, hasYesFlag } from '../cli-utils.js';
+import type { Sql } from '../etl/db-utils.js';
+
+export interface LimitForceFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+/** Parse the standard `--limit N` / `--force` backfill flags from argv. */
+export function parseLimitForceFlags(): LimitForceFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      const next = process.argv[++i];
+      if (!next || Number.isNaN(Number(next))) {
+        console.error('--limit requires a numeric argument');
+        process.exit(1);
+      }
+      limit = Number(next);
+    }
+  }
+  return { limit, force };
+}
+
+/**
+ * Print the candidate-count line, then gate on `--yes` or an interactive
+ * y/N prompt. Returns false (after logging "Aborted.") when declined.
+ */
+export async function confirmProceed(candidatesLabel: string): Promise<boolean> {
+  console.log(`\n  ${candidatesLabel}`);
+  if (hasYesFlag()) return true;
+  const ok = await confirm('\nProceed? (y/N) ');
+  if (!ok) console.log('Aborted.');
+  return ok;
+}
+
+/**
+ * Iterate candidate row ids one at a time (the recomputed blobs can be
+ * hundreds of MB decompressed — serial processing keeps memory bounded),
+ * logging per-row progress and a final summary. `processRow` returns 'ok'
+ * (counts toward the ✓ log) or 'skipped' (e.g. row vanished — the callback
+ * logs its own warning); throwing marks the row failed. Sets
+ * `process.exitCode = 1` when any row failed.
+ */
+export async function runPerIdBackfill(
+  ids: readonly number[],
+  processRow: (id: number) => Promise<'ok' | 'skipped'>,
+): Promise<void> {
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const id of ids) {
+    const start = Date.now();
+    try {
+      if ((await processRow(id)) === 'skipped') continue;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(`  ✓ id=${id} (${elapsed}s, ${ok}/${ids.length} done, ${elapsedTotal}s total)`);
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+/**
+ * jsonb parameter for a freshly computed value. `structuredClone` strips
+ * class instances/prototypes so postgres.js serializes plain data only —
+ * matches what the inline ingest path stores.
+ */
+export function jsonbParam(sql: Sql, value: unknown): ReturnType<Sql['json']> {
+  return sql.json(structuredClone(value) as unknown as Parameters<typeof sql.json>[0]);
+}
+
+/** Standard `main().catch(…).finally(sql.end())` trailer for backfill CLIs. */
+export function runBackfillMain(name: string, sql: Sql, main: () => Promise<void>): void {
+  main()
+    .catch((error) => {
+      console.error(`${name} failed:`, error);
+      process.exitCode = 1;
+    })
+    .finally(() => sql.end());
+}
diff --git a/packages/db/src/lib/github-artifacts.test.ts b/packages/db/src/lib/github-artifacts.test.ts
new file mode 100644
index 00000000..571643a5
--- /dev/null
+++ b/packages/db/src/lib/github-artifacts.test.ts
@@ -0,0 +1,42 @@
+import { describe, expect, it } from 'vitest';
+
+import { RUNNER_SUFFIX_RE, dedupeArtifactsByLogicalName } from './github-artifacts.js';
+
+const art = (name: string, created_at: string) => ({
+  name,
+  archive_download_url: `https://api.github.com/${name}`,
+  created_at,
+});
+
+describe('RUNNER_SUFFIX_RE', () => {
+  it('strips the trailing runner-pool + attempt token', () => {
+    expect('bmk_dsr1_conc4_h200-cw_00'.replace(RUNNER_SUFFIX_RE, '')).toBe('bmk_dsr1_conc4');
+    expect('bmk_dsr1_conc4_h200-dgxc-slurm_1'.replace(RUNNER_SUFFIX_RE, '')).toBe('bmk_dsr1_conc4');
+  });
+
+  it('does not over-match across earlier underscore separators', () => {
+    // The (conc, offload) variant tokens must survive — only the final
+    // `_<pool>_<digits>` pair is stripped.
+    expect('bmk_agentic_glm5_offload_on_b200-nb_2'.replace(RUNNER_SUFFIX_RE, '')).toBe(
+      'bmk_agentic_glm5_offload_on',
+    );
+    expect('server_logs_glm5'.replace(RUNNER_SUFFIX_RE, '')).toBe('server_logs_glm5');
+  });
+});
+
+describe('dedupeArtifactsByLogicalName', () => {
+  it('keeps only the most recent artifact per logical name', () => {
+    const deduped = dedupeArtifactsByLogicalName([
+      art('bmk_dsr1_conc4_h200-cw_00', '2026-06-01T00:00:00Z'),
+      art('bmk_dsr1_conc4_h200-dgxc-slurm_1', '2026-06-02T00:00:00Z'),
+      art('bmk_dsr1_conc8_h200-cw_00', '2026-06-01T00:00:00Z'),
+    ]);
+    expect([...deduped.keys()].toSorted()).toEqual(['bmk_dsr1_conc4', 'bmk_dsr1_conc8']);
+    expect(deduped.get('bmk_dsr1_conc4')?.name).toBe('bmk_dsr1_conc4_h200-dgxc-slurm_1');
+  });
+
+  it('passes through names without a runner suffix unchanged', () => {
+    const deduped = dedupeArtifactsByLogicalName([art('run-stats', '2026-06-01T00:00:00Z')]);
+    expect(deduped.get('run-stats')?.name).toBe('run-stats');
+  });
+});
diff --git a/packages/db/src/lib/github-artifacts.ts b/packages/db/src/lib/github-artifacts.ts
new file mode 100644
index 00000000..291740cf
--- /dev/null
+++ b/packages/db/src/lib/github-artifacts.ts
@@ -0,0 +1,86 @@
+/**
+ * GitHub Actions artifact helpers shared by `ingest-ci-run.ts` (download
+ * mode) and `backfill-agentic-server-logs.ts`. All calls shell out to the
+ * `gh` CLI, which picks up GITHUB_TOKEN from the environment.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+
+export interface ArtifactMeta {
+  name: string;
+  archive_download_url: string;
+  created_at: string;
+}
+
+/**
+ * Strips the trailing `_<runner-pool>_<attempt-digits>` token from an
+ * artifact name so retries on different runners collapse to one logical
+ * artifact. Without this, two artifacts produced for the same logical
+ * config (e.g. `…_h200-cw_00` and `…_h200-dgxc-slurm_1`) both land in the
+ * DB and the failed one's empty metrics can overwrite the good one via
+ * ON CONFLICT DO UPDATE.
+ *
+ * The runner pool name itself has no underscores (`h200-cw`,
+ * `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip
+ * bounded — using `\w` here would over-match across earlier `_` separators
+ * and collapse different (conc, offload) variants into the same logical
+ * name.
+ */
+export const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/u;
+
+/** List a workflow run's artifacts via `gh api` (paginated). Malformed lines are skipped. */
+export function listRunArtifacts(repo: string, runId: string): ArtifactMeta[] {
+  const json = execSync(
+    `gh api "repos/${repo}/actions/runs/${runId}/artifacts" --paginate --jq '.artifacts[]'`,
+    { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 },
+  );
+  const out: ArtifactMeta[] = [];
+  for (const line of json.trim().split('\n')) {
+    if (!line) continue;
+    try {
+      out.push(JSON.parse(line) as ArtifactMeta);
+    } catch {
+      // skip malformed line
+    }
+  }
+  return out;
+}
+
+/**
+ * Group artifacts by their runner-suffix-stripped logical name, keeping only
+ * the most recent (`created_at`) per group.
+ */
+export function dedupeArtifactsByLogicalName(
+  artifacts: readonly ArtifactMeta[],
+): Map<string, ArtifactMeta> {
+  const byLogical = new Map<string, ArtifactMeta>();
+  for (const a of artifacts) {
+    const key = a.name.replace(RUNNER_SUFFIX_RE, '');
+    const existing = byLogical.get(key);
+    if (!existing || a.created_at > existing.created_at) byLogical.set(key, a);
+  }
+  return byLogical;
+}
+
+/** Download + unzip one artifact into `<destRoot>/<artifact.name>`; returns that dir. */
+export function downloadArtifact(artifact: ArtifactMeta, destRoot: string): string {
+  const zipPath = path.join(destRoot, 'artifact.zip');
+  execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
+    stdio: ['pipe', 'pipe', 'inherit'],
+  });
+  const destDir = path.join(destRoot, artifact.name);
+  fs.mkdirSync(destDir, { recursive: true });
+  execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' });
+  fs.unlinkSync(zipPath);
+  return destDir;
+}
+
+/** Fetch a run's current attempt number via `gh api` (defaults to 1). */
+export function fetchRunAttempt(repo: string, runId: string): number {
+  const attemptStr = execSync(`gh api "repos/${repo}/actions/runs/${runId}" --jq '.run_attempt'`, {
+    encoding: 'utf8',
+  }).trim();
+  return parseInt(attemptStr || '1', 10);
+}
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
new file mode 100644
index 00000000..529306cf
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, it } from 'vitest';
+
+import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates';
+
+describe('percentilesOf', () => {
+  it('returns null for empty input', () => {
+    expect(percentilesOf([])).toBeNull();
+    expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull();
+  });
+
+  it('computes percentiles for a simple integer range', () => {
+    // 1..100, evenly spaced — linear quantile is straightforward.
+    const xs = Array.from({ length: 100 }, (_, i) => i + 1);
+    const p = percentilesOf(xs);
+    expect(p).not.toBeNull();
+    expect(p!.n).toBe(100);
+    expect(p!.mean).toBeCloseTo(50.5, 6);
+    expect(p!.p50).toBeCloseTo(50.5, 6);
+    // For 100 sorted values, p75 = sorted[0.75 * 99] = sorted[74.25] interp.
+    expect(p!.p75).toBeCloseTo(75.25, 6);
+    expect(p!.p90).toBeCloseTo(90.1, 6);
+    expect(p!.p99).toBeCloseTo(99.01, 6);
+  });
+
+  it('filters out non-finite values before computing', () => {
+    const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]);
+    expect(p?.n).toBe(4);
+    expect(p?.mean).toBeCloseTo(2.5, 6);
+  });
+});
+
+describe('extractIslOsl', () => {
+  it('reads input/output sequence length from profiling records', () => {
+    const lines = [
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 200, unit: 'tokens' },
+          output_sequence_length: { value: 75, unit: 'tokens' },
+        },
+      }),
+      // warmup record — should be ignored
+      JSON.stringify({
+        metadata: { benchmark_phase: 'warmup' },
+        metrics: {
+          input_sequence_length: { value: 9999, unit: 'tokens' },
+          output_sequence_length: { value: 9999, unit: 'tokens' },
+        },
+      }),
+    ];
+    const { isl, osl } = extractIslOsl(lines.join('\n'));
+    expect(isl).toEqual([100, 200]);
+    expect(osl).toEqual([50, 75]);
+  });
+});
+
+describe('extractServerMetricSamples', () => {
+  it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:kv_cache_usage_perc': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, end_ns: 1, avg: 0.1 },
+                { start_ns: 1, end_ns: 2, avg: 0.5 },
+                { start_ns: 2, end_ns: 3, avg: 0.9 },
+              ],
+            },
+          ],
+        },
+        'vllm:prefix_cache_hits': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 80 },
+                { start_ns: 1, rate: 50 },
+                { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0
+              ],
+            },
+          ],
+        },
+        'vllm:prefix_cache_queries': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 100 }, // hit rate = 0.8
+                { start_ns: 1, rate: 100 }, // hit rate = 0.5
+                { start_ns: 2, rate: 0 },
+              ],
+            },
+          ],
+        },
+      },
+    });
+    const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+    expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]);
+    expect(prefixCacheHitRate).toEqual([0.8, 0.5]);
+  });
+
+  it('returns empty arrays when the JSON lacks the expected metric series', () => {
+    const out = extractServerMetricSamples(JSON.stringify({ metrics: {} }));
+    expect(out.kvCacheUtil).toEqual([]);
+    expect(out.prefixCacheHitRate).toEqual([]);
+  });
+});
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
new file mode 100644
index 00000000..72faa148
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -0,0 +1,406 @@
+/**
+ * Per-id aggregate stats for the "Aggregates across configs" view on the
+ * agentic detail page. Each id contributes one summary number per metric per
+ * percentile so the frontend can plot how each metric varies across the
+ * SKU's parallelism + concurrency configs.
+ *
+ * Sources:
+ *  - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase)
+ *  - `server_metrics_json` → time-series of KV cache utilization +
+ *     prefix-cache hit rate per scrape interval
+ *
+ * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing
+ * or has no usable samples — frontend treats those as "no data".
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+import type { DbClient } from '../connection.js';
+import {
+  fetchAggregateStatsRows,
+  percentilesOf,
+  readNum,
+  type MetricPercentiles,
+} from './agentic-shared';
+
+// Percentile math + envelope reader live in agentic-shared.ts; re-exported
+// here because etl/compute-aggregate-stats and the API layer import them
+// from this module.
+export { percentilesOf, type MetricPercentiles } from './agentic-shared';
+
+/**
+ * Bump when the aggregate-stats computation algorithm changes — the backfill
+ * script recomputes any row whose stored `aggregate_stats.version` is older.
+ * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
+ * import: the compute helper depends on the extractors below.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
+ *
+ * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
+ * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
+ * they do for vllm runs.
+ *
+ * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL.
+ */
+export const STATS_VERSION = 4;
+
+export interface AgenticAggregate {
+  id: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>;
+
+/**
+ * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per
+ * round-trip. `server_metrics_json_gz` is much bigger (~17 MB compressed
+ * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire
+ * size, so two of those = ~50 MB and three already trips the 64 MB cap).
+ * We fetch the two blob types in separate queries with different chunk
+ * sizes.
+ */
+const PROFILE_CHUNK_SIZE = 8;
+const SERVER_CHUNK_SIZE = 1;
+
+interface ProfileRecord {
+  metadata?: { benchmark_phase?: string };
+  metrics?: {
+    input_sequence_length?: { value?: number } | number;
+    output_sequence_length?: { value?: number } | number;
+  };
+}
+
+/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */
+export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
+  const isl: number[] = [];
+  const osl: number[] = [];
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const m = rec.metrics ?? {};
+    const i = readNum(m.input_sequence_length);
+    const o = readNum(m.output_sequence_length);
+    if (typeof i === 'number') isl.push(i);
+    if (typeof o === 'number') osl.push(o);
+  }
+  return { isl, osl };
+}
+
+interface TimeSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+  count?: number;
+  sum?: number;
+}
+interface Series {
+  labels?: Record<string, string>;
+  timeslices?: TimeSlice[];
+}
+interface MetricMeta {
+  series?: Series[];
+}
+interface MetricsJson {
+  metrics?: Record<string, MetricMeta>;
+}
+
+/**
+ * Aggregate a per-timeslice field across all series of a metric, indexed by
+ * the timeslice's `start_ns`. vllm reports one series per engine on
+ * multi-engine DP/PP deployments, so we sum (or average) across engines to
+ * get the cluster-wide value at each timeslice.
+ *
+ * `field` selects which numeric field on a timeslice to read (`avg` for
+ * gauges, `rate` for counter deltas). `combine` controls cross-engine math:
+ * 'sum' for running/waiting/throughput counters where the cluster total is
+ * the sum; 'avg' for KV cache utilization, which is bounded [0, 1] per
+ * engine and should be averaged across engines for the cluster view.
+ */
+function aggregateSeriesByStart(
+  metricSeries: readonly Series[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of metricSeries ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
+/**
+ * Parse the server_metrics_json → time-series arrays for KV cache util and
+ * prefix cache hit rate (per-interval, computed from the prometheus
+ * counters the same way trace-server-metrics does it).
+ *
+ * Aggregates across all engine series so multi-engine DP/PP deployments are
+ * counted correctly (previously we only read engine 0).
+ */
+/** First metric whose series array is non-empty; supports vllm/sglang fallback. */
+function pickFirstNonEmpty(
+  metrics: Record<string, MetricMeta>,
+  ...names: string[]
+): Series[] | undefined {
+  for (const name of names) {
+    const s = metrics[name]?.series;
+    if (s && s.length > 0) return s;
+  }
+  return undefined;
+}
+
+export function extractServerMetricSamples(json: string): {
+  kvCacheUtil: number[];
+  prefixCacheHitRate: number[];
+} {
+  const parsed = JSON.parse(json) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+
+  // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
+  // value stays a percentage; summing would give meaningless 0..N.
+  const kvSeriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
+  const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
+
+  // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across
+  // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens.
+  const hitsAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_hits',
+    'vllm:gpu_prefix_cache_hits',
+    'sglang:cached_tokens',
+  );
+  const queriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_queries',
+    'vllm:gpu_prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
+  const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
+  const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
+  const prefixCacheHitRate: number[] = [];
+  for (const [t, h] of hitsByT) {
+    const q = qByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push(h / q);
+  }
+
+  return { kvCacheUtil, prefixCacheHitRate };
+}
+
+/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
+const TARGET_METRIC_KEYS = new Set([
+  // vLLM
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits',
+  'vllm:gpu_prefix_cache_queries',
+  'vllm:prompt_tokens',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics
+ * we need. Avoids the Node 512 MB string cap that JSON.parse hits on
+ * server_metrics blobs from high-conc TP+EP runs (which can decompress to
+ * >500 MB because vllm dumps `cache_config_info` every scrape interval).
+ *
+ * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') →
+ * StreamObject (one metric per chunk) → keep only the keys we care about.
+ *
+ * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the
+ * synchronous fast path so callers can use either interchangeably.
+ */
+async function streamExtractServerMetricSamples(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  const collected: Record<string, MetricMeta> = {};
+  // stream-json's TypeScript types don't compose cleanly with node:stream's
+  // pipeline() generic, and several `.pipe()`/event APIs are typed loosely —
+  // cast to any for this local pipe chain. It works at runtime.
+  // stream-json composes transforms via stream-chain. `pick`/`streamObject`
+  // each return a Transform when called; `chain([...])` wires them.
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: MetricMeta };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+export async function getAgenticAggregates(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<AgenticAggregateMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const result: AgenticAggregateMap = {};
+
+  // Fast path: read the pre-computed `aggregate_stats` JSONB written by the
+  // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One
+  // round-trip pulls everything we need for every requested id with no blob
+  // decompression, so the slow blob-parsing fallback only runs for ids
+  // whose stats are missing or were produced by an older `STATS_VERSION`.
+  const statsRows = await fetchAggregateStatsRows<AggregateStatsRow>(sql, benchmarkResultIds);
+
+  const idsNeedingProfile: number[] = [];
+  const idsNeedingServer: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    const agg = blankAggregate(id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      agg.isl = row.stats.isl ?? null;
+      agg.osl = row.stats.osl ?? null;
+      agg.kvCacheUtil = row.stats.kvCacheUtil ?? null;
+      agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null;
+    } else {
+      // No stats (or stale version) — schedule the blob-parse fallback below
+      // so the response still surfaces data. Backfill should drain these.
+      idsNeedingProfile.push(id);
+      idsNeedingServer.push(id);
+    }
+    result[id] = agg;
+  }
+  // Also fall back for ids that didn't return a row at all (no trace_replay
+  // link) — keep the caller contract: every id we know about lands in the map.
+  for (const id of benchmarkResultIds) {
+    if (!(id in result)) result[id] = blankAggregate(id);
+  }
+
+  if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) {
+    return result;
+  }
+
+  // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ──────
+  for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; profile_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (row.profile_blob) {
+        try {
+          const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+          const { isl, osl } = extractIslOsl(jsonl);
+          result[id].isl = percentilesOf(isl);
+          result[id].osl = percentilesOf(osl);
+        } catch {
+          // ignore malformed blob
+        }
+      }
+    }
+  }
+  // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ───────
+  // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
+  // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
+  // path runs at most once per sibling set.
+  for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.server_metrics_json_gz as server_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; server_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (!row.server_blob) continue;
+      let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+      try {
+        const json = gunzipSync(row.server_blob).toString('utf8');
+        parsed = extractServerMetricSamples(json);
+      } catch (error) {
+        // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose
+        // server_metrics_json decompresses past Node's max string length.
+        // Stream-parse to extract just the metric subtrees we care about.
+        const code = error && (error as NodeJS.ErrnoException).code;
+        const msg = error instanceof Error ? error.message : String(error);
+        if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+          try {
+            parsed = await streamExtractServerMetricSamples(row.server_blob);
+          } catch {
+            // stream fallback failed too — leave nulls
+          }
+        }
+      }
+      if (parsed) {
+        result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil);
+        result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate);
+      }
+    }
+  }
+  return result;
+}
+
+/** Shape of the JSONB column when read back via postgres-js. */
+interface AggregateStatsRow {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+}
+
+function blankAggregate(id: number): AgenticAggregate {
+  return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
+}
diff --git a/packages/db/src/queries/agentic-shared.ts b/packages/db/src/queries/agentic-shared.ts
new file mode 100644
index 00000000..e8a639e7
--- /dev/null
+++ b/packages/db/src/queries/agentic-shared.ts
@@ -0,0 +1,81 @@
+/**
+ * Helpers shared by the agentic per-point queries (`agentic-aggregates.ts`,
+ * `derived-agentic-metrics.ts`): percentile math over aiperf samples,
+ * the `{value, unit}` metric-envelope reader, and the single-round-trip
+ * `aggregate_stats` fetch both fast paths start from.
+ */
+
+import type { DbClient } from '../connection.js';
+
+export interface MetricPercentiles {
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  /** Sample count used to compute the percentiles. */
+  n: number;
+}
+
+/** Linear-interpolated percentile (matches numpy's default linear method). */
+export function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+export function meanOf(xs: number[]): number {
+  if (xs.length === 0) return Number.NaN;
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/** Compute the percentile bundle for an array of samples; null if empty. */
+export function percentilesOf(samples: number[]): MetricPercentiles | null {
+  const clean = samples.filter((v) => Number.isFinite(v));
+  if (clean.length === 0) return null;
+  const sorted = [...clean].toSorted((a, b) => a - b);
+  return {
+    mean: meanOf(sorted),
+    p50: quantile(sorted, 0.5),
+    p75: quantile(sorted, 0.75),
+    p90: quantile(sorted, 0.9),
+    p99: quantile(sorted, 0.99),
+    n: sorted.length,
+  };
+}
+
+/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */
+export function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return v;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+/**
+ * One round-trip fetch of the pre-computed `aggregate_stats` JSONB for a set
+ * of benchmark_results ids (via their trace_replay link). Both agentic fast
+ * paths read from this; ids without a trace_replay row simply don't appear.
+ * `Stats` is the caller's view of the JSONB shape.
+ */
+export async function fetchAggregateStatsRows<Stats>(
+  sql: DbClient,
+  benchmarkResultIds: readonly number[],
+): Promise<{ benchmark_result_id: number; stats: Stats | null }[]> {
+  return (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as unknown as { benchmark_result_id: number; stats: Stats | null }[];
+}
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
new file mode 100644
index 00000000..2d36eb22
--- /dev/null
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -0,0 +1,169 @@
+/**
+ * Find all benchmark_results that share the same SKU (hardware + framework +
+ * model + precision + spec_method + disagg + benchmark_type + workflow_run)
+ * as the given point. Used by the detail page to render a "switch between
+ * concs / parallelisms" navigator within a single run.
+ */
+
+import type { DbClient } from '../connection.js';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  /** "on" | "off" | null. */
+  offload_mode: string | null;
+  decode_tp: number;
+  decode_ep: number;
+  decode_dp_attention: boolean;
+  decode_num_workers: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  prefill_dp_attention: boolean;
+  prefill_num_workers: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  is_multinode: boolean;
+  /** Throughput per GPU (tok/s/gpu) for this point; null if the metric is absent. */
+  tput_per_gpu: number | null;
+  /**
+   * Total requests for this point — `total_requests_completed` (aiperf runner)
+   * falling back to the legacy `num_requests_total`; null if neither is present.
+   */
+  total_requests: number | null;
+  /** True if this row IS the point passed in. */
+  is_current: boolean;
+  /** Whether the row has a stored trace_replay blob (for navigation hint). */
+  has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  /** Human-readable workflow_run summary so the page header can hint at provenance. */
+  github_run_id: number;
+  date: string;
+  /** Slug of the source dataset this run replayed (run_datasets), or null. */
+  dataset_slug: string | null;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export async function getBenchmarkSiblings(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<BenchmarkSiblings | null> {
+  // Step 1: resolve the SKU defining fields for the requested point.
+  const seed = (await sql`
+    select
+      c.hardware, c.framework, c.model, c.precision, c.spec_method,
+      br.benchmark_type, br.workflow_run_id, br.date::text,
+      wr.github_run_id, rd.dataset_slug
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join run_datasets rd on rd.workflow_run_id = br.workflow_run_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as {
+    hardware: string;
+    framework: string;
+    model: string;
+    precision: string;
+    spec_method: string;
+    benchmark_type: string;
+    workflow_run_id: number;
+    date: string;
+    github_run_id: number;
+    dataset_slug: string | null;
+  }[];
+  const root = seed[0];
+  if (!root) return null;
+
+  // Step 2: pull every sibling row sharing the SKU within the same workflow_run.
+  const rows = (await sql`
+    select
+      br.id, br.conc, br.offload_mode,
+      c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers,
+      c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode,
+      (br.metrics->>'tput_per_gpu')::float8 as tput_per_gpu,
+      coalesce(
+        (br.metrics->>'total_requests_completed')::float8,
+        (br.metrics->>'num_requests_total')::float8
+      ) as total_requests,
+      (br.trace_replay_id is not null) as has_trace
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    where br.workflow_run_id = ${root.workflow_run_id}
+      and br.benchmark_type = ${root.benchmark_type}
+      and c.hardware = ${root.hardware}
+      and c.framework = ${root.framework}
+      and c.model = ${root.model}
+      and c.precision = ${root.precision}
+      and c.spec_method = ${root.spec_method}
+    order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc
+  `) as unknown as {
+    id: number;
+    conc: number;
+    offload_mode: string | null;
+    decode_tp: number;
+    decode_ep: number;
+    decode_dp_attention: boolean;
+    decode_num_workers: number;
+    prefill_tp: number;
+    prefill_ep: number;
+    prefill_dp_attention: boolean;
+    prefill_num_workers: number;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    disagg: boolean;
+    is_multinode: boolean;
+    tput_per_gpu: number | null;
+    total_requests: number | null;
+    has_trace: boolean;
+  }[];
+
+  const siblings: BenchmarkSibling[] = rows.map((r) => ({
+    id: Number(r.id),
+    conc: r.conc,
+    offload_mode: r.offload_mode,
+    decode_tp: r.decode_tp,
+    decode_ep: r.decode_ep,
+    decode_dp_attention: r.decode_dp_attention,
+    decode_num_workers: r.decode_num_workers,
+    prefill_tp: r.prefill_tp,
+    prefill_ep: r.prefill_ep,
+    prefill_dp_attention: r.prefill_dp_attention,
+    prefill_num_workers: r.prefill_num_workers,
+    num_prefill_gpu: r.num_prefill_gpu,
+    num_decode_gpu: r.num_decode_gpu,
+    disagg: r.disagg,
+    is_multinode: r.is_multinode,
+    tput_per_gpu: r.tput_per_gpu === null ? null : Number(r.tput_per_gpu),
+    total_requests: r.total_requests === null ? null : Number(r.total_requests),
+    is_current: Number(r.id) === benchmarkResultId,
+    has_trace: r.has_trace,
+  }));
+
+  return {
+    sku: {
+      hardware: root.hardware,
+      framework: root.framework,
+      model: root.model,
+      precision: root.precision,
+      spec_method: root.spec_method,
+      benchmark_type: root.benchmark_type,
+      github_run_id: Number(root.github_run_id),
+      date: root.date,
+      dataset_slug: root.dataset_slug ?? null,
+    },
+    siblings,
+  };
+}
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index d99a1da1..37301e2b 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -11,6 +11,8 @@ import type { WorkerPower } from '../etl/benchmark-mapper.js';
 export type BenchmarkWorkerRow = WorkerPower;
 
 export interface BenchmarkRow {
+  /** Stable benchmark_results id used for agentic detail lookups. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -28,9 +30,11 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -50,7 +54,7 @@ export interface BenchmarkRow {
  * `['glm5', 'glm5.1']` unions both buckets under the one display.
  *
  * Selection unit is the LINE, not the point: for each line
- * `(config_id, benchmark_type, isl, osl)` we pick the single newest workflow run that
+ * `(config_id, benchmark_type, isl, osl, offload_mode)` we pick the single newest workflow run that
  * produced data for it (newest date, then latest sweep, then highest run id) and return
  * EVERY concurrency that one run measured — and nothing from any other run. A partial
  * re-sweep therefore truncates the line to its own concurrencies rather than stitching the
@@ -93,7 +97,8 @@ export async function getLatestBenchmarks(
             )
           )`
         : sql``;
-    // winners: the single newest run per LINE (config_id, benchmark_type, isl, osl) under the
+    // winners: the single newest run per LINE
+    // (config_id, benchmark_type, isl, osl, offload_mode) under the
     // date/run cutoff. br.date is a calendar day, so two same-day sweeps tie on date — break
     // by wr.run_started_at (latest sweep wins), then br.workflow_run_id so exactly one run wins
     // even when run_started_at is equal/null. The outer join then pulls EVERY concurrency that
@@ -101,8 +106,8 @@ export async function getLatestBenchmarks(
     // of concurrencies a partial re-sweep skipped).
     const rows = await sql`
       WITH winners AS (
-        SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl)
-          br.config_id, br.benchmark_type, br.isl, br.osl,
+        SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+          br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
           br.workflow_run_id AS winning_run_id
         FROM benchmark_results br
         JOIN configs c ON c.id = br.config_id
@@ -111,10 +116,11 @@ export async function getLatestBenchmarks(
           AND br.error IS NULL
           AND ${dateFilter}
           ${runFilter}
-        ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl,
+        ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
                  br.date DESC, wr.run_started_at DESC NULLS LAST, br.workflow_run_id DESC
       )
       SELECT
+        br.id,
         c.hardware,
         c.framework,
         c.model,
@@ -132,6 +138,8 @@ export async function getLatestBenchmarks(
         c.decode_num_workers,
         c.num_prefill_gpu,
         c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
         br.isl,
         br.osl,
         br.conc,
@@ -148,6 +156,7 @@ export async function getLatestBenchmarks(
         AND w.benchmark_type = br.benchmark_type
         AND w.isl IS NOT DISTINCT FROM br.isl
         AND w.osl IS NOT DISTINCT FROM br.osl
+        AND w.offload_mode = br.offload_mode
         AND w.winning_run_id = br.workflow_run_id
       WHERE br.error IS NULL
       ORDER BY br.config_id, br.conc, br.isl, br.osl
@@ -158,6 +167,7 @@ export async function getLatestBenchmarks(
   // No date filter: use materialized view for instant lookups
   const rows = await sql`
     SELECT
+      lb.id,
       c.hardware,
       c.framework,
       c.model,
@@ -175,6 +185,8 @@ export async function getLatestBenchmarks(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      lb.benchmark_type,
+      lb.offload_mode,
       lb.isl,
       lb.osl,
       lb.conc,
@@ -207,6 +219,7 @@ export async function getBenchmarksForRun(
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
     SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+      br.id,
       c.hardware,
       c.framework,
       c.model,
@@ -224,6 +237,8 @@ export async function getBenchmarksForRun(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
+      br.offload_mode,
       br.isl,
       br.osl,
       br.conc,
@@ -257,6 +272,7 @@ export async function getAllBenchmarksForHistory(
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
     SELECT
+      br.id,
       c.hardware,
       c.framework,
       c.model,
@@ -274,9 +290,12 @@ export async function getAllBenchmarksForHistory(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
+      br.offload_mode,
       br.isl,
       br.osl,
       br.conc,
+      br.image,
       br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics,
       br.workers,
       br.date::text,
diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts
new file mode 100644
index 00000000..c1676445
--- /dev/null
+++ b/packages/db/src/queries/datasets.test.ts
@@ -0,0 +1,102 @@
+import { describe, expect, it } from 'vitest';
+
+import type { DbClient } from '../connection.js';
+import { getConversation, listConversations, listDatasets } from './datasets.js';
+
+/**
+ * Mock DbClient: returns canned result sets in call order. Each call to the
+ * tagged-template `sql` shifts the next queued rows array. The query text is
+ * ignored — these tests assert the JS-side shaping/coercion, not SQL.
+ */
+function mockSql(queue: unknown[][]): DbClient {
+  const responses = [...queue];
+  return (() => Promise.resolve(responses.shift() ?? [])) as unknown as DbClient;
+}
+
+describe('listDatasets', () => {
+  it('coerces conversation_count to a number', async () => {
+    const sql = mockSql([
+      [
+        {
+          id: 'a/b',
+          slug: 'b',
+          label: 'B',
+          variant: 'full',
+          conversation_count: '393',
+          summary: {},
+        },
+      ],
+    ]);
+    const out = await listDatasets(sql);
+    expect(out).toHaveLength(1);
+    expect(out[0].conversation_count).toBe(393);
+    expect(typeof out[0].conversation_count).toBe('number');
+  });
+});
+
+describe('listConversations', () => {
+  it('returns null when the dataset slug is unknown', async () => {
+    const sql = mockSql([[]]); // datasets lookup → no rows
+    expect(await listConversations(sql, 'missing')).toBeNull();
+  });
+
+  it('returns total + numerically-coerced items', async () => {
+    const sql = mockSql([
+      [{ id: 'ds-id' }], // datasets lookup
+      [{ n: 2 }], // count
+      [
+        {
+          conv_id: 'c1',
+          models: ['m'],
+          num_turns: '5',
+          num_subagent_groups: '1',
+          total_in: '1000',
+          total_out: '200',
+          total_cached: '900',
+        },
+      ], // items
+    ]);
+    const out = await listConversations(sql, 'b', { sort: 'tokens' });
+    expect(out).not.toBeNull();
+    expect(out!.total).toBe(2);
+    expect(out!.items[0]).toMatchObject({
+      conv_id: 'c1',
+      num_turns: 5,
+      num_subagent_groups: 1,
+      total_in: 1000,
+      total_out: 200,
+      total_cached: 900,
+    });
+    expect(typeof out!.items[0].total_in).toBe('number');
+  });
+});
+
+describe('getConversation', () => {
+  it('returns null when the conversation is missing', async () => {
+    const sql = mockSql([[]]);
+    expect(await getConversation(sql, 'b', 'nope')).toBeNull();
+  });
+
+  it('coerces counts and passes through the structure', async () => {
+    const structure = { blockSize: 64, nodes: [], totals: {} };
+    const sql = mockSql([
+      [
+        {
+          conv_id: 'c1',
+          models: ['m'],
+          num_turns: '3',
+          num_subagent_groups: '0',
+          total_in: '500',
+          total_out: '100',
+          total_cached: '450',
+          structure,
+        },
+      ],
+    ]);
+    const out = await getConversation(sql, 'b', 'c1');
+    expect(out).not.toBeNull();
+    expect(out!.num_turns).toBe(3);
+    expect(out!.total_cached).toBe(450);
+    expect(out!.structure).toBe(structure);
+  });
+});
diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts
new file mode 100644
index 00000000..cfefe391
--- /dev/null
+++ b/packages/db/src/queries/datasets.ts
@@ -0,0 +1,213 @@
+/**
+ * Read queries for the agentic-benchmark source datasets (the HF cc-traces-weka
+ * corpora ingested by ingest-weka-dataset.ts). Back the /datasets area:
+ *   - listDatasets      → registry cards (no per-conversation rows)
+ *   - getDataset        → one dataset incl. precomputed chart_data
+ *   - listConversations → paginated conversation list (counts only, no structure)
+ *   - getConversation   → one conversation's flamegraph structure
+ */
+
+import type { DbClient } from '../connection.js';
+import type { ConversationStructure } from '../etl/weka-structure.js';
+
+export interface DatasetSummary {
+  blockSize?: number;
+  hashIdScope?: string | null;
+  totalIn?: number;
+  totalOut?: number;
+  totalCached?: number;
+  cachedPct?: number;
+  mainTurns?: number;
+  subagentGroups?: number;
+  subagentTurns?: number;
+  meanRequestsPerConversation?: number;
+  medianRequestsPerConversation?: number;
+  meanSubagentsPerTrace?: number;
+  medianSubagentsPerTrace?: number;
+  modelMix?: Record<string, number>;
+  [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: DatasetSummary;
+  ingested_at: string;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+  /** Precomputed distribution bins + stats keyed by metric (see ingest buildChartData). */
+  chart_data: Record<string, unknown>;
+}
+
+export interface ConversationListItem {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+}
+
+export interface ConversationList {
+  total: number;
+  items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: ConversationStructure;
+}
+
+/** All ingested datasets, newest first. Excludes the (large) chart_data blob. */
+export async function listDatasets(sql: DbClient): Promise<DatasetRecord[]> {
+  const rows = (await sql`
+    select id, slug, label, variant, description, hf_url, license,
+           conversation_count, summary, ingested_at::text
+    from datasets
+    order by ingested_at desc, slug asc
+  `) as unknown as DatasetRecord[];
+  return rows.map((r) => ({ ...r, conversation_count: Number(r.conversation_count) }));
+}
+
+/** One dataset by slug, including chart_data. Null if not found. */
+export async function getDataset(sql: DbClient, slug: string): Promise<DatasetDetail | null> {
+  const rows = (await sql`
+    select id, slug, label, variant, description, hf_url, license,
+           conversation_count, summary, chart_data, ingested_at::text
+    from datasets
+    where slug = ${slug}
+  `) as unknown as DatasetDetail[];
+  const row = rows[0];
+  if (!row) return null;
+  return { ...row, conversation_count: Number(row.conversation_count) };
+}
+
+export interface ListConversationsOpts {
+  search?: string;
+  limit?: number;
+  offset?: number;
+  /** 'tokens' (total_in desc), 'turns' (num_turns desc), or 'id' (conv_id asc). */
+  sort?: 'tokens' | 'turns' | 'subagents' | 'id';
+}
+
+const MAX_LIMIT = 200;
+
+/**
+ * Paginated conversation list for a dataset (by slug). Returns counts only —
+ * the per-conversation `structure` blob is fetched separately by
+ * getConversation so the list stays light.
+ */
+export async function listConversations(
+  sql: DbClient,
+  slug: string,
+  opts: ListConversationsOpts = {},
+): Promise<ConversationList | null> {
+  const ds = (await sql`select id from datasets where slug = ${slug}`) as unknown as {
+    id: string;
+  }[];
+  const datasetId = ds[0]?.id;
+  if (!datasetId) return null;
+
+  const limit = Math.min(MAX_LIMIT, Math.max(1, opts.limit ?? 50));
+  const offset = Math.max(0, opts.offset ?? 0);
+  const search = opts.search?.trim();
+  const like = search ? `%${search}%` : null;
+
+  const totalRows = (await sql`
+    select count(*)::int as n
+    from dataset_conversations
+    where dataset_id = ${datasetId}
+      and (${like}::text is null or conv_id ilike ${like})
+  `) as unknown as { n: number }[];
+  const total = totalRows[0]?.n ?? 0;
+
+  // Separate queries per sort (literal ORDER BY) — the neon HTTP driver doesn't
+  // compose nested sql fragments the way postgres.js does, so we can't splice an
+  // order-by fragment. The sort key is an enum, never raw user input.
+  const sort = opts.sort ?? 'tokens';
+  let items: ConversationListItem[];
+  if (sort === 'turns') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by num_turns desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else if (sort === 'subagents') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by num_subagent_groups desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else if (sort === 'id') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by total_in desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  }
+
+  return {
+    total,
+    items: items.map((r) => ({
+      ...r,
+      num_turns: Number(r.num_turns),
+      num_subagent_groups: Number(r.num_subagent_groups),
+      total_in: Number(r.total_in),
+      total_out: Number(r.total_out),
+      total_cached: Number(r.total_cached),
+    })),
+  };
+}
+
+/** One conversation's full flamegraph structure. Null if dataset/conv missing. */
+export async function getConversation(
+  sql: DbClient,
+  slug: string,
+  convId: string,
+): Promise<ConversationDetail | null> {
+  const rows = (await sql`
+    select dc.conv_id, dc.models, dc.num_turns, dc.num_subagent_groups,
+           dc.total_in, dc.total_out, dc.total_cached, dc.structure
+    from dataset_conversations dc
+    join datasets d on d.id = dc.dataset_id
+    where d.slug = ${slug} and dc.conv_id = ${convId}
+  `) as unknown as ConversationDetail[];
+  const row = rows[0];
+  if (!row) return null;
+  return {
+    ...row,
+    num_turns: Number(row.num_turns),
+    num_subagent_groups: Number(row.num_subagent_groups),
+    total_in: Number(row.total_in),
+    total_out: Number(row.total_out),
+    total_cached: Number(row.total_cached),
+  };
+}
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..afc5b22d
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -0,0 +1,111 @@
+import { describe, expect, it } from 'vitest';
+
+import { computeDerivedFromBlob } from './derived-agentic-metrics.js';
+
+/** Build one aiperf JSONL record for the synthetic fixture. */
+function rec(
+  conversation_id: string,
+  turn_index: number,
+  fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number },
+): string {
+  return JSON.stringify({
+    metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' },
+    metrics: {
+      request_latency: { value: fields.latency_ms, unit: 'ms' },
+      time_to_first_token: { value: fields.ttft_ms, unit: 'ms' },
+      input_sequence_length: { value: fields.isl, unit: 'tokens' },
+      output_sequence_length: { value: fields.osl, unit: 'tokens' },
+    },
+  });
+}
+
+describe('computeDerivedFromBlob', () => {
+  it('returns nulls when no usable records', () => {
+    const out = computeDerivedFromBlob('');
+    expect(out.normalized_session_time_s).toBeNull();
+    expect(out.p90_prefill_tps_per_user).toBeNull();
+    expect(out.normalized_e2e_400).toBeNull();
+  });
+
+  it('normalizes each request to 400 output tokens before taking percentiles', () => {
+    const jsonl = [
+      // Both requests have TTFT=2s and ITL=20ms, despite very different OSL/E2E.
+      rec('s1', 0, { isl: 100, osl: 100, ttft_ms: 2000, latency_ms: 3980 }),
+      rec('s2', 0, { isl: 100, osl: 1000, ttft_ms: 2000, latency_ms: 21_980 }),
+    ].join('\n');
+
+    const out = computeDerivedFromBlob(jsonl);
+    // 2s TTFT + 399 × 20ms ITL = 9.98s for both requests.
+    expect(out.normalized_e2e_400?.n).toBe(2);
+    expect(out.normalized_e2e_400?.p75).toBeCloseTo(9.98, 8);
+    expect(out.normalized_e2e_400?.p90).toBeCloseTo(9.98, 8);
+  });
+
+  it('rescales single-session time and computes P90 prefill', () => {
+    // One session, two turns. load = (100+50) + (200+50) = 400.
+    // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s.
+    const jsonl = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200.
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('rescales times across sessions with unequal load', () => {
+    // s1: 1 turn, load = 100, T = 1s
+    // s2: 1 turn, load = 300, T = 3s
+    // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2
+    // Mean T̃ = 2.0
+    const jsonl = [
+      rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(2, 6);
+  });
+
+  it('drops records missing required fields and skips non-profiling phase', () => {
+    const lines = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      // missing TTFT — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' },
+        metrics: {
+          request_latency: { value: 1000, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      // warmup phase — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' },
+        metrics: {
+          request_latency: { value: 9999, unit: 'ms' },
+          time_to_first_token: { value: 9999, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+    ];
+    const out = computeDerivedFromBlob(lines.join('\n'));
+    expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('p90 across turns: 10-turn session picks the right rank', () => {
+    // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910.
+    const turns = Array.from({ length: 10 }, (_, i) =>
+      rec('s1', i, {
+        isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens
+        osl: 10,
+        ttft_ms: 1000, // 1 second → rates: 100..1000 tps
+        latency_ms: 1500,
+      }),
+    );
+    const out = computeDerivedFromBlob(turns.join('\n'));
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+  });
+});
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
new file mode 100644
index 00000000..8e5d15c9
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -0,0 +1,268 @@
+/**
+ * Live-computed per-point metrics derived from the stored aiperf
+ * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB
+ * because they require grouping by `conversation_id` and aggregating per
+ * session — work that's cheap once per agentic point but adds up to be
+ * meaningful only when actually plotted.
+ *
+ * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal
+ *   (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of
+ *   per-turn `request_latency` per session (inter-turn tool/thinking gaps are
+ *   inherently excluded since we only sum the active GPU time, not wallclock).
+ *   Each session's time is rescaled by `mean_load / session_load`, where load
+ *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
+ *
+ * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn
+ *   in every session — the per-session percentile + cross-session mean
+ *   sandwich was discarded because it just dampens tail behavior.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import { NORMALIZED_E2E_OUTPUT_TOKENS } from '@semianalysisai/inferencex-constants';
+
+import type { DbClient } from '../connection.js';
+import { STATS_VERSION } from './agentic-aggregates';
+import {
+  fetchAggregateStatsRows,
+  meanOf,
+  percentilesOf,
+  quantile,
+  readNum,
+  type MetricPercentiles,
+} from './agentic-shared';
+
+export interface DerivedAgenticMetric {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Mean normalized session time in seconds. */
+  normalized_session_time_s: number | null;
+  /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */
+  p90_prefill_tps_per_user: number | null;
+  /** P75 normalized per-request E2E at a fixed 400-token output length. */
+  p75_normalized_e2e_400_s: number | null;
+  /** P90 normalized per-request E2E at a fixed 400-token output length. */
+  p90_normalized_e2e_400_s: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+/**
+ * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless
+ * HTTP driver caps responses at 64 MB — chunk to stay well under.
+ */
+const QUERY_CHUNK_SIZE = 6;
+
+interface RecordMetrics {
+  request_latency?: { value?: number; unit?: string } | number;
+  time_to_first_token?: { value?: number; unit?: string } | number;
+  input_sequence_length?: { value?: number } | number;
+  output_sequence_length?: { value?: number } | number;
+}
+
+interface RecordMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  benchmark_phase?: string;
+}
+
+interface ProfileRecord {
+  metadata?: RecordMetadata;
+  metrics?: RecordMetrics;
+}
+
+interface TurnFields {
+  request_latency_ms: number;
+  ttft_ms: number;
+  isl: number;
+  osl: number;
+}
+
+function extractTurn(rec: ProfileRecord): TurnFields | null {
+  const m = rec.metrics ?? {};
+  const rl = readNum(m.request_latency);
+  const tt = readNum(m.time_to_first_token);
+  const isl = readNum(m.input_sequence_length);
+  const osl = readNum(m.output_sequence_length);
+  if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null;
+  if (rl <= 0 || tt <= 0 || isl <= 0) return null;
+  return { request_latency_ms: rl, ttft_ms: tt, isl, osl };
+}
+
+/**
+ * Parse one point's JSONL and return the two derived metrics. Returns
+ * `{ session_time: null, prefill: null }` if the blob has no usable records.
+ */
+export function computeDerivedFromBlob(jsonl: string): {
+  normalized_session_time_s: number | null;
+  p90_prefill_tps_per_user: number | null;
+  normalized_e2e_400: MetricPercentiles | null;
+} {
+  // Group records by conversation_id, filter to the profiling phase.
+  const bySession = new Map<string, TurnFields[]>();
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const sid = rec.metadata?.conversation_id;
+    if (!sid) continue;
+    const turn = extractTurn(rec);
+    if (!turn) continue;
+    let list = bySession.get(sid);
+    if (!list) {
+      list = [];
+      bySession.set(sid, list);
+    }
+    list.push(turn);
+  }
+  if (bySession.size === 0) {
+    return {
+      normalized_session_time_s: null,
+      p90_prefill_tps_per_user: null,
+      normalized_e2e_400: null,
+    };
+  }
+
+  // Per-session aggregates for session time; per-turn prefill rates pool into
+  // a single global array so the percentile sees the full distribution.
+  const sessionTimesS: number[] = [];
+  const sessionLoads: number[] = [];
+  const allPrefillRates: number[] = [];
+  const allNormalizedE2eS: number[] = [];
+  for (const turns of bySession.values()) {
+    let timeMs = 0;
+    let load = 0;
+    for (const t of turns) {
+      timeMs += t.request_latency_ms;
+      load += t.isl + t.osl;
+      const ttftSec = t.ttft_ms / 1000;
+      if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec);
+
+      // Keep the observed TTFT, then project the request's mean decode
+      // interval to a fixed output length. Do this per request before taking
+      // percentiles so long original outputs do not dominate the tail.
+      const observedDecodeIntervals = Math.max(t.osl - 1, 1);
+      const itlMs = (t.request_latency_ms - t.ttft_ms) / observedDecodeIntervals;
+      const normalizedMs = t.ttft_ms + (NORMALIZED_E2E_OUTPUT_TOKENS - 1) * itlMs;
+      if (
+        Number.isFinite(itlMs) &&
+        itlMs >= 0 &&
+        Number.isFinite(normalizedMs) &&
+        normalizedMs > 0
+      ) {
+        allNormalizedE2eS.push(normalizedMs / 1000);
+      }
+    }
+    if (load > 0) {
+      sessionTimesS.push(timeMs / 1000);
+      sessionLoads.push(load);
+    }
+  }
+
+  // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+  let normalized: number | null = null;
+  if (sessionTimesS.length > 0) {
+    const meanLoad = meanOf(sessionLoads);
+    if (meanLoad > 0) {
+      const scaled: number[] = [];
+      for (let i = 0; i < sessionTimesS.length; i++) {
+        const ti = sessionTimesS[i]!;
+        const li = sessionLoads[i]!;
+        if (li > 0) scaled.push(ti * (meanLoad / li));
+      }
+      normalized = scaled.length > 0 ? meanOf(scaled) : null;
+    }
+  }
+
+  let prefill: number | null = null;
+  if (allPrefillRates.length > 0) {
+    allPrefillRates.sort((a, b) => a - b);
+    prefill = quantile(allPrefillRates, 0.9);
+  }
+
+  return {
+    normalized_session_time_s: normalized,
+    p90_prefill_tps_per_user: prefill,
+    normalized_e2e_400: percentilesOf(allNormalizedE2eS),
+  };
+}
+
+export async function getDerivedAgenticMetrics(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<DerivedAgenticMetricMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const result: DerivedAgenticMetricMap = {};
+
+  // Fast path: read the pre-computed values out of `aggregate_stats`. The
+  // ingest pipeline computes both metrics in the same pass that produces the
+  // percentile bundles, so a single SQL round-trip covers most ids without
+  // touching the gzipped profile blob.
+  const statsRows = await fetchAggregateStatsRows<{
+    version?: number;
+    normalizedSessionTimeS?: number | null;
+    p90PrefillTpsPerUser?: number | null;
+    normalizedE2e400?: MetricPercentiles | null;
+  }>(sql, benchmarkResultIds);
+
+  const idsNeedingBlob: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      result[id] = {
+        id,
+        normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null,
+        p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null,
+        p75_normalized_e2e_400_s: row.stats.normalizedE2e400?.p75 ?? null,
+        p90_normalized_e2e_400_s: row.stats.normalizedE2e400?.p90 ?? null,
+      };
+    } else {
+      idsNeedingBlob.push(id);
+    }
+  }
+
+  if (idsNeedingBlob.length === 0) return result;
+
+  // Fallback: parse the profile blob directly. Used for rows whose
+  // `aggregate_stats` is null or computed by an older STATS_VERSION; the
+  // backfill script drains the population so this path should be rare.
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
+
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } =
+        computeDerivedFromBlob(jsonl);
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        normalized_session_time_s,
+        p90_prefill_tps_per_user,
+        p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null,
+        p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null,
+      };
+    } catch {
+      // Skip malformed blobs silently — frontend treats missing ids as "no data".
+    }
+  }
+  return result;
+}
diff --git a/packages/db/src/queries/request-timeline.test.ts b/packages/db/src/queries/request-timeline.test.ts
new file mode 100644
index 00000000..62ba5385
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.test.ts
@@ -0,0 +1,45 @@
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
+import type { DbClient } from '../connection.js';
+
+import { getRequestTimeline } from './request-timeline';
+
+function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } {
+  const responses = [...queue];
+  const calls: string[] = [];
+  const sql = ((strings: TemplateStringsArray) => {
+    calls.push(strings.join('?'));
+    return Promise.resolve(responses.shift() ?? []);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+const timeline: RequestTimeline = {
+  version: REQUEST_TIMELINE_VERSION,
+  startNs: 100,
+  endNs: 200,
+  durationS: 0.0000001,
+  requests: [],
+};
+
+describe('getRequestTimeline', () => {
+  it('returns the current precomputed timeline without selecting the raw profile blob', async () => {
+    const { sql, calls } = mockSql([
+      [{ trace_replay_id: 870, has_blob: true, request_timeline: timeline }],
+    ]);
+
+    await expect(getRequestTimeline(sql, 422991)).resolves.toEqual(timeline);
+    expect(calls).toHaveLength(1);
+    expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob');
+  });
+
+  it('does not fetch a blob when neither a current timeline nor a blob exists', async () => {
+    const { sql, calls } = mockSql([
+      [{ trace_replay_id: 870, has_blob: false, request_timeline: null }],
+    ]);
+
+    await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull();
+    expect(calls).toHaveLength(1);
+  });
+});
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
new file mode 100644
index 00000000..2a6bb40c
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.ts
@@ -0,0 +1,64 @@
+/**
+ * Per-request timeline for the agentic detail page's Gantt view.
+ *
+ * Backed by `agentic_trace_replay.request_timeline` (pre-computed at
+ * ingest time, see `etl/compute-request-timeline.ts`). The fast path is
+ * a single SQL row read; the slow path re-computes from
+ * `profile_export_jsonl_gz` and is only taken when the column is missing
+ * or the stored `REQUEST_TIMELINE_VERSION` is stale.
+ */
+
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+  type RequestTimeline,
+} from '../etl/compute-request-timeline';
+
+import type { DbClient } from '../connection.js';
+
+export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
+
+interface RawMetaRow {
+  trace_replay_id: number;
+  has_blob: boolean;
+  request_timeline: RequestTimeline | null;
+}
+
+interface RawBlobRow {
+  blob: Buffer | null;
+}
+
+export async function getRequestTimeline(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<RequestTimeline | null> {
+  const rows = (await sql`
+    select
+      atr.id as trace_replay_id,
+      (atr.profile_export_jsonl_gz is not null) as has_blob,
+      atr.request_timeline
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawMetaRow[];
+  const row = rows[0];
+  if (!row) return null;
+
+  // Fast path: pre-computed timeline at the current version.
+  if (row.request_timeline && Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION) {
+    return row.request_timeline;
+  }
+
+  if (!row.has_blob) return null;
+
+  // Slow path only: fetch the large profile blob after establishing that the
+  // pre-computed timeline is stale or missing. Long trace runs can have blobs
+  // large enough to exceed Neon's 64 MiB encoded-response limit, so the fast
+  // path must never select the blob alongside request_timeline.
+  const blobRows = (await sql`
+    select profile_export_jsonl_gz as blob
+    from agentic_trace_replay
+    where id = ${row.trace_replay_id}
+  `) as unknown as RawBlobRow[];
+  return computeRequestTimeline(blobRows[0]?.blob ?? null);
+}
diff --git a/packages/db/src/queries/trace-availability.ts b/packages/db/src/queries/trace-availability.ts
new file mode 100644
index 00000000..155b3d4c
--- /dev/null
+++ b/packages/db/src/queries/trace-availability.ts
@@ -0,0 +1,34 @@
+/**
+ * Bulk "does this point have a trace_replay blob?" lookup. Used by the
+ * inference scatter chart to decide whether to render a "View charts"
+ * button in the pinned tooltip — a pure presence check that doesn't need
+ * the multi-megabyte blob payload `getTraceHistograms` ships.
+ *
+ * Going through `trace-histograms` for this trips Neon's 64 MB
+ * per-HTTP-response cap as soon as one chunk's combined gzip payload
+ * exceeds the cap (high-conc 8×8 rows can be 13 MB compressed each).
+ */
+
+import type { DbClient } from '../connection.js';
+
+/** Map of `benchmark_results.id` → true for each id that has a trace_replay blob. */
+export type TraceAvailabilityMap = Record<number, true>;
+
+export async function getTraceAvailability(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceAvailabilityMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows = (await sql`
+    select br.id
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+      and atr.profile_export_jsonl_gz is not null
+  `) as { id: number }[];
+
+  const result: TraceAvailabilityMap = {};
+  for (const row of rows) result[Number(row.id)] = true;
+  return result;
+}
diff --git a/packages/db/src/queries/trace-histograms.test.ts b/packages/db/src/queries/trace-histograms.test.ts
new file mode 100644
index 00000000..c3c6ec8a
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.test.ts
@@ -0,0 +1,78 @@
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
+import type { DbClient } from '../connection.js';
+
+import { getTraceHistograms } from './trace-histograms';
+
+function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } {
+  const responses = [...queue];
+  const calls: string[] = [];
+  const sql = ((strings: TemplateStringsArray) => {
+    calls.push(strings.join('?'));
+    return Promise.resolve(responses.shift() ?? []);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+const timeline: RequestTimeline = {
+  version: REQUEST_TIMELINE_VERSION,
+  startNs: 0,
+  endNs: 10,
+  durationS: 0.00000001,
+  requests: [
+    {
+      cid: 'session-1',
+      ti: 0,
+      wid: '0',
+      ad: 0,
+      phase: 'profiling',
+      credit: 0,
+      start: 1,
+      ack: 2,
+      end: 3,
+      ttftMs: 1,
+      tpotMs: 2,
+      isl: 4096,
+      osl: 512,
+      cancelled: false,
+    },
+    {
+      cid: 'session-1',
+      ti: 1,
+      wid: '0',
+      ad: 0,
+      phase: 'profiling',
+      credit: 4,
+      start: 5,
+      ack: 6,
+      end: 7,
+      ttftMs: 1,
+      tpotMs: 2,
+      isl: null,
+      osl: 128,
+      cancelled: false,
+    },
+  ],
+};
+
+describe('getTraceHistograms', () => {
+  it('builds distributions from the precomputed timeline without selecting the raw blob', async () => {
+    const { sql, calls } = mockSql([
+      [
+        {
+          benchmark_result_id: 422991,
+          trace_replay_id: 870,
+          request_timeline: timeline,
+          has_blob: true,
+        },
+      ],
+    ]);
+
+    await expect(getTraceHistograms(sql, [422991])).resolves.toEqual({
+      422991: { id: 422991, isl: [4096], osl: [512, 128] },
+    });
+    expect(calls).toHaveLength(1);
+    expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob');
+  });
+});
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
new file mode 100644
index 00000000..24b96c35
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -0,0 +1,134 @@
+/**
+ * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl`
+ * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller
+ * passes the set of `benchmark_results.id`s it wants and receives one entry
+ * per id that actually has a trace_replay blob (others are silently skipped).
+ *
+ * The JSONL has one JSON object per request with the shape:
+ *   { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... } }
+ *
+ * Returns raw arrays rather than pre-binned histograms — payload stays tiny
+ * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin
+ * however it wants.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
+
+import type { DbClient } from '../connection.js';
+
+export interface TraceHistogramPoint {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+const QUERY_CHUNK_SIZE = 12;
+// Bytea values expand in Neon's JSON-over-HTTP response. Keep raw fallback
+// reads comfortably below its 64 MiB response cap; current ingests should use
+// request_timeline instead and never need this path.
+const MAX_FALLBACK_BLOB_BYTES = 24 * 1024 * 1024;
+
+interface TimelineRow {
+  benchmark_result_id: number;
+  trace_replay_id: number;
+  request_timeline: RequestTimeline | null;
+  has_blob: boolean;
+}
+
+function histogramFromTimeline(id: number, timeline: RequestTimeline): TraceHistogramPoint {
+  const isl: number[] = [];
+  const osl: number[] = [];
+  for (const request of timeline.requests) {
+    if (typeof request.isl === 'number' && Number.isFinite(request.isl)) isl.push(request.isl);
+    if (typeof request.osl === 'number' && Number.isFinite(request.osl)) osl.push(request.osl);
+  }
+  return { id, isl, osl };
+}
+
+export async function getTraceHistograms(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceHistogramMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const result: TraceHistogramMap = {};
+  const fallbackRows: TimelineRow[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.id as trace_replay_id,
+        atr.request_timeline,
+        (atr.profile_export_jsonl_gz is not null) as has_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as unknown as TimelineRow[];
+    for (const row of chunkRows) {
+      const id = Number(row.benchmark_result_id);
+      if (
+        row.request_timeline &&
+        Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION
+      ) {
+        result[id] = histogramFromTimeline(id, row.request_timeline);
+      } else if (row.has_blob) {
+        fallbackRows.push(row);
+      }
+    }
+  }
+
+  // Compatibility fallback for pre-timeline rows. Fetch one small blob at a
+  // time; oversized legacy rows are omitted instead of turning the whole API
+  // response into a 507.
+  for (const row of fallbackRows) {
+    const blobRows = (await sql`
+      select profile_export_jsonl_gz as blob
+      from agentic_trace_replay
+      where id = ${row.trace_replay_id}
+        and octet_length(profile_export_jsonl_gz) <= ${MAX_FALLBACK_BLOB_BYTES}
+    `) as unknown as { blob: Buffer }[];
+    const blob = blobRows[0]?.blob;
+    if (!blob) continue;
+    try {
+      const jsonl = gunzipSync(blob).toString('utf8');
+      const isl: number[] = [];
+      const osl: number[] = [];
+      for (const line of jsonl.split('\n')) {
+        if (!line) continue;
+        let rec: { metrics?: Record<string, { value?: number } | number> };
+        try {
+          rec = JSON.parse(line);
+        } catch {
+          continue;
+        }
+        const m = rec.metrics ?? {};
+        const islVal = readMetric(m['input_sequence_length']);
+        const oslVal = readMetric(m['output_sequence_length']);
+        if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal);
+        if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal);
+      }
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        isl,
+        osl,
+      };
+    } catch {
+      // Drop malformed blobs silently — caller treats missing ids as "no data".
+    }
+  }
+  return result;
+}
+
+function readMetric(v: { value?: number } | number | undefined): number | undefined {
+  if (v === undefined || v === null) return undefined;
+  if (typeof v === 'number') return v;
+  return v.value;
+}
diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts
new file mode 100644
index 00000000..f045dfda
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.test.ts
@@ -0,0 +1,105 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, type ChartSeries } from '../etl/compute-chart-series';
+import type { DbClient } from '../connection.js';
+
+import { getTraceServerMetrics } from './trace-server-metrics';
+
+function currentSeries(): ChartSeries {
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs: 0,
+    endNs: 1e9,
+    durationS: 1,
+    timeslicesCount: 1,
+    kvCacheUsage: [],
+    prefixCacheHitRate: [],
+    queueDepth: [],
+    promptTokensBySource: {},
+    prefillTps: [{ t: 0, value: 100 }],
+    decodeTps: [],
+    prefixCacheHitsTps: [],
+    hostKvCacheUsage: [],
+    kvCacheUsageByEngine: [],
+    metricSources: [],
+  };
+}
+
+function metaRow(overrides: Record<string, unknown> = {}) {
+  return {
+    id: 42,
+    trace_replay_id: 7,
+    has_blob: true,
+    chart_series: currentSeries(),
+    hardware: 'gb200',
+    framework: 'dynamo-vllm',
+    model: 'deepseek-r1-0528',
+    precision: 'fp8',
+    spec_method: 'none',
+    disagg: true,
+    conc: 128,
+    offload_mode: 'off',
+    isl: null,
+    osl: null,
+    benchmark_type: 'agentic_traces',
+    date: '2026-06-23',
+    run_url: null,
+    server_gpu_cache_hit_rate: null,
+    server_cpu_cache_hit_rate: null,
+    kv_cache_pool_tokens: null,
+    ...overrides,
+  };
+}
+
+function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } {
+  const responses = [...queue];
+  const calls: string[] = [];
+  const sql = ((strings: TemplateStringsArray) => {
+    calls.push(strings.join('?'));
+    return Promise.resolve(responses.shift() ?? []);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+describe('getTraceServerMetrics', () => {
+  it('returns current precomputed series without selecting the raw blob', async () => {
+    const { sql, calls } = mockSql([[metaRow()]]);
+
+    const result = await getTraceServerMetrics(sql, 42);
+
+    expect(result?.prefillTps).toEqual([{ t: 0, value: 100 }]);
+    expect(calls).toHaveLength(1);
+    expect(calls[0]).not.toContain('server_metrics_json_gz as blob');
+  });
+
+  it('fetches and computes the raw blob only when chart_series is stale', async () => {
+    const raw = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metrics: {
+            'vllm:prompt_tokens': {
+              series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 321 }] }],
+            },
+          },
+        }),
+      ),
+    );
+    const stale = { ...currentSeries(), version: CHART_SERIES_VERSION - 1 };
+    const { sql, calls } = mockSql([[metaRow({ chart_series: stale })], [{ blob: raw }]]);
+
+    const result = await getTraceServerMetrics(sql, 42);
+
+    expect(result?.prefillTps).toEqual([{ t: 0, value: 321 }]);
+    expect(calls).toHaveLength(2);
+    expect(calls[1]).toContain('server_metrics_json_gz as blob');
+  });
+
+  it('returns null without a blob and does not issue a second query', async () => {
+    const { sql, calls } = mockSql([[metaRow({ has_blob: false, chart_series: null })]]);
+
+    await expect(getTraceServerMetrics(sql, 42)).resolves.toBeNull();
+    expect(calls).toHaveLength(1);
+  });
+});
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
new file mode 100644
index 00000000..d24d0879
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -0,0 +1,211 @@
+/**
+ * Time-series view of one agentic benchmark point: chart-ready arrays for
+ * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS,
+ * and per-source prompt-token counts.
+ *
+ * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest
+ * time, see `etl/compute-chart-series.ts`). The fast path is a single SQL
+ * row read; the slow path re-computes from `server_metrics_json_gz` and is
+ * only taken when the column is missing or the stored
+ * `CHART_SERIES_VERSION` is stale (the backfill script should drain that).
+ */
+
+import {
+  CHART_SERIES_VERSION,
+  computeChartSeries,
+  type ChartSeries,
+  type MetricSourceSeries,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '../etl/compute-chart-series';
+
+import type { DbClient } from '../connection.js';
+
+export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series';
+
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  /** GitHub Actions run URL for jumping to the source. */
+  run_url: string | null;
+  /** Cumulative end-of-run cache-hit number the dashboard already shows. */
+  server_gpu_cache_hit_rate: number | null;
+  /** Cumulative end-of-run CPU offload cache-hit. */
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  /** Point context — hardware, model, conc, etc. for the page header. */
+  meta: PointMeta;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */
+  kvCacheUsage: TimeSeriesPoint[];
+  /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). */
+  prefixCacheHitRate: TimeSeriesPoint[];
+  /** Request queue depth: running, waiting, total per scrape. */
+  queueDepth: QueueDepthPoint[];
+  /**
+   * Per-source prompt-token counts over time (counter rate per scrape).
+   * Keyed by the value of the `source` label (typically `local_cache_hit`,
+   * `external_cache_hit`, `miss`, etc.). Plot as stacked area.
+   */
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. */
+  prefillTps: TimeSeriesPoint[];
+  /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
+  decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+   * the cluster-average `kvCacheUsage` line covers that case alone.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * Total KV-cache pool size in tokens (num_gpu_blocks × block_size, summed
+   * across engines). vLLM only — null for SGLang/TRT or older rows.
+   */
+  kvCachePoolTokens: number | null;
+  /** Orchestrator-normalized metrics grouped by endpoint/worker. */
+  metricSources: MetricSourceSeries[];
+}
+
+interface RawMetaRow extends PointMeta {
+  trace_replay_id: number | null;
+  has_blob: boolean;
+  chart_series: ChartSeries | null;
+  /** Derived at server-log ingest from "GPU KV cache size: N tokens" lines. */
+  kv_cache_pool_tokens: string | null;
+}
+
+interface RawBlobRow {
+  blob: Buffer | null;
+}
+
+function buildMeta(row: RawMetaRow): PointMeta {
+  return {
+    id: Number(row.id),
+    hardware: row.hardware,
+    framework: row.framework,
+    model: row.model,
+    precision: row.precision,
+    spec_method: row.spec_method,
+    disagg: row.disagg,
+    conc: row.conc,
+    offload_mode: row.offload_mode,
+    isl: row.isl,
+    osl: row.osl,
+    benchmark_type: row.benchmark_type,
+    date: row.date,
+    run_url: row.run_url,
+    server_gpu_cache_hit_rate:
+      row.server_gpu_cache_hit_rate === null ? null : Number(row.server_gpu_cache_hit_rate),
+    server_cpu_cache_hit_rate:
+      row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
+  };
+}
+
+function merge(
+  meta: PointMeta,
+  series: ChartSeries,
+  kvCachePoolTokens: number | null,
+): TraceServerMetrics {
+  return {
+    meta,
+    kvCachePoolTokens,
+    startNs: series.startNs,
+    endNs: series.endNs,
+    durationS: series.durationS,
+    timeslicesCount: series.timeslicesCount,
+    kvCacheUsage: series.kvCacheUsage,
+    prefixCacheHitRate: series.prefixCacheHitRate,
+    queueDepth: series.queueDepth,
+    promptTokensBySource: series.promptTokensBySource,
+    prefillTps: series.prefillTps,
+    decodeTps: series.decodeTps,
+    // v2 chart_series rows pre-backfill don't have this field — default to []
+    prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
+    hostKvCacheUsage: series.hostKvCacheUsage ?? [],
+    // v8+ field; older chart_series rows lack it → omit per-engine overlay.
+    kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [],
+    // v9+ field; old rows are served without a source selector until backfilled.
+    metricSources: series.metricSources ?? [],
+  };
+}
+
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      br.trace_replay_id,
+      (atr.server_metrics_json_gz is not null) as has_blob,
+      atr.chart_series,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate,
+      (br.metrics ->> 'kv_cache_pool_tokens')::numeric as kv_cache_pool_tokens
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawMetaRow[];
+  const row = rows[0];
+  if (!row) return null;
+  if (!row.has_blob || row.trace_replay_id === null) return null;
+  const meta = buildMeta(row);
+  const kvCachePoolTokens =
+    row.kv_cache_pool_tokens === null ? null : Number(row.kv_cache_pool_tokens);
+
+  // Fast path: pre-computed chart_series at the current version.
+  if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) {
+    return merge(meta, row.chart_series, kvCachePoolTokens);
+  }
+
+  // Slow path only: fetch the large raw blob after establishing that the
+  // pre-computed series is missing or stale. Disaggregated blobs can be tens
+  // of MB compressed, so selecting this in the metadata query defeats the
+  // fast path even when chart_series is current.
+  const blobRows = (await sql`
+    select server_metrics_json_gz as blob
+    from agentic_trace_replay
+    where id = ${row.trace_replay_id}
+  `) as unknown as RawBlobRow[];
+  const blob = blobRows[0]?.blob;
+  if (!blob) return null;
+
+  // `computeChartSeries` handles
+  // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP
+  // rows succeed even before the backfill drains them.
+  const series = await computeChartSeries(blob, {
+    framework: row.framework,
+    disagg: row.disagg,
+  });
+  if (!series) return null;
+  return merge(meta, series, kvCachePoolTokens);
+}
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts
index dfcb9e9f..01e13dd8 100644
--- a/packages/db/src/queries/workflow-info.ts
+++ b/packages/db/src/queries/workflow-info.ts
@@ -129,20 +129,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise<DateC
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
-/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, date) combos for the availability API. */
+/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, benchmark_type, date) combos for the availability API. */
 export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRow[]> {
   const rows = await sql`
-    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text
+    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text
     FROM availability a
     WHERE EXISTS (
       SELECT 1
@@ -153,8 +155,9 @@ export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRo
         AND c.hardware = a.hardware
         AND c.framework = a.framework
         AND c.precision = a.precision
-        AND br.isl = a.isl
-        AND br.osl = a.osl
+        AND br.isl IS NOT DISTINCT FROM a.isl
+        AND br.osl IS NOT DISTINCT FROM a.osl
+        AND br.benchmark_type = a.benchmark_type
         AND br.date = a.date
         AND br.error IS NULL
         AND wr.conclusion IS NOT NULL

From 740f9aa3328426dc113c1f7017440b3599911145 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:11:41 -0500
Subject: [PATCH 05/40] ci: agentic ingest dispatch workflow and ingest agent
 docs

---
 .claude/agents/ingest.md                     | 196 +++++++++++++++++++
 .github/workflows/ingest-agentic-results.yml | 180 +++++++++++++++++
 2 files changed, 376 insertions(+)
 create mode 100644 .claude/agents/ingest.md
 create mode 100644 .github/workflows/ingest-agentic-results.yml

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
new file mode 100644
index 00000000..10e37d6c
--- /dev/null
+++ b/.claude/agents/ingest.md
@@ -0,0 +1,196 @@
+---
+name: ingest
+description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL.
+tools: Bash, Read, Edit, Write
+---
+
+You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`.
+
+## Environment
+
+- **Repo root**: `/Users/quilicic/InferenceX-app`
+- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements:
+  - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname.
+  - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`.
+- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000)
+- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app`
+- **INVALIDATE_SECRET** lives in repo root `.env` under that key.
+- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var.
+
+## Standard ingest
+
+```bash
+cd /Users/quilicic/InferenceX-app/packages/db
+DATABASE_WRITE_URL='<provided direct non-pooled write URL>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts --download <RUN_ID> SemiAnalysisAI/InferenceX
+```
+
+Then refresh the materialized view (the script's auto-refresh sometimes races):
+`REFRESH MATERIALIZED VIEW latest_benchmarks;`
+
+## Cache purge (always do after any DB mutation)
+
+```bash
+SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2 | tr -d '"')
+# Localhost (port 3002, NOT 3000)
+curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate
+# Preview
+mkdir -p /tmp/vp && cd /tmp/vp \
+  && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \
+  && vercel curl /api/v1/invalidate \
+       --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \
+       --yes -- -sS -X POST -H "Authorization: Bearer $SECRET"
+rm -rf /tmp/vp
+```
+
+## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision))
+
+```sql
+BEGIN;
+DELETE FROM benchmark_results br USING configs c
+WHERE c.id = br.config_id
+  AND c.model = '<model>' AND c.hardware = '<hw>' AND c.framework = '<framework>'
+  AND c.precision = '<prec>' AND br.benchmark_type = '<bt>';
+DELETE FROM availability
+WHERE model = '<model>' AND hardware = '<hw>' AND framework = '<framework>'
+  AND precision = '<prec>' AND benchmark_type = '<bt>';
+COMMIT;
+```
+
+If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. Don't do this unless asked.
+
+## AIPerf tagging — DO NOT use by default
+
+AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision).
+
+Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`.
+
+<details>
+<summary>Explicit-request-only: how to tag a run as `spec_decoding='aiperf'`</summary>
+
+```bash
+RID=<run_id>
+TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX)
+cd $TMPDIR
+
+# 1. Logical-name dedup + download
+gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \
+  --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \
+  | python3 -c "
+import sys, re, collections
+seen = collections.OrderedDict()
+for line in sys.stdin:
+    name, url, created = line.rstrip('\n').split('\t')
+    key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name)
+    if key not in seen or seen[key][2] < created:
+        seen[key] = (name, url, created)
+for _, (name, url, _) in seen.items():
+    print(f'{name}\t{url}')
+" > artifacts.tsv
+while IFS=$'\t' read -r name url; do
+  mkdir -p "$name"
+  gh api "$url" > "$name/a.zip" 2>/dev/null
+  unzip -oq "$name/a.zip" -d "$name" 2>/dev/null
+  rm "$name/a.zip"
+done < artifacts.tsv
+
+# 2. Patch every benchmark JSON to set spec_decoding=aiperf
+find $TMPDIR -name "*.json" | python3 -c "
+import sys, json
+for fn in (l.strip() for l in sys.stdin):
+    try:
+        with open(fn) as f: d = json.load(f)
+    except Exception: continue
+    rows = d if isinstance(d, list) else [d]
+    if not rows or not isinstance(rows[0], dict): continue
+    changed = False
+    for row in rows:
+        if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row):
+            row['spec_decoding'] = 'aiperf'
+            changed = True
+    if changed:
+        with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f)
+"
+
+# 3. Ingest in CI mode (reads INGEST_* env vars)
+cd /Users/quilicic/InferenceX-app/packages/db
+INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \
+DATABASE_WRITE_URL='<provided direct non-pooled write URL>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts
+rm -rf $TMPDIR
+```
+
+The `spec_method` column has a lowercase check constraint — always lowercase.
+
+</details>
+
+## Don't auto-mention "AIPerf" in changelog entries
+
+Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`.
+
+## Adding a perf changelog entry — MANDATORY for every ingest
+
+**You ALWAYS MUST add a changelog entry for every run you ingest. This is not optional.** Every standard ingest, delete+reingest, and partial ingest gets exactly one changelog entry. Never finish an ingest without one.
+
+- If the user gave changelog text, use it verbatim (substitute `<SKU>` with the run's hardware SKU when the text contains that placeholder).
+- If the user did NOT specify text, DO NOT skip the changelog — derive a sensible description from the run name (see convention below) and add it anyway, then tell the user what you used so they can adjust.
+
+Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `<model>-<precision>-<hw>-<framework>` (matches what the user actually sees in the filter chain).
+
+```sql
+INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link)
+SELECT id, date, '', '', ARRAY['<model>-<precision>-<hw>-<framework>'], '<description>', NULL
+FROM latest_workflow_runs WHERE github_run_id = <RUN_ID>
+RETURNING id, workflow_run_id, date::text, description;
+```
+
+Description convention from prior entries: `<HW upper> <Model> Ingest #<N> (<note>)` — e.g.
+
+- `B200 Kimi Ingest #1`
+- `MI355X Kimi Ingest #2`
+- `H200 Kimi Ingest #1 (mmap cache)`
+
+If the user doesn't specify a description, DO NOT skip the entry and DO NOT block on asking — derive a description from the run name, add the entry, and report what you used so the user can adjust.
+
+## Common gotchas
+
+- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <wr_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`.
+- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT.
+- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites.
+- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the `_<runner>_<attempt>` suffix.
+- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection.
+- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with prior full sweep via `UPDATE benchmark_results.date = '<full-sweep-date>'` so the frontend's max-date-per-group dedup doesn't drop the older sweep.
+- **Agentic interactivity normalization (`*_intvty`)**: for `agentic_traces` runs, interactivity MUST be the slow-tail reciprocal of the ITL percentile — `*_intvty = 1/*_itl` (so `p90_intvty = 1/p90_itl`). Some harness versions emit `*_intvty` as `p(1/ITL)` instead (fast-tail — inverts percentile order, e.g. p90 shows ~`1/p10(ITL)`), which silently contaminates cross-run Pareto comparisons. The ingest mapper (`benchmark-mapper.ts`) now **derives `*_intvty` from `*_itl` and discards the artifact's value** for agentic rows, so a normal ingest is self-correcting — no manual step needed. The frontend `agenticAliases` does the same for overlay / `?unofficialrun=` rows. If you ever load agentic data through a path that bypasses the mapper, run `pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes` (idempotent; rewrites `mean/p75/p90/p95 _intvty = 1/_itl`) then refresh the MV + purge cache. `std_intvty` is intentionally left alone (the reciprocal of a std is meaningless; the API strips it anyway).
+
+## Process
+
+1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<RID> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips).
+2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding.
+3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line.
+4. **Refresh materialized view**.
+5. **Add changelog entry — ALWAYS, MANDATORY.** Every ingest gets exactly one changelog entry (see "Adding a perf changelog entry — MANDATORY"). Use the user's text if given (substituting `<SKU>`); otherwise derive one from the run name and add it anyway. Never skip this step.
+6. **Purge both caches** (localhost 3002 + preview — never port 3000).
+7. **Report** the row count, date, hardware, run id, and the changelog id (always present).
+
+## Related: ingesting agentic _datasets_ (not benchmark runs)
+
+This agent ingests **benchmark runs**. The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow:
+
+```bash
+cd packages/db && DATABASE_WRITE_URL='<direct write url>' \
+  pnpm exec tsx src/ingest-weka-dataset.ts <hf-dataset-id> \
+  [--label "…"] [--variant full|256k] [--description "…"] [--limit N]
+```
+
+It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
+
+New agentic benchmark artifacts preserve AIPerf's `metadata.dataset` provenance as a top-level `dataset` object. Standard benchmark ingest automatically derives the dataset slug from `dataset.hf_dataset_name` and upserts `run_datasets`; do not manually backfill that mapping for new-format runs. Manual mapping is only needed for legacy artifacts that do not contain dataset provenance.
+
+## Don't
+
+- Don't push to git unless the user asked.
+- Don't ingest without permission if it's a delete+reingest of existing data.
+- Don't hit port 3000 for cache purge — it's a different project.
+- Don't capitalize `spec_method` values (DB has a lowercase check constraint).
diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
new file mode 100644
index 00000000..cf8366ea
--- /dev/null
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -0,0 +1,180 @@
+name: Ingest Agentic Benchmark Results
+
+# Dispatched from the main InferenceX repo at the end of an agentic (AgentX
+# trace-replay) sweep, mirroring the fixed-seq-len `ingest-results` dispatch:
+#
+#   curl -sSf -X POST \
+#     -H "Authorization: Bearer $INFX_FRONTEND_PAT" \
+#     -H "Accept: application/vnd.github+v3+json" \
+#     https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
+#     -d '{"event_type": "ingest-agentic-results",
+#          "client_payload": {"run-id": "<run_id>", "run-attempt": "<attempt>"}}'
+#
+# The ingest script (packages/db/src/ingest-ci-run.ts) auto-detects agentic
+# artifacts: benchmark rows land in benchmark_results (benchmark_type=
+# 'agentic_traces'), raw profile exports + server metrics land in the
+# agentic_trace_replay sidecar with precomputed chart/timeline JSONBs, the
+# run is linked to its dataset in run_datasets, and changelog-metadata is
+# ingested when present. This is a separate workflow from ingest-results.yml
+# because agentic ingests are blob-heavy (100MB+ gzipped profile exports per
+# high-concurrency point) and need a much longer timeout, plus
+# agentic-specific alerting (missing dataset slug).
+
+on:
+  repository_dispatch:
+    types: [ingest-agentic-results]
+
+jobs:
+  ingest:
+    # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes
+    # far longer than a fixed-seq-len ingest.
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Wait for source run to finish
+        run: sleep 300
+
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+      - uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8
+      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: '24'
+          cache: pnpm
+      - name: Install dependencies
+        run: pnpm install --filter @semianalysisai/inferencex-db...
+        env:
+          CYPRESS_INSTALL_BINARY: '0'
+
+      - name: Run migrations
+        env:
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+        run: pnpm admin:db:migrate --yes
+
+      - name: Download artifacts from InferenceX run
+        env:
+          GH_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
+          RUN_ID: ${{ github.event.client_payload.run-id }}
+          ARTIFACTS_PATH: ${{ github.workspace }}/artifacts
+        run: |
+          mkdir -p "$ARTIFACTS_PATH"
+
+          # Download all artifacts for the run, deduplicated by name (keep latest).
+          gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/${RUN_ID}/artifacts" --paginate \
+            | jq -r '
+                [.artifacts[]]
+                | group_by(.name) | map(sort_by(.created_at) | last)[]
+                | "\(.name)\t\(.archive_download_url)"' \
+            | while IFS=$'\t' read -r name url; do
+                echo "Downloading artifact: ${name}"
+                ok=false
+                for attempt in 1 2 3; do
+                  if gh api "${url}" > artifact.zip; then
+                    ok=true
+                    break
+                  fi
+                  echo "  Attempt ${attempt}/3 failed, retrying in ${attempt}s..."
+                  sleep "$attempt"
+                done
+                if [ "$ok" = false ]; then
+                  echo "::warning::Failed to download artifact after 3 attempts: ${name} — skipping"
+                  rm -f artifact.zip
+                  echo "$name" >> "$ARTIFACTS_PATH/.failures"
+                  continue
+                fi
+                mkdir -p "${ARTIFACTS_PATH}/${name}"
+                if ! unzip -o artifact.zip -d "${ARTIFACTS_PATH}/${name}"; then
+                  echo "::warning::Failed to extract artifact: ${name} — skipping"
+                  rm -rf "${ARTIFACTS_PATH:?}/${name}"
+                  echo "$name" >> "$ARTIFACTS_PATH/.failures"
+                fi
+                rm -f artifact.zip
+              done
+
+          if [ -f "$ARTIFACTS_PATH/.failures" ]; then
+            count=$(wc -l < "$ARTIFACTS_PATH/.failures")
+            rm "$ARTIFACTS_PATH/.failures"
+            echo "::warning::${count} artifact(s) failed to download; ingesting what's available"
+          fi
+
+          echo "Downloaded artifacts:"
+          ls "$ARTIFACTS_PATH/"
+
+          if [ -z "$(ls -A "$ARTIFACTS_PATH")" ]; then
+            echo "::error::No artifacts could be downloaded from run ${RUN_ID}"
+            exit 1
+          fi
+
+      - name: Ingest results to DB
+        env:
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+          GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
+          INGEST_RUN_ID: ${{ github.event.client_payload.run-id }}
+          INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt }}
+          INGEST_ARTIFACTS_PATH: ${{ github.workspace }}/artifacts
+          INGEST_REPO: SemiAnalysisAI/InferenceX
+          UNMAPPED_ENTITIES_OUTPUT: ${{ github.workspace }}/unmapped-entities.json
+        run: pnpm admin:db:ingest:ci
+
+      - name: Apply run overrides
+        env:
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+        run: pnpm admin:db:apply-overrides --yes
+
+      - name: Verify database
+        env:
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+        run: pnpm admin:db:verify
+
+      - name: Invalidate Vercel cache
+        env:
+          VERCEL_INVALIDATE_SECRET: ${{ secrets.VERCEL_INVALIDATE_SECRET }}
+        run: |
+          curl -sSf -X POST "https://inferencex.semianalysis.com/api/v1/invalidate" \
+            -H "Authorization: Bearer $VERCEL_INVALIDATE_SECRET" || true
+
+      - name: Check for unmapped entities
+        if: always()
+        id: unmapped
+        run: |
+          f="${{ github.workspace }}/unmapped-entities.json"
+          if [ -f "$f" ]; then
+            echo "found=true" >> "$GITHUB_OUTPUT"
+            models=$(jq -r '.models // [] | join(", ")' "$f")
+            hardware=$(jq -r '.hardware // [] | join(", ")' "$f")
+            precisions=$(jq -r '.precisions // [] | join(", ")' "$f")
+            datasets=$(jq -r '.datasets // [] | join(", ")' "$f")
+            msg=""
+            [ -n "$models" ] && msg="${msg}Models: ${models}\n"
+            [ -n "$hardware" ] && msg="${msg}Hardware: ${hardware}\n"
+            [ -n "$precisions" ] && msg="${msg}Precisions: ${precisions}\n"
+            [ -n "$datasets" ] && msg="${msg}Datasets missing from datasets table (run ingest-weka-dataset): ${datasets}\n"
+            {
+              echo 'summary<<EOF'
+              echo -e "$msg"
+              echo 'EOF'
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Notify Slack on unmapped entities
+        if: steps.unmapped.outputs.found == 'true'
+        uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3
+        with:
+          webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+          webhook-type: incoming-webhook
+          payload: |
+            {
+              "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
+            }
+
+      - name: Notify Slack on failure
+        if: failure()
+        uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3
+        with:
+          webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+          webhook-type: incoming-webhook
+          payload: |
+            {
+              "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
+            }

From 311eb3cc5d158887e3a075b20066458ea0261563 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:11:46 -0500
Subject: [PATCH 06/40] feat(api): v1 agentic + datasets endpoints and React
 Query hooks

---
 .../app/src/app/api/unofficial-run/route.ts   |   6 +
 .../app/api/v1/agentic-aggregates/route.ts    |  34 +++
 .../app/api/v1/benchmark-siblings/route.ts    |  28 +++
 .../[slug]/conversations/[convId]/route.ts    |  33 +++
 .../v1/datasets/[slug]/conversations/route.ts |  53 +++++
 .../src/app/api/v1/datasets/[slug]/route.ts   |  29 +++
 packages/app/src/app/api/v1/datasets/route.ts |  24 +++
 .../api/v1/derived-agentic-metrics/route.ts   |  43 ++++
 packages/app/src/app/api/v1/id-routes.test.ts | 136 ++++++++++++
 packages/app/src/app/api/v1/id-routes.ts      |  85 ++++++++
 .../src/app/api/v1/request-timeline/route.ts  |  30 +++
 .../app/api/v1/trace-availability/route.ts    |  29 +++
 .../src/app/api/v1/trace-histograms/route.ts  |  34 +++
 .../app/api/v1/trace-server-metrics/route.ts  |  30 +++
 .../src/hooks/api/benchmark-id-query.test.ts  |  37 ++++
 .../app/src/hooks/api/benchmark-id-query.ts   |  59 ++++++
 .../src/hooks/api/use-agentic-aggregates.ts   |  31 +++
 .../src/hooks/api/use-benchmark-siblings.ts   |  44 ++++
 packages/app/src/hooks/api/use-benchmarks.ts  |  12 +-
 packages/app/src/hooks/api/use-datasets.ts    | 199 ++++++++++++++++++
 .../api/use-derived-agentic-metrics.test.ts   |  13 ++
 .../hooks/api/use-derived-agentic-metrics.ts  |  55 +++++
 .../app/src/hooks/api/use-request-timeline.ts |  53 +++++
 .../src/hooks/api/use-trace-availability.ts   |  15 ++
 .../app/src/hooks/api/use-trace-histograms.ts |  25 +++
 .../src/hooks/api/use-trace-server-metrics.ts |  96 +++++++++
 packages/app/src/hooks/useChartContext.ts     |  12 +-
 packages/app/src/hooks/useThemeColors.test.ts |  28 +++
 28 files changed, 1269 insertions(+), 4 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/agentic-aggregates/route.ts
 create mode 100644 packages/app/src/app/api/v1/benchmark-siblings/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/route.ts
 create mode 100644 packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
 create mode 100644 packages/app/src/app/api/v1/id-routes.test.ts
 create mode 100644 packages/app/src/app/api/v1/id-routes.ts
 create mode 100644 packages/app/src/app/api/v1/request-timeline/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-availability/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-histograms/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-server-metrics/route.ts
 create mode 100644 packages/app/src/hooks/api/benchmark-id-query.test.ts
 create mode 100644 packages/app/src/hooks/api/benchmark-id-query.ts
 create mode 100644 packages/app/src/hooks/api/use-agentic-aggregates.ts
 create mode 100644 packages/app/src/hooks/api/use-benchmark-siblings.ts
 create mode 100644 packages/app/src/hooks/api/use-datasets.ts
 create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
 create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.ts
 create mode 100644 packages/app/src/hooks/api/use-request-timeline.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-availability.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-histograms.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-server-metrics.ts

diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 072c99f1..304ccb0b 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -33,6 +33,10 @@ export function normalizeArtifactRows(
     if (!params) continue;
     const { config } = params;
     results.push({
+      // Synthetic id — overlay rows aren't persisted, so trace_replay lookups
+      // (keyed on benchmark_results.id) will always miss, which is the
+      // intended behaviour: overlays never have stored trace_replay blobs.
+      id: 0,
       hardware: config.hardware,
       framework: config.framework,
       model: config.model,
@@ -50,6 +54,8 @@ export function normalizeArtifactRows(
       decode_num_workers: config.decodeNumWorkers,
       num_prefill_gpu: config.numPrefillGpu,
       num_decode_gpu: config.numDecodeGpu,
+      benchmark_type: params.benchmarkType,
+      offload_mode: params.offloadMode,
       isl: params.isl,
       osl: params.osl,
       conc: params.conc,
diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
new file mode 100644
index 00000000..63fd2512
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -0,0 +1,34 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getAgenticAggregates,
+  type AgenticAggregateMap,
+} from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: response stays small (a few numbers per id), but generating it
+// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the
+// "Aggregates" toggle stays snappy.
+const getCachedAgenticAggregates = cachedQuery(
+  (ids: number[]): Promise<AgenticAggregateMap> => getAgenticAggregates(getDb(), ids),
+  'agentic-aggregates',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/agentic-aggregates?ids=1,2,3
+ *
+ * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization,
+ * and prefix cache hit rate — computed live from the stored aiperf
+ * profile_export.jsonl + server_metrics_json blobs. Ids without a
+ * trace_replay blob (or with no usable samples) get nulls.
+ */
+export const GET = idsQueryRoute({
+  maxIds: 200,
+  logLabel: 'agentic aggregates',
+  fetch: getCachedAgenticAggregates,
+});
diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
new file mode 100644
index 00000000..38e79c23
--- /dev/null
+++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
@@ -0,0 +1,28 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getBenchmarkSiblings,
+  type BenchmarkSiblings,
+} from '@semianalysisai/inferencex-db/queries/benchmark-siblings';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedSiblings = cachedQuery(
+  (id: number): Promise<BenchmarkSiblings | null> => getBenchmarkSiblings(getDb(), id),
+  'benchmark-siblings',
+);
+
+/**
+ * GET /api/v1/benchmark-siblings?id=N
+ *
+ * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the
+ * benchmark_result + all sibling rows that share that SKU within the same
+ * workflow_run. Used by the agentic detail page to render a navigator.
+ */
+export const GET = idQueryRoute({
+  logLabel: 'benchmark siblings',
+  fetch: getCachedSiblings,
+});
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
new file mode 100644
index 00000000..84cc15e3
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
@@ -0,0 +1,33 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getConversation,
+  type ConversationDetail,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedConversation = cachedQuery(
+  (slug: string, convId: string): Promise<ConversationDetail | null> =>
+    getConversation(getDb(), slug, convId),
+  'dataset-conversation',
+);
+
+/** GET /api/v1/datasets/[slug]/conversations/[convId] — flamegraph structure. */
+export async function GET(
+  _request: NextRequest,
+  { params }: { params: Promise<{ slug: string; convId: string }> },
+) {
+  const { slug, convId } = await params;
+  try {
+    const data = await getCachedConversation(slug, decodeURIComponent(convId));
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversation:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
new file mode 100644
index 00000000..62b9e5b7
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
@@ -0,0 +1,53 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  listConversations,
+  type ConversationList,
+  type ListConversationsOpts,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const SORTS = new Set(['tokens', 'turns', 'subagents', 'id']);
+
+const getCachedConversations = cachedQuery(
+  (
+    slug: string,
+    search: string,
+    limit: number,
+    offset: number,
+    sort: string,
+  ): Promise<ConversationList | null> =>
+    listConversations(getDb(), slug, {
+      search: search || undefined,
+      limit,
+      offset,
+      sort: sort as ListConversationsOpts['sort'],
+    }),
+  'dataset-conversations',
+);
+
+/**
+ * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort=
+ * Paginated conversation list (counts only, no flamegraph structure).
+ */
+export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) {
+  const { slug } = await params;
+  const sp = request.nextUrl.searchParams;
+  const search = sp.get('search') ?? '';
+  const limit = Math.min(200, Math.max(1, Number(sp.get('limit')) || 50));
+  const offset = Math.max(0, Number(sp.get('offset')) || 0);
+  const sortParam = sp.get('sort') ?? 'tokens';
+  const sort = SORTS.has(sortParam) ? sortParam : 'tokens';
+  try {
+    const data = await getCachedConversations(slug, search, limit, offset, sort);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversations:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
new file mode 100644
index 00000000..9e4af580
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
@@ -0,0 +1,29 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedDataset = cachedQuery(
+  (slug: string): Promise<DatasetDetail | null> => getDataset(getDb(), slug),
+  'dataset',
+);
+
+/** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. */
+export async function GET(
+  _request: NextRequest,
+  { params }: { params: Promise<{ slug: string }> },
+) {
+  const { slug } = await params;
+  try {
+    const data = await getCachedDataset(slug);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts
new file mode 100644
index 00000000..f0acca3c
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/route.ts
@@ -0,0 +1,24 @@
+import { NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedDatasets = cachedQuery(
+  (): Promise<DatasetRecord[]> => listDatasets(getDb()),
+  'datasets',
+);
+
+/** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards). */
+export async function GET() {
+  try {
+    const data = await getCachedDatasets();
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching datasets:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
new file mode 100644
index 00000000..836a8d93
--- /dev/null
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -0,0 +1,43 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getDerivedAgenticMetrics,
+  type DerivedAgenticMetricMap,
+} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: the response is one entry per id with two numbers, but the
+// derivation work parses thousands of JSONL records per blob — cache the
+// computed result so a chart-refresh hits the warm path.
+// Bumped to v3 for per-request normalized-E2E @ 400 output tokens.
+// Stale v1 cache entries return undefined for the new field and silently
+// blank the chart with "No data available".
+const getCachedDerivedAgenticMetrics = cachedQuery(
+  (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
+  'derived-agentic-metrics-v3',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/derived-agentic-metrics?ids=1,2,3
+ *
+ * Returns per-id derived metrics computed live from the stored aiperf
+ * profile_export.jsonl blobs:
+ *  - normalized_session_time_s: mean across sessions of session e2e time
+ *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
+ *  - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT)
+ *    across every turn in every session.
+ *  - p75/p90_normalized_e2e_400_s: percentile of per-request
+ *    TTFT + 399 × observed ITL.
+ *
+ * Ids without a trace_replay blob or with unparseable records are omitted.
+ */
+export const GET = idsQueryRoute({
+  maxIds: 200,
+  logLabel: 'derived agentic metrics',
+  fetch: getCachedDerivedAgenticMetrics,
+});
diff --git a/packages/app/src/app/api/v1/id-routes.test.ts b/packages/app/src/app/api/v1/id-routes.test.ts
new file mode 100644
index 00000000..32499e99
--- /dev/null
+++ b/packages/app/src/app/api/v1/id-routes.test.ts
@@ -0,0 +1,136 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('@/lib/api-cache', () => ({
+  cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { NextRequest, NextResponse } from 'next/server';
+
+import { idQueryRoute, idsQueryRoute, parseIdsParam } from './id-routes';
+
+function req(url: string): NextRequest {
+  return new NextRequest(new URL(url, 'http://localhost'));
+}
+
+beforeEach(() => {
+  vi.clearAllMocks();
+});
+
+describe('parseIdsParam', () => {
+  it('parses, dedupes, and sorts ids ascending', () => {
+    const result = parseIdsParam(req('/x?ids=3, 1,2,3'), 200);
+    expect(result).toEqual([1, 2, 3]);
+  });
+
+  it('drops non-finite and non-positive ids', () => {
+    const result = parseIdsParam(req('/x?ids=abc,-1,0,5'), 200);
+    expect(result).toEqual([5]);
+  });
+
+  it('returns 400 when the param is missing', async () => {
+    const result = parseIdsParam(req('/x'), 200);
+    expect(result).toBeInstanceOf(NextResponse);
+    const res = result as NextResponse;
+    expect(res.status).toBe(400);
+    const body = await res.json();
+    expect(body.error).toBe('ids query param is required');
+  });
+
+  it('returns 400 when no valid ids remain', async () => {
+    const result = parseIdsParam(req('/x?ids=abc,-2'), 200);
+    expect(result).toBeInstanceOf(NextResponse);
+    const res = result as NextResponse;
+    expect(res.status).toBe(400);
+    const body = await res.json();
+    expect(body.error).toBe('no valid ids provided');
+  });
+
+  it('returns 400 when the id count exceeds maxIds', async () => {
+    const result = parseIdsParam(req('/x?ids=1,2,3'), 2);
+    expect(result).toBeInstanceOf(NextResponse);
+    const res = result as NextResponse;
+    expect(res.status).toBe(400);
+    const body = await res.json();
+    expect(body.error).toBe('too many ids (max 2)');
+  });
+});
+
+describe('idsQueryRoute', () => {
+  it('fetches with sorted deduped ids and returns the payload', async () => {
+    const fetch = vi.fn().mockResolvedValue({ 1: 'a', 2: 'b' });
+    const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch });
+
+    const res = await GET(req('/x?ids=2,1,2'));
+    expect(res.status).toBe(200);
+    expect(await res.json()).toEqual({ 1: 'a', 2: 'b' });
+    expect(fetch).toHaveBeenCalledWith([1, 2]);
+  });
+
+  it('returns 400 without calling fetch when ids are invalid', async () => {
+    const fetch = vi.fn();
+    const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch });
+
+    const res = await GET(req('/x'));
+    expect(res.status).toBe(400);
+    expect(fetch).not.toHaveBeenCalled();
+  });
+
+  it('returns 500 and logs when the fetch throws', async () => {
+    const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
+    const fetch = vi.fn().mockRejectedValue(new Error('boom'));
+    const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch });
+
+    const res = await GET(req('/x?ids=1'));
+    expect(res.status).toBe(500);
+    const body = await res.json();
+    expect(body.error).toBe('Internal server error');
+    expect(consoleSpy).toHaveBeenCalledWith('Error fetching things:', expect.any(Error));
+    consoleSpy.mockRestore();
+  });
+});
+
+describe('idQueryRoute', () => {
+  it('fetches by id and returns the payload', async () => {
+    const fetch = vi.fn().mockResolvedValue({ value: 42 });
+    const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+    const res = await GET(req('/x?id=7'));
+    expect(res.status).toBe(200);
+    expect(await res.json()).toEqual({ value: 42 });
+    expect(fetch).toHaveBeenCalledWith(7);
+  });
+
+  it.each(['/x', '/x?id=abc', '/x?id=0'])('returns 400 for %s', async (url) => {
+    const fetch = vi.fn();
+    const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+    const res = await GET(req(url));
+    expect(res.status).toBe(400);
+    const body = await res.json();
+    expect(body.error).toBe('id is required (benchmark_result_id)');
+    expect(fetch).not.toHaveBeenCalled();
+  });
+
+  it('returns 404 when the fetch yields null', async () => {
+    const fetch = vi.fn().mockResolvedValue(null);
+    const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+    const res = await GET(req('/x?id=7'));
+    expect(res.status).toBe(404);
+    const body = await res.json();
+    expect(body.error).toBe('Not found');
+  });
+
+  it('returns 500 and logs when the fetch throws', async () => {
+    const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
+    const fetch = vi.fn().mockRejectedValue(new Error('boom'));
+    const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+    const res = await GET(req('/x?id=7'));
+    expect(res.status).toBe(500);
+    const body = await res.json();
+    expect(body.error).toBe('Internal server error');
+    expect(consoleSpy).toHaveBeenCalledWith('Error fetching thing:', expect.any(Error));
+    consoleSpy.mockRestore();
+  });
+});
diff --git a/packages/app/src/app/api/v1/id-routes.ts b/packages/app/src/app/api/v1/id-routes.ts
new file mode 100644
index 00000000..fea9221b
--- /dev/null
+++ b/packages/app/src/app/api/v1/id-routes.ts
@@ -0,0 +1,85 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { cachedJson } from '@/lib/api-cache';
+
+/**
+ * Shared GET-handler factories for the agentic benchmark routes, which all
+ * key off `benchmark_results.id`. Two shapes exist:
+ *  - bulk `?ids=1,2,3` routes returning a map keyed by id
+ *  - single `?id=N` routes returning one payload or 404
+ *
+ * Both preserve the v1 error contract: 400 with `{error}` for bad params,
+ * 404 `{error: 'Not found'}` when a single-id lookup misses, and 500
+ * `{error: 'Internal server error'}` (with a console.error) on query failure.
+ * Success payloads go through `cachedJson` for CDN caching + gzip.
+ */
+
+/**
+ * Parse, dedupe, validate, and ascending-sort the `ids` query param.
+ * Sorted so the same set of ids in any order hits the same cache entry.
+ * Returns a NextResponse (400) when the param is missing, empty, or too long.
+ */
+export function parseIdsParam(request: NextRequest, maxIds: number): number[] | NextResponse {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isFinite(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > maxIds) {
+    return NextResponse.json({ error: `too many ids (max ${maxIds})` }, { status: 400 });
+  }
+  return ids.toSorted((a, b) => a - b);
+}
+
+/** Build a GET handler for a bulk `?ids=…` route. */
+export function idsQueryRoute<T>(options: {
+  maxIds: number;
+  /** Human-readable name used in the 500-path console.error. */
+  logLabel: string;
+  fetch: (ids: number[]) => Promise<T>;
+}): (request: NextRequest) => Promise<Response> {
+  const { maxIds, logLabel, fetch } = options;
+  return async (request: NextRequest) => {
+    const ids = parseIdsParam(request, maxIds);
+    if (ids instanceof NextResponse) return ids;
+    try {
+      return cachedJson(await fetch(ids));
+    } catch (error) {
+      console.error(`Error fetching ${logLabel}:`, error);
+      return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+    }
+  };
+}
+
+/** Build a GET handler for a single `?id=N` route (404 when the fetch misses). */
+export function idQueryRoute<T>(options: {
+  logLabel: string;
+  fetch: (id: number) => Promise<T | null>;
+}): (request: NextRequest) => Promise<Response> {
+  const { logLabel, fetch } = options;
+  return async (request: NextRequest) => {
+    const id = Number(request.nextUrl.searchParams.get('id'));
+    if (!id || !Number.isFinite(id)) {
+      return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+    }
+    try {
+      const data = await fetch(id);
+      if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+      return cachedJson(data);
+    } catch (error) {
+      console.error(`Error fetching ${logLabel}:`, error);
+      return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+    }
+  };
+}
diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
new file mode 100644
index 00000000..9a3750d6
--- /dev/null
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -0,0 +1,30 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getRequestTimeline,
+  type RequestTimeline,
+} from '@semianalysisai/inferencex-db/queries/request-timeline';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedRequestTimeline = cachedQuery(
+  (id: number): Promise<RequestTimeline | null> => getRequestTimeline(getDb(), id),
+  'request-timeline',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/request-timeline?id=N
+ *
+ * Returns the per-request Gantt timeline for one agentic benchmark point.
+ * Each request entry has ns-from-start offsets for credit/start/ack/end,
+ * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 404 if the
+ * point has no stored profile_export.jsonl blob.
+ */
+export const GET = idQueryRoute({
+  logLabel: 'request timeline',
+  fetch: getCachedRequestTimeline,
+});
diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts
new file mode 100644
index 00000000..45eafef4
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-availability/route.ts
@@ -0,0 +1,29 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceAvailability,
+  type TraceAvailabilityMap,
+} from '@semianalysisai/inferencex-db/queries/trace-availability';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceAvailability = cachedQuery(
+  (ids: number[]): Promise<TraceAvailabilityMap> => getTraceAvailability(getDb(), ids),
+  'trace-availability',
+);
+
+/**
+ * GET /api/v1/trace-availability?ids=1,2,3
+ *
+ * Returns `{[id]: true}` for ids that have a stored trace_replay blob.
+ * Lightweight presence check used by the scatter tooltip to decide whether
+ * to render the "View charts" button — see queries/trace-availability.ts.
+ */
+export const GET = idsQueryRoute({
+  maxIds: 500,
+  logLabel: 'trace availability',
+  fetch: getCachedTraceAvailability,
+});
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
new file mode 100644
index 00000000..131010ff
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -0,0 +1,34 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceHistograms,
+  type TraceHistogramMap,
+} from '@semianalysisai/inferencex-db/queries/trace-histograms';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB
+// unstable_cache limit (each point carries one int per request, ~500-1000+
+// requests for agentic), which manifests as a 500 from the route. Blob
+// storage lets us cache the larger response without losing the warm-cache hit.
+const getCachedTraceHistograms = cachedQuery(
+  (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
+  'trace-histograms',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-histograms?ids=1,2,3
+ *
+ * Returns per-request ISL/OSL arrays parsed from the stored aiperf
+ * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`.
+ * Ids without a trace_replay blob are omitted from the response.
+ */
+export const GET = idsQueryRoute({
+  maxIds: 200,
+  logLabel: 'trace histograms',
+  fetch: getCachedTraceHistograms,
+});
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
new file mode 100644
index 00000000..a759e6dc
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -0,0 +1,30 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceServerMetrics,
+  type TraceServerMetrics,
+} from '@semianalysisai/inferencex-db/queries/trace-server-metrics';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceServerMetrics = cachedQuery(
+  (id: number): Promise<TraceServerMetrics | null> => getTraceServerMetrics(getDb(), id),
+  'trace-server-metrics',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-server-metrics?id=N
+ *
+ * Returns parsed time-series for the agentic detail view: KV cache usage,
+ * prefix cache hit rate per interval, queue depth, and per-source prompt
+ * token rates. Times are in seconds from benchmark start. 404 if the point
+ * has no stored server_metrics_export.json blob.
+ */
+export const GET = idQueryRoute({
+  logLabel: 'trace server metrics',
+  fetch: getCachedTraceServerMetrics,
+});
diff --git a/packages/app/src/hooks/api/benchmark-id-query.test.ts b/packages/app/src/hooks/api/benchmark-id-query.test.ts
new file mode 100644
index 00000000..c7d951f4
--- /dev/null
+++ b/packages/app/src/hooks/api/benchmark-id-query.test.ts
@@ -0,0 +1,37 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { bulkIdsFetcher } from './benchmark-id-query';
+
+afterEach(() => {
+  vi.unstubAllGlobals();
+});
+
+describe('bulkIdsFetcher', () => {
+  it('returns an empty map without fetching for an empty id set', async () => {
+    const fetchMock = vi.fn();
+    vi.stubGlobal('fetch', fetchMock);
+
+    const result = await bulkIdsFetcher<true>('trace-availability')([]);
+    expect(result).toEqual({});
+    expect(fetchMock).not.toHaveBeenCalled();
+  });
+
+  it('fetches the endpoint with comma-joined ids and returns the parsed map', async () => {
+    const fetchMock = vi.fn().mockResolvedValue(Response.json({ 1: true, 3: true }));
+    vi.stubGlobal('fetch', fetchMock);
+
+    const result = await bulkIdsFetcher<true>('trace-availability')([1, 3]);
+    expect(result).toEqual({ 1: true, 3: true });
+    expect(fetchMock).toHaveBeenCalledWith('/api/v1/trace-availability?ids=1,3', {
+      signal: undefined,
+    });
+  });
+
+  it('throws with the endpoint name and status on a non-ok response', async () => {
+    vi.stubGlobal('fetch', vi.fn().mockResolvedValue(new Response('nope', { status: 500 })));
+
+    await expect(bulkIdsFetcher<true>('trace-histograms')([1])).rejects.toThrow(
+      'trace-histograms 500',
+    );
+  });
+});
diff --git a/packages/app/src/hooks/api/benchmark-id-query.ts b/packages/app/src/hooks/api/benchmark-id-query.ts
new file mode 100644
index 00000000..0aa50687
--- /dev/null
+++ b/packages/app/src/hooks/api/benchmark-id-query.ts
@@ -0,0 +1,59 @@
+import { useQuery } from '@tanstack/react-query';
+
+/**
+ * Shared React Query plumbing for the agentic endpoints keyed by
+ * `benchmark_results.id` (`/api/v1/<endpoint>?ids=…` bulk maps and
+ * `/api/v1/<endpoint>?id=N` single lookups).
+ *
+ * Conventions kept identical across all of these hooks:
+ *  - queryKey = [endpoint, sorted-deduped-ids-comma-joined] so any
+ *    permutation of the same id set hits the same cache entry
+ *  - staleTime = 5 minutes (the underlying blobs are immutable per run)
+ *  - bulk queries disabled for empty id sets; single queries 404 → null
+ */
+
+const STALE_TIME_MS = 5 * 60 * 1000;
+
+/** Build the standard bulk fetcher: GET `/api/v1/<endpoint>?ids=…` → map. */
+export function bulkIdsFetcher<T>(
+  endpoint: string,
+): (ids: number[], signal?: AbortSignal) => Promise<Record<number, T>> {
+  return async (ids, signal) => {
+    if (ids.length === 0) return {};
+    const res = await fetch(`/api/v1/${endpoint}?ids=${ids.join(',')}`, { signal });
+    if (!res.ok) throw new Error(`${endpoint} ${res.status}`);
+    return (await res.json()) as Record<number, T>;
+  };
+}
+
+/** Bulk map query over a set of benchmark_results ids. */
+export function useBulkIdsQuery<T>(
+  endpoint: string,
+  ids: number[],
+  enabled: boolean,
+  fetchByIds: (ids: number[], signal?: AbortSignal) => Promise<T>,
+) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: [endpoint, sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchByIds(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: STALE_TIME_MS,
+  });
+}
+
+/** Single-payload query for one benchmark_results id; 404 resolves to null. */
+export function useByIdQuery<T>(endpoint: string, id: number | null, enabled: boolean) {
+  return useQuery({
+    queryKey: [endpoint, id] as const,
+    queryFn: async ({ signal }): Promise<T | null> => {
+      if (!id) return null;
+      const res = await fetch(`/api/v1/${endpoint}?id=${id}`, { signal });
+      if (res.status === 404) return null;
+      if (!res.ok) throw new Error(`${endpoint} ${res.status}`);
+      return (await res.json()) as T;
+    },
+    enabled,
+    staleTime: STALE_TIME_MS,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts
new file mode 100644
index 00000000..7ca029cf
--- /dev/null
+++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts
@@ -0,0 +1,31 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export interface MetricPercentiles {
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  n: number;
+}
+
+export interface AgenticAggregate {
+  id: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>;
+
+const fetchAgenticAggregates = bulkIdsFetcher<AgenticAggregate>('agentic-aggregates');
+
+/**
+ * Fetch per-id aggregate stats (mean/p50/p75/p90/p99) for ISL, OSL, KV
+ * cache utilization, and prefix cache hit rate. Used by the "Aggregates
+ * across configs" view on the agentic detail page.
+ */
+export function useAgenticAggregates(ids: number[], enabled = true) {
+  return useBulkIdsQuery('agentic-aggregates', ids, enabled, fetchAgenticAggregates);
+}
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
new file mode 100644
index 00000000..58469c26
--- /dev/null
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -0,0 +1,44 @@
+import { useByIdQuery } from './benchmark-id-query';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  offload_mode: string | null;
+  decode_tp: number;
+  decode_ep: number;
+  decode_dp_attention: boolean;
+  decode_num_workers: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  prefill_dp_attention: boolean;
+  prefill_num_workers: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  is_multinode: boolean;
+  tput_per_gpu: number | null;
+  total_requests: number | null;
+  is_current: boolean;
+  has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  github_run_id: number;
+  date: string;
+  dataset_slug: string | null;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export function useBenchmarkSiblings(id: number | null) {
+  return useByIdQuery<BenchmarkSiblings>('benchmark-siblings', id, id !== null && id > 0);
+}
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index a8d634f1..095cf192 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -28,6 +28,14 @@ export function benchmarkQueryOptions(
   };
 }
 
-export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
-  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
+export function useBenchmarks(
+  model: string,
+  date?: string,
+  enabled = true,
+  runId?: string,
+  exactRun?: boolean,
+) {
+  return useQuery(
+    benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId, exactRun),
+  );
 }
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
new file mode 100644
index 00000000..ea1b17cf
--- /dev/null
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -0,0 +1,199 @@
+import { useQuery, keepPreviousData } from '@tanstack/react-query';
+
+import type {
+  ConversationStructure,
+  StructureNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+export type { ConversationStructure, StructureNode };
+
+export interface DatasetSummary {
+  blockSize?: number;
+  hashIdScope?: string | null;
+  totalIn?: number;
+  totalOut?: number;
+  totalCached?: number;
+  cachedPct?: number;
+  mainTurns?: number;
+  subagentGroups?: number;
+  subagentTurns?: number;
+  meanRequestsPerConversation?: number;
+  medianRequestsPerConversation?: number;
+  meanSubagentsPerTrace?: number;
+  medianSubagentsPerTrace?: number;
+  modelMix?: Record<string, number>;
+  [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: DatasetSummary;
+  ingested_at: string;
+}
+
+export interface HistogramBin {
+  x0: number;
+  x1: number;
+  count: number;
+}
+
+export interface DistributionStats {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  /** Added in chart_data v2. */
+  p75?: number;
+  p90: number;
+  /** Added in chart_data v2. */
+  p95?: number;
+}
+
+export interface Distribution {
+  bins: HistogramBin[];
+  stats: DistributionStats;
+}
+
+export interface DatasetChartData {
+  version?: number;
+  inputTokensPerTurn?: Distribution;
+  uncachedInputTokensPerTurn?: Distribution;
+  outputTokensPerTurn?: Distribution;
+  subagentInputTokensPerRequest?: Distribution;
+  subagentOutputTokensPerRequest?: Distribution;
+  turnsPerConversation?: Distribution;
+  subagentGroupsPerConversation?: Distribution;
+  cachedFractionPerTurn?: Distribution;
+  [k: string]: unknown;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+  chart_data: DatasetChartData;
+}
+
+export interface ConversationListItem {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+}
+
+export interface ConversationList {
+  total: number;
+  items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: ConversationStructure;
+}
+
+export type ConversationSort = 'tokens' | 'turns' | 'subagents' | 'id';
+
+// Dataset contents only change on (rare) re-ingest, so cache aggressively.
+const DAY = 24 * 60 * 60 * 1000;
+
+/** Shared fetch for the per-dataset endpoints: 404 → null, other errors throw. */
+async function fetchJsonOr404<T>(
+  url: string,
+  label: string,
+  signal: AbortSignal,
+): Promise<T | null> {
+  const res = await fetch(url, { signal });
+  if (res.status === 404) return null;
+  if (!res.ok) throw new Error(`${label} ${res.status}`);
+  return (await res.json()) as T;
+}
+
+/** All ingested datasets (registry cards). */
+export function useDatasets() {
+  return useQuery({
+    queryKey: ['datasets'] as const,
+    queryFn: async ({ signal }) => {
+      const res = await fetch('/api/v1/datasets', { signal });
+      if (!res.ok) throw new Error(`datasets ${res.status}`);
+      return (await res.json()) as DatasetRecord[];
+    },
+    staleTime: DAY,
+  });
+}
+
+/** One dataset incl. chart_data. */
+export function useDataset(slug: string | null) {
+  return useQuery({
+    queryKey: ['dataset', slug] as const,
+    queryFn: ({ signal }) =>
+      fetchJsonOr404<DatasetDetail>(`/api/v1/datasets/${slug}`, 'dataset', signal),
+    enabled: Boolean(slug),
+    staleTime: DAY,
+  });
+}
+
+export interface UseConversationsArgs {
+  slug: string | null;
+  search?: string;
+  limit?: number;
+  offset?: number;
+  sort?: ConversationSort;
+}
+
+/** Paginated conversation list for a dataset (counts only). */
+export function useDatasetConversations({
+  slug,
+  search = '',
+  limit = 50,
+  offset = 0,
+  sort = 'tokens',
+}: UseConversationsArgs) {
+  return useQuery({
+    queryKey: ['dataset-conversations', slug, search, limit, offset, sort] as const,
+    queryFn: ({ signal }) => {
+      const qs = new URLSearchParams({
+        limit: String(limit),
+        offset: String(offset),
+        sort,
+      });
+      if (search) qs.set('search', search);
+      return fetchJsonOr404<ConversationList>(
+        `/api/v1/datasets/${slug}/conversations?${qs.toString()}`,
+        'dataset-conversations',
+        signal,
+      );
+    },
+    enabled: Boolean(slug),
+    placeholderData: keepPreviousData,
+    staleTime: DAY,
+  });
+}
+
+/** One conversation's flamegraph structure. */
+export function useDatasetConversation(slug: string | null, convId: string | null) {
+  return useQuery({
+    queryKey: ['dataset-conversation', slug, convId] as const,
+    queryFn: ({ signal }) =>
+      fetchJsonOr404<ConversationDetail>(
+        `/api/v1/datasets/${slug}/conversations/${encodeURIComponent(convId ?? '')}`,
+        'dataset-conversation',
+        signal,
+      ),
+    enabled: Boolean(slug) && Boolean(convId),
+    staleTime: DAY,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..2e54f418
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
@@ -0,0 +1,13 @@
+import { describe, expect, it } from 'vitest';
+
+import { chunkDerivedAgenticMetricIds } from './use-derived-agentic-metrics';
+
+describe('chunkDerivedAgenticMetricIds', () => {
+  it('keeps every id while respecting the API limit', () => {
+    const ids = Array.from({ length: 401 }, (_, index) => index + 1);
+    const chunks = chunkDerivedAgenticMetricIds(ids);
+
+    expect(chunks.map((chunk) => chunk.length)).toEqual([200, 200, 1]);
+    expect(chunks.flat()).toEqual(ids);
+  });
+});
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
new file mode 100644
index 00000000..388563d9
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -0,0 +1,55 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export interface DerivedAgenticMetric {
+  id: number;
+  /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
+   *  by mean_load / session_load. Null when the JSONL had no usable records. */
+  normalized_session_time_s: number | null;
+  /** P90 of per-turn ISL/TTFT across every turn in every session.
+   *  Null when no prefill rates could be computed. */
+  p90_prefill_tps_per_user: number | null;
+  /** P75 normalized per-request E2E at a fixed 400-token output length. */
+  p75_normalized_e2e_400_s: number | null;
+  /** P90 normalized per-request E2E at a fixed 400-token output length. */
+  p90_normalized_e2e_400_s: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+const MAX_IDS_PER_REQUEST = 200;
+
+export function chunkDerivedAgenticMetricIds(ids: number[]): number[][] {
+  const chunks: number[][] = [];
+  for (let i = 0; i < ids.length; i += MAX_IDS_PER_REQUEST) {
+    chunks.push(ids.slice(i, i + MAX_IDS_PER_REQUEST));
+  }
+  return chunks;
+}
+
+const fetchChunk = bulkIdsFetcher<DerivedAgenticMetric>('derived-agentic-metrics');
+
+// Unlike the other bulk endpoints, dashboards can put >200 agentic points on
+// screen at once, so this fetcher splits the id set across parallel requests
+// to stay under the route's MAX_IDS_PER_REQUEST.
+async function fetchDerivedAgenticMetrics(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<DerivedAgenticMetricMap> {
+  if (ids.length === 0) return {};
+  const maps = await Promise.all(
+    chunkDerivedAgenticMetricIds(ids).map((chunk) => fetchChunk(chunk, signal)),
+  );
+  return Object.assign({}, ...maps) as DerivedAgenticMetricMap;
+}
+
+/**
+ * Fetch per-id derived agentic metrics (session time + p90 prefill TPS/user)
+ * computed live from the stored aiperf profile_export.jsonl. Used to drive
+ * the "Session Time" and "Prefill TPS/user" chart variants.
+ *
+ * Ids without a trace_replay blob (older or non-aiperf agentic runs) are
+ * silently omitted from the response.
+ */
+export function useDerivedAgenticMetrics(ids: number[], enabled = true) {
+  return useBulkIdsQuery('derived-agentic-metrics', ids, enabled, fetchDerivedAgenticMetrics);
+}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
new file mode 100644
index 00000000..6f43de25
--- /dev/null
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -0,0 +1,53 @@
+import { useByIdQuery } from './benchmark-id-query';
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session). */
+  cid: string;
+  /** Zero-based turn index within the conversation. */
+  ti: number;
+  /** Source trace id from the original raw dataset, when provided by AIPerf. */
+  srcTrace?: string;
+  /** Original raw top-level request index within srcTrace. */
+  srcOuter?: number;
+  /** Original nested request index within srcOuter, for subagent children. */
+  srcInner?: number;
+  /** Loader-specific source kind, e.g. weka_main or weka_flat. */
+  srcKind?: string;
+  /** Worker id (concurrency slot that handled this request). */
+  wid: string;
+  /** Sub-agent depth (0 = top-level). */
+  ad: number;
+  /** `warmup` or `profiling`. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
+  isl: number | null;
+  osl: number | null;
+  cancelled: boolean;
+}
+
+export interface RequestTimeline {
+  version: number;
+  startNs: number;
+  endNs: number;
+  durationS: number;
+  requests: RequestRecord[];
+}
+
+/**
+ * Lazy-fetch the per-request Gantt timeline for one agentic point.
+ * Enabled only when the caller opts in (e.g. the timeline view becomes
+ * active), so the payload (~30 KB per point) isn't paid for every page load.
+ */
+export function useRequestTimeline(id: number | null, enabled = false) {
+  return useByIdQuery<RequestTimeline>('request-timeline', id, enabled && Boolean(id));
+}
diff --git a/packages/app/src/hooks/api/use-trace-availability.ts b/packages/app/src/hooks/api/use-trace-availability.ts
new file mode 100644
index 00000000..24e4c067
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-availability.ts
@@ -0,0 +1,15 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export type TraceAvailabilityMap = Record<number, true>;
+
+const fetchTraceAvailability = bulkIdsFetcher<true>('trace-availability');
+
+/**
+ * Bulk presence lookup: which of the given `benchmark_results.id`s have a
+ * stored trace_replay blob. Used by the scatter chart to decide whether to
+ * surface the "View charts" button — cheap boolean per id instead of
+ * shipping multi-MB profile blobs just for the check.
+ */
+export function useTraceAvailability(ids: number[], enabled = true) {
+  return useBulkIdsQuery('trace-availability', ids, enabled, fetchTraceAvailability);
+}
diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts
new file mode 100644
index 00000000..8197147a
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-histograms.ts
@@ -0,0 +1,25 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export interface TraceHistogramPoint {
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+const fetchTraceHistograms = bulkIdsFetcher<TraceHistogramPoint>('trace-histograms');
+
+/**
+ * Fetch per-request ISL/OSL arrays for a set of benchmark_results.id values.
+ * Ids without a stored trace_replay blob are silently omitted from the response.
+ *
+ * Caller passes the agentic id set currently on screen; React Query handles
+ * dedup + stale-while-revalidate. Cache key is sorted-ids-comma-joined so
+ * any permutation of the same set hits the same cache entry.
+ */
+export function useTraceHistograms(ids: number[], enabled = true) {
+  return useBulkIdsQuery('trace-histograms', ids, enabled, fetchTraceHistograms);
+}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
new file mode 100644
index 00000000..47cf66a6
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -0,0 +1,96 @@
+import { useByIdQuery } from './benchmark-id-query';
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  run_url: string | null;
+  server_gpu_cache_hit_rate: number | null;
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+
+export interface MetricSource {
+  id: string;
+  adapter: string;
+  role: MetricSourceRole;
+  endpointUrl: string | null;
+  nativeRole: string | null;
+  workerId: string | null;
+  dpRank: string | null;
+  engine: string | null;
+}
+
+export interface MetricSourceSeries {
+  source: MetricSource;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  promptTps: TimeSeriesPoint[];
+  generationTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+export interface TraceServerMetrics {
+  meta: PointMeta;
+  startNs: number;
+  endNs: number;
+  durationS: number;
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+   * the cluster-average `kvCacheUsage` line covers that case alone.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * Total KV-cache pool size in tokens (num_gpu_blocks × block_size, summed
+   * across engines). vLLM only — null for SGLang/TRT or older rows.
+   */
+  kvCachePoolTokens: number | null;
+  /** Orchestrator-normalized metrics grouped by endpoint/worker. */
+  metricSources: MetricSourceSeries[];
+}
+
+/**
+ * Lazy-fetch parsed server-metric time-series for one agentic point.
+ * Enabled only when the caller passes `enabled=true` (the detail panel opens),
+ * so we don't pay the parse cost on every hover.
+ */
+export function useTraceServerMetrics(id: number | null, enabled = false) {
+  return useByIdQuery<TraceServerMetrics>('trace-server-metrics', id, enabled && Boolean(id));
+}
diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts
index 49812c3e..be095430 100644
--- a/packages/app/src/hooks/useChartContext.ts
+++ b/packages/app/src/hooks/useChartContext.ts
@@ -37,6 +37,12 @@ export function reconcileActiveSet<T>(
 interface UseChartStateConfig {
   /** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */
   urlPrefix: string;
+  /**
+   * Initial high-contrast value when the URL has no `<prefix>hc` param.
+   * Defaults to false; the inference chart opts in to true. A `<prefix>hc=0`
+   * URL param overrides it back off.
+   */
+  defaultHighContrast?: boolean;
 }
 
 /**
@@ -44,7 +50,7 @@ interface UseChartStateConfig {
  * Includes mobile-specific legend collapse behavior.
  */
 export function useChartUIState(config: UseChartStateConfig) {
-  const { urlPrefix } = config;
+  const { urlPrefix, defaultHighContrast = false } = config;
   const { getUrlParam } = useUrlState();
 
   const hcParam = `${urlPrefix}hc` as any;
@@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) {
 
   // Initialize with safe defaults that match SSR output to avoid hydration mismatches.
   // URL-param values are applied in a mount effect so the state is only set client-side.
-  const [highContrast, setHighContrast] = useState(false);
+  const [highContrast, setHighContrast] = useState(defaultHighContrast);
   const [isLegendExpanded, setIsLegendExpanded] = useState(true);
   const didInit = useRef(false);
 
@@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) {
     if (didInit.current) return;
     didInit.current = true;
     const hcVal = getUrlParam(hcParam);
+    // Respect both overrides so the toggle round-trips regardless of the default.
     if (hcVal === '1') setHighContrast(true);
+    else if (hcVal === '0') setHighContrast(false);
     const legendVal = getUrlParam(legendParam);
     if (legendVal === '0') setIsLegendExpanded(false);
   }, [getUrlParam, hcParam, legendParam]);
diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts
index 7275e384..11050d19 100644
--- a/packages/app/src/hooks/useThemeColors.test.ts
+++ b/packages/app/src/hooks/useThemeColors.test.ts
@@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => {
     }
     unmountOn();
   });
+
+  // Regression: deselecting a legend line must not recolor the remaining lines.
+  // The HC palette is sized/indexed by the key set it's generated over, so when
+  // it was generated over the *active* subset (no hcKeys), shrinking the
+  // selection re-sized the palette and shifted every remaining line's hue (most
+  // visible on single-vendor agentic runs spanning the full wheel). Passing a
+  // stable `hcKeys` (the full set with data) fixes each line's color.
+  it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => {
+    const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison
+
+    const all = renderHook<UseThemeColorsResult>(() =>
+      useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }),
+    );
+    const b200WithBoth = all.result.current.resolveColor('b200');
+    const b300Color = all.result.current.resolveColor('b300');
+    all.unmount();
+
+    // b300 deselected → only b200 active, but hcKeys is still the full set.
+    const subset = renderHook<UseThemeColorsResult>(() =>
+      useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }),
+    );
+    const b200Alone = subset.result.current.resolveColor('b200');
+    subset.unmount();
+
+    expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu);
+    expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues
+    expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200
+  });
 });

From bd3089418bffa12540cf864d9f52a7be0544066b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:12:03 -0500
Subject: [PATCH 07/40] =?UTF-8?q?feat(datasets):=20dataset=20browser=20?=
 =?UTF-8?q?=E2=80=94=20conversation=20flamegraph,=20distributions,=20deep-?=
 =?UTF-8?q?link=20targets?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../[slug]/conversations/[convId]/page.tsx    |  35 ++
 packages/app/src/app/datasets/[slug]/page.tsx |  32 ++
 packages/app/src/app/datasets/page.tsx        |  99 ++++
 .../components/datasets/conversation-view.tsx | 109 +++++
 .../components/datasets/dataset-detail.tsx    | 312 +++++++++++++
 .../src/components/datasets/dataset-list.tsx  |  86 ++++
 .../components/datasets/distribution-card.tsx | 220 +++++++++
 .../app/src/components/datasets/format.ts     |  28 ++
 packages/app/src/components/datasets/stat.tsx |   9 +
 .../datasets/trace-flamegraph-model.ts        | 422 +++++++++++++++++
 .../datasets/trace-flamegraph.test.ts         | 246 ++++++++++
 .../components/datasets/trace-flamegraph.tsx  | 439 ++++++++++++++++++
 12 files changed, 2037 insertions(+)
 create mode 100644 packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
 create mode 100644 packages/app/src/app/datasets/[slug]/page.tsx
 create mode 100644 packages/app/src/app/datasets/page.tsx
 create mode 100644 packages/app/src/components/datasets/conversation-view.tsx
 create mode 100644 packages/app/src/components/datasets/dataset-detail.tsx
 create mode 100644 packages/app/src/components/datasets/dataset-list.tsx
 create mode 100644 packages/app/src/components/datasets/distribution-card.tsx
 create mode 100644 packages/app/src/components/datasets/format.ts
 create mode 100644 packages/app/src/components/datasets/stat.tsx
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph-model.ts
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph.test.ts
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph.tsx

diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
new file mode 100644
index 00000000..83eb56a0
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -0,0 +1,35 @@
+import { Suspense } from 'react';
+import type { Metadata } from 'next';
+
+import { ConversationView } from '@/components/datasets/conversation-view';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+interface Props {
+  params: Promise<{ slug: string; convId: string }>;
+}
+
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug, convId } = await params;
+  const short = convId.slice(0, 12);
+  const title = `Conversation ${short} | ${slug}`;
+  const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`;
+  return {
+    title,
+    description,
+    alternates: { canonical: `${SITE_URL}/datasets/${slug}/conversations/${convId}` },
+    robots: { index: false }, // per-conversation pages are too numerous to index
+  };
+}
+
+export default async function ConversationPage({ params }: Props) {
+  const { slug, convId } = await params;
+  return (
+    <main className="relative">
+      <div className="container mx-auto px-4 pb-8 lg:px-8">
+        <Suspense>
+          <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+        </Suspense>
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx
new file mode 100644
index 00000000..f32e3fa6
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/page.tsx
@@ -0,0 +1,32 @@
+import type { Metadata } from 'next';
+
+import { DatasetDetail } from '@/components/datasets/dataset-detail';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+interface Props {
+  params: Promise<{ slug: string }>;
+}
+
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug } = await params;
+  const title = `${slug} | Agentic Datasets`;
+  const description = `Distributions, token statistics, and per-conversation flamegraphs for the ${slug} agentic trace dataset.`;
+  return {
+    title,
+    description,
+    alternates: { canonical: `${SITE_URL}/datasets/${slug}` },
+    openGraph: { title: `${title} | InferenceX`, description, url: `${SITE_URL}/datasets/${slug}` },
+    twitter: { title: `${title} | InferenceX`, description },
+  };
+}
+
+export default async function DatasetDetailPage({ params }: Props) {
+  const { slug } = await params;
+  return (
+    <main className="relative">
+      <div className="container mx-auto px-4 pb-8 lg:px-8">
+        <DatasetDetail slug={slug} />
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx
new file mode 100644
index 00000000..7fe46b93
--- /dev/null
+++ b/packages/app/src/app/datasets/page.tsx
@@ -0,0 +1,99 @@
+import type { Metadata } from 'next';
+
+import { Card } from '@/components/ui/card';
+import { JsonLd } from '@/components/json-ld';
+import { DatasetList } from '@/components/datasets/dataset-list';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+const DESCRIPTION =
+  'The real Claude Code agentic conversation traces that the InferenceX agentic benchmark replays — methodology, distributions, and per-conversation flamegraphs.';
+
+export const metadata: Metadata = {
+  title: 'Agentic Datasets',
+  description: DESCRIPTION,
+  alternates: { canonical: `${SITE_URL}/datasets` },
+  openGraph: {
+    title: 'Agentic Datasets | InferenceX',
+    description: DESCRIPTION,
+    url: `${SITE_URL}/datasets`,
+  },
+  twitter: { title: 'Agentic Datasets | InferenceX', description: DESCRIPTION },
+};
+
+const jsonLd = {
+  '@context': 'https://schema.org',
+  '@type': 'CollectionPage',
+  name: 'InferenceX Agentic Datasets',
+  description: DESCRIPTION,
+  url: `${SITE_URL}/datasets`,
+};
+
+export default function DatasetsPage() {
+  return (
+    <main className="relative">
+      <JsonLd data={jsonLd} />
+      <div className="container mx-auto flex flex-col gap-6 px-4 pb-8 lg:px-8">
+        <section>
+          <Card>
+            <h1 className="mb-2 text-xl font-semibold text-foreground">
+              Agentic Benchmark Datasets
+            </h1>
+            <p className="mb-3 text-sm text-muted-foreground">
+              InferenceX&apos;s agentic benchmark doesn&apos;t replay synthetic prompts — it replays
+              real Claude Code coding sessions captured as <strong>conversation traces</strong>.
+              Each trace is a full multi-turn session: the main agent&apos;s turns plus any
+              subagents it spawned, with per-turn input/output token counts and the 64-token
+              KV-cache block hashes needed to reconstruct prefix-cache reuse. The traces are
+              published openly on HuggingFace under <code>semianalysisai/cc-traces-weka-*</code>{' '}
+              (apache-2.0).
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">
+              How traces are captured
+            </h2>
+            <p className="mb-3 text-sm text-muted-foreground">
+              Production Claude Code sessions are recorded through a logging proxy that captures
+              every API request: its input and output token counts, the model used, timing (TTFT,
+              inter-token latency), and a list of <code>hash_ids</code> — one per 64-token KV block
+              of the request&apos;s input. Subagent invocations are grouped under their parent turn.
+              No prompt or completion text is stored; only token counts and block hashes, so the
+              corpus is shareable while remaining a faithful workload for replay.
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">
+              Cached prefix vs uncached suffix
+            </h2>
+            <p className="mb-3 text-sm text-muted-foreground">
+              Agentic workloads are dominated by prefix reuse: each turn resends the growing
+              conversation, so most of its input is already in the KV cache from prior turns. We
+              reconstruct this exactly. Walking a conversation in order under an idealized infinite
+              cache, a turn&apos;s <strong>cached prefix</strong> is its longest run of leading{' '}
+              <code>hash_ids</code> already seen; the rest is the <strong>uncached suffix</strong>{' '}
+              that must be (re)computed. Blocks are 64 tokens; the split is clamped so cached +
+              uncached equals the turn&apos;s effective input even on a partial final block.
+              Subagents run against a snapshot of the parent cache at spawn (their context is
+              separate and is not folded back into the parent).
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">Dataset variants</h2>
+            <ul className="mb-1 list-disc space-y-1 pl-5 text-sm text-muted-foreground">
+              <li>
+                <strong>full</strong> — every captured request, unmodified.
+              </li>
+              <li>
+                <strong>256k</strong> — requests whose input + output exceeds 256,000 tokens are
+                dropped so every turn fits a 256k context window (used when benchmarking engines
+                configured for a 256k max context).
+              </li>
+            </ul>
+          </Card>
+        </section>
+
+        <section className="flex flex-col gap-3">
+          <h2 className="text-lg font-semibold text-foreground">Datasets</h2>
+          <DatasetList />
+        </section>
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
new file mode 100644
index 00000000..415a430d
--- /dev/null
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -0,0 +1,109 @@
+'use client';
+
+import Link from 'next/link';
+import { useSearchParams } from 'next/navigation';
+
+import { Card } from '@/components/ui/card';
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import { useDatasetConversation } from '@/hooks/api/use-datasets';
+import { compact, formatShare } from './format';
+import { Stat } from './stat';
+
+export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
+  const { data, isLoading, isError } = useDatasetConversation(slug, convId);
+
+  // Deep-link target from a request-timeline click: ?raw=<outerIdx> or ?turn=<ti>[&sa=<agentId>].
+  // useSearchParams (not a one-shot window.location read) so the params are
+  // present on the very first client-side navigation, not just after a reload.
+  const params = useSearchParams();
+  const turnRaw = params.get('turn');
+  const sourceRaw = params.get('raw');
+  const sourceInner = params.get('inner');
+  const highlight = {
+    turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null,
+    raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null,
+    inner: sourceInner !== null && /^\d+$/u.test(sourceInner) ? Number(sourceInner) : null,
+    agent: params.get('sa'),
+  };
+
+  if (isLoading) {
+    return (
+      <div className="py-12 text-center text-sm text-muted-foreground">Loading conversation…</div>
+    );
+  }
+  if (isError || !data) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">
+        Conversation not found.{' '}
+        <Link href={`/datasets/${slug}`} className="text-primary underline">
+          Back to dataset
+        </Link>
+      </div>
+    );
+  }
+
+  const cachedPct = formatShare(data.total_cached, data.total_in);
+
+  return (
+    <div className="flex flex-col gap-6">
+      <div>
+        <div className="mb-1 flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
+          <Link href="/datasets" className="hover:text-foreground">
+            Datasets
+          </Link>
+          <span>/</span>
+          <Link href={`/datasets/${slug}`} className="hover:text-foreground">
+            {slug}
+          </Link>
+          <span>/</span>
+          <span className="text-foreground">conversation</span>
+        </div>
+        <h1 className="break-all font-mono text-lg font-semibold text-foreground">
+          {data.conv_id}
+        </h1>
+        {data.models.length > 0 && (
+          <div className="mt-2 flex flex-wrap gap-2">
+            {data.models.map((m) => (
+              <span
+                key={m}
+                className="rounded-md border border-border/40 px-2 py-0.5 text-xs text-foreground"
+              >
+                {m}
+              </span>
+            ))}
+          </div>
+        )}
+      </div>
+
+      <Card className="p-4">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-6">
+          <Stat label="Main turns" value={String(data.num_turns)} />
+          <Stat label="Subagent groups" value={String(data.num_subagent_groups)} />
+          <Stat label="Input" value={`${compact(data.total_in)} tok`} />
+          <Stat label="Output" value={`${compact(data.total_out)} tok`} />
+          <Stat label="Cached" value={`${compact(data.total_cached)} tok`} />
+          <Stat label="Cached %" value={cachedPct} />
+        </dl>
+      </Card>
+
+      <Card className="p-4">
+        <h2 className="mb-3 text-lg font-semibold text-foreground">Flamegraph</h2>
+        <p className="mb-4 text-xs text-muted-foreground">
+          One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
+          click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
+          plus generated output. Timestamps are elapsed from conversation start; subagent headers
+          show their full active range. A colored bracket on the left groups requests in the same
+          main-agent or subagent scope whose original execution intervals overlapped (ran in
+          parallel).
+        </p>
+        <TraceFlamegraph
+          structure={data.structure}
+          highlightTurn={highlight.turn}
+          highlightRawIndex={highlight.raw}
+          highlightInnerIndex={highlight.inner}
+          highlightAgentId={highlight.agent}
+        />
+      </Card>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
new file mode 100644
index 00000000..051e7457
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -0,0 +1,312 @@
+'use client';
+
+import { useState } from 'react';
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import {
+  useDataset,
+  useDatasetConversations,
+  type ConversationSort,
+} from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+import { compact, formatPct, formatShare, perConversation } from './format';
+import { Stat } from './stat';
+
+const PAGE = 50;
+
+const SORTS: { value: ConversationSort; label: string }[] = [
+  { value: 'tokens', label: 'Total input ↓' },
+  { value: 'turns', label: 'Turns ↓' },
+  { value: 'subagents', label: 'Subagent groups ↓' },
+  { value: 'id', label: 'Conversation ID' },
+];
+
+export function DatasetDetail({ slug }: { slug: string }) {
+  const { data: dataset, isLoading, isError } = useDataset(slug);
+  const [search, setSearch] = useState('');
+  const [sort, setSort] = useState<ConversationSort>('tokens');
+  const [page, setPage] = useState(0);
+
+  const { data: convs, isFetching } = useDatasetConversations({
+    slug,
+    search,
+    sort,
+    limit: PAGE,
+    offset: page * PAGE,
+  });
+
+  if (isLoading) {
+    return <div className="py-12 text-center text-sm text-muted-foreground">Loading dataset…</div>;
+  }
+  if (isError || !dataset) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">
+        Dataset not found.{' '}
+        <Link href="/datasets" className="text-primary underline">
+          Back to datasets
+        </Link>
+      </div>
+    );
+  }
+
+  const s = dataset.summary ?? {};
+  const cd = dataset.chart_data ?? {};
+  const total = convs?.total ?? 0;
+  const pageCount = Math.ceil(total / PAGE);
+
+  return (
+    <div className="flex flex-col gap-6">
+      {/* header */}
+      <div>
+        <div className="mb-1 flex items-center gap-2">
+          <Link href="/datasets" className="text-xs text-muted-foreground hover:text-foreground">
+            ← Datasets
+          </Link>
+        </div>
+        <div className="flex flex-wrap items-baseline justify-between gap-2">
+          <h1 className="text-2xl font-semibold text-foreground">{dataset.label}</h1>
+          <div className="flex items-center gap-2 text-xs">
+            <span className="rounded-full border border-border/50 px-2 py-0.5 uppercase tracking-wide text-muted-foreground">
+              {dataset.variant}
+            </span>
+            {dataset.hf_url && (
+              <a
+                href={dataset.hf_url}
+                target="_blank"
+                rel="noopener noreferrer"
+                onClick={() => track('datasets_hf_link_clicked', { slug })}
+                className="text-primary hover:underline"
+              >
+                View on HuggingFace ↗
+              </a>
+            )}
+          </div>
+        </div>
+        {dataset.description && (
+          <p className="mt-2 max-w-3xl text-sm text-muted-foreground">{dataset.description}</p>
+        )}
+      </div>
+
+      {/* summary stats */}
+      <Card className="p-4">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-4 lg:grid-cols-8">
+          <Stat label="Conversations" value={dataset.conversation_count.toLocaleString()} />
+          <Stat
+            label="Median requests / convo"
+            value={perConversation(s.medianRequestsPerConversation)}
+          />
+          <Stat
+            label="Mean requests / convo"
+            value={perConversation(s.meanRequestsPerConversation)}
+          />
+          <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
+          <Stat
+            label="Median subagents / trace"
+            value={perConversation(s.medianSubagentsPerTrace)}
+          />
+          <Stat label="Mean subagents / trace" value={perConversation(s.meanSubagentsPerTrace)} />
+          <Stat label="Cached input" value={formatPct(s.cachedPct)} />
+          <Stat label="Total tokens" value={compact((s.totalIn ?? 0) + (s.totalOut ?? 0))} />
+        </dl>
+        {s.modelMix && Object.keys(s.modelMix).length > 0 && (
+          <div className="mt-4 border-t border-border/40 pt-3">
+            <div className="mb-1.5 text-xs font-medium text-muted-foreground">
+              Model mix (turns)
+            </div>
+            <div className="flex flex-wrap gap-2">
+              {Object.entries(s.modelMix)
+                .toSorted((a, b) => b[1] - a[1])
+                .map(([model, count]) => (
+                  <span
+                    key={model}
+                    className="rounded-md border border-border/40 px-2 py-0.5 text-xs text-foreground"
+                  >
+                    {model} <span className="text-muted-foreground">{compact(count)}</span>
+                  </span>
+                ))}
+            </div>
+          </div>
+        )}
+      </Card>
+
+      {/* distribution cards */}
+      <section className="flex flex-col gap-3">
+        <h2 className="text-lg font-semibold text-foreground">Distributions</h2>
+        <div className="grid gap-4 lg:grid-cols-2">
+          <DistributionCard
+            title="Input tokens per turn"
+            unit="tokens"
+            scale="log"
+            distribution={cd.inputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Output tokens per turn"
+            unit="tokens"
+            scale="log"
+            distribution={cd.outputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Uncached input tokens per request"
+            unit="tokens"
+            scale="log"
+            distribution={cd.uncachedInputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Turns per conversation"
+            unit="turns"
+            distribution={cd.turnsPerConversation}
+          />
+          <DistributionCard
+            title="Subagent request ISL"
+            subtitle="Inner subagent requests only"
+            unit="tokens"
+            scale="log"
+            distribution={cd.subagentInputTokensPerRequest}
+          />
+          <DistributionCard
+            title="Subagent request OSL"
+            subtitle="Inner subagent requests only"
+            unit="tokens"
+            scale="log"
+            distribution={cd.subagentOutputTokensPerRequest}
+          />
+          <DistributionCard
+            title="Cached fraction per turn"
+            unit=""
+            distribution={cd.cachedFractionPerTurn}
+            formatValue={formatPct}
+          />
+        </div>
+      </section>
+
+      {/* conversation list */}
+      <section className="flex flex-col gap-3">
+        <div className="flex flex-wrap items-center justify-between gap-3">
+          <h2 className="text-lg font-semibold text-foreground">
+            Conversations{' '}
+            <span className="text-sm font-normal text-muted-foreground">({total})</span>
+          </h2>
+          <div className="flex items-center gap-2">
+            <input
+              type="text"
+              value={search}
+              onChange={(e) => {
+                setSearch(e.target.value);
+                setPage(0);
+              }}
+              placeholder="Search by ID…"
+              className="h-8 w-40 rounded-md border border-border/40 bg-background px-2 text-xs outline-none focus:border-primary"
+            />
+            <Select
+              value={sort}
+              onValueChange={(v) => {
+                setSort(v as ConversationSort);
+                setPage(0);
+                track('datasets_conversations_sorted', { mode: v });
+              }}
+            >
+              <SelectTrigger className="h-8 w-[12rem] text-xs" aria-label="Sort conversations">
+                <SelectValue />
+              </SelectTrigger>
+              <SelectContent>
+                {SORTS.map((o) => (
+                  <SelectItem key={o.value} value={o.value} className="text-xs">
+                    {o.label}
+                  </SelectItem>
+                ))}
+              </SelectContent>
+            </Select>
+          </div>
+        </div>
+
+        <Card className="overflow-hidden p-0">
+          <table className="w-full text-sm">
+            <thead className="border-b border-border/40 bg-muted/30 text-xs text-muted-foreground">
+              <tr>
+                <th className="px-3 py-2 text-left font-medium">Conversation</th>
+                <th className="px-3 py-2 text-right font-medium">Turns</th>
+                <th className="px-3 py-2 text-right font-medium">Subagents</th>
+                <th className="px-3 py-2 text-right font-medium">Input</th>
+                <th className="px-3 py-2 text-right font-medium">Output</th>
+                <th className="px-3 py-2 text-right font-medium">Cached</th>
+              </tr>
+            </thead>
+            <tbody>
+              {(convs?.items ?? []).map((c) => {
+                const cachedPct = formatShare(c.total_cached, c.total_in);
+                return (
+                  <tr
+                    key={c.conv_id}
+                    className="border-b border-border/20 last:border-0 hover:bg-accent/40"
+                  >
+                    <td className="px-3 py-2">
+                      <Link
+                        href={`/datasets/${slug}/conversations/${c.conv_id}`}
+                        onClick={() => track('datasets_conversation_clicked', { slug })}
+                        className="font-mono text-xs text-primary hover:underline"
+                      >
+                        {c.conv_id.slice(0, 20)}…
+                      </Link>
+                      {c.models.length > 0 && (
+                        <span className="ml-2 text-[11px] text-muted-foreground">
+                          {c.models.length} model{c.models.length === 1 ? '' : 's'}
+                        </span>
+                      )}
+                    </td>
+                    <td className="px-3 py-2 text-right tabular-nums">{c.num_turns}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{c.num_subagent_groups}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{compact(c.total_in)}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{compact(c.total_out)}</td>
+                    <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">
+                      {cachedPct}
+                    </td>
+                  </tr>
+                );
+              })}
+              {!isFetching && (convs?.items.length ?? 0) === 0 && (
+                <tr>
+                  <td colSpan={6} className="px-3 py-8 text-center text-xs text-muted-foreground">
+                    No conversations match.
+                  </td>
+                </tr>
+              )}
+            </tbody>
+          </table>
+        </Card>
+
+        {pageCount > 1 && (
+          <div className="flex items-center justify-center gap-3 text-xs">
+            <button
+              type="button"
+              disabled={page === 0}
+              onClick={() => setPage((p) => Math.max(0, p - 1))}
+              className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+            >
+              ← Prev
+            </button>
+            <span className="text-muted-foreground">
+              Page {page + 1} of {pageCount}
+            </span>
+            <button
+              type="button"
+              disabled={page >= pageCount - 1}
+              onClick={() => setPage((p) => Math.min(pageCount - 1, p + 1))}
+              className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+            >
+              Next →
+            </button>
+          </div>
+        )}
+      </section>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
new file mode 100644
index 00000000..d85d7eaa
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -0,0 +1,86 @@
+'use client';
+
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+import { compact, formatPct, perConversation } from './format';
+
+function DatasetCard({ d }: { d: DatasetRecord }) {
+  const s = d.summary ?? {};
+  const cachedPct = formatPct(s.cachedPct);
+  return (
+    <Link
+      href={`/datasets/${d.slug}`}
+      onClick={() => track('datasets_card_clicked', { slug: d.slug })}
+      className="block transition-colors hover:[&_*]:border-primary/40"
+    >
+      <Card className="h-full p-4 transition-colors hover:border-primary/40">
+        <div className="mb-1 flex items-baseline justify-between gap-2">
+          <h3 className="text-base font-semibold text-foreground">{d.label}</h3>
+          <span className="rounded-full border border-border/50 px-2 py-0.5 text-[10px] uppercase tracking-wide text-muted-foreground">
+            {d.variant}
+          </span>
+        </div>
+        {d.description && (
+          <p className="mb-3 line-clamp-2 text-xs text-muted-foreground">{d.description}</p>
+        )}
+        <dl className="grid grid-cols-2 gap-x-4 gap-y-1.5 text-xs">
+          <Stat label="Conversations" value={d.conversation_count.toLocaleString()} />
+          <Stat
+            label="Median requests / convo"
+            value={perConversation(s.medianRequestsPerConversation)}
+          />
+          <Stat
+            label="Mean requests / convo"
+            value={perConversation(s.meanRequestsPerConversation)}
+          />
+          <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
+          <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
+          <Stat label="Cached input" value={cachedPct} />
+          <Stat label="Total input" value={`${compact(s.totalIn ?? 0)} tok`} />
+          <Stat label="Total output" value={`${compact(s.totalOut ?? 0)} tok`} />
+        </dl>
+        <div className="mt-3 text-xs font-medium text-primary">View dataset →</div>
+      </Card>
+    </Link>
+  );
+}
+
+function Stat({ label, value }: { label: string; value: string }) {
+  return (
+    <div className="flex items-baseline justify-between gap-2">
+      <dt className="text-muted-foreground">{label}</dt>
+      <dd className="tabular-nums font-medium text-foreground">{value}</dd>
+    </div>
+  );
+}
+
+export function DatasetList() {
+  const { data, isLoading, isError } = useDatasets();
+
+  if (isLoading) {
+    return <div className="py-12 text-center text-sm text-muted-foreground">Loading datasets…</div>;
+  }
+  if (isError || !data) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">Failed to load datasets.</div>
+    );
+  }
+  if (data.length === 0) {
+    return (
+      <div className="py-12 text-center text-sm text-muted-foreground">
+        No datasets ingested yet.
+      </div>
+    );
+  }
+
+  return (
+    <div className="grid gap-4 sm:grid-cols-2">
+      {data.map((d) => (
+        <DatasetCard key={d.id} d={d} />
+      ))}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
new file mode 100644
index 00000000..8adc02ee
--- /dev/null
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -0,0 +1,220 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { Card } from '@/components/ui/card';
+import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover';
+import type { Distribution } from '@/hooks/api/use-datasets';
+import { compact } from './format';
+
+interface DistributionCardProps {
+  title: string;
+  subtitle?: string;
+  unit: string;
+  distribution?: Distribution;
+  scale?: 'log' | 'linear';
+  /** Format the x value (defaults to compact). e.g. percent for cached fraction. */
+  formatValue?: (v: number) => string;
+}
+
+const W = 720;
+const H = 240;
+const PAD = { top: 12, right: 16, bottom: 48, left: 52 };
+
+/**
+ * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a
+ * themeable bar chart with p50/p75/p90/p95 guide lines and a hover tooltip. Bars are
+ * drawn at equal visual width; for log-scaled bins the edge labels are already
+ * log-spaced so the shape reads as a log histogram.
+ */
+export function DistributionCard({
+  title,
+  subtitle,
+  unit,
+  distribution,
+  scale = 'linear',
+  formatValue = compact,
+}: DistributionCardProps) {
+  const computed = useMemo(() => {
+    const bins = distribution?.bins ?? [];
+    if (bins.length === 0) return null;
+    const maxCount = Math.max(1, ...bins.map((b) => b.count));
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const n = bins.length;
+    const barW = innerW / n;
+    // Map a data value to an x pixel by locating its bin (positional — works for
+    // both linear and log bins since the edges are precomputed at ingest).
+    // Out-of-range values clamp to the first/last bin.
+    const valueToX = (v: number): number => {
+      for (let i = 0; i < n; i++) {
+        if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) {
+          return PAD.left + (i + 0.5) * barW;
+        }
+      }
+      if (v <= bins[0].x0) return PAD.left + 0.5 * barW;
+      return PAD.left + (n - 0.5) * barW;
+    };
+    return { bins, maxCount, innerW, innerH, n, barW, valueToX };
+  }, [distribution]);
+
+  if (!computed) {
+    return (
+      <Card className="p-4">
+        <div className="mb-1 text-sm font-medium text-foreground">{title}</div>
+        <div className="grid h-[240px] place-items-center text-xs text-muted-foreground">
+          No data
+        </div>
+      </Card>
+    );
+  }
+
+  const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed;
+  const stats = distribution?.stats;
+
+  const guides: { label: string; value: number; color: string }[] = stats
+    ? [
+        { label: 'p50', value: stats.median, color: '#3b82f6' },
+        ...(typeof stats.p75 === 'number'
+          ? [{ label: 'p75', value: stats.p75, color: '#22c55e' }]
+          : []),
+        { label: 'p90', value: stats.p90, color: '#f59e0b' },
+        ...(typeof stats.p95 === 'number'
+          ? [{ label: 'p95', value: stats.p95, color: '#ef4444' }]
+          : []),
+      ]
+    : [];
+
+  // X tick labels from a few bin edges.
+  const tickIdxs = [0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1];
+
+  const resolve = (fraction: number) => {
+    const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n)));
+    const b = bins[i];
+    const items: HoverItem[] = [
+      {
+        color: 'currentColor',
+        label: 'Range',
+        value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`,
+      },
+      { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() },
+    ];
+    return { items };
+  };
+
+  return (
+    <Card className="p-4">
+      <div className="mb-0.5 flex items-baseline justify-between gap-2">
+        <span className="text-sm font-medium text-foreground">{title}</span>
+        {scale === 'log' && (
+          <span className="text-[10px] uppercase tracking-wide text-muted-foreground">
+            log scale
+          </span>
+        )}
+      </div>
+      {subtitle && <div className="mb-1 text-xs text-muted-foreground">{subtitle}</div>}
+      {stats && (
+        <div className="mb-2 text-xs text-muted-foreground">
+          n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)}
+          {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}</>} · p90{' '}
+          {formatValue(stats.p90)}
+          {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}</>} · max{' '}
+          {formatValue(stats.max)} {unit}
+        </div>
+      )}
+      <div className="w-full text-muted-foreground">
+        <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+          {/* bars */}
+          {bins.map((b, i) => {
+            const h = (b.count / maxCount) * innerH;
+            const x = PAD.left + i * barW;
+            const y = PAD.top + (innerH - h);
+            return (
+              <rect
+                key={i}
+                x={x}
+                y={y}
+                width={Math.max(0, barW - 1)}
+                height={h}
+                className="fill-primary/55"
+              />
+            );
+          })}
+
+          {/* guide lines */}
+          {guides.map((g) => {
+            const x = valueToX(g.value);
+            return (
+              <line
+                key={g.label}
+                x1={x}
+                x2={x}
+                y1={PAD.top}
+                y2={PAD.top + innerH}
+                stroke={g.color}
+                strokeWidth={2}
+                strokeDasharray="5 3"
+                opacity={0.95}
+              />
+            );
+          })}
+
+          {/* x axis */}
+          <line
+            x1={PAD.left}
+            x2={PAD.left + innerW}
+            y1={PAD.top + innerH}
+            y2={PAD.top + innerH}
+            stroke="currentColor"
+            opacity={0.2}
+          />
+          {tickIdxs.map((i, k) => {
+            const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle';
+            const x = PAD.left + (i + 0.5) * barW;
+            return (
+              <text
+                key={i}
+                x={x}
+                y={PAD.top + innerH + 14}
+                fontSize={11}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor={anchor}
+              >
+                {formatValue(bins[i].x0)}
+              </text>
+            );
+          })}
+          <text
+            x={W / 2}
+            y={H - 16}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.55}
+            textAnchor="middle"
+          >
+            {unit}
+          </text>
+
+          {/* guide legend */}
+          {guides.map((g, i) => (
+            <g key={g.label} transform={`translate(${PAD.left + i * 110}, ${PAD.top})`}>
+              <line
+                x1={0}
+                x2={12}
+                y1={4}
+                y2={4}
+                stroke={g.color}
+                strokeWidth={2}
+                strokeDasharray="5 3"
+              />
+              <text x={16} y={7} fontSize={10} fill="currentColor" opacity={0.85}>
+                {g.label} {formatValue(g.value)}
+              </text>
+            </g>
+          ))}
+        </ChartHover>
+      </div>
+    </Card>
+  );
+}
diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts
new file mode 100644
index 00000000..fd526d12
--- /dev/null
+++ b/packages/app/src/components/datasets/format.ts
@@ -0,0 +1,28 @@
+/**
+ * Compact number formatter for dataset token/count displays:
+ * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82".
+ */
+export function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
+  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
+  if (abs > 0 && abs < 1) return n.toFixed(2);
+  return String(Math.round(n));
+}
+
+/** Format a per-conversation count without hiding a meaningful fractional mean. */
+export function perConversation(n: number | undefined): string {
+  if (typeof n !== 'number' || !Number.isFinite(n)) return '—';
+  return n.toLocaleString(undefined, { maximumFractionDigits: 1 });
+}
+
+/** Format a 0–1 fraction as a whole percent ("42%"), em dash when absent. */
+export function formatPct(fraction: number | undefined): string {
+  return typeof fraction === 'number' ? `${(fraction * 100).toFixed(0)}%` : '—';
+}
+
+/** Percent share of `part` in `total` ("42%"), em dash when `total` is 0. */
+export function formatShare(part: number, total: number): string {
+  return total > 0 ? `${((part / total) * 100).toFixed(0)}%` : '—';
+}
diff --git a/packages/app/src/components/datasets/stat.tsx b/packages/app/src/components/datasets/stat.tsx
new file mode 100644
index 00000000..3fb6a32a
--- /dev/null
+++ b/packages/app/src/components/datasets/stat.tsx
@@ -0,0 +1,9 @@
+/** Label/value pair for the summary <dl> grids on dataset and conversation pages. */
+export function Stat({ label, value }: { label: string; value: string }) {
+  return (
+    <div>
+      <dt className="text-xs text-muted-foreground">{label}</dt>
+      <dd className="text-lg font-semibold tabular-nums text-foreground">{value}</dd>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph-model.ts b/packages/app/src/components/datasets/trace-flamegraph-model.ts
new file mode 100644
index 00000000..2aff9ac3
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph-model.ts
@@ -0,0 +1,422 @@
+/**
+ * Pure logic for the trace flamegraph: overlap detection, deep-link resolution,
+ * visible-row construction, and bracket-lane layout. No React/DOM — everything
+ * here is unit-testable directly. Rendering lives in trace-flamegraph.tsx.
+ */
+
+import type { StructureNode } from '@/hooks/api/use-datasets';
+
+// Kept distinct from token-segment colors. A row can carry multiple rails when
+// it overlaps different requests during different parts of its lifetime.
+export const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] as const;
+
+// Cap on simultaneously-drawn bracket lanes. A pathological conversation (e.g. a
+// long-running session whose subagent fans out into hundreds of children with
+// 15+ concurrent requests) can require dozens of lanes; left unbounded the
+// gutter grows wide enough to push the bars off-screen AND emits one DOM node
+// per lane per row (tens of thousands of empty divs). We bound it: lanes beyond
+// the cap fold into the last "dense" lane, which stays readable for the common
+// case (≤6 concurrent) and degrades gracefully for the outliers.
+export const MAX_LANES = 6;
+
+export interface TimedRequest {
+  key: string;
+  startS?: number;
+  endS?: number;
+}
+
+export interface RequestOverlapGroup {
+  id: string;
+  requestKeys: string[];
+  startS: number;
+  endS: number;
+}
+
+/**
+ * Find maximal sets of requests that were simultaneously in flight.
+ * Intervals are half-open, so one request ending exactly when another begins
+ * is serialized rather than parallel. Maximal-set filtering prevents a nested
+ * A/B pair from duplicating an A/B/C marker, while preserving A/B and B/C as
+ * separate groups when their overlaps happen at different times.
+ */
+export function findRequestOverlapGroups(
+  requests: TimedRequest[],
+  scopeKey = 'scope',
+): RequestOverlapGroup[] {
+  const valid = requests.filter(
+    (request): request is TimedRequest & { startS: number; endS: number } =>
+      Number.isFinite(request.startS) &&
+      Number.isFinite(request.endS) &&
+      request.endS! > request.startS!,
+  );
+  const boundaries = [
+    ...new Set(valid.flatMap((request) => [request.startS, request.endS])),
+  ].toSorted((a, b) => a - b);
+  const candidates = new Map<string, Omit<RequestOverlapGroup, 'id'>>();
+
+  for (let i = 0; i < boundaries.length - 1; i++) {
+    const startS = boundaries[i]!;
+    const endS = boundaries[i + 1]!;
+    if (endS <= startS) continue;
+    const requestKeys = valid
+      .filter((request) => request.startS <= startS && request.endS >= endS)
+      .map((request) => request.key)
+      .toSorted();
+    if (requestKeys.length < 2) continue;
+    const key = requestKeys.join('\u0000');
+    const existing = candidates.get(key);
+    candidates.set(key, {
+      requestKeys,
+      startS: existing ? Math.min(existing.startS, startS) : startS,
+      endS: existing ? Math.max(existing.endS, endS) : endS,
+    });
+  }
+
+  const maximal = [...candidates.values()].filter(
+    (candidate, _, all) =>
+      !all.some(
+        (other) =>
+          other.requestKeys.length > candidate.requestKeys.length &&
+          candidate.requestKeys.every((key) => other.requestKeys.includes(key)),
+      ),
+  );
+
+  return maximal
+    .toSorted(
+      (a, b) =>
+        a.startS - b.startS ||
+        a.endS - b.endS ||
+        a.requestKeys.join(',').localeCompare(b.requestKeys.join(',')),
+    )
+    .map((group, index) => ({ ...group, id: `${scopeKey}-${index + 1}` }));
+}
+
+export interface RowOverlap {
+  id: string;
+  label: string;
+  color: string;
+  startS: number;
+  endS: number;
+  peerCount: number;
+}
+
+export interface VisibleRow {
+  key: string;
+  label: string;
+  sublabel?: string;
+  timeLabel?: string;
+  cached: number;
+  uncached: number;
+  output: number;
+  total: number;
+  indent: number;
+  isGroup: boolean;
+  isExpanded: boolean;
+  groupIndex?: number;
+  overlaps: RowOverlap[];
+}
+
+/** Format seconds from conversation start as a compact elapsed timestamp. */
+export function formatElapsedTime(seconds: number): string {
+  const total = Math.max(0, Math.round(seconds));
+  const hours = Math.floor(total / 3600);
+  const minutes = Math.floor((total % 3600) / 60);
+  const secs = total % 60;
+  const mm = String(minutes).padStart(2, '0');
+  const ss = String(secs).padStart(2, '0');
+  return hours > 0 ? `${hours}:${mm}:${ss}` : `${mm}:${ss}`;
+}
+
+/** Elapsed-interval label for a row ("+MM:SS–MM:SS"), or undefined when untimed. */
+export function timeLabel(startS?: number, endS?: number): string | undefined {
+  if (startS === undefined || !Number.isFinite(startS)) return undefined;
+  const start = formatElapsedTime(startS);
+  if (endS === undefined || !Number.isFinite(endS) || endS <= startS) return `+${start}`;
+  return `+${start}–${formatElapsedTime(endS)}`;
+}
+
+export interface DeepLinkHighlight {
+  turn?: number | null;
+  raw?: number | null;
+  inner?: number | null;
+  agent?: string | null;
+}
+
+export interface DeepLinkTarget {
+  rowKey: string;
+  expandGroup: number | null;
+}
+
+/**
+ * Resolve a request-timeline deep link to a flamegraph row key (+ the subagent
+ * group that must be expanded to show it). Raw Weka source coordinates are
+ * exact and take precedence:
+ *   raw=<outer>             -> top-level Weka request
+ *   raw=<outer>&inner=<idx> -> subagent child inside that top-level marker
+ * Otherwise main turns match by main-turn ordinal and subagent turns match the
+ * group by agentId, then the ti-th child.
+ *
+ * `buildConversationStructure` emits exactly one node per raw Weka entry (and
+ * one child per nested entry), so a node's array position IS its raw index.
+ * Structures ingested before rawIndex/innerIndex were stored omit the explicit
+ * fields — fall back to the array position so deep links keep resolving against
+ * those older rows instead of silently doing nothing.
+ */
+export function resolveDeepLinkTarget(
+  nodes: readonly StructureNode[],
+  highlight: DeepLinkHighlight,
+): DeepLinkTarget | null {
+  const { turn, raw, inner, agent } = highlight;
+  if (typeof raw === 'number' && raw >= 0) {
+    if (typeof inner === 'number' && inner >= 0) {
+      const gi = nodes.findIndex(
+        (node, i) => node.kind === 'subagent' && (node.rawIndex ?? i) === raw,
+      );
+      if (gi === -1) return null;
+      const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+      const ci = group.children.findIndex((child, i) => (child.innerIndex ?? i) === inner);
+      if (ci === -1) return null;
+      return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi };
+    }
+    const i = nodes.findIndex(
+      (node, idx) => node.kind === 'turn' && (node.rawIndex ?? idx) === raw,
+    );
+    if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null };
+    return null;
+  }
+  if (typeof turn !== 'number' || turn < 0) return null;
+  if (agent) {
+    const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === agent);
+    if (gi === -1) return null;
+    const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+    if (turn >= group.children.length) return null;
+    return { rowKey: `g-${gi}-c-${turn}`, expandGroup: gi };
+  }
+  let ordinal = 0;
+  for (let i = 0; i < nodes.length; i++) {
+    if (nodes[i].kind === 'turn') {
+      if (ordinal === turn) return { rowKey: `t-${i}`, expandGroup: null };
+      ordinal += 1;
+    }
+  }
+  return null;
+}
+
+/**
+ * Overlap groups per row key. Main-agent turns and each subagent's children are
+ * separate scopes — parallelism is only meaningful within one agent's stream.
+ */
+export function buildRowOverlaps(nodes: readonly StructureNode[]): Map<string, RowOverlap[]> {
+  const mainGroups = findRequestOverlapGroups(
+    nodes.flatMap((node, i) =>
+      node.kind === 'turn' ? [{ key: `t-${i}`, startS: node.startS, endS: node.endS }] : [],
+    ),
+    'main',
+  );
+  const subagentGroups = nodes.flatMap((node, i) =>
+    node.kind === 'subagent'
+      ? findRequestOverlapGroups(
+          node.children.map((child, ci) => ({
+            key: `g-${i}-c-${ci}`,
+            startS: child.startS,
+            endS: child.endS,
+          })),
+          `subagent-${i}`,
+        )
+      : [],
+  );
+  const groups: RequestOverlapGroup[] = [...mainGroups, ...subagentGroups];
+
+  const byRow = new Map<string, RowOverlap[]>();
+  groups.forEach((group, groupIndex) => {
+    const overlap = {
+      id: group.id,
+      label: `P${groupIndex + 1}`,
+      color: OVERLAP_COLORS[groupIndex % OVERLAP_COLORS.length]!,
+      startS: group.startS,
+      endS: group.endS,
+      peerCount: group.requestKeys.length - 1,
+    };
+    group.requestKeys.forEach((key) => byRow.set(key, [...(byRow.get(key) ?? []), overlap]));
+  });
+  return byRow;
+}
+
+/**
+ * Flatten structure nodes into the rows currently visible: one row per main
+ * turn, one header per subagent group, plus indented children for expanded
+ * groups. Row keys (`t-<i>`, `g-<i>`, `g-<i>-c-<ci>`) index by node position so
+ * they stay stable across expand/collapse.
+ */
+export function buildVisibleRows(
+  nodes: readonly StructureNode[],
+  expanded: ReadonlySet<number>,
+  overlapsByRow: ReadonlyMap<string, RowOverlap[]>,
+): VisibleRow[] {
+  const out: VisibleRow[] = [];
+  let turnNo = 0;
+  nodes.forEach((node: StructureNode, i) => {
+    if (node.kind === 'turn') {
+      turnNo += 1;
+      out.push({
+        key: `t-${i}`,
+        label: `Turn ${turnNo}`,
+        sublabel: node.model ?? undefined,
+        timeLabel: timeLabel(node.startS, node.endS),
+        cached: node.cached,
+        uncached: node.uncached,
+        output: node.out,
+        total: node.in + node.out,
+        indent: 0,
+        isGroup: false,
+        isExpanded: false,
+        overlaps: overlapsByRow.get(`t-${i}`) ?? [],
+      });
+    } else {
+      const isExpanded = expanded.has(i);
+      out.push({
+        key: `g-${i}`,
+        label: `${node.label}`,
+        sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${
+          node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : ''
+        }`,
+        timeLabel: timeLabel(node.startS, node.endS),
+        cached: node.cached,
+        uncached: node.uncached,
+        output: node.out,
+        total: node.in + node.out,
+        indent: 0,
+        isGroup: true,
+        isExpanded,
+        groupIndex: i,
+        overlaps: [],
+      });
+      if (isExpanded) {
+        node.children.forEach((child, ci) => {
+          out.push({
+            key: `g-${i}-c-${ci}`,
+            label: `↳ subturn ${ci + 1}`,
+            sublabel: child.model ?? undefined,
+            timeLabel: timeLabel(child.startS, child.endS),
+            cached: child.cached,
+            uncached: child.uncached,
+            output: child.out,
+            total: child.in + child.out,
+            indent: 1,
+            isGroup: false,
+            isExpanded: false,
+            overlaps: overlapsByRow.get(`g-${i}-c-${ci}`) ?? [],
+          });
+        });
+      }
+    }
+  });
+  return out;
+}
+
+export interface BraceSeg {
+  role: 'first' | 'middle' | 'last' | 'through';
+  isMember: boolean;
+  color: string;
+  groupId: string;
+  peerCount: number;
+  startS: number;
+  endS: number;
+}
+
+export interface BraceLayout {
+  laneCount: number;
+  overflowLanes: number;
+  /** Per visible row: only the lanes that actually carry a bracket segment. */
+  rowSegs: { lane: number; seg: BraceSeg }[][];
+}
+
+/**
+ * Geometry for the parallel-group brackets drawn in the left gutter. Each
+ * overlap group becomes a vertical bracket spanning from its first to its last
+ * visible member row, with a right-pointing tick on the exact member rows.
+ * Non-transitive chains (a row in two groups) get separate lanes so their
+ * brackets sit side by side. `through` = a row inside a group's span that is
+ * NOT itself a member (the aux-stream edge case) — drawn as a faint connector
+ * with no tick.
+ */
+export function computeBraceLayout(rows: readonly VisibleRow[]): BraceLayout {
+  const groupMap = new Map<
+    string,
+    { id: string; color: string; peerCount: number; startS: number; endS: number; idxs: number[] }
+  >();
+  rows.forEach((r, idx) => {
+    for (const ov of r.overlaps) {
+      const g = groupMap.get(ov.id) ?? {
+        id: ov.id,
+        color: ov.color,
+        peerCount: ov.peerCount,
+        startS: ov.startS,
+        endS: ov.endS,
+        idxs: [],
+      };
+      g.idxs.push(idx);
+      groupMap.set(ov.id, g);
+    }
+  });
+  const groups = [...groupMap.values()]
+    .filter((g) => g.idxs.length >= 2) // need ≥2 visible members to bracket
+    .map((g) => ({
+      ...g,
+      min: Math.min(...g.idxs),
+      max: Math.max(...g.idxs),
+      members: new Set(g.idxs),
+    }))
+    .toSorted((a, b) => a.min - b.min || a.max - b.max);
+
+  // Greedy lane assignment: a group reuses a lane whose previous group ended
+  // before this one starts.
+  const laneEnd: number[] = [];
+  const laneOf = new Map<string, number>();
+  for (const g of groups) {
+    let lane = laneEnd.findIndex((end) => end < g.min);
+    if (lane === -1) {
+      lane = laneEnd.length;
+      laneEnd.push(g.max);
+    } else {
+      laneEnd[lane] = g.max;
+    }
+    laneOf.set(g.id, lane);
+  }
+  const rawLaneCount = laneEnd.length;
+  // Bound the gutter (see MAX_LANES). Lanes past the cap collapse onto the last
+  // visible lane, so every parallel row still carries a marker but the gutter
+  // width and DOM-node count stay bounded regardless of how parallel the
+  // conversation is.
+  const laneCount = Math.min(rawLaneCount, MAX_LANES);
+  const displayLane = (lane: number) => Math.min(lane, laneCount - 1);
+
+  // Sparse per-row segments: only lanes that actually carry a bracket on a row
+  // are stored (and later rendered). The previous dense matrix emitted one DOM
+  // node per lane per row — catastrophic at 49 lanes × 2k rows.
+  const rowSegs: { lane: number; seg: BraceSeg }[][] = rows.map(() => []);
+  for (const g of groups) {
+    const lane = displayLane(laneOf.get(g.id)!);
+    for (let idx = g.min; idx <= g.max; idx++) {
+      const isMember = g.members.has(idx);
+      const role =
+        idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through';
+      const seg: BraceSeg = {
+        role,
+        isMember,
+        color: g.color,
+        groupId: g.id,
+        peerCount: g.peerCount,
+        startS: g.startS,
+        endS: g.endS,
+      };
+      const cell = rowSegs[idx]!;
+      const existing = cell.find((c) => c.lane === lane);
+      // Collisions only happen in the folded overflow lane. Prefer a real
+      // member marker over a faint pass-through connector.
+      if (!existing) cell.push({ lane, seg });
+      else if (seg.isMember && !existing.seg.isMember) existing.seg = seg;
+    }
+  }
+  return { laneCount, overflowLanes: rawLaneCount - laneCount, rowSegs };
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts
new file mode 100644
index 00000000..0af344f1
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts
@@ -0,0 +1,246 @@
+import { describe, expect, it } from 'vitest';
+
+import type {
+  StructureNode,
+  SubagentNode,
+  TurnNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+import {
+  buildRowOverlaps,
+  buildVisibleRows,
+  computeBraceLayout,
+  findRequestOverlapGroups,
+  formatElapsedTime,
+  resolveDeepLinkTarget,
+  timeLabel,
+} from './trace-flamegraph-model';
+
+describe('formatElapsedTime', () => {
+  it('formats elapsed seconds below and above one hour', () => {
+    expect(formatElapsedTime(0)).toBe('00:00');
+    expect(formatElapsedTime(65.4)).toBe('01:05');
+    expect(formatElapsedTime(3661.6)).toBe('1:01:02');
+    expect(formatElapsedTime(86_541.149)).toBe('24:02:21');
+  });
+
+  it('clamps negative offsets to the conversation origin', () => {
+    expect(formatElapsedTime(-5)).toBe('00:00');
+  });
+});
+
+describe('timeLabel', () => {
+  it('renders a range when the end is after the start, a point otherwise', () => {
+    expect(timeLabel(65, 130)).toBe('+01:05–02:10');
+    expect(timeLabel(65)).toBe('+01:05');
+    expect(timeLabel(65, 65)).toBe('+01:05');
+    expect(timeLabel(undefined, 130)).toBeUndefined();
+    expect(timeLabel(Number.NaN, 130)).toBeUndefined();
+  });
+});
+
+describe('findRequestOverlapGroups', () => {
+  it('keeps non-transitive overlap chains as separate groups', () => {
+    const groups = findRequestOverlapGroups([
+      { key: 'A', startS: 1, endS: 8 },
+      { key: 'B', startS: 5, endS: 11 },
+      { key: 'C', startS: 9, endS: 15 },
+    ]);
+
+    expect(groups.map((group) => group.requestKeys)).toEqual([
+      ['A', 'B'],
+      ['B', 'C'],
+    ]);
+    expect(groups.map(({ startS, endS }) => [startS, endS])).toEqual([
+      [5, 8],
+      [9, 11],
+    ]);
+  });
+
+  it('does not consider touching or invalid intervals parallel', () => {
+    expect(
+      findRequestOverlapGroups([
+        { key: 'A', startS: 1, endS: 5 },
+        { key: 'B', startS: 5, endS: 8 },
+        { key: 'missing-end', startS: 3 },
+        { key: 'zero-duration', startS: 4, endS: 4 },
+      ]),
+    ).toEqual([]);
+  });
+
+  it('returns only the maximal simultaneous set for nested intervals', () => {
+    const groups = findRequestOverlapGroups([
+      { key: 'A', startS: 1, endS: 10 },
+      { key: 'B', startS: 2, endS: 8 },
+      { key: 'C', startS: 3, endS: 7 },
+    ]);
+    expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]);
+  });
+});
+
+const turn = (turnIndex: number, extra: Partial<TurnNode> = {}): TurnNode => ({
+  kind: 'turn',
+  turnIndex,
+  in: 100,
+  out: 10,
+  cached: 0,
+  uncached: 100,
+  ...extra,
+});
+const subagent = (children: TurnNode[], extra: Partial<SubagentNode> = {}): SubagentNode => ({
+  kind: 'subagent',
+  label: 'Subagent',
+  in: 100,
+  out: 10,
+  cached: 0,
+  uncached: 100,
+  children,
+  ...extra,
+});
+
+describe('resolveDeepLinkTarget', () => {
+  // Node layout mirroring a real Weka conversation: raw entries
+  //   0: turn, 1: subagent (2 children), 2: turn
+  const withRawIndexes: StructureNode[] = [
+    turn(0, { rawIndex: 0 }),
+    subagent([turn(1, { rawIndex: 1, innerIndex: 0 }), turn(2, { rawIndex: 1, innerIndex: 1 })], {
+      agentId: 'subagent_001_abcd1234',
+      rawIndex: 1,
+    }),
+    turn(3, { rawIndex: 2 }),
+  ];
+  // The same conversation as stored by the pre-rawIndex ingest (fields absent).
+  const legacy: StructureNode[] = [
+    turn(0),
+    subagent([turn(1), turn(2)], { agentId: 'subagent_001_abcd1234' }),
+    turn(3),
+  ];
+
+  it('resolves raw source coordinates against explicit rawIndex fields', () => {
+    expect(resolveDeepLinkTarget(withRawIndexes, { raw: 2 })).toEqual({
+      rowKey: 't-2',
+      expandGroup: null,
+    });
+    expect(resolveDeepLinkTarget(withRawIndexes, { raw: 1, inner: 1 })).toEqual({
+      rowKey: 'g-1-c-1',
+      expandGroup: 1,
+    });
+  });
+
+  it('falls back to node array position for structures ingested before rawIndex existed', () => {
+    // One node per raw entry means position === raw index, so the deep link
+    // must still resolve exactly (regression: it previously returned null and
+    // the flamegraph neither scrolled nor highlighted anything).
+    expect(resolveDeepLinkTarget(legacy, { raw: 2, turn: 1 })).toEqual({
+      rowKey: 't-2',
+      expandGroup: null,
+    });
+    expect(resolveDeepLinkTarget(legacy, { raw: 0, turn: 0 })).toEqual({
+      rowKey: 't-0',
+      expandGroup: null,
+    });
+  });
+
+  it('resolves subagent children positionally when innerIndex is absent', () => {
+    expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 1, turn: 1 })).toEqual({
+      rowKey: 'g-1-c-1',
+      expandGroup: 1,
+    });
+  });
+
+  it('returns null for out-of-range raw coordinates instead of guessing', () => {
+    expect(resolveDeepLinkTarget(legacy, { raw: 9 })).toBeNull();
+    expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 5 })).toBeNull();
+    // raw pointing at a subagent marker without inner does not match a turn.
+    expect(resolveDeepLinkTarget(legacy, { raw: 1 })).toBeNull();
+  });
+
+  it('keeps the positional turn/agent fallback for links without raw coordinates', () => {
+    expect(resolveDeepLinkTarget(legacy, { turn: 1 })).toEqual({
+      rowKey: 't-2',
+      expandGroup: null,
+    });
+    expect(resolveDeepLinkTarget(legacy, { turn: 1, agent: 'subagent_001_abcd1234' })).toEqual({
+      rowKey: 'g-1-c-1',
+      expandGroup: 1,
+    });
+    expect(resolveDeepLinkTarget(legacy, {})).toBeNull();
+  });
+});
+
+describe('buildVisibleRows', () => {
+  const nodes: StructureNode[] = [
+    turn(0, { startS: 0, endS: 10, model: 'claude' }),
+    subagent([turn(1), turn(2)], { label: 'Subagent: search', durationMs: 12_000 }),
+    turn(3),
+  ];
+
+  it('hides collapsed subagent children and keys rows by node position', () => {
+    const rows = buildVisibleRows(nodes, new Set(), new Map());
+    expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 't-2']);
+    expect(rows[0]).toMatchObject({
+      label: 'Turn 1',
+      sublabel: 'claude',
+      timeLabel: '+00:00–00:10',
+      total: 110,
+      isGroup: false,
+    });
+    expect(rows[1]).toMatchObject({
+      label: 'Subagent: search',
+      sublabel: '2 turns · 12s',
+      isGroup: true,
+      isExpanded: false,
+      groupIndex: 1,
+    });
+  });
+
+  it('inserts indented child rows for expanded groups and attaches overlaps', () => {
+    const overlap = {
+      id: 'main-1',
+      label: 'P1',
+      color: '#06b6d4',
+      startS: 0,
+      endS: 1,
+      peerCount: 1,
+    };
+    const rows = buildVisibleRows(nodes, new Set([1]), new Map([['g-1-c-0', [overlap]]]));
+    expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 'g-1-c-0', 'g-1-c-1', 't-2']);
+    expect(rows[2]).toMatchObject({ label: '↳ subturn 1', indent: 1, overlaps: [overlap] });
+    expect(rows[3]!.overlaps).toEqual([]);
+  });
+});
+
+describe('buildRowOverlaps and computeBraceLayout', () => {
+  it('brackets parallel main turns and spans a non-member row as pass-through', () => {
+    const nodes: StructureNode[] = [
+      turn(0, { startS: 0, endS: 10 }),
+      turn(1), // untimed — sits inside the bracket span without being a member
+      turn(2, { startS: 5, endS: 30 }), // overlaps turn 0 and turn 3
+      turn(3, { startS: 28, endS: 40 }),
+    ];
+    const overlaps = buildRowOverlaps(nodes);
+    expect([...overlaps.keys()].toSorted()).toEqual(['t-0', 't-2', 't-3']);
+
+    const rows = buildVisibleRows(nodes, new Set(), overlaps);
+    const layout = computeBraceLayout(rows);
+    // Two overlap groups sharing rows 0–2 and 2–3 need two side-by-side lanes.
+    expect(layout.laneCount).toBe(2);
+    expect(layout.overflowLanes).toBe(0);
+    const roles = layout.rowSegs.map((segs) =>
+      segs.map(({ lane, seg }) => `${lane}:${seg.role}${seg.isMember ? '' : ':nonmember'}`),
+    );
+    expect(roles[0]).toEqual(['0:first']);
+    expect(roles[1]).toEqual(['0:through:nonmember']);
+    expect(roles[2]!.toSorted()).toEqual(['0:last', '1:first']);
+    expect(roles[3]).toEqual(['1:last']);
+  });
+
+  it('reports no lanes for a fully serial conversation', () => {
+    const nodes: StructureNode[] = [
+      turn(0, { startS: 0, endS: 5 }),
+      turn(1, { startS: 5, endS: 9 }),
+    ];
+    const rows = buildVisibleRows(nodes, new Set(), buildRowOverlaps(nodes));
+    expect(computeBraceLayout(rows)).toEqual({ laneCount: 0, overflowLanes: 0, rowSegs: [[], []] });
+  });
+});
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
new file mode 100644
index 00000000..d63cc691
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -0,0 +1,439 @@
+'use client';
+
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+import type { ConversationStructure } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+import { compact, formatShare } from './format';
+import {
+  buildRowOverlaps,
+  buildVisibleRows,
+  computeBraceLayout,
+  formatElapsedTime,
+  MAX_LANES,
+  OVERLAP_COLORS,
+  resolveDeepLinkTarget,
+  type VisibleRow,
+} from './trace-flamegraph-model';
+
+// Pure logic lives in trace-flamegraph-model.ts; re-exported here so this file
+// stays the module entry point for the flamegraph's public API.
+export {
+  findRequestOverlapGroups,
+  formatElapsedTime,
+  resolveDeepLinkTarget,
+} from './trace-flamegraph-model';
+export type {
+  DeepLinkHighlight,
+  DeepLinkTarget,
+  RequestOverlapGroup,
+  TimedRequest,
+} from './trace-flamegraph-model';
+
+// Stacked-bar segment colors. Cached prefix vs uncached input vs output —
+// fixed hues (theme-independent) so the meaning is stable in light/dark.
+const SEG = {
+  cached: '#10b981', // emerald-500 — input served from prefix cache
+  uncached: '#f59e0b', // amber-500 — input that must be (re)computed
+  output: '#8b5cf6', // violet-500 — generated tokens
+} as const;
+
+const LEGEND = [
+  { key: 'cached', label: 'Cached prefix', color: SEG.cached },
+  { key: 'uncached', label: 'Uncached input', color: SEG.uncached },
+  { key: 'output', label: 'Output', color: SEG.output },
+] as const;
+
+// Width (px) of one parallel-group bracket lane in the left gutter. Overlapping
+// groups (non-transitive chains) get their own lane so their brackets sit
+// side-by-side instead of stacking visually.
+const LANE_W = 14;
+
+interface TooltipState {
+  x: number;
+  y: number;
+  row: VisibleRow;
+}
+
+/**
+ * Per-conversation flamegraph driven by the precomputed `structure` JSONB.
+ * One row per turn; subagent groups render a collapsible header with indented
+ * children (collapsed by default). Each bar stacks cached-prefix + uncached
+ * input + output, scaled to the widest visible turn.
+ */
+export function TraceFlamegraph({
+  structure,
+  highlightTurn,
+  highlightRawIndex,
+  highlightInnerIndex,
+  highlightAgentId,
+}: {
+  structure: ConversationStructure;
+  /** Turn index to scroll to / highlight (from a request-timeline deep link). */
+  highlightTurn?: number | null;
+  /** Raw Weka top-level request index to scroll to / highlight. */
+  highlightRawIndex?: number | null;
+  /** Raw Weka nested request index under highlightRawIndex, for subagent children. */
+  highlightInnerIndex?: number | null;
+  /** Subagent id when the highlighted turn is inside a subagent group. */
+  highlightAgentId?: string | null;
+}) {
+  const nodes = structure.nodes;
+
+  // Resolve the deep-link target to a row key (+ the group that must be open to
+  // show it). See resolveDeepLinkTarget for the matching rules.
+  const target = useMemo(
+    () =>
+      resolveDeepLinkTarget(nodes, {
+        turn: highlightTurn,
+        raw: highlightRawIndex,
+        inner: highlightInnerIndex,
+        agent: highlightAgentId,
+      }),
+    [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId],
+  );
+
+  // Subagent groups collapsed by default — except the deep-link target's group.
+  const [expanded, setExpanded] = useState<Set<number>>(() =>
+    typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(),
+  );
+  const [tooltip, setTooltip] = useState<TooltipState | null>(null);
+  const scrollRef = useRef<HTMLDivElement>(null);
+
+  // Portal target only exists after mount (the tooltip is portaled to body so
+  // its position:fixed is viewport-relative, immune to ancestor transforms).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
+  // The deep-link target row gets a state-driven highlight (ring + bg flash)
+  // that fades out — state-driven so a re-render can't clobber it, and so the
+  // fade is a real CSS transition rather than an abrupt classList removal.
+  const [highlightKey, setHighlightKey] = useState<string | null>(target?.rowKey ?? null);
+
+  // When the deep-link target resolves/changes: expand its subagent group, then
+  // (after the row renders) scroll it into view and flash the highlight. Runs on
+  // first load and on any later target change (e.g. clicking another bar into
+  // the same conversation). The row query/scroll is deferred to the next frame
+  // so the just-expanded child row exists in the DOM.
+  useEffect(() => {
+    if (!target) return;
+    if (typeof target.expandGroup === 'number') {
+      const gi = target.expandGroup;
+      setExpanded((prev) => (prev.has(gi) ? prev : new Set(prev).add(gi)));
+    }
+    setHighlightKey(target.rowKey);
+    const raf = requestAnimationFrame(() => {
+      scrollRef.current
+        ?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`)
+        ?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    });
+    const t = setTimeout(() => setHighlightKey(null), 2200);
+    return () => {
+      cancelAnimationFrame(raf);
+      clearTimeout(t);
+    };
+  }, [target]);
+
+  const groupIndexes = useMemo(() => {
+    const out: number[] = [];
+    nodes.forEach((node, i) => {
+      if (node.kind === 'subagent') out.push(i);
+    });
+    return out;
+  }, [nodes]);
+
+  const toggle = useCallback((i: number) => {
+    setExpanded((prev) => {
+      const next = new Set(prev);
+      if (next.has(i)) next.delete(i);
+      else next.add(i);
+      return next;
+    });
+  }, []);
+
+  const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]);
+  const collapseAll = useCallback(() => setExpanded(new Set()), []);
+
+  const overlapsByRow = useMemo(() => buildRowOverlaps(nodes), [nodes]);
+
+  const rows = useMemo(
+    () => buildVisibleRows(nodes, expanded, overlapsByRow),
+    [nodes, expanded, overlapsByRow],
+  );
+
+  // Two scales: leaf turns/subturns share a per-turn axis (the primary signal —
+  // how cached/uncached evolves), while subagent group headers carry aggregates
+  // orders of magnitude larger, so they get their own axis to stay comparable to
+  // each other. Group bars render slim + muted, so the mixed scale reads as a
+  // distinct "group summary" track rather than a contradiction.
+  const maxTotal = useMemo(
+    () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)),
+    [rows],
+  );
+  const maxGroupTotal = useMemo(
+    () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)),
+    [rows],
+  );
+
+  const braces = useMemo(() => computeBraceLayout(rows), [rows]);
+
+  const onMove = (e: React.MouseEvent, row: VisibleRow) => {
+    setTooltip({ x: e.clientX, y: e.clientY, row });
+  };
+
+  return (
+    <div className="relative">
+      <div className="mb-3 flex flex-wrap items-center justify-between gap-3">
+        <div className="flex items-center gap-3 text-xs">
+          {LEGEND.map((l) => (
+            <span key={l.key} className="inline-flex items-center gap-1.5">
+              <span
+                className="inline-block size-3 rounded-sm"
+                style={{ backgroundColor: l.color }}
+              />
+              <span className="text-muted-foreground">{l.label}</span>
+            </span>
+          ))}
+          <span className="inline-flex items-center gap-1.5">
+            <span
+              className="inline-block h-4 w-2 rounded-l-sm border-y-2 border-l-2"
+              style={{ borderColor: OVERLAP_COLORS[0] }}
+            />
+            <span className="text-muted-foreground">Bracketed rows ran in parallel</span>
+          </span>
+        </div>
+        {groupIndexes.length > 0 && (
+          <div className="flex items-center gap-1.5">
+            <button
+              type="button"
+              onClick={() => {
+                track('datasets_flamegraph_expand_all');
+                expandAll();
+              }}
+              className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+            >
+              Expand all
+            </button>
+            <button
+              type="button"
+              onClick={() => {
+                track('datasets_flamegraph_collapse_all');
+                collapseAll();
+              }}
+              className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+            >
+              Collapse all
+            </button>
+          </div>
+        )}
+      </div>
+
+      {braces.overflowLanes > 0 && (
+        <p className="mb-2 text-[11px] text-muted-foreground">
+          Dense parallel region — bracket lanes capped at {MAX_LANES}; {braces.overflowLanes}{' '}
+          further overlapping {braces.overflowLanes === 1 ? 'group is' : 'groups are'} folded into
+          the last lane.
+        </p>
+      )}
+
+      <div
+        ref={scrollRef}
+        className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2"
+      >
+        {/* gap-0 so the per-row bracket segments connect into a continuous
+            vertical rail across the rows of a parallel group. */}
+        <div className="flex flex-col gap-0">
+          {rows.map((row, idx) => {
+            // Group headers use the group axis; turns/subturns use the per-turn
+            // axis. Clamp to the track width either way.
+            const denom = row.isGroup ? maxGroupTotal : maxTotal;
+            const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100));
+            const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0;
+            const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
+            const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
+            const isHighlighted = row.key === highlightKey;
+            const segs = braces.rowSegs[idx]!;
+            return (
+              <div
+                key={row.key}
+                data-rowkey={row.key}
+                className={`flex items-stretch rounded-sm transition-colors duration-700 ${
+                  isHighlighted ? 'bg-primary/20 ring-2 ring-primary' : 'ring-0'
+                }`}
+              >
+                {/* Parallel-group bracket gutter (only rendered when the
+                    conversation has any overlaps, so non-overlap traces keep a
+                    flush-left layout with no dead space). Segments are sparse and
+                    absolutely positioned per lane so a row only pays for the
+                    lanes it actually touches. */}
+                {braces.laneCount > 0 && (
+                  <div
+                    className="relative shrink-0 self-stretch"
+                    style={{ width: braces.laneCount * LANE_W }}
+                  >
+                    {segs.map(({ lane, seg }) => {
+                      const top = seg.role === 'first' ? '50%' : '0';
+                      const bottom = seg.role === 'last' ? '50%' : '0';
+                      return (
+                        <div
+                          key={`${lane}-${seg.groupId}`}
+                          className="absolute top-0 bottom-0"
+                          style={{ left: lane * LANE_W, width: LANE_W }}
+                          {...(seg.isMember
+                            ? {
+                                'data-testid': `flamegraph-overlap-${row.key}`,
+                                'data-overlap-group': seg.groupId,
+                              }
+                            : {})}
+                          title={
+                            seg.isMember
+                              ? `Ran in parallel with ${seg.peerCount} other request${
+                                  seg.peerCount === 1 ? '' : 's'
+                                } (+${formatElapsedTime(seg.startS)}–${formatElapsedTime(seg.endS)})`
+                              : undefined
+                          }
+                        >
+                          {/* vertical rail */}
+                          <div
+                            className="absolute"
+                            style={{
+                              left: 5,
+                              width: 2,
+                              top,
+                              bottom,
+                              backgroundColor: seg.color,
+                              opacity: seg.isMember ? 0.95 : 0.3,
+                              borderTopLeftRadius: seg.role === 'first' ? 3 : 0,
+                              borderBottomLeftRadius: seg.role === 'last' ? 3 : 0,
+                            }}
+                          />
+                          {/* right-pointing tick marking an actual member row */}
+                          {seg.isMember && (
+                            <div
+                              className="absolute"
+                              style={{
+                                left: 5,
+                                top: '50%',
+                                height: 2,
+                                width: LANE_W - 7,
+                                transform: 'translateY(-1px)',
+                                backgroundColor: seg.color,
+                              }}
+                            />
+                          )}
+                        </div>
+                      );
+                    })}
+                  </div>
+                )}
+
+                {/* row content (indented for subagent children) */}
+                <div
+                  className="flex flex-1 items-center gap-2 py-0.5"
+                  style={{ paddingLeft: row.indent * 20 }}
+                >
+                  {/* label / group toggle */}
+                  <div className="flex w-52 shrink-0 items-center overflow-hidden">
+                    {row.isGroup ? (
+                      <button
+                        type="button"
+                        onClick={() => {
+                          track('datasets_flamegraph_group_toggled', {
+                            expanded: !row.isExpanded,
+                          });
+                          if (row.groupIndex !== undefined) toggle(row.groupIndex);
+                        }}
+                        className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+                      >
+                        <span className="inline-block w-3 text-muted-foreground">
+                          {row.isExpanded ? '▾' : '▸'}
+                        </span>
+                        <span className="truncate">{row.label}</span>
+                      </button>
+                    ) : (
+                      <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
+                    )}
+                  </div>
+
+                  {/* Original interval, measured from conversation start. */}
+                  <div
+                    className="w-36 shrink-0 text-[11px] tabular-nums text-muted-foreground"
+                    data-testid={`flamegraph-time-${row.key}`}
+                  >
+                    {row.timeLabel ?? '—'}
+                  </div>
+
+                  {/* stacked bar — group headers render as a slim muted summary
+                      strip so they read as aggregates, not individual turns. */}
+                  <div
+                    className="relative flex h-5 flex-1 items-center"
+                    onMouseMove={(e) => onMove(e, row)}
+                    onMouseLeave={() => setTooltip(null)}
+                  >
+                    <div
+                      className={`flex overflow-hidden rounded-sm ${
+                        row.isGroup ? 'h-2.5 opacity-80' : 'h-5'
+                      }`}
+                      style={{ width: `${widthPct}%` }}
+                    >
+                      <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
+                      <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
+                      <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                    </div>
+                  </div>
+
+                  {/* total */}
+                  <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
+                    {compact(row.total)}
+                  </div>
+                </div>
+              </div>
+            );
+          })}
+        </div>
+      </div>
+
+      {tooltip &&
+        mounted &&
+        createPortal(
+          <div
+            className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
+            style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
+          >
+            <div className="mb-1 font-medium text-foreground">
+              {tooltip.row.label}
+              {tooltip.row.sublabel ? (
+                <span className="ml-1 font-normal text-muted-foreground">
+                  {tooltip.row.sublabel}
+                </span>
+              ) : null}
+            </div>
+            <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
+              <span style={{ color: SEG.cached }}>Cached prefix</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.cached)}
+              </span>
+              <span style={{ color: SEG.uncached }}>Uncached input</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.uncached)}
+              </span>
+              <span style={{ color: SEG.output }}>Output</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.output)}
+              </span>
+              <span>Cached %</span>
+              <span className="text-right tabular-nums text-foreground">
+                {formatShare(tooltip.row.cached, tooltip.row.cached + tooltip.row.uncached)}
+              </span>
+              <span>From start</span>
+              <span className="text-right tabular-nums text-foreground">
+                {tooltip.row.timeLabel ?? '—'}
+              </span>
+            </div>
+          </div>,
+          document.body,
+        )}
+    </div>
+  );
+}

From 4d5bb87496131bd8a037f91c2123b495abb97ce4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:12:09 -0500
Subject: [PATCH 08/40] =?UTF-8?q?feat(agentic):=20per-point=20detail=20?=
 =?UTF-8?q?=E2=80=94=20request=20timeline,=20time-series=20charts,=20aggre?=
 =?UTF-8?q?gates,=20distributions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../inference/agentic/[id]/page.tsx           |  17 +
 .../agentic-point/agentic-point-detail.tsx    | 334 ++++++++++
 .../agentic-point/aggregate-chart.tsx         | 286 +++++++++
 .../agentic-point/aggregates-grid.tsx         | 104 ++++
 .../inference/agentic-point/chart-hover.tsx   | 148 +++++
 .../inference/agentic-point/chart-shared.tsx  |  57 ++
 .../agentic-point/dataset-conv-id.test.ts     |  53 ++
 .../inference/agentic-point/distribution.tsx  | 233 +++++++
 .../agentic-point/expandable-chart.tsx        |  56 ++
 .../agentic-point/metric-source-toolbar.tsx   | 130 ++++
 .../agentic-point/phase-slice.test.ts         | 212 +++++++
 .../inference/agentic-point/phase-slice.ts    | 188 ++++++
 .../inference/agentic-point/point-summary.tsx |  50 ++
 .../agentic-point/request-metric-cards.tsx    | 223 +++++++
 .../agentic-point/request-timeline.test.ts    | 378 ++++++++++++
 .../agentic-point/request-timeline.tsx        | 581 ++++++++++++++++++
 .../agentic-point/server-metric-cards.tsx     | 474 ++++++++++++++
 .../inference/agentic-point/sibling-nav.tsx   | 247 ++++++++
 .../agentic-point/time-series-chart.tsx       | 526 ++++++++++++++++
 .../agentic-point/time-series-math.test.ts    | 457 ++++++++++++++
 .../agentic-point/time-series-math.ts         | 491 +++++++++++++++
 .../inference/agentic-point/timeline-bars.tsx | 252 ++++++++
 .../timeline-cursor-stats.test.ts             |  69 +++
 .../agentic-point/timeline-cursor-stats.ts    |  57 ++
 .../agentic-point/timeline-format.ts          |  15 +
 .../agentic-point/timeline-layout.ts          |  21 +
 .../inference/agentic-point/timeline-rows.ts  | 476 ++++++++++++++
 .../agentic-point/timeline-tooltips.tsx       | 143 +++++
 .../agentic-point/timeline-view-snapshot.ts   | 108 ++++
 29 files changed, 6386 insertions(+)
 create mode 100644 packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/aggregates-grid.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/chart-hover.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/chart-shared.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/distribution.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/expandable-chart.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.test.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/point-summary.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/request-metric-cards.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.test.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/server-metric-cards.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/sibling-nav.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/time-series-chart.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/time-series-math.test.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/time-series-math.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-bars.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-format.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-layout.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-rows.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts

diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
new file mode 100644
index 00000000..77f29805
--- /dev/null
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -0,0 +1,17 @@
+import type { Metadata } from 'next';
+
+import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
+
+export const metadata: Metadata = {
+  title: 'Agentic trace detail | InferenceX',
+  robots: { index: false },
+};
+
+export default async function AgenticPointDetailPage({
+  params,
+}: {
+  params: Promise<{ id: string }>;
+}) {
+  const { id } = await params;
+  return <AgenticPointDetail id={Number(id)} />;
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
new file mode 100644
index 00000000..64742acd
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -0,0 +1,334 @@
+'use client';
+
+import Link from 'next/link';
+import { usePathname, useRouter, useSearchParams } from 'next/navigation';
+import { useCallback, useMemo, useState } from 'react';
+import { ArrowLeft } from 'lucide-react';
+
+import { useAgenticAggregates } from '@/hooks/api/use-agentic-aggregates';
+import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
+import { useTraceServerMetrics } from '@/hooks/api/use-trace-server-metrics';
+import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { AggregatesGrid } from './aggregates-grid';
+import { MetricSourceToolbar } from './metric-source-toolbar';
+import {
+  phaseBoundarySec,
+  sliceServerSeriesByPhase,
+  sliceTimelineByPhase,
+  timelineHasWarmup,
+  type ServerSeriesLike,
+  type StagePhase,
+} from './phase-slice';
+import { PointSummary } from './point-summary';
+import { RequestMetricOverTime, SequenceMetricCard } from './request-metric-cards';
+import { RequestTimelineView } from './request-timeline';
+import {
+  CumulativeUniqueInputTokensCard,
+  InflightUniqueTokensCard,
+  KvCacheUtilizationCard,
+  PrefixCacheHitRateCard,
+  PromptTokenSourceCard,
+  RequestActivityCard,
+  ThroughputCard,
+  type RequestActivityView,
+} from './server-metric-cards';
+import { SiblingNav } from './sibling-nav';
+import type { ThroughputSeriesKey } from './time-series-math';
+
+interface Props {
+  id: number;
+}
+
+type DetailView = 'point' | 'timeline' | 'aggregates';
+
+const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
+  { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' },
+  { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
+];
+
+const isDetailView = (value: string | null): value is DetailView =>
+  value === 'point' || value === 'timeline' || value === 'aggregates';
+
+/** URL-persisted detail view (`?view=`; per-point is the unadorned default). */
+function useDetailView(): [DetailView, (nextView: DetailView) => void] {
+  const router = useRouter();
+  const pathname = usePathname();
+  const searchParams = useSearchParams();
+  const requestedView = searchParams.get('view');
+  const view: DetailView = isDetailView(requestedView) ? requestedView : 'point';
+  const setView = useCallback(
+    (nextView: DetailView) => {
+      const nextParams = new URLSearchParams(searchParams.toString());
+      if (nextView === 'point') nextParams.delete('view');
+      else nextParams.set('view', nextView);
+      const query = nextParams.toString();
+      router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false });
+      track('inference_agentic_detail_view_changed', { view: nextView });
+    },
+    [pathname, router, searchParams],
+  );
+  return [view, setView];
+}
+
+export function AgenticPointDetail({ id }: Props) {
+  const router = useRouter();
+  const metricsQuery = useTraceServerMetrics(id, true);
+  const siblingsQuery = useBenchmarkSiblings(id);
+
+  const metrics = metricsQuery.data;
+  const siblingsData = siblingsQuery.data;
+
+  const [view, setView] = useDetailView();
+  const [metricSourceId, setMetricSourceId] = useState('all');
+  const [requestActivityView, setRequestActivityView] = useState<RequestActivityView>('queue');
+  const [throughputSeries, setThroughputSeries] = useState<ReadonlySet<ThroughputSeriesKey>>(
+    () => new Set(['input', 'decode']),
+  );
+  // Fetch aggregates only when the aggregates view is active. Uses the full
+  // sibling set (across parallelism + concurrency configs) so each chart
+  // shows how the metric varies across the SKU.
+  const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
+  const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+  // Per-request timeline used by the timeline view AND every per-point
+  // request-derived chart (ISL/OSL, latency-over-time, in-flight), so fetch
+  // whenever we're on either view.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
+  const timeline = timelineQuery.data;
+
+  // Warmup vs profiling stage. Only meaningful when the point actually has a
+  // warmup phase (older runs are profiling-only) — when absent the toggle is
+  // hidden and everything falls back to the full (profiling) run.
+  const [phase, setPhase] = useState<StagePhase>('profiling');
+  const hasWarmup = useMemo(() => timelineHasWarmup(timeline), [timeline]);
+  const effectivePhase: StagePhase = hasWarmup ? phase : 'profiling';
+
+  // Server-metric boundary on the chart's own t-axis (rebased through absolute
+  // ns — see phase-slice header for the origin-gap invariant). Request charts
+  // get a phase-scoped timeline (filtered + rebased) so they share a 0-based
+  // axis with the server charts for the selected phase.
+  const boundarySec = useMemo(() => phaseBoundarySec(metrics, timeline), [metrics, timeline]);
+  const phaseTimeline = useMemo(
+    () => (timeline ? sliceTimelineByPhase(timeline, effectivePhase) : null),
+    [timeline, effectivePhase],
+  );
+
+  const metricSources = metrics?.metricSources ?? [];
+  const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId);
+  const baseServerSeries: ServerSeriesLike | undefined = useMemo(() => {
+    const src = metrics?.metricSources?.find((m) => m.source.id === metricSourceId);
+    if (src) {
+      return {
+        kvCacheUsage: src.kvCacheUsage,
+        prefixCacheHitRate: src.prefixCacheHitRate,
+        queueDepth: src.queueDepth,
+        promptTokensBySource: src.promptTokensBySource,
+        prefillTps: src.promptTps,
+        decodeTps: src.generationTps,
+        prefixCacheHitsTps: src.prefixCacheHitsTps,
+        hostKvCacheUsage: src.hostKvCacheUsage,
+        kvCacheUsageByEngine: src.kvCacheUsageByEngine,
+      };
+    }
+    return metrics ?? undefined;
+  }, [metrics, metricSourceId]);
+  // Phase-sliced server series (+ matching durationS) consumed by every server
+  // chart. Null only when there are no server metrics at all.
+  const sliced = useMemo(
+    () =>
+      baseServerSeries
+        ? sliceServerSeriesByPhase(
+            baseServerSeries,
+            effectivePhase,
+            boundarySec,
+            metrics?.durationS ?? 0,
+          )
+        : null,
+    [baseServerSeries, effectivePhase, boundarySec, metrics?.durationS],
+  );
+  // Some runs only scrape server metrics during profiling — `chart_series`
+  // starts at the profiling boundary, so the warmup slice collapses to ~0–1
+  // points (just the t=0 origin) even though request-level warmup data exists.
+  // Require ≥2 points in some series to count as real warmup coverage; otherwise
+  // show an explanatory note instead of six silently-blank charts.
+  const slicedHasServerData =
+    (sliced?.series.kvCacheUsage.length ?? 0) > 1 ||
+    (sliced?.series.queueDepth.length ?? 0) > 1 ||
+    (sliced?.series.prefillTps.length ?? 0) > 1 ||
+    (sliced?.series.prefixCacheHitRate.length ?? 0) > 1;
+
+  return (
+    <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
+      <div className="flex items-center gap-2">
+        <button
+          type="button"
+          onClick={() => router.back()}
+          className="inline-flex items-center gap-1 text-sm text-muted-foreground hover:text-foreground"
+        >
+          <ArrowLeft className="size-4" /> Back
+        </button>
+        <span className="text-sm text-muted-foreground">·</span>
+        <Link href="/inference" className="text-sm text-muted-foreground hover:text-foreground">
+          Inference chart
+        </Link>
+      </div>
+
+      {siblingsData ? (
+        <SiblingNav sku={siblingsData.sku} siblings={siblingsData.siblings} />
+      ) : siblingsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading SKU navigator…</div>
+      ) : null}
+
+      {metrics ? (
+        <PointSummary meta={metrics.meta} />
+      ) : metricsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading point metadata…</div>
+      ) : null}
+
+      {metricsQuery.isError && (
+        <div className="rounded-lg border border-destructive/40 bg-destructive/10 p-4 text-sm text-destructive">
+          Failed to load trace data for benchmark point #{id}.
+        </div>
+      )}
+      {metricsQuery.data === null && !metricsQuery.isLoading && (
+        <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+          No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf
+          time-series capture, or its source artifacts have expired on GitHub.
+        </div>
+      )}
+
+      <div className="flex items-center justify-between gap-3">
+        <SegmentedToggle
+          value={view}
+          options={VIEW_OPTIONS}
+          onValueChange={setView}
+          ariaLabel="Detail view"
+          testId="detail-view-toggle"
+          buttonClassName="px-3 py-1.5 text-sm"
+        />
+        {view === 'aggregates' && (
+          <span className="text-xs text-muted-foreground">
+            {siblingIds.length} configs in SKU
+            {aggregatesQuery.isLoading ? ' · loading…' : ''}
+          </span>
+        )}
+        {view === 'timeline' && timelineQuery.data && (
+          <span className="text-xs text-muted-foreground">
+            {timelineQuery.data.requests.length} requests
+          </span>
+        )}
+      </div>
+
+      {view === 'point' && (metricSources.length > 1 || hasWarmup) && (
+        <MetricSourceToolbar
+          hasWarmup={hasWarmup}
+          phase={phase}
+          onPhaseChange={setPhase}
+          metricSources={metricSources}
+          selectedSource={selectedMetricSource}
+          onSourceChange={setMetricSourceId}
+          fallbackAdapter={metrics?.meta.framework}
+        />
+      )}
+
+      {view === 'aggregates' ? (
+        <AggregatesGrid
+          siblings={siblingsData?.siblings ?? []}
+          aggregates={aggregatesQuery.data}
+          isLoading={aggregatesQuery.isLoading}
+        />
+      ) : view === 'timeline' ? (
+        timelineQuery.isLoading ? (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            Loading request timeline…
+          </div>
+        ) : timelineQuery.data ? (
+          <RequestTimelineView
+            data={timelineQuery.data}
+            datasetSlug={siblingsQuery.data?.sku.dataset_slug}
+            pointId={id}
+          />
+        ) : (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
+            isn&apos;t stored for this row.
+          </div>
+        )
+      ) : (
+        <>
+          {effectivePhase === 'warmup' && (
+            <p
+              className="rounded-md border-l-2 border-amber-500/60 bg-amber-500/10 px-3 py-2 text-xs text-muted-foreground"
+              data-testid="warmup-phase-note"
+            >
+              Showing the <span className="font-medium text-foreground">warmup</span> phase — a
+              cache-warming pass whose outputs are capped at 1 token. Warmup OSL ≈ 1, and
+              interactivity/decode are blank (single-token outputs have no inter-token latency).
+              {!slicedHasServerData &&
+                ' Warmup server-side metrics aren’t available for this point, so the server charts below are empty — the request-level charts above still reflect warmup.'}
+            </p>
+          )}
+          <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+            <SequenceMetricCard
+              metric="isl"
+              timeline={phaseTimeline}
+              timelineLoading={timelineQuery.isLoading}
+            />
+            <SequenceMetricCard
+              metric="osl"
+              timeline={phaseTimeline}
+              timelineLoading={timelineQuery.isLoading}
+            />
+
+            <RequestMetricOverTime
+              title="Interactivity over time"
+              metric="interactivity"
+              timeline={phaseTimeline}
+              isLoading={timelineQuery.isLoading}
+            />
+
+            <RequestMetricOverTime
+              title="TTFT over time"
+              metric="ttft"
+              timeline={phaseTimeline}
+              isLoading={timelineQuery.isLoading}
+              latencySelector
+            />
+
+            <KvCacheUtilizationCard sliced={sliced} />
+
+            <RequestActivityCard
+              sliced={sliced}
+              phaseTimeline={phaseTimeline}
+              timelineLoading={timelineQuery.isLoading}
+              view={requestActivityView}
+              onViewChange={setRequestActivityView}
+            />
+
+            <PrefixCacheHitRateCard sliced={sliced} />
+
+            <ThroughputCard
+              sliced={sliced}
+              selectedSource={selectedMetricSource}
+              selected={throughputSeries}
+              onSelectedChange={setThroughputSeries}
+            />
+
+            <PromptTokenSourceCard sliced={sliced} />
+
+            <CumulativeUniqueInputTokensCard sliced={sliced} />
+
+            <InflightUniqueTokensCard
+              phaseTimeline={phaseTimeline}
+              timelineLoading={timelineQuery.isLoading}
+              kvCachePoolTokens={metrics?.kvCachePoolTokens ?? null}
+            />
+          </div>
+        </>
+      )}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
new file mode 100644
index 00000000..d4526d24
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -0,0 +1,286 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+import { ChartEmpty, PERCENTILE_COLORS } from './chart-shared';
+
+export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99';
+
+interface PercentileLine {
+  key: PercentileKey;
+  /** Display label in legend / tooltip. */
+  label: string;
+  color: string;
+}
+
+const PERCENTILE_LINES: PercentileLine[] = [
+  { key: 'mean', label: 'Mean', color: PERCENTILE_COLORS.mean },
+  { key: 'p50', label: 'P50', color: PERCENTILE_COLORS.p50 },
+  { key: 'p75', label: 'P75', color: PERCENTILE_COLORS.p75 },
+  { key: 'p90', label: 'P90', color: PERCENTILE_COLORS.p90 },
+  { key: 'p99', label: 'P99', color: PERCENTILE_COLORS.p99 },
+];
+
+// Wider bottom/left padding than CHART_PAD — the x-axis carries rotated
+// per-config labels instead of time ticks.
+const PAD = { top: 16, right: 16, bottom: 90, left: 64 };
+
+export interface AggregatePoint {
+  /** Sibling label rendered on x-axis (e.g. "TP8 • c=8"). */
+  label: string;
+  /** Per-percentile value; missing percentiles are dropped from the plot. */
+  values: Partial<Record<PercentileKey, number>>;
+  /** Sibling id — purely informational, used in the tooltip title. */
+  id?: number;
+}
+
+/**
+ * Multi-line chart: one x-position per sibling config, one line per
+ * percentile (mean/p50/p75/p90/p99). Designed for the "Aggregates across
+ * configs" view on the agentic detail page.
+ */
+export function AggregateChart({
+  points,
+  unit,
+  yMax,
+  yFmt,
+  width = 720,
+  height = 320,
+}: {
+  points: readonly AggregatePoint[];
+  unit: string;
+  /** Optional fixed y-axis upper bound (e.g. 1 for percentages). */
+  yMax?: number;
+  /** Optional value formatter (e.g. percentage → "30%"). */
+  yFmt?: (v: number) => string;
+  width?: number;
+  height?: number;
+}) {
+  const W = width;
+  const H = height;
+  const fmt = (v: number) =>
+    yFmt
+      ? yFmt(v)
+      : v >= 10000
+        ? new Intl.NumberFormat('en-US').format(Math.round(v))
+        : v.toFixed(v < 10 ? 2 : 0);
+
+  const computed = useMemo(() => {
+    if (points.length === 0) return null;
+    let yMaxComputed = 0;
+    for (const p of points) {
+      for (const line of PERCENTILE_LINES) {
+        const v = p.values[line.key];
+        if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v;
+      }
+    }
+    const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05);
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    return { yTop, innerW, innerH };
+  }, [points, W, H, yMax]);
+
+  if (!computed) {
+    return <ChartEmpty height={H} />;
+  }
+  const { yTop, innerW, innerH } = computed;
+
+  // X positions: evenly spaced across the inner width.
+  const xOf = (i: number) =>
+    points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW;
+  const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH;
+
+  // 5 y-axis ticks evenly between 0 and yTop.
+  const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4);
+
+  // Resolve hover: snap to nearest sibling index and emit all percentiles
+  // that have data at that x.
+  const resolve = (fraction: number) => {
+    const idx = Math.round(fraction * (points.length - 1));
+    const p = points[Math.max(0, Math.min(points.length - 1, idx))];
+    if (!p) return null;
+    const items: HoverItem[] = [];
+    for (const line of PERCENTILE_LINES) {
+      const v = p.values[line.key];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      items.push({ color: line.color, label: line.label, value: fmt(v) });
+    }
+    return { items, title: p.label };
+  };
+
+  return (
+    <div className="w-full">
+      <div className="mb-2 flex flex-wrap items-center gap-x-3 gap-y-1 text-xs">
+        {PERCENTILE_LINES.map((line) => (
+          <div key={line.key} className="flex items-center gap-1.5">
+            <span className="inline-block w-3 h-0.5" style={{ backgroundColor: line.color }} />
+            <span className="text-muted-foreground">{line.label}</span>
+          </div>
+        ))}
+        <span className="ml-auto text-muted-foreground">
+          {points.length} configs · units: {unit}
+        </span>
+      </div>
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis ticks + gridlines */}
+        {yTicks.map((v, i) => {
+          const y = yOf(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left}
+                x2={PAD.left + innerW}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.08}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X-axis tick labels — one per sibling, rotated 30° to fit. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          return (
+            <g key={`x${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={PAD.top + innerH}
+                y2={PAD.top + innerH + 4}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={x}
+                y={PAD.top + innerH + 8}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor="end"
+                transform={`rotate(-30 ${x} ${PAD.top + innerH + 8})`}
+              >
+                {p.label}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X axis baseline */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.25}
+        />
+
+        {/* Horizontal connecting lines per percentile — faint backdrop so the
+            eye can follow how each percentile changes across configs. */}
+        {PERCENTILE_LINES.map((line) => {
+          const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
+          let prev: { x: number; y: number } | null = null;
+          for (let i = 0; i < points.length; i++) {
+            const v = points[i]!.values[line.key];
+            if (typeof v !== 'number' || !Number.isFinite(v)) {
+              prev = null;
+              continue;
+            }
+            const x = xOf(i);
+            const y = yOf(v);
+            if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
+            prev = { x, y };
+          }
+          return (
+            <g key={`hline-${line.key}`} opacity={0.35}>
+              {segments.map((s, j) => (
+                <line
+                  key={`s${j}`}
+                  x1={s.x1}
+                  y1={s.y1}
+                  x2={s.x2}
+                  y2={s.y2}
+                  stroke={line.color}
+                  strokeWidth={1}
+                />
+              ))}
+            </g>
+          );
+        })}
+
+        {/* Per-sibling vertical bar spanning the percentile range, with a
+            colored tick at each percentile level. Mean rendered as a small
+            diamond to distinguish from the percentile ticks. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          // Collect percentile values present for this sibling.
+          const present = PERCENTILE_LINES.filter(
+            (line) =>
+              typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!),
+          ).map((line) => ({ ...line, value: p.values[line.key]! }));
+          if (present.length === 0) return null;
+          // Only the *percentile* values define the bar extent; mean might be
+          // outside the percentile span on weird distributions.
+          const pctlOnly = present.filter((p2) => p2.key !== 'mean');
+          const bandValues = pctlOnly.length > 0 ? pctlOnly : present;
+          const bandYs = bandValues.map((b) => yOf(b.value));
+          const yLo = Math.min(...bandYs);
+          const yHi = Math.max(...bandYs);
+          return (
+            <g key={`bar-${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={yLo}
+                y2={yHi}
+                stroke="currentColor"
+                strokeWidth={1}
+                opacity={0.35}
+              />
+              {present.map((b) => {
+                const ty = yOf(b.value);
+                if (b.key === 'mean') {
+                  // Diamond marker for mean.
+                  const s = 4;
+                  return (
+                    <polygon
+                      key={`m-${b.key}`}
+                      points={`${x},${ty - s} ${x + s},${ty} ${x},${ty + s} ${x - s},${ty}`}
+                      fill={b.color}
+                      stroke={b.color}
+                    />
+                  );
+                }
+                // Horizontal tick at each percentile.
+                return (
+                  <line
+                    key={`tk-${b.key}`}
+                    x1={x - 6}
+                    x2={x + 6}
+                    y1={ty}
+                    y2={ty}
+                    stroke={b.color}
+                    strokeWidth={2.5}
+                  />
+                );
+              })}
+            </g>
+          );
+        })}
+      </ChartHover>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx b/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx
new file mode 100644
index 00000000..09252940
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx
@@ -0,0 +1,104 @@
+'use client';
+
+import type { AgenticAggregateMap, MetricPercentiles } from '@/hooks/api/use-agentic-aggregates';
+import type { BenchmarkSibling } from '@/hooks/api/use-benchmark-siblings';
+
+import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
+import { CHART_SIZES } from './chart-shared';
+import { ExpandableChart } from './expandable-chart';
+import { chipLabel } from './sibling-nav';
+
+/** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */
+function toAggPoint(
+  sibling: { id: number; label: string },
+  pct: MetricPercentiles | null | undefined,
+): AggregatePoint {
+  const values: Partial<Record<PercentileKey, number>> = {};
+  if (pct) {
+    values.mean = pct.mean;
+    values.p50 = pct.p50;
+    values.p75 = pct.p75;
+    values.p90 = pct.p90;
+    values.p99 = pct.p99;
+  }
+  return { id: sibling.id, label: sibling.label, values };
+}
+
+/** "Aggregates across configs" view: ISL/OSL/KV/prefix stats per SKU sibling. */
+export function AggregatesGrid({
+  siblings,
+  aggregates,
+  isLoading,
+}: {
+  siblings: BenchmarkSibling[];
+  aggregates: AgenticAggregateMap | undefined;
+  isLoading: boolean;
+}) {
+  if (siblings.length === 0) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        SKU sibling list not loaded yet — open a point to populate.
+      </div>
+    );
+  }
+  if (isLoading && !aggregates) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        Computing aggregates across {siblings.length} configs… (parsing trace blobs)
+      </div>
+    );
+  }
+  const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s) }));
+  const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl));
+  const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl));
+  const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil));
+  const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate));
+  return (
+    <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+      <ExpandableChart
+        title="ISL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={islPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="OSL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={oslPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="KV cache utilization (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={kvPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="Prefix cache hit rate (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={prefixPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
new file mode 100644
index 00000000..24270122
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
@@ -0,0 +1,148 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+
+/** Vertical crosshair + floating value tooltip overlay shared by every chart. */
+export interface HoverItem {
+  /** Color swatch to render next to the label. */
+  color: string;
+  label: string;
+  value: string;
+  /** Optional faint secondary line (e.g. timestamp under main values). */
+  hint?: string;
+}
+
+interface ChartHoverProps {
+  /** Padding inside the SVG; matches the chart's CHART_PAD. */
+  pad: { top: number; right: number; bottom: number; left: number };
+  /** SVG viewBox dimensions used to render the chart. */
+  width: number;
+  height: number;
+  /**
+   * Called with the cursor's normalized x in [0..1] across the plot area.
+   * Returns `null` to hide the tooltip (e.g. cursor outside data range).
+   */
+  resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null;
+  children: ReactNode;
+}
+
+/**
+ * Wrap a chart's <svg> render to add mouse-driven crosshair + tooltip.
+ *
+ * The chart owner renders its bars / lines / axes via `children`; this wrapper
+ * adds an invisible <rect> across the plot area to capture pointer events, a
+ * vertical line that follows the cursor, and a floating tooltip on the right
+ * of the cursor (auto-flipping to the left when it would overflow).
+ */
+export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) {
+  const [hover, setHover] = useState<{
+    xPx: number;
+    yPx: number;
+    fraction: number;
+    items: HoverItem[];
+    title?: string;
+  } | null>(null);
+
+  const innerW = width - pad.left - pad.right;
+  const innerH = height - pad.top - pad.bottom;
+
+  const onMove = (e: React.MouseEvent<SVGRectElement>) => {
+    const svg = e.currentTarget.ownerSVGElement;
+    if (!svg) return;
+    const rect = svg.getBoundingClientRect();
+    // Convert client coords → SVG viewBox coords.
+    const sx = ((e.clientX - rect.left) * width) / rect.width;
+    const sy = ((e.clientY - rect.top) * height) / rect.height;
+    const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW));
+    const resolved = resolve(fraction);
+    if (!resolved) {
+      setHover(null);
+      return;
+    }
+    setHover({ xPx: sx, yPx: sy, fraction, items: resolved.items, title: resolved.title });
+  };
+
+  const onLeave = () => setHover(null);
+
+  return (
+    <div className="relative w-full">
+      <svg
+        viewBox={`0 0 ${width} ${height}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+      >
+        {children}
+        {hover && (
+          <line
+            x1={hover.xPx}
+            x2={hover.xPx}
+            y1={pad.top}
+            y2={pad.top + innerH}
+            stroke="currentColor"
+            strokeWidth={1}
+            strokeDasharray="3 3"
+            opacity={0.4}
+            pointerEvents="none"
+          />
+        )}
+        <rect
+          x={pad.left}
+          y={pad.top}
+          width={innerW}
+          height={innerH}
+          fill="transparent"
+          onMouseMove={onMove}
+          onMouseLeave={onLeave}
+        />
+      </svg>
+      {hover && hover.items.length > 0 && (
+        <HoverTooltip
+          xFraction={hover.fraction}
+          containerWidth={width}
+          padLeft={pad.left}
+          innerW={innerW}
+          title={hover.title}
+          items={hover.items}
+        />
+      )}
+    </div>
+  );
+}
+
+function HoverTooltip({
+  xFraction,
+  containerWidth,
+  padLeft,
+  innerW,
+  title,
+  items,
+}: {
+  xFraction: number;
+  containerWidth: number;
+  padLeft: number;
+  innerW: number;
+  title?: string;
+  items: HoverItem[];
+}) {
+  // Position tooltip near the crosshair as a % of the container.
+  // We flip to the cursor's left side when it would overflow the right edge.
+  const xPx = padLeft + xFraction * innerW;
+  const onRight = xPx < containerWidth * 0.55;
+  const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto';
+  const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`;
+  return (
+    <div
+      className="pointer-events-none absolute top-2 z-10 rounded-md border border-border bg-popover px-2 py-1.5 text-xs shadow-md"
+      style={{ left, right, marginLeft: onRight ? 8 : 0, marginRight: onRight ? 0 : 8 }}
+    >
+      {title && <div className="font-medium text-foreground mb-1">{title}</div>}
+      {items.map((it, i) => (
+        <div key={i} className="flex items-center gap-1.5 leading-tight">
+          <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: it.color }} />
+          <span className="text-muted-foreground">{it.label}</span>
+          <span className="ml-auto font-medium text-foreground tabular-nums">{it.value}</span>
+        </div>
+      ))}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/chart-shared.tsx b/packages/app/src/components/inference/agentic-point/chart-shared.tsx
new file mode 100644
index 00000000..f00f4532
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-shared.tsx
@@ -0,0 +1,57 @@
+'use client';
+
+/**
+ * Shared presentational constants and helpers for the agentic point-detail
+ * charts (time-series, stacked-area, distribution, aggregate). These charts
+ * are hand-rolled SVG (not the d3-chart library) and share axis padding,
+ * tick formatting, and empty/loading states.
+ */
+
+/** Axis padding shared by the time-series, stacked-area, and distribution charts. */
+export const CHART_PAD = { top: 12, right: 16, bottom: 56, left: 60 } as const;
+
+/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */
+export const CHART_SIZES = {
+  inline: { width: 720, height: 260 },
+  expanded: { width: 1300, height: 520 },
+} as const;
+
+/**
+ * Guide-line colors per percentile, shared by the aggregate chart's lines and
+ * the distribution chart's vertical guides so the same percentile reads as the
+ * same color across the detail page.
+ */
+export const PERCENTILE_COLORS = {
+  mean: '#ef4444',
+  p50: '#3b82f6',
+  p75: '#22c55e',
+  p90: '#f59e0b',
+  p95: '#ef4444',
+  p99: '#a855f7',
+} as const;
+
+/** Integer tick label: thousands separators only once the value reaches 10000. */
+export const fmtCount = (n: number): string =>
+  n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
+
+/** Seconds → "42s" / "3m 20s" time-axis tick label. */
+export const fmtSeconds = (s: number): string => {
+  if (s < 60) return `${Math.round(s)}s`;
+  const m = Math.floor(s / 60);
+  const rem = Math.round(s % 60);
+  return `${m}m ${rem}s`;
+};
+
+/** "No data" placeholder sized to match the chart it replaces. */
+export function ChartEmpty({ height = 260 }: { height?: number }) {
+  return (
+    <div className="grid place-items-center text-xs text-muted-foreground" style={{ height }}>
+      No data
+    </div>
+  );
+}
+
+/** Loading placeholder for a chart card. */
+export function ChartSkeleton() {
+  return <div className="h-[260px] rounded-md bg-muted/30 animate-pulse" />;
+}
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
new file mode 100644
index 00000000..f55d6131
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
@@ -0,0 +1,53 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetConvId, subagentIdOf } from './request-timeline';
+
+describe('datasetConvId', () => {
+  it('returns a plain conversation id unchanged', () => {
+    expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602')).toBe(
+      '002001296e8a8c38ad9d7cc436d691afc602',
+    );
+  });
+
+  it('strips a ::sa: subagent suffix to the parent conv id', () => {
+    expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe(
+      '002001296e8a8c38ad9d7cc436d691afc602',
+    );
+  });
+
+  it('strips a ::fa: forked-agent suffix', () => {
+    expect(datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBe(
+      '02bc0afb13f7a2d9efa86c28511261d85c0e',
+    );
+  });
+
+  it('strips at the first :: even with a trailing stream index', () => {
+    expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc');
+  });
+});
+
+describe('subagentIdOf', () => {
+  it('returns null for a main-conversation cid', () => {
+    expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602')).toBeNull();
+  });
+
+  it('extracts the subagent id from a ::sa: cid', () => {
+    expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe(
+      'subagent_004_27c95af7',
+    );
+  });
+
+  it('drops a trailing :s<stream> index from the subagent id', () => {
+    expect(subagentIdOf('abc::sa:subagent_001_f552fe6f:s3')).toBe('subagent_001_f552fe6f');
+  });
+
+  it('drops an :aux:<n> stream suffix from the subagent id', () => {
+    expect(subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011')).toBe(
+      'subagent_001_b00fdc12',
+    );
+  });
+
+  it('returns null for a ::fa: forked-agent cid (no matching subagent group)', () => {
+    expect(subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBeNull();
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
new file mode 100644
index 00000000..6573d60c
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -0,0 +1,233 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+import { CHART_PAD, ChartEmpty, PERCENTILE_COLORS, fmtCount } from './chart-shared';
+import { quantile } from './time-series-math';
+
+const PAD = CHART_PAD;
+
+const GUIDES = [
+  { label: 'p50', q: 0.5, color: PERCENTILE_COLORS.p50 },
+  { label: 'p75', q: 0.75, color: PERCENTILE_COLORS.p75 },
+  { label: 'p90', q: 0.9, color: PERCENTILE_COLORS.p90 },
+  { label: 'p95', q: 0.95, color: PERCENTILE_COLORS.p95 },
+] as const;
+
+/**
+ * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
+ * detail-page card — fills its container width via `viewBox` + 100% width.
+ * Hover shows the bin range + count + cumulative percentile.
+ */
+export function Distribution({
+  values,
+  unit,
+  width = 720,
+  height = 260,
+}: {
+  values: readonly number[];
+  unit: string;
+  width?: number;
+  height?: number;
+}) {
+  const W = width;
+  const H = height;
+
+  const computed = useMemo(() => {
+    if (values.length === 0) return null;
+    const sorted = [...values].toSorted((a, b) => a - b);
+    const min = sorted[0]!;
+    const max = sorted.at(-1)!;
+    const range = Math.max(1e-9, max - min);
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
+    const counts: number[] = Array.from({ length: nBins }, () => 0);
+    for (const v of values) {
+      const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+      counts[i]!++;
+    }
+    return { sorted, min, max, range, innerW, innerH, nBins, counts };
+  }, [values, W, H]);
+
+  if (!computed) {
+    return <ChartEmpty />;
+  }
+  const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed;
+  const maxCount = Math.max(...counts, 1);
+  const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+  const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+  const barW = innerW / nBins;
+
+  const fmt = fmtCount;
+
+  // Hover: report the bin range under cursor, its count, and what percentile
+  // the bin's midpoint represents in the empirical distribution.
+  const resolve = (fraction: number) => {
+    const v = min + fraction * range;
+    const binIdx = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+    const binLo = min + (binIdx * range) / nBins;
+    const binHi = min + ((binIdx + 1) * range) / nBins;
+    const count = counts[binIdx] ?? 0;
+    // Cumulative % at the bin's right edge.
+    let cumCount = 0;
+    for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 0;
+    const cumPct = (cumCount / values.length) * 100;
+    const items: HoverItem[] = [
+      { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` },
+      { color: 'currentColor', label: 'Count', value: count.toLocaleString() },
+      { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` },
+    ];
+    return { items };
+  };
+
+  const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
+
+  return (
+    <div className="w-full">
+      <div className="mb-2 text-xs text-muted-foreground">
+        {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit}
+      </div>
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis gridlines + labels */}
+        {yTickVals.map((v, i) => {
+          const y = yScale(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left - 4}
+                x2={PAD.left}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* Bars */}
+        {counts.map((c, i) => {
+          const h = (c / maxCount) * innerH;
+          const x = PAD.left + i * barW;
+          const y = PAD.top + (innerH - h);
+          return (
+            <rect
+              key={i}
+              x={x}
+              y={y}
+              width={Math.max(0, barW - 1)}
+              height={h}
+              fill="currentColor"
+              opacity={0.55}
+            />
+          );
+        })}
+
+        {/* Percentile guide lines */}
+        {GUIDES.map(({ q, color }) => {
+          const v = quantile(sorted, q);
+          const x = xScale(v);
+          return (
+            <line
+              key={q}
+              x1={x}
+              x2={x}
+              y1={PAD.top}
+              y2={PAD.top + innerH}
+              stroke={color}
+              strokeWidth={2}
+              strokeDasharray="5 3"
+              opacity={0.95}
+            />
+          );
+        })}
+
+        {/* X axis */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.2}
+        />
+        {xTickVals.map((v, i) => {
+          const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+          return (
+            <text
+              key={`x${i}`}
+              x={xScale(v)}
+              y={PAD.top + innerH + 14}
+              fontSize={11}
+              fill="currentColor"
+              opacity={0.7}
+              textAnchor={anchor}
+            >
+              {fmt(v)}
+            </text>
+          );
+        })}
+        <text
+          x={W / 2}
+          y={H - 22}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+        >
+          value ({unit})
+        </text>
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          count
+        </text>
+
+        {/* Percentile legend chips */}
+        {(() => {
+          const chipY = H - 8;
+          const chipW = innerW / GUIDES.length;
+          return GUIDES.map(({ label: ql, q, color }, i) => {
+            const v = quantile(sorted, q);
+            const x = PAD.left + i * chipW;
+            return (
+              <g key={ql}>
+                <line
+                  x1={x + 2}
+                  x2={x + 14}
+                  y1={chipY - 4}
+                  y2={chipY - 4}
+                  stroke={color}
+                  strokeWidth={2}
+                  strokeDasharray="5 3"
+                />
+                <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                  {ql} {fmt(v)}
+                </text>
+              </g>
+            );
+          });
+        })()}
+      </ChartHover>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
new file mode 100644
index 00000000..cb5987ec
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -0,0 +1,56 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+import { Maximize2 } from 'lucide-react';
+
+import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog';
+
+/**
+ * Wraps a chart in a card with a header + expand button. Click the button to
+ * open the chart in a large dialog. The `render` prop receives `expanded:true`
+ * inside the dialog so charts can pick larger width/height.
+ */
+export function ExpandableChart({
+  title,
+  render,
+  controls,
+  testId,
+}: {
+  title: string;
+  render: (expanded: boolean) => ReactNode;
+  controls?: ReactNode;
+  testId?: string;
+}) {
+  const [open, setOpen] = useState(false);
+
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4" data-testid={testId}>
+      <div className="flex items-start justify-between mb-3 gap-2">
+        <h2 className="text-sm font-semibold text-foreground">{title}</h2>
+        <div className="flex items-center gap-2">
+          {controls}
+          <button
+            type="button"
+            aria-label="Expand chart"
+            onClick={() => setOpen(true)}
+            className="text-muted-foreground hover:text-foreground transition-colors"
+          >
+            <Maximize2 className="size-4" />
+          </button>
+        </div>
+      </div>
+      {render(false)}
+      <Dialog open={open} onOpenChange={setOpen}>
+        <DialogContent className="max-w-[min(96vw,1400px)] w-[min(96vw,1400px)]">
+          <DialogHeader>
+            <div className="flex items-center justify-between gap-3 pr-8">
+              <DialogTitle>{title}</DialogTitle>
+              {controls}
+            </div>
+          </DialogHeader>
+          <div className="w-full">{render(true)}</div>
+        </DialogContent>
+      </Dialog>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx b/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
new file mode 100644
index 00000000..e56ddeee
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
@@ -0,0 +1,130 @@
+'use client';
+
+import type { MetricSource, MetricSourceSeries } from '@/hooks/api/use-trace-server-metrics';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
+
+import type { StagePhase } from './phase-slice';
+
+const SOURCE_ROLE_LABEL: Record<MetricSource['role'], string> = {
+  router: 'Router',
+  prefill: 'Prefill',
+  decode: 'Decode',
+  combined: 'Combined',
+  unknown: 'Unknown',
+};
+
+/** "Role · instance" label for one server-metrics endpoint. */
+export function metricSourceLabel(source: MetricSource): string {
+  const instance =
+    source.workerId ??
+    (source.dpRank ? `DP ${source.dpRank}` : null) ??
+    source.endpointUrl ??
+    (source.engine ? `engine ${source.engine}` : null);
+  return instance
+    ? `${SOURCE_ROLE_LABEL[source.role]} · ${instance}`
+    : SOURCE_ROLE_LABEL[source.role];
+}
+
+// Warmup vs profiling stage selector. Drives the server-metric charts AND the
+// request-derived charts (ISL/OSL, latency-over-time, in-flight). Only shown
+// when the point actually has a warmup phase.
+const STAGE_PHASE_OPTIONS: SegmentedToggleOption<StagePhase>[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'stage-phase-profiling' },
+  { value: 'warmup', label: 'Warmup', testId: 'stage-phase-warmup' },
+];
+
+/**
+ * Sticky per-point toolbar: warmup/profiling stage toggle (when the point has
+ * a warmup phase) and the server-metrics endpoint selector (when the point has
+ * more than one source). The parent decides when to render it at all.
+ */
+export function MetricSourceToolbar({
+  hasWarmup,
+  phase,
+  onPhaseChange,
+  metricSources,
+  selectedSource,
+  onSourceChange,
+  fallbackAdapter,
+}: {
+  hasWarmup: boolean;
+  phase: StagePhase;
+  onPhaseChange: (phase: StagePhase) => void;
+  metricSources: MetricSourceSeries[];
+  selectedSource: MetricSourceSeries | undefined;
+  onSourceChange: (id: string) => void;
+  /** Adapter reported in analytics when the selected source lookup misses. */
+  fallbackAdapter: string | undefined;
+}) {
+  return (
+    <div
+      className="sticky top-16 z-40 flex items-center justify-between gap-2 rounded-lg border border-border/40 bg-background/90 px-3 py-2 shadow-sm backdrop-blur"
+      data-testid="metric-source-toolbar"
+    >
+      {hasWarmup ? (
+        <div className="flex items-center gap-2">
+          <span className="text-xs text-muted-foreground">Stage</span>
+          <SegmentedToggle
+            value={phase}
+            options={STAGE_PHASE_OPTIONS}
+            onValueChange={(value) => {
+              onPhaseChange(value);
+              track('inference_agentic_phase_changed', { phase: value });
+            }}
+            ariaLabel="Stage phase"
+            testId="stage-phase-toggle"
+            buttonClassName="px-2.5 py-1 text-xs"
+          />
+        </div>
+      ) : (
+        <span />
+      )}
+      {metricSources.length > 1 ? (
+        <div className="flex items-center gap-2">
+          <span className="text-xs text-muted-foreground">Server metrics</span>
+          <Select
+            value={selectedSource?.source.id ?? 'all'}
+            onValueChange={(value) => {
+              onSourceChange(value);
+              const source = metricSources.find((entry) => entry.source.id === value)?.source;
+              track('inference_agentic_metric_source_changed', {
+                source: value,
+                role: source?.role ?? 'all',
+                adapter: source?.adapter ?? fallbackAdapter ?? 'unknown',
+              });
+            }}
+          >
+            <SelectTrigger
+              size="sm"
+              className="max-w-72"
+              aria-label="Server metrics source"
+              data-testid="metric-source-select"
+            >
+              <SelectValue />
+            </SelectTrigger>
+            <SelectContent>
+              <SelectItem value="all">All endpoints</SelectItem>
+              {metricSources.map(({ source }) => (
+                <SelectItem
+                  key={source.id}
+                  value={source.id}
+                  title={source.endpointUrl ?? undefined}
+                >
+                  {metricSourceLabel(source)}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
+      ) : null}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.test.ts b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts
new file mode 100644
index 00000000..ef6cdaab
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts
@@ -0,0 +1,212 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline';
+import {
+  phaseBoundaryNs,
+  phaseBoundarySec,
+  requestsForPhase,
+  sliceServerSeriesByPhase,
+  sliceTimelineByPhase,
+  timelineHasWarmup,
+  type ServerSeriesLike,
+} from './phase-slice';
+
+function req(overrides: Partial<RequestRecord>): RequestRecord {
+  return {
+    cid: 'c',
+    ti: 0,
+    wid: 'w',
+    ad: 0,
+    phase: 'profiling',
+    credit: 0,
+    start: 0,
+    ack: null,
+    end: 1,
+    ttftMs: null,
+    tpotMs: null,
+    isl: null,
+    osl: null,
+    cancelled: false,
+    ...overrides,
+  };
+}
+
+function timeline(requests: RequestRecord[], startNs = 1_000): RequestTimeline {
+  return { version: 3, startNs, endNs: startNs + 1, durationS: 1, requests };
+}
+
+function makeSeries(ts: number[]): ServerSeriesLike {
+  const pts = ts.map((t) => ({ t, value: t * 10 }));
+  return {
+    kvCacheUsage: pts,
+    prefixCacheHitRate: pts,
+    queueDepth: ts.map((t) => ({ t, running: t, waiting: t + 1, total: 2 * t + 1 })),
+    promptTokensBySource: { src: pts },
+    prefillTps: pts,
+    decodeTps: pts,
+    prefixCacheHitsTps: pts,
+    hostKvCacheUsage: pts,
+    kvCacheUsageByEngine: [{ engineLabel: 'e0', points: pts }],
+  };
+}
+
+describe('phaseBoundaryNs', () => {
+  it('returns null when there are no profiling requests', () => {
+    expect(phaseBoundaryNs(timeline([req({ phase: 'warmup', start: 5 })]))).toBeNull();
+  });
+
+  it('returns null when there are no warmup requests', () => {
+    expect(phaseBoundaryNs(timeline([req({ phase: 'profiling', start: 5 })]))).toBeNull();
+  });
+
+  it('returns startNs + earliest profiling start when both phases present', () => {
+    const t = timeline(
+      [
+        req({ phase: 'warmup', start: 0 }),
+        req({ phase: 'profiling', start: 900 }),
+        req({ phase: 'profiling', start: 700 }),
+      ],
+      1_000,
+    );
+    expect(phaseBoundaryNs(t)).toBe(1_700);
+  });
+
+  it('returns null for nullish timeline', () => {
+    expect(phaseBoundaryNs(null)).toBeNull();
+    expect(phaseBoundaryNs(undefined)).toBeNull();
+  });
+});
+
+describe('phaseBoundarySec', () => {
+  it('rebases through absolute ns by subtracting serverMetrics.startNs (origin gap)', () => {
+    // timeline origin and server-metrics origin differ — the classic ~124s gap.
+    const tl = timeline(
+      [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 600 * 1e9 })],
+      200 * 1e9, // timeline.startNs
+    );
+    // boundaryNs = 200e9 + 600e9 = 800e9 ; serverMetrics origin = 124e9 earlier
+    const boundarySec = phaseBoundarySec({ startNs: 76 * 1e9 }, tl);
+    // (800e9 - 76e9)/1e9 = 724
+    expect(boundarySec).toBe(724);
+  });
+
+  it('clamps a negative mapping to 0', () => {
+    const tl = timeline(
+      [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 0 })],
+      0,
+    );
+    expect(phaseBoundarySec({ startNs: 5 * 1e9 }, tl)).toBe(0);
+  });
+
+  it('returns null when serverMetrics missing or no split', () => {
+    const tl = timeline(
+      [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 1e9 })],
+      0,
+    );
+    expect(phaseBoundarySec(null, tl)).toBeNull();
+    expect(phaseBoundarySec({ startNs: 0 }, timeline([req({ phase: 'profiling' })]))).toBeNull();
+  });
+});
+
+describe('timelineHasWarmup', () => {
+  it('detects warmup presence', () => {
+    expect(timelineHasWarmup(timeline([req({ phase: 'profiling' })]))).toBe(false);
+    expect(timelineHasWarmup(timeline([req({ phase: 'warmup' })]))).toBe(true);
+    expect(timelineHasWarmup(null)).toBe(false);
+  });
+});
+
+describe('sliceServerSeriesByPhase', () => {
+  it('is an identity passthrough (full duration) when boundary is null', () => {
+    const s = makeSeries([0, 1, 2]);
+    const out = sliceServerSeriesByPhase(s, 'profiling', null, 99);
+    expect(out.series).toBe(s);
+    expect(out.durationS).toBe(99);
+  });
+
+  it('warmup keeps t < boundary, no rebase, durationS = boundary', () => {
+    const s = makeSeries([0, 1, 2, 3, 4]);
+    const out = sliceServerSeriesByPhase(s, 'warmup', 2, 5);
+    expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1]); // excludes t===2
+    expect(out.durationS).toBe(2);
+  });
+
+  it('profiling keeps t >= boundary and rebases to start at 0', () => {
+    const s = makeSeries([0, 1, 2, 3, 4]);
+    const out = sliceServerSeriesByPhase(s, 'profiling', 2, 5);
+    expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1, 2]); // 2,3,4 -> 0,1,2
+    expect(out.series.kvCacheUsage.map((p) => p.value)).toEqual([20, 30, 40]); // values preserved
+    expect(out.durationS).toBe(3); // 5 - 2
+  });
+
+  it('slices queueDepth, promptTokensBySource, and kvCacheUsageByEngine; preserves queue fields', () => {
+    const s = makeSeries([0, 1, 2, 3]);
+    const out = sliceServerSeriesByPhase(s, 'profiling', 2, 4);
+    expect(out.series.queueDepth).toEqual([
+      { t: 0, running: 2, waiting: 3, total: 5 },
+      { t: 1, running: 3, waiting: 4, total: 7 },
+    ]);
+    expect(out.series.promptTokensBySource.src.map((p) => p.t)).toEqual([0, 1]);
+    expect(out.series.kvCacheUsageByEngine[0]!.points.map((p) => p.t)).toEqual([0, 1]);
+    expect(out.series.kvCacheUsageByEngine[0]!.engineLabel).toBe('e0');
+  });
+
+  it('does not mutate the input series', () => {
+    const s = makeSeries([0, 1, 2]);
+    const before = s.kvCacheUsage.map((p) => p.t);
+    sliceServerSeriesByPhase(s, 'profiling', 1, 3);
+    expect(s.kvCacheUsage.map((p) => p.t)).toEqual(before);
+  });
+});
+
+describe('requestsForPhase', () => {
+  const rs = [
+    req({ phase: 'warmup', isl: 1 }),
+    req({ phase: 'profiling', isl: 2 }),
+    req({ phase: 'unknown', isl: 3 }),
+  ];
+
+  it('profiling selects only profiling rows', () => {
+    expect(requestsForPhase(rs, 'profiling').map((r) => r.isl)).toEqual([2]);
+  });
+
+  it('warmup selects everything that is not profiling', () => {
+    expect(requestsForPhase(rs, 'warmup').map((r) => r.isl)).toEqual([1, 3]);
+  });
+});
+
+describe('sliceTimelineByPhase', () => {
+  // startNs origin = 1000; warmup request at offset 0..50, profiling at 100..300.
+  const tl = timeline(
+    [
+      req({ phase: 'warmup', credit: 0, start: 0, ack: 10, end: 50, isl: 1 }),
+      req({ phase: 'profiling', credit: 90, start: 100, ack: 120, end: 300, isl: 2 }),
+    ],
+    1_000,
+  );
+  // tl.durationS default = 1 from helper; override for window math.
+  const tlDur: RequestTimeline = { ...tl, durationS: 3 };
+
+  it('returns the input unchanged for a single-phase timeline', () => {
+    const single = timeline([req({ phase: 'profiling', start: 5 })]);
+    expect(sliceTimelineByPhase(single, 'profiling')).toBe(single);
+  });
+
+  it('warmup keeps pre-boundary requests, no rebase, startNs unchanged', () => {
+    const out = sliceTimelineByPhase(tlDur, 'warmup');
+    expect(out.requests.map((r) => r.isl)).toEqual([1]);
+    expect(out.requests[0]!.start).toBe(0); // not rebased
+    expect(out.startNs).toBe(1_000);
+  });
+
+  it('profiling keeps post-boundary requests and rebases offsets + startNs', () => {
+    const out = sliceTimelineByPhase(tlDur, 'profiling');
+    expect(out.requests.map((r) => r.isl)).toEqual([2]);
+    // boundary offset = 100 → rebased: start 100→0, end 300→200, ack 120→20, credit 90→-10
+    expect(out.requests[0]!.start).toBe(0);
+    expect(out.requests[0]!.end).toBe(200);
+    expect(out.requests[0]!.ack).toBe(20);
+    // startNs shifts forward by the boundary offset so absolute time is preserved
+    expect(out.startNs).toBe(1_100);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.ts b/packages/app/src/components/inference/agentic-point/phase-slice.ts
new file mode 100644
index 00000000..e6e17719
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/phase-slice.ts
@@ -0,0 +1,188 @@
+/**
+ * Warmup vs profiling phase slicing for the agentic per-point detail page.
+ *
+ * Agentic trace-replay runs have two phases: a warmup (cache-warming) pass, then
+ * the measured profiling window. The server-metric time-series (`chart_series`)
+ * spans the whole run with no per-point phase label, but the per-request
+ * `request_timeline` IS phase-tagged. We derive the warmup→profiling boundary
+ * from the timeline and slice the server series at it.
+ *
+ * ⚠️ ORIGIN-GAP INVARIANT: the two payloads share the aiperf clock but have
+ * DIFFERENT zero origins — `serverMetrics.startNs` is the first server scrape,
+ * `timeline.startNs` is the first request's credit (observed ~124 s apart in
+ * real runs). The boundary must therefore be rebased through absolute ns by
+ * subtracting `serverMetrics.startNs`; a same-axis offset comparison would be
+ * off by the origin gap. This rebasing lives in `phaseBoundarySec` only.
+ */
+
+import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline';
+import type {
+  QueueDepthPoint,
+  TimeSeriesPoint,
+  TraceServerMetrics,
+} from '@/hooks/api/use-trace-server-metrics';
+
+export type StagePhase = 'warmup' | 'profiling';
+
+/**
+ * The subset of server-metric series the per-point charts render. Both the
+ * top-level `TraceServerMetrics` and a per-source object (after the detail page
+ * remaps `promptTps`→`prefillTps`, `generationTps`→`decodeTps`) are assignable.
+ */
+export interface ServerSeriesLike {
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+/** True when the timeline contains at least one non-profiling (warmup) request. */
+export function timelineHasWarmup(timeline: RequestTimeline | null | undefined): boolean {
+  return Boolean(timeline?.requests.some((r) => r.phase !== 'profiling'));
+}
+
+/**
+ * Absolute-ns wall-clock instant where the profiling phase begins
+ * = `timeline.startNs + earliest profiling request's start offset`.
+ * Returns null unless BOTH a warmup and a profiling request exist (nothing to
+ * split otherwise).
+ */
+export function phaseBoundaryNs(timeline: RequestTimeline | null | undefined): number | null {
+  if (!timeline) return null;
+  let hasWarmup = false;
+  let minProfilingStart: number | null = null;
+  for (const r of timeline.requests) {
+    if (r.phase === 'profiling') {
+      if (minProfilingStart === null || r.start < minProfilingStart) minProfilingStart = r.start;
+    } else {
+      hasWarmup = true;
+    }
+  }
+  if (!hasWarmup || minProfilingStart === null) return null;
+  return timeline.startNs + minProfilingStart;
+}
+
+/**
+ * The profiling-start boundary expressed on the SERVER-METRIC chart's own t-axis
+ * (seconds from `serverMetrics.startNs`). See the origin-gap invariant at the top
+ * of the file — the `- serverMetrics.startNs` subtraction is mandatory.
+ *
+ * Returns null when there's no warmup/profiling split, or `serverMetrics` is
+ * absent (→ callers fall back to the full-run series).
+ */
+export function phaseBoundarySec(
+  serverMetrics: Pick<TraceServerMetrics, 'startNs'> | null | undefined,
+  timeline: RequestTimeline | null | undefined,
+): number | null {
+  if (!serverMetrics) return null;
+  const boundaryNs = phaseBoundaryNs(timeline);
+  if (boundaryNs === null) return null;
+  return Math.max(0, (boundaryNs - serverMetrics.startNs) / 1e9);
+}
+
+export interface PhaseSlicedSeries<S> {
+  series: S;
+  durationS: number;
+}
+
+/**
+ * Slice every server-metric series to one phase:
+ *  - warmup:    keep points with `t < boundary`, no rebase, `durationS = boundary`
+ *  - profiling: keep points with `t >= boundary`, rebased so `t` starts at 0,
+ *               `durationS = full - boundary`
+ *
+ * A point exactly at `t === boundary` belongs to profiling. Null boundary
+ * (single-phase point, or no server metrics) → identity passthrough with the
+ * full `durationS`. Pure — returns new objects, never mutates the input.
+ *
+ * NOTE: rebasing the profiling slice to start at 0 makes the cumulative charts
+ * (prompt-token source, unique-input-tokens) read as "since profiling start"
+ * rather than "since run start" — intended.
+ */
+export function sliceServerSeriesByPhase<S extends ServerSeriesLike>(
+  series: S,
+  phase: StagePhase,
+  boundarySec: number | null,
+  fullDurationS: number,
+): PhaseSlicedSeries<S> {
+  if (boundarySec === null) return { series, durationS: fullDurationS };
+  const b = boundarySec;
+  const keep = phase === 'warmup' ? (t: number) => t < b : (t: number) => t >= b;
+  const rebase = phase === 'profiling' ? (t: number) => t - b : (t: number) => t;
+
+  const sliceTs = (pts: TimeSeriesPoint[]): TimeSeriesPoint[] =>
+    pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) }));
+  const sliceQd = (pts: QueueDepthPoint[]): QueueDepthPoint[] =>
+    pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) }));
+  const sliceRecord = (
+    rec: Record<string, TimeSeriesPoint[]>,
+  ): Record<string, TimeSeriesPoint[]> => {
+    const out: Record<string, TimeSeriesPoint[]> = {};
+    for (const [k, v] of Object.entries(rec)) out[k] = sliceTs(v);
+    return out;
+  };
+
+  const slicedFields: ServerSeriesLike = {
+    kvCacheUsage: sliceTs(series.kvCacheUsage),
+    prefixCacheHitRate: sliceTs(series.prefixCacheHitRate),
+    queueDepth: sliceQd(series.queueDepth),
+    promptTokensBySource: sliceRecord(series.promptTokensBySource),
+    prefillTps: sliceTs(series.prefillTps),
+    decodeTps: sliceTs(series.decodeTps),
+    prefixCacheHitsTps: sliceTs(series.prefixCacheHitsTps),
+    hostKvCacheUsage: sliceTs(series.hostKvCacheUsage),
+    kvCacheUsageByEngine: series.kvCacheUsageByEngine.map((e) => ({
+      engineLabel: e.engineLabel,
+      points: sliceTs(e.points),
+    })),
+  };
+
+  const durationS = phase === 'warmup' ? b : Math.max(1, fullDurationS - b);
+  return { series: { ...series, ...slicedFields } as S, durationS };
+}
+
+/** Filter request-timeline records to one phase (warmup = anything not profiling). */
+export function requestsForPhase(requests: RequestRecord[], phase: StagePhase): RequestRecord[] {
+  return phase === 'warmup'
+    ? requests.filter((r) => r.phase !== 'profiling')
+    : requests.filter((r) => r.phase === 'profiling');
+}
+
+/**
+ * Scope a whole request timeline to one phase: keep only that phase's requests
+ * and, for profiling, rebase every ns offset (and `startNs`) so the phase starts
+ * at t=0 — mirroring `sliceServerSeriesByPhase` so the request-derived charts and
+ * the server charts share a 0-based axis for the same phase. `durationS` becomes
+ * the phase window. Returns the input unchanged when there's no warmup/profiling
+ * split (single-phase point). Pure — new object, original untouched.
+ *
+ * The boundary here is on the REQUEST clock (offset from `timeline.startNs`), so
+ * we use `phaseBoundaryNs` minus `timeline.startNs` rather than the server-axis
+ * `phaseBoundarySec` (different origin — see the file header).
+ */
+export function sliceTimelineByPhase(
+  timeline: RequestTimeline,
+  phase: StagePhase,
+): RequestTimeline {
+  const boundaryNs = phaseBoundaryNs(timeline);
+  if (boundaryNs === null) return timeline;
+  const boundaryOff = boundaryNs - timeline.startNs; // ns offset on the request clock
+  const inPhase = (r: RequestRecord) =>
+    phase === 'warmup' ? r.start < boundaryOff : r.start >= boundaryOff;
+  const shift = phase === 'profiling' ? boundaryOff : 0;
+  const requests = timeline.requests.filter(inPhase).map((r) => ({
+    ...r,
+    credit: r.credit - shift,
+    start: r.start - shift,
+    ack: r.ack === null ? null : r.ack - shift,
+    end: r.end - shift,
+  }));
+  const durationS =
+    phase === 'warmup' ? boundaryOff / 1e9 : Math.max(1, timeline.durationS - boundaryOff / 1e9);
+  return { ...timeline, startNs: timeline.startNs + shift, requests, durationS };
+}
diff --git a/packages/app/src/components/inference/agentic-point/point-summary.tsx b/packages/app/src/components/inference/agentic-point/point-summary.tsx
new file mode 100644
index 00000000..8a777baa
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/point-summary.tsx
@@ -0,0 +1,50 @@
+'use client';
+
+import type { ReactNode } from 'react';
+
+import type { PointMeta } from '@/hooks/api/use-trace-server-metrics';
+
+const fmtPct = (v: number | null | undefined): string =>
+  v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`;
+
+function MetaLine({ label, value }: { label: string; value: ReactNode }) {
+  return (
+    <div className="flex flex-col gap-0.5">
+      <span className="text-xs uppercase tracking-wide text-muted-foreground">{label}</span>
+      <span className="text-sm font-medium text-foreground">{value}</span>
+    </div>
+  );
+}
+
+/** Selected-point header: config facts (offload, concurrency, cache hit rates, ISL/OSL). */
+export function PointSummary({ meta }: { meta: PointMeta }) {
+  return (
+    <div className="mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-2">
+        <p className="text-sm text-muted-foreground">
+          Selected point
+          {meta.disagg ? ' · disagg' : ''}
+          {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''}
+        </p>
+        {meta.run_url && (
+          <a
+            href={meta.run_url}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="text-xs text-muted-foreground hover:text-foreground underline"
+          >
+            GitHub Actions run →
+          </a>
+        )}
+      </div>
+      <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-6 gap-3">
+        <MetaLine label="Offload" value={(meta.offload_mode ?? 'off').toUpperCase()} />
+        <MetaLine label="Concurrency" value={meta.conc} />
+        <MetaLine label="GPU cache hit" value={fmtPct(meta.server_gpu_cache_hit_rate)} />
+        <MetaLine label="CPU cache hit" value={fmtPct(meta.server_cpu_cache_hit_rate)} />
+        {meta.isl !== null && <MetaLine label="ISL" value={meta.isl} />}
+        {meta.osl !== null && <MetaLine label="OSL" value={meta.osl} />}
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx b/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx
new file mode 100644
index 00000000..8ca85ac9
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx
@@ -0,0 +1,223 @@
+'use client';
+
+import { useState } from 'react';
+
+import type { RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { CHART_SIZES, ChartEmpty, ChartSkeleton } from './chart-shared';
+import { Distribution } from './distribution';
+import { ExpandableChart } from './expandable-chart';
+import { TimeSeriesChart } from './time-series-chart';
+import {
+  averageSequenceLengthInFlight,
+  rollingRequestMetric,
+  timeRollingAverage,
+  type RequestMetric,
+  type RequestPercentile,
+} from './time-series-math';
+
+const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption<RequestPercentile>[] = [
+  { value: 'p75', label: 'P75' },
+  { value: 'p90', label: 'P90' },
+];
+
+const LATENCY_METRIC_OPTIONS: SegmentedToggleOption<'ttft' | 'e2e'>[] = [
+  { value: 'ttft', label: 'TTFT', testId: 'latency-metric-ttft' },
+  { value: 'e2e', label: 'E2E', testId: 'latency-metric-e2e' },
+];
+
+type SequenceMetricView = 'distribution' | 'inflight';
+
+const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption<SequenceMetricView>[] = [
+  { value: 'distribution', label: 'Distribution' },
+  { value: 'inflight', label: 'In-flight avg' },
+];
+
+// Unofficial-run overlays cannot open this persisted point-detail route: they
+// have no benchmark_results id or stored request timeline. These charts are
+// therefore intentionally limited to DB-backed agentic points.
+export function RequestMetricOverTime({
+  title,
+  metric,
+  timeline,
+  isLoading,
+  latencySelector = false,
+}: {
+  title: string;
+  metric: RequestMetric;
+  timeline: RequestTimeline | null | undefined;
+  isLoading: boolean;
+  latencySelector?: boolean;
+}) {
+  const [percentile, setPercentile] = useState<RequestPercentile>('p90');
+  const [latencyMetric, setLatencyMetric] = useState<'ttft' | 'e2e'>('ttft');
+  const selectedMetric = latencySelector ? latencyMetric : metric;
+  const result = timeline
+    ? rollingRequestMetric(timeline.requests, selectedMetric, percentile, 50)
+    : null;
+  const metricLabel =
+    selectedMetric === 'ttft' ? 'TTFT' : selectedMetric === 'e2e' ? 'E2E latency' : 'Interactivity';
+  const color =
+    selectedMetric === 'ttft' ? '#f59e0b' : selectedMetric === 'e2e' ? '#a855f7' : '#06b6d4';
+  const pointCount = result?.raw.length;
+  const isLatency = selectedMetric !== 'interactivity';
+
+  const controls = (
+    <div className="flex items-center gap-2">
+      {latencySelector && (
+        <SegmentedToggle
+          value={latencyMetric}
+          options={LATENCY_METRIC_OPTIONS}
+          onValueChange={(value) => {
+            setLatencyMetric(value);
+            track('inference_agentic_latency_metric_changed', { metric: value });
+          }}
+          ariaLabel="Latency metric"
+          testId="latency-metric-toggle"
+        />
+      )}
+      <span
+        className="text-xs tabular-nums text-muted-foreground"
+        data-testid={`${selectedMetric}-point-count`}
+      >
+        {pointCount === undefined
+          ? '— points'
+          : `${pointCount.toLocaleString()} ${pointCount === 1 ? 'point' : 'points'}`}
+      </span>
+      <SegmentedToggle
+        value={percentile}
+        options={REQUEST_PERCENTILE_OPTIONS}
+        onValueChange={(value) => {
+          setPercentile(value);
+          track('inference_agentic_percentile_changed', {
+            metric: selectedMetric,
+            percentile: value,
+          });
+        }}
+        ariaLabel={`${metricLabel} percentile`}
+        testId={`${selectedMetric}-percentile-toggle`}
+      />
+    </div>
+  );
+
+  return (
+    <ExpandableChart
+      title={latencySelector ? `${metricLabel} over time` : title}
+      controls={controls}
+      testId={`${metric}-over-time-chart`}
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!timeline) return isLoading ? <ChartSkeleton /> : <ChartEmpty />;
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: `${percentile.toUpperCase()} (rolling 50 req)`,
+                data: result?.trend ?? [],
+                rawData: result?.raw,
+                color,
+                strokeWidth: 2.5,
+              },
+              {
+                name: isLatency
+                  ? `Cumulative ${percentile.toUpperCase()} ${metricLabel}`
+                  : `1 / cumulative ${percentile.toUpperCase()} TPOT`,
+                data: result?.cumulative ?? [],
+                color: '#ef4444',
+                strokeWidth: 3,
+              },
+            ]}
+            durationS={timeline.durationS}
+            yFmt={
+              isLatency
+                ? (value) => `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s`
+                : (value) => `${value.toFixed(0)}`
+            }
+            yAxisLabel={isLatency ? `${metricLabel} (s)` : 'Interactivity (tok/s/user)'}
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function SequenceMetricCard({
+  metric,
+  timeline,
+  timelineLoading,
+}: {
+  metric: 'isl' | 'osl';
+  /** Phase-scoped timeline — distribution values + in-flight are both derived from it. */
+  timeline: RequestTimeline | null | undefined;
+  timelineLoading: boolean;
+}) {
+  const [view, setView] = useState<SequenceMetricView>('distribution');
+  const acronym = metric.toUpperCase();
+  const fullName = metric === 'isl' ? 'Input sequence length' : 'Output sequence length';
+  const testPrefix = `${metric}-metric`;
+  // Per-request ISL/OSL for the selected phase (request_timeline carries both,
+  // so the distribution honours the warmup/profiling toggle for free).
+  const values = timeline
+    ? timeline.requests
+        .map((r) => r[metric])
+        .filter((v): v is number => typeof v === 'number' && Number.isFinite(v))
+    : undefined;
+  return (
+    <ExpandableChart
+      title={view === 'distribution' ? `${fullName} distribution` : `Average ${acronym} in flight`}
+      testId={`${testPrefix}-chart`}
+      controls={
+        <SegmentedToggle
+          value={view}
+          options={SEQUENCE_METRIC_OPTIONS.map((option) => ({
+            ...option,
+            testId: `${testPrefix}-${option.value}`,
+          }))}
+          onValueChange={(value) => {
+            setView(value);
+            track('inference_agentic_sequence_metric_view_changed', { metric, view: value });
+          }}
+          ariaLabel={`${acronym} chart view`}
+          testId={`${testPrefix}-toggle`}
+          buttonClassName="px-2 py-1 text-xs"
+        />
+      }
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (view === 'distribution') {
+          if (values && values.length > 0)
+            return <Distribution values={values} unit="tokens" {...size} />;
+          return timelineLoading ? <ChartSkeleton /> : <ChartEmpty />;
+        }
+        if (!timeline) return timelineLoading ? <ChartSkeleton /> : <ChartEmpty />;
+        const raw = averageSequenceLengthInFlight(timeline.requests, metric);
+        return (
+          <div>
+            {metric === 'osl' && (
+              <p className="mb-2 text-xs text-muted-foreground">
+                Retrospective: final observed OSL is assigned across each request&apos;s lifetime.
+              </p>
+            )}
+            <TimeSeriesChart
+              series={[
+                {
+                  name: `Average ${acronym} in flight (30s avg)`,
+                  data: timeRollingAverage(raw, 30),
+                  rawData: raw,
+                  color: metric === 'isl' ? '#3b82f6' : '#a855f7',
+                  strokeWidth: 2.5,
+                },
+              ]}
+              durationS={timeline.durationS}
+              yAxisLabel="Tokens / request"
+              {...size}
+            />
+          </div>
+        );
+      }}
+    />
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
new file mode 100644
index 00000000..cf43f5ae
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -0,0 +1,378 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import {
+  buildRequestTimelineRows,
+  computeStableRowIndex,
+  conversationHref,
+  parseTimelineViewSnapshot,
+  requestIdleStats,
+  splitTimelineCid,
+  type TimelineViewSnapshot,
+} from './request-timeline';
+
+const request = (start: number, end: number): RequestRecord => ({
+  cid: 'conversation',
+  ti: start,
+  wid: 'worker',
+  ad: 0,
+  phase: 'profiling',
+  credit: start,
+  start,
+  ack: null,
+  end,
+  ttftMs: null,
+  tpotMs: null,
+  isl: null,
+  osl: null,
+  cancelled: false,
+});
+
+describe('requestIdleStats', () => {
+  it('sums only gaps where no requests overlap', () => {
+    expect(
+      requestIdleStats([
+        request(0, 10),
+        request(5, 20),
+        request(30, 40),
+        request(35, 50),
+        request(70, 80),
+      ]),
+    ).toEqual({ idleNs: 30, spanNs: 80 });
+  });
+
+  it('handles unsorted and nested requests without double-counting busy time', () => {
+    expect(requestIdleStats([request(20, 30), request(0, 100), request(10, 40)])).toEqual({
+      idleNs: 0,
+      spanNs: 100,
+    });
+  });
+
+  it('does not count time before the first start or after the final end', () => {
+    expect(requestIdleStats([request(100, 200), request(300, 400)])).toEqual({
+      idleNs: 100,
+      spanNs: 300,
+    });
+  });
+
+  it('returns zeroes for an empty timeline', () => {
+    expect(requestIdleStats([])).toEqual({ idleNs: 0, spanNs: 0 });
+  });
+});
+
+describe('subagent timeline hierarchy', () => {
+  it('parses aux lanes separately from their parent subagent id', () => {
+    expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:011')).toEqual({
+      parent: 'conv',
+      subagentBase: 'subagent_001_abcd',
+      stream: null,
+      aux: '011',
+    });
+  });
+
+  it('renders aux requests as always-visible children of their subagent', () => {
+    const records = [
+      { ...request(0, 10), cid: 'conv' },
+      { ...request(10, 30), cid: 'conv::sa:subagent_001_abcd' },
+      { ...request(12, 20), cid: 'conv::sa:subagent_001_abcd:aux:011' },
+      { ...request(14, 24), cid: 'conv::sa:subagent_001_abcd:aux:012' },
+      { ...request(40, 50), cid: 'conv::sa:subagent_002_ef01' },
+    ];
+
+    const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+    expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([
+      { kind: 'parent', depth: 0 },
+      { kind: 'subagent', depth: 1 },
+      { kind: 'aux', depth: 2 },
+      { kind: 'aux', depth: 2 },
+      { kind: 'subagent', depth: 1 },
+    ]);
+    expect(rows[1]!.requests.map((record) => record.cid)).toEqual(['conv::sa:subagent_001_abcd']);
+    expect(rows[1]!.auxCount).toBe(2);
+    expect(rows[2]!.label).toBe('aux 011 · parallel');
+    expect(rows[3]!.label).toBe('aux 012 · parallel');
+  });
+
+  it('keeps aux lanes visible while primary streams remain collapsed', () => {
+    const records = [
+      { ...request(10, 20), cid: 'conv::sa:subagent_001_abcd:s0' },
+      { ...request(12, 22), cid: 'conv::sa:subagent_001_abcd:s1' },
+      { ...request(14, 18), cid: 'conv::sa:subagent_001_abcd:aux:001' },
+    ];
+
+    const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+    expect(rows.map((row) => row.kind)).toEqual(['parent', 'subagent', 'aux']);
+    expect(rows[1]!.requests).toHaveLength(2);
+    expect(rows[2]!.requests).toHaveLength(1);
+  });
+
+  it('parses aux lanes hanging directly off the main conversation', () => {
+    expect(splitTimelineCid('conv::aux:000')).toEqual({
+      parent: 'conv',
+      subagentBase: null,
+      stream: null,
+      aux: '000',
+    });
+    expect(splitTimelineCid('conv::aux:red:002')).toEqual({
+      parent: 'conv',
+      subagentBase: null,
+      stream: null,
+      aux: 'red:002',
+    });
+    expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:red:002')).toEqual({
+      parent: 'conv',
+      subagentBase: 'subagent_001_abcd',
+      stream: null,
+      aux: 'red:002',
+    });
+  });
+
+  it('nests main-agent aux lanes under the parent conversation row', () => {
+    const records = [
+      { ...request(0, 10), cid: 'conv' },
+      { ...request(2, 8), cid: 'conv::aux:001' },
+      { ...request(4, 12), cid: 'conv::aux:red:002' },
+      { ...request(20, 30), cid: 'conv::sa:subagent_001_abcd' },
+    ];
+
+    const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+    expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([
+      { kind: 'parent', depth: 0 },
+      { kind: 'aux', depth: 1 },
+      { kind: 'aux', depth: 1 },
+      { kind: 'subagent', depth: 1 },
+    ]);
+    expect(rows[0]!.requests.map((record) => record.cid)).toEqual(['conv']);
+    expect(rows[1]!.label).toBe('aux 001 · parallel');
+    expect(rows[1]!.parentRowKey).toBe('conv');
+    expect(rows[2]!.label).toBe('aux red:002 · parallel');
+    // Aux lanes inherit the parent conversation's color.
+    expect(rows[1]!.color).toBe(rows[0]!.color);
+    expect(rows[2]!.color).toBe(rows[0]!.color);
+  });
+
+  it('groups main-agent aux requests with their parent for stable order/color', () => {
+    const records = [
+      { ...request(50, 60), cid: 'other' },
+      { ...request(0, 10), cid: 'conv::aux:000' },
+      { ...request(5, 15), cid: 'conv' },
+    ];
+    const index = computeStableRowIndex(records, 'conversation');
+    // 'conv' groups with its aux lane (earliest start 0) and sorts before 'other'.
+    expect([...index.keys()].toSorted()).toEqual(['conv', 'other']);
+    expect(index.get('conv')).toBe(0);
+    expect(index.get('other')).toBe(1);
+  });
+
+  it('deep-links a main-agent aux request to the parent conversation without sa', () => {
+    expect(conversationHref('slug', { ...request(0, 10), cid: 'abc123::aux:red:002', ti: 3 })).toBe(
+      '/datasets/slug/conversations/abc123?turn=3',
+    );
+  });
+});
+
+describe('conversationHref', () => {
+  it('builds a turn-carrying dataset link for a main-conversation request', () => {
+    expect(
+      conversationHref('cc-traces-weka-062126', { ...request(0, 10), cid: 'abc123', ti: 4 }),
+    ).toBe('/datasets/cc-traces-weka-062126/conversations/abc123?turn=4');
+  });
+
+  it('carries the subagent id and strips the ::sa suffix from the conv id', () => {
+    expect(
+      conversationHref('slug', {
+        ...request(0, 10),
+        cid: 'abc123::sa:subagent_001_bf1c5c16:s2',
+        ti: 7,
+      }),
+    ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16');
+  });
+
+  it('uses raw source provenance for flattened-agent dataset links', () => {
+    expect(
+      conversationHref('slug', {
+        ...request(0, 10),
+        cid: '02bc0afb13f7a2d9efa86c28511261d85c0e::fa:003',
+        ti: 3,
+        srcTrace: '02bc0afb13f7a2d9efa86c28511261d85c0e',
+        srcOuter: 204,
+        srcKind: 'weka_flat',
+      }),
+    ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204');
+  });
+
+  it('uses raw nested source provenance for subagent child links', () => {
+    expect(
+      conversationHref('slug', {
+        ...request(0, 10),
+        cid: '117ebe75819d050f308a0a81647893abd02d::sa:subagent_010_32ee2daa',
+        ti: 16,
+        srcTrace: '117ebe75819d050f308a0a81647893abd02d',
+        srcOuter: 39,
+        srcInner: 16,
+        srcKind: 'weka_subagent',
+      }),
+    ).toBe(
+      '/datasets/slug/conversations/117ebe75819d050f308a0a81647893abd02d?turn=16&raw=39&inner=16',
+    );
+  });
+});
+
+describe('stable row order + color across phase filters', () => {
+  // Same conversations appear in both warmup and profiling. Their global
+  // first-start order is A (0) < B (10) < C (only profiling, 50). The bug:
+  // filtering to a phase re-sorted + re-colored by the visible subset, so a
+  // conversation jumped rows and swapped color when toggling phases.
+  const rec = (
+    cid: string,
+    phase: RequestRecord['phase'],
+    start: number,
+    end: number,
+  ): RequestRecord => ({ ...request(start, end), cid, phase });
+  const full: RequestRecord[] = [
+    rec('A', 'warmup', 0, 5),
+    rec('A', 'profiling', 100, 110),
+    rec('B', 'warmup', 10, 15),
+    rec('B', 'profiling', 120, 130),
+    rec('C', 'profiling', 50, 60), // profiling-only; earliest profiling start
+  ];
+
+  it('keeps each conversation in the same position and color when the phase changes', () => {
+    const index = computeStableRowIndex(full, 'conversation');
+    const warmupRows = buildRequestTimelineRows(
+      full.filter((r) => r.phase === 'warmup'),
+      'conversation',
+      new Set(),
+      index,
+    ).filter((r) => r.kind === 'parent');
+    const profilingRows = buildRequestTimelineRows(
+      full.filter((r) => r.phase === 'profiling'),
+      'conversation',
+      new Set(),
+      index,
+    ).filter((r) => r.kind === 'parent');
+
+    // Position: A before B in both phases (C only shows in profiling, and sorts
+    // after A/B by its global index — NOT first by its earlier profiling start).
+    expect(warmupRows.map((r) => r.label)).toEqual(['A', 'B']);
+    expect(profilingRows.map((r) => r.label)).toEqual(['A', 'B', 'C']);
+
+    // Color: identical per conversation across phases, distinct between them.
+    const warmupColors = Object.fromEntries(warmupRows.map((r) => [r.label, r.color]));
+    const profilingColors = Object.fromEntries(profilingRows.map((r) => [r.label, r.color]));
+    expect(warmupColors.A).toBe(profilingColors.A);
+    expect(warmupColors.B).toBe(profilingColors.B);
+    expect(warmupColors.A).not.toBe(warmupColors.B);
+  });
+
+  it('phase-spanning conversations occupy the same ABSOLUTE row in both phase views', () => {
+    // Warmup-only conversations start earliest — under a plain global-start
+    // ordering they'd sit above the shared ones in the warmup view but be
+    // absent from the profiling view, sliding every shared row up when the
+    // toggle flips. Spanning conversations must sort first so the leading block
+    // is identical in both views and a carried-over conversation never moves.
+    const data: RequestRecord[] = [
+      rec('W1', 'warmup', 0, 2),
+      rec('W2', 'warmup', 3, 4),
+      rec('A', 'warmup', 5, 8),
+      rec('A', 'profiling', 100, 110),
+      rec('B', 'warmup', 10, 15),
+      rec('B', 'profiling', 120, 130),
+      rec('P', 'profiling', 50, 60),
+    ];
+    const index = computeStableRowIndex(data, 'conversation');
+    const parentLabels = (phase: RequestRecord['phase']) =>
+      buildRequestTimelineRows(
+        data.filter((r) => r.phase === phase),
+        'conversation',
+        new Set(),
+        index,
+      )
+        .filter((r) => r.kind === 'parent')
+        .map((r) => r.label);
+    // Shared block [A, B] leads both views at rows 0 and 1; phase-unique
+    // conversations fill in below.
+    expect(parentLabels('warmup')).toEqual(['A', 'B', 'W1', 'W2']);
+    expect(parentLabels('profiling')).toEqual(['A', 'B', 'P']);
+  });
+
+  it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => {
+    // Sanity: the legacy self-contained path (no index arg) orders by the
+    // subset's own first-start, which is exactly why the shared index is needed.
+    const profilingOnly = buildRequestTimelineRows(
+      full.filter((r) => r.phase === 'profiling'),
+      'conversation',
+      new Set(),
+    ).filter((r) => r.kind === 'parent');
+    // C (start 50) sorts first here, ahead of A (100) and B (120).
+    expect(profilingOnly.map((r) => r.label)).toEqual(['C', 'A', 'B']);
+  });
+});
+
+describe('parseTimelineViewSnapshot', () => {
+  const full: TimelineViewSnapshot = {
+    viewStart: 1_000,
+    viewEnd: 5_000,
+    rowMode: 'worker',
+    phaseFilter: 'warmup',
+    expanded: ['conv::sa:subagent_001_abcd'],
+    scrollTop: 240,
+    scrollLeft: 80,
+  };
+
+  it('round-trips a full snapshot', () => {
+    expect(parseTimelineViewSnapshot(JSON.stringify(full))).toEqual(full);
+  });
+
+  it('round-trips the profiling phase and rejects the removed "all" value', () => {
+    expect(
+      parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'profiling' }))?.phaseFilter,
+    ).toBe('profiling');
+    // 'all' is no longer a valid phase — coerces back to the profiling default.
+    expect(
+      parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'all' }))?.phaseFilter,
+    ).toBe('profiling');
+  });
+
+  it('returns null for absent or unparseable input', () => {
+    expect(parseTimelineViewSnapshot(null)).toBeNull();
+    expect(parseTimelineViewSnapshot('')).toBeNull();
+    expect(parseTimelineViewSnapshot('{not json')).toBeNull();
+    expect(parseTimelineViewSnapshot('42')).toBeNull();
+  });
+
+  it('preserves a null viewEnd (not zoomed) and rejects non-finite viewEnd', () => {
+    const restored = parseTimelineViewSnapshot(JSON.stringify({ ...full, viewEnd: null }));
+    expect(restored?.viewEnd).toBeNull();
+    // NaN / Infinity don't survive JSON, but a malformed string value must coerce to null.
+    expect(parseTimelineViewSnapshot('{"viewEnd":"oops"}')?.viewEnd).toBeNull();
+  });
+
+  it('falls back to defaults for invalid enums and missing numbers', () => {
+    expect(parseTimelineViewSnapshot('{}')).toEqual({
+      viewStart: 0,
+      viewEnd: null,
+      rowMode: 'conversation',
+      phaseFilter: 'profiling',
+      expanded: [],
+      scrollTop: 0,
+      scrollLeft: 0,
+    });
+    const bogus = parseTimelineViewSnapshot(
+      JSON.stringify({ rowMode: 'nope', phaseFilter: 'nope', viewStart: 'x', scrollTop: null }),
+    )!;
+    expect(bogus.rowMode).toBe('conversation');
+    expect(bogus.phaseFilter).toBe('profiling');
+    expect(bogus.viewStart).toBe(0);
+    expect(bogus.scrollTop).toBe(0);
+  });
+
+  it('drops non-string entries from the expanded list', () => {
+    expect(parseTimelineViewSnapshot('{"expanded":["a",1,null,"b"]}')!.expanded).toEqual([
+      'a',
+      'b',
+    ]);
+    expect(parseTimelineViewSnapshot('{"expanded":"nope"}')!.expanded).toEqual([]);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
new file mode 100644
index 00000000..1786c74d
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -0,0 +1,581 @@
+'use client';
+
+import { useCallback, useLayoutEffect, useMemo, useRef, useState } from 'react';
+import { useRouter } from 'next/navigation';
+
+import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { requestsForPhase } from './phase-slice';
+import { TimelineBars } from './timeline-bars';
+import { formatDuration } from './timeline-format';
+import {
+  CHART_WIDTH,
+  HEADER_HEIGHT,
+  LABEL_WIDTH,
+  PADDING_RIGHT,
+  ROW_GAP,
+  ROW_HEIGHT,
+  TIMELINE_BODY_MAX_HEIGHT,
+  timelineSvgHeight,
+} from './timeline-layout';
+import {
+  buildRequestTimelineRows,
+  computeStableRowIndex,
+  conversationHref,
+  requestIdleStats,
+  type RequestTimelineRow,
+  type RowMode,
+} from './timeline-rows';
+import type { SortedRequestTimes } from './timeline-cursor-stats';
+import {
+  consumeTimelineViewSnapshot,
+  saveTimelineViewSnapshot,
+  type PhaseFilter,
+} from './timeline-view-snapshot';
+import {
+  CursorPopover,
+  TimelineTooltip,
+  type CursorState,
+  type TooltipData,
+} from './timeline-tooltips';
+
+// Stable public API: pure helpers and types live in focused modules, but
+// external consumers (detail page, tests) import them from here.
+export {
+  buildRequestTimelineRows,
+  computeStableRowIndex,
+  conversationHref,
+  datasetConvId,
+  requestIdleStats,
+  splitTimelineCid,
+  subagentIdOf,
+} from './timeline-rows';
+export type { RequestIdleStats, RequestTimelineRow } from './timeline-rows';
+export { parseTimelineViewSnapshot } from './timeline-view-snapshot';
+export type { TimelineViewSnapshot } from './timeline-view-snapshot';
+
+/**
+ * Gantt-style request timeline for one agentic benchmark point.
+ *
+ * Rows are conversations (or workers — toggle in the header). Bars are
+ * individual HTTP requests, drawn from request_start to request_end with a
+ * thin lead-in segment from credit_issued (load gen queue). Shift+scroll
+ * zooms, drag pans, hover shows per-request stats.
+ *
+ * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy.
+ */
+
+const ROW_MODE_OPTIONS: SegmentedToggleOption<RowMode>[] = [
+  { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' },
+  { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' },
+];
+
+const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' },
+  { value: 'warmup', label: 'Warmup', testId: 'timeline-phase-warmup' },
+];
+
+const PLOT_WIDTH = CHART_WIDTH - PADDING_RIGHT;
+
+export function RequestTimelineView({
+  data,
+  datasetSlug,
+  pointId,
+}: {
+  data: RequestTimeline;
+  /** Source dataset slug for this run; enables click-to-conversation deep links. */
+  datasetSlug?: string | null;
+  /** benchmark_results.id — keys the per-point view-state snapshot for restore. */
+  pointId: number;
+}) {
+  const router = useRouter();
+  const [rowMode, setRowMode] = useState<RowMode>('conversation');
+  const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
+  const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+
+  // The scroll container (vertical row scroll + horizontal chart scroll) and a
+  // ref mirror of the live view state, so click-through can snapshot the exact
+  // position without rebuilding openConversation on every zoom/pan tick.
+  const scrollRef = useRef<HTMLDivElement>(null);
+  const liveStateRef = useRef<{
+    viewStart: number;
+    viewEnd: number | null;
+    rowMode: RowMode;
+    phaseFilter: PhaseFilter;
+    expandedSubagents: ReadonlySet<string>;
+  }>({
+    viewStart: 0,
+    viewEnd: null,
+    rowMode: 'conversation',
+    phaseFilter: 'profiling',
+    expandedSubagents: new Set(),
+  });
+
+  const openConversation = useCallback(
+    (req: RequestRecord) => {
+      if (!datasetSlug) return;
+      // Snapshot the current zoom/scroll/filter position so the browser back
+      // button restores it (see the restore effect below).
+      if (scrollRef.current) {
+        const live = liveStateRef.current;
+        saveTimelineViewSnapshot(pointId, {
+          viewStart: live.viewStart,
+          viewEnd: live.viewEnd,
+          rowMode: live.rowMode,
+          phaseFilter: live.phaseFilter,
+          expanded: [...live.expandedSubagents],
+          scrollTop: scrollRef.current.scrollTop,
+          scrollLeft: scrollRef.current.scrollLeft,
+        });
+      }
+      track('agentic_timeline_to_dataset', { slug: datasetSlug });
+      router.push(conversationHref(datasetSlug, req));
+    },
+    [datasetSlug, router, pointId],
+  );
+  // Which multi-stream subagents currently have their per-stream rows
+  // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
+  const [expandedSubagents, setExpandedSubagents] = useState<ReadonlySet<string>>(() => new Set());
+  const toggleSubagent = useCallback((key: string) => {
+    setExpandedSubagents((prev) => {
+      const next = new Set(prev);
+      if (next.has(key)) next.delete(key);
+      else next.add(key);
+      return next;
+    });
+  }, []);
+  const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
+
+  // The phase toggle only means something when warmup requests are actually
+  // present. aiperf's profile_export only contains profiling-phase requests, so
+  // in practice every record is `profiling` and the toggle is a no-op — hide it
+  // unless a non-profiling request exists (keeps it working if warmup is ever
+  // exported).
+  const hasWarmup = useMemo(
+    () => data.requests.some((r) => r.phase !== 'profiling'),
+    [data.requests],
+  );
+
+  // Apply phase filter, then group into rows. With no warmup data the filter
+  // collapses to "profiling" regardless of the (hidden) toggle state.
+  const filtered = useMemo(
+    () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'),
+    [data.requests, phaseFilter, hasWarmup],
+  );
+  // Stable order/color per conversation (or worker), computed over the FULL
+  // request set — NOT the phase-filtered subset — so a row keeps its position
+  // and color when the user toggles between warmup and profiling.
+  const stableRowIndex = useMemo(
+    () => computeStableRowIndex(data.requests, rowMode),
+    [data.requests, rowMode],
+  );
+  const rows = useMemo(
+    () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents, stableRowIndex),
+    [filtered, rowMode, expandedSubagents, stableRowIndex],
+  );
+  const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]);
+
+  // Pre-sort the timestamp columns so the cursor-time stats popover can
+  // count "running / waiting at time t" in O(log n). With a few hundred
+  // requests this is overkill — but it stays smooth on huge runs too.
+  const sortedTimes = useMemo<SortedRequestTimes>(() => {
+    const credits = filtered.map((r) => r.credit).toSorted((a, b) => a - b);
+    const starts = filtered.map((r) => r.start).toSorted((a, b) => a - b);
+    const ends = filtered.map((r) => r.end).toSorted((a, b) => a - b);
+    return { credits, starts, ends };
+  }, [filtered]);
+
+  // Cursor state (vertical line + stats popover). null when the mouse
+  // isn't over the chart. xPx is svg-local; tNs is the ns offset from
+  // dataStart that the cursor is pointing at.
+  const [cursor, setCursor] = useState<CursorState | null>(null);
+
+  // Timeline extent (clamped to actual data — if we filtered out warmup
+  // the visible window should shrink to just the profiling phase).
+  const { dataStart, dataEnd } = useMemo(() => {
+    if (filtered.length === 0) return { dataStart: 0, dataEnd: 1 };
+    let min = Number.POSITIVE_INFINITY;
+    let max = Number.NEGATIVE_INFINITY;
+    for (const r of filtered) {
+      if (r.credit < min) min = r.credit;
+      if (r.end > max) max = r.end;
+    }
+    return { dataStart: min, dataEnd: max };
+  }, [filtered]);
+  const totalNs = Math.max(dataEnd - dataStart, 1);
+
+  // Visible window state (ns offsets, relative to dataStart).
+  const [viewStart, setViewStart] = useState(0);
+  const [viewEnd, setViewEnd] = useState<number | null>(null);
+  const vStart = viewStart;
+  const vEnd = viewEnd ?? totalNs;
+  const visibleDur = Math.max(vEnd - vStart, 1);
+  const isZoomed = viewEnd !== null;
+
+  // Mirror the live view state into a ref so the click-through snapshot reads
+  // the latest values without rebuilding openConversation on every zoom tick.
+  liveStateRef.current = { viewStart, viewEnd, rowMode, phaseFilter, expandedSubagents };
+
+  // Restore the snapshot written on click-through (e.g. open a request in the
+  // dataset flamegraph, then hit the browser back button). Runs once per mount,
+  // keyed by point id; the snapshot is consumed so a later reload starts fresh.
+  // Scroll is applied after the restored filters/expansions re-render the rows
+  // (rAF fires after that synchronous commit, before paint — no visible jump).
+  useLayoutEffect(() => {
+    const snapshot = consumeTimelineViewSnapshot(pointId);
+    if (!snapshot) return;
+    setRowMode(snapshot.rowMode);
+    setPhaseFilter(snapshot.phaseFilter);
+    setExpandedSubagents(new Set(snapshot.expanded));
+    setViewStart(snapshot.viewStart);
+    setViewEnd(snapshot.viewEnd);
+    const target = { top: snapshot.scrollTop, left: snapshot.scrollLeft };
+    requestAnimationFrame(() => {
+      const el = scrollRef.current;
+      if (!el) return;
+      el.scrollTop = target.top;
+      el.scrollLeft = target.left;
+    });
+    // setState setters are stable; only re-run if the point itself changes.
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [pointId]);
+
+  const svgHeight = timelineSvgHeight(rows.length);
+
+  // Native (non-passive) wheel handler: React's synthetic onWheel is attached
+  // passively, so preventDefault there is silently ignored and shift+scroll
+  // would zoom AND horizontally pan the scroll container.
+  const zoomSvgRef = useRef<SVGSVGElement | null>(null);
+  const handleWheel = useCallback(
+    (e: WheelEvent) => {
+      // Zoom only on shift+scroll so plain scrolling keeps its native meaning
+      // (page / row-container scroll) instead of being hijacked by the chart.
+      if (!e.shiftKey) return;
+      e.preventDefault();
+      const rect = (e.currentTarget as SVGSVGElement).getBoundingClientRect();
+      const mouseX = e.clientX - rect.left;
+      const mouseRatio = Math.max(0, Math.min(1, mouseX / PLOT_WIDTH));
+      const curStart = vStart;
+      const curEnd = vEnd;
+      const curDur = curEnd - curStart;
+      // With shift held, most browsers report the wheel delta on deltaX.
+      const delta = e.deltaY || e.deltaX;
+      const factor = delta > 0 ? 1.2 : 1 / 1.2;
+      const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs);
+      const pivot = curStart + mouseRatio * curDur;
+      let newStart = pivot - mouseRatio * newDur;
+      let newEnd = pivot + (1 - mouseRatio) * newDur;
+      if (newStart < 0) {
+        newEnd -= newStart;
+        newStart = 0;
+      }
+      if (newEnd > totalNs) {
+        newStart -= newEnd - totalNs;
+        newEnd = totalNs;
+        if (newStart < 0) newStart = 0;
+      }
+      if (newEnd - newStart >= totalNs * 0.99) {
+        setViewStart(0);
+        setViewEnd(null);
+      } else {
+        setViewStart(newStart);
+        setViewEnd(newEnd);
+      }
+    },
+    [vStart, vEnd, totalNs],
+  );
+
+  useLayoutEffect(() => {
+    const svg = zoomSvgRef.current;
+    if (!svg) return;
+    svg.addEventListener('wheel', handleWheel, { passive: false });
+    return () => svg.removeEventListener('wheel', handleWheel);
+  }, [handleWheel]);
+
+  const handleMouseDown = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      if (e.button !== 0) return;
+      dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd };
+    },
+    [vStart, vEnd],
+  );
+
+  const handleMouseMove = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      // Dragging takes precedence over cursor tracking — panning the view.
+      if (dragRef.current) {
+        const dx = e.clientX - dragRef.current.startX;
+        const nsPerPx = visibleDur / PLOT_WIDTH;
+        const delta = -dx * nsPerPx;
+        let ns = dragRef.current.vs + delta;
+        let ne = dragRef.current.ve + delta;
+        const dur = ne - ns;
+        if (ns < 0) {
+          ns = 0;
+          ne = dur;
+        }
+        if (ne > totalNs) {
+          ne = totalNs;
+          ns = totalNs - dur;
+          if (ns < 0) ns = 0;
+        }
+        setViewStart(ns);
+        setViewEnd(ne);
+        setTooltip(null);
+        setCursor(null);
+        return;
+      }
+      // Track the cursor position in svg-local px and the matching ns offset
+      // so the crosshair + stats popover can render. Clamped to the chart
+      // plot area (don't show a cursor on the axis labels gutter).
+      const rect = e.currentTarget.getBoundingClientRect();
+      const xPx = Math.max(0, Math.min(PLOT_WIDTH, e.clientX - rect.left));
+      const nsPerPx = visibleDur / PLOT_WIDTH;
+      const tNs = vStart + xPx * nsPerPx;
+      setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY });
+    },
+    [visibleDur, totalNs, vStart],
+  );
+
+  const handleMouseUp = useCallback(() => {
+    dragRef.current = null;
+  }, []);
+
+  const handleMouseLeave = useCallback(() => {
+    dragRef.current = null;
+    setCursor(null);
+  }, []);
+
+  const resetZoom = useCallback(() => {
+    setViewStart(0);
+    setViewEnd(null);
+  }, []);
+
+  // Stable bar callbacks so TimelineBars' memo isn't defeated by fresh
+  // closures on every tooltip/cursor state change.
+  const handleBarHover = useCallback(
+    (e: React.MouseEvent, row: RequestTimelineRow, req: RequestRecord) => {
+      setTooltip({ x: e.clientX, y: e.clientY, row, req });
+    },
+    [],
+  );
+  const handleBarLeave = useCallback(() => setTooltip(null), []);
+  const handleBarClick = useCallback(
+    (e: React.MouseEvent, req: RequestRecord) => {
+      if (e.metaKey || e.ctrlKey || e.shiftKey || e.altKey || e.button !== 0) return;
+      e.preventDefault();
+      openConversation(req);
+    },
+    [openConversation],
+  );
+
+  if (rows.length === 0) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        No requests in the current filter.
+      </div>
+    );
+  }
+
+  const totalRequests = filtered.length;
+
+  return (
+    <div className="space-y-3">
+      {/* Controls */}
+      <div className="flex flex-wrap items-center gap-2">
+        <SegmentedToggle
+          value={rowMode}
+          options={ROW_MODE_OPTIONS}
+          onValueChange={setRowMode}
+          ariaLabel="Row mode"
+          testId="timeline-row-mode"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        {hasWarmup && (
+          <SegmentedToggle
+            value={phaseFilter}
+            options={PHASE_OPTIONS}
+            onValueChange={setPhaseFilter}
+            ariaLabel="Phase filter"
+            testId="timeline-phase-filter"
+            buttonClassName="px-2.5 py-1 text-xs"
+          />
+        )}
+        <span className="ml-auto text-xs text-muted-foreground">
+          {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
+          {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
+          {formatDuration((dataEnd - dataStart) / 1e6)} ·{' '}
+          <span
+            data-testid="timeline-total-idle-time"
+            title="Time between the first request start and final request end with no requests in flight"
+          >
+            idle {formatDuration(idleStats.idleNs / 1e6)}
+            {idleStats.spanNs > 0
+              ? ` (${((idleStats.idleNs / idleStats.spanNs) * 100).toFixed(1)}%)`
+              : ''}
+          </span>
+          {isZoomed && (
+            <>
+              {' · '}
+              <button type="button" onClick={resetZoom} className="text-foreground hover:underline">
+                reset zoom
+              </button>
+            </>
+          )}
+        </span>
+      </div>
+
+      {/* Chart container */}
+      <div className="rounded-md border border-border/60 bg-card overflow-hidden">
+        {/* Fixed-height window: rows scroll vertically and the chart scrolls
+            horizontally inside it, so the card doesn't grow to fit every
+            conversation/worker AND the horizontal scrollbar stays pinned to the
+            window's bottom edge (rather than the bottom of the tall content). */}
+        <div
+          ref={scrollRef}
+          className="overflow-auto"
+          style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}
+        >
+          <div className="flex w-max">
+            {/* Label column — pinned left (sticky) so it stays put during
+                horizontal scroll, while scrolling vertically with the rows. */}
+            <div
+              className="sticky left-0 z-10 flex-shrink-0 border-r border-border/60 bg-card"
+              style={{ width: LABEL_WIDTH }}
+            >
+              <div
+                className="border-b border-border/60 flex items-end px-2 pb-1"
+                style={{ height: HEADER_HEIGHT }}
+              >
+                <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
+                  {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+                </span>
+              </div>
+              {rows.map((row) => {
+                const isSubagentRow = row.kind === 'subagent';
+                const isChildRow = row.kind === 'stream' || row.kind === 'aux';
+                const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+                const isExpanded = isExpandable && expandedSubagents.has(row.key);
+                return (
+                  <div
+                    key={row.key}
+                    data-timeline-row-kind={row.kind}
+                    className="flex items-center gap-1 overflow-hidden pr-2"
+                    style={{
+                      height: ROW_HEIGHT + ROW_GAP,
+                      paddingLeft: 4 + row.depth * 10,
+                    }}
+                  >
+                    {isExpandable ? (
+                      <button
+                        type="button"
+                        onClick={() => toggleSubagent(row.key)}
+                        className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+                        aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                        title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                      >
+                        <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
+                      </button>
+                    ) : (
+                      <span className="size-3.5 shrink-0" />
+                    )}
+                    <span
+                      className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                      style={{
+                        backgroundColor: row.color,
+                        opacity: isChildRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                      }}
+                    />
+                    <span
+                      className="text-[10px] font-mono truncate"
+                      style={{
+                        color: row.color,
+                        opacity: isChildRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                      }}
+                    >
+                      {row.label}
+                      {isExpandable && (
+                        <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                      )}
+                      {isSubagentRow && (row.auxCount ?? 0) > 0 && (
+                        <span className="text-muted-foreground ml-1">+{row.auxCount} aux</span>
+                      )}
+                    </span>
+                    <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                      {row.requests.length > 0 ? row.requests.length : '—'}
+                    </span>
+                  </div>
+                );
+              })}
+            </div>
+
+            {/* Chart column — horizontal scrolling is handled by the window
+                container above so its scrollbar stays pinned to the window's
+                bottom edge; double-click anywhere resets the zoom. */}
+            <div className="flex-shrink-0">
+              <svg
+                ref={zoomSvgRef}
+                width={CHART_WIDTH}
+                height={svgHeight}
+                className="block"
+                style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
+                onMouseDown={handleMouseDown}
+                onMouseMove={handleMouseMove}
+                onMouseUp={handleMouseUp}
+                onMouseLeave={handleMouseLeave}
+                onDoubleClick={resetZoom}
+              >
+                <TimelineBars
+                  rows={rows}
+                  expandedSubagents={expandedSubagents}
+                  dataStart={dataStart}
+                  vStart={vStart}
+                  vEnd={vEnd}
+                  datasetSlug={datasetSlug}
+                  onBarHover={handleBarHover}
+                  onBarLeave={handleBarLeave}
+                  onBarClick={handleBarClick}
+                />
+
+                {/* Cursor crosshair — drawn on top of bars so it stays visible
+                  through dense rows. Stats popover is rendered as fixed
+                  HTML below the SVG block. */}
+                {cursor && (
+                  <line
+                    x1={cursor.xPx}
+                    x2={cursor.xPx}
+                    y1={0}
+                    y2={svgHeight}
+                    stroke="currentColor"
+                    strokeWidth={1}
+                    opacity={0.45}
+                    pointerEvents="none"
+                  />
+                )}
+              </svg>
+            </div>
+          </div>
+        </div>
+      </div>
+
+      {/* Footer — interaction hint only. */}
+      <div className="flex items-center px-1 text-[11px] text-muted-foreground">
+        <span className="ml-auto opacity-70">
+          shift+scroll to zoom · drag to pan · double-click to reset
+        </span>
+      </div>
+
+      {/* Cursor stats popover: count of in-flight / waiting at the cursor's
+          ns offset. Hidden when the user is hovering an individual bar
+          (per-request tooltip wins). */}
+      {cursor && !tooltip && (
+        <CursorPopover cursor={cursor} dataStart={dataStart} times={sortedTimes} />
+      )}
+
+      {/* Tooltip */}
+      {tooltip && <TimelineTooltip data={tooltip} linkable={Boolean(datasetSlug)} />}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx b/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx
new file mode 100644
index 00000000..6eb109b7
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx
@@ -0,0 +1,474 @@
+'use client';
+
+import type { RequestTimeline } from '@/hooks/api/use-request-timeline';
+import type { MetricSourceSeries, QueueDepthPoint } from '@/hooks/api/use-trace-server-metrics';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { CHART_SIZES, ChartEmpty, ChartSkeleton } from './chart-shared';
+import { ExpandableChart } from './expandable-chart';
+import { metricSourceLabel } from './metric-source-toolbar';
+import type { PhaseSlicedSeries, ServerSeriesLike } from './phase-slice';
+import { StackedAreaChart, TimeSeriesChart } from './time-series-chart';
+import {
+  cumulativeCompletedRequests,
+  cumulativeDifferenceMonotonic,
+  cumulativeTimeAverage,
+  cumulativeUniqueInputTokens,
+  buildThroughputChartSeries,
+  inflightUniqueTokens,
+  rollingAverage,
+  timeRollingAverage,
+  toggleThroughputSeries,
+  type ThroughputSeriesKey,
+} from './time-series-math';
+
+/**
+ * Phase-sliced server series (+ matching durationS). Null while the trace
+ * blob is loading or absent — cards render a skeleton until it arrives.
+ */
+type SlicedServerSeries = PhaseSlicedSeries<ServerSeriesLike> | null;
+
+export type RequestActivityView = 'queue' | 'completed';
+
+const REQUEST_ACTIVITY_OPTIONS: SegmentedToggleOption<RequestActivityView>[] = [
+  { value: 'queue', label: 'Queue depth', testId: 'request-activity-queue' },
+  { value: 'completed', label: 'Completed', testId: 'request-activity-completed' },
+];
+
+/** Compact token count for chart labels: 306808 → "307K tok", 3.2e6 → "3.2M tok". */
+const fmtTokensCompact = (n: number): string => {
+  if (n >= 1e6) return `${(n / 1e6).toFixed(1)}M tok`;
+  if (n >= 1e3) return `${Math.round(n / 1e3)}K tok`;
+  return `${Math.round(n)} tok`;
+};
+
+// Per-DP-rank color palette for DEP runs (one distinct color per rank in
+// the KV cache utilization overlay). Mirrors the request-timeline row
+// palette so the same DP index reads as the same color across both views.
+// Wraps mod-N if more than 12 ranks ever land.
+const DP_RANK_PALETTE = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
+export function KvCacheUtilizationCard({ sliced }: { sliced: SlicedServerSeries }) {
+  return (
+    <ExpandableChart
+      title="KV cache utilization over time"
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!sliced) return <ChartSkeleton />;
+        const serverSeries = sliced.series;
+        // For SGLang hicache rows we have both GPU (HBM) util and
+        // host (CPU offload pool) util — overlay them as two lines.
+        const hasHost = serverSeries.hostKvCacheUsage.length > 0;
+        // DEP runs report one series per engine. When there's more
+        // than one, draw one line per rank in distinct colors so
+        // load skew is visible at a glance; cluster-average sits on
+        // top in white so it stands out.
+        const perEngine = serverSeries.kvCacheUsageByEngine ?? [];
+        const hasPerEngine = perEngine.length > 1;
+        // Render order matters: per-engine first → average drawn on top.
+        const series = [
+          ...(hasPerEngine
+            ? perEngine.map((e, i) => ({
+                name: `DP ${e.engineLabel}`,
+                data: rollingAverage(e.points, 50),
+                color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
+                // Thin + translucent so the Avg line on top reads as
+                // the headline number, not just one more series.
+                strokeWidth: 1,
+                strokeOpacity: 0.5,
+              }))
+            : []),
+          {
+            name: hasHost ? 'GPU HBM (avg n=50)' : hasPerEngine ? 'Avg' : 'GPU KV cache (avg n=50)',
+            data: rollingAverage(serverSeries.kvCacheUsage, 50),
+            // Skip raw scatter when per-engine overlay is on — the
+            // DP-rank lines already convey the spread, dots would be noise.
+            rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage,
+            // Bold red Avg sits on top of the translucent per-DP lines.
+            // DP 1 in the palette is #ef4444 (lighter red); the darker
+            // #dc2626 here plus the heavier stroke keeps it distinct.
+            color: hasPerEngine ? '#dc2626' : '#3b82f6',
+            strokeWidth: hasPerEngine ? 3.5 : 2,
+          },
+          ...(hasHost
+            ? [
+                {
+                  name: 'CPU offload pool (avg n=50)',
+                  data: rollingAverage(serverSeries.hostKvCacheUsage, 50),
+                  rawData: serverSeries.hostKvCacheUsage,
+                  color: '#f97316',
+                  strokeWidth: 2,
+                },
+              ]
+            : []),
+        ];
+        return (
+          <TimeSeriesChart
+            series={series}
+            durationS={sliced.durationS}
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            yAxisLabel="KV cache (%)"
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function RequestActivityCard({
+  sliced,
+  phaseTimeline,
+  timelineLoading,
+  view,
+  onViewChange,
+}: {
+  sliced: SlicedServerSeries;
+  phaseTimeline: RequestTimeline | null;
+  timelineLoading: boolean;
+  view: RequestActivityView;
+  onViewChange: (view: RequestActivityView) => void;
+}) {
+  return (
+    <ExpandableChart
+      title={view === 'queue' ? 'Request queue depth' : 'Cumulative completed requests'}
+      testId="request-activity-chart"
+      controls={
+        <SegmentedToggle
+          value={view}
+          options={REQUEST_ACTIVITY_OPTIONS}
+          onValueChange={(value) => {
+            onViewChange(value);
+            track('inference_agentic_request_activity_changed', { view: value });
+          }}
+          ariaLabel="Request activity metric"
+          testId="request-activity-toggle"
+          buttonClassName="px-2 py-1 text-xs"
+        />
+      }
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (view === 'completed') {
+          if (!phaseTimeline) {
+            return timelineLoading ? <ChartSkeleton /> : <ChartEmpty />;
+          }
+          return (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'Completed requests',
+                  data: cumulativeCompletedRequests(phaseTimeline.requests),
+                  color: '#3b82f6',
+                  strokeWidth: 2.5,
+                },
+              ]}
+              durationS={phaseTimeline.durationS}
+              yAxisLabel="Requests"
+              {...size}
+            />
+          );
+        }
+        if (!sliced) return <ChartSkeleton />;
+        const serverSeries = sliced.series;
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: 'Running (avg n=50)',
+                data: rollingAverage(
+                  serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+                    t: p.t,
+                    value: p.running,
+                  })),
+                  50,
+                ),
+                color: '#22c55e',
+                strokeWidth: 2,
+              },
+              {
+                name: 'Waiting (avg n=50)',
+                data: rollingAverage(
+                  serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+                    t: p.t,
+                    value: p.waiting,
+                  })),
+                  50,
+                ),
+                color: '#ef4444',
+                strokeWidth: 2,
+              },
+              {
+                name: 'Total (avg n=50)',
+                data: rollingAverage(
+                  serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+                    t: p.t,
+                    value: p.total,
+                  })),
+                  50,
+                ),
+                color: '#3b82f6',
+                strokeWidth: 2,
+              },
+            ]}
+            durationS={sliced.durationS}
+            yAxisLabel="Requests"
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function PrefixCacheHitRateCard({ sliced }: { sliced: SlicedServerSeries }) {
+  return (
+    <ExpandableChart
+      title="Prefix cache hit rate per interval"
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!sliced) return <ChartSkeleton />;
+        const serverSeries = sliced.series;
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: 'GPU (HBM, avg n=50)',
+                data: rollingAverage(serverSeries.prefixCacheHitRate, 50),
+                rawData: serverSeries.prefixCacheHitRate,
+                color: '#a855f7',
+                strokeWidth: 2,
+              },
+            ]}
+            durationS={sliced.durationS}
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            yAxisLabel="Hit rate (%)"
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function ThroughputCard({
+  sliced,
+  selectedSource,
+  selected,
+  onSelectedChange,
+}: {
+  sliced: SlicedServerSeries;
+  selectedSource: MetricSourceSeries | undefined;
+  selected: ReadonlySet<ThroughputSeriesKey>;
+  onSelectedChange: (next: ReadonlySet<ThroughputSeriesKey>) => void;
+}) {
+  return (
+    <ExpandableChart
+      title={
+        selectedSource
+          ? `Throughput · ${metricSourceLabel(selectedSource.source)}`
+          : 'Throughput (input & decode)'
+      }
+      controls={
+        <div className="flex items-center gap-1" data-testid="throughput-series-toggle">
+          {(
+            [
+              ['input', 'Input'],
+              ['decode', 'Decode'],
+            ] as const
+          ).map(([key, label]) => {
+            const active = selected.has(key);
+            const isOnlyActive = active && selected.size === 1;
+            return (
+              <button
+                key={key}
+                type="button"
+                aria-pressed={active}
+                disabled={isOnlyActive}
+                data-testid={`throughput-series-${key}`}
+                className={`rounded px-2 py-1 text-xs font-medium transition-colors ${
+                  active
+                    ? key === 'input'
+                      ? 'bg-blue-500/20 text-blue-600 dark:text-blue-300'
+                      : 'bg-orange-500/20 text-orange-600 dark:text-orange-300'
+                    : 'bg-muted text-muted-foreground hover:text-foreground'
+                } disabled:cursor-not-allowed disabled:opacity-60`}
+                onClick={() => {
+                  const next = toggleThroughputSeries(selected, key);
+                  if (next === selected) return;
+                  onSelectedChange(next);
+                  track('inference_agentic_throughput_series_toggled', {
+                    series: key,
+                    enabled: next.has(key),
+                  });
+                }}
+              >
+                {label}
+              </button>
+            );
+          })}
+        </div>
+      }
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!sliced) return <ChartSkeleton />;
+        const serverSeries = sliced.series;
+        return (
+          <TimeSeriesChart
+            series={buildThroughputChartSeries(
+              serverSeries.prefillTps,
+              serverSeries.decodeTps,
+              selected,
+            )}
+            durationS={sliced.durationS}
+            yAxisLabel="Tokens / sec"
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function PromptTokenSourceCard({ sliced }: { sliced: SlicedServerSeries }) {
+  return (
+    <ExpandableChart
+      title="Cumulative prompt token source breakdown"
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!sliced) return <ChartSkeleton />;
+        return (
+          <StackedAreaChart
+            sourceSeries={sliced.series.promptTokensBySource}
+            durationS={sliced.durationS}
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function CumulativeUniqueInputTokensCard({ sliced }: { sliced: SlicedServerSeries }) {
+  return (
+    <ExpandableChart
+      title="Total unique input tokens over time"
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!sliced) return <ChartSkeleton />;
+        const serverSeries = sliced.series;
+        // Unique = total prompt tokens received minus tokens served from
+        // any cache tier — i.e. the freshly prefill-computed tokens. Prefer
+        // the promptTokensBySource breakdown (its buckets sum to the real
+        // prompt-token total, so subtracting cache tiers is exact). Fall
+        // back to cumsum(prefillTps - prefixCacheHitsTps) only for older
+        // data without the breakdown: vllm:prefix_cache_hits re-counts
+        // tokens across scheduler passes, so its cumulative can exceed the
+        // prompt tokens received, driving the diff negative and freezing
+        // the monotonic-clamped line after a few seconds.
+        const uniqueFromBreakdown = cumulativeUniqueInputTokens(serverSeries.promptTokensBySource);
+        const uniqueData =
+          uniqueFromBreakdown.length > 0
+            ? uniqueFromBreakdown
+            : cumulativeDifferenceMonotonic(
+                serverSeries.prefillTps,
+                serverSeries.prefixCacheHitsTps,
+              );
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: 'Cumulative unique input tokens',
+                data: uniqueData,
+                color: '#3b82f6',
+                strokeWidth: 2,
+              },
+            ]}
+            durationS={sliced.durationS}
+            yAxisLabel="Tokens"
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
+export function InflightUniqueTokensCard({
+  phaseTimeline,
+  timelineLoading,
+  kvCachePoolTokens,
+}: {
+  phaseTimeline: RequestTimeline | null;
+  timelineLoading: boolean;
+  /** KV-cache pool size in tokens (vLLM only) — drawn as a constant ceiling. */
+  kvCachePoolTokens: number | null;
+}) {
+  return (
+    <ExpandableChart
+      title="Unique input tokens in flight"
+      testId="unique-input-inflight-chart"
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!phaseTimeline) {
+          return timelineLoading ? <ChartSkeleton /> : <ChartEmpty />;
+        }
+        // Step function: at each request start/end, sum the ISLs of
+        // currently-active requests across distinct cids. Within one
+        // cid turns are sequential so each cid contributes at most
+        // one in-flight ISL; across cids we treat content as
+        // independent (cross-conv prefix sharing adds <1pp in
+        // practice). Smooth with a 30s time-weighted rolling average
+        // so brief turn-handoff dips don't dominate the chart.
+        const raw = inflightUniqueTokens(phaseTimeline.requests);
+        const smoothed = timeRollingAverage(raw, 30);
+        // KV-cache pool size (vLLM only) drawn as a constant ceiling so
+        // you can see how close the working set gets to eviction
+        // pressure. Phase-independent — it's a static config value.
+        const pool = kvCachePoolTokens;
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: 'In flight (avg 30s)',
+                data: smoothed,
+                rawData: raw,
+                color: '#a855f7',
+                strokeWidth: 2,
+              },
+              {
+                name: 'Cumulative average',
+                data: cumulativeTimeAverage(raw),
+                color: '#ef4444',
+                strokeWidth: 3,
+              },
+            ]}
+            durationS={phaseTimeline.durationS}
+            yAxisLabel="Tokens"
+            refLines={
+              pool && pool > 0
+                ? [{ value: pool, label: `KV cache pool · ${fmtTokensCompact(pool)}` }]
+                : undefined
+            }
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
new file mode 100644
index 00000000..a1a5d1ab
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -0,0 +1,247 @@
+'use client';
+
+import { useMemo, useState } from 'react';
+import { useRouter } from 'next/navigation';
+import { ChevronLeft, ChevronRight } from 'lucide-react';
+
+import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
+
+const HW_LABELS: Record<string, string> = {
+  b200: 'B200',
+  b300: 'B300',
+  gb200: 'GB200',
+  gb300: 'GB300',
+  h100: 'H100',
+  h200: 'H200',
+  mi300x: 'MI300X',
+  mi325x: 'MI325X',
+  mi355x: 'MI355X',
+};
+
+const MODEL_LABELS: Record<string, string> = {
+  dsr1: 'DeepSeek R1',
+  dsv4: 'DeepSeek V4 Pro',
+  glm5: 'GLM-5',
+  'glm5.1': 'GLM-5.1',
+  gptoss120b: 'gpt-oss 120B',
+  kimik2: 'Kimi K2',
+  'kimik2.5': 'Kimi K2.5',
+  'kimik2.6': 'Kimi K2.6',
+  llama70b: 'Llama 3.3 70B',
+  'minimaxm2.5': 'MiniMax M2.5',
+  'minimaxm2.7': 'MiniMax M2.7',
+  'qwen3.5': 'Qwen 3.5',
+};
+
+function hwLabel(hw: string) {
+  return HW_LABELS[hw] ?? hw.toUpperCase();
+}
+function modelLabel(m: string) {
+  return MODEL_LABELS[m] ?? m;
+}
+function frameworkLabel(fw: string) {
+  if (fw === 'vllm') return 'vLLM';
+  if (fw === 'sglang') return 'SGLang';
+  if (fw === 'trt') return 'TRT';
+  if (fw === 'mori-sglang') return 'Mori-SGLang';
+  if (fw.startsWith('dynamo-')) return `Dynamo ${fw.slice('dynamo-'.length).toUpperCase()}`;
+  return fw;
+}
+
+/** Short label for a sibling chip: parallelism + concurrency. */
+export function chipLabel(s: BenchmarkSibling): string {
+  // Same parallelism labeler the chart points use (TP/EP/TEP/DEP/DPA…).
+  const parallel = parallelismLabel({
+    tp: s.decode_tp,
+    ep: s.decode_ep,
+    dpAttention: s.decode_dp_attention,
+    disagg: s.disagg,
+    isMultinode: s.is_multinode,
+    prefillTp: s.prefill_tp,
+    prefillEp: s.prefill_ep,
+    prefillDpAttention: s.prefill_dp_attention,
+    prefillNumWorkers: s.prefill_num_workers,
+    decodeTp: s.decode_tp,
+    decodeEp: s.decode_ep,
+    decodeDpAttention: s.decode_dp_attention,
+    decodeNumWorkers: s.decode_num_workers,
+  });
+  const offload = s.offload_mode === 'on' ? ' • off=ON' : '';
+  return `${parallel} • c=${s.conc}${offload}`;
+}
+
+type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests';
+
+const SORT_OPTIONS: { value: SortMode; label: string }[] = [
+  { value: 'default', label: 'Default' },
+  { value: 'conc', label: 'Concurrency ↑' },
+  { value: 'parallelism', label: 'Parallelism' },
+  { value: 'tput', label: 'Throughput/GPU ↓' },
+  { value: 'requests', label: 'Total requests ↓' },
+];
+
+// Group key for the "parallelism" sort: ep first (so TP/EP1 sorts ahead of
+// EP/TEP/DEP groups), then tp, then dp-attention, then disagg — every config
+// of one parallelism lands together, ordered by concurrency within.
+const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => [
+  s.decode_ep ?? 0,
+  s.decode_tp ?? 0,
+  s.decode_dp_attention ? 1 : 0,
+  s.disagg ? 1 : 0,
+];
+
+function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] {
+  if (mode === 'default') return siblings;
+  const out = [...siblings];
+  if (mode === 'conc') {
+    out.sort((a, b) => a.conc - b.conc);
+  } else if (mode === 'tput') {
+    // Highest throughput/GPU first; rows missing the metric sink to the end.
+    out.sort((a, b) => (b.tput_per_gpu ?? -Infinity) - (a.tput_per_gpu ?? -Infinity));
+  } else if (mode === 'requests') {
+    // Most total requests first; rows missing the metric sink to the end.
+    out.sort((a, b) => (b.total_requests ?? -Infinity) - (a.total_requests ?? -Infinity));
+  } else {
+    out.sort((a, b) => {
+      const ra = parallelRank(a);
+      const rb = parallelRank(b);
+      for (let i = 0; i < ra.length; i++) {
+        if (ra[i] !== rb[i]) return ra[i] - rb[i];
+      }
+      // Within a parallelism group: offload off before on, then concurrency.
+      const oa = a.offload_mode === 'on' ? 1 : 0;
+      const ob = b.offload_mode === 'on' ? 1 : 0;
+      return oa - ob || a.conc - b.conc;
+    });
+  }
+  return out;
+}
+
+const isSortMode = (v: string | null): v is SortMode =>
+  v !== null && SORT_OPTIONS.some((o) => o.value === v);
+
+export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) {
+  const router = useRouter();
+  // Persist the sort in the URL so clicking a point (which remounts this
+  // component on the new route) keeps the chosen order instead of resetting.
+  // Read it once from the URL on mount — this component only renders after the
+  // client-side siblings query resolves, so `window` is always available here
+  // (no SSR/hydration mismatch). Matches the app's window-based url-state read.
+  const [sortMode, setSortMode] = useState<SortMode>(() => {
+    if (typeof window === 'undefined') return 'default';
+    const v = new URLSearchParams(window.location.search).get('sort');
+    return isSortMode(v) ? v : 'default';
+  });
+
+  const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]);
+
+  // prev/next follow the displayed (sorted) order so navigation matches the row.
+  const currentIdx = sorted.findIndex((s) => s.is_current);
+  const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null;
+  const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null;
+
+  // Carry the active sort through every point-to-point link.
+  const hrefFor = (id: number) =>
+    sortMode === 'default'
+      ? `/inference/agentic/${id}`
+      : `/inference/agentic/${id}?sort=${sortMode}`;
+
+  const currentId = siblings.find((s) => s.is_current)?.id;
+
+  const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`;
+
+  return (
+    <div className="border-b border-border/40 pb-4 mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-3">
+        <h1 className="text-2xl font-semibold text-foreground">{skuLabel}</h1>
+        <span className="text-xs text-muted-foreground">
+          {siblings.length} point{siblings.length === 1 ? '' : 's'} in this run · {sku.date}
+        </span>
+      </div>
+      <div className="flex items-center gap-2 flex-wrap">
+        <div className="flex items-center gap-1.5">
+          <span className="text-xs text-muted-foreground">Sort by</span>
+          <Select
+            value={sortMode}
+            onValueChange={(v) => {
+              const mode = v as SortMode;
+              setSortMode(mode);
+              track('agentic_siblings_sorted', { mode });
+              // Mirror into the URL (replace, no history spam) so a refresh —
+              // and the next point's mount — keep the chosen order.
+              if (currentId !== undefined) {
+                const href =
+                  mode === 'default'
+                    ? `/inference/agentic/${currentId}`
+                    : `/inference/agentic/${currentId}?sort=${mode}`;
+                router.replace(href, { scroll: false });
+              }
+            }}
+          >
+            <SelectTrigger
+              className="h-7 w-[10rem] text-xs"
+              aria-label="Sort points"
+              data-testid="sibling-sort-select"
+            >
+              <SelectValue />
+            </SelectTrigger>
+            <SelectContent>
+              {SORT_OPTIONS.map((o) => (
+                <SelectItem key={o.value} value={o.value} className="text-xs">
+                  {o.label}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
+        <button
+          type="button"
+          disabled={!prev}
+          onClick={() => prev && router.push(hrefFor(prev.id))}
+          className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+          aria-label="Previous point"
+        >
+          <ChevronLeft className="size-3.5" /> prev
+        </button>
+        <div className="flex items-center gap-1 flex-wrap">
+          {sorted.map((s) => {
+            const active = s.is_current;
+            return (
+              <button
+                key={s.id}
+                type="button"
+                onClick={() => !active && router.push(hrefFor(s.id))}
+                className={`px-2 py-1 rounded-md text-xs border transition-colors ${
+                  active
+                    ? 'border-primary bg-primary text-primary-foreground font-medium'
+                    : 'border-border/40 text-foreground hover:bg-accent'
+                } ${s.has_trace ? '' : 'opacity-60'}`}
+                title={s.has_trace ? undefined : 'No stored trace data'}
+              >
+                {chipLabel(s)}
+              </button>
+            );
+          })}
+        </div>
+        <button
+          type="button"
+          disabled={!next}
+          onClick={() => next && router.push(hrefFor(next.id))}
+          className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+          aria-label="Next point"
+        >
+          next <ChevronRight className="size-3.5" />
+        </button>
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
new file mode 100644
index 00000000..2131c82e
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -0,0 +1,526 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+import { CHART_PAD, ChartEmpty, fmtCount, fmtSeconds } from './chart-shared';
+import { interpAt, type ChartSeries } from './time-series-math';
+
+// Historical entry point: the pure data-shaping helpers lived in this module
+// before being extracted; re-export them so both import paths stay valid.
+export * from './time-series-math';
+
+/** A constant horizontal reference line (e.g. a capacity ceiling). */
+export interface ReferenceLine {
+  value: number;
+  label: string;
+  /** Line + label color. Defaults to a muted emerald. */
+  color?: string;
+}
+
+interface TimeSeriesChartProps {
+  series: ChartSeries[];
+  durationS: number;
+  yMax?: number;
+  yFmt?: (v: number) => string;
+  yAxisLabel?: string;
+  width?: number;
+  height?: number;
+  /**
+   * Horizontal reference lines drawn across the plot. Their values are folded
+   * into the auto y-max so the line stays on-chart even when it exceeds the
+   * data (e.g. a KV-cache pool ceiling well above the working set).
+   */
+  refLines?: readonly ReferenceLine[];
+}
+
+const NO_REF_LINES: readonly ReferenceLine[] = [];
+
+const PAD = CHART_PAD;
+
+export function TimeSeriesChart({
+  series,
+  durationS,
+  yMax: yMaxOpt,
+  yFmt = fmtCount,
+  yAxisLabel,
+  width = 720,
+  height = 260,
+  refLines = NO_REF_LINES,
+}: TimeSeriesChartProps) {
+  const W = width;
+  const H = height;
+
+  const layout = useMemo(() => {
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    // Fold reference-line values into the auto max so a ceiling above the data
+    // (e.g. KV-cache pool >> working set) still renders inside the plot.
+    const refMax = refLines.length > 0 ? Math.max(...refLines.map((r) => r.value)) : 0;
+    const yMax =
+      yMaxOpt ?? Math.max(1e-9, refMax, ...series.flatMap((s) => s.data.map((d) => d.value)));
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
+    return { innerW, innerH, xMax, yMax, xScale, yScale };
+  }, [series, durationS, yMaxOpt, refLines, W, H]);
+
+  const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
+
+  const subsample = (arr: TimeSeriesPoint[]) => {
+    if (arr.length === 0) return arr;
+    const stride = Math.max(1, Math.floor(arr.length / innerW));
+    return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+  };
+
+  // Pre-format axis ticks.
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    const items: HoverItem[] = [];
+    for (const s of series) {
+      if (s.hideFromHover) continue;
+      const v = interpAt(s.data, t);
+      if (v === null || !Number.isFinite(v)) continue;
+      items.push({ color: s.color, label: s.name, value: yFmt(v) });
+    }
+    if (items.length === 0) return null;
+    return { items, title: fmtSeconds(t) };
+  };
+
+  if (series.every((s) => s.data.length === 0)) {
+    return <ChartEmpty />;
+  }
+
+  return (
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {/* y-axis gridlines + labels */}
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {yFmt(v)}
+            </text>
+          </g>
+        );
+      })}
+
+      {/* Raw scatter underlay */}
+      {series
+        .filter((s) => s.rawData && s.rawData.length > 0)
+        .map((s, si) =>
+          subsample(s.rawData!).map((d, i) => (
+            <circle
+              key={`r${si}-${i}`}
+              cx={xScale(d.t)}
+              cy={yScale(d.value)}
+              r={1.5}
+              fill={s.color}
+              opacity={0.2}
+            />
+          )),
+        )}
+
+      {/* Lines */}
+      {series.map((s, si) => {
+        if (s.data.length === 0) return null;
+        const sampled = subsample(s.data);
+        const path = sampled
+          .map(
+            (d, i) =>
+              `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`,
+          )
+          .join(' ');
+        return (
+          <path
+            key={`l${si}`}
+            d={path}
+            fill="none"
+            stroke={s.color}
+            strokeWidth={s.strokeWidth ?? 1.8}
+            strokeOpacity={s.strokeOpacity ?? 1}
+          />
+        );
+      })}
+
+      {/* Horizontal reference lines (e.g. KV-cache pool ceiling). Drawn on top
+          of the data lines, with a label pinned to the right edge. */}
+      {refLines.map((ref, i) => {
+        if (!Number.isFinite(ref.value) || ref.value < 0 || ref.value > yMax) return null;
+        const y = yScale(ref.value);
+        const color = ref.color ?? '#16a34a';
+        return (
+          <g key={`ref${i}`}>
+            <line
+              x1={PAD.left}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke={color}
+              strokeWidth={1.5}
+              strokeDasharray="5 4"
+              opacity={0.85}
+            />
+            <text
+              x={PAD.left + innerW - 4}
+              y={y - 4}
+              fontSize={10}
+              fill={color}
+              opacity={0.95}
+              textAnchor="end"
+            >
+              {ref.label}
+            </text>
+          </g>
+        );
+      })}
+
+      {/* X-axis */}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+
+      {yAxisLabel && (
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          {yAxisLabel}
+        </text>
+      )}
+
+      {/* Legend — skip series flagged hideFromHover so per-engine
+          underlays don't clutter the chip row. */}
+      {(() => {
+        const visible = series.filter((s) => !s.hideFromHover);
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, visible.length);
+        return visible.map((s, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <line
+                x1={x + 2}
+                x2={x + 14}
+                y1={chipY - 4}
+                y2={chipY - 4}
+                stroke={s.color}
+                strokeWidth={s.strokeWidth ?? 2}
+              />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {s.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
+  );
+}
+
+// Fixed colors for the token-source names the chart-series builder emits
+// (vLLM names first, then the SGLang names compute-chart-series produces).
+const KNOWN_SOURCE_COLORS: Record<string, string> = {
+  local_compute: '#f97316',
+  local_cache_hit: '#3b82f6',
+  external_kv_transfer: '#22c55e',
+  miss: '#f97316',
+  'cache hit (HBM)': '#3b82f6',
+  'cache hit (CPU offload)': '#22c55e',
+  'cache hit': '#3b82f6',
+  'compute (miss)': '#f97316',
+};
+
+const SOURCE_LABELS: Record<string, string> = {
+  local_compute: 'Prefill',
+  local_cache_hit: 'HBM Cache Hit',
+  external_kv_transfer: 'Offload Cache Hit',
+  miss: 'Miss',
+};
+
+// Fallback palette for any source name not in KNOWN_SOURCE_COLORS so we never
+// emit two layers in the same shade. Cycles by stack (insertion) order.
+const FALLBACK_PALETTE = [
+  '#3b82f6',
+  '#f97316',
+  '#22c55e',
+  '#a855f7',
+  '#ef4444',
+  '#06b6d4',
+  '#f59e0b',
+  '#ec4899',
+];
+
+/** Stacked-area chart for token-source share over time. */
+export function StackedAreaChart({
+  sourceSeries,
+  durationS,
+  width = 720,
+  height = 260,
+}: {
+  sourceSeries: Record<string, TimeSeriesPoint[]>;
+  durationS: number;
+  width?: number;
+  height?: number;
+}) {
+  const W = width;
+  const H = height;
+
+  const computed = useMemo(() => {
+    const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
+    if (entries.length === 0) return null;
+
+    // Different sources can land on different scrape timestamps
+    // (SGLang's hits/misses fire on alternating ticks), so we MUST
+    // align across all sources before computing shares — otherwise the
+    // share calculation indexes into each source's own time axis and
+    // mixes values from different moments.
+    //
+    // Approach: union all timestamps across sources, then for each
+    // unique timestamp carry forward the cumulative sum for every
+    // source (a source that didn't report at time t holds its previous
+    // cumulative value rather than dropping to 0).
+    const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted(
+      (a, b) => a - b,
+    );
+
+    // For each source, walk its (sorted) array and produce a parallel
+    // cumulative-sum array indexed against `tValues` via carry-forward.
+    const cum: Record<string, number[]> = {};
+    for (const [name, arr] of entries) {
+      const valByT = new Map(arr.map((p) => [p.t, p.value]));
+      const out: number[] = Array.from({ length: tValues.length });
+      let acc = 0;
+      for (let i = 0; i < tValues.length; i++) {
+        const v = valByT.get(tValues[i]!);
+        if (v !== undefined) acc += v;
+        out[i] = acc;
+      }
+      cum[name] = out;
+    }
+
+    const shares: Record<string, number[]> = {};
+    for (const name of Object.keys(cum)) shares[name] = [];
+    for (let i = 0; i < tValues.length; i++) {
+      const total = entries.reduce((s, [name]) => s + (cum[name]?.[i] ?? 0), 0);
+      for (const [name] of entries) {
+        shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0);
+      }
+    }
+    return { tValues, shares };
+  }, [sourceSeries]);
+
+  if (!computed) {
+    return <ChartEmpty />;
+  }
+  const { tValues, shares } = computed;
+
+  const stackOrder = Object.keys(shares);
+
+  // Assign colors once per render in stack order so the layers and the hover
+  // tooltip always agree, including for unknown source names on the fallback
+  // palette.
+  const colorByName = new Map<string, string>();
+  let fallbackIdx = 0;
+  for (const name of stackOrder) {
+    const known = KNOWN_SOURCE_COLORS[name];
+    colorByName.set(name, known ?? FALLBACK_PALETTE[fallbackIdx++ % FALLBACK_PALETTE.length]!);
+  }
+  const colorFor = (name: string): string => colorByName.get(name) ?? FALLBACK_PALETTE[0]!;
+
+  const innerW = W - PAD.left - PAD.right;
+  const innerH = H - PAD.top - PAD.bottom;
+  const xMax = Math.max(durationS, 1);
+  const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+  const yScale = (v: number) => PAD.top + (1 - v) * innerH;
+
+  const lower: number[] = Array.from({ length: tValues.length }, () => 0);
+  const layers = stackOrder.map((name) => {
+    const upper = shares[name]!.map((v, i) => lower[i]! + v);
+    const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const d = `${top
+      .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} ${[...bottom]
+      .toReversed()
+      .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} Z`;
+    const color = colorFor(name);
+    for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
+    return { name, color, d };
+  });
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    // Find the closest tValue index.
+    let idx = 0;
+    let bestDist = Infinity;
+    for (let i = 0; i < tValues.length; i++) {
+      const d = Math.abs(tValues[i]! - t);
+      if (d < bestDist) {
+        bestDist = d;
+        idx = i;
+      }
+    }
+    const items: HoverItem[] = stackOrder.map((name) => ({
+      color: colorFor(name),
+      label: SOURCE_LABELS[name] ?? name,
+      value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
+    }));
+    return { items, title: fmtSeconds(t) };
+  };
+
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = [0, 0.25, 0.5, 0.75, 1];
+
+  return (
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {(v * 100).toFixed(0)}%
+            </text>
+          </g>
+        );
+      })}
+      {layers.map((l, i) => (
+        <path key={i} d={l.d} fill={l.color} opacity={0.75} />
+      ))}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+      <text
+        x={10}
+        y={H / 2}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+        transform={`rotate(-90 10 ${H / 2})`}
+      >
+        % of prefill tokens
+      </text>
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, layers.length);
+        return layers.map((l, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <rect x={x + 2} y={chipY - 9} width={12} height={8} fill={l.color} opacity={0.75} />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {SOURCE_LABELS[l.name] ?? l.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-math.test.ts b/packages/app/src/components/inference/agentic-point/time-series-math.test.ts
new file mode 100644
index 00000000..d92fc9ba
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-math.test.ts
@@ -0,0 +1,457 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import {
+  averageSequenceLengthInFlight,
+  buildThroughputChartSeries,
+  cumulativeAverage,
+  cumulativeCompletedRequests,
+  cumulativeDifferenceMonotonic,
+  cumulativeTimeAverage,
+  cumulativeUniqueInputTokens,
+  inflightUniqueTokens,
+  interpAt,
+  rollingAverage,
+  rollingRequestMetric,
+  timeRollingAverage,
+  toggleThroughputSeries,
+} from './time-series-math';
+
+const request = (
+  endS: number,
+  ttftMs: number | null,
+  tpotMs: number | null,
+  overrides: Partial<RequestRecord> = {},
+): RequestRecord => ({
+  cid: 'conversation',
+  ti: endS,
+  wid: 'worker',
+  ad: 0,
+  phase: 'profiling',
+  credit: 0,
+  start: 0,
+  ack: null,
+  end: endS * 1e9,
+  ttftMs,
+  tpotMs,
+  isl: 100,
+  osl: 10,
+  cancelled: false,
+  ...overrides,
+});
+
+describe('rollingRequestMetric', () => {
+  it('computes a trailing P75 TTFT over the requested window', () => {
+    const result = rollingRequestMetric(
+      [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)],
+      'ttft',
+      'p75',
+      3,
+    );
+
+    expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 });
+    expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]);
+    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.325]);
+  });
+
+  it('inverts the rolling TPOT percentile for interactivity', () => {
+    const result = rollingRequestMetric(
+      [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)],
+      'interactivity',
+      'p90',
+      3,
+    );
+
+    expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]);
+    expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8);
+    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]);
+  });
+
+  it('computes E2E latency from request start through request end', () => {
+    const result = rollingRequestMetric(
+      [request(2, 100, 10, { start: 500_000_000 }), request(4, 200, 20, { start: 1_000_000_000 })],
+      'e2e',
+      'p90',
+      50,
+    );
+
+    expect(result.raw).toEqual([
+      { t: 2, value: 1.5 },
+      { t: 4, value: 3 },
+    ]);
+    expect(result.trend.at(-1)?.value).toBeCloseTo(2.85, 8);
+    expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8);
+  });
+
+  it('drops cancelled, missing, and non-positive samples (phase is the caller’s concern)', () => {
+    const result = rollingRequestMetric(
+      [
+        request(1, 100, 10),
+        request(2, 200, 20, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline
+        request(3, 300, 30, { cancelled: true }),
+        request(4, null, null),
+        request(5, 0, 0),
+      ],
+      'ttft',
+      'p90',
+    );
+
+    expect(result.raw).toEqual([
+      { t: 1, value: 0.1 },
+      { t: 2, value: 0.2 },
+    ]);
+  });
+});
+
+describe('timeRollingAverage', () => {
+  it('integrates the step function over the trailing window', () => {
+    const result = timeRollingAverage(
+      [
+        { t: 0, value: 10 },
+        { t: 2, value: 20 },
+        { t: 4, value: 40 },
+      ],
+      4,
+    );
+
+    // t=0: zero-length window → raw value. t=2: 10 held on [0,2) → 10.
+    // t=4: 10 on [0,2) + 20 on [2,4) = 60 area / 4 s = 15.
+    expect(result).toEqual([
+      { t: 0, value: 10 },
+      { t: 2, value: 10 },
+      { t: 4, value: 15 },
+    ]);
+  });
+
+  it('carries the pre-window step value into a clipped window', () => {
+    const result = timeRollingAverage(
+      [
+        { t: 0, value: 10 },
+        { t: 2, value: 20 },
+        { t: 4, value: 40 },
+      ],
+      2,
+    );
+
+    // Window [2,4): value 20 held throughout (the t=0 sample sets the step
+    // value at the window start via carry-forward of data[j-1]).
+    expect(result.at(-1)).toEqual({ t: 4, value: 20 });
+  });
+
+  it('passes through empty input and non-positive windows', () => {
+    expect(timeRollingAverage([], 30)).toEqual([]);
+    const data = [{ t: 0, value: 1 }];
+    expect(timeRollingAverage(data, 0)).toBe(data);
+  });
+});
+
+describe('rollingAverage', () => {
+  it('averages a centered window clipped at the edges', () => {
+    const data = [1, 2, 3, 4].map((value, i) => ({ t: i, value }));
+    expect(rollingAverage(data, 3).map((p) => p.value)).toEqual([1.5, 2, 3, 3.5]);
+  });
+
+  it('passes through window sizes of 1 or less', () => {
+    const data = [{ t: 0, value: 5 }];
+    expect(rollingAverage(data, 1)).toBe(data);
+  });
+});
+
+describe('cumulativeAverage', () => {
+  it('hides the startup interval without removing it from later averages', () => {
+    const result = cumulativeAverage(
+      [
+        { t: 0, value: 300 },
+        { t: 30, value: 0 },
+        { t: 60, value: 0 },
+        { t: 90, value: 100 },
+      ],
+      60,
+    );
+
+    expect(result).toEqual([
+      { t: 60, value: 100 },
+      { t: 90, value: 100 },
+    ]);
+  });
+
+  it('preserves the original behavior when no burn-in is requested', () => {
+    expect(
+      cumulativeAverage([
+        { t: 0, value: 10 },
+        { t: 1, value: 20 },
+      ]),
+    ).toEqual([
+      { t: 0, value: 10 },
+      { t: 1, value: 15 },
+    ]);
+  });
+});
+
+describe('cumulativeTimeAverage', () => {
+  it('computes a run-to-date time-weighted average for a step series', () => {
+    expect(
+      cumulativeTimeAverage([
+        { t: 0, value: 100 },
+        { t: 1, value: 300 },
+        { t: 3, value: 100 },
+        { t: 4, value: 0 },
+      ]),
+    ).toEqual([
+      { t: 0, value: 100 },
+      { t: 1, value: 100 },
+      { t: 3, value: 700 / 3 },
+      { t: 4, value: 200 },
+    ]);
+  });
+
+  it('coalesces same-time request events to their final step value', () => {
+    expect(
+      cumulativeTimeAverage([
+        { t: 0, value: 0 },
+        { t: 0, value: 100 },
+        { t: 2, value: 0 },
+      ]),
+    ).toEqual([
+      { t: 0, value: 100 },
+      { t: 2, value: 100 },
+    ]);
+  });
+});
+
+describe('cumulativeCompletedRequests', () => {
+  it('sorts completions and excludes cancelled requests (phase is the caller’s concern)', () => {
+    expect(
+      cumulativeCompletedRequests([
+        request(4, 100, 10),
+        request(2, 100, 10),
+        request(1, 100, 10, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline
+        request(3, 100, 10, { cancelled: true }),
+      ]),
+    ).toEqual([
+      { t: 0, value: 0 },
+      { t: 1, value: 1 },
+      { t: 2, value: 2 },
+      { t: 4, value: 3 },
+    ]);
+  });
+
+  it('returns no series when there are no successful completions', () => {
+    expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]);
+  });
+});
+
+describe('averageSequenceLengthInFlight', () => {
+  it('computes the event-time average across overlapping profiling requests', () => {
+    expect(
+      averageSequenceLengthInFlight(
+        [
+          request(4, 100, 10, { start: 0, end: 4_000_000_000, isl: 100 }),
+          request(3, 100, 10, { start: 1_000_000_000, end: 3_000_000_000, isl: 300 }),
+        ],
+        'isl',
+      ),
+    ).toEqual([
+      { t: 0, value: 100 },
+      { t: 1, value: 200 },
+      { t: 3, value: 100 },
+      { t: 4, value: 0 },
+    ]);
+  });
+
+  it('excludes cancelled and missing sequence lengths (phase is the caller’s concern)', () => {
+    // Only the null-osl and cancelled rows are dropped; the warmup row is kept
+    // (the caller passes a phase-scoped timeline), so it produces a step series.
+    expect(
+      averageSequenceLengthInFlight(
+        [
+          request(1, 100, 10, { osl: null }),
+          request(2, 100, 10, { osl: 20, cancelled: true }),
+          request(3, 100, 10, { osl: 30, phase: 'warmup', start: 0, end: 3_000_000_000 }),
+        ],
+        'osl',
+      ),
+    ).toEqual([
+      { t: 0, value: 30 },
+      { t: 3, value: 0 },
+    ]);
+  });
+});
+
+describe('toggleThroughputSeries', () => {
+  it('allows either series to be hidden when both are selected', () => {
+    expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'input')]).toEqual(['decode']);
+    expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'decode')]).toEqual(['input']);
+  });
+
+  it('does not allow the final visible series to be hidden', () => {
+    const selected = new Set<'input' | 'decode'>(['decode']);
+    expect(toggleThroughputSeries(selected, 'decode')).toBe(selected);
+  });
+
+  it('allows the hidden series to be restored', () => {
+    expect([...toggleThroughputSeries(new Set(['decode']), 'input')]).toEqual(['decode', 'input']);
+  });
+
+  it('only includes the total running average when both series are visible', () => {
+    const input = [{ t: 0, value: 10 }];
+    const decode = [{ t: 0, value: 20 }];
+
+    expect(
+      buildThroughputChartSeries(input, decode, new Set(['input', 'decode'])).map(
+        ({ name }) => name,
+      ),
+    ).toEqual(['Input (avg n=50)', 'Decode (avg n=50)', 'Total running avg (60s burn-in)']);
+    expect(
+      buildThroughputChartSeries(input, decode, new Set(['input'])).map(({ name }) => name),
+    ).toEqual(['Input (avg n=50)']);
+    expect(
+      buildThroughputChartSeries(input, decode, new Set(['decode'])).map(({ name }) => name),
+    ).toEqual(['Decode (avg n=50)']);
+  });
+});
+
+describe('cumulativeUniqueInputTokens', () => {
+  it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => {
+    const out = cumulativeUniqueInputTokens({
+      local_compute: [
+        { t: 0, value: 100 },
+        { t: 1, value: 50 },
+      ],
+      local_cache_hit: [
+        { t: 0, value: 900 },
+        { t: 1, value: 950 },
+      ],
+      external_kv_transfer: [
+        { t: 0, value: 5000 },
+        { t: 1, value: 6000 },
+      ],
+    });
+    expect(out).toEqual([
+      { t: 0, value: 100 },
+      { t: 1, value: 150 },
+    ]);
+  });
+
+  it('recognizes the sglang compute/cache labels the builder emits', () => {
+    const out = cumulativeUniqueInputTokens({
+      'compute (miss)': [
+        { t: 0, value: 10 },
+        { t: 2, value: 20 },
+      ],
+      'cache hit (HBM)': [{ t: 0, value: 999 }],
+      'cache hit (CPU offload)': [{ t: 2, value: 999 }],
+    });
+    expect(out).toEqual([
+      { t: 0, value: 10 },
+      { t: 2, value: 30 },
+    ]);
+  });
+
+  it('sums multiple non-cache buckets at the same timestamp', () => {
+    const out = cumulativeUniqueInputTokens({
+      local_compute: [{ t: 0, value: 100 }],
+      miss: [{ t: 0, value: 25 }],
+    });
+    expect(out).toEqual([{ t: 0, value: 125 }]);
+  });
+
+  it('is monotonic non-decreasing (no clamp needed — values are rates ≥ 0)', () => {
+    const out = cumulativeUniqueInputTokens({
+      local_compute: [
+        { t: 0, value: 300 },
+        { t: 1, value: 0 },
+        { t: 2, value: 10 },
+      ],
+    });
+    expect(out.map((p) => p.value)).toEqual([300, 300, 310]);
+  });
+
+  it('returns [] when there is no breakdown so the caller can fall back', () => {
+    expect(cumulativeUniqueInputTokens(undefined)).toEqual([]);
+    expect(cumulativeUniqueInputTokens({})).toEqual([]);
+  });
+
+  it('returns [] when every bucket is a cache tier (no computed signal)', () => {
+    expect(
+      cumulativeUniqueInputTokens({
+        local_cache_hit: [{ t: 0, value: 100 }],
+        'cache hit': [{ t: 0, value: 100 }],
+      }),
+    ).toEqual([]);
+  });
+});
+
+describe('inflightUniqueTokens', () => {
+  it('sums active ISLs across cids as a step series (ends before starts on ties)', () => {
+    const out = inflightUniqueTokens([
+      { cid: 'a', start: 0, end: 2e9, isl: 100 },
+      { cid: 'a', start: 2e9, end: 4e9, isl: 150 }, // turn handoff at t=2
+      { cid: 'b', start: 1e9, end: 3e9, isl: 200 },
+    ]);
+    expect(out).toEqual([
+      { t: 0, value: 0 },
+      { t: 0, value: 100 },
+      { t: 1, value: 300 },
+      { t: 2, value: 200 }, // end of a's turn 1 processed first — no double count
+      { t: 2, value: 350 },
+      { t: 3, value: 150 },
+      { t: 4, value: 0 },
+    ]);
+  });
+
+  it('counts one in-flight ISL per cid even when its requests overlap', () => {
+    const out = inflightUniqueTokens([
+      { cid: 'a', start: 0, end: 3e9, isl: 100 },
+      { cid: 'a', start: 1e9, end: 2e9, isl: 50 },
+    ]);
+    expect(out).toEqual([
+      { t: 0, value: 0 },
+      { t: 0, value: 100 },
+      { t: 1, value: 100 }, // nested request folded into the cid's max ISL
+      { t: 2, value: 0 },
+      { t: 3, value: 0 },
+    ]);
+  });
+
+  it('skips requests without a positive ISL and empty input', () => {
+    expect(inflightUniqueTokens([])).toEqual([]);
+    expect(inflightUniqueTokens([{ cid: 'a', start: 0, end: 1e9, isl: null }])).toEqual([]);
+    expect(inflightUniqueTokens([{ cid: 'a', start: 0, end: 1e9, isl: 0 }])).toEqual([]);
+  });
+});
+
+describe('cumulativeDifferenceMonotonic', () => {
+  it('unions timestamps and clamps the difference to its running max', () => {
+    expect(
+      cumulativeDifferenceMonotonic(
+        [
+          { t: 0, value: 10 },
+          { t: 1, value: 10 },
+        ],
+        [
+          { t: 0, value: 5 },
+          { t: 2, value: 20 }, // drives the raw diff negative — clamp holds
+        ],
+      ),
+    ).toEqual([
+      { t: 0, value: 5 },
+      { t: 1, value: 15 },
+      { t: 2, value: 15 },
+    ]);
+  });
+});
+
+describe('interpAt', () => {
+  it('linearly interpolates between samples and clamps outside the range', () => {
+    const data = [
+      { t: 0, value: 0 },
+      { t: 10, value: 100 },
+    ];
+    expect(interpAt(data, 5)).toBe(50);
+    expect(interpAt(data, -1)).toBe(0);
+    expect(interpAt(data, 11)).toBe(100);
+    expect(interpAt([], 5)).toBeNull();
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/time-series-math.ts b/packages/app/src/components/inference/agentic-point/time-series-math.ts
new file mode 100644
index 00000000..7242db4d
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-math.ts
@@ -0,0 +1,491 @@
+/**
+ * Pure data-shaping helpers behind the agentic point-detail time-series
+ * charts: rolling/cumulative aggregations over `TimeSeriesPoint[]` server
+ * scrapes and per-request timeline records. No React, no SVG — everything
+ * here is unit-testable in isolation (see time-series-math.test.ts).
+ */
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+
+/** One drawable line in a TimeSeriesChart. */
+export interface ChartSeries {
+  name: string;
+  /** The line to draw (caller pre-smooths if desired). */
+  data: TimeSeriesPoint[];
+  /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */
+  rawData?: TimeSeriesPoint[];
+  color: string;
+  /** Override default stroke width (1.8). Use higher values for emphasis lines. */
+  strokeWidth?: number;
+  /** Stroke opacity (0..1). Use < 1 for background/underlay lines. */
+  strokeOpacity?: number;
+  /** Hide from the hover legend (e.g. per-engine underlay lines that
+   *  would clutter the tooltip). The path still renders. */
+  hideFromHover?: boolean;
+}
+
+export type RequestMetric = 'interactivity' | 'ttft' | 'e2e';
+export type RequestPercentile = 'p75' | 'p90';
+export type ThroughputSeriesKey = 'input' | 'decode';
+
+/** Toggle one throughput series while preserving the at-least-one invariant. */
+export function toggleThroughputSeries(
+  selected: ReadonlySet<ThroughputSeriesKey>,
+  key: ThroughputSeriesKey,
+): ReadonlySet<ThroughputSeriesKey> {
+  if (selected.has(key) && selected.size === 1) return selected;
+  const next = new Set(selected);
+  if (next.has(key)) next.delete(key);
+  else next.add(key);
+  return next;
+}
+
+/** Linear-interpolated percentile (matches numpy's default method). */
+export function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+/** Linear-interpolated value at time `t` from a time-sorted series. */
+export function interpAt(data: TimeSeriesPoint[], t: number): number | null {
+  if (data.length === 0) return null;
+  if (t <= data[0]!.t) return data[0]!.value;
+  if (t >= data.at(-1)!.t) return data.at(-1)!.value;
+  // Binary search
+  let lo = 0;
+  let hi = data.length - 1;
+  while (hi - lo > 1) {
+    const mid = (lo + hi) >> 1;
+    if (data[mid]!.t <= t) lo = mid;
+    else hi = mid;
+  }
+  const a = data[lo]!;
+  const b = data[hi]!;
+  if (b.t === a.t) return a.value;
+  const frac = (t - a.t) / (b.t - a.t);
+  return a.value + (b.value - a.value) * frac;
+}
+
+/**
+ * Build raw request samples plus a trailing request-count percentile. E2E
+ * latency is measured from HTTP request start through final response byte.
+ *
+ * The percentile is computed in latency space. Interactivity then inverts
+ * the selected TPOT percentile, matching the aggregate chart convention:
+ * P90 interactivity = 1 / P90 TPOT (a conservative tail-latency view).
+ */
+export function rollingRequestMetric(
+  requests: readonly RequestRecord[],
+  metric: RequestMetric,
+  percentile: RequestPercentile,
+  windowSize = 50,
+): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } {
+  const q = percentile === 'p75' ? 0.75 : 0.9;
+  // Phase is the caller's concern — the agentic detail page passes a
+  // phase-scoped (warmup or profiling) timeline. Here we only drop cancelled
+  // requests and samples without a usable latency value.
+  const samples = requests
+    .filter((request) => !request.cancelled)
+    .flatMap((request) => {
+      const latencyMs =
+        metric === 'ttft'
+          ? request.ttftMs
+          : metric === 'e2e'
+            ? (request.end - request.start) / 1e6
+            : request.tpotMs;
+      if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return [];
+      return [{ t: request.end / 1e9, latencyMs }];
+    })
+    .toSorted((a, b) => a.t - b.t);
+
+  const raw = samples.map(({ t, latencyMs }) => ({
+    t,
+    value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000,
+  }));
+  const trend = samples.map(({ t }, i) => {
+    const start = Math.max(0, i - Math.max(1, windowSize) + 1);
+    const sorted = samples
+      .slice(start, i + 1)
+      .map((sample) => sample.latencyMs)
+      .toSorted((a, b) => a - b);
+    const latencyMs = quantile(sorted, q);
+    return { t, value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000 };
+  });
+  const prefixLatencies: number[] = [];
+  const cumulative = samples.map(({ t, latencyMs }) => {
+    let lo = 0;
+    let hi = prefixLatencies.length;
+    while (lo < hi) {
+      const mid = (lo + hi) >> 1;
+      if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1;
+      else hi = mid;
+    }
+    prefixLatencies.splice(lo, 0, latencyMs);
+    const cumulativeLatencyMs = quantile(prefixLatencies, q);
+    return {
+      t,
+      value: metric === 'interactivity' ? 1000 / cumulativeLatencyMs : cumulativeLatencyMs / 1000,
+    };
+  });
+
+  return { raw, trend, cumulative };
+}
+
+/**
+ * Time-weighted rolling average over a `windowS`-second trailing window.
+ * Treats the input as a step function (value held constant between
+ * samples) and integrates over the trailing window, dividing by the
+ * window length. Good for smoothing irregularly-sampled event series
+ * (e.g. request start/end events) where the regular sample-count
+ * `rollingAverage` would over-weight bursts of close-together events.
+ */
+export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] {
+  if (data.length === 0 || windowS <= 0) return data;
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  for (let i = 0; i < data.length; i++) {
+    const tEnd = data[i]!.t;
+    const tStart = Math.max(0, tEnd - windowS);
+    // Find the first sample j whose t is >= tStart; the step value at
+    // tStart is data[j-1].value if j > 0, else data[0].value.
+    let j = 0;
+    while (j < data.length && data[j]!.t < tStart) j++;
+    let prevT = tStart;
+    let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value;
+    let area = 0;
+    for (; j <= i; j++) {
+      const curT = data[j]!.t;
+      area += prevV * (curT - prevT);
+      prevT = curT;
+      prevV = data[j]!.value;
+    }
+    const dur = tEnd - tStart;
+    out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value };
+  }
+  return out;
+}
+
+/** Centered rolling average over `windowSize` samples. */
+export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
+  if (data.length === 0 || windowSize <= 1) return data;
+  const half = Math.floor(windowSize / 2);
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  for (let i = 0; i < data.length; i++) {
+    const start = Math.max(0, i - half);
+    const end = Math.min(data.length, i + half + 1);
+    let sum = 0;
+    let n = 0;
+    for (let j = start; j < end; j++) {
+      sum += data[j]!.value;
+      n++;
+    }
+    out[i] = { t: data[i]!.t, value: n > 0 ? sum / n : 0 };
+  }
+  return out;
+}
+
+/**
+ * Expanding-window cumulative mean from index 0..i.
+ *
+ * `burnInS` suppresses rendering during the unstable startup interval while
+ * retaining those samples in every later average. This avoids visually
+ * promoting a single bursty counter bucket without changing the run-to-date
+ * meaning of the line once it appears.
+ */
+export function cumulativeAverage(data: TimeSeriesPoint[], burnInS = 0): TimeSeriesPoint[] {
+  if (data.length === 0) return data;
+  const out: TimeSeriesPoint[] = [];
+  const firstT = data[0]!.t;
+  let sum = 0;
+  for (let i = 0; i < data.length; i++) {
+    sum += data[i]!.value;
+    if (data[i]!.t - firstT >= burnInS) {
+      out.push({ t: data[i]!.t, value: sum / (i + 1) });
+    }
+  }
+  return out;
+}
+
+/**
+ * Run-to-date time-weighted average of a step series.
+ *
+ * Duplicate timestamps are coalesced to their final value before integration;
+ * this is important for request handoffs where several start/end events occur
+ * at the same instant. Each value is held until the next timestamp.
+ */
+export function cumulativeTimeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  if (data.length === 0) return [];
+  const points: TimeSeriesPoint[] = [];
+  for (const point of data.toSorted((a, b) => a.t - b.t)) {
+    if (!Number.isFinite(point.t) || !Number.isFinite(point.value)) continue;
+    const previous = points.at(-1);
+    if (previous?.t === point.t) previous.value = point.value;
+    else points.push({ ...point });
+  }
+  if (points.length === 0) return [];
+
+  const firstT = points[0]!.t;
+  let previousT = firstT;
+  let previousValue = points[0]!.value;
+  let area = 0;
+  return points.map((point, index) => {
+    if (index === 0) return { t: point.t, value: point.value };
+    area += previousValue * (point.t - previousT);
+    const duration = point.t - firstT;
+    previousT = point.t;
+    previousValue = point.value;
+    return { t: point.t, value: duration > 0 ? area / duration : point.value };
+  });
+}
+
+/**
+ * Cumulative count of successfully completed (non-cancelled) requests by end
+ * time. Phase is the caller's concern — pass a phase-scoped timeline.
+ */
+export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] {
+  const completionTimes = requests
+    .filter((request) => !request.cancelled)
+    .map((request) => request.end / 1e9)
+    .filter(Number.isFinite)
+    .toSorted((a, b) => a - b);
+  if (completionTimes.length === 0) return [];
+  return [{ t: 0, value: 0 }, ...completionTimes.map((t, index) => ({ t, value: index + 1 }))];
+}
+
+/**
+ * Retrospective average sequence length among requests active at each event.
+ * OSL uses the request's final observed length across its whole lifetime.
+ */
+export function averageSequenceLengthInFlight(
+  requests: readonly RequestRecord[],
+  metric: 'isl' | 'osl',
+): TimeSeriesPoint[] {
+  const events = new Map<number, { tokenDelta: number; countDelta: number }>();
+  const addEvent = (t: number, tokenDelta: number, countDelta: number) => {
+    const current = events.get(t) ?? { tokenDelta: 0, countDelta: 0 };
+    current.tokenDelta += tokenDelta;
+    current.countDelta += countDelta;
+    events.set(t, current);
+  };
+
+  // Phase is the caller's concern — pass a phase-scoped timeline.
+  for (const request of requests) {
+    const tokens = request[metric];
+    if (
+      request.cancelled ||
+      tokens === null ||
+      !Number.isFinite(tokens) ||
+      tokens < 0 ||
+      request.end < request.start
+    ) {
+      continue;
+    }
+    addEvent(request.start / 1e9, tokens, 1);
+    addEvent(request.end / 1e9, -tokens, -1);
+  }
+
+  let tokensInFlight = 0;
+  let requestsInFlight = 0;
+  return [...events.entries()]
+    .toSorted((a, b) => a[0] - b[0])
+    .map(([t, event]) => {
+      tokensInFlight += event.tokenDelta;
+      requestsInFlight += event.countDelta;
+      return { t, value: requestsInFlight > 0 ? tokensInFlight / requestsInFlight : 0 };
+    });
+}
+
+// A promptTokensBySource bucket label denotes tokens served from some cache
+// tier (local prefix cache, offloaded/host KV, remote KV transfer) rather than
+// freshly computed. Matches vllm labels (`local_cache_hit`,
+// `external_kv_transfer`) and the sglang labels the chart-series builder emits
+// (`cache hit (HBM)`, `cache hit (CPU offload)`, `cache hit`).
+const CACHE_SOURCE_RE = /cache|hit|transfer|reuse/iu;
+
+/**
+ * Cumulative "unique" (freshly prefill-computed) input tokens from the
+ * promptTokensBySource breakdown: total prompt tokens minus everything served
+ * from a cache tier. The breakdown's buckets sum to the real prompt-token
+ * total per scrape, so this is internally consistent and naturally monotonic.
+ *
+ * Preferred over `cumulativeDifferenceMonotonic(prefillTps, prefixCacheHitsTps)`
+ * because `vllm:prefix_cache_hits` re-counts tokens across chunked-prefill /
+ * preemption scheduler passes — its cumulative routinely exceeds the prompt
+ * tokens ever received, which drove the difference deeply negative and froze
+ * the monotonic-clamped curve at whatever it reached in the first few seconds.
+ *
+ * Any bucket whose label isn't recognizably a cache tier counts as computed
+ * (the safe direction for "unique"): a new fresh-compute label over-reports
+ * unique slightly rather than silently freezing the line. Returns [] when no
+ * breakdown is available so the caller can fall back.
+ */
+export function cumulativeUniqueInputTokens(
+  promptTokensBySource: Record<string, TimeSeriesPoint[]> | undefined,
+): TimeSeriesPoint[] {
+  if (!promptTokensBySource) return [];
+  const computedByT = new Map<number, number>();
+  let sawComputed = false;
+  for (const [source, series] of Object.entries(promptTokensBySource)) {
+    if (CACHE_SOURCE_RE.test(source)) continue;
+    sawComputed = true;
+    for (const p of series) computedByT.set(p.t, (computedByT.get(p.t) ?? 0) + p.value);
+  }
+  if (!sawComputed) return [];
+  const out: TimeSeriesPoint[] = [];
+  let sum = 0;
+  for (const t of [...computedByT.keys()].toSorted((x, y) => x - y)) {
+    sum += computedByT.get(t)!;
+    out.push({ t, value: sum });
+  }
+  return out;
+}
+
+/**
+ * Per-event step series: at each request start/end, sum the ISLs of
+ * currently-active requests across distinct `cid`s. Within a single
+ * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N),
+ * so each cid contributes at most one in-flight ISL at a time. Across
+ * different cids we assume content is independent (parent ↔ subagent
+ * and conv ↔ conv share negligible prefix in practice — cross-conv
+ * dedup added ~0.25 pp to theoretical hit rate, so treating them as
+ * independent is a tight approximation of the true in-flight unique
+ * token count).
+ *
+ * Output is a step function: one point per event, value held constant
+ * until the next event. Time axis is seconds relative to the earliest
+ * event in `requests`.
+ */
+export function inflightUniqueTokens(
+  requests: readonly { cid: string; start: number; end: number; isl: number | null }[],
+): TimeSeriesPoint[] {
+  if (requests.length === 0) return [];
+  // The request_timeline timestamps are ns-relative to its own origin.
+  // Convert events to seconds and emit a step series.
+  interface Event {
+    tNs: number;
+    kind: 'start' | 'end';
+    cid: string;
+    isl: number;
+  }
+  const events: Event[] = [];
+  for (const r of requests) {
+    const isl = r.isl ?? 0;
+    if (isl <= 0) continue;
+    events.push(
+      { tNs: r.start, kind: 'start', cid: r.cid, isl },
+      { tNs: r.end, kind: 'end', cid: r.cid, isl },
+    );
+  }
+  if (events.length === 0) return [];
+  // Sort by time; on ties, process 'end' before 'start' so a same-instant
+  // turn handoff within one cid doesn't transiently double-count.
+  events.sort((a, b) => a.tNs - b.tNs || (a.kind === 'end' ? -1 : 1));
+
+  // Active ISL per cid (max in case the same cid somehow has overlapping
+  // events; in practice it's always 0 or 1 request at a time per cid).
+  const activeByCid = new Map<string, number>();
+  let total = 0;
+  const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }];
+  for (const e of events) {
+    const tSec = e.tNs / 1e9;
+    if (e.kind === 'start') {
+      const prev = activeByCid.get(e.cid) ?? 0;
+      const next = Math.max(prev, e.isl);
+      activeByCid.set(e.cid, next);
+      total += next - prev;
+    } else {
+      const cur = activeByCid.get(e.cid) ?? 0;
+      if (cur > 0) {
+        total -= cur;
+        activeByCid.delete(e.cid);
+      }
+    }
+    out.push({ t: tSec, value: Math.max(0, total) });
+  }
+  return out;
+}
+
+/**
+ * Monotonic-non-decreasing cumulative difference of two rate series:
+ * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce
+ * a running max so the curve never dips below its prior value.
+ *
+ * Use this to plot things like "cumulative cache-missed tokens" where the
+ * true value can only ever grow, but the underlying per-tick rates can
+ * temporarily look negative due to counter timing skew between scrapes
+ * (vllm's `prefix_cache_hits` and `prompt_tokens` counters can lag each
+ * other by ~5-10 s in our data even though their lifetime totals agree).
+ *
+ * `a` and `b` may have different (or overlapping) timestamp sets — both
+ * are unioned and walked in time order. Output has one point per unique
+ * timestamp present in either input.
+ */
+export function cumulativeDifferenceMonotonic(
+  a: TimeSeriesPoint[],
+  b: TimeSeriesPoint[],
+): TimeSeriesPoint[] {
+  const aByT = new Map(a.map((p) => [p.t, p.value]));
+  const bByT = new Map(b.map((p) => [p.t, p.value]));
+  const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y);
+  const out: TimeSeriesPoint[] = Array.from({ length: allT.length });
+  let cumA = 0;
+  let cumB = 0;
+  let runningMax = 0;
+  for (let i = 0; i < allT.length; i++) {
+    const t = allT[i]!;
+    cumA += aByT.get(t) ?? 0;
+    cumB += bByT.get(t) ?? 0;
+    const diff = cumA - cumB;
+    if (diff > runningMax) runningMax = diff;
+    out[i] = { t, value: runningMax };
+  }
+  return out;
+}
+
+/** Pointwise sum of two arrays sharing the same t index. */
+function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  const n = Math.min(a.length, b.length);
+  const out: TimeSeriesPoint[] = Array.from({ length: n });
+  for (let i = 0; i < n; i++) {
+    out[i] = { t: a[i]!.t, value: a[i]!.value + b[i]!.value };
+  }
+  return out;
+}
+
+/** Build throughput lines from the currently visible input/decode signals. */
+export function buildThroughputChartSeries(
+  input: TimeSeriesPoint[],
+  decode: TimeSeriesPoint[],
+  selected: ReadonlySet<ThroughputSeriesKey>,
+): ChartSeries[] {
+  const series: ChartSeries[] = [];
+  if (selected.has('input')) {
+    series.push({
+      name: 'Input (avg n=50)',
+      data: rollingAverage(input, 50),
+      color: '#3b82f6',
+      strokeWidth: 1.6,
+    });
+  }
+  if (selected.has('decode')) {
+    series.push({
+      name: 'Decode (avg n=50)',
+      data: rollingAverage(decode, 50),
+      color: '#f97316',
+      strokeWidth: 1.6,
+    });
+  }
+  if (selected.size === 2) {
+    series.push({
+      name: 'Total running avg (60s burn-in)',
+      data: cumulativeAverage(sumSeries(input, decode), 60),
+      color: '#ef4444',
+      strokeWidth: 3,
+    });
+  }
+  return series;
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-bars.tsx b/packages/app/src/components/inference/agentic-point/timeline-bars.tsx
new file mode 100644
index 00000000..a5444cb2
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-bars.tsx
@@ -0,0 +1,252 @@
+'use client';
+
+import { memo } from 'react';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import {
+  CHART_WIDTH,
+  HEADER_HEIGHT,
+  PADDING_RIGHT,
+  ROW_GAP,
+  ROW_HEIGHT,
+  timelineSvgHeight,
+} from './timeline-layout';
+import { formatTickLabel } from './timeline-format';
+import { conversationHref, type RequestTimelineRow } from './timeline-rows';
+
+/** Phase color overlay drawn as a thin strip at the bottom of each bar. */
+const PHASE_COLORS: Record<string, string> = {
+  profiling: '#22c55e',
+  warmup: '#94a3b8',
+  unknown: '#64748b',
+};
+
+// Time-axis tick spacing candidates (~8 ticks across the visible window,
+// snapped to the first nice multiple that fits).
+const NICE_TICK_MS = [
+  100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000,
+];
+
+export interface TimelineBarsProps {
+  rows: RequestTimelineRow[];
+  expandedSubagents: ReadonlySet<string>;
+  /** Absolute ns timestamp of the visible data's origin (min credit). */
+  dataStart: number;
+  /** Visible window (ns offsets from dataStart). */
+  vStart: number;
+  vEnd: number;
+  datasetSlug?: string | null;
+  onBarHover: (e: React.MouseEvent, row: RequestTimelineRow, req: RequestRecord) => void;
+  onBarLeave: () => void;
+  /** Plain left-click SPA navigation; modified clicks fall through to the href. */
+  onBarClick: (e: React.MouseEvent, req: RequestRecord) => void;
+}
+
+/**
+ * The static SVG content of the timeline: time axis, row separators, and every
+ * request bar. Memoized so tooltip/cursor mousemove state changes in the parent
+ * (which fire on every pointer move) don't re-render thousands of bar rects —
+ * only zoom/pan, filter, and expansion changes reach this subtree.
+ */
+export const TimelineBars = memo(
+  ({
+    rows,
+    expandedSubagents,
+    dataStart,
+    vStart,
+    vEnd,
+    datasetSlug,
+    onBarHover,
+    onBarLeave,
+    onBarClick,
+  }: TimelineBarsProps) => {
+    const svgHeight = timelineSvgHeight(rows.length);
+    const visibleDur = Math.max(vEnd - vStart, 1);
+    const scale = (CHART_WIDTH - PADDING_RIGHT) / visibleDur;
+    // Local coords: convert ns offset from dataStart to x px.
+    const xOf = (ns: number) => (ns - dataStart - vStart) * scale;
+
+    // Time-axis ticks (~8 across visible window, snapped to nice second multiples).
+    const targetMs = visibleDur / 1e6 / 8;
+    const tickMs = NICE_TICK_MS.find((n) => n >= targetMs) ?? targetMs;
+    const tickNs = tickMs * 1e6;
+    const ticks: number[] = [];
+    const tickStart = Math.floor(vStart / tickNs) * tickNs;
+    for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) {
+      if (t >= vStart && t <= vEnd) ticks.push(t);
+    }
+
+    return (
+      <>
+        {/* Header / time-axis baseline */}
+        <line
+          x1={0}
+          y1={HEADER_HEIGHT}
+          x2={CHART_WIDTH}
+          y2={HEADER_HEIGHT}
+          stroke="currentColor"
+          opacity={0.15}
+        />
+
+        {/* Time axis ticks */}
+        {ticks.map((t) => {
+          // Convert visible-window ns offset → x px (the tick array
+          // is already in dataStart-relative coords).
+          const x = (t - vStart) * scale;
+          return (
+            <g key={t}>
+              <line
+                x1={x}
+                y1={HEADER_HEIGHT}
+                x2={x}
+                y2={svgHeight}
+                stroke="currentColor"
+                opacity={0.08}
+                strokeDasharray="2 4"
+              />
+              <text
+                x={x + 2}
+                y={HEADER_HEIGHT - 6}
+                fill="currentColor"
+                opacity={0.55}
+                fontSize={9}
+                fontFamily="ui-monospace, SFMono-Regular, monospace"
+              >
+                {formatTickLabel(t)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* Row separators */}
+        {rows.map((row, idx) => (
+          <line
+            key={`sep-${row.key}`}
+            x1={0}
+            y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+            x2={CHART_WIDTH}
+            y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+            stroke="currentColor"
+            opacity={0.04}
+          />
+        ))}
+
+        {/* Request bars */}
+        {rows.map((row, rowIdx) => {
+          const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+          const barH = ROW_HEIGHT - 4;
+          // For multi-stream subagent containers, suppress the union
+          // bars when expanded — the child stream rows draw them
+          // individually instead, so we'd double-draw otherwise.
+          if (
+            row.kind === 'subagent' &&
+            (row.streamCount ?? 1) > 1 &&
+            expandedSubagents.has(row.key)
+          ) {
+            return null;
+          }
+          return row.requests.map((req) => {
+            const xCredit = xOf(req.credit);
+            const xStart = xOf(req.start);
+            const xEnd = xOf(req.end);
+            // Cull bars entirely outside the visible window so big
+            // benchmarks don't render thousands of zero-width rects.
+            if (xEnd < -2 || xCredit > CHART_WIDTH + 2) return null;
+            const runW = Math.max(xEnd - xStart, 1);
+            const queueW = Math.max(xStart - xCredit, 0);
+            const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+            const barKey = `${req.cid}-${req.ti}-${req.start}`;
+            const barChildren = (
+              <>
+                {/* Queue lead-in (faint) — only drawn when noticeable. */}
+                {queueW >= 1 && (
+                  <rect
+                    x={xCredit}
+                    y={yTop + barH / 2 - 1}
+                    width={queueW}
+                    height={2}
+                    fill={row.color}
+                    opacity={0.35}
+                  />
+                )}
+                {/* Main bar — opacity stepped down with depth so
+                parent > subagent > stream reads visually. */}
+                <rect
+                  x={xStart}
+                  y={yTop}
+                  width={runW}
+                  height={barH}
+                  rx={2}
+                  fill={row.color}
+                  opacity={
+                    req.cancelled
+                      ? 0.35
+                      : row.kind === 'stream' || row.kind === 'aux'
+                        ? 0.5
+                        : row.kind === 'subagent'
+                          ? 0.6
+                          : 0.85
+                  }
+                />
+                {/* Phase strip at bottom */}
+                <rect
+                  x={xStart}
+                  y={yTop + barH - 2}
+                  width={runW}
+                  height={2}
+                  rx={1}
+                  fill={phaseColor}
+                  opacity={0.85}
+                />
+                {/* Cancelled X overlay */}
+                {req.cancelled && runW > 6 && (
+                  <line
+                    x1={xStart + 1}
+                    y1={yTop + 1}
+                    x2={xStart + runW - 1}
+                    y2={yTop + barH - 1}
+                    stroke="currentColor"
+                    strokeWidth={0.7}
+                    opacity={0.6}
+                  />
+                )}
+              </>
+            );
+            // No source dataset → not linkable; plain group.
+            if (!datasetSlug) {
+              return (
+                <g
+                  key={barKey}
+                  onMouseMove={(e) => onBarHover(e, row, req)}
+                  onMouseLeave={onBarLeave}
+                >
+                  {barChildren}
+                </g>
+              );
+            }
+            // Linkable: render a real SVG anchor with the conversation
+            // href so the browser's native "open in new tab" works
+            // (right-click menu, ⌘/Ctrl-click, middle-click). Plain
+            // left-click stays an in-app navigation; modified or
+            // non-primary clicks fall through to the browser. Suppress
+            // the native link drag so it doesn't fight the pan gesture.
+            return (
+              <a
+                key={barKey}
+                href={conversationHref(datasetSlug, req)}
+                onMouseMove={(e) => onBarHover(e, row, req)}
+                onMouseLeave={onBarLeave}
+                onClick={(e) => onBarClick(e, req)}
+                onDragStart={(e) => e.preventDefault()}
+                style={{ cursor: 'pointer' }}
+              >
+                {barChildren}
+              </a>
+            );
+          });
+        })}
+      </>
+    );
+  },
+);
diff --git a/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts
new file mode 100644
index 00000000..47c0f034
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts
@@ -0,0 +1,69 @@
+import { describe, expect, it } from 'vitest';
+
+import { countLeq, countLt, cursorStatsAt } from './timeline-cursor-stats';
+
+describe('countLeq / countLt', () => {
+  const sorted = [1, 3, 3, 5, 9];
+
+  it('counts values <= / < target with binary search', () => {
+    expect(countLeq(sorted, 3)).toBe(3);
+    expect(countLt(sorted, 3)).toBe(1);
+    expect(countLeq(sorted, 0)).toBe(0);
+    expect(countLt(sorted, 0)).toBe(0);
+    expect(countLeq(sorted, 9)).toBe(5);
+    expect(countLt(sorted, 9)).toBe(4);
+    expect(countLeq(sorted, 100)).toBe(5);
+  });
+
+  it('handles empty arrays', () => {
+    expect(countLeq([], 1)).toBe(0);
+    expect(countLt([], 1)).toBe(0);
+  });
+});
+
+describe('cursorStatsAt', () => {
+  // Three requests on a shared clock:
+  //   A: credit 0,  start 2,  end 10
+  //   B: credit 1,  start 5,  end 8
+  //   C: credit 12, start 14, end 20
+  const times = {
+    credits: [0, 1, 12],
+    starts: [2, 5, 14],
+    ends: [8, 10, 20],
+  };
+
+  it('counts running, waiting, and completed at an instant', () => {
+    // t=3: A running, B credited but not started, C not yet credited.
+    expect(cursorStatsAt(times, 3)).toEqual({
+      running: 1,
+      waiting: 1,
+      completed: 0,
+      inflight: 2,
+    });
+    // t=6: A and B running.
+    expect(cursorStatsAt(times, 6)).toEqual({
+      running: 2,
+      waiting: 0,
+      completed: 0,
+      inflight: 2,
+    });
+    // t=13: A and B done, C waiting in queue.
+    expect(cursorStatsAt(times, 13)).toEqual({
+      running: 0,
+      waiting: 1,
+      completed: 2,
+      inflight: 1,
+    });
+  });
+
+  it('counts a request as still running at its exact end instant', () => {
+    // end < t (strict) excludes the request from "ended", so at t === end it
+    // still counts as running — matches the popover's documented semantics.
+    expect(cursorStatsAt(times, 8).running).toBe(2);
+    expect(cursorStatsAt(times, 8).completed).toBe(1);
+  });
+
+  it('never returns negative counts on inconsistent columns', () => {
+    expect(cursorStatsAt({ credits: [], starts: [0], ends: [] }, 5).waiting).toBe(0);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts
new file mode 100644
index 00000000..801cec95
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts
@@ -0,0 +1,57 @@
+/**
+ * Pure math behind the cursor stats popover: count how many requests are
+ * running / waiting / completed at a given instant, in O(log n) per query via
+ * binary search over pre-sorted timestamp columns.
+ */
+
+/** Pre-sorted (ascending) timestamp columns for one filtered request set. */
+export interface SortedRequestTimes {
+  credits: number[];
+  starts: number[];
+  ends: number[];
+}
+
+export interface CursorStats {
+  running: number;
+  waiting: number;
+  completed: number;
+  inflight: number;
+}
+
+/** Number of values in a sorted ascending array that are <= target. */
+export function countLeq(sorted: number[], target: number): number {
+  let lo = 0;
+  let hi = sorted.length;
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1;
+    if (sorted[mid]! <= target) lo = mid + 1;
+    else hi = mid;
+  }
+  return lo;
+}
+
+/** Number of values in a sorted ascending array that are < target. */
+export function countLt(sorted: number[], target: number): number {
+  let lo = 0;
+  let hi = sorted.length;
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1;
+    if (sorted[mid]! < target) lo = mid + 1;
+    else hi = mid;
+  }
+  return lo;
+}
+
+/**
+ * Request counts at time t (ns offset on the same axis as the sorted columns):
+ *   running   = #(start <= t) - #(end < t)
+ *   waiting   = #(credit <= t) - #(start <= t)
+ *   completed = #(end <= t)
+ */
+export function cursorStatsAt(times: SortedRequestTimes, t: number): CursorStats {
+  const startsLeq = countLeq(times.starts, t);
+  const running = Math.max(0, startsLeq - countLt(times.ends, t));
+  const waiting = Math.max(0, countLeq(times.credits, t) - startsLeq);
+  const completed = countLeq(times.ends, t);
+  return { running, waiting, completed, inflight: running + waiting };
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-format.ts b/packages/app/src/components/inference/agentic-point/timeline-format.ts
new file mode 100644
index 00000000..1c0020f3
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-format.ts
@@ -0,0 +1,15 @@
+/** Time formatting shared by the timeline axis, header stats, and tooltips. */
+
+/** Format ns offset → "+12.3s" / "+1.2m". */
+export function formatTickLabel(ns: number): string {
+  const ms = ns / 1e6;
+  if (ms < 1000) return `+${ms.toFixed(0)}ms`;
+  if (ms < 60_000) return `+${(ms / 1000).toFixed(ms < 10_000 ? 1 : 0)}s`;
+  return `+${(ms / 60_000).toFixed(1)}m`;
+}
+
+export function formatDuration(ms: number): string {
+  if (ms < 1000) return `${ms.toFixed(0)}ms`;
+  if (ms < 60_000) return `${(ms / 1000).toFixed(2)}s`;
+  return `${(ms / 60_000).toFixed(2)}m`;
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-layout.ts b/packages/app/src/components/inference/agentic-point/timeline-layout.ts
new file mode 100644
index 00000000..7043e487
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-layout.ts
@@ -0,0 +1,21 @@
+/** Layout constants shared by the timeline component and its SVG content. */
+
+// The timeline body is capped at this height and scrolls internally, so a run
+// with many conversations/workers doesn't make the card grow unbounded and push
+// the rest of the detail page down. Sized to show ~16 rows + the header.
+export const TIMELINE_BODY_MAX_HEIGHT = 480;
+
+// Wide enough for a full 36-char conversation id at 10px font, plus the
+// indent + color stripe + count badge. Subagent rows inherit the same
+// width but truncate the longer "↳ subagent N · hash" tail with ellipsis.
+export const LABEL_WIDTH = 360;
+export const ROW_HEIGHT = 22;
+export const ROW_GAP = 3;
+export const HEADER_HEIGHT = 24;
+export const PADDING_RIGHT = 12;
+export const CHART_WIDTH = 920;
+
+/** Chart height for a given row count (header + rows + bottom padding). */
+export function timelineSvgHeight(rowCount: number): number {
+  return HEADER_HEIGHT + rowCount * (ROW_HEIGHT + ROW_GAP) + 6;
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-rows.ts b/packages/app/src/components/inference/agentic-point/timeline-rows.ts
new file mode 100644
index 00000000..14bda4ae
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-rows.ts
@@ -0,0 +1,476 @@
+/**
+ * Pure row-building logic for the request timeline: cid parsing, deep-link
+ * hrefs, stable ordering/coloring, and grouping requests into Gantt rows.
+ * No React — everything here is unit-testable data transformation.
+ */
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+export type RowMode = 'conversation' | 'worker';
+
+/**
+ * The dataset conversation id for a request: the cid with any subagent/forked
+ * suffix (`::sa:…`, `::fa:…`) stripped. This is exactly the `conv_id` stored in
+ * dataset_conversations, so it deep-links into /datasets/<slug>/conversations/.
+ */
+export function datasetConvId(cid: string): string {
+  const i = cid.indexOf('::');
+  return i === -1 ? cid : cid.slice(0, i);
+}
+
+/**
+ * The subagent id encoded in a cid (`…::sa:<agent_id>[:s<n>|:aux:<n>]`), or null
+ * for a main-conversation request. The harness fans a single subagent into
+ * parallel streams with a `:s<n>` or `:aux:<n>` suffix; the dataset
+ * SubagentNode.agentId is the bare base (e.g. `subagent_001_b00fdc12`). Agent
+ * ids never contain a colon, so the base is everything up to the first one.
+ */
+export function subagentIdOf(cid: string): string | null {
+  const i = cid.indexOf('::sa:');
+  if (i === -1) return null;
+  const raw = cid.slice(i + '::sa:'.length);
+  const colon = raw.indexOf(':');
+  return colon === -1 ? raw : raw.slice(0, colon);
+}
+
+/**
+ * Deep-link URL for the dataset conversation a request maps to. Carries the turn
+ * (and, for subagent requests, the subagent id) so the flamegraph can scroll to
+ * / highlight the exact node. Used both for SPA navigation on click and as the
+ * real `href` on the request bar so the browser's native "open in new tab"
+ * (right-click, ⌘/Ctrl-click, middle-click) works.
+ */
+export function conversationHref(datasetSlug: string, req: RequestRecord): string {
+  const convId = req.srcTrace ?? datasetConvId(req.cid);
+  const params = new URLSearchParams({ turn: String(req.ti) });
+  if (typeof req.srcOuter === 'number' && Number.isInteger(req.srcOuter) && req.srcOuter >= 0) {
+    params.set('raw', String(req.srcOuter));
+    if (typeof req.srcInner === 'number' && Number.isInteger(req.srcInner) && req.srcInner >= 0) {
+      params.set('inner', String(req.srcInner));
+    }
+  }
+  const sa = subagentIdOf(req.cid);
+  if (sa && !params.has('inner')) params.set('sa', sa);
+  return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`;
+}
+
+/** Human label for where a request came from (raw trace index or replay turn). */
+export function requestSourceLabel(req: RequestRecord): string {
+  if (typeof req.srcOuter === 'number') {
+    if (typeof req.srcInner === 'number') return `raw ${req.srcOuter} / child ${req.srcInner}`;
+    return `raw ${req.srcOuter}`;
+  }
+  return `replay turn ${req.ti + 1}`;
+}
+
+export interface RequestIdleStats {
+  /** Total time between the first start and last end with no request running. */
+  idleNs: number;
+  /** Wall-clock span from the first request start to the final request end. */
+  spanNs: number;
+}
+
+/**
+ * Merge request intervals and sum the gaps between them. Queue time before a
+ * request starts is intentionally excluded: "in flight" means [start, end].
+ */
+export function requestIdleStats(requests: readonly RequestRecord[]): RequestIdleStats {
+  const intervals = requests
+    .filter(({ start, end }) => Number.isFinite(start) && Number.isFinite(end) && end >= start)
+    .map(({ start, end }) => ({ start, end }))
+    .toSorted((a, b) => a.start - b.start || a.end - b.end);
+  if (intervals.length === 0) return { idleNs: 0, spanNs: 0 };
+
+  const firstStart = intervals[0]!.start;
+  let mergedEnd = intervals[0]!.end;
+  let idleNs = 0;
+  for (let i = 1; i < intervals.length; i++) {
+    const interval = intervals[i]!;
+    if (interval.start > mergedEnd) idleNs += interval.start - mergedEnd;
+    if (interval.end > mergedEnd) mergedEnd = interval.end;
+  }
+  return { idleNs, spanNs: mergedEnd - firstStart };
+}
+
+/** A stable color palette indexed by row-key hash. */
+const ROW_COLORS = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
+/**
+ * Row kinds:
+ *   parent           — top-level conversation (depth 0)
+ *   worker           — worker swimlane (depth 0, worker mode)
+ *   subagent         — a subagent invocation (depth 1). Either a single
+ *                      stream (renders its own bars), or a multi-stream
+ *                      container whose bars are the union of its streams
+ *                      when collapsed.
+ *   stream           — one :sN stream of a multi-stream subagent (depth 2).
+ *                      Hidden by default; toggled in via the parent's chevron.
+ *   aux              — one :aux:N parallel lane (depth 2). Always visible
+ *                      beneath its owning subagent.
+ */
+type RowKind = 'parent' | 'worker' | 'subagent' | 'stream' | 'aux';
+
+export interface RequestTimelineRow {
+  key: string;
+  label: string;
+  color: string;
+  requests: RequestRecord[];
+  depth: number;
+  kind: RowKind;
+  /** Number of streams under this subagent (>=1). Only set for subagent rows. */
+  streamCount?: number;
+  /** For stream rows: the parent subagent's row key (drives expand/collapse). */
+  parentRowKey?: string;
+  /** Number of always-visible auxiliary lanes under this subagent. */
+  auxCount?: number;
+}
+
+/**
+ * Conversation ids for subagent calls look like
+ *   <parent_cid>::sa:<agent_id>[:s<stream_idx>|:aux:<aux_idx>]
+ * The optional `:s<N>` suffix is set when the harness fans a single
+ * subagent into multiple parallel "streams" (interval-graph
+ * decomposition in weka_trace._pack_into_streams). We split it off so
+ * we can group every parallel lane under a single subagent header row.
+ *
+ * Aux lanes can also hang directly off the main conversation (no `::sa:`
+ * segment): `<parent_cid>::aux:<aux_idx>` or `<parent_cid>::aux:red:<aux_idx>`.
+ * These are parallel requests belonging to the main agent itself, so they
+ * nest under the parent conversation row rather than forming their own
+ * top-level group.
+ */
+export function splitTimelineCid(cid: string): {
+  parent: string;
+  subagentBase: string | null;
+  stream: number | null;
+  aux: string | null;
+} {
+  const sep = cid.indexOf('::sa:');
+  if (sep === -1) {
+    const auxSep = cid.indexOf('::aux:');
+    if (auxSep !== -1) {
+      return {
+        parent: cid.slice(0, auxSep),
+        subagentBase: null,
+        stream: null,
+        aux: cid.slice(auxSep + '::aux:'.length),
+      };
+    }
+    return { parent: cid, subagentBase: null, stream: null, aux: null };
+  }
+  const parent = cid.slice(0, sep);
+  const raw = cid.slice(sep + 5);
+  const auxMatch = /^(?<base>[^:]+):aux:(?<aux>.+)$/.exec(raw);
+  if (auxMatch) {
+    return {
+      parent,
+      subagentBase: auxMatch.groups!.base!,
+      stream: null,
+      aux: auxMatch.groups!.aux!,
+    };
+  }
+  const m = /^(?<base>.*):s(?<stream>\d+)$/.exec(raw);
+  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]), aux: null };
+  return { parent, subagentBase: raw, stream: null, aux: null };
+}
+
+/**
+ * Stable order/color index for the top-level row groups (conversations in
+ * conversation mode, workers in worker mode), keyed by group id and computed
+ * over the FULL (unfiltered) request set. Both the row ordering and the color
+ * palette are driven by this index, so a conversation/worker keeps the same
+ * position and color when the phase filter changes the visible subset — without
+ * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset
+ * is showing, making rows jump and swap colors.
+ *
+ * Groups that span BOTH phases sort first. The shared set is by definition
+ * present in either phase's view, so this leading block renders identically in
+ * both — a conversation that carries over from warmup into profiling stays on
+ * the exact same row when the toggle flips. Phase-exclusive groups follow, and
+ * only they reflow between views. Within each block the order key is the
+ * group's earliest request start across all phases; ties break on the group id
+ * for determinism.
+ */
+export function computeStableRowIndex(
+  requests: readonly RequestRecord[],
+  mode: RowMode,
+): Map<string, number> {
+  const firstStart = new Map<string, number>();
+  // Which phases each group appears in. Mirrors requestsForPhase's split:
+  // 'profiling' is exact, anything else counts as warmup.
+  const inProfiling = new Set<string>();
+  const inWarmup = new Set<string>();
+  for (const r of requests) {
+    const key = mode === 'conversation' ? splitTimelineCid(r.cid).parent : r.wid;
+    const cur = firstStart.get(key);
+    if (cur === undefined || r.start < cur) firstStart.set(key, r.start);
+    if (r.phase === 'profiling') inProfiling.add(key);
+    else inWarmup.add(key);
+  }
+  const spansBoth = (key: string) => inProfiling.has(key) && inWarmup.has(key);
+  const keys = [...firstStart.keys()].toSorted(
+    (a, b) =>
+      Number(spansBoth(b)) - Number(spansBoth(a)) ||
+      firstStart.get(a)! - firstStart.get(b)! ||
+      (a < b ? -1 : a > b ? 1 : 0),
+  );
+  const index = new Map<string, number>();
+  keys.forEach((key, i) => index.set(key, i));
+  return index;
+}
+
+/**
+ * Group requests into rows. In conversation mode, output order is:
+ *   parent_conv
+ *     subagent_001                  (collapsed by default, container)
+ *       :s0                         (hidden unless expanded)
+ *       :s1
+ *       aux 011 · parallel          (always visible)
+ *     subagent_002
+ *     ...
+ *
+ * `expandedSubagents` controls which subagent containers reveal their
+ * stream children. Bars on a collapsed subagent are the UNION of all its
+ * streams' requests — overlapping bars visually communicate the
+ * stream-level parallelism without expanding.
+ *
+ * `stableRowIndex` (optional) pins the top-level order + color per group so they
+ * survive phase-filter changes; when omitted it's derived from `requests` (the
+ * legacy self-contained behavior, used by unit tests).
+ */
+export function buildRequestTimelineRows(
+  requests: RequestRecord[],
+  mode: RowMode,
+  expandedSubagents: ReadonlySet<string>,
+  stableRowIndex?: ReadonlyMap<string, number>,
+): RequestTimelineRow[] {
+  const index = stableRowIndex ?? computeStableRowIndex(requests, mode);
+  const colorFor = (key: string) =>
+    ROW_COLORS[
+      (((index.get(key) ?? 0) % ROW_COLORS.length) + ROW_COLORS.length) % ROW_COLORS.length
+    ]!;
+  const orderOf = (key: string) => index.get(key) ?? Number.POSITIVE_INFINITY;
+  if (mode !== 'conversation') {
+    // Worker mode: flat rows, sorted by first activity.
+    const groups = new Map<string, RequestRecord[]>();
+    for (const r of requests) {
+      let list = groups.get(r.wid);
+      if (!list) {
+        list = [];
+        groups.set(r.wid, list);
+      }
+      list.push(r);
+    }
+    const rows: RequestTimelineRow[] = [];
+    for (const [key, list] of groups) {
+      list.sort((a, b) => a.start - b.start);
+      rows.push({
+        key,
+        label: shortenWid(key),
+        color: colorFor(key),
+        requests: list,
+        depth: 0,
+        kind: 'worker',
+      });
+    }
+    rows.sort(
+      (a, b) => orderOf(a.key) - orderOf(b.key) || a.requests[0]!.start - b.requests[0]!.start,
+    );
+    return rows;
+  }
+
+  // Conversation mode — tree: parent → subagent → stream/aux lane.
+  interface SubagentLanes {
+    streams: Map<number | null, RequestRecord[]>;
+    aux: Map<string, RequestRecord[]>;
+  }
+  interface Tree {
+    parentCid: string;
+    parentReqs: RequestRecord[];
+    // Aux lanes hanging directly off the main agent (`<cid>::aux:…`).
+    parentAux: Map<string, RequestRecord[]>;
+    // subagentBase → primary streams + always-visible auxiliary lanes.
+    subagents: Map<string, SubagentLanes>;
+    firstStart: number;
+  }
+  const trees = new Map<string, Tree>();
+  for (const r of requests) {
+    const { parent, subagentBase, stream, aux } = splitTimelineCid(r.cid);
+    let tree = trees.get(parent);
+    if (!tree) {
+      tree = {
+        parentCid: parent,
+        parentReqs: [],
+        parentAux: new Map(),
+        subagents: new Map(),
+        firstStart: Number.POSITIVE_INFINITY,
+      };
+      trees.set(parent, tree);
+    }
+    if (subagentBase === null && aux !== null) {
+      const list = tree.parentAux.get(aux);
+      if (list) list.push(r);
+      else tree.parentAux.set(aux, [r]);
+    } else if (subagentBase === null) {
+      tree.parentReqs.push(r);
+    } else {
+      let lanes = tree.subagents.get(subagentBase);
+      if (!lanes) {
+        lanes = { streams: new Map(), aux: new Map() };
+        tree.subagents.set(subagentBase, lanes);
+      }
+      if (aux === null) {
+        const list = lanes.streams.get(stream);
+        if (list) list.push(r);
+        else lanes.streams.set(stream, [r]);
+      } else {
+        const list = lanes.aux.get(aux);
+        if (list) list.push(r);
+        else lanes.aux.set(aux, [r]);
+      }
+    }
+    if (r.start < tree.firstStart) tree.firstStart = r.start;
+  }
+
+  const sortedTrees = [...trees.values()].toSorted(
+    (a, b) => orderOf(a.parentCid) - orderOf(b.parentCid) || a.firstStart - b.firstStart,
+  );
+  const rows: RequestTimelineRow[] = [];
+  for (const tree of sortedTrees) {
+    const color = colorFor(tree.parentCid);
+    // Parent row (use a placeholder key if the parent itself wasn't replayed).
+    tree.parentReqs.sort((a, b) => a.start - b.start);
+    const parentRowKey = tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`;
+    rows.push({
+      key: parentRowKey,
+      label: tree.parentCid,
+      color,
+      requests: tree.parentReqs,
+      depth: 0,
+      kind: 'parent',
+    });
+
+    // Aux lanes belonging to the main agent itself (`<cid>::aux:…`), nested
+    // directly beneath the parent row. Always visible, like subagent aux lanes.
+    const parentAuxEntries = [...tree.parentAux.entries()].toSorted(
+      (a, b) =>
+        (a[1][0]?.start ?? Number.POSITIVE_INFINITY) - (b[1][0]?.start ?? Number.POSITIVE_INFINITY),
+    );
+    for (const [auxId, reqs] of parentAuxEntries) {
+      reqs.sort((a, b) => a.start - b.start);
+      rows.push({
+        key: `${tree.parentCid}::aux:${auxId}`,
+        label: `aux ${auxId} · parallel`,
+        color,
+        requests: reqs,
+        depth: 1,
+        kind: 'aux',
+        parentRowKey,
+      });
+    }
+
+    // One subagent row per base (which may contain N streams).
+    const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => {
+      const aStart = Math.min(
+        ...[...a[1].streams.values(), ...a[1].aux.values()].map(
+          (reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY,
+        ),
+      );
+      const bStart = Math.min(
+        ...[...b[1].streams.values(), ...b[1].aux.values()].map(
+          (reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY,
+        ),
+      );
+      return aStart - bStart;
+    });
+    for (const [saBase, lanes] of subagentEntries) {
+      const subagentKey = `${tree.parentCid}::sa:${saBase}`;
+      // Union of primary stream requests for collapsed-view bars. Aux lanes
+      // stay separate so their overlap remains visible as parallel work.
+      const allReqs: RequestRecord[] = [];
+      for (const reqs of lanes.streams.values()) allReqs.push(...reqs);
+      allReqs.sort((a, b) => a.start - b.start);
+      const streamCount = lanes.streams.size;
+      rows.push({
+        key: subagentKey,
+        label: `↳ ${formatSubagentLabel(saBase)}`,
+        color,
+        requests: allReqs,
+        depth: 1,
+        kind: 'subagent',
+        streamCount,
+        auxCount: lanes.aux.size,
+      });
+
+      // Stream children only when expanded AND there's more than one
+      // stream (a single-stream subagent has nothing extra to show).
+      if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
+        const streamEntries = [...lanes.streams.entries()].toSorted((a, b) => {
+          // Sort by stream index (null first as the "default" stream)
+          const ai = a[0] ?? -1;
+          const bi = b[0] ?? -1;
+          return ai - bi;
+        });
+        for (const [streamIdx, reqs] of streamEntries) {
+          reqs.sort((a, b) => a.start - b.start);
+          rows.push({
+            key: `${subagentKey}:s${streamIdx ?? '∅'}`,
+            label: `stream ${streamIdx ?? '∅'}`,
+            color,
+            requests: reqs,
+            depth: 2,
+            kind: 'stream',
+            parentRowKey: subagentKey,
+          });
+        }
+      }
+
+      // Aux lanes encode concurrent requests within the subagent. Keep them
+      // visible even when primary streams are collapsed so parallelism is not
+      // hidden behind an interaction.
+      const auxEntries = [...lanes.aux.entries()].toSorted(
+        (a, b) =>
+          (a[1][0]?.start ?? Number.POSITIVE_INFINITY) -
+          (b[1][0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      for (const [auxId, reqs] of auxEntries) {
+        reqs.sort((a, b) => a.start - b.start);
+        rows.push({
+          key: `${subagentKey}:aux:${auxId}`,
+          label: `aux ${auxId} · parallel`,
+          color,
+          requests: reqs,
+          depth: 2,
+          kind: 'aux',
+          parentRowKey: subagentKey,
+        });
+      }
+    }
+  }
+  return rows;
+}
+
+/** `subagent_001_bf1c5c16` → `subagent 001 · bf1c` (compact, readable). */
+function formatSubagentLabel(raw: string): string {
+  const m = /^subagent_(?<index>\d+)_(?<hash>[0-9a-f]+)$/iu.exec(raw);
+  if (!m) return raw;
+  return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`;
+}
+
+/** `worker_4ae87bea` → `w_4ae8` (compact worker swimlane label). */
+export function shortenWid(wid: string): string {
+  return wid.replace(/^worker_/, 'w_').slice(0, 12);
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx b/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx
new file mode 100644
index 00000000..7aa63efc
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx
@@ -0,0 +1,143 @@
+'use client';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import { formatDuration, formatTickLabel } from './timeline-format';
+import { cursorStatsAt, type SortedRequestTimes } from './timeline-cursor-stats';
+import { requestSourceLabel, shortenWid, type RequestTimelineRow } from './timeline-rows';
+
+export interface TooltipData {
+  x: number;
+  y: number;
+  row: RequestTimelineRow;
+  req: RequestRecord;
+}
+
+/** Per-request hover tooltip (fixed-position, follows the mouse). */
+export function TimelineTooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) {
+  const { row, req } = data;
+  const totalMs = (req.end - req.start) / 1e6;
+  const queueMs = (req.start - req.credit) / 1e6;
+  return (
+    <div
+      className="fixed z-50 pointer-events-none rounded-md border border-border bg-card p-2.5 shadow-lg text-[11px]"
+      style={{ left: data.x + 12, top: data.y - 10, maxWidth: 280 }}
+    >
+      <div className="flex items-center gap-2 font-medium text-foreground">
+        <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: row.color }} />
+        <span className="truncate">{row.label}</span>
+        <span className="text-muted-foreground">· {requestSourceLabel(req)}</span>
+        {req.cancelled && <span className="text-destructive">· cancelled</span>}
+      </div>
+      <div className="mt-1.5 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>Total</span>
+        <span className="text-foreground text-right tabular-nums">{formatDuration(totalMs)}</span>
+        <span>Queue wait</span>
+        <span className="text-foreground text-right tabular-nums">
+          {queueMs > 0.5 ? formatDuration(queueMs) : '—'}
+        </span>
+        {req.ttftMs !== null && (
+          <>
+            <span>TTFT</span>
+            <span className="text-foreground text-right tabular-nums">
+              {formatDuration(req.ttftMs)}
+            </span>
+          </>
+        )}
+        {req.isl !== null && (
+          <>
+            <span>ISL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.isl.toLocaleString()}
+            </span>
+          </>
+        )}
+        {req.osl !== null && (
+          <>
+            <span>OSL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.osl.toLocaleString()}
+            </span>
+          </>
+        )}
+        <span>Phase</span>
+        <span className="text-foreground text-right">{req.phase}</span>
+        {req.ad > 0 && (
+          <>
+            <span>Agent depth</span>
+            <span className="text-foreground text-right tabular-nums">{req.ad}</span>
+          </>
+        )}
+        <span>Worker</span>
+        <span className="text-foreground text-right truncate">{shortenWid(req.wid)}</span>
+      </div>
+      <div className="mt-1.5 pt-1 border-t border-border/40 text-[10px] text-muted-foreground">
+        Started at {formatTickLabel(req.start)}
+      </div>
+      {linkable && (
+        <div className="mt-1 text-[10px] font-medium text-primary">
+          Click to view this conversation in the dataset →
+        </div>
+      )}
+    </div>
+  );
+}
+
+export interface CursorState {
+  /** Cursor x in svg-local px (drives the crosshair line). */
+  xPx: number;
+  /** ns offset from dataStart the cursor points at. */
+  tNs: number;
+  clientX: number;
+  clientY: number;
+}
+
+/** Cursor stats popover: requests in flight / waiting / completed at time t. */
+export function CursorPopover({
+  cursor,
+  dataStart,
+  times,
+}: {
+  cursor: CursorState;
+  dataStart: number;
+  times: SortedRequestTimes;
+}) {
+  const t = cursor.tNs;
+  const { running, waiting, completed, inflight } = cursorStatsAt(times, t);
+  // Absolute wall-clock seconds since the timeline origin (dataStart).
+  const tSec = t / 1e9;
+  // Position the popover near the cursor without overflowing the viewport.
+  // 200 px wide; flip to the left of the cursor if it would clip the right.
+  const wantLeft = cursor.clientX + 14;
+  const left =
+    typeof window === 'undefined' || wantLeft + 220 < window.innerWidth
+      ? wantLeft
+      : cursor.clientX - 220;
+  return (
+    <div
+      className="fixed z-40 pointer-events-none rounded-md border border-border bg-card/95 backdrop-blur p-2 shadow-lg text-[11px] font-mono"
+      style={{ left, top: cursor.clientY - 60, minWidth: 180 }}
+    >
+      <div className="flex justify-between gap-3 text-foreground">
+        <span className="text-muted-foreground">t =</span>
+        <span className="tabular-nums">
+          {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`}
+        </span>
+      </div>
+      <div className="mt-1 pt-1 border-t border-border/40 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>In flight</span>
+        <span className="text-foreground text-right tabular-nums">{inflight}</span>
+        <span className="pl-3 text-[10px]">running</span>
+        <span className="text-foreground text-right tabular-nums">{running}</span>
+        <span className="pl-3 text-[10px]">waiting</span>
+        <span className="text-foreground text-right tabular-nums">{waiting}</span>
+        <span>Completed</span>
+        <span className="text-foreground text-right tabular-nums">{completed}</span>
+      </div>
+      {/* dataStart is informational — the displayed t is relative to it. */}
+      <div className="mt-1 pt-1 border-t border-border/40 text-[9px] text-muted-foreground">
+        relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock)
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts b/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts
new file mode 100644
index 00000000..631bdd94
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts
@@ -0,0 +1,108 @@
+/**
+ * Persisted view-state snapshot for the request timeline (zoom window, row
+ * mode, phase filter, expansions, scroll offsets). Written to sessionStorage on
+ * click-through to a dataset conversation, consumed once on the next mount so
+ * the browser back button restores the user's exact position.
+ */
+
+import type { StagePhase } from './phase-slice';
+import type { RowMode } from './timeline-rows';
+
+// Two phases shown separately (no combined view) — matches the per-point detail
+// stage toggle. Reuses StagePhase so the filter predicate is shared.
+export type PhaseFilter = StagePhase;
+
+/**
+ * Persisted snapshot of the timeline's view state, used to restore the user's
+ * zoom / scroll / filter position when they return to the page (e.g. clicking a
+ * request to open the dataset flamegraph, then hitting the browser back button).
+ * Stored in sessionStorage keyed by point id; written on click-through and
+ * consumed once on the next mount.
+ */
+export interface TimelineViewSnapshot {
+  /** Zoom-pan window start (ns offset from dataStart). */
+  viewStart: number;
+  /** Zoom-pan window end, or null when not zoomed (full extent). */
+  viewEnd: number | null;
+  rowMode: RowMode;
+  phaseFilter: PhaseFilter;
+  /** Keys of expanded multi-stream subagent rows. */
+  expanded: string[];
+  /** Scroll container offsets (vertical row scroll + horizontal). */
+  scrollTop: number;
+  scrollLeft: number;
+}
+
+const TIMELINE_VIEW_SNAPSHOT_PREFIX = 'agentic-timeline-view:';
+const ROW_MODE_VALUES: readonly RowMode[] = ['conversation', 'worker'];
+const PHASE_FILTER_VALUES: readonly PhaseFilter[] = ['warmup', 'profiling'];
+
+const finiteOr = (value: unknown, fallback: number): number =>
+  typeof value === 'number' && Number.isFinite(value) ? value : fallback;
+
+/**
+ * Parse a persisted snapshot, coercing/validating each field and falling back
+ * to defaults so a malformed or stale blob can never break restore. Returns
+ * null only when the input is absent or not parseable JSON.
+ */
+export function parseTimelineViewSnapshot(raw: string | null): TimelineViewSnapshot | null {
+  if (!raw) return null;
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    return null;
+  }
+  if (!parsed || typeof parsed !== 'object') return null;
+  const record = parsed as Record<string, unknown>;
+  const rowMode = ROW_MODE_VALUES.includes(record.rowMode as RowMode)
+    ? (record.rowMode as RowMode)
+    : 'conversation';
+  const phaseFilter = PHASE_FILTER_VALUES.includes(record.phaseFilter as PhaseFilter)
+    ? (record.phaseFilter as PhaseFilter)
+    : 'profiling';
+  const viewEnd =
+    typeof record.viewEnd === 'number' && Number.isFinite(record.viewEnd) ? record.viewEnd : null;
+  const expanded = Array.isArray(record.expanded)
+    ? record.expanded.filter((entry): entry is string => typeof entry === 'string')
+    : [];
+  return {
+    viewStart: finiteOr(record.viewStart, 0),
+    viewEnd,
+    rowMode,
+    phaseFilter,
+    expanded,
+    scrollTop: finiteOr(record.scrollTop, 0),
+    scrollLeft: finiteOr(record.scrollLeft, 0),
+  };
+}
+
+function timelineSnapshotKey(pointId: number): string {
+  return `${TIMELINE_VIEW_SNAPSHOT_PREFIX}${pointId}`;
+}
+
+export function saveTimelineViewSnapshot(pointId: number, snapshot: TimelineViewSnapshot): void {
+  if (typeof window === 'undefined') return;
+  try {
+    window.sessionStorage.setItem(timelineSnapshotKey(pointId), JSON.stringify(snapshot));
+  } catch {
+    // sessionStorage can throw (private mode / quota exceeded) — restore is
+    // best-effort, so a failed write just means no restore next time.
+  }
+}
+
+/**
+ * Read AND remove the snapshot (one-shot): we only want to restore once per
+ * click-through, so a later reload of the same point starts from defaults.
+ */
+export function consumeTimelineViewSnapshot(pointId: number): TimelineViewSnapshot | null {
+  if (typeof window === 'undefined') return null;
+  try {
+    const key = timelineSnapshotKey(pointId);
+    const raw = window.sessionStorage.getItem(key);
+    window.sessionStorage.removeItem(key);
+    return parseTimelineViewSnapshot(raw);
+  } catch {
+    return null;
+  }
+}

From 068c5b21d80ea8cbfcde288ad6b368fe5ba596f4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:12:15 -0500
Subject: [PATCH 09/40] feat(inference): agentic-traces integration in the
 dashboard chart and filters

---
 .../src/components/GlobalFilterContext.tsx    |  27 +-
 packages/app/src/components/header/header.tsx |   6 +
 .../components/inference/InferenceContext.tsx | 202 ++++-
 .../inference/hooks/useChartData.ts           | 267 ++++++-
 .../inference/inference-chart-config.json     |  10 +-
 .../inference/replay/buildReplayTimeline.ts   |   3 +-
 .../app/src/components/inference/types.ts     |  79 ++
 .../components/inference/ui/ChartControls.tsx |  38 +-
 .../components/inference/ui/ChartDisplay.tsx  | 724 ++++++++++--------
 .../src/components/inference/ui/GPUGraph.tsx  |  76 +-
 .../ui/ScatterGraph.decoration.test.tsx       |   7 +
 .../components/inference/ui/ScatterGraph.tsx  | 421 +++++++---
 .../inference/ui/UnofficialChartDisplay.tsx   |   4 +-
 .../src/components/inference/utils.test.ts    |  78 +-
 .../app/src/components/inference/utils.ts     |  29 +-
 .../inference/utils/parallelism-label.test.ts |  58 ++
 .../inference/utils/parallelism-label.ts      |  79 ++
 .../inference/utils/tooltip-utils.test.ts     |  32 +
 .../inference/utils/tooltipUtils.ts           | 202 +++--
 .../app/src/components/ui/chart-legend.tsx    |  26 +
 .../app/src/components/ui/chart-selectors.tsx | 150 ++++
 .../src/components/ui/d3-chart-wrapper.tsx    |  53 +-
 .../unofficial-run-provider.test.ts           |   3 +
 .../components/unofficial-run-provider.tsx    |   4 +-
 packages/app/src/lib/api.ts                   |  15 +-
 .../app/src/lib/benchmark-transform.test.ts   |  95 ++-
 packages/app/src/lib/benchmark-transform.ts   | 121 ++-
 packages/app/src/lib/chart-utils.test.ts      |  39 +-
 packages/app/src/lib/chart-utils.ts           |  33 +-
 .../app/src/lib/compare-pair-defaults.test.ts |   3 +
 packages/app/src/lib/compare-pair-defaults.ts |   1 +
 packages/app/src/lib/compare-ssr.test.ts      |   7 +
 .../d3-chart/layers/scatter-points.test.ts    |  50 +-
 .../src/lib/d3-chart/layers/scatter-points.ts |  97 ++-
 packages/app/src/lib/data-mappings.ts         |  68 +-
 packages/app/src/lib/energy-metrics.test.ts   |  20 +
 packages/app/src/lib/url-state.ts             |   8 +-
 37 files changed, 2469 insertions(+), 666 deletions(-)
 create mode 100644 packages/app/src/components/inference/utils/parallelism-label.test.ts
 create mode 100644 packages/app/src/components/inference/utils/parallelism-label.ts

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 6e7afb0b..fddf7871 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -12,6 +12,8 @@ import {
   useState,
 } from 'react';
 
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
+
 // useLayoutEffect warns during SSR; alias to useEffect on the server (no-op there anyway).
 const useIsomorphicLayoutEffect = typeof window === 'undefined' ? useEffect : useLayoutEffect;
 
@@ -22,8 +24,6 @@ function isEnumValue<T extends Record<string, string>>(e: T, v: string): v is T[
 const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u;
 const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u;
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
-
 import { useAvailability } from '@/hooks/api/use-availability';
 import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
 import { useUrlState } from '@/hooks/useUrlState';
@@ -100,7 +100,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record<string, RunInfo> {
   const runs: Record<string, RunInfo> = {};
   for (const run of data.runs) {
     const runId = String(run.github_run_id);
-    const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id);
+    const runChangelogs = data.changelogs.filter(
+      (c) => String(c.workflow_run_id) === String(run.github_run_id),
+    );
     runs[runId] = {
       runId,
       runDate: run.created_at,
@@ -147,7 +149,11 @@ export function GlobalFilterProvider({
 
   const [selectedSequence, setSelectedSequence] = useState<Sequence>(() => {
     if (initialSequence) return initialSequence;
-    return Sequence.EightK_OneK;
+    const urlSeq = getUrlParam('i_seq');
+    if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence;
+    // Prefer Agentic Traces by default when the selected model has it; the
+    // effectiveSequence fallback below handles models without agentic data.
+    return Sequence.AgenticTraces;
   });
 
   const initialValidPrecisions = useMemo(
@@ -277,9 +283,7 @@ export function GlobalFilterProvider({
     if (!availabilityRows) {
       return unofficialSeqs.length > 0 ? [...new Set(unofficialSeqs)] : SEQUENCE_OPTIONS;
     }
-    const dbSeqs = modelRows
-      .map((r) => islOslToSequence(r.isl, r.osl))
-      .filter((s): s is Sequence => s !== null);
+    const dbSeqs = modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null);
     const merged = [...new Set([...dbSeqs, ...unofficialSeqs])];
     return merged.length > 0 ? merged : SEQUENCE_OPTIONS;
   }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]);
@@ -298,7 +302,7 @@ export function GlobalFilterProvider({
     if (!availabilityRows) {
       return unofficialPrecs.length > 0 ? [...new Set(unofficialPrecs)].toSorted() : ['fp4'];
     }
-    const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const dbPrecs = rows.map((r) => r.precision);
     const merged = [...new Set([...dbPrecs, ...unofficialPrecs])].toSorted();
     return merged.length > 0 ? merged : ['fp4'];
@@ -307,10 +311,7 @@ export function GlobalFilterProvider({
   // Curve count per precision (distinct hw/framework/spec/disagg series) for the
   // selected model + sequence — drives the auto default toward the densest one.
   const precisionCurveCounts = useMemo(
-    () =>
-      countCurvesByPrecision(
-        modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence),
-      ),
+    () => countCurvesByPrecision(modelRows.filter((r) => rowToSequence(r) === effectiveSequence)),
     [modelRows, effectiveSequence],
   );
 
@@ -346,7 +347,7 @@ export function GlobalFilterProvider({
   // Dates available for selected model + sequence + precisions
   const availableDates = useMemo(() => {
     if (!availabilityRows) return [];
-    const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision));
     if (rows.length === 0) {
       return [...new Set(seqRows.map((r) => r.date))].toSorted();
diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx
index 8fbf52ac..95fc0acb 100644
--- a/packages/app/src/components/header/header.tsx
+++ b/packages/app/src/components/header/header.tsx
@@ -46,6 +46,12 @@ const NAV_LINKS = [
     testId: 'nav-link-supporters',
     event: 'header_supporters_clicked',
   },
+  {
+    href: '/datasets',
+    label: 'Datasets',
+    testId: 'nav-link-datasets',
+    event: 'header_datasets_clicked',
+  },
   { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' },
   { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' },
 ] as const;
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 796a8eed..98962126 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -12,7 +12,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import {
   FAVORITE_PRESETS,
@@ -44,7 +44,7 @@ import {
 import { useUrlState } from '@/hooks/useUrlState';
 import { buildAvailabilityHwKey } from '@/lib/chart-utils';
 import { getHardwareConfig, getModelSortIndex, isKnownGpu, TABLEAU_10 } from '@/lib/constants';
-import { getModelExclusion, MODEL_PREFIX_MAPPING } from '@/lib/data-mappings';
+import { getModelExclusion, MODEL_PREFIX_MAPPING, sequenceKind } from '@/lib/data-mappings';
 import {
   MtpEngineConflictToast,
   type MtpEngineConflictDetail,
@@ -57,7 +57,12 @@ import {
 } from '@/lib/exclusion';
 import { filterRunsByModel, getDisplayLabel } from '@/lib/utils';
 
-import { useChartData } from './hooks/useChartData';
+import {
+  isAgenticOnlyXAxisMode,
+  useChartData,
+  X_AXIS_MODES,
+  type XAxisMode,
+} from './hooks/useChartData';
 import { resolveComparisonEntries } from './utils/comparisonEntry';
 import {
   EMPTY_QUICK_FILTERS,
@@ -150,10 +155,44 @@ export function InferenceProvider({
     () => getUrlParam('i_metric') || initialYAxisMetric || 'y_tpPerGpu',
   );
   const [selectedXAxisMetric, setSelectedXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_xmetric') || 'p99_ttft',
+    () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_e2e_xmetric') || null,
+    () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
+  );
+  // Selected chart variant. Initialize from URL only — SSR cannot read URL, so
+  // computing a kind-based default here would diverge between server and client
+  // and cause a hydration mismatch. The scenario-kind default is applied in a
+  // post-mount effect below (and a ref tracks whether the user has overridden).
+  //
+  // SSR has no URL access, so seed with a fixed default and apply the URL
+  // value (if any) in a post-mount effect — keeps server + client first render
+  // identical and avoids "didn't match" hydration warnings when the URL holds
+  // a non-default mode.
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<XAxisMode>('ttft');
+  const xAxisModeFromUrlRef = useRef(false);
+  useEffect(() => {
+    if (xAxisModeFromUrlRef.current) return;
+    const v = getUrlParam('i_xmode');
+    if (v && (X_AXIS_MODES as readonly string[]).includes(v)) {
+      xAxisModeFromUrlRef.current = true;
+      setSelectedXAxisMode(v as XAxisMode);
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
+  // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
+  // existing useChartData pipeline keys off that flag for the e2e chart's x-axis.
+  const handleSetXAxisMode = useCallback((mode: XAxisMode) => {
+    xAxisModeFromUrlRef.current = true;
+    setSelectedXAxisMode(mode);
+    // The e2e chart's x-axis metric is reconciled in a separate effect below,
+    // because it depends on sequence kind (fixed-seq has no p90_* metrics) and
+    // the agentic percentile, both of which can change independently.
+  }, []);
+  // Latency percentile applied to the chart x-axis for agentic scenarios.
+  // Values: 'p90' | 'p99'. Non-agentic charts ignore.
+  const [selectedPercentile, setSelectedPercentile] = useState<string>(
+    () => getUrlParam('i_pctl') || 'p90',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
@@ -201,6 +240,8 @@ export function InferenceProvider({
   const dataQuickFilters = activeTab === 'historical' ? EMPTY_QUICK_FILTERS : quickFilters;
   const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({
     urlPrefix: 'i_',
+    // Inference chart defaults to high contrast (?i_hc=0 overrides off).
+    defaultHighContrast: true,
   });
 
   const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0');
@@ -208,21 +249,22 @@ export function InferenceProvider({
     // Legacy `?i_nolabel=1` from before the rename: keep hiding point labels
     // explicitly so the share link's intent survives future default changes.
     if (getUrlParam('i_nolabel') === '1') return false;
+    if (getUrlParam('i_label') === '0') return false;
     if (getUrlParam('i_label') === '1') return true;
-    // Old share links set `?i_advlabel=1` while keeping the labels default
-    // (shown). Mirror the toggle's auto-enable side-effect on load so those
-    // links still render advanced labels under the new default-off behavior.
-    if (getUrlParam('i_advlabel') === '1') return true;
-    return false;
+    // Default on: parallelism labels (also default on) are point labels and
+    // are pointless without them shown.
+    return true;
   });
   const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1');
+  // Parallelism labels default on (?i_advlabel=0 overrides off).
   const [useAdvancedLabels, setUseAdvancedLabels] = useState(
-    () => getUrlParam('i_advlabel') === '1',
+    () => getUrlParam('i_advlabel') !== '0',
   );
   const [showGradientLabels, setShowGradientLabels] = useState(
     () => getUrlParam('i_gradlabel') === '1',
   );
-  const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0');
+  // Line labels default off (?i_linelabel=1 overrides on).
+  const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1');
   const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1');
   const [showMinecraftOverlay, setShowMinecraftOverlay] = useState(
     () => getUrlParam('i_mc') === '1',
@@ -291,13 +333,68 @@ export function InferenceProvider({
     return ids.length > 0 ? ids.reduce((max, id) => (id > max ? id : max), ids[0]) : '';
   }, [filteredAvailableRuns]);
 
-  // Only constrain the query when an earlier-than-latest run is selected; otherwise
-  // the chart shows the full latest view (and reuses the materialized-view fast path).
+  // Only constrain the base query when an earlier-than-latest run is selected.
   const asOfRunId =
     effectiveSelectedRunId && latestRunIdForModel && effectiveSelectedRunId !== latestRunIdForModel
       ? effectiveSelectedRunId
       : undefined;
 
+  // Run-selector scoping: only constrain benchmark data to a specific run when
+  // there's actually a disambiguation to make for the CURRENT model. The
+  // raw `availableRuns` is across ALL models on the date, so the picker may
+  // auto-select a run that produced nothing for the current model — passing
+  // that runId would return zero rows and hide the chart entirely.
+  // Compute the set of runs whose CHANGELOG explicitly mentions this model +
+  // precision. We can't reuse `filterRunsByModel` here because it has a
+  // fallback that returns all runs when nothing matches (so the picker still
+  // renders) — which would make us pass a runId that produced no rows for
+  // the current model, hiding the chart.
+  // Map each FULL config_key (model-precision-hardware-framework) a run's
+  // changelog claims to the set of runs claiming it. Single-run scoping should
+  // only kick in when two runs contest the SAME full key — e.g. a same-day
+  // re-run of one hardware — because then a DISTINCT ON merge could mix them
+  // and the user needs to pick which run wins. Runs covering DIFFERENT hardware
+  // of the same model (e.g. a B300 run and a B200 run on the same date) are
+  // complementary: both must render via carry-forward. Matching on model+
+  // precision alone (the old behavior) wrongly treated those as alternatives
+  // and scoped the chart to one run, hiding the other GPU's curve.
+  const contestedRunIds = useMemo(() => {
+    const runsByConfigKey = new Map<string, Set<string>>();
+    if (availableRuns) {
+      for (const [runId, runInfo] of Object.entries(availableRuns)) {
+        if (!runInfo.changelog) continue;
+        for (const entry of runInfo.changelog.entries) {
+          for (const key of entry.config_keys) {
+            const parts = key.split('-');
+            if (modelPrefixes.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) {
+              let runs = runsByConfigKey.get(key);
+              if (!runs) {
+                runs = new Set<string>();
+                runsByConfigKey.set(key, runs);
+              }
+              runs.add(runId);
+            }
+          }
+        }
+      }
+    }
+    // A run is "contested" only if some full config_key it claims is also claimed
+    // by another run. Only then does picking a run disambiguate anything.
+    // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the
+    // WHOLE chart to the run: only the configs the run actually produced are
+    // pinned to it, and every other config (e.g. another framework's same-day
+    // run) still carries forward from the normal latest-per-config rows.
+    const contested = new Set<string>();
+    for (const runs of runsByConfigKey.values()) {
+      if (runs.size > 1) for (const r of runs) contested.add(r);
+    }
+    return contested;
+  }, [availableRuns, modelPrefixes, effectivePrecisions]);
+  const benchmarkRunId =
+    effectiveSelectedRunId && contestedRunIds.has(String(effectiveSelectedRunId))
+      ? String(effectiveSelectedRunId)
+      : undefined;
+
   const {
     graphs,
     loading: chartDataLoading,
@@ -319,7 +416,10 @@ export function InferenceProvider({
     effectiveRunDate,
     isActive,
     latestDate,
+    selectedPercentile,
     compareGpuPair ?? null,
+    benchmarkRunId,
+    selectedXAxisMode,
     asOfRunId,
     dataQuickFilters,
   );
@@ -335,7 +435,7 @@ export function InferenceProvider({
     if (!availabilityRows) return availableDates;
     const rows = availabilityRows.filter((r) => {
       if (!dbModelKeys.includes(r.model)) return false;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false;
+      if (rowToSequence(r) !== effectiveSequence) return false;
       if (!effectivePrecisions.includes(r.precision)) return false;
       if (!r.hardware) return false;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -360,7 +460,7 @@ export function InferenceProvider({
     const hwKeys = new Set<string>();
     for (const r of availabilityRows) {
       if (!dbModelKeys.includes(r.model)) continue;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue;
+      if (rowToSequence(r) !== effectiveSequence) continue;
       if (!effectivePrecisions.includes(r.precision)) continue;
       if (!r.hardware) continue;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -432,6 +532,60 @@ export function InferenceProvider({
     setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev));
   }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]);
 
+  // Reconcile the x-axis mode with the scenario kind:
+  //  - On mount with no `i_xmode` URL param: snap to the kind's natural default
+  //    (interactivity for both agentic and fixed-sequence scenarios). The state was initialized
+  //    to a SSR-stable constant so server and client render the same DOM; this
+  //    effect fixes it up after hydration.
+  //  - When the user later switches sequence kinds: snap to the new kind's
+  //    natural default (the prior selection was for a different kind, so it
+  //    doesn't carry over).
+  const lastSeqKindRef = useRef<ReturnType<typeof sequenceKind> | null>(null);
+  useEffect(() => {
+    const kind = sequenceKind(effectiveSequence);
+    const isInitialMount = lastSeqKindRef.current === null;
+    const isAgenticOnlyMode = isAgenticOnlyXAxisMode(selectedXAxisMode);
+    // On a stale render where kind hasn't changed, bail unless the current
+    // mode is agentic-only and we just landed on a fixed-seq scenario — in
+    // that case force the snap so the chart doesn't try to plot trace-derived
+    // metrics against rows that have no trace_replay.
+    if (!isInitialMount && lastSeqKindRef.current === kind) {
+      if (kind === 'fixed-seq' && isAgenticOnlyMode) {
+        handleSetXAxisMode('interactivity');
+      }
+      return;
+    }
+    lastSeqKindRef.current = kind;
+    if (
+      isInitialMount &&
+      xAxisModeFromUrlRef.current &&
+      !(kind === 'fixed-seq' && isAgenticOnlyMode)
+    ) {
+      // URL-restored agentic-only mode on a fixed-seq sequence makes no sense
+      // — fall through to the default snap below.
+      return;
+    }
+    handleSetXAxisMode('interactivity');
+  }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
+
+  // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
+  // agentic percentile changes. For fixed-seq the JSONB only carries
+  // median_* / p99_* (no p90_*), so the TTFT button there has to point at
+  // median_ttft — otherwise the chart goes blank. For agentic, we point at
+  // the user's chosen percentile so the dropdown actually drives the axis.
+  useEffect(() => {
+    const isAgentic = sequenceKind(effectiveSequence) === 'agentic';
+    if (selectedXAxisMode === 'ttft') {
+      setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft');
+    } else if (selectedXAxisMode === 'e2e') {
+      // null = use the chart-config natural x (median_e2el), which useChartData
+      // rewrites to <pctl>_e2el for agentic via withPercentile().
+      setSelectedE2eXAxisMetric(null);
+    }
+    // 'interactivity' mode renders the interactivity chart, which keys off
+    // selectedXAxisMetric (not the e2e one), so nothing to do here.
+  }, [selectedXAxisMode, effectiveSequence, selectedPercentile]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
@@ -875,21 +1029,23 @@ export function InferenceProvider({
   useUrlStateSync(
     {
       i_metric: selectedYAxisMetric,
+      i_pctl: selectedPercentile,
       i_gpus: selectedGPUs.join(','),
       i_dates: selectedDates.join(','),
       i_dstart: selectedDateRange.startDate,
       i_dend: selectedDateRange.endDate,
       i_optimal: hideNonOptimal ? '' : '0',
-      i_label: showPointLabels ? '1' : '',
-      i_hc: highContrast ? '1' : '',
+      i_label: showPointLabels ? '' : '0',
+      i_hc: highContrast ? '' : '0',
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
+      i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
-      i_advlabel: useAdvancedLabels ? '1' : '',
+      i_advlabel: useAdvancedLabels ? '' : '0',
       i_gradlabel: showGradientLabels ? '1' : '',
-      i_linelabel: showLineLabels ? '' : '0',
+      i_linelabel: showLineLabels ? '1' : '',
       i_speed: showSpeedOverlay ? '1' : '',
       i_mc: showMinecraftOverlay ? '1' : '',
       i_active: iActiveStr,
@@ -902,6 +1058,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
@@ -1066,6 +1223,8 @@ export function InferenceProvider({
       setSelectedXAxisMetric,
       selectedE2eXAxisMetric,
       setSelectedE2eXAxisMetric,
+      selectedXAxisMode,
+      setSelectedXAxisMode: handleSetXAxisMode,
       scaleType,
       setScaleType,
       quickFilters,
@@ -1079,6 +1238,8 @@ export function InferenceProvider({
       workflowInfo,
       selectedYAxisMetric,
       setSelectedYAxisMetric: setSelectedYAxisMetricAndClear,
+      selectedPercentile,
+      setSelectedPercentile,
       selectedGPUs,
       setSelectedGPUs: setSelectedGPUsAndClear,
       availableGPUs,
@@ -1143,6 +1304,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       quickFilters,
       availableQuickFilters,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 8e894d0e..183641d4 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -1,7 +1,7 @@
 import { useMemo, useRef } from 'react';
 
 import { useQueries } from '@tanstack/react-query';
-import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants';
+import { rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type {
@@ -23,9 +23,14 @@ import {
   getModelSortIndex,
   hardwareKeyMatchesAnyBase,
 } from '@/lib/constants';
-import { transformBenchmarkRows } from '@/lib/benchmark-transform';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import {
+  mergeRunScopedRows,
+  transformBenchmarkRows,
+  withPercentile,
+} from '@/lib/benchmark-transform';
+import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
+import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils';
 import {
   applyQuickFilters,
   computeAvailableQuickFilters,
@@ -33,6 +38,90 @@ import {
   type QuickFilters,
 } from '@/components/inference/utils/quickFilters';
 
+/**
+ * Chart x-axis variant selected by the mode buttons above the plot. This is
+ * the single definition — InferenceContext (URL/state) and ChartDisplay
+ * (buttons, derived-metric remapping) import it from here.
+ */
+export type XAxisMode =
+  | 'ttft'
+  | 'e2e'
+  | 'normalized-e2e'
+  | 'interactivity'
+  | 'session-time'
+  | 'prefill-tps';
+
+export const X_AXIS_MODES: readonly XAxisMode[] = [
+  'ttft',
+  'e2e',
+  'normalized-e2e',
+  'interactivity',
+  'session-time',
+  'prefill-tps',
+];
+
+/**
+ * Modes whose x metric is derived from persisted per-request traces —
+ * these only exist for agentic scenarios (fixed-seq rows have no
+ * trace_replay blob to derive them from).
+ */
+export function isAgenticOnlyXAxisMode(mode: XAxisMode): boolean {
+  return mode === 'normalized-e2e' || mode === 'session-time' || mode === 'prefill-tps';
+}
+
+/**
+ * Compute the set of benchmark_results.id values that sit on the
+ * (e2e_latency, y) Pareto frontier within each (hwKey, precision, date)
+ * group. Used to restrict the non-e2e xmode charts (ttft, interactivity,
+ * session-time, prefill-tps) so they show *only* the points that win on
+ * end-to-end latency — preventing benchmark-hacking where a config tops
+ * one axis while tanking the other.
+ *
+ * Returns null when the y-metric has no roofline direction declared on
+ * the e2e chart (caller falls back to no filtering in that case).
+ */
+function e2eParetoIds(
+  points: InferenceData[],
+  selectedYAxisMetric: string,
+  percentile: string,
+): Set<number> | null {
+  const e2eChartDef = (chartDefinitions as ChartDefinition[]).find((c) => c.chartType === 'e2e');
+  if (!e2eChartDef) return null;
+  const dir = e2eChartDef[`${selectedYAxisMetric}_roofline` as keyof ChartDefinition] as
+    | ParetoDirection
+    | undefined;
+  if (!dir) return null;
+  const frontierFn = paretoFrontForDirection(dir);
+  // Percentile-prefixed e2e-latency field name (e.g. 'p90_e2el').
+  const e2elField = withPercentile('median_e2el', percentile);
+  const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+
+  // Re-frame each candidate point in (e2el, y) space, then compute the
+  // pareto per (hwKey, precision, date) bucket — frontiers don't span dates
+  // (a May 17 point can't dominate a May 15 plot).
+  const byGroup = new Map<string, InferenceData[]>();
+  for (const p of points) {
+    const yValue = (p[metricKey] as { y?: number } | undefined)?.y;
+    const xValue = (p as unknown as Record<string, unknown>)[e2elField];
+    if (typeof xValue !== 'number' || !Number.isFinite(xValue)) continue;
+    if (typeof yValue !== 'number' || !Number.isFinite(yValue)) continue;
+    const key = `${p.hwKey}|${p.precision}|${p.date}`;
+    let bucket = byGroup.get(key);
+    if (!bucket) {
+      bucket = [];
+      byGroup.set(key, bucket);
+    }
+    bucket.push({ ...p, x: xValue, y: yValue });
+  }
+  const ids = new Set<number>();
+  for (const bucket of byGroup.values()) {
+    for (const f of frontierFn(bucket)) {
+      if (typeof f.id === 'number') ids.add(f.id);
+    }
+  }
+  return ids;
+}
+
 /** Build deduplicated comparison dates, excluding the main run date. */
 export function buildComparisonDates(
   selectedGPUs: string[],
@@ -92,11 +181,26 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
+  selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
   /**
-   * GitHub run id for the "as of run" view. Set only when an earlier-than-latest
-   * run is selected; the chart then shows the data as it stood at that run.
+   * Exact GitHub run id used to pin contested configs while carrying forward
+   * configs that the selected run did not produce.
+   */
+  selectedRunId?: string,
+  /**
+   * Current x-axis mode. When set to anything other than 'e2e', the displayed
+   * data is filtered to the (e2e-latency, y) Pareto frontier so the ttft /
+   * interactivity / session-time / prefill-tps charts show only points that
+   * also win on end-to-end latency — preventing benchmark-hacking where a
+   * config tops one metric while tanking the other. The 'e2e' mode is the
+   * source of truth and keeps the full point set.
+   */
+  selectedXAxisMode: XAxisMode = 'e2e',
+  /**
+   * GitHub run id for the "as of run" base view. Set only when an
+   * earlier-than-latest run is selected.
    */
   asOfRunId?: string,
   /**
@@ -118,11 +222,35 @@ export function useChartData(
       ? ''
       : selectedRunDate;
 
+  // Two queries: the normal latest-per-config view (always), plus the
+  // run-scoped rows when a specific workflow run is selected. The merged
+  // result pins ONLY the configs the selected run produced to that run, and
+  // carries every other config forward from the base rows — selecting one of
+  // two same-day vLLM runs must not hide the day's SGLang curve just because
+  // it lives in a different workflow run. The base query is the default view
+  // query, so it's almost always already in the React Query cache.
   const {
-    data: allRows,
-    isLoading: queryLoading,
-    error: queryError,
+    data: baseRows,
+    isLoading: baseLoading,
+    error: baseError,
   } = useBenchmarks(selectedModel, queryDate, enabled, asOfRunId);
+  const {
+    data: runRows,
+    isLoading: runLoading,
+    error: runError,
+  } = useBenchmarks(selectedModel, '', enabled && Boolean(selectedRunId), selectedRunId, true);
+
+  const allRows = useMemo(() => {
+    if (!selectedRunId) return baseRows;
+    // Wait for the run rows before rendering a scoped view — rendering base
+    // rows first would flash the un-scoped chart, then swap contested points.
+    if (!runRows) return undefined;
+    if (!baseRows) return runRows;
+    return mergeRunScopedRows(runRows, baseRows);
+  }, [selectedRunId, runRows, baseRows]);
+
+  const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading);
+  const queryError = baseError ?? (selectedRunId ? runError : null);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
@@ -155,11 +283,13 @@ export function useChartData(
   // Merge main rows with comparison date rows.
   // Stamp each row with the *requested* date (not the actual DB date) so that
   // GPUGraph's activeDates filter (keyed by user-selected date) matches the points.
-  const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]);
+  //
+  // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via
+  // benchmark_type), so one filter covers every scenario.
   const rows = useMemo(() => {
-    if (!allRows || !sequenceIslOsl) return [];
-    const seqFilter = (r: { isl: number; osl: number }) =>
-      r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl;
+    if (!allRows) return [];
+    const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) =>
+      rowToSequence(r) === selectedSequence;
     const seqFiltered = allRows.filter(seqFilter);
 
     // For each (hw, framework, spec_method, disagg, precision) group, keep only
@@ -186,14 +316,14 @@ export function useChartData(
         .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })),
     );
     return [...mainRows, ...extraRows];
-  }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]);
+  }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]);
 
   // Transform filtered rows into chart data
   const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => {
     if (rows.length === 0)
       return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig };
-    return transformBenchmarkRows(rows);
-  }, [rows]);
+    return transformBenchmarkRows(rows, selectedPercentile);
+  }, [rows, selectedPercentile]);
 
   // Sort hardware config — stabilize reference when keys haven't changed.
   // Different sequences for the same model often have the same GPU configs,
@@ -241,8 +371,11 @@ export function useChartData(
       (chartDefinitions as ChartDefinition[]).map((chartDef) => {
         const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
 
-        // Determine dynamic x-axis
-        let xAxisField: keyof AggDataEntry = chartDef.x;
+        // Default x-axis = chart's natural latency metric, percentile-adjusted
+        // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic
+        // scenarios `withPercentile` is a no-op when percentile === 'median'.
+        const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry;
+        let xAxisField: keyof AggDataEntry = naturalX;
         let xAxisLabel = chartDef.x_label;
 
         const metricTitle =
@@ -252,14 +385,25 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
+        // The TTFT override is now any *_ttft metric (not just p90_ttft) — the
+        // x-axis-mode picker reconciles the percentile prefix based on sequence
+        // kind (fixed-seq → median, agentic → user-picked percentile).
         const isTtftOverride =
-          effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft';
-        const ttftLabel =
-          effectiveXMetric === 'p99_ttft'
-            ? 'P99 Time To First Token (s)'
-            : 'Median Time To First Token (s)';
-
-        if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
+          typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft');
+        const ttftPctl = isTtftOverride
+          ? (effectiveXMetric as string).replace(/_ttft$/u, '')
+          : 'p90';
+        const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase();
+        const ttftLabel = `${ttftPctlWord} Time To First Token (s)`;
+
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
+
+        if (
+          effectiveXMetric &&
+          chartDef.chartType === 'interactivity' &&
+          isInputMetric &&
+          !isAgentic
+        ) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) {
@@ -268,6 +412,10 @@ export function useChartData(
             xAxisLabel = isTtftOverride ? ttftLabel : chartDef.x_label;
           }
         } else if (chartDef.chartType === 'interactivity' && isInputMetric) {
+          // Agentic falls through here too — the manual X-axis dropdown is
+          // hidden in agentic mode (would double up with the percentile
+          // selector), so the config default + percentile post-processing
+          // below drives the x axis.
           const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition;
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
@@ -277,12 +425,35 @@ export function useChartData(
           xAxisLabel = ttftLabel;
         }
 
+        // Agentic: rewrite the resolved x metric to the chosen percentile,
+        // and relabel accordingly. Both have to be updated unconditionally —
+        // xAxisField may already be percentile-adjusted (via naturalX) while
+        // xAxisLabel still carries the raw chartDef.x_label prefix.
+        // The chart heading ("vs. <latency>") is also rewritten to include
+        // the percentile so the title above the plot reflects what's drawn.
+        const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition;
+        let chartHeading = (chartDef[headingKey] as string) || chartDef.heading;
+        if (isAgentic) {
+          xAxisField = withPercentile(
+            xAxisField as string,
+            selectedPercentile,
+          ) as keyof AggDataEntry;
+          const pctlWord = selectedPercentile.toUpperCase();
+          xAxisLabel = xAxisLabel.replace(/^(?:Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord);
+          chartHeading = chartHeading.replace(
+            /^(?<vsPrefix>vs\.\s+)(?:(?:Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu,
+            `$1${pctlWord} `,
+          );
+        }
+
         // The x-axis is "flipped" only when the good-direction reverses
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
         // so no roofline flip is needed for the e2e chart.
+        // Compare against `naturalX` (percentile-adjusted) — switching the
+        // percentile of the same logical metric is NOT a flip.
         const xAxisFlipped =
-          xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride);
+          xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride);
 
         const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition;
         const dynamicYLabel = chartDef[yLabelKey];
@@ -303,6 +474,7 @@ export function useChartData(
           chartDefinition: {
             ...chartDef,
             ...rooflineOverrides,
+            heading: chartHeading,
             x_label: xAxisLabel,
             y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel),
           },
@@ -310,7 +482,13 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric],
+    [
+      selectedYAxisMetric,
+      selectedXAxisMetric,
+      selectedE2eXAxisMetric,
+      selectedPercentile,
+      selectedSequence,
+    ],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
@@ -344,9 +522,30 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
+        // For AGENTIC workloads only: when the user is NOT viewing the
+        // e2e latency chart, mark each point with whether it sits on the
+        // (e2e_latency, y) Pareto frontier for its (hwKey, precision,
+        // date) group. The chart still renders every point as scatter —
+        // only e2e-Pareto winners feed the roofline (ScatterGraph honors
+        // the flag). Prevents benchmark-hacking the TTFT / interactivity
+        // line by tanking decode (or vice versa) without hiding the
+        // non-optimal configs from view.
+        //
+        // Fixed-seq workloads keep the existing per-axis Pareto since
+        // there's no separate "session-time" notion of total latency —
+        // their e2e IS the request latency, so a TTFT hack there reads
+        // honestly on e2e too. The anti-hack constraint is specifically
+        // about multi-turn agentic where TTFT measures a tiny fraction
+        // of the user-visible session time.
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
+        const e2eParetoSet =
+          isAgentic && selectedXAxisMode !== 'e2e'
+            ? e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile)
+            : null;
+
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft';
+        const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
@@ -359,18 +558,26 @@ export function useChartData(
                 // d.x would otherwise mask the regression).
                 const xCandidate = (d as Partial<AggDataEntry>)[xAxisField];
                 const xValue = typeof xCandidate === 'number' ? xCandidate : d.x;
+                const isOnE2eFrontier =
+                  e2eParetoSet === null
+                    ? undefined
+                    : typeof d.id === 'number' && e2eParetoSet.has(d.id);
                 return {
                   ...d,
                   x: xValue,
                   y: yValue,
                   roof,
+                  isOnE2eFrontier,
                 };
               })
-              // When TTFT is on the x-axis, apply the latency limit to filter overload outliers
-              // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left)
+              // When TTFT is on the x-axis, apply the latency limit to filter
+              // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that
+              // compress all real data to the far left). Skip for agentic — long
+              // TTFTs there reflect real workloads (multi-turn, big prompts).
               .filter(
                 (d) =>
                   !isTtftX ||
+                  isAgentic ||
                   !chartDefinition.y_latency_limit ||
                   d.x <= chartDefinition.y_latency_limit,
               )
@@ -395,6 +602,8 @@ export function useChartData(
     userPowers,
     stableChartDefinitions,
     compareGpuPair,
+    selectedXAxisMode,
+    selectedPercentile,
     quickFilters,
   ]);
 
diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json
index d9a29181..9617638f 100644
--- a/packages/app/src/components/inference/inference-chart-config.json
+++ b/packages/app/src/components/inference/inference-chart-config.json
@@ -13,9 +13,9 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_left",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
-    "y_inputTputPerGpu_heading": "vs. P99 Time To First Token",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
+    "y_inputTputPerGpu_heading": "vs. P90 Time To First Token",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
@@ -126,8 +126,8 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_right",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
index 91db3d40..91761604 100644
--- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts
+++ b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
@@ -107,8 +107,7 @@ function resolveXAxisField(
   const metricTitle =
     (chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || '';
   const isInputMetric = metricTitle.toLowerCase().includes('input');
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     return selectedXAxisMetric;
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index ecf2fe33..5d0981b8 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -80,6 +80,8 @@ export interface WorkerPower {
  * @property {number} p99_e2el - 99th percentile of End-to-End Latency.
  */
 export interface AggDataEntry {
+  /** Stable per-point id from benchmark_results — for trace_replay lookups. */
+  id?: number;
   hw: string;
   mtp?: string;
   hwKey: string;
@@ -94,23 +96,43 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p75_ttft: number;
+  p90_ttft: number;
+  p95_ttft: number;
   p99_ttft: number;
+  'p99.9_ttft': number;
   mean_tpot: number;
   mean_intvty: number;
   median_tpot: number;
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p75_tpot: number;
+  p75_intvty: number;
+  p90_tpot: number;
+  p90_intvty: number;
+  p95_tpot: number;
+  p95_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
+  'p99.9_tpot': number;
+  'p99.9_intvty': number;
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p75_itl: number;
+  p90_itl: number;
+  p95_itl: number;
   p99_itl: number;
+  'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p75_e2el: number;
+  p90_e2el: number;
+  p95_e2el: number;
   p99_e2el: number;
+  'p99.9_e2el': number;
   // Measured GPU telemetry (emitted by runner's aggregate_power.py).
   // Optional because historical runs predate the fields.
   avg_power_w?: number;
@@ -162,6 +184,29 @@ export interface AggDataEntry {
   actualDate?: string;
   /** URL to the GitHub Actions workflow run that produced this data point. */
   run_url?: string;
+  /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */
+  benchmark_type?: string;
+  /** ISL in tokens — null for agentic_traces. */
+  isl?: number | null;
+  /** OSL in tokens — null for agentic_traces. */
+  osl?: number | null;
+  // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ──
+  /** "on" | "off" — whether KV cache offload to CPU was enabled. */
+  offload_mode?: string;
+  /** Actual server-observed GPU prefix-cache hit rate (0..1). */
+  server_gpu_cache_hit_rate?: number;
+  /** Actual server-observed CPU prefix-cache hit rate (0..1). */
+  server_cpu_cache_hit_rate?: number;
+  /** Infinite-cache theoretical hit rate (0..1) computed from trace. */
+  theoretical_cache_hit_rate?: number;
+  /** Total requests attempted during the window. */
+  num_requests_total?: number;
+  /** Requests that completed successfully. */
+  num_requests_successful?: number;
+  /** Total prompt tokens served. */
+  total_prompt_tokens?: number;
+  /** Total generated (output) tokens. */
+  total_generation_tokens?: number;
 }
 
 /**
@@ -187,6 +232,17 @@ export interface InferenceData extends Partial<Omit<AggDataEntry, AggDataConflic
   x: number;
   y: number;
   hidden?: boolean;
+  /**
+   * Whether this point sits on the (e2e_latency, y-metric) Pareto frontier.
+   * Set by useChartData when `selectedXAxisMode !== 'e2e'`. The TTFT /
+   * interactivity / session-time / prefill-tps charts use this flag to
+   * restrict their roofline computation to e2e-Pareto winners — vendors
+   * can't benchmark-hack TTFT by tanking decode (or vice versa) and still
+   * appear on the frontier line — while keeping every point visible as
+   * scatter so the user can see where dominated configs actually sit.
+   * Undefined when the chart is in e2e mode (no remapping needed).
+   */
+  isOnE2eFrontier?: boolean;
 
   // Overridden fields with narrower types
   hwKey: string;
@@ -660,10 +716,33 @@ export interface InferenceChartContextType {
   workflowInfo: any;
   selectedYAxisMetric: string;
   setSelectedYAxisMetric: (metric: string) => void;
+  /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */
+  selectedPercentile: string;
+  setSelectedPercentile: (p: string) => void;
   selectedXAxisMetric: string | null;
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
+  /**
+   * Which chart variant the user wants to see — the inference card shows one chart
+   * at a time, picked by the big buttons above the chart.
+   * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
+   * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
+   * - 'normalized-e2e'→ agentic-only; x = per-request E2E normalized to 400 output tokens
+   * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   * - 'session-time'  → agentic-only; x = mean-normalized session time (live-computed from trace blobs)
+   * - 'prefill-tps'   → agentic-only; x = mean of P90 prefill TPS/user per session
+   */
+  selectedXAxisMode:
+    | 'ttft'
+    | 'e2e'
+    | 'normalized-e2e'
+    | 'interactivity'
+    | 'session-time'
+    | 'prefill-tps';
+  setSelectedXAxisMode: (
+    mode: 'ttft' | 'e2e' | 'normalized-e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
+  ) => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   /** Coarse vendor / framework / agg-disagg / mtp-stp filters applied to the chart point set. */
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 84db5e1f..9f333482 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -1,6 +1,6 @@
 'use client';
 
-import { useMemo, useState } from 'react';
+import { useEffect, useMemo, useState } from 'react';
 
 import { track } from '@/lib/analytics';
 import { useFeatureGate } from '@/lib/use-feature-gate';
@@ -9,7 +9,8 @@ import { cn } from '@/lib/utils';
 import { useInference } from '@/components/inference/InferenceContext';
 import {
   ModelSelector,
-  SequenceSelector,
+  ScenarioSelector,
+  PercentileSelector,
   PrecisionSelector,
 } from '@/components/ui/chart-selectors';
 import { DateRangePicker } from '@/components/ui/date-range-picker';
@@ -28,7 +29,7 @@ import { Button } from '@/components/ui/button';
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type { ChartDefinition, DisaggMode, SpecMode } from '@/components/inference/types';
 import { FRAMEWORK_FAMILIES } from '@/components/inference/utils/quickFilters';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model, type Percentile } from '@/lib/data-mappings';
 
 /**
  * Y-axis metric options from static chart config JSON — available immediately, no API wait.
@@ -109,6 +110,13 @@ interface ChartControlsProps {
 }
 
 export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) {
+  // The percentile selector is rendered conditionally on `selectedSequence`,
+  // which on the client is hydrated from URL params. SSR doesn't see the URL,
+  // so deferring the conditional until after mount keeps the initial DOM
+  // identical between server and client (avoids hydration warnings).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const [openDropdown, setOpenDropdown] = useState<string | null>(null);
   const handleDropdownOpenChange = (dropdownKey: string) => (open: boolean) => {
     if (open) {
@@ -117,6 +125,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
     }
     setOpenDropdown((current) => (current === dropdownKey ? null : current));
   };
+
   const {
     selectedModel,
     setSelectedModel,
@@ -126,6 +135,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
     setSelectedPrecisions,
     selectedYAxisMetric,
     setSelectedYAxisMetric,
+    selectedPercentile,
+    setSelectedPercentile,
     graphs,
     selectedGPUs,
     setSelectedGPUs,
@@ -354,14 +365,21 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
             availableModels={availableModels}
             data-testid="model-selector"
           />
-          <SequenceSelector
+          <ScenarioSelector
             value={selectedSequence}
             onChange={handleSequenceChange}
             open={openDropdown === 'sequence'}
             onOpenChange={handleDropdownOpenChange('sequence')}
             availableSequences={availableSequences}
-            data-testid="sequence-selector"
+            data-testid="scenario-selector"
           />
+          {mounted && selectedSequence === Sequence.AgenticTraces && (
+            <PercentileSelector
+              value={selectedPercentile}
+              onChange={(p: Percentile) => setSelectedPercentile(p)}
+              data-testid="percentile-selector"
+            />
+          )}
           <PrecisionSelector
             value={selectedPrecisions}
             onChange={handlePrecisionChange}
@@ -391,16 +409,17 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
           </div>
 
           {graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') &&
-            isInputMetric && (
+            isInputMetric &&
+            selectedSequence !== Sequence.AgenticTraces && (
               <div className="flex flex-col space-y-1.5 lg:col-span-1">
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
                   label="X-Axis Metric"
-                  tooltip="The latency metric displayed on the chart's X-axis. Options include P99 Time To First Token and Median Time To First Token."
+                  tooltip="The latency metric displayed on the chart's X-axis: P90 Time To First Token."
                 />
                 <Select
                   onValueChange={handleXAxisMetricChange}
-                  value={selectedXAxisMetric ?? 'p99_ttft'}
+                  value={selectedXAxisMetric ?? 'p90_ttft'}
                 >
                   <SelectTrigger
                     id="x-axis-select"
@@ -410,8 +429,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                     <SelectValue />
                   </SelectTrigger>
                   <SelectContent portalled={false}>
-                    <SelectItem value="p99_ttft">P99 TTFT</SelectItem>
-                    <SelectItem value="median_ttft">Median TTFT</SelectItem>
+                    <SelectItem value="p90_ttft">P90 TTFT</SelectItem>
                   </SelectContent>
                 </Select>
               </div>
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 882b6f93..6952f439 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,9 +1,12 @@
 'use client';
-import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants';
+import {
+  DISPLAY_MODEL_TO_DB,
+  NORMALIZED_E2E_OUTPUT_TOKENS,
+} from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
 import { useEffect, useMemo, useRef, useState } from 'react';
-import { BarChart3, ChevronDown, Table2, X } from 'lucide-react';
+import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import { useInference } from '@/components/inference/InferenceContext';
@@ -14,7 +17,10 @@ import type {
   OverlayData,
   TrendDataPoint,
 } from '@/components/inference/types';
-import { processOverlayChartData } from '@/components/inference/utils';
+import {
+  processOverlayChartData,
+  selectUnofficialOverlayForMode,
+} from '@/components/inference/utils';
 import {
   isRunComparisonEntry,
   makeRunComparisonEntry,
@@ -38,7 +44,6 @@ import {
   DialogHeader,
   DialogTitle,
 } from '@/components/ui/dialog';
-import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
 import { Skeleton } from '@/components/ui/skeleton';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import {
@@ -48,8 +53,14 @@ import {
   getModelLabel,
   getPrecisionLabel,
   getSequenceLabel,
+  sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
+import {
+  useDerivedAgenticMetrics,
+  type DerivedAgenticMetric,
+} from '@/hooks/api/use-derived-agentic-metrics';
+import { isAgenticOnlyXAxisMode, type XAxisMode } from '@/components/inference/hooks/useChartData';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
 import { getHardwareConfig, hardwareKeyMatchesAnyBase } from '@/lib/constants';
 
@@ -67,55 +78,58 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra
 });
 import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 
-/** Controlled popover dropdown for the e2e chart x-axis toggle. */
-function E2eXAxisDropdown({
-  xAxisLabel,
-  xAxisOptions,
-  selectedValue,
-  onSelect,
-}: {
-  xAxisLabel: string;
-  xAxisOptions: { value: string | null; label: string }[];
-  selectedValue: string | null;
-  onSelect: (value: string | null) => void;
-}) {
-  const [open, setOpen] = useState(false);
-  return (
-    <Popover open={open} onOpenChange={setOpen}>
-      <PopoverTrigger asChild>
-        <button
-          type="button"
-          className="inline-flex items-center gap-1 hover:opacity-70 transition-opacity cursor-pointer"
-          onClick={(e) => e.stopPropagation()}
-        >
-          vs. {xAxisLabel}
-          <ChevronDown className="no-export size-3.5 shrink-0 opacity-60" />
-        </button>
-      </PopoverTrigger>
-      <PopoverContent className="w-48 p-1" align="start">
-        {xAxisOptions.map((opt) => (
-          <button
-            type="button"
-            key={opt.label}
-            className={`w-full text-left px-3 py-1.5 text-sm rounded hover:bg-accent transition-colors ${
-              (opt.value === null && !selectedValue) || opt.value === selectedValue
-                ? 'font-medium'
-                : ''
-            }`}
-            onClick={() => {
-              onSelect(opt.value);
-              setOpen(false);
-            }}
-          >
-            {opt.label}
-          </button>
-        ))}
-      </PopoverContent>
-    </Popover>
-  );
+type InferenceViewMode = 'chart' | 'table';
+
+const X_AXIS_MODE_BUTTONS: { value: XAxisMode; label: string }[] = [
+  { value: 'ttft', label: 'TTFT' },
+  { value: 'e2e', label: 'E2E Latency' },
+  { value: 'normalized-e2e', label: 'Normalized E2E' },
+  { value: 'interactivity', label: 'Interactivity' },
+  { value: 'session-time', label: 'Session Time' },
+  { value: 'prefill-tps', label: 'Prefill TPS / user' },
+];
+
+/**
+ * Presentation + data plumbing for the trace-derived x-axis modes (the
+ * agentic-only modes). One spec per mode keeps the x-label, chart heading,
+ * roofline corner, and derived-metric accessor in sync instead of scattering
+ * `selectedXAxisMode === …` conditionals through the render.
+ */
+interface DerivedXModeSpec {
+  xLabel: (percentileLabel: string) => string;
+  /** Chart heading suffix ("vs. …") shown above the plot. */
+  heading: (percentileLabel: string) => string;
+  rooflineCorner: 'upper_right' | 'upper_left';
+  /** Pull the raw metric for this mode off the derived-metrics payload. */
+  value: (m: DerivedAgenticMetric | undefined, percentile: string) => number | null | undefined;
+  /** Convert the raw metric to the plotted x value. */
+  toX: (raw: number) => number;
 }
 
-type InferenceViewMode = 'chart' | 'table';
+const DERIVED_X_MODE_SPECS: Partial<Record<XAxisMode, DerivedXModeSpec>> = {
+  'session-time': {
+    xLabel: () => 'Mean Normalized Session Time (min)',
+    heading: () => 'vs. Mean Normalized Session Time',
+    rooflineCorner: 'upper_right',
+    value: (m) => m?.normalized_session_time_s,
+    toX: (raw) => raw / 60,
+  },
+  'normalized-e2e': {
+    xLabel: (pctl) => `${pctl} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens (s)`,
+    heading: (pctl) => `vs. ${pctl} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens`,
+    rooflineCorner: 'upper_right',
+    value: (m, percentile) =>
+      percentile === 'p75' ? m?.p75_normalized_e2e_400_s : m?.p90_normalized_e2e_400_s,
+    toX: (raw) => raw,
+  },
+  'prefill-tps': {
+    xLabel: () => 'P90 Prefill TPS per user (tok/s)',
+    heading: () => 'vs. P90 Prefill TPS / user',
+    rooflineCorner: 'upper_left',
+    value: (m) => m?.p90_prefill_tps_per_user,
+    toX: (raw) => raw,
+  },
+};
 
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
   {
@@ -161,8 +175,10 @@ export default function ChartDisplay() {
     logScale,
     activeHwTypes,
     activeDates,
-    setSelectedE2eXAxisMetric,
+    selectedPercentile,
     compareGpuPair,
+    selectedXAxisMode,
+    setSelectedXAxisMode,
   } = useInference();
 
   const {
@@ -171,6 +187,9 @@ export default function ChartDisplay() {
     totalDatesQueried,
   } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates);
 
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const modelDbKeys = useMemo(
     () => DISPLAY_MODEL_TO_DB[selectedModel] ?? [selectedModel],
     [selectedModel],
@@ -278,6 +297,7 @@ export default function ChartDisplay() {
         chartType,
         selectedYAxisMetric,
         effectiveXMetric,
+        { isAgentic: sequenceKind(selectedSequence) === 'agentic' },
       );
 
       let overlayPoints = processed;
@@ -395,238 +415,267 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
-  const displayGraphs = isFirstLoad
-    ? Array.from({ length: 2 }).map((_, index) => (
-        <Card key={`skeleton-${index}`}>
-          <Skeleton className="h-7 w-2/4 mb-1" />
-          <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-150 w-full" />
-        </Card>
-      ))
-    : effectiveGraphs.length === 0
-      ? []
-      : effectiveGraphs.map((graph, graphIndex) => {
-          const isTimelineMode = Boolean(
-            selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
-          );
-          const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
-          return (
-            <section key={graphIndex} className="pt-8 md:pt-0">
-              <figure data-testid="chart-figure" className="relative rounded-lg">
-                <ChartButtons
-                  chartId={`chart-${graphIndex}`}
-                  analyticsPrefix={
-                    isTimelineMode
-                      ? 'gpu_timeseries'
-                      : graph.chartDefinition.chartType === 'e2e'
-                        ? 'latency'
-                        : 'interactivity'
-                  }
-                  leadingControls={
-                    <SegmentedToggle
-                      value={getViewMode(graphIndex)}
-                      options={VIEW_MODE_OPTIONS}
-                      onValueChange={(v) => handleViewModeChange(graphIndex, v)}
-                      ariaLabel="View mode"
-                      testId={`inference-view-toggle-${graphIndex}`}
-                    />
-                  }
-                  hideImageExport={getViewMode(graphIndex) === 'table'}
-                  setIsLegendExpanded={setIsLegendExpanded}
-                  exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
-                  onExportMp4={
-                    replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
-                  }
-                  onExportCsv={() => {
-                    const visibleData = graph.data.filter((d) =>
+  const visibleGraphs = useMemo(() => {
+    const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
+    const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
+    return filtered.length > 0 ? filtered : effectiveGraphs;
+  }, [effectiveGraphs, selectedXAxisMode]);
+
+  const isAgenticSequence = sequenceKind(selectedSequence) === 'agentic';
+  const useDerived = isAgenticSequence && isAgenticOnlyXAxisMode(selectedXAxisMode);
+  const derivedTargetIds = useMemo(() => {
+    if (!useDerived) return [] as number[];
+    const ids = new Set<number>();
+    for (const graph of visibleGraphs) {
+      for (const point of graph.data) {
+        if (point.benchmark_type === 'agentic_traces' && typeof point.id === 'number') {
+          ids.add(point.id);
+        }
+      }
+    }
+    return [...ids];
+  }, [useDerived, visibleGraphs]);
+  const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
+  const derivedMetrics = derivedQuery.data;
+  const isDerivedLoading =
+    useDerived &&
+    derivedTargetIds.length > 0 &&
+    (derivedQuery.isPending || derivedQuery.isFetching) &&
+    !derivedMetrics;
+
+  // Set only when the user is on a derived (agentic-only) x-axis mode; the
+  // specs are module constants so this is referentially stable per mode.
+  const derivedSpec = useDerived ? DERIVED_X_MODE_SPECS[selectedXAxisMode] : undefined;
+
+  const renderableGraphs = useMemo(() => {
+    if (!derivedSpec) return visibleGraphs;
+    if (!derivedMetrics) return visibleGraphs.map((graph) => ({ ...graph, data: [] }));
+    const xLabel = derivedSpec.xLabel(selectedPercentile.toUpperCase());
+    return visibleGraphs.map((graph) => {
+      const chartDefinition = {
+        ...graph.chartDefinition,
+        x_label: xLabel,
+        y_latency_limit: undefined,
+        [`${selectedYAxisMetric}_roofline` as keyof typeof graph.chartDefinition]:
+          derivedSpec.rooflineCorner,
+      };
+      const data = graph.data
+        .map((point) => {
+          if (typeof point.id !== 'number') return null;
+          const raw = derivedSpec.value(derivedMetrics[point.id], selectedPercentile);
+          if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
+          return { ...point, x: derivedSpec.toX(raw) };
+        })
+        .filter((point): point is NonNullable<typeof point> => point !== null);
+      return { ...graph, chartDefinition, data };
+    });
+  }, [derivedSpec, visibleGraphs, derivedMetrics, selectedYAxisMetric, selectedPercentile]);
+
+  const displayGraphs =
+    isFirstLoad || isDerivedLoading
+      ? [
+          <Card key="skeleton-0">
+            <Skeleton className="h-7 w-2/4 mb-1" />
+            <Skeleton className="h-5 w-3/4 mb-2" />
+            <Skeleton className="h-[600px] w-full" />
+          </Card>,
+        ]
+      : renderableGraphs.length === 0
+        ? []
+        : renderableGraphs.map((graph, graphIndex) => {
+            const isTimelineMode = Boolean(
+              selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+            );
+            const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+            return (
+              <section key={graphIndex} className="pt-8 md:pt-0">
+                <figure data-testid="chart-figure" className="relative rounded-lg">
+                  <ChartButtons
+                    chartId={`chart-${graphIndex}`}
+                    analyticsPrefix={
                       isTimelineMode
-                        ? activeDates.has(`${d.date}_${d.hwKey}`)
-                        : activeHwTypes.has(d.hwKey as string) &&
-                          selectedPrecisions.includes(d.precision),
-                    );
-                    const { headers, rows } = inferenceChartToCsv(
-                      visibleData,
-                      graph.model,
-                      graph.sequence,
-                    );
-                    // Match warnings against the same series the chart annotates,
-                    // including visible unofficial-run overlay series.
-                    const overlay =
-                      graph.chartDefinition.chartType === 'e2e'
-                        ? overlayDataByChartType.e2e
-                        : overlayDataByChartType.interactivity;
-                    const visibleOverlayRows = isTimelineMode
-                      ? []
-                      : (overlay?.data ?? []).filter(
-                          (p) =>
-                            activeOverlayHwTypes.has(p.hwKey as string) &&
-                            selectedPrecisions.includes(p.precision),
-                        );
-                    const issueNotes = matchKnownConfigIssues(graph.model, [
-                      ...visibleData,
-                      ...visibleOverlayRows,
-                    ]).map((issue) =>
-                      knownIssueCsvNote(
-                        issue,
-                        getDisplayLabel(getHardwareConfig(issue.hwKey, graph.model)),
-                      ),
-                    );
-                    exportToCsv(
-                      `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
-                      headers,
-                      rows,
-                      issueNotes,
-                    );
-                  }}
-                />
-                <Card>
-                  {(() => {
-                    const chartCaption = (
-                      <>
-                        <h2 className="text-lg font-semibold">
-                          {
-                            graph.chartDefinition[
-                              `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                            ]
-                          }{' '}
-                          {(() => {
-                            // For Input metrics with dynamic x-axis, use dynamic heading
-                            const metricTitle =
-                              (graph.chartDefinition[
+                        ? 'gpu_timeseries'
+                        : graph.chartDefinition.chartType === 'e2e'
+                          ? 'latency'
+                          : 'interactivity'
+                    }
+                    leadingControls={
+                      <SegmentedToggle
+                        value={getViewMode(graphIndex)}
+                        options={VIEW_MODE_OPTIONS}
+                        onValueChange={(v) => handleViewModeChange(graphIndex, v)}
+                        ariaLabel="View mode"
+                        testId={`inference-view-toggle-${graphIndex}`}
+                      />
+                    }
+                    hideImageExport={getViewMode(graphIndex) === 'table'}
+                    setIsLegendExpanded={setIsLegendExpanded}
+                    exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+                    onExportMp4={
+                      replayAvailable
+                        ? () => replayHandlesRef.current[graphIndex]?.open()
+                        : undefined
+                    }
+                    onExportCsv={() => {
+                      const visibleData = graph.data.filter((d) =>
+                        isTimelineMode
+                          ? activeDates.has(`${d.date}_${d.hwKey}`)
+                          : activeHwTypes.has(d.hwKey as string) &&
+                            selectedPrecisions.includes(d.precision),
+                      );
+                      const { headers, rows } = inferenceChartToCsv(
+                        visibleData,
+                        graph.model,
+                        graph.sequence,
+                      );
+                      // Match warnings against the same series the chart annotates,
+                      // including visible unofficial-run overlay series.
+                      const overlay = selectUnofficialOverlayForMode(
+                        selectedXAxisMode,
+                        graph.chartDefinition.chartType,
+                        overlayDataByChartType,
+                      );
+                      const visibleOverlayRows = isTimelineMode
+                        ? []
+                        : (overlay?.data ?? []).filter(
+                            (p) =>
+                              activeOverlayHwTypes.has(p.hwKey as string) &&
+                              selectedPrecisions.includes(p.precision),
+                          );
+                      const issueNotes = matchKnownConfigIssues(graph.model, [
+                        ...visibleData,
+                        ...visibleOverlayRows,
+                      ]).map((issue) =>
+                        knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))),
+                      );
+                      exportToCsv(
+                        `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+                        headers,
+                        rows,
+                        issueNotes,
+                      );
+                    }}
+                  />
+                  <Card>
+                    {(() => {
+                      const chartCaption = (
+                        <>
+                          <h2 className="text-lg font-semibold">
+                            {
+                              graph.chartDefinition[
                                 `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                              ] as string) || '';
-                            const isInputMetric = metricTitle.toLowerCase().includes('input');
-                            if (
-                              graph.chartDefinition.chartType === 'interactivity' &&
-                              isInputMetric &&
-                              selectedXAxisMetric
-                            ) {
-                              if (selectedXAxisMetric === 'p99_ttft') {
-                                return 'vs. P99 Time To First Token';
-                              } else if (selectedXAxisMetric === 'median_ttft') {
-                                return 'vs. Median Time To First Token';
+                              ]
+                            }{' '}
+                            {(() => {
+                              // For Input metrics with dynamic x-axis, use dynamic heading
+                              const metricTitle =
+                                (graph.chartDefinition[
+                                  `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                                ] as string) || '';
+                              const isInputMetric = metricTitle.toLowerCase().includes('input');
+                              if (
+                                graph.chartDefinition.chartType === 'interactivity' &&
+                                isInputMetric &&
+                                selectedXAxisMetric
+                              ) {
+                                if (selectedXAxisMetric === 'p99_ttft') {
+                                  return 'vs. P99 Time To First Token';
+                                } else if (selectedXAxisMetric === 'median_ttft') {
+                                  return 'vs. Median Time To First Token';
+                                }
+                              }
+
+                              // The e2e chart heading follows the branch-level x-axis mode
+                              // selector, including agentic-only derived metrics.
+                              if (graph.chartDefinition.chartType === 'e2e') {
+                                const modeSpec = DERIVED_X_MODE_SPECS[selectedXAxisMode];
+                                if (modeSpec) {
+                                  return modeSpec.heading(selectedPercentile.toUpperCase());
+                                }
+                                if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                  const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                  const word =
+                                    percentile === 'median' ? 'Median' : percentile.toUpperCase();
+                                  return `vs. ${word} Time To First Token`;
+                                }
+                                return isAgenticSequence
+                                  ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency`
+                                  : 'vs. End-to-end Latency';
                               }
-                            }
 
-                            // For e2e chart: render clickable inline dropdown for x-axis
-                            if (graph.chartDefinition.chartType === 'e2e') {
-                              const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p99_ttft'
-                                  ? 'P99 TTFT'
-                                  : selectedE2eXAxisMetric === 'median_ttft'
-                                    ? 'Median TTFT'
-                                    : 'End-to-end Latency';
-                              const xAxisOptions = [
-                                { value: null, label: 'End-to-end Latency' },
-                                { value: 'p99_ttft', label: 'P99 TTFT' },
-                                { value: 'median_ttft', label: 'Median TTFT' },
-                              ];
-                              const zoomPrefix =
-                                selectedDateRange.startDate &&
-                                selectedDateRange.endDate &&
-                                selectedGPUs.length > 0
-                                  ? 'gpu_timeseries'
-                                  : 'latency';
+                              // Fall back to configured heading
                               return (
-                                <E2eXAxisDropdown
-                                  xAxisLabel={xAxisLabel}
-                                  xAxisOptions={xAxisOptions}
-                                  selectedValue={selectedE2eXAxisMetric}
-                                  onSelect={(value) => {
-                                    setSelectedE2eXAxisMetric(value);
-                                    track('latency_x_axis_metric_selected', {
-                                      metric: value ?? 'median_e2el',
-                                    });
-                                    window.dispatchEvent(
-                                      new CustomEvent(
-                                        `${zoomPrefix}_zoom_reset_chart-${graphIndex}`,
-                                      ),
-                                    );
-                                  }}
-                                />
+                                graph.chartDefinition[
+                                  `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
+                                ] || graph.chartDefinition.heading
                               );
-                            }
-
-                            // Fall back to configured heading
-                            return (
-                              graph.chartDefinition[
-                                `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                              ] || graph.chartDefinition.heading
-                            );
-                          })()}
-                        </h2>
-                        <p className="text-sm text-muted-foreground mb-2">
-                          {getModelLabel(graph.model as Model)} •{' '}
-                          {selectedPrecisions
-                            .map((prec) => getPrecisionLabel(prec as Precision))
-                            .join(', ')}{' '}
-                          • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
-                          {isUnofficialRun
-                            ? 'Source: UNOFFICIAL'
-                            : 'Source: SemiAnalysis InferenceX™'}
-                          {selectedRunDate && (
-                            <>
-                              {' '}
-                              • Updated:{' '}
-                              {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
-                                'en-US',
-                                {
-                                  year: 'numeric',
-                                  month: '2-digit',
-                                  day: '2-digit',
-                                  timeZone: 'UTC',
-                                },
-                              )}
-                            </>
+                            })()}
+                          </h2>
+                          <p className="text-sm text-muted-foreground mb-2">
+                            {getModelLabel(graph.model as Model)} •{' '}
+                            {selectedPrecisions
+                              .map((prec) => getPrecisionLabel(prec as Precision))
+                              .join(', ')}{' '}
+                            • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+                            {isUnofficialRun
+                              ? 'Source: UNOFFICIAL'
+                              : 'Source: SemiAnalysis InferenceX™'}
+                            {selectedRunDate && (
+                              <>
+                                {' '}
+                                • Updated:{' '}
+                                {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+                                  'en-US',
+                                  {
+                                    year: 'numeric',
+                                    month: '2-digit',
+                                    day: '2-digit',
+                                    timeZone: 'UTC',
+                                  },
+                                )}
+                              </>
+                            )}
+                          </p>
+                          <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          {isUnofficialRun && selectedXAxisMode === 'normalized-e2e' && (
+                            <p className="mb-2 text-xs text-muted-foreground">
+                              Normalized E2E requires persisted per-request traces, so
+                              unofficial-run overlays are unavailable for this experimental view.
+                            </p>
                           )}
-                        </p>
-                        <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
-                        <UnofficialDomainNotice />
-                      </>
-                    );
-
-                    if (getViewMode(graphIndex) === 'table') {
-                      const overlay =
-                        graph.chartDefinition.chartType === 'e2e'
-                          ? overlayDataByChartType.e2e
-                          : overlayDataByChartType.interactivity;
-                      const overlayRows = (overlay?.data ?? []).filter((p) =>
-                        selectedPrecisions.includes(p.precision),
-                      );
-                      return (
-                        <>
-                          {chartCaption}
-                          <InferenceTable
-                            data={
-                              overlayRows.length > 0 ? [...graph.data, ...overlayRows] : graph.data
-                            }
-                            chartDefinition={graph.chartDefinition}
-                            selectedYAxisMetric={selectedYAxisMetric}
-                          />
+                          <UnofficialDomainNotice />
                         </>
                       );
-                    }
 
-                    return selectedGPUs.length > 0 &&
-                      ((selectedDateRange.startDate && selectedDateRange.endDate) ||
-                        selectedDates.length > 0) ? (
-                      <GPUGraph
-                        chartId={`chart-${graphIndex}`}
-                        modelLabel={graph.model}
-                        data={graph.data}
-                        xLabel={graph.chartDefinition.x_label}
-                        yLabel={`${
-                          graph.chartDefinition[
-                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                          ]
-                        }`}
-                        chartDefinition={graph.chartDefinition}
-                        caption={chartCaption}
-                        runNumbering={runNumbering}
-                      />
-                    ) : (
-                      <div className="relative">
-                        <ScatterGraph
+                      if (getViewMode(graphIndex) === 'table') {
+                        const overlay = selectUnofficialOverlayForMode(
+                          selectedXAxisMode,
+                          graph.chartDefinition.chartType,
+                          overlayDataByChartType,
+                        );
+                        const overlayRows = (overlay?.data ?? []).filter((p) =>
+                          selectedPrecisions.includes(p.precision),
+                        );
+                        return (
+                          <>
+                            {chartCaption}
+                            <InferenceTable
+                              data={
+                                overlayRows.length > 0
+                                  ? [...graph.data, ...overlayRows]
+                                  : graph.data
+                              }
+                              chartDefinition={graph.chartDefinition}
+                              selectedYAxisMetric={selectedYAxisMetric}
+                            />
+                          </>
+                        );
+                      }
+
+                      return selectedGPUs.length > 0 &&
+                        ((selectedDateRange.startDate && selectedDateRange.endDate) ||
+                          selectedDates.length > 0) ? (
+                        <GPUGraph
                           chartId={`chart-${graphIndex}`}
                           modelLabel={graph.model}
                           data={graph.data}
@@ -638,44 +687,62 @@ export default function ChartDisplay() {
                           }`}
                           chartDefinition={graph.chartDefinition}
                           caption={chartCaption}
-                          overlayData={
-                            graph.chartDefinition.chartType === 'e2e'
-                              ? (overlayDataByChartType.e2e ?? undefined)
-                              : (overlayDataByChartType.interactivity ?? undefined)
-                          }
+                          runNumbering={runNumbering}
                         />
-                        {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
-                          selectedDates.length === 0 && (
-                            <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
-                              <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range or add a run to view GPU comparison
-                              </p>
-                            </div>
-                          )}
-                      </div>
-                    );
-                  })()}
-                  {replayAvailable && (
-                    <ReplayLauncher
-                      ref={(handle) => {
-                        replayHandlesRef.current[graphIndex] = handle;
-                      }}
-                      parentChartId={`chart-${graphIndex}`}
-                      chartDefinition={graph.chartDefinition}
-                      yLabel={`${
-                        graph.chartDefinition[
-                          `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                        ]
-                      }`}
-                      xLabel={graph.chartDefinition.x_label}
-                    />
-                  )}
-                </Card>
-              </figure>
-            </section>
-          );
-        });
+                      ) : (
+                        <div className="relative">
+                          <ScatterGraph
+                            chartId={`chart-${graphIndex}`}
+                            modelLabel={graph.model}
+                            data={graph.data}
+                            xLabel={graph.chartDefinition.x_label}
+                            yLabel={`${
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                              ]
+                            }`}
+                            chartDefinition={graph.chartDefinition}
+                            caption={chartCaption}
+                            overlayData={
+                              selectUnofficialOverlayForMode(
+                                selectedXAxisMode,
+                                graph.chartDefinition.chartType,
+                                overlayDataByChartType,
+                              ) ?? undefined
+                            }
+                          />
+                          {selectedGPUs.length > 0 &&
+                            (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
+                            selectedDates.length === 0 && (
+                              <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
+                                <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
+                                  Select a date range or add a run to view GPU comparison
+                                </p>
+                              </div>
+                            )}
+                        </div>
+                      );
+                    })()}
+                    {replayAvailable && (
+                      <ReplayLauncher
+                        ref={(handle) => {
+                          replayHandlesRef.current[graphIndex] = handle;
+                        }}
+                        parentChartId={`chart-${graphIndex}`}
+                        chartDefinition={graph.chartDefinition}
+                        yLabel={`${
+                          graph.chartDefinition[
+                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                          ]
+                        }`}
+                        xLabel={graph.chartDefinition.x_label}
+                      />
+                    )}
+                  </Card>
+                </figure>
+              </section>
+            );
+          });
 
   return (
     <div data-testid="inference-chart-display" className="flex flex-col gap-4">
@@ -733,6 +800,41 @@ export default function ChartDisplay() {
           <CustomPowers loading={loading} />
         </section>
       )}
+      <section
+        className="flex flex-wrap justify-center gap-3 sm:gap-4"
+        role="tablist"
+        aria-label="Chart x-axis metric"
+        data-testid="x-axis-mode-buttons"
+      >
+        {X_AXIS_MODE_BUTTONS.filter(({ value }) => {
+          if (!isAgenticOnlyXAxisMode(value)) return true;
+          // Before mount, render all buttons so SSR and first client render match.
+          if (!mounted) return true;
+          return isAgenticSequence;
+        }).map(({ value, label }) => {
+          const isActive = selectedXAxisMode === value;
+          return (
+            <button
+              key={value}
+              type="button"
+              role="tab"
+              aria-selected={isActive}
+              data-testid={`x-axis-mode-${value}`}
+              onClick={() => {
+                setSelectedXAxisMode(value);
+                track('latency_x_axis_mode_selected', { mode: value });
+              }}
+              className={`min-w-[160px] flex-1 sm:flex-initial rounded-full border-2 px-6 py-3 text-base font-semibold transition-colors ${
+                isActive
+                  ? 'border-primary bg-primary text-primary-foreground shadow-sm'
+                  : 'border-border bg-card text-foreground hover:border-primary/60 hover:bg-accent'
+              }`}
+            >
+              {label}
+            </button>
+          );
+        })}
+      </section>
       <div className="flex flex-col gap-4">{displayGraphs}</div>
 
       {/* Performance Over Time — Modal Drill-Down */}
diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index df22b8f5..a8cfed48 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -12,6 +12,7 @@ import { getChartWatermark } from '@/lib/data-mappings';
 import { generateGpuDateColors } from '@/lib/dynamic-colors';
 import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
 import { useThemeColors } from '@/hooks/useThemeColors';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
 import { D3Chart } from '@/lib/d3-chart/D3Chart';
 import type {
   CustomLayerConfig,
@@ -26,6 +27,7 @@ import {
   formatLargeNumber,
   getShapeKeyForPrecision,
   logTickFormat,
+  POINT_SIZE,
 } from '@/lib/chart-rendering';
 import {
   paretoFrontLowerLeft,
@@ -259,6 +261,20 @@ const GPUGraph = React.memo(
       return pts;
     }, [groupedData, activeDates, hideNonOptimal, optimalPointKeys]);
 
+    // GPU comparison currently renders official DB-backed points only. Unofficial
+    // overlays have no benchmark_results id or persisted trace, so they cannot
+    // open the dedicated per-point charts route.
+    const agenticIds = useMemo(
+      () =>
+        filteredData.flatMap((point) =>
+          point.benchmark_type === 'agentic_traces' && typeof point.id === 'number'
+            ? [point.id]
+            : [],
+        ),
+      [filteredData],
+    );
+    const { data: traceAvailability } = useTraceAvailability(agenticIds);
+
     // Warning annotations for visible series with known upstream issues —
     // same treatment the scatter view gets, applied to the date-comparison view.
     // Lines here are colored per (gpu, date) pair, so take the first active
@@ -755,7 +771,11 @@ const GPUGraph = React.memo(
             config: {
               getColor,
               hideLabels: !showPointLabels,
-              getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+              // Match ScatterGraph: append the concurrency (C=) to the
+              // parallelism/tp label so compare-mode points are annotated the
+              // same way as the single-run scatter chart.
+              getLabelText: (d) =>
+                useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
               foreground: 'var(--foreground)',
               dataAttrs: {
                 series: (d) => `${d.date}_${d.hwKey}`,
@@ -794,6 +814,7 @@ const GPUGraph = React.memo(
               selectedYAxisMetric,
               hardwareConfig,
               runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+              hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
             }),
           getRulerX: (d, xScale) => (xScale as d3.ScaleLinear<number, number>)(d.x),
           getRulerY: (d, yScale) => (yScale as d3.ScaleLinear<number, number>)(d.y),
@@ -807,6 +828,37 @@ const GPUGraph = React.memo(
               sel.select('.visible-shape') as any,
               getShapeKeyForPrecision(d.precision, selectedPrecisions),
             ),
+          onPointClick: (d: InferenceData) => {
+            track('gpu_timeseries_data_point_clicked', {
+              id: d.id,
+              hw: String(d.hwKey),
+              x: d.x,
+              y: d.y,
+            });
+            const tooltipEl = chartRef.current?.getTooltipElement();
+            if (!tooltipEl) return;
+            const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+            if (!viewBtn || typeof d.id !== 'number') return;
+            viewBtn.addEventListener('click', (event) => {
+              event.stopPropagation();
+              track('gpu_timeseries_view_charts_opened', {
+                id: d.id,
+                hwKey: String(d.hwKey),
+                conc: d.conc,
+              });
+            });
+            // Pinning updates D3Chart's React state. GPU comparison rebuilds
+            // several inline layer configs on that render, whose cleanup can
+            // briefly hide the otherwise-pinned portal tooltip. Restore its
+            // pinned visibility after that render settles.
+            requestAnimationFrame(() => {
+              const pinnedTooltip = chartRef.current?.getTooltipElement();
+              if (!pinnedTooltip || chartRef.current?.getPinnedPoint() !== d) return;
+              pinnedTooltip.style.opacity = '1';
+              pinnedTooltip.style.display = 'block';
+              pinnedTooltip.style.pointerEvents = 'auto';
+            });
+          },
           attachToLayer: 1,
         }}
         onRender={(ctx: RenderContext) => {
@@ -819,6 +871,28 @@ const GPUGraph = React.memo(
           }
           // Set foreground color on scatter point labels
           ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)');
+
+          // Offload halo: dashed ring on every point that used KV offload
+          // (mirrors ScatterGraph so compare mode shows the same CPU-offload
+          // indicator). The ring is a child of the dot-group, so it travels
+          // with the point on zoom/pan without a separate onZoom pass.
+          ctx.layout.zoomGroup
+            .selectAll<SVGGElement, InferenceData>('.dot-group')
+            .each(function (d) {
+              const showHalo = d.offload_mode === 'on';
+              d3.select(this)
+                .selectAll<SVGCircleElement, boolean>('.offload-halo')
+                .data(showHalo ? [true] : [])
+                .join('circle')
+                .attr('class', 'offload-halo')
+                .attr('r', POINT_SIZE + 4)
+                .attr('fill', 'none')
+                .attr('stroke', 'var(--foreground)')
+                .attr('stroke-width', 1.5)
+                .attr('stroke-dasharray', '3 2')
+                .attr('opacity', 0.9)
+                .attr('pointer-events', 'none');
+            });
         }}
         legendElement={
           <ChartLegend
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx b/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx
index 2fd42acb..fac038e3 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx
@@ -33,6 +33,13 @@ vi.mock('@/components/unofficial-run-provider', () => ({
   useUnofficialRun: () => overlayState.current,
 }));
 
+// ScatterGraph calls useTraceAvailability (a useQuery) for the agentic "View
+// charts" tooltip button. Stub it so these decoration tests don't need a
+// QueryClientProvider — trace presence is irrelevant to the toggle path.
+vi.mock('@/hooks/api/use-trace-availability', () => ({
+  useTraceAvailability: () => ({ data: undefined }),
+}));
+
 import ScatterGraph from './ScatterGraph';
 
 // ── Environment stubs ────────────────────────────────────────────────────────
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 64a8b218..fe4ca820 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -6,6 +6,7 @@ import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef } from
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
 import { pointNearestX } from '@/components/inference/ui/line-label-anchor';
 import {
   labelOpacityForActiveState,
@@ -15,7 +16,12 @@ import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import { getHardwareConfig, getModelSortIndex } from '@/lib/constants';
-import { getChartWatermark, getPrecisionLabel, type Precision } from '@/lib/data-mappings';
+import {
+  getChartWatermark,
+  getPrecisionLabel,
+  type Precision,
+  Sequence,
+} from '@/lib/data-mappings';
 import { matchKnownConfigIssues, pointMatchesIssue } from '@/lib/known-issues';
 import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
 import { D3Chart } from '@/lib/d3-chart/D3Chart';
@@ -44,12 +50,7 @@ import {
   getShapeKeyForPrecision,
 } from '@/lib/chart-rendering';
 import { useThemeColors } from '@/hooks/useThemeColors';
-import {
-  paretoFrontLowerLeft,
-  paretoFrontLowerRight,
-  paretoFrontUpperLeft,
-  paretoFrontUpperRight,
-} from '@/lib/chart-utils';
+import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils';
 import { type RooflineDirection, getSpeedOverlayCorners } from '@/lib/speed-overlay';
 import type {
   ChartDefinition,
@@ -76,6 +77,96 @@ import {
 } from '@/components/inference/utils/knownIssueAnnotations';
 import { matchesQuickFilters } from '@/components/inference/utils/quickFilters';
 
+// Greedy label-collision avoidance.
+// Each candidate is the y-position of the FIRST baseline (relative to point
+// center) which we apply via the first tspan's `dy` — later tspans cascade
+// down by 1.1em. We try above/below at primary and secondary offsets, and
+// hide the label if all four positions collide.
+function avoidLabelCollisions(
+  zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
+): void {
+  interface LabelInfo {
+    el: SVGTextElement;
+    firstTspan: SVGTSpanElement;
+    cx: number;
+    cy: number;
+    w: number;
+    nLines: number;
+    defaultFirstY: number;
+  }
+  const labels: LabelInfo[] = [];
+  const ASCENT = 9;
+  const DESCENT = 3;
+  const LINE_H = 11;
+
+  zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
+    const labelEl = this.querySelector<SVGTextElement>('.point-label');
+    if (!labelEl) return;
+    if ((this as SVGGElement).style.opacity === '0') return;
+    const tspans = labelEl.querySelectorAll<SVGTSpanElement>('tspan');
+    if (tspans.length === 0) return;
+    const transform = (this as SVGGElement).getAttribute('transform') ?? '';
+    const m = transform.match(/translate\((?<tx>[^,]+),(?<ty>[^)]+)\)/u);
+    if (!m) return;
+    const cx = parseFloat(m[1]);
+    const cy = parseFloat(m[2]);
+    const nLines = tspans.length;
+    const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point
+    // Reset to default before measuring so prior positioning doesn't bias bbox
+    tspans[0].setAttribute('dy', `${defaultFirstY}px`);
+    labelEl.style.opacity = '1';
+    const bbox = labelEl.getBBox();
+    labels.push({
+      el: labelEl,
+      firstTspan: tspans[0],
+      cx,
+      cy,
+      w: bbox.width,
+      nLines,
+      defaultFirstY,
+    });
+  });
+
+  labels.sort((a, b) => a.cx - b.cx);
+  const placed: { left: number; right: number; top: number; bottom: number }[] = [];
+  const pad = 2;
+
+  for (const lab of labels) {
+    const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT;
+    const aboveFirstY = lab.defaultFirstY;
+    const belowFirstY = 14; // first baseline 14px below point center
+    const candidates = [
+      aboveFirstY,
+      belowFirstY,
+      aboveFirstY - blockH - 2,
+      belowFirstY + blockH + 2,
+    ];
+    let chosenY: number | null = null;
+    let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
+    for (const firstY of candidates) {
+      const top = lab.cy + firstY - ASCENT - pad;
+      const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad;
+      const left = lab.cx - lab.w / 2 - pad;
+      const right = lab.cx + lab.w / 2 + pad;
+      const collides = placed.some(
+        (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
+      );
+      if (!collides) {
+        chosenY = firstY;
+        chosenBox = { left, right, top, bottom };
+        break;
+      }
+    }
+    if (chosenY !== null && chosenBox) {
+      lab.firstTspan.setAttribute('dy', `${chosenY}px`);
+      lab.el.style.opacity = '1';
+      placed.push(chosenBox);
+    } else {
+      lab.el.style.opacity = '0';
+    }
+  }
+}
+
 // X-shape path for overlay (unofficial) data points
 const X_SIZE = 5;
 const X_HOVER_SIZE = 7;
@@ -108,6 +199,32 @@ const formatChangelogDescription = (desc: string | string[]): React.JSX.Element
 
 const CHART_MARGIN = { top: 24, right: 10, bottom: 60, left: 60 };
 
+/**
+ * Bucket points by their (requested) date. Comparison overlays put multiple
+ * dates under one legend key, and rooflines / gradient paths must never span
+ * dates — a May 15 point can't dominate a May 17 plot.
+ */
+function groupPointsByDate(points: InferenceData[]): Map<string, InferenceData[]> {
+  const byDate = new Map<string, InferenceData[]>();
+  for (const p of points) {
+    let bucket = byDate.get(p.date);
+    if (!bucket) {
+      bucket = [];
+      byDate.set(p.date, bucket);
+    }
+    bucket.push(p);
+  }
+  return byDate;
+}
+
+/** Identity key for "is this point on a roofline" lookups (scoped per date). */
+const optimalPointKey = (d: InferenceData): string =>
+  `${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`;
+
+/** Point label lines: TP (or full parallelism label) plus the C= concurrency. */
+const pointLabelText = (d: InferenceData, advanced: boolean): string =>
+  advanced ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`;
+
 // Referentially stable "no overlay data" result (see processedOverlayData).
 const EMPTY_OVERLAY_DATA: InferenceData[] = [];
 
@@ -214,6 +331,8 @@ const ScatterGraph = React.memo(
       trackedConfigs,
       addTrackedConfig,
       removeTrackedConfig,
+      selectedXAxisMode,
+      selectedSequence,
       quickFilters,
     } = useInference();
 
@@ -289,10 +408,18 @@ const ScatterGraph = React.memo(
       () => [...effectiveOfficialHwTypes],
       [effectiveOfficialHwTypes],
     );
+    // High-contrast palette is keyed off the FULL set of official hw types with
+    // data, not the active subset. Otherwise deselecting a line shrinks the key
+    // set, which re-sizes the iwanthue palette and shifts every remaining line's
+    // hue (most visible for single-vendor agentic runs that span the full wheel —
+    // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the
+    // stable full set fixes each hw's color so toggling only hides/shows lines.
+    const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]);
     const { resolveColor, getCssColor } = useThemeColors({
       highContrast,
       identifiers: activeHwKeys,
       activeKeys: activeOfficialKeys,
+      hcKeys: stableHcKeys,
     });
 
     // --- Changelog ---
@@ -328,34 +455,40 @@ const ScatterGraph = React.memo(
     );
 
     const rooflines = useMemo(() => {
+      // Frontier scope is (hw, precision, date) — points from different dates
+      // can never share a frontier (a May 15 point can't dominate a May 17 plot).
+      // The legend grouping is still by (hw, precision); we just split the
+      // pareto compute per date and re-merge into the legend bucket.
       const result: Record<string, InferenceData[]> = {};
       const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
-      const dir = chartDefinition[rooflineKey] as
-        | 'upper_right'
-        | 'upper_left'
-        | 'lower_left'
-        | 'lower_right'
-        | undefined;
-      for (const hw of Object.keys(groupedData)) {
-        const front =
-          dir === 'upper_right'
-            ? paretoFrontUpperRight(groupedData[hw])
-            : dir === 'upper_left'
-              ? paretoFrontUpperLeft(groupedData[hw])
-              : dir === 'lower_left'
-                ? paretoFrontLowerLeft(groupedData[hw])
-                : paretoFrontLowerRight(groupedData[hw]);
-        front.sort((a, b) => a.x - b.x);
-        result[hw] = front;
+      const dir = chartDefinition[rooflineKey] as ParetoDirection | undefined;
+      const frontierFn = paretoFrontForDirection(dir ?? 'lower_right');
+      for (const hwKey of Object.keys(groupedData)) {
+        const combined: InferenceData[] = [];
+        for (const datePoints of groupPointsByDate(groupedData[hwKey]).values()) {
+          // In non-e2e xmodes, useChartData stamps every point with an
+          // `isOnE2eFrontier` flag so the line is restricted to the
+          // e2e-Pareto winners — same set of points across every chart,
+          // just re-plotted at the chosen x metric. When the flag is
+          // present on ANY point in the bucket, narrow to the winners
+          // before paretoing (otherwise we'd recompute a fresh frontier
+          // on the swapped x axis and reintroduce the benchmark hack).
+          const flagged = datePoints.some((p) => p.isOnE2eFrontier !== undefined);
+          const seedPoints = flagged
+            ? datePoints.filter((p) => p.isOnE2eFrontier === true)
+            : datePoints;
+          if (seedPoints.length === 0) continue;
+          combined.push(...frontierFn(seedPoints));
+        }
+        combined.sort((a, b) => a.x - b.x);
+        result[hwKey] = combined;
       }
       return result;
     }, [groupedData, selectedYAxisMetric, chartDefinition]);
 
     const optimalPointKeys = useMemo(() => {
       const keys = new Set<string>();
-      Object.values(rooflines).forEach((pts) =>
-        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)),
-      );
+      Object.values(rooflines).forEach((pts) => pts.forEach((p) => keys.add(optimalPointKey(p))));
       return keys;
     }, [rooflines]);
 
@@ -381,6 +514,10 @@ const ScatterGraph = React.memo(
     const buildPointConfigId = useCallback((point: InferenceData): string => {
       let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`;
       if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`;
+      // Agentic runs emit two rows per (config, conc) — one offload=on, one off.
+      // Without this suffix, d3's data join treats them as the same point and
+      // drops one variant (along with its halo).
+      if (point.offload_mode) key += `|offload-${point.offload_mode}`;
       return key;
     }, []);
 
@@ -454,22 +591,11 @@ const ScatterGraph = React.memo(
         {} as Record<string, Entry>,
       );
       const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
-      const dir = chartDefinition[rooflineKey] as
-        | 'upper_right'
-        | 'upper_left'
-        | 'lower_left'
-        | 'lower_right'
-        | undefined;
+      const dir = chartDefinition[rooflineKey] as ParetoDirection | undefined;
+      const frontierFn = paretoFrontForDirection(dir ?? 'lower_right');
       const result: Record<string, Entry> = {};
       for (const [key, group] of Object.entries(grouped)) {
-        const front =
-          dir === 'upper_right'
-            ? paretoFrontUpperRight(group.points)
-            : dir === 'upper_left'
-              ? paretoFrontUpperLeft(group.points)
-              : dir === 'lower_left'
-                ? paretoFrontLowerLeft(group.points)
-                : paretoFrontLowerRight(group.points);
+        const front = frontierFn(group.points);
         front.sort((a, b) => a.x - b.x);
         result[key] = { hwKey: group.hwKey, runIndex: group.runIndex, points: front };
       }
@@ -479,6 +605,20 @@ const ScatterGraph = React.memo(
     // All official points for rendering (unfiltered — visibility via opacity)
     const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
 
+    // Bulk presence lookup for agentic points: which ids have a stored
+    // trace_replay blob → controls the "View charts" button in the pinned
+    // tooltip. We deliberately don't fetch the histograms themselves here;
+    // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through
+    // Neon's HTTP API and trip its 64 MB per-response cap.
+    const agenticIds = useMemo(() => {
+      const ids: number[] = [];
+      for (const p of pointsData) {
+        if (p.benchmark_type === 'agentic_traces' && typeof p.id === 'number') ids.push(p.id);
+      }
+      return ids;
+    }, [pointsData]);
+    const { data: traceAvailability } = useTraceAvailability(agenticIds);
+
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
       const globalLabelColorMap = new Map<string, string>();
@@ -518,7 +658,7 @@ const ScatterGraph = React.memo(
     const visiblePoints = useMemo(() => {
       let pts = filteredData;
       if (hideNonOptimal) {
-        pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`));
+        pts = pts.filter((d) => optimalPointKeys.has(optimalPointKey(d)));
       }
       return processedOverlayData.length > 0 ? [...pts, ...processedOverlayData] : pts;
     }, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]);
@@ -607,7 +747,7 @@ const ScatterGraph = React.memo(
       (d: InferenceData) =>
         effectiveActiveHwTypes.has(d.hwKey as string) &&
         selectedPrecisions.includes(d.precision) &&
-        (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)),
+        (!hideNonOptimal || optimalPointKeys.has(optimalPointKey(d))),
       [effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys],
     );
 
@@ -755,6 +895,7 @@ const ScatterGraph = React.memo(
               d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any,
             );
           }
+          avoidLabelCollisions(ctx.layout.zoomGroup);
         },
       }),
       [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type],
@@ -774,6 +915,7 @@ const ScatterGraph = React.memo(
             hardwareConfig,
             isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
             runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+            hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
           }),
         getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
         getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -789,26 +931,39 @@ const ScatterGraph = React.memo(
           ),
         onPointClick: (d: InferenceData) => {
           track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y });
-          // Attach track-over-time button handler in the tooltip
           const tooltipEl = chartRef.current?.getTooltipElement();
-          if (tooltipEl) {
-            const btn = tooltipEl.querySelector('[data-action="track-over-time"]');
-            if (btn) {
-              btn.addEventListener('click', (btnEvent) => {
-                btnEvent.stopPropagation();
-                const configId = buildPointConfigId(d);
-                if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
-                else addTrackedConfig(d, chartDefinition.chartType);
-                chartRef.current?.dismissTooltip();
-                chartRef.current?.hideTooltip();
-                track('latency_point_tracked_via_tooltip', {
-                  hwKey: String(d.hwKey),
-                  tp: d.tp,
-                  conc: d.conc,
-                  precision: d.precision,
-                });
+          if (!tooltipEl) return;
+
+          // ── Summary-page actions ──────────────────────────────────────────
+          const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]');
+          if (trackBtn) {
+            trackBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              const configId = buildPointConfigId(d);
+              if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
+              else addTrackedConfig(d, chartDefinition.chartType);
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              track('latency_point_tracked_via_tooltip', {
+                hwKey: String(d.hwKey),
+                tp: d.tp,
+                conc: d.conc,
+                precision: d.precision,
               });
-            }
+            });
+          }
+
+          // ── "View charts" real link (supports browser open-in-new-tab) ───
+          const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+          if (viewBtn && typeof d.id === 'number') {
+            viewBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              track('latency_view_charts_opened', {
+                id: d.id,
+                hwKey: String(d.hwKey),
+                conc: d.conc,
+              });
+            });
           }
         },
         attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0)
@@ -822,6 +977,11 @@ const ScatterGraph = React.memo(
         addTrackedConfig,
         removeTrackedConfig,
         chartDefinition.chartType,
+        // selectedPrecisions is read via interactionRef.current in the hover
+        // handlers, so it isn't a dep. traceAvailability IS read directly in the
+        // tooltip content closure (the "View charts" button), so rebuild the
+        // config when the presence fetch resolves.
+        traceAvailability,
       ],
     );
 
@@ -876,35 +1036,56 @@ const ScatterGraph = React.memo(
             const precision = key.split('_').pop()!;
             const visible =
               ir.effectiveActiveHwTypes.has(hw) && ir.selectedPrecisions.includes(precision);
-            let stroke = ir.getCssColor(ir.resolveColor(hw));
-
-            if (showGradientLabels) {
-              const pointLabels = allPointLabelsByKey[key];
-              if (pointLabels) {
-                const stops = computeGradientStops(pointLabels, xScale);
-                if (stops) {
-                  const gid = `roofline-gradient-${chartId}-${key}`;
-                  activeGradientIds.add(gid);
-                  let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
-                  if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
-                  gradient
-                    .attr('gradientUnits', 'userSpaceOnUse')
-                    .attr('x1', xScale(pts[0].x))
-                    .attr('y1', 0)
-                    .attr('x2', xScale(pts.at(-1)!.x))
-                    .attr('y2', 0);
-                  gradient
-                    .selectAll('stop')
-                    .data(stops)
-                    .join('stop')
-                    .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
-                    .attr('stop-color', (s) => s.color);
-                  stroke = `url(#${gid})`;
+            const baseStroke = ir.getCssColor(ir.resolveColor(hw));
+
+            // Split into per-date sub-paths so the line never crosses dates.
+            // (When only one date is present the loop runs once with the full set.)
+            const byDate = groupPointsByDate(pts);
+            const singleDate = byDate.size === 1;
+
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length <= 1) continue;
+              const entryKey = singleDate ? key : `${key}__${date}`;
+              let stroke = baseStroke;
+
+              // Gradient labels only apply in the single-date case; mapping the
+              // (key-wide) ParetoPointLabel array onto per-date sub-segments is
+              // ambiguous and the comparison-date overlay is a rare combo.
+              if (singleDate && showGradientLabels) {
+                const pointLabels = allPointLabelsByKey[key];
+                if (pointLabels) {
+                  const stops = computeGradientStops(pointLabels, xScale);
+                  if (stops) {
+                    const gid = `roofline-gradient-${chartId}-${entryKey}`;
+                    activeGradientIds.add(gid);
+                    let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
+                    if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
+                    gradient
+                      .attr('gradientUnits', 'userSpaceOnUse')
+                      .attr('x1', xScale(datePoints[0].x))
+                      .attr('y1', 0)
+                      .attr('x2', xScale(datePoints.at(-1)!.x))
+                      .attr('y2', 0);
+                    gradient
+                      .selectAll('stop')
+                      .data(stops)
+                      .join('stop')
+                      .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
+                      .attr('stop-color', (s) => s.color);
+                    stroke = `url(#${gid})`;
+                  }
                 }
               }
-            }
 
-            entries.push({ key, hw, precision, points: pts, stroke, visible });
+              entries.push({
+                key: entryKey,
+                hw,
+                precision,
+                points: datePoints,
+                stroke,
+                visible,
+              });
+            }
           });
 
           // Remove stale gradients
@@ -1346,11 +1527,18 @@ const ScatterGraph = React.memo(
             .y((d) => newYScale(d.y))
             .curve(d3.curveMonotoneX);
 
-          // Update roofline paths
+          // Update roofline paths — must split per-date so the zoom redraw
+          // matches the per-date sub-paths created in the initial render.
           Object.entries(rooflines).forEach(([key, pts]) => {
             if (pts.length < 2) return;
-            const sel = zoomGroup.select<SVGPathElement>(`.roofline-${key}`);
-            if (!sel.empty()) sel.attr('d', lineGen(pts) as string);
+            const byDate = groupPointsByDate(pts);
+            const singleDate = byDate.size === 1;
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length < 2) continue;
+              const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`;
+              const sel = zoomGroup.select<SVGPathElement>(`.${CSS.escape(cls)}`);
+              if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string);
+            }
           });
 
           // Update gradient coordinates
@@ -1578,7 +1766,8 @@ const ScatterGraph = React.memo(
           getOpacity: (d) => (interactionRef.current.isPointVisible(d) ? 1 : 0),
           getPointerEvents: (d) => (interactionRef.current.isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: !showPointLabels || showGradientLabels,
-          getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+          // Keep the concurrency (C=) annotation from the agentx scatter labels.
+          getLabelText: (d) => pointLabelText(d, useAdvancedLabels),
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1679,17 +1868,26 @@ const ScatterGraph = React.memo(
               // Labels
               const showLabels = showPointLabels && !showGradientLabels;
               overlayPoints.each(function (d) {
-                d3.select(this)
+                const lines = showLabels ? pointLabelText(d, useAdvancedLabels).split('\n') : [];
+                const text = d3
+                  .select(this)
                   .selectAll<SVGTextElement, boolean>('.overlay-label')
                   .data(showLabels ? [true] : [])
                   .join('text')
                   .attr('class', 'overlay-label')
-                  .attr('dy', -10)
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
-                  .attr('pointer-events', 'none')
-                  .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp));
+                  .attr('font-weight', '700')
+                  .attr('pointer-events', 'none');
+                const firstDy = -(1 + (lines.length - 1) * 1.1);
+                text
+                  .selectAll<SVGTSpanElement, string>('tspan')
+                  .data(lines)
+                  .join('tspan')
+                  .attr('x', 0)
+                  .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
+                  .text((l) => l);
               });
 
               // Overlay tooltip handlers
@@ -2007,6 +2205,23 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
+        // Offload halo: dashed ring on every point that used KV offload (Pareto or not)
+        zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
+          const showHalo = d.offload_mode === 'on';
+          d3.select(this)
+            .selectAll<SVGCircleElement, boolean>('.offload-halo')
+            .data(showHalo ? [true] : [])
+            .join('circle')
+            .attr('class', 'offload-halo')
+            .attr('r', POINT_SIZE + 4)
+            .attr('fill', 'none')
+            .attr('stroke', 'var(--foreground)')
+            .attr('stroke-width', 1.5)
+            .attr('stroke-dasharray', '3 2')
+            .attr('opacity', 0.9)
+            .attr('pointer-events', 'none');
+        });
+
         // Double-click to track/untrack
         zoomGroup
           .selectAll<SVGGElement, InferenceData>('.dot-group')
@@ -2041,6 +2256,8 @@ const ScatterGraph = React.memo(
             });
           });
 
+        avoidLabelCollisions(zoomGroup);
+
         // Log tick formatting on initial render
         if (xScaleConfig._isLog) {
           const xScale = ctx.xScale as d3.ScaleLogarithmic<number, number>;
@@ -2063,6 +2280,9 @@ const ScatterGraph = React.memo(
         chartDefinition.chartType,
         xScaleConfig._isLog,
         yScaleConfig.type,
+        optimalPointKeys,
+        getCssColor,
+        resolveColor,
       ],
     );
 
@@ -2373,6 +2593,17 @@ const ScatterGraph = React.memo(
                   setHideNonOptimal(checked);
                   track('latency_hide_non_optimal_toggled', { enabled: checked });
                 },
+                // On agentic + non-e2e chart, "optimal" means "on the
+                // e2e-latency Pareto frontier" (not a per-axis Pareto on the
+                // current x metric). Explain that so users don't wonder why
+                // a point sitting above the line is still considered
+                // dominated.
+                ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
+                  ? {
+                      infoTooltip:
+                        "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
+                    }
+                  : {}),
               },
               {
                 id: 'scatter-point-labels',
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index 799854d7..f18903ea 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
                           `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
                         ]
                       }{' '}
-                      {graph.chartDefinition[
-                        `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                      ] || graph.chartDefinition.heading}
+                      {graph.chartDefinition.heading}
                     </h2>
                     <p className="text-sm text-muted-foreground mb-2">
                       {graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..7d5b1482 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -1,7 +1,26 @@
 import { describe, it, expect } from 'vitest';
 
 import type { ChartDefinition, InferenceData } from '@/components/inference/types';
-import { filterDataByCostLimit, processOverlayChartData } from '@/components/inference/utils';
+import {
+  filterDataByCostLimit,
+  processOverlayChartData,
+  selectUnofficialOverlayForMode,
+} from '@/components/inference/utils';
+
+describe('selectUnofficialOverlayForMode', () => {
+  const overlays = { e2e: { id: 'e2e' }, interactivity: { id: 'interactivity' } };
+
+  it('suppresses raw unofficial E2E data for normalized E2E mode', () => {
+    expect(selectUnofficialOverlayForMode('normalized-e2e', 'e2e', overlays)).toBeNull();
+  });
+
+  it('preserves matching unofficial overlays for supported modes', () => {
+    expect(selectUnofficialOverlayForMode('e2e', 'e2e', overlays)).toBe(overlays.e2e);
+    expect(selectUnofficialOverlayForMode('interactivity', 'interactivity', overlays)).toBe(
+      overlays.interactivity,
+    );
+  });
+});
 
 // ---------------------------------------------------------------------------
 // fixture factories
@@ -157,12 +176,12 @@ describe('processOverlayChartData', () => {
   });
 
   it('remaps x to config override for input metrics on interactivity chart', () => {
-    // inputTputPerGpu has x override to p99_ttft on interactivity chart
+    // inputTputPerGpu has x override to p90_ttft on interactivity chart
     const data = [
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_intvty: 50,
       } as any),
     ];
@@ -176,16 +195,11 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        median_ttft: 0.1,
+        p90_ttft: 0.1,
         median_intvty: 50,
       } as any),
     ];
-    const result = processOverlayChartData(
-      data,
-      'interactivity',
-      'y_inputTputPerGpu',
-      'median_ttft',
-    );
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.1);
   });
@@ -195,76 +209,62 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_e2el: 2.5,
       } as any),
     ];
     const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
     expect(result).toHaveLength(1);
-    // e2e uses median_e2el as x (from chart config default), not p99_ttft
+    // e2e uses median_e2el as x (from chart config default), not p90_ttft
     expect(result[0].x).toBe(2.5);
   });
 
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
-    const data = [
-      pt({
-        x: 100,
-        tpPerGpu: { y: 42, roof: false },
-        p99_ttft: 0.35,
-        median_e2el: 2.5,
-      } as any),
-    ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
-    expect(result).toHaveLength(1);
-    expect(result[0].x).toBe(0.35);
-  });
-
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
     const data = [
       pt({
         x: 100,
         tpPerGpu: { y: 42, roof: false },
-        median_ttft: 0.12,
+        p90_ttft: 0.12,
         median_e2el: 2.5,
       } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.12);
   });
 
   it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
     const data = [
-      pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
-      pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+      pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+      pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
 
   it('does not filter interactivity points by latency limit when x-axis is default', () => {
-    // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+    // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
     // chart's x-axis stays median_intvty for non-input metrics. The latency limit
     // (60) must NOT apply to median_intvty values.
     const data = [
       pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
       pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(2);
   });
 
   it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
-    // When an input metric IS selected and x-axis overrides to p99_ttft,
+    // When an input metric IS selected and x-axis overrides to p90_ttft,
     // the latency limit should apply.
     const data = [
-      pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
-      pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+      pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+      pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
-    // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+    // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..f6ebd0f8 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -8,6 +8,20 @@ import chartDefinitions from '@/components/inference/inference-chart-config.json
 
 import type { ChartDefinition, InferenceData, YAxisMetricKey } from './types';
 
+/**
+ * Select the matching unofficial-run overlay for a chart mode. Normalized E2E
+ * is intentionally excluded: unofficial benchmark rows do not include the
+ * persisted per-request trace needed to normalize before taking percentiles.
+ */
+export function selectUnofficialOverlayForMode<T>(
+  xAxisMode: string,
+  chartType: 'e2e' | 'interactivity',
+  overlays: { e2e: T | null; interactivity: T | null },
+): T | null {
+  if (xAxisMode === 'normalized-e2e') return null;
+  return overlays[chartType];
+}
+
 /**
  * Filters data points based on cost limits defined in the chart definition.
  * Only applies filtering for cost-related metrics, and only filters based on
@@ -75,11 +89,13 @@ export function processOverlayChartData(
   chartType: 'e2e' | 'interactivity',
   selectedYAxisMetric: string,
   selectedXAxisMetric: string | null,
+  options?: { isAgentic?: boolean },
 ): InferenceData[] {
   const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
   if (!chartDef) return [];
 
   const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+  const isAgentic = options?.isAgentic === true;
 
   // Resolve x-axis field (must match useChartData logic)
   const metricTitle =
@@ -87,9 +103,11 @@ export function processOverlayChartData(
   const isInputMetric = metricTitle.toLowerCase().includes('input');
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
-  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
+  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+  // Match any *_ttft metric — the x-axis-mode picker can now select any
+  // percentile (median/p75/p90/p99) depending on sequence kind.
   const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+    typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
@@ -109,7 +127,12 @@ export function processOverlayChartData(
     })
     .filter(
       (d) =>
-        xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+        // Skip the latency limit for the natural x-axis or for agentic
+        // (long TTFTs are normal there, not overload outliers).
+        xAxisField === chartDef.x ||
+        isAgentic ||
+        !chartDef.y_latency_limit ||
+        d.x <= chartDef.y_latency_limit,
     );
 
   return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);
diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts
new file mode 100644
index 00000000..aaf715d3
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+
+import { configSegmentLabel, parallelismLabel } from './parallelism-label';
+
+describe('configSegmentLabel', () => {
+  it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => {
+    expect(configSegmentLabel(8, 8, false)).toBe('TEP8');
+    expect(configSegmentLabel(8, 8, true)).toBe('DEP8');
+  });
+
+  it('uses EP / DPAEP when ep>1 and tp!==ep', () => {
+    expect(configSegmentLabel(4, 16, false)).toBe('EP16');
+    expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16');
+  });
+
+  it('uses TP / DPATP when ep<=1 or absent', () => {
+    expect(configSegmentLabel(8, 1, false)).toBe('TP8');
+    expect(configSegmentLabel(8, undefined, false)).toBe('TP8');
+    expect(configSegmentLabel(8, 1, true)).toBe('DPATP8');
+  });
+});
+
+describe('parallelismLabel', () => {
+  it('falls back to bare tp when no ep data', () => {
+    expect(parallelismLabel({ tp: 8 })).toBe('8');
+  });
+
+  it('labels a single-segment config', () => {
+    expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8');
+    expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8');
+  });
+
+  it('builds multinode-disagg per-role worker segments', () => {
+    expect(
+      parallelismLabel({
+        tp: 8,
+        ep: 4,
+        disagg: true,
+        isMultinode: true,
+        prefillTp: 4,
+        prefillEp: 4,
+        prefillDpAttention: false,
+        prefillNumWorkers: 2,
+        decodeTp: 8,
+        decodeEp: 8,
+        decodeDpAttention: true,
+        decodeNumWorkers: 1,
+      }),
+    ).toBe('2xTEP4+1xDEP8');
+  });
+
+  it('single-node disagg uses the single (decode) segment, not worker syntax', () => {
+    // is_multinode false → no "NxPrefill+MxDecode" expansion.
+    expect(
+      parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }),
+    ).toBe('TEP8');
+  });
+});
diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts
new file mode 100644
index 00000000..98207110
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.ts
@@ -0,0 +1,79 @@
+/**
+ * Shared parallelism-config labeling — the single source of truth for the
+ * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels.
+ *
+ * Used by the scatter/GPU chart point labels (via getPointLabel) and the
+ * agentic detail page's sibling navigator chips, so both surfaces describe a
+ * config identically.
+ */
+
+/**
+ * Generates a short config segment label from parallelism params.
+ * - tp == ep and dp-attn false: "TEP{N}"
+ * - tp == ep and dp-attn true: "DEP{N}"
+ * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
+ * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
+ */
+export const configSegmentLabel = (
+  tp: number,
+  ep: number | undefined,
+  dpAttention: boolean | undefined,
+): string => {
+  if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
+    return dpAttention ? `DEP${tp}` : `TEP${tp}`;
+  }
+  const dpaPrefix = dpAttention ? 'DPA' : '';
+  if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
+  return `${dpaPrefix}EP${ep}`;
+};
+
+/** Parallelism params for one benchmark config, framework-agnostic. */
+export interface ParallelismFields {
+  tp: number;
+  ep?: number;
+  dpAttention?: boolean;
+  disagg?: boolean;
+  isMultinode?: boolean;
+  prefillTp?: number;
+  prefillEp?: number;
+  prefillDpAttention?: boolean;
+  prefillNumWorkers?: number;
+  decodeTp?: number;
+  decodeEp?: number;
+  decodeDpAttention?: boolean;
+  decodeNumWorkers?: number;
+}
+
+/**
+ * Returns the short parallelism label for a config.
+ * - No EP data (old rows): falls back to the bare tp value (e.g. "8").
+ * - Multinode disagg: per-role segments with worker counts,
+ *   e.g. "2xEP4+1xDPAEP32".
+ * - Otherwise: a single segment from (tp, ep, dpAttention).
+ */
+export const parallelismLabel = (f: ParallelismFields): string => {
+  if (
+    (f.ep === null || f.ep === undefined) &&
+    (f.prefillEp === null || f.prefillEp === undefined)
+  ) {
+    return String(f.tp);
+  }
+
+  if (f.isMultinode && f.disagg) {
+    const prefillLabel = configSegmentLabel(
+      f.prefillTp ?? f.tp,
+      f.prefillEp ?? f.ep,
+      f.prefillDpAttention ?? f.dpAttention,
+    );
+    const decodeLabel = configSegmentLabel(
+      f.decodeTp ?? f.tp,
+      f.decodeEp ?? f.ep,
+      f.decodeDpAttention ?? f.dpAttention,
+    );
+    const pw = f.prefillNumWorkers ?? 1;
+    const dw = f.decodeNumWorkers ?? 1;
+    return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
+  }
+
+  return configSegmentLabel(f.tp, f.ep, f.dpAttention);
+};
diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
index 5a5bd7e9..e4b9d31f 100644
--- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts
+++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
@@ -150,6 +150,15 @@ describe('getPointLabel', () => {
 // generateTooltipContent
 // ===========================================================================
 describe('generateTooltipContent', () => {
+  it('renders View charts as a same-tab anchor so browsers offer open-in-new-tab', () => {
+    const html = generateTooltipContent(
+      tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+    );
+    expect(html).toContain('<a data-action="view-charts"');
+    expect(html).toContain('href="/inference/agentic/1"');
+    expect(html).not.toContain('data-action="view-charts" target=');
+  });
+
   it('includes hardware display label from config', () => {
     const html = generateTooltipContent(tooltipConfig());
     expect(html).toContain('H100');
@@ -365,4 +374,27 @@ describe('generateGPUGraphTooltipContent', () => {
     );
     expect(html).toContain('vllm-v0.6.0<br />abc123');
   });
+
+  it('shows View charts only for pinned points with stored trace data', () => {
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+      ),
+    ).toContain('data-action="view-charts"');
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+      ),
+    ).toContain('href="/inference/agentic/1"');
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: false, hasTrace: true }),
+      ),
+    ).not.toContain('data-action="view-charts"');
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: false }),
+      ),
+    ).not.toContain('data-action="view-charts"');
+  });
 });
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 9143f40f..e3f0de6d 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -1,6 +1,7 @@
 import { formatNumber, getDisplayLabel } from '@/lib/utils';
 
 import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
 
 export interface TooltipConfig {
   /** The data point to display */
@@ -19,6 +20,14 @@ export interface TooltipConfig {
   isTracked?: boolean;
   /** URL to the GitHub Actions workflow run */
   runUrl?: string;
+  /**
+   * Whether this agentic point has a stored trace_replay blob. Controls
+   * visibility of the "View charts" button — the actual distributions are
+   * rendered on the detail page, not inline, so all the tooltip needs is a
+   * presence boolean (sourced from the bulk `/api/v1/trace-availability`
+   * call so we don't ship megabytes of profile JSONL just for this check).
+   */
+  hasTrace?: boolean;
 }
 
 export interface OverlayTooltipConfig extends TooltipConfig {
@@ -26,57 +35,37 @@ export interface OverlayTooltipConfig extends TooltipConfig {
   overlayData: OverlayData;
 }
 
-/**
- * Generates a short config segment label from parallelism params.
- * - tp == ep and dp-attn false: "TEP{N}"
- * - tp == ep and dp-attn true: "DEP{N}"
- * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
- * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
- */
-const configSegmentLabel = (
-  tp: number,
-  ep: number | undefined,
-  dpAttention: boolean | undefined,
-): string => {
-  if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
-    return dpAttention ? `DEP${tp}` : `TEP${tp}`;
-  }
-  const dpaPrefix = dpAttention ? 'DPA' : '';
-  if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
-  return `${dpaPrefix}EP${ep}`;
-};
+// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the
+// transform narrows "true"/"false" → boolean). Coerce to a plain boolean for
+// the shared labeler, treating the legacy string form correctly.
+const asBool = (v: boolean | string | undefined): boolean | undefined =>
+  typeof v === 'string' ? v === 'true' : v;
 
 /**
  * Returns the short label for a data point on the chart.
  * - Non-multinode: e.g. "TP8", "EP8", "TEP8", "DEP8", "DPAEP8"
  * - Multinode disagg: e.g. "2xEP4+1xDPAEP32"
  * - Old data (no ep field): falls back to tp value
+ *
+ * Delegates to the shared {@link parallelismLabel} so the chart points and the
+ * agentic sibling navigator describe a config identically.
  */
-export const getPointLabel = (d: InferenceData): string => {
-  if (
-    (d.ep === null || d.ep === undefined) &&
-    (d.prefill_ep === null || d.prefill_ep === undefined)
-  )
-    return String(d.tp);
-
-  if (d.is_multinode && d.disagg) {
-    const prefillLabel = configSegmentLabel(
-      d.prefill_tp ?? d.tp,
-      d.prefill_ep ?? d.ep,
-      d.prefill_dp_attention ?? d.dp_attention,
-    );
-    const decodeLabel = configSegmentLabel(
-      d.decode_tp ?? d.tp,
-      d.decode_ep ?? d.ep,
-      d.decode_dp_attention ?? d.dp_attention,
-    );
-    const pw = d.prefill_num_workers ?? 1;
-    const dw = d.decode_num_workers ?? 1;
-    return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
-  }
-
-  return configSegmentLabel(d.tp, d.ep, d.dp_attention);
-};
+export const getPointLabel = (d: InferenceData): string =>
+  parallelismLabel({
+    tp: d.tp,
+    ep: d.ep,
+    dpAttention: asBool(d.dp_attention),
+    disagg: d.disagg,
+    isMultinode: d.is_multinode,
+    prefillTp: d.prefill_tp,
+    prefillEp: d.prefill_ep,
+    prefillDpAttention: asBool(d.prefill_dp_attention),
+    prefillNumWorkers: d.prefill_num_workers,
+    decodeTp: d.decode_tp,
+    decodeEp: d.decode_ep,
+    decodeDpAttention: asBool(d.decode_dp_attention),
+    decodeNumWorkers: d.decode_num_workers,
+  });
 
 const runLinkHTML = (runUrl?: string) =>
   runUrl
@@ -88,6 +77,78 @@ const runLinkHTML = (runUrl?: string) =>
 const tooltipLine = (label: string, value: string | number) =>
   `<div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;"><strong>${label}:</strong> ${value}</div>`;
 
+const formatPct = (v: number | undefined): string | null =>
+  v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
+
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */
+const fmt = (v: number): string => {
+  if (!Number.isFinite(v)) return String(v);
+  const rounded = parseFloat(v.toFixed(3));
+  if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
+  return String(rounded);
+};
+
+/**
+ * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
+ * success, token totals. Returns an empty string for non-agentic rows.
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+  if (d.benchmark_type !== 'agentic_traces') return '';
+
+  const parts: string[] = [];
+  if (d.offload_mode) {
+    parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+  }
+
+  const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+  const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+  const theoHit = formatPct(d.theoretical_cache_hit_rate);
+  if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+  if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+  if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+
+  if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) {
+    const successPct =
+      d.num_requests_total > 0
+        ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)`
+        : '';
+    parts.push(
+      tooltipLine(
+        'Requests',
+        `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`,
+      ),
+    );
+  }
+
+  if (d.total_prompt_tokens !== undefined) {
+    parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+  }
+  if (d.total_generation_tokens !== undefined) {
+    parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+  }
+
+  // Histograms + time-series live on the dedicated detail page now; the
+  // "View charts" button (rendered by the wrapper when pinned + has trace
+  // data) takes the user there.
+
+  return parts.join('');
+};
+
+/** "View charts" link — only visible when the tooltip is pinned and the
+ *  point has stored trace data. Wired up by the scatter/GPU graph click handlers. */
+const viewChartsButtonHTML = (
+  isPinned: boolean,
+  hasTraceData: boolean,
+  pointId: number | undefined,
+): string => {
+  if (!isPinned || !hasTraceData || typeof pointId !== 'number') return '';
+  return `<a data-action="view-charts" href="/inference/agentic/${pointId}" style="
+    display: block; margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
+    border: 1px solid var(--border); border-radius: 6px; cursor: pointer;
+    background: var(--accent); color: var(--accent-foreground); text-align: center; text-decoration: none;
+  ">View charts &rarr;</a>`;
+};
+
 const shortenSha = (image: string) =>
   image.replaceAll(/(?<shaPrefix>sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$<shaPrefix>…');
 
@@ -139,7 +200,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
  * @returns HTML string for the tooltip content
  */
 export const generateTooltipContent = (config: TooltipConfig): string => {
-  const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+  const {
+    data: d,
+    isPinned,
+    xLabel,
+    yLabel,
+    selectedYAxisMetric,
+    hardwareConfig,
+    runUrl,
+    hasTrace,
+  } = config;
 
   return `
     <div style="background: var(--popover); border: 1px solid var(--border); border-radius: 8px; padding: 12px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1); user-select: ${isPinned ? 'text' : 'none'};">
@@ -157,16 +227,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -174,7 +244,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -183,10 +253,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
+      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
@@ -229,19 +301,20 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
         <strong>Date:</strong> ${d.actualDate ?? d.date}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${tooltipLine('Total GPUs', d.tp)}
       ${generateParallelismHTML(d)}
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
     </div>
   `;
 };
@@ -254,7 +327,16 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
  * @returns HTML string for the tooltip content
  */
 export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => {
-  const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+  const {
+    data: d,
+    isPinned,
+    xLabel,
+    yLabel,
+    selectedYAxisMetric,
+    hardwareConfig,
+    runUrl,
+    hasTrace,
+  } = config;
 
   return `
     <div style="background: var(--popover); border: 1px solid var(--border); border-radius: 8px; padding: 12px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1); user-select: ${isPinned ? 'text' : 'none'};">
@@ -272,16 +354,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -289,7 +371,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -298,10 +380,12 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
+      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
     </div>
   `;
 };
diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx
index 25238522..ca7424bf 100644
--- a/packages/app/src/components/ui/chart-legend.tsx
+++ b/packages/app/src/components/ui/chart-legend.tsx
@@ -8,6 +8,7 @@ import {
   ChevronRight,
   Circle,
   Diamond,
+  Info,
   Square,
   Triangle,
   X,
@@ -38,6 +39,8 @@ export interface LegendSwitchConfig {
   label: string;
   checked: boolean;
   onCheckedChange: (checked: boolean) => void;
+  /** Optional explainer rendered as an info-icon tooltip next to the label. */
+  infoTooltip?: React.ReactNode;
   advanced?: boolean;
 }
 
@@ -279,6 +282,29 @@ export default function ChartLegend({
             >
               {sw.label}
             </Label>
+            {sw.infoTooltip && (
+              <TooltipProvider delayDuration={100}>
+                <TooltipRoot>
+                  <TooltipTrigger asChild>
+                    <button
+                      type="button"
+                      data-testid={`${sw.id}-info`}
+                      aria-label={`More info about ${sw.label}`}
+                      className="text-muted-foreground hover:text-foreground cursor-help -m-1.5 p-1.5 inline-flex items-center"
+                    >
+                      <Info size={14} />
+                    </button>
+                  </TooltipTrigger>
+                  <TooltipContent
+                    side="top"
+                    sideOffset={6}
+                    className="max-w-[260px] text-xs leading-snug"
+                  >
+                    {sw.infoTooltip}
+                  </TooltipContent>
+                </TooltipRoot>
+              </TooltipProvider>
+            )}
           </div>
         ))}
       </div>
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index de18da09..6aee97dd 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -5,17 +5,30 @@ import { Info } from 'lucide-react';
 import { LabelWithTooltip } from '@/components/ui/label-with-tooltip';
 import { track } from '@/lib/analytics';
 import { MultiSelect } from '@/components/ui/multi-select';
+import {
+  Select,
+  SelectContent,
+  SelectGroup,
+  SelectItem,
+  SelectLabel,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
 import { TooltipContent, TooltipRoot, TooltipTrigger } from '@/components/ui/tooltip';
 import {
   type Model,
   type Precision,
   type Sequence,
+  type Percentile,
+  PERCENTILE_OPTIONS,
   getModelCategory,
   getModelLabel,
+  getPercentileLabel,
   getPrecisionLabel,
   getSequenceCategory,
   getSequenceLabel,
   groupByCategory,
+  sequenceKind,
 } from '@/lib/data-mappings';
 
 function CategorySectionTitle({ label, reason }: { label: string; reason: string }) {
@@ -228,6 +241,143 @@ export function SequenceSelector({
   );
 }
 
+interface ScenarioSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Sequence) => void;
+  open?: boolean;
+  onOpenChange?: (open: boolean) => void;
+  availableSequences: string[];
+  'data-testid'?: string;
+}
+
+/**
+ * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length",
+ * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL
+ * framing only applies to the fixed-seq subset).
+ */
+export function ScenarioSelector({
+  id = 'scenario-select',
+  value,
+  onChange,
+  open,
+  onOpenChange,
+  availableSequences,
+  'data-testid': testId,
+}: ScenarioSelectorProps) {
+  const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq');
+  const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic');
+  const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence));
+
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Scenario"
+        tooltip="Benchmark scenario. Fixed Sequence Length runs use a defined input/output token count (ISL/OSL). Agentic Traces replay real agentic workloads with variable inputs/outputs."
+      />
+      <Select
+        value={value}
+        onValueChange={(v) => {
+          track('selector_scenario_changed', { scenario: v });
+          onChange(v as Sequence);
+        }}
+        open={open}
+        onOpenChange={onOpenChange}
+      >
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {/* Agentic first — preferred default scenario when available. */}
+          {agentic.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Agentic</SelectLabel>
+              {agentic.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+            </SelectGroup>
+          )}
+          {fixedSeq.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Fixed Sequence Length</SelectLabel>
+              {fixedGroups.default.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+              {fixedGroups.deprecated.length > 0 && (
+                <>
+                  <SelectLabel>
+                    <CategorySectionTitle
+                      label="Deprecated"
+                      reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios."
+                    />
+                  </SelectLabel>
+                  {fixedGroups.deprecated.map((seq) => (
+                    <SelectItem key={seq} value={seq}>
+                      {getSequenceLabel(seq as Sequence)}
+                    </SelectItem>
+                  ))}
+                </>
+              )}
+            </SelectGroup>
+          )}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
+interface PercentileSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Percentile) => void;
+  'data-testid'?: string;
+}
+
+/**
+ * Latency percentile selector for agentic-trace charts. The selected value
+ * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so
+ * picking p99 plots p99 e2e latency / interactivity instead of the median.
+ */
+export function PercentileSelector({
+  id = 'percentile-select',
+  value,
+  onChange,
+  'data-testid': testId,
+}: PercentileSelectorProps) {
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Latency Percentile"
+        tooltip="Percentile of the latency distribution used for the chart x-axis on agentic runs."
+      />
+      <Select
+        value={value}
+        onValueChange={(v) => {
+          track('selector_percentile_changed', { percentile: v });
+          onChange(v as Percentile);
+        }}
+      >
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {PERCENTILE_OPTIONS.map((p) => (
+            <SelectItem key={p} value={p}>
+              {getPercentileLabel(p)}
+            </SelectItem>
+          ))}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
 interface PrecisionSelectorProps {
   id?: string;
   value: string[];
diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx
index 0392ac10..44013b1b 100644
--- a/packages/app/src/components/ui/d3-chart-wrapper.tsx
+++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx
@@ -1,6 +1,41 @@
 'use client';
 
-import React from 'react';
+import React, { useEffect, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+/**
+ * Renders the d3 tooltip element via React Portal to document.body so it
+ * escapes any parent stacking context (e.g. the chart Card's backdrop-filter
+ * creates one, trapping z-index inside it). Position is set as viewport
+ * coordinates by the d3 layer.
+ */
+function PortalTooltip({
+  tooltipRef,
+  pinned,
+}: {
+  tooltipRef: React.RefObject<HTMLDivElement | null>;
+  pinned: boolean;
+}) {
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+  const node = (
+    <div
+      ref={tooltipRef}
+      data-chart-tooltip
+      style={{
+        position: 'fixed',
+        left: 0,
+        top: 0,
+        opacity: pinned ? 1 : 0,
+        pointerEvents: pinned ? 'auto' : 'none',
+        display: pinned ? 'block' : 'none',
+        zIndex: 9999,
+      }}
+    />
+  );
+  if (!mounted || typeof document === 'undefined') return node;
+  return createPortal(node, document.body);
+}
 
 export interface D3ChartWrapperProps {
   chartId: string;
@@ -72,17 +107,11 @@ export function D3ChartWrapper({
                 }
               }}
             />
-            <div
-              ref={tooltipRef}
-              data-chart-tooltip
-              style={{
-                position: 'absolute',
-                opacity: pinnedPoint ? 1 : 0,
-                pointerEvents: pinnedPoint ? 'auto' : 'none',
-                display: pinnedPoint ? 'block' : 'none',
-                zIndex: 50,
-              }}
-            />
+            {/* Tooltip is portalled to <body> with position:fixed so it can
+                rise above sibling chart cards' stacking contexts. The d3 layer
+                writes viewport-coords into style.left/top — see
+                computeTooltipPosition. */}
+            <PortalTooltip tooltipRef={tooltipRef} pinned={Boolean(pinnedPoint)} />
             {noDataOverlay}
           </div>
           <p className="no-export text-xs text-muted-foreground text-center mt-2">{instructions}</p>
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index 1863060d..3c24d32b 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r
 /** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */
 function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'sglang',
     model: 'dsr1',
@@ -29,6 +30,8 @@ function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 128,
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index 310a4d1a..54b470ff 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -12,7 +12,7 @@ import {
 
 import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types';
 import { UnofficialBanner } from '@/components/ui/unofficial-banner';
-import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import type { BenchmarkRow, EvalRow } from '@/lib/api';
 import { normalizeEvalHardwareKey } from '@/lib/chart-utils';
@@ -110,7 +110,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData
   const groups = new Map<string, BenchmarkRow[]>();
   for (const row of benchmarks) {
     const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? row.model;
-    const sequence = islOslToSequence(row.isl, row.osl);
+    const sequence = rowToSequence(row);
     if (!sequence) continue;
     const key = `${displayModel}_${sequence}`;
     if (!groups.has(key)) groups.set(key, []);
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 0dac5883..a9d66715 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types';
 import type { SubmissionsResponse } from './submissions-types';
 
 export interface BenchmarkRow {
+  /** Stable per-point id from benchmark_results; used for agentic detail lookups. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -25,9 +27,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 8f27cc8f..648ebaae 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -2,10 +2,15 @@ import { describe, it, expect, vi } from 'vitest';
 
 import type { BenchmarkRow } from '@/lib/api';
 
-import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform';
+import {
+  mergeRunScopedRows,
+  rowToAggDataEntry,
+  transformBenchmarkRows,
+} from './benchmark-transform';
 
 function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'trt',
     model: 'dsr1',
@@ -23,6 +28,8 @@ function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 64,
@@ -793,3 +800,89 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => {
     expect(point.decode_dp_attention).toBe(true);
   });
 });
+
+describe('mergeRunScopedRows', () => {
+  const vllmRun = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over });
+  const sglangBase = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over });
+
+  it('pins configs the run covers to the run rows, replacing base rows', () => {
+    const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })];
+    const baseRows = [vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a
+    // partial-sweep run must fully own its config or the DISTINCT-ON mixing
+    // the scoping exists to prevent comes right back.
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]);
+  });
+
+  it('carries forward configs the run does not cover (the same-day other-framework curve)', () => {
+    const runRows = [vllmRun({ id: 10 })];
+    const baseRows = [
+      vllmRun({ id: 90 }),
+      sglangBase({ id: 91 }),
+      sglangBase({ id: 92, conc: 128 }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]);
+  });
+
+  it('keeps base rows of other hardware / precision / model untouched', () => {
+    const runRows = [vllmRun({ id: 10 })];
+    const baseRows = [
+      vllmRun({ id: 90, hardware: 'b200' }),
+      vllmRun({ id: 91, precision: 'fp8' }),
+      vllmRun({ id: 92, model: 'kimik2.5' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]);
+  });
+
+  it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => {
+    const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })];
+    const baseRows = [
+      vllmRun({ id: 90, benchmark_type: 'agentic_traces' }),
+      vllmRun({ id: 91, benchmark_type: 'single_turn' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]);
+  });
+
+  it('returns base rows unchanged when the run produced nothing', () => {
+    const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })];
+    expect(mergeRunScopedRows([], baseRows)).toBe(baseRows);
+  });
+});
+
+describe('rowToAggDataEntry — agentic interactivity invariant', () => {
+  // Agentic artifacts have shipped *_intvty under two definitions across harness
+  // versions (slow-tail 1/p(ITL) vs fast-tail p(1/ITL)). The chart's
+  // interactivity selector is slow-tail, so we always derive intvty = 1/itl and
+  // discard the artifact value. Mirrors the ingest mapper + backfill.
+  const agentic = (metrics: Record<string, number>) =>
+    rowToAggDataEntry(makeRow({ benchmark_type: 'agentic_traces', isl: null, osl: null, metrics }));
+
+  it('overrides an artifact-supplied (fast-tail) *_intvty with 1/*_itl', () => {
+    const entry = agentic({
+      p90_itl: 0.0893, // slow-tail 1/itl ≈ 11.198
+      p90_intvty: 23.91, // fast-tail contamination — must be discarded
+      p75_itl: 0.0692,
+      p75_intvty: 19, // must be discarded
+    });
+    expect(entry.p90_intvty).toBeCloseTo(1 / 0.0893, 6);
+    expect(entry.p75_intvty).toBeCloseTo(1 / 0.0692, 6);
+    expect(entry.p90_intvty).not.toBeCloseTo(23.91, 1);
+  });
+
+  it('derives intvty from itl when the artifact omits intvty entirely', () => {
+    const entry = agentic({ p90_itl: 0.1, p95_itl: 0.2 });
+    expect(entry.p90_intvty).toBeCloseTo(10, 6);
+    expect(entry.p95_intvty).toBeCloseTo(5, 6);
+  });
+
+  it('does not invert interactivity for single_turn rows', () => {
+    const entry = rowToAggDataEntry(makeRow({ metrics: { p90_itl: 0.05, p90_intvty: 999 } }));
+    expect(entry.p90_intvty).toBe(999);
+  });
+});
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index ac806b79..cb8e3ceb 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -15,10 +15,47 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
 import { getHardwareConfig } from '@/lib/constants';
 import type { BenchmarkRow } from '@/lib/api';
 
+/**
+ * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl
+ * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here:
+ *   e2el   ≡ ttlt   (time-to-last-token == end-to-end latency)
+ *   tpot   ≡ itl    (time-per-output-token == inter-token-latency for single-output)
+ *   intvty ≡ 1/itl  (tok/s from the user's perspective)
+ *
+ * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS derived from
+ * itl, overriding any artifact-supplied value: the harness definition of
+ * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile
+ * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. This
+ * matches the ingest mapper + backfill-agentic-intvty for official rows; doing it
+ * here keeps overlay / `?unofficialrun=` rows (transformed live from raw
+ * artifacts, never through the DB) on the same definition.
+ */
+function agenticAliases(m: Record<string, number>): Record<string, number> {
+  const out: Record<string, number> = {};
+  for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
+    const itl = m[`${suffix}_itl`];
+    const ttlt = m[`${suffix}_ttlt`];
+    if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
+    if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl;
+    if (itl !== undefined && itl > 0) out[`${suffix}_intvty`] = 1 / itl;
+  }
+  return out;
+}
+
 /** Convert a DB benchmark row to an AggDataEntry. */
 export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
-  const m = row.metrics;
+  const isAgentic = row.benchmark_type === 'agentic_traces';
+  const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics;
+  // Prefer the dedicated column (added in migration 004); fall back to the
+  // legacy stash inside `metrics` for any rows ingested before that column
+  // existed.
+  const rawMetrics = row.metrics as Record<string, unknown>;
+  const offloadMode =
+    row.offload_mode ??
+    (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
+    // Coerce: Postgres bigint comes through the SQL client as a string.
+    id: typeof row.id === 'number' ? row.id : Number(row.id),
     hw: row.hardware,
     framework: row.framework,
     model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
@@ -32,23 +69,43 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p75_ttft: m.p75_ttft ?? 0,
+    p90_ttft: m.p90_ttft ?? 0,
+    p95_ttft: m.p95_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
+    'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p75_tpot: m.p75_tpot ?? 0,
+    p90_tpot: m.p90_tpot ?? 0,
+    p95_tpot: m.p95_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
+    'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p75_intvty: m.p75_intvty ?? 0,
+    p90_intvty: m.p90_intvty ?? 0,
+    p95_intvty: m.p95_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
+    'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p75_itl: m.p75_itl ?? 0,
+    p90_itl: m.p90_itl ?? 0,
+    p95_itl: m.p95_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
+    'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p75_e2el: m.p75_e2el ?? 0,
+    p90_e2el: m.p90_e2el ?? 0,
+    p95_e2el: m.p95_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
+    'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     // Measured GPU telemetry (runner's aggregate_power.py). Left undefined for
     // rows predating the field so downstream chart code can distinguish
     // "no measurement" from "0 W" via createChartDataPoint's typeof guard.
@@ -91,6 +148,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     date: row.date,
     actualDate: (row as any).actualDate ?? row.date,
     run_url: row.run_url ?? undefined,
+    benchmark_type: row.benchmark_type,
+    isl: row.isl,
+    osl: row.osl,
+    offload_mode: offloadMode,
+    server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate,
+    server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate,
+    theoretical_cache_hit_rate: m.theoretical_cache_hit_rate,
+    num_requests_total: m.num_requests_total,
+    num_requests_successful: m.num_requests_successful,
+    total_prompt_tokens: m.total_prompt_tokens,
+    total_generation_tokens: m.total_generation_tokens,
   };
 }
 
@@ -100,13 +168,59 @@ interface PreparedEntry {
   date: string;
 }
 
+/**
+ * Rewrite a chart x-axis key to use a different latency percentile prefix
+ * (`median_` → `p99_` etc). Only touches keys that start with a known
+ * percentile prefix; leaves everything else alone.
+ */
+export function withPercentile(key: string, percentile: string): string {
+  return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
+}
+
+// Replacement granularity for single-run scoping: the changelog config_key
+// tuple (model-precision-hardware-framework) plus benchmark_type, so an
+// agentic-only run never hides the same config's fixed-seq carry-forward.
+const runScopeKey = (r: BenchmarkRow): string =>
+  `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`;
+
+/**
+ * Merge run-scoped benchmark rows with the normal latest-per-config rows.
+ *
+ * When the user picks a specific workflow run (to disambiguate two same-day
+ * sweeps of the same config), only the configs that run actually produced
+ * should be pinned to it — every other config must keep its normal
+ * carry-forward rows. Scoping the whole chart to the run (the old behavior)
+ * silently hid complementary configs that happened to land on the same date,
+ * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve
+ * vanish because it lived in a different workflow run.
+ *
+ * Run rows win for every (model, precision, hardware, framework,
+ * benchmark_type) group they cover; base rows fill in the rest.
+ */
+export function mergeRunScopedRows(
+  runRows: BenchmarkRow[],
+  baseRows: BenchmarkRow[],
+): BenchmarkRow[] {
+  if (runRows.length === 0) return baseRows;
+  const claimed = new Set(runRows.map(runScopeKey));
+  return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))];
+}
+
 /**
  * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
  * Returns one InferenceData[] per chart definition (e2e, interactivity).
  *
  * Converts rows to AggDataEntry once, then reuses for each chart definition.
+ *
+ * @param percentile Optional latency percentile for the chart x-axis
+ *   (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart
+ *   definition for the chosen percentile — only agentic rows carry the
+ *   full set (median/p90/p99/p99.9) so this mainly affects that scenario.
  */
-export function transformBenchmarkRows(rows: BenchmarkRow[]): {
+export function transformBenchmarkRows(
+  rows: BenchmarkRow[],
+  percentile = 'median',
+): {
   chartData: InferenceData[][];
   hardwareConfig: HardwareConfig;
 } {
@@ -132,13 +246,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): {
 
   // Phase 2: Build chart data per chart definition (reusing prepared entries)
   const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => {
+    const xKey = withPercentile(chartDef.x, percentile);
     const groupedByHw: Record<string, InferenceData[]> = {};
 
     for (const { entry, hwKey, date } of prepared) {
       const dataPoint = createChartDataPoint(
         date,
         entry,
-        chartDef.x as keyof AggDataEntry,
+        xKey as keyof AggDataEntry,
         chartDef.y as keyof AggDataEntry,
         hwKey,
       );
diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts
index db569118..052d498f 100644
--- a/packages/app/src/lib/chart-utils.test.ts
+++ b/packages/app/src/lib/chart-utils.test.ts
@@ -353,30 +353,29 @@ describe('generateHighContrastColors', () => {
     expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(','));
   });
 
-  // ---------- Tier 1: few items → brand zone ----------
-
-  it('3 NVIDIA GPUs are not red', () => {
+  // ---------- Single vendor: full wheel for maximum contrast ----------
+  // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the
+  // vendors stay visually separable). With a single vendor there's no rival to
+  // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed
+  // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case
+  // where every series otherwise collapsed into the green brand band).
+
+  it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => {
     const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotReddish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(3);
     assertMinDist(result, 30);
   });
 
-  it('2 AMD GPUs are not green', () => {
+  it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => {
     const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotGreenish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(2);
     assertMinDist(result, 30);
   });
 
-  it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => {
+  it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => {
     const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm'];
     const result = generateHighContrastColors(keys, 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotReddish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(4);
     assertMinDist(result, 25);
   });
 
@@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => {
     assertMinDist(result, 25);
   });
 
-  // ---------- Tier 2: moderate items → full wheel minus rival color ----------
+  // ---------- Single vendor, many items → full wheel, best spacing ----------
 
-  it('10 NVIDIA GPUs: no red hues, still distinguishable', () => {
+  it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => {
     const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200'];
     const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]);
     const result = generateHighContrastColors(keys, 'dark');
-    // Should not be reddish (banned)
-    for (const color of Object.values(result)) {
-      const rgb = parseRgb(color);
-      // Not red-dominant with low green — i.e. not in the red/pink zone
-      const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150;
-      expect(isRedPink).toBe(false);
-    }
+    expect(Object.keys(result)).toHaveLength(10);
     assertMinDist(result, 20);
   });
 
diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts
index 33a5b4e3..ce903fe0 100644
--- a/packages/app/src/lib/chart-utils.ts
+++ b/packages/app/src/lib/chart-utils.ts
@@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map<string, string[]>();
 /**
  * Generates high-contrast colors using iwanthue (k-means in CIELab space).
  *
- * Tiered strategy per vendor:
+ * Tiered strategy per vendor (only when >1 vendor is present):
  *   ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red)
  *   ≤ BAN_MAX       → full wheel minus rival's brand color
  *   > BAN_MAX       → full wheel, no restrictions, best spacing wins
+ *
+ * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 ×
+ * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a
+ * glance, but with one vendor there's no rival — clamping every series into the
+ * same narrow hue band just collapses the contrast HC is supposed to maximize.
+ * So skip both restrictions and use the full wheel, giving the series the widest
+ * possible separation.
  */
 export const generateHighContrastColors = (
   keys: string[],
@@ -91,6 +98,12 @@ export const generateHighContrastColors = (
     list.push(key);
   }
 
+  // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a
+  // single vendor present there's nothing to separate from, so those
+  // restrictions only shrink the usable hue range and kill contrast — open the
+  // full wheel instead (the common all-NVIDIA agentic comparison case).
+  const multiVendor = groups.size > 1;
+
   for (const [vendor, vendorKeys] of groups) {
     const count = vendorKeys.length;
     const isBanned = BANNED_HUE_TEST[vendor] ?? null;
@@ -99,8 +112,8 @@ export const generateHighContrastColors = (
     // Tier 1: few items → brand zone only
     // Tier 2: moderate  → full wheel minus rival color
     // Tier 3: many      → full wheel, no restrictions
-    const usePreferred = preferred && count <= PREFERRED_MAX;
-    const useBan = !usePreferred && isBanned && count <= BAN_MAX;
+    const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX;
+    const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX;
 
     // Everything iwanthue's output depends on (the ban filter and preferred
     // zone are functions of vendor; the seed is vendor+theme).
@@ -579,6 +592,20 @@ export const paretoFrontLowerRight = (points: InferenceData[]): InferenceData[]
   return front;
 };
 
+const PARETO_BY_DIRECTION = {
+  upper_right: paretoFrontUpperRight,
+  upper_left: paretoFrontUpperLeft,
+  lower_left: paretoFrontLowerLeft,
+  lower_right: paretoFrontLowerRight,
+} as const;
+
+export type ParetoDirection = keyof typeof PARETO_BY_DIRECTION;
+
+/** Look up the Pareto frontier function for a roofline direction. */
+export const paretoFrontForDirection = (
+  dir: ParetoDirection,
+): ((points: InferenceData[]) => InferenceData[]) => PARETO_BY_DIRECTION[dir];
+
 /**
  * Calculates the roofline for a given set of points.
  */
diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts
index f0f1ef5b..da81ca0e 100644
--- a/packages/app/src/lib/compare-pair-defaults.test.ts
+++ b/packages/app/src/lib/compare-pair-defaults.test.ts
@@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults';
 
 function makeRow(overrides: Partial<BenchmarkRow>): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h100',
     framework: 'sglang',
     model: 'dsr1',
@@ -30,6 +31,8 @@ function makeRow(overrides: Partial<BenchmarkRow>): BenchmarkRow {
     metrics: { tput_per_gpu: 100 },
     date: '2026-01-01',
     run_url: null,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     ...overrides,
   };
 }
diff --git a/packages/app/src/lib/compare-pair-defaults.ts b/packages/app/src/lib/compare-pair-defaults.ts
index be6450ad..f5a37e1f 100644
--- a/packages/app/src/lib/compare-pair-defaults.ts
+++ b/packages/app/src/lib/compare-pair-defaults.ts
@@ -14,6 +14,7 @@ export function pickPairDefaults(
   const seenB = new Map<string, Set<string>>();
   for (const row of rows) {
     if (row.hardware !== a && row.hardware !== b) continue;
+    if (row.isl === null || row.osl === null) continue;
     const seq = islOslToSequence(row.isl, row.osl);
     if (!seq) continue;
     const key = `${seq}|${row.precision}`;
diff --git a/packages/app/src/lib/compare-ssr.test.ts b/packages/app/src/lib/compare-ssr.test.ts
index 5f2828ea..4bf99f89 100644
--- a/packages/app/src/lib/compare-ssr.test.ts
+++ b/packages/app/src/lib/compare-ssr.test.ts
@@ -4,8 +4,13 @@ import type { BenchmarkRow } from '@/lib/api';
 
 import { computeCompareImageRows } from './compare-ssr';
 
+// BenchmarkRow.id is required (stable per-point id from benchmark_results);
+// hand out a fresh one per stub so id-keyed logic can't collide across rows.
+let nextStubId = 1;
+
 function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: nextStubId++,
     hardware: 'h200',
     framework: 'sglang',
     model: 'dsr1',
@@ -23,6 +28,8 @@ function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 128,
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
index debbb788..8b691ee4 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
@@ -4,7 +4,7 @@ import { describe, expect, it } from 'vitest';
 
 import type { ShapeKey } from '@/lib/chart-rendering';
 
-import { renderScatterPoints, syncPointShape } from './scatter-points';
+import { computeTooltipPosition, renderScatterPoints, syncPointShape } from './scatter-points';
 
 interface TestPoint {
   hwKey: string;
@@ -163,3 +163,51 @@ describe('syncPointShape', () => {
     expect(g.selectAll('.visible-shape').size()).toBe(1);
   });
 });
+
+describe('computeTooltipPosition', () => {
+  it('keeps a tall pinned tooltip inside the visible viewport', () => {
+    const tooltipNode = document.createElement('div');
+    document.body.append(tooltipNode);
+    Object.defineProperty(tooltipNode, 'getBoundingClientRect', {
+      value: () => ({
+        width: 300,
+        height: 400,
+        left: 0,
+        top: 0,
+        right: 300,
+        bottom: 400,
+        x: 0,
+        y: 0,
+        toJSON: () => ({}),
+      }),
+    });
+
+    const container = document.createElement('div');
+    Object.defineProperties(container, {
+      clientWidth: { value: 800 },
+      clientHeight: { value: 600 },
+      getBoundingClientRect: {
+        value: () => ({
+          width: 800,
+          height: 600,
+          left: 100,
+          top: 600,
+          right: 900,
+          bottom: 1200,
+          x: 100,
+          y: 600,
+          toJSON: () => ({}),
+        }),
+      },
+    });
+    Object.defineProperties(document.documentElement, {
+      clientWidth: { configurable: true, value: 1280 },
+      clientHeight: { configurable: true, value: 720 },
+    });
+
+    expect(computeTooltipPosition(450, 100, d3.select(tooltipNode), container)).toEqual({
+      left: 560,
+      top: 316,
+    });
+  });
+});
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 0c316366..433ed6d1 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -107,17 +107,33 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   // Visible shape is created (or swapped, if selectedPrecisions changed) in the
   // merged update pass below.
 
-  // Label (enter only)
+  // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
+  // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't
+  // pick up the text element's own (unused) y=0 origin. The first tspan is
+  // raised so the LAST line baseline lands ~8px above the point; subsequent
+  // tspans cascade down by 1.1em.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
-    entered
-      .append('text')
-      .attr('class', 'point-label')
-      .attr('dy', -8)
-      .attr('text-anchor', 'middle')
-      .attr('fill', config.foreground)
-      .attr('font-size', '10px')
-      .attr('pointer-events', 'none')
-      .text(config.getLabelText);
+    const labelGetter = config.getLabelText;
+    entered.each(function (d) {
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .append('text')
+        .attr('class', 'point-label')
+        .attr('text-anchor', 'middle')
+        .attr('fill', config.foreground!)
+        .attr('font-size', '10px')
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
+      lines.forEach((line, i) => {
+        text
+          .append('tspan')
+          .attr('x', 0)
+          .attr('dy', i === 0 ? `${firstDy}em` : '1.1em')
+          .text(line);
+      });
+    });
   }
 
   // Exit: remove stale points
@@ -150,20 +166,32 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
     syncPointShape(g, shapeKey, config.getColor(d));
   });
 
-  // Update labels: use data join so labels are created/removed properly on toggle
+  // Update labels: use data join so labels are created/removed properly on toggle.
+  // Anchor the stack via the first tspan (NOT the text dy — that doesn't shift the
+  // bbox cleanly when there are tspan children).
   if (!config.hideLabels && config.getLabelText && config.foreground) {
+    const labelGetter = config.getLabelText;
     points.each(function (d) {
-      const g = d3.select(this);
-      g.selectAll<SVGTextElement, boolean>('.point-label')
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .selectAll<SVGTextElement, boolean>('.point-label')
         .data([true])
         .join('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
-        .attr('pointer-events', 'none')
-        .text(config.getLabelText!(d));
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
+      text
+        .selectAll<SVGTSpanElement, string>('tspan')
+        .data(lines)
+        .join('tspan')
+        .attr('x', 0)
+        .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
+        .text((l) => l);
     });
   } else {
     points.selectAll('.point-label').remove();
@@ -283,7 +311,22 @@ export function attachScatterTooltipHandlers<
     });
 }
 
-/** Compute tooltip left/top, flipping when it would overflow the chart container. */
+/**
+ * Compute tooltip left/top **in viewport coordinates** so the tooltip can be
+ * rendered via portal with `position: fixed`. Callers still pass cursor coords
+ * relative to `container` (matching `d3.pointer(event, container)`).
+ *
+ * Why viewport coords: the chart cards use `backdrop-filter`, which creates
+ * a stacking context. A tooltip painted inside the upper card's stacking
+ * context cannot rise above the lower card's stacking context regardless of
+ * its z-index. Portalling to document.body + `position: fixed` sidesteps the
+ * whole problem; we just need the coordinates in viewport space.
+ *
+ * Strategy: pick preferred side (right/below cursor), flip if it overflows the
+ * container, then clamp the final fixed coordinates to the viewport. The
+ * viewport clamp matters when a chart continues below the fold: container-
+ * local coordinates can otherwise place a pinned tooltip's actions offscreen.
+ */
 export function computeTooltipPosition(
   mx: number,
   my: number,
@@ -302,11 +345,27 @@ export function computeTooltipPosition(
   // Force reflow so we get real dimensions
   const tw = node.getBoundingClientRect().width || node.offsetWidth;
   const th = node.getBoundingClientRect().height || node.offsetHeight;
+  const rect = container.getBoundingClientRect();
   const cw = container.clientWidth;
   const ch = container.clientHeight;
+  const EDGE_PAD = 4;
+
+  // Prefer right of cursor; flip to left if no room.
+  let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw;
+  left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left));
+
+  // Prefer below cursor; flip above if no room.
+  let top = my + offset + th <= ch ? my + offset : my - offset - th;
+  top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
 
-  const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset;
-  const top = my + offset + th > ch ? my - offset - th : my + offset;
+  // Convert container-local coords → viewport coords for `position: fixed`,
+  // then keep the complete tooltip visible when its dimensions permit it.
+  const viewportWidth = document.documentElement.clientWidth || window.innerWidth;
+  const viewportHeight = document.documentElement.clientHeight || window.innerHeight;
+  left += rect.left;
+  top += rect.top;
+  left = Math.max(EDGE_PAD, Math.min(viewportWidth - tw - EDGE_PAD, left));
+  top = Math.max(EDGE_PAD, Math.min(viewportHeight - th - EDGE_PAD, top));
 
   return { left, top };
 }
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 62208aa7..e217afbd 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -180,17 +180,73 @@ export enum Sequence {
   OneK_OneK = '1k/1k',
   OneK_EightK = '1k/8k',
   EightK_OneK = '8k/1k',
+  AgenticTraces = 'agentic-traces',
 }
 
-const SEQUENCE_CONFIG: Record<Sequence, { label: string; compact: string; category: CategoryTag }> =
-  {
-    [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' },
-    [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' },
-    [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' },
-  };
+/**
+ * Top-level scenario kind. Fixed-seq sequences cluster under a single group
+ * in the selector; agentic traces sit alongside as their own kind.
+ */
+export type ScenarioKind = 'fixed-seq' | 'agentic';
+
+export function sequenceKind(seq: Sequence): ScenarioKind {
+  return seq === Sequence.AgenticTraces ? 'agentic' : 'fixed-seq';
+}
+
+const SEQUENCE_CONFIG: Record<
+  Sequence,
+  { label: string; compact: string; category: CategoryTag; kind: ScenarioKind }
+> = {
+  [Sequence.OneK_OneK]: {
+    label: '1K / 1K',
+    compact: '1k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.OneK_EightK]: {
+    label: '1K / 8K',
+    compact: '1k8k',
+    category: 'deprecated',
+    kind: 'fixed-seq',
+  },
+  [Sequence.EightK_OneK]: {
+    label: '8K / 1K',
+    compact: '8k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.AgenticTraces]: {
+    label: 'Agentic Traces',
+    compact: 'agentic',
+    category: 'default',
+    kind: 'agentic',
+  },
+};
 
 export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
+/**
+ * Percentile of the latency distribution used for the chart x-axis when
+ * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9
+ * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl);
+ * p75 and p90 are surfaced in the UI.
+ */
+export enum Percentile {
+  P75 = 'p75',
+  P90 = 'p90',
+}
+
+const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.P75]: { label: 'p75' },
+  [Percentile.P90]: { label: 'p90' },
+};
+
+export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
+
+export function getPercentileLabel(p: Percentile): string {
+  return PERCENTILE_CONFIG[p]?.label ?? p;
+}
+
 export const DEPRECATED_SEQUENCES: ReadonlySet<Sequence> = new Set(
   (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][])
     .filter(([, c]) => c.category === 'deprecated')
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 28cc1e36..2f5844c1 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,23 +57,43 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p75_ttft: 0.65,
+    p90_ttft: 0.7,
+    p95_ttft: 0.75,
     p99_ttft: 0.8,
+    'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
     mean_intvty: 45,
     median_tpot: 0.02,
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p75_tpot: 0.022,
+    p75_intvty: 50,
+    p90_tpot: 0.025,
+    p90_intvty: 55,
+    p95_tpot: 0.028,
+    p95_intvty: 58,
     p99_tpot: 0.03,
     p99_intvty: 60,
+    'p99.9_tpot': 0.035,
+    'p99.9_intvty': 65,
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p75_itl: 0.012,
+    p90_itl: 0.013,
+    p95_itl: 0.014,
     p99_itl: 0.015,
+    'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p75_e2el: 5.2,
+    p90_e2el: 5.5,
+    p95_e2el: 5.8,
     p99_e2el: 6,
+    'p99.9_e2el': 6.5,
     disagg: false,
     num_prefill_gpu: 0,
     num_decode_gpu: 0,
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index c78bf588..1c8cab81 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -22,8 +22,10 @@ const URL_STATE_KEYS = [
   'i_seq',
   'i_prec',
   'i_metric',
+  'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
+  'i_xmode',
   'i_scale',
   'i_gpus',
   'i_dates',
@@ -78,8 +80,10 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   // "default") or it would silently revert to the per-model auto default on reload.
   i_prec: '',
   i_metric: 'y_tpPerGpu',
-  i_xmetric: 'p99_ttft',
-  i_e2e_xmetric: '',
+  i_pctl: 'p90',
+  i_xmetric: 'p90_ttft',
+  i_e2e_xmetric: 'p90_ttft',
+  i_xmode: '',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 760026f5837960cfa9b9ecc2bd5333c109a2e306 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:12:20 -0500
Subject: [PATCH 10/40] test: agentic e2e and component coverage; subsystem
 docs

---
 docs/data-pipeline.md                         |  12 +
 .../kv-cache-hit-rate-anomaly.md              | 113 +++++++
 .../app/cypress/component/dataset-list.cy.tsx |  93 +++++
 .../component/distribution-card.cy.tsx        |  82 +++++
 .../component/inference-chart-controls.cy.tsx |   4 +-
 .../cypress/component/trace-flamegraph.cy.tsx |  86 +++++
 .../e2e/agentic-point-time-series.cy.ts       | 320 ++++++++++++++++++
 .../cypress/e2e/datasets-distributions.cy.ts  | 133 ++++++++
 .../e2e/datasets-flamegraph-time.cy.ts        | 127 +++++++
 .../app/cypress/e2e/dropdown-switching.cy.ts  |   4 +-
 .../e2e/gpu-compare-agentic-detail.cy.ts      |  54 +++
 .../app/cypress/e2e/gradient-labels.cy.ts     |  16 +-
 .../app/cypress/e2e/historical-trends.cy.ts   |   4 +-
 packages/app/cypress/e2e/line-labels.cy.ts    |  31 +-
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  | 108 ++++--
 packages/app/cypress/e2e/url-params.cy.ts     |  24 +-
 packages/app/cypress/support/mock-data.ts     |   4 +
 17 files changed, 1148 insertions(+), 67 deletions(-)
 create mode 100644 docs/investigations/kv-cache-hit-rate-anomaly.md
 create mode 100644 packages/app/cypress/component/dataset-list.cy.tsx
 create mode 100644 packages/app/cypress/component/distribution-card.cy.tsx
 create mode 100644 packages/app/cypress/component/trace-flamegraph.cy.tsx
 create mode 100644 packages/app/cypress/e2e/agentic-point-time-series.cy.ts
 create mode 100644 packages/app/cypress/e2e/datasets-distributions.cy.ts
 create mode 100644 packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
 create mode 100644 packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts

diff --git a/docs/data-pipeline.md b/docs/data-pipeline.md
index 38e7d471..bc439e8a 100644
--- a/docs/data-pipeline.md
+++ b/docs/data-pipeline.md
@@ -62,6 +62,18 @@ Configs are preloaded into an in-memory Map at ingest start. `getOrCreateConfig(
 
 Unmapped models/hardware are tracked (not silently dropped) so operators can see what new GPU or model names appeared in CI artifacts. This is how new GPUs get added to the system — the skip tracker acts as a change detection mechanism.
 
+### Server-Metric Orchestrator Adapters
+
+AIPerf defines the `server_metrics_export.json` envelope, but labels such as worker role and rank belong to the serving orchestrator. The chart-series ETL therefore normalizes raw series through an orchestrator-specific adapter before exposing per-worker metrics. For example, the Dynamo adapter maps `dynamo_component=prefill|backend` to canonical `prefill|decode` roles and uses the endpoint, worker ID, DP rank, and engine together as the source identity.
+
+Adapters are selected from the benchmark's canonical framework, and per-worker series are only emitted for disaggregated configs with a recognized adapter. Unknown orchestrators and non-disaggregated configs retain their aggregate-only series; roles are never guessed from ports or metric names. The frontend only consumes the canonical source identity and never interprets orchestrator-native labels.
+
+### Agentic Dataset Provenance
+
+AIPerf exports public-dataset provenance in `metadata.dataset`, including the Hugging Face dataset ID. InferenceX preserves that object as `dataset` on each agentic aggregate benchmark row. During benchmark ingest, `ingest-ci-run.ts` derives the dashboard slug from `hf_dataset_name` (for example, `semianalysisai/cc-traces-weka-062126` becomes `cc-traces-weka-062126`) and upserts `run_datasets` for the workflow run.
+
+Legacy artifacts without provenance leave any existing mapping untouched. A workflow run can map to only one dataset; conflicting dataset IDs fail ingest rather than silently linking the run to an arbitrary dataset.
+
 ## Frontend Transform Pipeline
 
 ### Why transformBenchmarkRows Exists
diff --git a/docs/investigations/kv-cache-hit-rate-anomaly.md b/docs/investigations/kv-cache-hit-rate-anomaly.md
new file mode 100644
index 00000000..61ffee42
--- /dev/null
+++ b/docs/investigations/kv-cache-hit-rate-anomaly.md
@@ -0,0 +1,113 @@
+# KV cache hit-rate anomaly on agentic benchmarks (dsv4, b200, vllm)
+
+## Core issue
+
+vLLM's prefix cache should be hitting at ~98% on multi-turn agentic conversation replay (each turn extends the prior turn's context). It isn't. Something in the **dataset definition** or **aiperf replay** is producing requests whose token streams aren't actually prefix-compatible turn-to-turn.
+
+| Concurrency | Theoretical max hit % | vLLM actual hit % |
+| ----------: | --------------------: | ----------------: |
+|           1 |                97.45% |            83.15% |
+|           2 |                98.34% |            46.78% |
+|           4 |                97.99% |            12.43% |
+
+This is **not** a capacity problem. KV cache is sized at 3.29M tokens (12,868 blocks × 256). The conc=4 workload's unique-content footprint is **~1.11M DSV4 tokens** — would fit in ~34% util. Observed peak util is 49.8%, so the cache is holding more blocks than the workload needs, yet vLLM can't find them on lookup.
+
+## Data sources
+
+- **Benchmark points**:
+  - http://localhost:3002/inference/agentic/206252 (conc=1)
+  - http://localhost:3002/inference/agentic/206245 (conc=2)
+  - http://localhost:3002/inference/agentic/206247 (conc=4)
+- **Neon DB**: project `silent-pond-29172997`, branch `br-cold-sky-ai0c09cy` (agentx-dev). Connection via `DATABASE_WRITE_URL` in `.env`. Console: https://console.neon.tech/app/projects/silent-pond-29172997/branches/br-cold-sky-ai0c09cy
+  - `agentic_trace_replay.profile_export_jsonl_gz` — gzipped aiperf per-request records
+  - `agentic_trace_replay.server_metrics_json_gz` — gzipped vllm per-scrape prometheus metrics
+  - `agentic_trace_replay.request_timeline` (jsonb) — pre-computed per-request timeline used by the simulation
+- **Trace replay dataset** (the source-of-truth for "what should be cacheable"): https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-051926. Each row has pre-computed 64-token block `hash_ids` per turn; `hash_id_scope: 'local'` (per-conversation).
+
+## Theoretical max simulation
+
+For each replayed request, look up the matching turn in the HF dataset and walk a per-conversation trie of 64-token block hash IDs. Hits = longest contiguous prefix from block 0 that has appeared in any prior request (mirrors vLLM's chained-hash semantics).
+
+Confirms: the workload IS prefix-cacheable end-to-end. Theoretical max ≈ 98% across all three concurrency levels — same dataset, same conversations, just different dispatch order.
+
+## Why this points at the dataset/replay, not vLLM
+
+- **Capacity is not the bottleneck.** Cache holds ~3× the unique content of the workload. Cache util tops out below capacity.
+- **The metric isn't lying.** vLLM's own counters cross-check: `prefill_kv_computed_tokens + prefix_cache_hits ≈ request_prompt_tokens` (67.85M + 9.61M ≈ 77.47M for conc=4).
+- **It's not a tokenizer artifact.** DSV4 tokens are ~54% the count of Claude tokens, but BPE is left-monotonic on stable text — hit-rate ratio is invariant to tokenizer choice for prefix-growth workloads.
+- **It's not the multi-engine DP bug** we found earlier (commit `f2618f4`) — this deployment has 1 engine.
+
+What's left: the bytes that vLLM actually receives turn-to-turn are not the same prefix + delta that the dataset's `hash_ids` describe. Most likely culprits:
+
+1. **aiperf isn't sending the cumulative chat history** the way the dataset assumes — each turn is being assembled differently than the previous, breaking the byte-level prefix.
+2. **Something in the request payload varies per request** (timestamps, request IDs, tool result serialization order, etc.) — invalidates block 0's hash, cascades to every subsequent block via vLLM's chained hashing.
+3. **BPE re-merging across message boundaries** when aiperf re-tokenizes the full history each turn instead of appending tokens.
+
+## Root cause: `ConversationReconstructor` strips the prev user's `partial_tail` every turn
+
+The bug is in `utils/aiperf/src/aiperf/dataset/loader/weka_synth_buf.py` — specifically the **boundary case** in `truncate_synth_buf_at_block` (line 453–464) combined with `turn_delta`'s reset logic (line 354–360).
+
+What happens turn-to-turn:
+
+1. `init_turn_0` builds a trailing user segment whose `tokens` = `[block_aligned_tokens] + [partial_tail_tokens]` where `partial_tail_n = in_tokens % bs`. The wire prompt for turn 0 includes these tail tokens.
+2. `advance_turn` computes `lcp = longest_common_prefix(prev_hash_ids, curr_hash_ids)`. When the LCP equals the prev turn's total block count (the normal append-only case), `truncate_synth_buf_at_block` hits its boundary branch: `cursor + seg.block_count == target_blocks`.
+3. That branch **strips `prev_partial_tail` tokens off the trailing user segment in place** and re-decodes its `content`. This sets `_last_disturbance_at = i` (the index of the prev trailing user segment).
+4. New `assistant` + `user` segments are appended.
+5. `turn_delta` sees `_last_disturbance_at < _emitted_segment_count` and forces `reset_context=True`, re-emitting **the whole conversation** with the now-stripped trailing user.
+
+The endpoint (`utils/aiperf/src/aiperf/endpoints/base_endpoint.py:110-140`) honors `reset_context=True` via `messages = list(turn.raw_messages)` instead of `messages.extend(...)`.
+
+Result: every turn sends the full chat history, but the bytes of the prev user message differ from what was sent the turn before — the trailing `partial_tail` chars are missing. vLLM tokenizes the new prompt, hashes 256-token blocks, and the chained-hash invariant breaks at the first block containing the trimmed boundary. That block + every subsequent block of the new turn miss the cache.
+
+### Empirical confirmation
+
+Reproducer at `/tmp/test-reconstructor.py` instantiates `ConversationReconstructor` with mock decoders and walks a synthetic 3-turn conversation:
+
+```
+=== Turn 0 ===
+  delta msgs: 2, reset=False
+  wire len: 21683
+
+=== Turn 1 ===
+  delta msgs: 4, reset=True            ← every turn resets
+  wire len: 25307
+
+=== DIFF turn 0 vs turn 1 (wire-level) ===
+  common prefix chars: 21549 / wire0 21683 (99.4%)
+  wire0[...] = '... 983406 12 1 133 184 16 57 71 155 37 '     ← partial_tail decoded
+  wire1[...] = '... 983406<|im_end|>\n<|im_start|>assista'    ← stripped, template marker next
+  turn0 user content len: 19812, turn1 user[0] content len: 19711   ← 101 chars stripped
+```
+
+Across the conc=1 run (point 206252), **280/280 (100%)** consecutive turn-pairs have `prev_in_tokens % bs != 0` — i.e., every single turn hits this boundary disturbance.
+
+### Why the gap widens with concurrency
+
+At conc=1 the gap (97.45% − 83.15% = 14pp) is roughly the fraction of each turn's blocks lost to the trimmed-tail invalidation (last user block + chat-template delta). At higher conc:
+
+- `reset_context=True` makes every request re-send the **entire** conversation prompt, so wire bandwidth + prefill work scale superlinearly per turn.
+- Concurrent conversations all do this simultaneously; each writes long sequences of "new" blocks past their respective divergence points, evicting other conversations' usable prefix blocks even though aggregate unique content (1.11M tokens) fits comfortably in the 3.29M-token cache.
+
+### Fix sketch
+
+The boundary-cut strip exists to keep the next turn's `assistant` segment block-aligned. Two viable fixes:
+
+1. **Don't mutate the prev trailing user segment.** Leave its `partial_tail` tokens intact; append the new asst+user as strict-append (no reset_context). The wire-prefix becomes byte-stable turn-to-turn. Cost: the new asst content's block_start no longer aligns to the prev_hash_ids tail, so hash_id accounting for asst blocks loses 1 block of fidelity per turn.
+2. **Track `partial_tail` separately** from the prev user segment so the segment's emitted content stays byte-stable, and only the trailing tail (which is regenerated each turn anyway) is allowed to vary.
+
+Option 1 is the minimal change. Validate with the reproducer above — remove the strip in `truncate_synth_buf_at_block`'s boundary case and re-run; turn N+1's wire prefix should equal turn N's wire byte-for-byte up to the end of the prev assistant template.
+
+## Re-running the simulation
+
+```bash
+# 1. dump request timelines from DB
+pnpm --filter @semianalysisai/inferencex-db exec dotenv -e ../../.env -- tsx /tmp/dump-rt-multi.ts
+
+# 2. run analysis (needs `pip3 install --break-system-packages --user datasets`)
+python3 /tmp/cache-sim-multi.py
+
+# 3. reproduce the partial_tail strip
+python3 /tmp/test-reconstructor.py
+```
+
+Scripts live in `/tmp/` from this session; recreate from inline code in the previous version of this doc if missing.
diff --git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx
new file mode 100644
index 00000000..f7cfcb9a
--- /dev/null
+++ b/packages/app/cypress/component/dataset-list.cy.tsx
@@ -0,0 +1,93 @@
+import { QueryClient, QueryClientProvider } from '@tanstack/react-query';
+import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime';
+
+import { DatasetList } from '@/components/datasets/dataset-list';
+import type { DatasetRecord } from '@/hooks/api/use-datasets';
+
+const datasets: DatasetRecord[] = [
+  {
+    id: 'ds-1',
+    slug: 'cc-traces-weka-full',
+    label: 'cc-traces-weka (full)',
+    variant: 'full',
+    description: 'Every captured request, unmodified.',
+    hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full',
+    license: 'apache-2.0',
+    conversation_count: 1234,
+    summary: {
+      totalIn: 5_000_000,
+      totalOut: 250_000,
+      cachedPct: 0.82,
+      mainTurns: 9800,
+      subagentGroups: 540,
+    },
+    ingested_at: '2026-06-20T00:00:00Z',
+  },
+  {
+    id: 'ds-2',
+    slug: 'cc-traces-weka-256k',
+    label: 'cc-traces-weka (256k)',
+    variant: '256k',
+    description: 'Turns trimmed to a 256k context window.',
+    hf_url: null,
+    license: 'apache-2.0',
+    conversation_count: 980,
+    summary: {
+      totalIn: 3_200_000,
+      totalOut: 180_000,
+      cachedPct: 0.79,
+      mainTurns: 7600,
+      subagentGroups: 410,
+    },
+    ingested_at: '2026-06-19T00:00:00Z',
+  },
+];
+
+function createMockRouter() {
+  return {
+    push: cy.stub(),
+    replace: cy.stub(),
+    refresh: cy.stub(),
+    back: cy.stub(),
+    forward: cy.stub(),
+    prefetch: cy.stub().resolves(),
+  };
+}
+
+function mountList() {
+  const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } });
+  cy.mount(
+    <AppRouterContext.Provider value={createMockRouter()}>
+      <QueryClientProvider client={queryClient}>
+        <DatasetList />
+      </QueryClientProvider>
+    </AppRouterContext.Provider>,
+  );
+}
+
+describe('DatasetList', () => {
+  it('renders a card per dataset with its summary stats', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list');
+    mountList();
+    cy.wait('@list');
+    cy.contains('cc-traces-weka (full)').should('be.visible');
+    cy.contains('cc-traces-weka (256k)').should('be.visible');
+    cy.contains('1,234').should('be.visible'); // conversation_count, localized
+    cy.contains('82%').should('be.visible'); // cachedPct
+    cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist');
+  });
+
+  it('shows the empty state when no datasets are ingested', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty');
+    mountList();
+    cy.wait('@empty');
+    cy.contains('No datasets ingested yet.').should('be.visible');
+  });
+
+  it('shows the error state when the request fails', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err');
+    mountList();
+    cy.wait('@err');
+    cy.contains('Failed to load datasets.').should('be.visible');
+  });
+});
diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx
new file mode 100644
index 00000000..511505b9
--- /dev/null
+++ b/packages/app/cypress/component/distribution-card.cy.tsx
@@ -0,0 +1,82 @@
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import type { Distribution } from '@/hooks/api/use-datasets';
+
+const distribution: Distribution = {
+  bins: [
+    { x0: 0, x1: 100, count: 5 },
+    { x0: 100, x1: 200, count: 20 },
+    { x0: 200, x1: 300, count: 12 },
+    { x0: 300, x1: 400, count: 3 },
+  ],
+  stats: {
+    count: 40,
+    min: 10,
+    max: 390,
+    mean: 180,
+    median: 175,
+    p75: 250,
+    p90: 320,
+    p95: 360,
+  },
+};
+
+describe('DistributionCard', () => {
+  it('renders the title, summary stats, and one bar per bin', () => {
+    cy.mount(
+      <DistributionCard title="Input tokens per turn" unit="tok" distribution={distribution} />,
+    );
+    cy.contains('Input tokens per turn').should('be.visible');
+    cy.contains('n=40').should('be.visible');
+    cy.contains('p50 175').should('be.visible');
+    cy.contains('p75 250').should('be.visible');
+    cy.contains('p90 320').should('be.visible');
+    cy.contains('p95 360').should('be.visible');
+    cy.get(
+      'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]',
+    ).should('have.length', 8);
+    // One filled bar rect per bin (ChartHover may add a transparent overlay rect).
+    cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length);
+  });
+
+  it('shows a "No data" placeholder when no distribution is provided', () => {
+    cy.mount(<DistributionCard title="Empty metric" unit="tok" />);
+    cy.contains('Empty metric').should('be.visible');
+    cy.contains('No data').should('be.visible');
+    cy.get('rect[class*="fill-primary"]').should('not.exist');
+  });
+
+  it('marks the chart as log scale when scale="log"', () => {
+    cy.mount(
+      <DistributionCard
+        title="Output tokens per turn"
+        unit="tok"
+        scale="log"
+        distribution={distribution}
+      />,
+    );
+    cy.contains('log scale').should('be.visible');
+  });
+
+  it('renders older v1 stats without unavailable percentile guides', () => {
+    cy.mount(
+      <DistributionCard
+        title="Legacy metric"
+        unit="tok"
+        distribution={{
+          bins: distribution.bins,
+          stats: {
+            count: 40,
+            min: 10,
+            max: 390,
+            mean: 180,
+            median: 175,
+            p90: 320,
+          },
+        }}
+      />,
+    );
+    cy.contains('p50 175').should('be.visible');
+    cy.contains('p90 320').should('be.visible');
+    cy.contains('NaN').should('not.exist');
+  });
+});
diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx
index 03e6a50c..5a6311f4 100644
--- a/packages/app/cypress/component/inference-chart-controls.cy.tsx
+++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx
@@ -14,8 +14,8 @@ describe('Inference ChartControls', () => {
 
   it('renders the sequence selector with the current sequence', () => {
     // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K"
-    cy.get('#sequence-select').should('be.visible');
-    cy.get('#sequence-select').should('contain.text', '8K / 1K');
+    cy.get('#scenario-select').should('be.visible');
+    cy.get('#scenario-select').should('contain.text', '8K / 1K');
   });
 
   it('renders the precision multi-select with the current precision', () => {
diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx
new file mode 100644
index 00000000..1be90e0c
--- /dev/null
+++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx
@@ -0,0 +1,86 @@
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import type { ConversationStructure } from '@/hooks/api/use-datasets';
+
+// Two main turns followed by one subagent group with two child turns.
+// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`).
+const structure: ConversationStructure = {
+  blockSize: 64,
+  nodes: [
+    { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 },
+    {
+      kind: 'turn',
+      turnIndex: 1,
+      model: 'claude',
+      in: 2000,
+      out: 300,
+      cached: 1500,
+      uncached: 500,
+    },
+    {
+      kind: 'subagent',
+      label: 'Subagent: search',
+      agentId: 'agent-1',
+      durationMs: 12000,
+      in: 5000,
+      out: 800,
+      cached: 3000,
+      uncached: 2000,
+      children: [
+        {
+          kind: 'turn',
+          turnIndex: 0,
+          model: 'claude',
+          in: 2500,
+          out: 400,
+          cached: 1500,
+          uncached: 1000,
+        },
+        {
+          kind: 'turn',
+          turnIndex: 1,
+          model: 'claude',
+          in: 2500,
+          out: 400,
+          cached: 1500,
+          uncached: 1000,
+        },
+      ],
+    },
+  ],
+  totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 },
+};
+
+describe('TraceFlamegraph', () => {
+  it('renders the legend, main-turn rows, and the subagent group header', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.contains('Cached prefix').should('be.visible');
+    cy.contains('Uncached input').should('be.visible');
+    cy.contains('Output').should('be.visible');
+    cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1');
+    cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2');
+    cy.contains('Subagent: search').should('be.visible');
+  });
+
+  it('keeps subagent children collapsed until the group is expanded', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+    cy.contains('button', 'Subagent: search').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+    cy.get('[data-rowkey="g-2-c-1"]').should('be.visible');
+  });
+
+  it('expand all / collapse all toggles every subagent group', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.contains('button', 'Expand all').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+    cy.contains('button', 'Collapse all').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+  });
+
+  it('auto-expands and highlights the target group child for a request-timeline deep link', () => {
+    cy.mount(
+      <TraceFlamegraph structure={structure} highlightAgentId="agent-1" highlightTurn={1} />,
+    );
+    cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary');
+  });
+});
diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
new file mode 100644
index 00000000..4a450f7c
--- /dev/null
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -0,0 +1,320 @@
+const timelineRequest = (
+  index: number,
+  ttftMs: number,
+  tpotMs: number,
+  overrides: Record<string, unknown> = {},
+) => ({
+  cid: 'conversation-1',
+  ti: index,
+  wid: 'worker-1',
+  ad: 0,
+  phase: 'profiling',
+  credit: index * 1_000_000_000,
+  start: index * 1_000_000_000,
+  ack: null,
+  end: (index + 1) * 1_000_000_000,
+  ttftMs,
+  tpotMs,
+  isl: 1024,
+  osl: 128,
+  cancelled: false,
+  ...overrides,
+});
+
+describe('Agentic point request metric time series', () => {
+  before(() => {
+    cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} });
+    cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null });
+    cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/request-timeline*', {
+      body: {
+        version: 3,
+        startNs: 0,
+        endNs: 7_000_000_000,
+        durationS: 7,
+        requests: [
+          timelineRequest(0, 100, 10),
+          timelineRequest(1, 200, 20),
+          timelineRequest(2, 400, 25),
+          timelineRequest(3, 800, 40),
+          timelineRequest(4, 1600, 80),
+          timelineRequest(5, 3200, 160, { phase: 'warmup' }),
+          timelineRequest(6, 6400, 320, { cancelled: true }),
+          timelineRequest(7, 0, 0, {
+            cid: 'conversation-1::sa:subagent_001_abcd',
+            credit: 1_100_000_000,
+            start: 1_100_000_000,
+            end: 1_900_000_000,
+            ttftMs: null,
+            tpotMs: null,
+            isl: null,
+            osl: null,
+          }),
+          timelineRequest(8, 0, 0, {
+            cid: 'conversation-1::sa:subagent_001_abcd:aux:011',
+            credit: 1_200_000_000,
+            start: 1_200_000_000,
+            end: 1_800_000_000,
+            ttftMs: null,
+            tpotMs: null,
+            isl: null,
+            osl: null,
+          }),
+        ],
+      },
+    });
+    cy.visit('/inference/agentic/206885');
+  });
+
+  it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => {
+    cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+      cy.contains('h2', 'Interactivity over time').should('be.visible');
+      cy.get('[data-testid="interactivity-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P90');
+      cy.get('[data-testid="interactivity-point-count"]').should('have.text', '5 points');
+      cy.get('svg circle').should('have.length', 5);
+      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+      cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.contains('h2', 'TTFT over time').should('be.visible');
+      cy.get('[data-testid="ttft-point-count"]').should('have.text', '5 points');
+      cy.get('svg circle').should('have.length', 5);
+      cy.get('svg').should('contain.text', 'TTFT (s)');
+      cy.get('svg').should('contain.text', 'Cumulative P90 TTFT');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+  });
+
+  it('switches ISL and OSL cards from distributions to in-flight averages', () => {
+    cy.get('[data-testid="isl-metric-chart"]').within(() => {
+      cy.get('[data-testid="isl-metric-inflight"]').click();
+      cy.contains('h2', 'Average ISL in flight').should('be.visible');
+      cy.get('svg').should('contain.text', 'Average ISL in flight (30s avg)');
+    });
+    cy.get('[data-testid="osl-metric-chart"]').within(() => {
+      cy.get('[data-testid="osl-metric-inflight"]').click();
+      cy.contains('h2', 'Average OSL in flight').should('be.visible');
+      cy.contains('Retrospective: final observed OSL').should('be.visible');
+      cy.get('svg').should('contain.text', 'Average OSL in flight (30s avg)');
+    });
+  });
+
+  it('switches the TTFT chart to E2E request latency over time', () => {
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.get('[data-testid="latency-metric-e2e"]').click();
+      cy.contains('h2', 'E2E latency over time').should('be.visible');
+      cy.get('[data-testid="e2e-point-count"]').should('have.text', '7 points');
+      cy.get('svg circle').should('have.length', 7);
+      cy.get('svg').should('contain.text', 'E2E latency (s)');
+      cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency');
+
+      cy.get('[data-testid="latency-metric-ttft"]').click();
+      cy.contains('h2', 'TTFT over time').should('be.visible');
+    });
+  });
+
+  it('switches each chart independently from P90 to P75', () => {
+    cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+      cy.contains('svg', 'P90 (rolling 50 req)')
+        .find('path')
+        .first()
+        .invoke('attr', 'd')
+        .as('p90Path');
+      cy.contains('button', 'P75').click();
+      cy.get('[data-testid="interactivity-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P75');
+      cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT');
+      cy.contains('svg', 'P75 (rolling 50 req)')
+        .find('path')
+        .first()
+        .invoke('attr', 'd')
+        .then(function (p75Path) {
+          expect(p75Path).not.to.equal(this.p90Path);
+        });
+    });
+
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.get('[data-testid="ttft-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P90');
+      cy.contains('button', 'P75').click();
+      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
+      cy.get('svg').should('contain.text', 'Cumulative P75 TTFT');
+    });
+  });
+
+  it('switches the request activity card from queue depth to cumulative completions', () => {
+    cy.get('[data-testid="request-activity-chart"]').within(() => {
+      cy.contains('h2', 'Request queue depth').should('be.visible');
+      cy.get('[data-testid="request-activity-completed"]').click();
+      cy.contains('h2', 'Cumulative completed requests').should('be.visible');
+      cy.get('svg').should('contain.text', 'Completed requests');
+      cy.get('svg').should('contain.text', 'Requests');
+      cy.get('[data-testid="request-activity-queue"]').click();
+      cy.contains('h2', 'Request queue depth').should('be.visible');
+    });
+  });
+
+  it('shows total time with no requests in flight on the request timeline', () => {
+    cy.get('[data-testid="detail-view-timeline"]').click();
+    cy.location('search').should('contain', 'view=timeline');
+    cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 1.00s (14.3%)');
+    cy.get('[data-timeline-row-kind="aux"]')
+      .should('have.css', 'padding-left', '24px')
+      .and('contain.text', 'aux 011 · parallel');
+  });
+
+  it('restores the request timeline view after browser Back from a dataset route', () => {
+    cy.window().then((win) => {
+      win.history.pushState({}, '', '/datasets/test-dataset/conversations/conversation-1');
+    });
+    cy.go('back');
+    cy.location('pathname').should('eq', '/inference/agentic/206885');
+    cy.location('search').should('contain', 'view=timeline');
+    cy.get('[data-testid="detail-view-timeline"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="timeline-total-idle-time"]').should('be.visible');
+  });
+
+  it('shows a cumulative average for unique input tokens in flight', () => {
+    cy.get('[data-testid="detail-view-point"]').click();
+    cy.get('[data-testid="unique-input-inflight-chart"]').within(() => {
+      cy.get('svg').should('contain.text', 'Cumulative average');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+  });
+});
+
+const pointMeta = {
+  id: 206885,
+  hardware: 'gb200',
+  framework: 'dynamo-vllm',
+  model: 'deepseek-r1-0528',
+  precision: 'fp8',
+  spec_method: 'none',
+  disagg: true,
+  conc: 128,
+  offload_mode: 'off',
+  isl: null,
+  osl: null,
+  benchmark_type: 'agentic_traces',
+  date: '2026-06-23',
+  run_url: null,
+  server_gpu_cache_hit_rate: 0.5,
+  server_cpu_cache_hit_rate: null,
+};
+
+const sourceSeries = (source: Record<string, unknown>, prompt: number, generation: number) => ({
+  source,
+  kvCacheUsage: [
+    { t: 0, value: 0.25 },
+    { t: 1, value: 0.5 },
+  ],
+  prefixCacheHitRate: [{ t: 0, value: 0.5 }],
+  queueDepth: [{ t: 0, running: 2, waiting: 1, total: 3 }],
+  promptTokensBySource: { miss: [{ t: 0, value: prompt }] },
+  promptTps: [{ t: 0, value: prompt }],
+  generationTps: [{ t: 0, value: generation }],
+  prefixCacheHitsTps: [{ t: 0, value: prompt / 2 }],
+  hostKvCacheUsage: [],
+  kvCacheUsageByEngine: [],
+});
+
+describe('Agentic point orchestrator metric sources', () => {
+  beforeEach(() => {
+    const prefill = sourceSeries(
+      {
+        id: 'dynamo|prefill|10.30.1.56:7500|prefill-a|0|0',
+        adapter: 'dynamo',
+        role: 'prefill',
+        endpointUrl: '10.30.1.56:7500',
+        nativeRole: 'prefill',
+        workerId: 'prefill-a',
+        dpRank: '0',
+        engine: '0',
+      },
+      100,
+      1,
+    );
+    const decode = sourceSeries(
+      {
+        id: 'dynamo|decode|10.30.1.206:7516|decode-a|0|0',
+        adapter: 'dynamo',
+        role: 'decode',
+        endpointUrl: '10.30.1.206:7516',
+        nativeRole: 'backend',
+        workerId: 'decode-a',
+        dpRank: '0',
+        engine: '0',
+      },
+      300,
+      400,
+    );
+    cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} });
+    cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/request-timeline*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/trace-server-metrics*', {
+      body: {
+        meta: pointMeta,
+        startNs: 0,
+        endNs: 2_000_000_000,
+        durationS: 2,
+        timeslicesCount: 2,
+        kvCacheUsage: prefill.kvCacheUsage,
+        prefixCacheHitRate: prefill.prefixCacheHitRate,
+        queueDepth: prefill.queueDepth,
+        promptTokensBySource: prefill.promptTokensBySource,
+        prefillTps: prefill.promptTps,
+        decodeTps: decode.generationTps,
+        prefixCacheHitsTps: prefill.prefixCacheHitsTps,
+        hostKvCacheUsage: [],
+        kvCacheUsageByEngine: [],
+        metricSources: [prefill, decode],
+      },
+    });
+    cy.visit('/inference/agentic/206885');
+  });
+
+  it('switches every server chart to an orchestrator-normalized worker', () => {
+    cy.get('[data-testid="metric-source-toolbar"]')
+      .should('have.css', 'position', 'sticky')
+      .and('have.css', 'top', '64px');
+    cy.get('[data-testid="metric-source-select"]').should('contain.text', 'All endpoints').click();
+    cy.contains('[role="option"]', 'Decode · decode-a').click();
+
+    cy.get('[data-testid="metric-source-select"]').should('contain.text', 'Decode · decode-a');
+    cy.contains('h2', 'Throughput · Decode · decode-a').should('be.visible');
+    cy.contains('svg', 'Decode (avg n=50)').should('be.visible');
+
+    cy.get('[data-testid="metric-source-select"]').click();
+    cy.contains('[role="option"]', 'Prefill · prefill-a').click();
+    cy.contains('h2', 'Throughput · Prefill · prefill-a').should('be.visible');
+  });
+
+  it('toggles input and decode independently while keeping one visible', () => {
+    cy.get('[data-testid="throughput-series-input"]')
+      .should('have.attr', 'aria-pressed', 'true')
+      .and('not.be.disabled');
+    cy.get('[data-testid="throughput-series-decode"]')
+      .should('have.attr', 'aria-pressed', 'true')
+      .and('not.be.disabled');
+    cy.contains('svg', 'Input (avg n=50)').should('be.visible');
+    cy.contains('svg', 'Decode (avg n=50)').should('be.visible');
+    cy.contains('svg', 'Total running avg (60s burn-in)').should('be.visible');
+
+    cy.get('[data-testid="throughput-series-input"]').click();
+    cy.get('[data-testid="throughput-series-input"]').should('have.attr', 'aria-pressed', 'false');
+    cy.get('[data-testid="throughput-series-decode"]').should('be.disabled');
+    cy.contains('svg', 'Input (avg n=50)').should('not.exist');
+    cy.contains('svg', 'Total running avg (60s burn-in)').should('not.exist');
+
+    cy.get('[data-testid="throughput-series-input"]').click();
+    cy.get('[data-testid="throughput-series-decode"]').click();
+    cy.get('[data-testid="throughput-series-input"]').should('be.disabled');
+    cy.get('[data-testid="throughput-series-decode"]').should('have.attr', 'aria-pressed', 'false');
+  });
+});
diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts
new file mode 100644
index 00000000..6ce4bc34
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts
@@ -0,0 +1,133 @@
+const distribution = (values: {
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+  max: number;
+}) => ({
+  bins: [
+    { x0: 0, x1: 10, count: 5 },
+    { x0: 10, x1: 100, count: 15 },
+  ],
+  stats: {
+    count: 20,
+    min: 0,
+    mean: 40,
+    ...values,
+  },
+});
+
+describe('Dataset distribution percentiles', () => {
+  before(() => {
+    cy.intercept('GET', '/api/v1/datasets/test-dataset', {
+      body: {
+        id: 'test-dataset',
+        slug: 'test-dataset',
+        label: 'Test dataset',
+        variant: 'full',
+        description: null,
+        hf_url: null,
+        license: 'apache-2.0',
+        conversation_count: 1,
+        summary: {
+          mainTurns: 20,
+          subagentGroups: 0,
+          subagentTurns: 0,
+          medianRequestsPerConversation: 12,
+          meanRequestsPerConversation: 14.6,
+          medianSubagentsPerTrace: 3,
+          meanSubagentsPerTrace: 4.8,
+          cachedPct: 0.5,
+          totalIn: 1000,
+          totalOut: 200,
+        },
+        chart_data: {
+          version: 2,
+          inputTokensPerTurn: distribution({
+            median: 100,
+            p75: 200,
+            p90: 300,
+            p95: 400,
+            max: 500,
+          }),
+          outputTokensPerTurn: distribution({
+            median: 10,
+            p75: 20,
+            p90: 30,
+            p95: 40,
+            max: 50,
+          }),
+          uncachedInputTokensPerTurn: distribution({
+            median: 0,
+            p75: 64,
+            p90: 128,
+            p95: 256,
+            max: 512,
+          }),
+          subagentInputTokensPerRequest: distribution({
+            median: 1000,
+            p75: 2000,
+            p90: 3000,
+            p95: 4000,
+            max: 5000,
+          }),
+          subagentOutputTokensPerRequest: distribution({
+            median: 100,
+            p75: 200,
+            p90: 300,
+            p95: 400,
+            max: 500,
+          }),
+        },
+        ingested_at: '2026-06-23T00:00:00Z',
+      },
+    });
+    cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', {
+      body: { total: 0, items: [] },
+    });
+    cy.visit('/datasets/test-dataset');
+  });
+
+  it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => {
+    const expected = [
+      ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+      ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']],
+      ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']],
+    ] as const;
+
+    for (const [title, percentiles] of expected) {
+      cy.contains('[data-slot="card"]', title).within(() => {
+        for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+        cy.get('svg line[stroke="#3b82f6"]').should('exist');
+        cy.get('svg line[stroke="#22c55e"]').should('exist');
+        cy.get('svg line[stroke="#f59e0b"]').should('exist');
+        cy.get('svg line[stroke="#ef4444"]').should('exist');
+      });
+    }
+  });
+
+  it('shows median and mean model requests per conversation', () => {
+    cy.contains('dt', 'Median requests / convo').next('dd').should('have.text', '12');
+    cy.contains('dt', 'Mean requests / convo').next('dd').should('have.text', '14.6');
+  });
+
+  it('summarizes subagents per trace instead of charting group counts', () => {
+    cy.contains('dt', 'Median subagents / trace').next('dd').should('have.text', '3');
+    cy.contains('dt', 'Mean subagents / trace').next('dd').should('have.text', '4.8');
+    cy.contains('Subagent groups per conversation').should('not.exist');
+  });
+
+  it('shows ISL and OSL distributions for inner subagent requests only', () => {
+    const expected = [
+      ['Subagent request ISL', ['p50 1.0k', 'p75 2.0k', 'p90 3.0k', 'p95 4.0k']],
+      ['Subagent request OSL', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+    ] as const;
+
+    for (const [title, percentiles] of expected) {
+      cy.contains('[data-slot="card"]', title).within(() => {
+        cy.contains('Inner subagent requests only').should('be.visible');
+        for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+      });
+    }
+  });
+});
diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
new file mode 100644
index 00000000..58d95c27
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
@@ -0,0 +1,127 @@
+describe('Dataset conversation flamegraph timing', () => {
+  before(() => {
+    cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', {
+      body: {
+        conv_id: 'conversation-1',
+        models: ['model-a'],
+        num_turns: 2,
+        num_subagent_groups: 1,
+        total_in: 1000,
+        total_out: 100,
+        total_cached: 500,
+        structure: {
+          blockSize: 64,
+          totals: {
+            in: 1000,
+            out: 100,
+            cached: 500,
+            uncached: 500,
+            numTurns: 2,
+            numSubagentGroups: 1,
+          },
+          nodes: [
+            {
+              kind: 'turn',
+              turnIndex: 0,
+              startS: 0,
+              endS: 1.2,
+              model: 'model-a',
+              in: 100,
+              out: 10,
+              cached: 0,
+              uncached: 100,
+            },
+            {
+              kind: 'subagent',
+              label: 'Explore',
+              agentId: 'agent-1',
+              startS: 3661.2,
+              endS: 3782.6,
+              durationMs: 121_400,
+              in: 800,
+              out: 80,
+              cached: 500,
+              uncached: 300,
+              children: [
+                {
+                  kind: 'turn',
+                  turnIndex: 1,
+                  startS: 3661.2,
+                  endS: 3668.2,
+                  model: 'model-a',
+                  in: 300,
+                  out: 30,
+                  cached: 150,
+                  uncached: 150,
+                },
+                {
+                  kind: 'turn',
+                  turnIndex: 2,
+                  startS: 3665.2,
+                  endS: 3671.2,
+                  model: 'model-a',
+                  in: 300,
+                  out: 30,
+                  cached: 200,
+                  uncached: 100,
+                },
+                {
+                  kind: 'turn',
+                  turnIndex: 3,
+                  startS: 3670.2,
+                  endS: 3675.2,
+                  model: 'model-a',
+                  in: 200,
+                  out: 20,
+                  cached: 150,
+                  uncached: 50,
+                },
+              ],
+            },
+            {
+              kind: 'turn',
+              turnIndex: 2,
+              startS: 65.4,
+              endS: 67.4,
+              model: 'model-a',
+              in: 100,
+              out: 10,
+              cached: 0,
+              uncached: 100,
+            },
+          ],
+        },
+      },
+    });
+    cy.visit('/datasets/test-dataset/conversations/conversation-1');
+  });
+
+  it('shows turn offsets and a collapsed subagent time range', () => {
+    cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00–00:01');
+    cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05–01:07');
+    cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03');
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist');
+  });
+
+  it('shows subturn offsets when the subagent group is expanded', () => {
+    cy.contains('button', 'Explore').click();
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01–1:01:08');
+    // Parallel groups render as left-gutter brackets; each member row carries
+    // one bracket segment per group it belongs to (non-transitive chains keep
+    // their own segments/lanes).
+    cy.get('[data-testid="flamegraph-overlap-g-1-c-0"]')
+      .should('have.length', 1)
+      .and('have.attr', 'data-overlap-group', 'subagent-1-1');
+    cy.get('[data-testid="flamegraph-overlap-g-1-c-1"]')
+      .should('have.length', 2)
+      .then(($segs) => {
+        expect([...$segs].map((seg) => seg.dataset.overlapGroup).toSorted()).to.deep.equal([
+          'subagent-1-1',
+          'subagent-1-2',
+        ]);
+      });
+    cy.get('[data-testid="flamegraph-overlap-g-1-c-2"]')
+      .should('have.length', 1)
+      .and('have.attr', 'data-overlap-group', 'subagent-1-2');
+  });
+});
diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts
index 34d95ec3..93658af0 100644
--- a/packages/app/cypress/e2e/dropdown-switching.cy.ts
+++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts
@@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => {
     cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true');
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
 
-    cy.get('[data-testid="sequence-selector"]').click();
+    cy.get('[data-testid="scenario-selector"]').click();
 
     cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false');
-    cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true');
+    cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true');
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
   });
 
diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
new file mode 100644
index 00000000..d574dd2a
--- /dev/null
+++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
@@ -0,0 +1,54 @@
+describe('GPU comparison agentic point detail', () => {
+  it('exposes the per-point charts as a normal browser link', () => {
+    cy.intercept('GET', '/api/v1/trace-availability*', (request) => {
+      const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? [];
+      if (ids.length < 20) request.alias = 'gpuTraceAvailability';
+      request.continue();
+    });
+
+    cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', {
+      onBeforeLoad(win) {
+        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+      },
+    });
+
+    cy.get('[data-testid="gpu-multiselect"] [role="combobox"]').click({ force: true });
+    cy.get('[role="option"]').first().click();
+    cy.contains('button', 'Select date range').click();
+    cy.get('body').then(($body) => {
+      if ($body.text().includes('View anyway')) {
+        cy.contains('button', 'View anyway').click();
+      } else {
+        cy.contains('button', 'Max Range').click();
+        cy.contains('button', 'Apply').click();
+      }
+    });
+
+    cy.get('[data-testid="gpu-graph"]').first().should('be.visible');
+    cy.wait('@gpuTraceAvailability');
+    cy.wait(100);
+    cy.get('[data-testid="gpu-graph"]')
+      .first()
+      .find('svg .dot-group')
+      .should('have.length.greaterThan', 0)
+      .first()
+      .then(($point) => {
+        const point = $point[0] as unknown as SVGElement & {
+          __data__: { benchmark_type?: string; id?: number };
+        };
+        expect(point.__data__.benchmark_type).to.equal('agentic_traces');
+        expect(point.__data__.id).to.be.a('number');
+        cy.wrap($point).find('.visible-shape').click({ force: true });
+      });
+
+    cy.get('[data-chart-tooltip]:visible').should('have.length', 1);
+    cy.get('[data-chart-tooltip]:visible [data-action="view-charts"]')
+      .should('be.visible')
+      .then(($link) => {
+        expect($link).to.match('a');
+        expect($link).not.to.have.attr('target');
+        expect($link.attr('href')).to.match(/^\/inference\/agentic\/\d+$/u);
+      });
+    cy.location('pathname').should('eq', '/inference');
+  });
+});
diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts
index 333baa6d..a0753e90 100644
--- a/packages/app/cypress/e2e/gradient-labels.cy.ts
+++ b/packages/app/cypress/e2e/gradient-labels.cy.ts
@@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => {
     cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels');
   });
 
-  it('Parallelism Labels toggle is off by default', () => {
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked');
+  it('Parallelism Labels toggle is on by default', () => {
+    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
   });
 
   it('per-point labels are visible by default (gradient labels off)', () => {
@@ -60,21 +60,19 @@ describe('Gradient Labels Toggle', () => {
   });
 
   it('both toggles can be enabled simultaneously', () => {
-    // Turn on Gradient Labels (off by default)
+    // Parallelism Labels is on by default; ensure it's on, then turn on Gradient.
+    cy.get('#scatter-parallelism-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     cy.get('#scatter-gradient-labels').click();
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
 
-    // Turn on Parallelism Labels
-    cy.get('#scatter-parallelism-labels').click();
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
-
     // Both should be checked
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
     cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
 
-    // Reset for next tests
+    // Reset gradient for next tests (parallelism stays at its default-on).
     cy.get('#scatter-gradient-labels').click();
-    cy.get('#scatter-parallelism-labels').click();
   });
 
   it('URL param i_gradlabel=1 enables gradient labels on load', () => {
diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts
index f0a70a56..55b0e274 100644
--- a/packages/app/cypress/e2e/historical-trends.cy.ts
+++ b/packages/app/cypress/e2e/historical-trends.cy.ts
@@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => {
       delete doc.body.dataset.scrollLocked;
       doc.body.style.removeProperty('pointer-events');
     });
-    cy.get('[data-testid="sequence-selector"]').should('be.visible');
-    cy.get('[data-testid="sequence-selector"]').click();
+    cy.get('[data-testid="scenario-selector"]').should('be.visible');
+    cy.get('[data-testid="scenario-selector"]').click();
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
     cy.get('body').type('{esc}');
   });
diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts
index 84e655f8..23b372df 100644
--- a/packages/app/cypress/e2e/line-labels.cy.ts
+++ b/packages/app/cypress/e2e/line-labels.cy.ts
@@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => {
     cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels');
   });
 
-  it('Line Labels toggle is on by default', () => {
-    cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
-
-    // Line labels render without any interaction
-    cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
-  });
-
-  it('toggling Line Labels off then back on removes and restores label elements', () => {
-    // On by default — turn it off first.
-    cy.get('#scatter-line-labels').click();
+  it('Line Labels toggle is off by default', () => {
     cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+
+    // No line labels render without interaction
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
+  });
 
-    // Turn it back on — labels return.
+  it('toggling Line Labels on then back off adds and removes label elements', () => {
+    // Off by default — turn it on first.
     cy.get('#scatter-line-labels').click();
     cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
+
+    // Turn it back off — labels disappear.
+    cy.get('#scatter-line-labels').click();
+    cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+    cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
   });
 
   it('line labels have colored background rects and text', () => {
+    // Off by default — ensure on (idempotent; prior test left them off).
+    cy.get('#scatter-line-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     // Each line label group should contain a background rect and text
     cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should(
       'have.length.greaterThan',
@@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => {
   });
 
   it('line labels render in the foreground, after the scatter points', () => {
-    // Labels were toggled on in the test above and remain on here.
+    // Off by default — ensure on (idempotent; previous test leaves them on).
+    cy.get('#scatter-line-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
 
     cy.get('[data-testid="scatter-graph"] svg').then(($svg) => {
diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index e17a4aff..924ff9a9 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -1,46 +1,90 @@
-describe('TTFT X-Axis Toggle (E2E chart)', () => {
+const interceptDerivedMetrics = () => {
+  cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => {
+    const ids = new URL(request.url).searchParams.get('ids')?.split(',').filter(Boolean) ?? [];
+    request.reply({
+      body: Object.fromEntries(
+        ids.map((id, index) => [
+          id,
+          {
+            id: Number(id),
+            normalized_session_time_s: 60 + index,
+            p90_prefill_tps_per_user: 100 + index,
+            p75_normalized_e2e_400_s: 8 + index,
+            p90_normalized_e2e_400_s: 12 + index,
+          },
+        ]),
+      ),
+    });
+  }).as('derivedAgenticMetrics');
+};
+
+describe('X-Axis Mode Toggle (inference chart)', () => {
   before(() => {
-    cy.window().then((win) => {
-      win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+    cy.visit('/inference', {
+      onBeforeLoad(win) {
+        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+      },
     });
-    cy.visit('/inference');
-    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2);
+    cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
+    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
   });
 
-  it('shows the x-axis dropdown in the e2e chart heading', () => {
-    cy.get('[data-testid="chart-figure"]')
-      .eq(1)
-      .find('h2 button')
-      .should('contain.text', 'vs.')
-      .and('contain.text', 'Latency');
+  it('shows Interactivity by default for the agentic view', () => {
+    cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces');
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-interactivity"]')
+      .should('be.visible')
+      .and('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 
-  it('opens popover with three x-axis options', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').within(() => {
-      cy.contains('End-to-end Latency').should('exist');
-      cy.contains('P99 TTFT').should('exist');
-      cy.contains('Median TTFT').should('exist');
-    });
+  it('switches the x-axis to TTFT and updates the heading', () => {
+    cy.get('[data-testid="x-axis-mode-ttft"]').click();
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token');
   });
 
-  it('switches x-axis to P99 TTFT and updates the heading', () => {
-    cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click();
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT');
+  it('switches the x-axis to E2E Latency and updates the heading', () => {
+    cy.get('[data-testid="x-axis-mode-e2e"]').click();
+    cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency');
   });
 
-  it('switches x-axis to Median TTFT and updates the heading', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').contains('Median TTFT').click();
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT');
+  it('switches to request-level normalized E2E at 400 output tokens', () => {
+    interceptDerivedMetrics();
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click();
+    cy.wait('@derivedAgenticMetrics');
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should(
+      'have.attr',
+      'aria-selected',
+      'true',
+    );
+    cy.get('[data-testid="chart-figure"] h2').should(
+      'contain.text',
+      'P90 Normalized E2E @ 400 output tokens',
+    );
+    cy.get('[data-testid="chart-figure"] svg').should(
+      'contain.text',
+      'P90 Normalized E2E @ 400 output tokens (s)',
+    );
+
+    cy.get('[data-testid="percentile-selector"]').click();
+    cy.contains('[role="option"]', 'p75').click();
+    cy.get('[data-testid="chart-figure"] h2').should(
+      'contain.text',
+      'P75 Normalized E2E @ 400 output tokens',
+    );
   });
 
-  it('switches back to End-to-end Latency', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click();
-    cy.get('[data-testid="chart-figure"]')
-      .eq(1)
-      .find('h2')
-      .should('contain.text', 'End-to-end Latency');
+  it('switches back to Interactivity', () => {
+    cy.get('[data-testid="x-axis-mode-interactivity"]').click();
+    cy.get('[data-testid="x-axis-mode-interactivity"]').should(
+      'have.attr',
+      'aria-selected',
+      'true',
+    );
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 });
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 33282b9c..927aee5f 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => {
 };
 
 const assertNoHydrationMismatch = () => {
-  cy.get('[data-testid="sequence-selector"]').should('be.visible');
+  cy.get('[data-testid="scenario-selector"]').should('be.visible');
   cy.get('@consoleError').then((spy) => {
     const calls = (spy as unknown as { args: unknown[][] }).args;
     const hydration = calls.filter((args) =>
@@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => {
 
     it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => {
       visitWithErrorSpy('/inference?i_seq=1k/1k');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       assertNoHydrationMismatch();
     });
 
@@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => {
       // Visit the canonical model-prefixed slug so the assertion is directly
       // about the rendered page, not about a bare-slug redirect interleaving.
       visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       assertNoHydrationMismatch();
     });
 
     it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => {
       visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk');
-      cy.get('[data-testid="sequence-selector"]')
+      cy.get('[data-testid="scenario-selector"]')
         .invoke('text')
         .should('not.contain', 'junk')
         .and('match', /[18]K . [18]K/u);
@@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => {
       // `effectivePrecisions` intersects the selection with available precisions
       // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported.
       visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek');
       cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8');
       assertNoHydrationMismatch();
@@ -236,9 +236,15 @@ describe('URL Parameter Persistence', () => {
   });
 
   describe('High contrast mode', () => {
-    it('page loads without high contrast by default', () => {
+    it('inference loads with high contrast on by default', () => {
       visitWithDismissedModal('/inference');
       cy.get('[data-testid="scatter-graph"]').should('exist');
+      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked');
+    });
+
+    it('i_hc=0 disables high contrast on load', () => {
+      visitWithDismissedModal('/inference?i_hc=0');
+      cy.get('[data-testid="scatter-graph"]').should('exist');
       cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
     });
 
@@ -267,10 +273,12 @@ describe('URL Parameter Persistence', () => {
       cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
-    it('historical trends tab has high contrast switch off by default', () => {
+    it('historical trends tab shares the inference high-contrast default (on)', () => {
+      // Historical reads highContrast from the same InferenceContext as the
+      // scatter chart, so it inherits the default-on behavior.
       visitWithDismissedModal('/historical');
       cy.get('[data-testid="historical-trends-display"]').should('exist');
-      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
+      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
     it('i_hc=1 enables historical trends high contrast', () => {
diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index bcdfe21b..b2164bcc 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,10 +189,14 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
+    selectedPercentile: 'p90',
+    setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
+    selectedXAxisMode: 'interactivity' as const,
+    setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
     quickFilters: { vendors: [], frameworks: [], disagg: [], spec: [] },

From 3efd6b87e475d67339b69e60ef6c13f1620e289d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:21:14 -0500
Subject: [PATCH 11/40] chore: drop completed one-shot backfills and
 investigation doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

backfill-agentic-intvty, backfill-agentic-server-logs, and
backfill-kv-pool were one-time data repairs whose fixes now run inline
during ingest; all existing rows have been repaired. The version-driven
backfills (chart-series, request-timeline, aggregate-stats,
dataset-stats) remain — they re-materialize stored payloads whenever a
version constant bumps.
---
 .../kv-cache-hit-rate-anomaly.md              | 113 ---------
 packages/app/src/lib/benchmark-transform.ts   |   2 +-
 packages/db/package.json                      |   3 -
 packages/db/src/backfill-agentic-intvty.ts    | 107 ---------
 .../db/src/backfill-agentic-server-logs.ts    | 215 ------------------
 packages/db/src/backfill-kv-pool.ts           | 103 ---------
 packages/db/src/etl/benchmark-mapper.ts       |   2 +-
 packages/db/src/lib/github-artifacts.ts       |   2 +-
 8 files changed, 3 insertions(+), 544 deletions(-)
 delete mode 100644 docs/investigations/kv-cache-hit-rate-anomaly.md
 delete mode 100644 packages/db/src/backfill-agentic-intvty.ts
 delete mode 100644 packages/db/src/backfill-agentic-server-logs.ts
 delete mode 100644 packages/db/src/backfill-kv-pool.ts

diff --git a/docs/investigations/kv-cache-hit-rate-anomaly.md b/docs/investigations/kv-cache-hit-rate-anomaly.md
deleted file mode 100644
index 61ffee42..00000000
--- a/docs/investigations/kv-cache-hit-rate-anomaly.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# KV cache hit-rate anomaly on agentic benchmarks (dsv4, b200, vllm)
-
-## Core issue
-
-vLLM's prefix cache should be hitting at ~98% on multi-turn agentic conversation replay (each turn extends the prior turn's context). It isn't. Something in the **dataset definition** or **aiperf replay** is producing requests whose token streams aren't actually prefix-compatible turn-to-turn.
-
-| Concurrency | Theoretical max hit % | vLLM actual hit % |
-| ----------: | --------------------: | ----------------: |
-|           1 |                97.45% |            83.15% |
-|           2 |                98.34% |            46.78% |
-|           4 |                97.99% |            12.43% |
-
-This is **not** a capacity problem. KV cache is sized at 3.29M tokens (12,868 blocks × 256). The conc=4 workload's unique-content footprint is **~1.11M DSV4 tokens** — would fit in ~34% util. Observed peak util is 49.8%, so the cache is holding more blocks than the workload needs, yet vLLM can't find them on lookup.
-
-## Data sources
-
-- **Benchmark points**:
-  - http://localhost:3002/inference/agentic/206252 (conc=1)
-  - http://localhost:3002/inference/agentic/206245 (conc=2)
-  - http://localhost:3002/inference/agentic/206247 (conc=4)
-- **Neon DB**: project `silent-pond-29172997`, branch `br-cold-sky-ai0c09cy` (agentx-dev). Connection via `DATABASE_WRITE_URL` in `.env`. Console: https://console.neon.tech/app/projects/silent-pond-29172997/branches/br-cold-sky-ai0c09cy
-  - `agentic_trace_replay.profile_export_jsonl_gz` — gzipped aiperf per-request records
-  - `agentic_trace_replay.server_metrics_json_gz` — gzipped vllm per-scrape prometheus metrics
-  - `agentic_trace_replay.request_timeline` (jsonb) — pre-computed per-request timeline used by the simulation
-- **Trace replay dataset** (the source-of-truth for "what should be cacheable"): https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-051926. Each row has pre-computed 64-token block `hash_ids` per turn; `hash_id_scope: 'local'` (per-conversation).
-
-## Theoretical max simulation
-
-For each replayed request, look up the matching turn in the HF dataset and walk a per-conversation trie of 64-token block hash IDs. Hits = longest contiguous prefix from block 0 that has appeared in any prior request (mirrors vLLM's chained-hash semantics).
-
-Confirms: the workload IS prefix-cacheable end-to-end. Theoretical max ≈ 98% across all three concurrency levels — same dataset, same conversations, just different dispatch order.
-
-## Why this points at the dataset/replay, not vLLM
-
-- **Capacity is not the bottleneck.** Cache holds ~3× the unique content of the workload. Cache util tops out below capacity.
-- **The metric isn't lying.** vLLM's own counters cross-check: `prefill_kv_computed_tokens + prefix_cache_hits ≈ request_prompt_tokens` (67.85M + 9.61M ≈ 77.47M for conc=4).
-- **It's not a tokenizer artifact.** DSV4 tokens are ~54% the count of Claude tokens, but BPE is left-monotonic on stable text — hit-rate ratio is invariant to tokenizer choice for prefix-growth workloads.
-- **It's not the multi-engine DP bug** we found earlier (commit `f2618f4`) — this deployment has 1 engine.
-
-What's left: the bytes that vLLM actually receives turn-to-turn are not the same prefix + delta that the dataset's `hash_ids` describe. Most likely culprits:
-
-1. **aiperf isn't sending the cumulative chat history** the way the dataset assumes — each turn is being assembled differently than the previous, breaking the byte-level prefix.
-2. **Something in the request payload varies per request** (timestamps, request IDs, tool result serialization order, etc.) — invalidates block 0's hash, cascades to every subsequent block via vLLM's chained hashing.
-3. **BPE re-merging across message boundaries** when aiperf re-tokenizes the full history each turn instead of appending tokens.
-
-## Root cause: `ConversationReconstructor` strips the prev user's `partial_tail` every turn
-
-The bug is in `utils/aiperf/src/aiperf/dataset/loader/weka_synth_buf.py` — specifically the **boundary case** in `truncate_synth_buf_at_block` (line 453–464) combined with `turn_delta`'s reset logic (line 354–360).
-
-What happens turn-to-turn:
-
-1. `init_turn_0` builds a trailing user segment whose `tokens` = `[block_aligned_tokens] + [partial_tail_tokens]` where `partial_tail_n = in_tokens % bs`. The wire prompt for turn 0 includes these tail tokens.
-2. `advance_turn` computes `lcp = longest_common_prefix(prev_hash_ids, curr_hash_ids)`. When the LCP equals the prev turn's total block count (the normal append-only case), `truncate_synth_buf_at_block` hits its boundary branch: `cursor + seg.block_count == target_blocks`.
-3. That branch **strips `prev_partial_tail` tokens off the trailing user segment in place** and re-decodes its `content`. This sets `_last_disturbance_at = i` (the index of the prev trailing user segment).
-4. New `assistant` + `user` segments are appended.
-5. `turn_delta` sees `_last_disturbance_at < _emitted_segment_count` and forces `reset_context=True`, re-emitting **the whole conversation** with the now-stripped trailing user.
-
-The endpoint (`utils/aiperf/src/aiperf/endpoints/base_endpoint.py:110-140`) honors `reset_context=True` via `messages = list(turn.raw_messages)` instead of `messages.extend(...)`.
-
-Result: every turn sends the full chat history, but the bytes of the prev user message differ from what was sent the turn before — the trailing `partial_tail` chars are missing. vLLM tokenizes the new prompt, hashes 256-token blocks, and the chained-hash invariant breaks at the first block containing the trimmed boundary. That block + every subsequent block of the new turn miss the cache.
-
-### Empirical confirmation
-
-Reproducer at `/tmp/test-reconstructor.py` instantiates `ConversationReconstructor` with mock decoders and walks a synthetic 3-turn conversation:
-
-```
-=== Turn 0 ===
-  delta msgs: 2, reset=False
-  wire len: 21683
-
-=== Turn 1 ===
-  delta msgs: 4, reset=True            ← every turn resets
-  wire len: 25307
-
-=== DIFF turn 0 vs turn 1 (wire-level) ===
-  common prefix chars: 21549 / wire0 21683 (99.4%)
-  wire0[...] = '... 983406 12 1 133 184 16 57 71 155 37 '     ← partial_tail decoded
-  wire1[...] = '... 983406<|im_end|>\n<|im_start|>assista'    ← stripped, template marker next
-  turn0 user content len: 19812, turn1 user[0] content len: 19711   ← 101 chars stripped
-```
-
-Across the conc=1 run (point 206252), **280/280 (100%)** consecutive turn-pairs have `prev_in_tokens % bs != 0` — i.e., every single turn hits this boundary disturbance.
-
-### Why the gap widens with concurrency
-
-At conc=1 the gap (97.45% − 83.15% = 14pp) is roughly the fraction of each turn's blocks lost to the trimmed-tail invalidation (last user block + chat-template delta). At higher conc:
-
-- `reset_context=True` makes every request re-send the **entire** conversation prompt, so wire bandwidth + prefill work scale superlinearly per turn.
-- Concurrent conversations all do this simultaneously; each writes long sequences of "new" blocks past their respective divergence points, evicting other conversations' usable prefix blocks even though aggregate unique content (1.11M tokens) fits comfortably in the 3.29M-token cache.
-
-### Fix sketch
-
-The boundary-cut strip exists to keep the next turn's `assistant` segment block-aligned. Two viable fixes:
-
-1. **Don't mutate the prev trailing user segment.** Leave its `partial_tail` tokens intact; append the new asst+user as strict-append (no reset_context). The wire-prefix becomes byte-stable turn-to-turn. Cost: the new asst content's block_start no longer aligns to the prev_hash_ids tail, so hash_id accounting for asst blocks loses 1 block of fidelity per turn.
-2. **Track `partial_tail` separately** from the prev user segment so the segment's emitted content stays byte-stable, and only the trailing tail (which is regenerated each turn anyway) is allowed to vary.
-
-Option 1 is the minimal change. Validate with the reproducer above — remove the strip in `truncate_synth_buf_at_block`'s boundary case and re-run; turn N+1's wire prefix should equal turn N's wire byte-for-byte up to the end of the prev assistant template.
-
-## Re-running the simulation
-
-```bash
-# 1. dump request timelines from DB
-pnpm --filter @semianalysisai/inferencex-db exec dotenv -e ../../.env -- tsx /tmp/dump-rt-multi.ts
-
-# 2. run analysis (needs `pip3 install --break-system-packages --user datasets`)
-python3 /tmp/cache-sim-multi.py
-
-# 3. reproduce the partial_tail strip
-python3 /tmp/test-reconstructor.py
-```
-
-Scripts live in `/tmp/` from this session; recreate from inline code in the previous version of this doc if missing.
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index cb8e3ceb..df1d328e 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -26,7 +26,7 @@ import type { BenchmarkRow } from '@/lib/api';
  * itl, overriding any artifact-supplied value: the harness definition of
  * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile
  * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. This
- * matches the ingest mapper + backfill-agentic-intvty for official rows; doing it
+ * matches the ingest mapper for official rows; doing it
  * here keeps overlay / `?unofficialrun=` rows (transformed live from raw
  * artifacts, never through the DB) on the same definition.
  */
diff --git a/packages/db/package.json b/packages/db/package.json
index c7836df4..2c8dc067 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,12 +19,9 @@
     "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
-    "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
-    "db:backfill-agentic-server-logs": "dotenv -e ../../.env -- tsx src/backfill-agentic-server-logs.ts",
     "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts",
-    "db:backfill-kv-pool": "dotenv -e ../../.env -- tsx src/backfill-kv-pool.ts",
     "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
diff --git a/packages/db/src/backfill-agentic-intvty.ts b/packages/db/src/backfill-agentic-intvty.ts
deleted file mode 100644
index a8eebdba..00000000
--- a/packages/db/src/backfill-agentic-intvty.ts
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Backfill: enforce the slow-tail interactivity invariant on agentic rows.
- *
- * Agentic trace-replay artifacts emit both `*_itl` and `*_intvty`. Historically
- * the harness wrote `*_intvty = 1/p(ITL)` (slow-tail — "interactivity at the
- * p-th latency"), which is what the inference chart's interactivity selector
- * and the detail time-series both assume. A later "timing fix" harness started
- * emitting `*_intvty = p(1/ITL)` instead (fast-tail — equivalent to
- * `1/p(100-x)(ITL)`), because taking the reciprocal reverses percentile order.
- * Ingest stores every metric verbatim, so those runs landed in the DB with the
- * opposite definition — e.g. p90 reading 23.9 instead of 11.2 for the same
- * point — contaminating cross-run Pareto comparisons.
- *
- * This rewrites `mean/p75/p90/p95 _intvty = 1/_itl` for every agentic row so the
- * stored value always matches the slow-tail definition the charts use. It is
- * idempotent: rows already on the correct definition are left untouched (guarded
- * by a relative-deviation check). `std_intvty` is intentionally NOT touched —
- * the reciprocal of a standard deviation is meaningless, and the API strips it.
- * The prior fast-tail value is discarded on purpose (p10_itl isn't stored, so it
- * isn't recoverable anyway, and per project policy fast-tail must not back a
- * slow-tail selector).
- *
- * Usage:
- *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes
- */
-
-import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
-import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js';
-
-// Percentile-style keys whose interactivity is the reciprocal of the matching
-// ITL percentile. `std` is excluded by design (not a reciprocal); `median`/`p99`
-// are absent from agentic artifacts so they never appear here.
-const KEYS = ['mean', 'p75', 'p90', 'p95'] as const;
-
-// Relative tolerance: skip rows already within 1e-6 of 1/itl so correct rows
-// keep their original full-precision value and the change counts are accurate.
-const REL_TOL = 1e-6;
-
-const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
-
-async function contaminationCounts(): Promise<Record<string, number>> {
-  const out: Record<string, number> = {};
-  for (const k of KEYS) {
-    const rows = await sql.unsafe(`
-      SELECT count(*)::int AS n
-      FROM benchmark_results
-      WHERE benchmark_type = 'agentic_traces'
-        AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
-        AND metrics ? '${k}_intvty'
-        AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
-            > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
-    `);
-    out[k] = (rows[0] as unknown as { n: number }).n;
-  }
-  return out;
-}
-
-async function main(): Promise<void> {
-  const total = await sql<{ n: number }[]>`
-    SELECT count(*)::int AS n FROM benchmark_results WHERE benchmark_type = 'agentic_traces'
-  `;
-  console.log(`Agentic rows: ${total[0]!.n}`);
-
-  const before = await contaminationCounts();
-  console.log('Contaminated (intvty != 1/itl) before:', JSON.stringify(before));
-  if (KEYS.every((k) => before[k] === 0)) {
-    console.log('Nothing to backfill — all agentic rows already satisfy intvty = 1/itl.');
-    await sql.end();
-    return;
-  }
-
-  if (!hasYesFlag() && !(await confirm('Rewrite *_intvty = 1/*_itl for these rows? (y/N) '))) {
-    await sql.end();
-    return;
-  }
-
-  let totalUpdated = 0;
-  for (const k of KEYS) {
-    // keys are from a fixed trusted const — safe to interpolate.
-    const res = await sql.unsafe(`
-      UPDATE benchmark_results
-      SET metrics = jsonb_set(metrics, '{${k}_intvty}', to_jsonb(1.0 / (metrics->>'${k}_itl')::numeric))
-      WHERE benchmark_type = 'agentic_traces'
-        AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
-        AND metrics ? '${k}_intvty'
-        AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
-            > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
-    `);
-    console.log(`  ${k}_intvty: updated ${res.count} row(s)`);
-    totalUpdated += res.count;
-  }
-
-  const after = await contaminationCounts();
-  console.log('Contaminated after:', JSON.stringify(after));
-  if (!KEYS.every((k) => after[k] === 0)) {
-    throw new Error('Backfill incomplete — some rows still deviate. Aborting before MV refresh.');
-  }
-
-  await refreshLatestBenchmarks(sql);
-  console.log(`Done. Rewrote ${totalUpdated} metric value(s) across agentic rows.`);
-  await sql.end();
-}
-
-main().catch((error) => {
-  console.error(error);
-  process.exit(1);
-});
diff --git a/packages/db/src/backfill-agentic-server-logs.ts b/packages/db/src/backfill-agentic-server-logs.ts
deleted file mode 100644
index 37157861..00000000
--- a/packages/db/src/backfill-agentic-server-logs.ts
+++ /dev/null
@@ -1,215 +0,0 @@
-/**
- * Backfill server logs (and the derived KV-cache pool size) for AGENTIC
- * benchmark points.
- *
- * Agentic runs upload their vLLM server log as a `server_logs_<key>` artifact,
- * but the ingest path historically failed to link it to agentic rows (the
- * `bmk_agentic_<key>` → `server_logs_<key>` key mismatch, now fixed in
- * ingest-ci-run). As a result the agentic server log text was never stored, so
- * `kv_cache_pool_tokens` cannot be derived from the DB — we must re-fetch the
- * artifacts from GitHub.
- *
- * For each agentic workflow run this:
- *   1. lists the run's artifacts and keeps only `server_logs_*` + `bmk_agentic_*`
- *      (dedup by logical name, mirroring ingest's runner-suffix collapse),
- *   2. downloads + unzips just those (small — skips the multi-MB trace dirs),
- *   3. maps each `bmk_agentic_<key>` JSON → config → benchmark_results rows via
- *      the same mapBenchmarkRow/config-cache logic ingest uses,
- *   4. calls insertServerLog(), which stores+links the log AND derives
- *      `kv_cache_pool_tokens` into benchmark_results.metrics.
- *
- * Idempotent: insertServerLog only links rows whose server_log_id is null.
- *
- * Usage:
- *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-server-logs
- *     [--limit N]   only process the first N workflow runs
- *     [--yes]       skip the confirmation prompt
- */
-
-import fs from 'node:fs';
-import os from 'node:os';
-import path from 'node:path';
-
-import { hasNoSslFlag } from './cli-utils';
-import { insertServerLog } from './etl/benchmark-ingest';
-import { mapBenchmarkRow } from './etl/benchmark-mapper';
-import { createConfigCache } from './etl/config-cache';
-import { createAdminSql } from './etl/db-utils';
-import { createSkipTracker } from './etl/skip-tracker';
-import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner';
-import {
-  RUNNER_SUFFIX_RE,
-  dedupeArtifactsByLogicalName,
-  downloadArtifact,
-  listRunArtifacts,
-  type ArtifactMeta,
-} from './lib/github-artifacts';
-
-const REPO = 'SemiAnalysisAI/InferenceX';
-
-const flags = parseLimitForceFlags();
-const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
-
-/**
- * List the run's `server_logs_*` / `bmk_agentic_*` artifacts, deduped by
- * runner-suffix-stripped logical name (matches ingest's collapse).
- */
-function listArtifacts(githubRunId: string): Map<string, ArtifactMeta> {
-  return dedupeArtifactsByLogicalName(
-    listRunArtifacts(REPO, githubRunId).filter(
-      (a) => a.name.startsWith('server_logs_') || a.name.startsWith('bmk_agentic_'),
-    ),
-  );
-}
-
-/** Logical key shared by a server_logs_/bmk_agentic_ artifact pair. */
-function logicalKey(name: string): string {
-  return name
-    .replace(/^server_logs_/u, '')
-    .replace(/^bmk_agentic_/u, '')
-    .replace(RUNNER_SUFFIX_RE, '');
-}
-
-/**
- * Read up to `maxBytes` of a (possibly huge) server log as UTF-8, stripping NUL
- * bytes. vLLM's "GPU KV cache size" startup lines are near the top, so a head
- * read is enough to derive the KV pool — and it caps storage for the rare
- * multi-hundred-MB logs that exceed V8's ~512 MB string limit.
- */
-const stripNul = (s: string): string => s.replaceAll(String.fromCodePoint(0), '');
-
-function readServerLogCapped(p: string, maxBytes = 64 * 1024 * 1024): string {
-  if (fs.statSync(p).size <= maxBytes) return stripNul(fs.readFileSync(p, 'utf8'));
-  const fd = fs.openSync(p, 'r');
-  try {
-    const buf = Buffer.allocUnsafe(maxBytes);
-    const n = fs.readSync(fd, buf, 0, maxBytes, 0);
-    return stripNul(buf.subarray(0, n).toString('utf8'));
-  } finally {
-    fs.closeSync(fd);
-  }
-}
-
-function findJsonFiles(dir: string): string[] {
-  const out: string[] = [];
-  const walk = (d: string) => {
-    for (const e of fs.readdirSync(d, { withFileTypes: true })) {
-      const p = path.join(d, e.name);
-      if (e.isDirectory()) walk(p);
-      else if (e.name.endsWith('.json')) out.push(p);
-    }
-  };
-  walk(dir);
-  return out;
-}
-
-async function main(): Promise<void> {
-  console.log('=== backfill-agentic-server-logs ===');
-  console.log(`  limit = ${flags.limit ?? 'none'}`);
-
-  // Agentic workflow runs that still have unlinked server logs.
-  const runs = await sql<{ github_run_id: string; workflow_run_id: number }[]>`
-    select distinct wr.github_run_id::text as github_run_id, wr.id as workflow_run_id
-    from benchmark_results br
-    join workflow_runs wr on wr.id = br.workflow_run_id
-    where br.benchmark_type = 'agentic_traces'
-      and br.server_log_id is null
-    order by wr.id
-    ${flags.limit ? sql`limit ${flags.limit}` : sql``}
-  `;
-
-  if (runs.length === 0) {
-    console.log('\n  Nothing to do — all agentic rows already have a server log.');
-    return;
-  }
-  if (!(await confirmProceed(`${runs.length} agentic workflow run(s) to process.`))) return;
-
-  const cache = createConfigCache(sql);
-  await cache.preloadConfigs();
-  const tracker = createSkipTracker();
-
-  let linkedRows = 0;
-  let runsOk = 0;
-  let runsFailed = 0;
-  const t0 = Date.now();
-
-  for (const { github_run_id: githubRunId, workflow_run_id: wrId } of runs) {
-    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `kvpool-${githubRunId}-`));
-    try {
-      const artifacts = listArtifacts(githubRunId);
-      // server log path by logical key
-      const serverLogByKey = new Map<string, string>();
-      const bmkDirs: string[] = [];
-      for (const art of artifacts.values()) {
-        const dir = downloadArtifact(art, tmp);
-        if (art.name.startsWith('server_logs_')) {
-          const logPath = path.join(dir, 'server.log');
-          if (fs.existsSync(logPath)) serverLogByKey.set(logicalKey(art.name), logPath);
-        } else {
-          bmkDirs.push(dir);
-        }
-      }
-
-      let runLinked = 0;
-      for (const bmkDir of bmkDirs) {
-        const key = logicalKey(path.basename(bmkDir));
-        const logPath = serverLogByKey.get(key);
-        if (!logPath) continue;
-        for (const file of findJsonFiles(bmkDir)) {
-          let raw: unknown;
-          try {
-            raw = JSON.parse(fs.readFileSync(file, 'utf8'));
-          } catch {
-            continue;
-          }
-          const rows = Array.isArray(raw) ? raw : [raw];
-          for (const row of rows) {
-            if (!row || typeof row !== 'object') continue;
-            const mapped = mapBenchmarkRow(row as Record<string, unknown>, tracker);
-            if (!mapped || mapped.benchmarkType !== 'agentic_traces') continue;
-            const configId = await cache.getOrCreateConfig(mapped.config);
-            const ids = await sql<{ id: number }[]>`
-              select id from benchmark_results
-              where workflow_run_id = ${wrId}
-                and config_id = ${configId}
-                and conc = ${mapped.conc}
-                and benchmark_type = 'agentic_traces'
-                and server_log_id is null
-            `;
-            if (ids.length === 0) continue;
-            const serverLog = readServerLogCapped(logPath);
-            await insertServerLog(
-              sql,
-              ids.map((r) => r.id),
-              serverLog,
-            );
-            runLinked += ids.length;
-          }
-        }
-      }
-      linkedRows += runLinked;
-      runsOk++;
-      const elapsed = Math.round((Date.now() - t0) / 1000);
-      console.log(
-        `  ✓ run ${githubRunId}: ${serverLogByKey.size} log(s), linked ${runLinked} row(s) ` +
-          `(${runsOk}/${runs.length}, ${elapsed}s total)`,
-      );
-    } catch (error) {
-      runsFailed++;
-      console.error(
-        `  ✗ run ${githubRunId}: ${error instanceof Error ? (error.stack ?? error.message) : String(error)}`,
-      );
-    } finally {
-      fs.rmSync(tmp, { recursive: true, force: true });
-    }
-  }
-
-  const totalSec = Math.round((Date.now() - t0) / 1000);
-  console.log(
-    `\n=== complete: ${linkedRows} row(s) linked across ${runsOk} run(s) ` +
-      `(${runsFailed} failed) in ${totalSec}s ===`,
-  );
-  if (runsFailed > 0) process.exitCode = 1;
-}
-
-runBackfillMain('backfill-agentic-server-logs', sql, main);
diff --git a/packages/db/src/backfill-kv-pool.ts b/packages/db/src/backfill-kv-pool.ts
deleted file mode 100644
index efa04c81..00000000
--- a/packages/db/src/backfill-kv-pool.ts
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * Backfill `benchmark_results.metrics->kv_cache_pool_tokens` from the captured
- * server logs. The value is parsed from vLLM's authoritative
- * "GPU KV cache size: N tokens" startup line(s), summed across data-parallel
- * engine cores (see {@link kvCachePoolTokensFromServerLog}).
- *
- * The ingest path now derives this inline in `insertServerLog`, but existing
- * rows need this one-time pass. Idempotent: re-running only touches rows that
- * still lack the value (unless --force).
- *
- * Usage:
- *   pnpm --filter @semianalysisai/inferencex-db db:backfill-kv-pool
- *     [--limit N]   only process the first N candidate server logs
- *     [--force]     recompute even when the value is already set
- *     [--yes]       skip the confirmation prompt
- */
-
-import { hasNoSslFlag } from './cli-utils.js';
-import { createAdminSql } from './etl/db-utils.js';
-import { kvCachePoolTokensFromServerLog } from './etl/server-log-metrics.js';
-import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner.js';
-
-const flags = parseLimitForceFlags();
-
-const sql = createAdminSql({
-  noSsl: hasNoSslFlag(),
-  max: 1,
-  onnotice: () => {},
-});
-
-async function main(): Promise<void> {
-  console.log('=== backfill-kv-pool ===');
-  console.log(`  force = ${flags.force}`);
-  console.log(`  limit = ${flags.limit ?? 'none'}`);
-
-  // One server log can be linked to several benchmark_results (multiple
-  // concurrency points share a server). Group by log id so we parse each log
-  // once and fan the value out to all its rows.
-  const candidates = flags.force
-    ? await sql<{ server_log_id: number }[]>`
-        select distinct server_log_id
-        from benchmark_results
-        where server_log_id is not null
-        order by server_log_id
-        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
-      `
-    : await sql<{ server_log_id: number }[]>`
-        select distinct server_log_id
-        from benchmark_results
-        where server_log_id is not null
-          and metrics->>'kv_cache_pool_tokens' is null
-        order by server_log_id
-        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
-      `;
-
-  if (candidates.length === 0) {
-    console.log('\n  Nothing to do — all rows up to date.');
-    return;
-  }
-
-  if (!(await confirmProceed(`${candidates.length} candidate server log(s).`))) return;
-
-  let updated = 0;
-  let logsWithValue = 0;
-  let logsNoValue = 0;
-  let failed = 0;
-  const t0 = Date.now();
-  for (const { server_log_id: logId } of candidates) {
-    try {
-      const [row] = await sql<{ server_log: string | null }[]>`
-        select server_log from server_logs where id = ${logId}
-      `;
-      const tokens = kvCachePoolTokensFromServerLog(row?.server_log ?? null);
-      if (tokens === null) {
-        logsNoValue++;
-        continue; // non-vLLM or no startup line — leave unset
-      }
-      logsWithValue++;
-      const targets = flags.force
-        ? sql`server_log_id = ${logId}`
-        : sql`server_log_id = ${logId} and metrics->>'kv_cache_pool_tokens' is null`;
-      const result = await sql`
-        update benchmark_results
-        set metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${tokens}::bigint))
-        where ${targets}
-      `;
-      updated += result.count;
-      console.log(`  ✓ log=${logId}: ${tokens.toLocaleString()} tok → ${result.count} row(s)`);
-    } catch (error) {
-      failed++;
-      console.error(`  ✗ log=${logId}: ${error instanceof Error ? error.message : String(error)}`);
-    }
-  }
-
-  const totalSec = Math.round((Date.now() - t0) / 1000);
-  console.log(
-    `\n=== backfill complete: ${updated} row(s) updated from ${logsWithValue} log(s) ` +
-      `(${logsNoValue} log(s) had no KV-pool line, ${failed} failed) in ${totalSec}s ===`,
-  );
-  if (failed > 0) process.exitCode = 1;
-}
-
-runBackfillMain('backfill-kv-pool', sql, main);
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 90c23ef0..caae08c2 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -253,7 +253,7 @@ export function mapBenchmarkRow(
   // percentile, so we derive it from `*_itl` here rather than trust the artifact,
   // keeping every agentic row on one definition. `std` is excluded — the
   // reciprocal of a standard deviation is meaningless. Mirrored in the frontend
-  // overlay path (agenticAliases) and the one-time backfill-agentic-intvty script.
+  // overlay path (agenticAliases).
   if (isAgentic) {
     for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
       const itl = metrics[`${k}_itl`];
diff --git a/packages/db/src/lib/github-artifacts.ts b/packages/db/src/lib/github-artifacts.ts
index 291740cf..c96ae830 100644
--- a/packages/db/src/lib/github-artifacts.ts
+++ b/packages/db/src/lib/github-artifacts.ts
@@ -1,6 +1,6 @@
 /**
  * GitHub Actions artifact helpers shared by `ingest-ci-run.ts` (download
- * mode) and `backfill-agentic-server-logs.ts`. All calls shell out to the
+ * mode). All calls shell out to the
  * `gh` CLI, which picks up GITHUB_TOKEN from the environment.
  */
 

From b84daff751c1030fa9142ee39bd62f1c1a828c70 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 14:53:58 -0500
Subject: [PATCH 12/40] fix(ingest): preserve derived kv_cache_pool_tokens
 across metrics upserts

kv_cache_pool_tokens is derived from the server log at insertServerLog
time and exists in no artifact JSON, so the aggregated results_bmk
artifact's ON CONFLICT metrics replacement silently wiped it from every
row whose per-config artifact was processed first. Carry the existing
value through the upsert. Wiped agentic rows re-derived from stored
logs (80 repaired).
---
 packages/db/src/etl/benchmark-ingest.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index a405789d..2a2382c8 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -74,7 +74,13 @@ export async function bulkIngestBenchmarkRows(
       unnest(${sql.array(workersJsons)}::jsonb[])
     on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
     do update set
-      metrics = excluded.metrics,
+      -- Replace metrics with the fresh artifact values, but carry over
+      -- kv_cache_pool_tokens: it is derived from the server log at
+      -- insertServerLog time (not present in any artifact JSON), so a later
+      -- upsert from the aggregated results_bmk artifact would silently wipe it.
+      metrics = excluded.metrics || jsonb_strip_nulls(
+        jsonb_build_object('kv_cache_pool_tokens', benchmark_results.metrics->'kv_cache_pool_tokens')
+      ),
       image = excluded.image,
       workers = excluded.workers
     returning (xmax = 0) as inserted, id

From 94d19774006f0d28268fe628db52983af3b04dbe Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 15:10:31 -0500
Subject: [PATCH 13/40] chore(db): renumber agentic migrations after master's
 007
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Master shipped its own 007 (latest_benchmarks single-run-per-line,
#491) while this branch carried 007_agentic — two migrations with the
same number. Renumber the branch set to 008_agentic /
009_latest_benchmarks_single_run_per_line / 010_dataset_request_stats
so a fresh deploy applies them strictly after master's lineage; 009
supersedes master's 007 with the offload_mode-aware view definition.
---
 packages/db/migrations/{007_agentic.sql => 008_agentic.sql}    | 0
 ..._line.sql => 009_latest_benchmarks_single_run_per_line.sql} | 0
 ...dataset_request_stats.sql => 010_dataset_request_stats.sql} | 0
 packages/db/src/backfill-aggregate-stats.ts                    | 3 +--
 packages/db/src/backfill-chart-series.ts                       | 2 +-
 packages/db/src/backfill-request-timeline.ts                   | 2 +-
 6 files changed, 3 insertions(+), 4 deletions(-)
 rename packages/db/migrations/{007_agentic.sql => 008_agentic.sql} (100%)
 rename packages/db/migrations/{008_latest_benchmarks_single_run_per_line.sql => 009_latest_benchmarks_single_run_per_line.sql} (100%)
 rename packages/db/migrations/{009_dataset_request_stats.sql => 010_dataset_request_stats.sql} (100%)

diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/008_agentic.sql
similarity index 100%
rename from packages/db/migrations/007_agentic.sql
rename to packages/db/migrations/008_agentic.sql
diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql
similarity index 100%
rename from packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
rename to packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql
diff --git a/packages/db/migrations/009_dataset_request_stats.sql b/packages/db/migrations/010_dataset_request_stats.sql
similarity index 100%
rename from packages/db/migrations/009_dataset_request_stats.sql
rename to packages/db/migrations/010_dataset_request_stats.sql
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
index 2e3a4038..5896529b 100644
--- a/packages/db/src/backfill-aggregate-stats.ts
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -3,8 +3,7 @@
  * or were computed by an older `STATS_VERSION`.
  *
  * The ingest path now computes stats inline, but existing rows (and rows
- * whose computation logic has since changed) still need this pass. Run after
- * applying migration 008 and any time `STATS_VERSION` bumps.
+ * whose computation logic has since changed) still need this pass. Run after the agentic schema migration and any time `STATS_VERSION` bumps.
  *
  * Strategy:
  *   - Stream rows one at a time (server_metrics_json_gz can be hundreds of
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
index 94e009cf..94e1700d 100644
--- a/packages/db/src/backfill-chart-series.ts
+++ b/packages/db/src/backfill-chart-series.ts
@@ -4,7 +4,7 @@
  *
  * The ingest path now computes the time-series inline, but existing rows
  * (and rows whose computation logic has since changed) still need this
- * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION`
+ * pass. Run after the agentic schema migration and any time `CHART_SERIES_VERSION`
  * bumps.
  *
  * Strategy:
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
index 09126654..67291b6c 100644
--- a/packages/db/src/backfill-request-timeline.ts
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -4,7 +4,7 @@
  *
  * The ingest path now computes the timeline inline, but existing rows
  * (and rows whose computation logic has since changed) still need this
- * pass. Run after applying migration 010 and any time the version bumps.
+ * pass. Run after the agentic schema migration and any time the version bumps.
  *
  * Usage:
  *   pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline

From d5c56bf35c016e9f2a4e0cbfd26111d4a82782da Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 15:10:57 -0500
Subject: [PATCH 14/40] feat(inference): per-series points table from the chart
 legend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each inference legend row gets a table icon (visible on hover/focus,
faint otherwise) that opens a dialog listing every currently-visible
point for that hardware/framework series: concurrency, parallelism,
offload, tput/GPU, p50/p90 interactivity and TTFT, sorted by
concurrency with sortable columns. Rows link the same way scatter
points do — agentic points to their per-point detail page, fixed-seq
points to the GitHub Actions run — as real anchors so open-in-new-tab
works. Unofficial-run overlay series get the same table (metrics only;
overlay points have no stored benchmark rows) respecting
activeOverlayHwTypes and overlayRunColor.
---
 .../app/cypress/component/chart-legend.cy.tsx | 145 +++++
 .../inference/ui/LegendPointsDialog.tsx       | 212 +++++++
 .../components/inference/ui/ScatterGraph.tsx  | 598 ++++++++++--------
 .../utils/legend-points-table.test.ts         | 223 +++++++
 .../inference/utils/legend-points-table.ts    | 123 ++++
 .../inference/utils/tooltipUtils.ts           |   5 +-
 .../src/components/ui/chart-legend-item.tsx   |  24 +-
 .../app/src/components/ui/chart-legend.tsx    |   6 +-
 8 files changed, 1084 insertions(+), 252 deletions(-)
 create mode 100644 packages/app/src/components/inference/ui/LegendPointsDialog.tsx
 create mode 100644 packages/app/src/components/inference/utils/legend-points-table.test.ts
 create mode 100644 packages/app/src/components/inference/utils/legend-points-table.ts

diff --git a/packages/app/cypress/component/chart-legend.cy.tsx b/packages/app/cypress/component/chart-legend.cy.tsx
index 4a362c2b..535a0053 100644
--- a/packages/app/cypress/component/chart-legend.cy.tsx
+++ b/packages/app/cypress/component/chart-legend.cy.tsx
@@ -1,5 +1,8 @@
 import { useState } from 'react';
 
+import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog';
+import type { InferenceData } from '@/components/inference/types';
+import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table';
 import ChartLegend, { type CommonLegendItemProps } from '@/components/ui/chart-legend';
 
 const MOCK_ITEMS: CommonLegendItemProps[] = [
@@ -119,4 +122,146 @@ describe('ChartLegend (sidebar variant)', () => {
       .click();
     cy.get('.sidebar-legend').should('not.have.class', 'bg-accent');
   });
+
+  it('renders no points-table icon when items have no onShowPoints handler', () => {
+    cy.get('[data-testid^="legend-points-"]').should('not.exist');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Per-series points table (inference legend drill-down)
+// ---------------------------------------------------------------------------
+
+function mockPoint(overrides: Partial<InferenceData> = {}): InferenceData {
+  return {
+    date: '2025-06-15',
+    x: 100,
+    y: 500,
+    tp: 8,
+    conc: 16,
+    hwKey: 'b300-sxm',
+    precision: 'fp4',
+    tput_per_gpu: 1500.5,
+    median_intvty: 45.2,
+    p90_intvty: 38.1,
+    median_ttft: 0.42,
+    p90_ttft: 0.87,
+    tpPerGpu: { y: 1500.5, roof: false },
+    tpPerMw: { y: 50, roof: false },
+    costh: { y: 1, roof: false },
+    costn: { y: 1, roof: false },
+    costr: { y: 1, roof: false },
+    costhi: { y: 1, roof: false },
+    costni: { y: 1, roof: false },
+    costri: { y: 1, roof: false },
+    ...overrides,
+  } as InferenceData;
+}
+
+const OFFICIAL_POINTS: InferenceData[] = [
+  mockPoint({ conc: 32, benchmark_type: 'agentic_traces', id: 206863, offload_mode: 'on' }),
+  mockPoint({ conc: 4, benchmark_type: 'agentic_traces', id: 206860, offload_mode: 'off' }),
+];
+
+const OVERLAY_POINTS: InferenceData[] = [
+  mockPoint({ conc: 8, run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1' }),
+];
+
+/** Mirrors ScatterGraph's wiring: legend rows with onShowPoints → dialog. */
+function LegendWithPointsTable() {
+  const [openSeries, setOpenSeries] = useState<'official' | 'overlay' | null>(null);
+
+  const items: CommonLegendItemProps[] = [
+    {
+      name: 'b300-sxm',
+      hw: 'b300-sxm',
+      label: 'B300 (vLLM)',
+      color: '#2b83ba',
+      isActive: true,
+      onClick: () => {},
+      onShowPoints: () => setOpenSeries('official'),
+    },
+    {
+      name: '✕ unofficial-run-99',
+      hw: 'overlay-run-99',
+      label: '✕ my-branch',
+      color: '#dc2626',
+      isActive: true,
+      onClick: () => {},
+      onShowPoints: () => setOpenSeries('overlay'),
+    },
+  ];
+
+  const isOverlay = openSeries === 'overlay';
+  return (
+    <>
+      <ChartLegend
+        legendItems={items}
+        isLegendExpanded={true}
+        onExpandedChange={() => {}}
+        variant="sidebar"
+      />
+      {openSeries && (
+        <LegendPointsDialog
+          open
+          onOpenChange={(open) => {
+            if (!open) setOpenSeries(null);
+          }}
+          title={isOverlay ? '✕ my-branch' : 'B300 (vLLM)'}
+          subtitle="DeepSeek V4 Pro · Agentic Traces"
+          accentColor={isOverlay ? '#dc2626' : '#2b83ba'}
+          rows={buildLegendPointsRows(isOverlay ? OVERLAY_POINTS : OFFICIAL_POINTS, isOverlay)}
+          isOverlay={isOverlay}
+        />
+      )}
+    </>
+  );
+}
+
+describe('ChartLegend points-table icon + dialog', () => {
+  beforeEach(() => {
+    cy.mount(<LegendWithPointsTable />);
+  });
+
+  it('renders the icon only for rows with an onShowPoints handler', () => {
+    cy.get('[data-testid="legend-points-b300-sxm"]').should('exist');
+    cy.get('[data-testid="legend-points-overlay-run-99"]').should('exist');
+  });
+
+  it('opens the dialog with the series points sorted by concurrency, with row links', () => {
+    cy.get('[data-testid="legend-points-b300-sxm"]').click();
+    cy.get('[data-testid="legend-points-dialog"]').should('be.visible');
+    cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'B300 (vLLM)');
+    cy.get('[data-testid="legend-points-dialog"]').should(
+      'contain.text',
+      'DeepSeek V4 Pro · Agentic Traces',
+    );
+    // Two rows, conc ascending, linked to the agentic detail pages
+    cy.get('[data-testid="legend-points-row"]').should('have.length', 2);
+    cy.get('a[data-testid="legend-points-row"]')
+      .first()
+      .should('have.attr', 'href', '/inference/agentic/206860');
+    cy.get('a[data-testid="legend-points-row"]').first().should('contain.text', '4');
+    // Offload column present for agentic rows
+    cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'Offload');
+  });
+
+  it('overlay series opens a link-free table with the metrics-only caption', () => {
+    cy.get('[data-testid="legend-points-overlay-run-99"]').click();
+    cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '✕ my-branch');
+    cy.get('a[data-testid="legend-points-row"]').should('not.exist');
+    cy.get('div[data-testid="legend-points-row"]').should('have.length', 1);
+    cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'metrics only');
+    // Metrics still render
+    cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '1500.5');
+  });
+
+  it('dialog closes and can be reopened', () => {
+    cy.get('[data-testid="legend-points-b300-sxm"]').click();
+    cy.get('[data-testid="legend-points-dialog"]').should('be.visible');
+    cy.get('body').type('{esc}');
+    cy.get('[data-testid="legend-points-dialog"]').should('not.exist');
+    cy.get('[data-testid="legend-points-overlay-run-99"]').click();
+    cy.get('[data-testid="legend-points-dialog"]').should('be.visible');
+  });
 });
diff --git a/packages/app/src/components/inference/ui/LegendPointsDialog.tsx b/packages/app/src/components/inference/ui/LegendPointsDialog.tsx
new file mode 100644
index 00000000..0546872c
--- /dev/null
+++ b/packages/app/src/components/inference/ui/LegendPointsDialog.tsx
@@ -0,0 +1,212 @@
+'use client';
+
+import { ArrowDown, ArrowUp, ExternalLink } from 'lucide-react';
+import { useMemo, useState } from 'react';
+
+import {
+  Dialog,
+  DialogContent,
+  DialogDescription,
+  DialogHeader,
+  DialogTitle,
+} from '@/components/ui/dialog';
+import { cn } from '@/lib/utils';
+
+import {
+  type LegendPointsSortKey,
+  type LegendPointsTableRow,
+  formatRowValue,
+  sortLegendPointsRows,
+} from '@/components/inference/utils/legend-points-table';
+
+export interface LegendPointsDialogProps {
+  open: boolean;
+  onOpenChange: (open: boolean) => void;
+  /** Series label, e.g. "B300 (vLLM)". */
+  title: string;
+  /** Context line, e.g. "DeepSeek V4 Pro · Agentic Traces". */
+  subtitle: string;
+  /** Legend swatch color for this series (overlayRunColor for overlay runs). */
+  accentColor: string;
+  /** Rows from buildLegendPointsRows — already default-sorted by concurrency. */
+  rows: LegendPointsTableRow[];
+  /** Unofficial-run overlay series: metrics only, no detail links. */
+  isOverlay: boolean;
+  onRowClick?: (row: LegendPointsTableRow) => void;
+}
+
+interface Column {
+  key: LegendPointsSortKey;
+  label: string;
+  numeric: boolean;
+}
+
+const cellValue = (row: LegendPointsTableRow, col: Column): string => {
+  if (col.key === 'conc') return String(row.conc);
+  if (col.key === 'parallelism') return row.parallelism;
+  if (col.key === 'offload') return row.offload ?? '—';
+  return formatRowValue(row[col.key]);
+};
+
+/**
+ * Per-series drill-down opened from the chart legend: every currently-visible
+ * point of one hardware/framework series, with the same detail links the
+ * scatter points offer on click.
+ */
+export default function LegendPointsDialog({
+  open,
+  onOpenChange,
+  title,
+  subtitle,
+  accentColor,
+  rows,
+  isOverlay,
+  onRowClick,
+}: LegendPointsDialogProps) {
+  const [sort, setSort] = useState<{ key: LegendPointsSortKey; dir: 'asc' | 'desc' } | null>(null);
+
+  const hasOffload = rows.some((r) => r.offload !== null);
+  const columns = useMemo(
+    (): Column[] => [
+      { key: 'conc', label: 'Conc', numeric: true },
+      { key: 'parallelism', label: 'Parallelism', numeric: false },
+      ...(hasOffload ? [{ key: 'offload', label: 'Offload', numeric: false } as Column] : []),
+      { key: 'tputPerGpu', label: 'Tput/GPU', numeric: true },
+      { key: 'p50Intvty', label: 'p50 Int', numeric: true },
+      { key: 'p90Intvty', label: 'p90 Int', numeric: true },
+      { key: 'p50Ttft', label: 'p50 TTFT', numeric: true },
+      { key: 'p90Ttft', label: 'p90 TTFT', numeric: true },
+    ],
+    [hasOffload],
+  );
+
+  const sortedRows = useMemo(
+    () => (sort ? sortLegendPointsRows(rows, sort.key, sort.dir) : rows),
+    [rows, sort],
+  );
+
+  const toggleSort = (key: LegendPointsSortKey) => {
+    setSort((prev) =>
+      prev?.key === key ? (prev.dir === 'asc' ? { key, dir: 'desc' } : null) : { key, dir: 'asc' },
+    );
+  };
+
+  // Trailing column reserves space for the detail-link icon.
+  const gridTemplateColumns = `${columns.map(() => 'auto').join(' ')} min-content`;
+
+  const renderCells = (row: LegendPointsTableRow) => (
+    <>
+      {columns.map((col) => (
+        <span
+          role="cell"
+          key={col.key}
+          className={cn('px-2 py-1', col.numeric ? 'text-right tabular-nums' : 'text-left')}
+        >
+          {cellValue(row, col)}
+        </span>
+      ))}
+      <span role="cell" className="px-2 py-1 text-muted-foreground">
+        {row.href &&
+          (row.isExternal ? (
+            <ExternalLink size={12} aria-hidden="true" />
+          ) : (
+            <span aria-hidden="true">&rarr;</span>
+          ))}
+      </span>
+    </>
+  );
+
+  return (
+    <Dialog open={open} onOpenChange={onOpenChange}>
+      <DialogContent
+        data-testid="legend-points-dialog"
+        className="sm:max-w-3xl max-h-[80vh] flex flex-col gap-3"
+      >
+        <DialogHeader>
+          <DialogTitle className="flex items-center gap-2 text-base">
+            <span
+              className="size-3 shrink-0 rounded-full"
+              style={{ backgroundColor: accentColor }}
+              aria-hidden="true"
+            />
+            {title}
+          </DialogTitle>
+          <DialogDescription>{subtitle}</DialogDescription>
+        </DialogHeader>
+
+        {sortedRows.length === 0 ? (
+          <p className="text-sm text-muted-foreground py-4">
+            No visible points for this series under the current filters.
+          </p>
+        ) : (
+          // One grid owns the column tracks; every row is a subgrid so cells
+          // align across ALL rows (per-row grids would auto-size independently
+          // and produce ragged columns).
+          <div
+            role="table"
+            className="grid content-start overflow-y-auto overflow-x-auto min-h-0 text-xs"
+            style={{ gridTemplateColumns }}
+          >
+            <div
+              role="row"
+              className="col-span-full grid grid-cols-subgrid items-center border-b border-border sticky top-0 bg-background"
+            >
+              {columns.map((col) => {
+                const active = sort?.key === col.key;
+                return (
+                  <button
+                    role="columnheader"
+                    aria-sort={active ? (sort!.dir === 'asc' ? 'ascending' : 'descending') : 'none'}
+                    type="button"
+                    key={col.key}
+                    onClick={() => toggleSort(col.key)}
+                    className={cn(
+                      'flex items-center gap-0.5 px-2 py-1.5 font-medium text-muted-foreground hover:text-foreground whitespace-nowrap',
+                      col.numeric && 'justify-end',
+                    )}
+                  >
+                    {col.label}
+                    {active &&
+                      (sort!.dir === 'asc' ? <ArrowUp size={11} /> : <ArrowDown size={11} />)}
+                  </button>
+                );
+              })}
+              <span role="columnheader" className="px-2" />
+            </div>
+            {sortedRows.map((row) =>
+              row.href ? (
+                <a
+                  role="row"
+                  data-testid="legend-points-row"
+                  key={row.key}
+                  href={row.href}
+                  {...(row.isExternal ? { target: '_blank', rel: 'noopener noreferrer' } : {})}
+                  onClick={() => onRowClick?.(row)}
+                  className="col-span-full grid grid-cols-subgrid items-center rounded-sm hover:bg-accent whitespace-nowrap"
+                >
+                  {renderCells(row)}
+                </a>
+              ) : (
+                <div
+                  role="row"
+                  data-testid="legend-points-row"
+                  key={row.key}
+                  className="col-span-full grid grid-cols-subgrid items-center whitespace-nowrap"
+                >
+                  {renderCells(row)}
+                </div>
+              ),
+            )}
+          </div>
+        )}
+
+        <p className="text-[10px] text-muted-foreground/70 leading-tight">
+          {isOverlay
+            ? 'Unofficial overlay points have no stored benchmark records — metrics only, no detail links.'
+            : 'Click a row for the point detail — agentic points open the trace detail page, fixed-seq points open the GitHub Actions run.'}{' '}
+          Interactivity in tok/s/user · TTFT in s · throughput in tok/s/gpu.
+        </p>
+      </DialogContent>
+    </Dialog>
+  );
+}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index fe4ca820..e12522ce 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -2,7 +2,7 @@
 
 import { track } from '@/lib/analytics';
 import * as d3 from 'd3';
-import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef } from 'react';
+import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react';
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
@@ -19,6 +19,7 @@ import { getHardwareConfig, getModelSortIndex } from '@/lib/constants';
 import {
   getChartWatermark,
   getPrecisionLabel,
+  getSequenceLabel,
   type Precision,
   Sequence,
 } from '@/lib/data-mappings';
@@ -62,6 +63,8 @@ import {
   generateTooltipContent,
   getPointLabel,
 } from '@/components/inference/utils/tooltipUtils';
+import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog';
+import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table';
 import {
   type ParetoPointLabel,
   getParetoLabel,
@@ -228,6 +231,11 @@ const pointLabelText = (d: InferenceData, advanced: boolean): string =>
 // Referentially stable "no overlay data" result (see processedOverlayData).
 const EMPTY_OVERLAY_DATA: InferenceData[] = [];
 
+/** Which legend series' points table is open (per-series drill-down dialog). */
+type LegendPointsTarget =
+  | { kind: 'official'; hwKey: string }
+  | { kind: 'overlay'; runIndex: number; runId: number; branch: string };
+
 // Scale configs are recomputed from the visible points on every render, but a
 // legend / precision toggle usually leaves the actual domain untouched (x-min
 // is pinned at 0; extremes are owned by a handful of points). Comparing by
@@ -619,6 +627,57 @@ const ScatterGraph = React.memo(
     }, [pointsData]);
     const { data: traceAvailability } = useTraceAvailability(agenticIds);
 
+    // --- Legend points table (per-series drill-down opened from the legend) ---
+    const [pointsTableTarget, setPointsTableTarget] = useState<LegendPointsTarget | null>(null);
+
+    const pointsTable = useMemo(() => {
+      if (!pointsTableTarget) return null;
+      if (pointsTableTarget.kind === 'official') {
+        const { hwKey } = pointsTableTarget;
+        const hwConfig = hardwareConfig[hwKey];
+        // Same visibility filters the chart applies (precision, Optimal Only),
+        // scoped to the clicked series.
+        const pts = pointsData.filter(
+          (p) =>
+            p.hwKey === hwKey &&
+            selectedPrecisions.includes(p.precision) &&
+            (!hideNonOptimal || optimalPointKeys.has(optimalPointKey(p))),
+        );
+        return {
+          hw: hwKey,
+          title: hwConfig ? getDisplayLabel(hwConfig) : hwKey,
+          color: resolveColor(hwKey),
+          isOverlay: false,
+          rows: buildLegendPointsRows(pts, false),
+        };
+      }
+      const { runIndex, runId, branch } = pointsTableTarget;
+      // Overlay series: this run's points, respecting the overlay hw toggles.
+      const pts = processedOverlayData.filter(
+        (p) =>
+          overlayRunIndex(p.run_url ?? null, runIndexByUrl) === runIndex &&
+          activeOverlayHwTypes.has(p.hwKey as string),
+      );
+      return {
+        hw: `overlay-run-${runId}`,
+        title: `✕ ${branch}`,
+        color: overlayRunColor(runIndex),
+        isOverlay: true,
+        rows: buildLegendPointsRows(pts, true),
+      };
+    }, [
+      pointsTableTarget,
+      hardwareConfig,
+      pointsData,
+      selectedPrecisions,
+      hideNonOptimal,
+      optimalPointKeys,
+      resolveColor,
+      processedOverlayData,
+      runIndexByUrl,
+      activeOverlayHwTypes,
+    ]);
+
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
       const globalLabelColorMap = new Map<string, string>();
@@ -2454,267 +2513,310 @@ const ScatterGraph = React.memo(
     }
 
     return (
-      <D3Chart<InferenceData>
-        ref={chartRef}
-        chartId={chartId}
-        // Stable across toggles: the render effect keys on this for "data
-        // changed" rebuilds; scale domains come from x/yScaleConfig (computed
-        // from the visible points), and visibility is applied via opacity.
-        data={pointsData}
-        margin={CHART_MARGIN}
-        watermark={getChartWatermark(isUnofficialRun)}
-        testId="scatter-graph"
-        grabCursor={true}
-        caption={caption}
-        xScale={xScaleConfig}
-        yScale={yScaleConfig}
-        xAxis={xAxisConfig}
-        yAxis={yAxisConfig}
-        layers={layers}
-        zoom={zoomConfig}
-        tooltip={tooltipConfig}
-        transitionDuration={transitionDuration}
-        onRender={onRender}
-        noDataOverlay={
-          filteredData.length === 0 && processedOverlayData.length === 0 ? (
-            <div
-              className="absolute inset-0 flex items-center justify-center pointer-events-none"
-              style={{ zIndex: 100 }}
-            >
-              <div className="text-muted-foreground text-center bg-background/80 px-4 py-2 rounded-md">
-                <p className="text-sm font-medium">No data available</p>
-                <p className="text-xs mt-1">
-                  Please change the model, sequence, precision, date range or GPU selection.
-                </p>
+      <>
+        <D3Chart<InferenceData>
+          ref={chartRef}
+          chartId={chartId}
+          // Stable across toggles: the render effect keys on this for "data
+          // changed" rebuilds; scale domains come from x/yScaleConfig (computed
+          // from the visible points), and visibility is applied via opacity.
+          data={pointsData}
+          margin={CHART_MARGIN}
+          watermark={getChartWatermark(isUnofficialRun)}
+          testId="scatter-graph"
+          grabCursor={true}
+          caption={caption}
+          xScale={xScaleConfig}
+          yScale={yScaleConfig}
+          xAxis={xAxisConfig}
+          yAxis={yAxisConfig}
+          layers={layers}
+          zoom={zoomConfig}
+          tooltip={tooltipConfig}
+          transitionDuration={transitionDuration}
+          onRender={onRender}
+          noDataOverlay={
+            filteredData.length === 0 && processedOverlayData.length === 0 ? (
+              <div
+                className="absolute inset-0 flex items-center justify-center pointer-events-none"
+                style={{ zIndex: 100 }}
+              >
+                <div className="text-muted-foreground text-center bg-background/80 px-4 py-2 rounded-md">
+                  <p className="text-sm font-medium">No data available</p>
+                  <p className="text-xs mt-1">
+                    Please change the model, sequence, precision, date range or GPU selection.
+                  </p>
+                </div>
               </div>
-            </div>
-          ) : undefined
-        }
-        legendElement={
-          <ChartLegend
-            variant="sidebar"
-            onItemHover={handleLegendHover}
-            onItemHoverEnd={handleLegendHoverEnd}
-            onItemRemove={showAllHardwareTypes ? undefined : removeHwType}
-            legendItems={[
-              // Overlay legend: one entry per loaded unofficial run that actually
-              // contributes points to this chart. Colored from the shared palette
-              // so the legend swatch matches the stroke color used in the chart.
-              ...(overlayData && unofficialRunInfos.length > 0
-                ? unofficialRunInfos
-                    .map((info, idx) => {
-                      const hasPoints = overlayData.data.some(
-                        (d) =>
-                          overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx &&
-                          selectedPrecisions.includes(d.precision),
-                      );
-                      if (!hasPoints) return null;
-                      const branch = info.branch || `run ${info.id}`;
-                      return {
-                        name: `✕ unofficial-run-${info.id}`,
-                        label: `✕ ${branch}`,
-                        color: overlayRunColor(idx),
-                        title: `UNOFFICIAL: ${branch}`,
-                        isHighlighted: true,
-                        hw: `overlay-run-${info.id}`,
-                        isActive: true,
-                        onClick: () => {},
-                        tooltip: (
-                          <div className="font-normal text-xs">
-                            <div className="text-red-500 font-semibold">UNOFFICIAL RUN</div>
-                            <div>Branch: {branch}</div>
-                            {info.url && (
-                              <a
-                                href={info.url}
-                                target="_blank"
-                                rel="noopener noreferrer"
-                                className="underline"
-                              >
-                                View workflow run
-                              </a>
-                            )}
-                          </div>
-                        ),
-                      };
-                    })
-                    .filter((x): x is NonNullable<typeof x> => x !== null)
-                : []),
-              ...Object.entries(hardwareConfig)
-                .filter(([key]) =>
-                  showAllHardwareTypes ? effectiveActiveHwTypes.has(key) : hwTypesWithData.has(key),
-                )
-                .toSorted(
-                  ([a], [b]) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b),
-                )
-                .map(([key, hwConfig]: [string, any]) => ({
-                  name: hwConfig.name,
-                  label: getDisplayLabel(hwConfig),
-                  color: resolveColor(key),
-                  title: hwConfig.gpu,
-                  isHighlighted: highlightConfigSuffixes.has(key.replaceAll('_', '-')),
-                  hw: key,
-                  isActive: showAllHardwareTypes ? true : effectiveOfficialHwTypes.has(key),
-                  onClick: showAllHardwareTypes
-                    ? () => {}
-                    : () => {
-                        handleToggleHwType(key);
-                        track('latency_hw_type_toggled', { hw: key });
-                      },
-                  tooltip: changelog
-                    ? formatChangelogDescription(changelog.entries[0].description)
-                    : null,
-                })),
-            ]}
-            disableActiveSort={false}
-            isLegendExpanded={isLegendExpanded}
-            onExpandedChange={(expanded) => {
-              setIsLegendExpanded(expanded);
-              track('latency_legend_expanded', { expanded });
-            }}
-            switches={[
-              ...(selectedYAxisMetric === 'y_inputTputPerGpu'
-                ? []
-                : [
-                    {
-                      id: 'scatter-log-scale',
-                      label: 'Log Scale',
-                      checked: logScale,
-                      onCheckedChange: (checked: boolean) => {
-                        setLogScale(checked);
-                        track('latency_log_scale_toggled', { enabled: checked });
-                      },
+            ) : undefined
+          }
+          legendElement={
+            <ChartLegend
+              variant="sidebar"
+              onItemHover={handleLegendHover}
+              onItemHoverEnd={handleLegendHoverEnd}
+              onItemRemove={showAllHardwareTypes ? undefined : removeHwType}
+              legendItems={[
+                // Overlay legend: one entry per loaded unofficial run that actually
+                // contributes points to this chart. Colored from the shared palette
+                // so the legend swatch matches the stroke color used in the chart.
+                ...(overlayData && unofficialRunInfos.length > 0
+                  ? unofficialRunInfos
+                      .map((info, idx) => {
+                        const hasPoints = overlayData.data.some(
+                          (d) =>
+                            overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx &&
+                            selectedPrecisions.includes(d.precision),
+                        );
+                        if (!hasPoints) return null;
+                        const branch = info.branch || `run ${info.id}`;
+                        return {
+                          name: `✕ unofficial-run-${info.id}`,
+                          label: `✕ ${branch}`,
+                          color: overlayRunColor(idx),
+                          title: `UNOFFICIAL: ${branch}`,
+                          isHighlighted: true,
+                          hw: `overlay-run-${info.id}`,
+                          isActive: true,
+                          onClick: () => {},
+                          onShowPoints: () => {
+                            setPointsTableTarget({
+                              kind: 'overlay',
+                              runIndex: idx,
+                              runId: info.id,
+                              branch,
+                            });
+                            track('inference_legend_points_table_opened', {
+                              hw: `overlay-run-${info.id}`,
+                              framework: 'overlay',
+                            });
+                          },
+                          tooltip: (
+                            <div className="font-normal text-xs">
+                              <div className="text-red-500 font-semibold">UNOFFICIAL RUN</div>
+                              <div>Branch: {branch}</div>
+                              {info.url && (
+                                <a
+                                  href={info.url}
+                                  target="_blank"
+                                  rel="noopener noreferrer"
+                                  className="underline"
+                                >
+                                  View workflow run
+                                </a>
+                              )}
+                            </div>
+                          ),
+                        };
+                      })
+                      .filter((x): x is NonNullable<typeof x> => x !== null)
+                  : []),
+                ...Object.entries(hardwareConfig)
+                  .filter(([key]) =>
+                    showAllHardwareTypes
+                      ? effectiveActiveHwTypes.has(key)
+                      : hwTypesWithData.has(key),
+                  )
+                  .toSorted(
+                    ([a], [b]) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b),
+                  )
+                  .map(([key, hwConfig]: [string, any]) => ({
+                    name: hwConfig.name,
+                    label: getDisplayLabel(hwConfig),
+                    color: resolveColor(key),
+                    title: hwConfig.gpu,
+                    isHighlighted: highlightConfigSuffixes.has(key.replaceAll('_', '-')),
+                    hw: key,
+                    isActive: showAllHardwareTypes ? true : effectiveOfficialHwTypes.has(key),
+                    onClick: showAllHardwareTypes
+                      ? () => {}
+                      : () => {
+                          handleToggleHwType(key);
+                          track('latency_hw_type_toggled', { hw: key });
+                        },
+                    onShowPoints: () => {
+                      setPointsTableTarget({ kind: 'official', hwKey: key });
+                      track('inference_legend_points_table_opened', {
+                        hw: key,
+                        framework: hwConfig.framework ?? '',
+                      });
                     },
-                  ]),
-              {
-                id: 'scatter-hide-non-optimal',
-                label: 'Optimal Only',
-                checked: hideNonOptimal,
-                onCheckedChange: (checked: boolean) => {
-                  setHideNonOptimal(checked);
-                  track('latency_hide_non_optimal_toggled', { enabled: checked });
+                    tooltip: changelog
+                      ? formatChangelogDescription(changelog.entries[0].description)
+                      : null,
+                  })),
+              ]}
+              disableActiveSort={false}
+              isLegendExpanded={isLegendExpanded}
+              onExpandedChange={(expanded) => {
+                setIsLegendExpanded(expanded);
+                track('latency_legend_expanded', { expanded });
+              }}
+              switches={[
+                ...(selectedYAxisMetric === 'y_inputTputPerGpu'
+                  ? []
+                  : [
+                      {
+                        id: 'scatter-log-scale',
+                        label: 'Log Scale',
+                        checked: logScale,
+                        onCheckedChange: (checked: boolean) => {
+                          setLogScale(checked);
+                          track('latency_log_scale_toggled', { enabled: checked });
+                        },
+                      },
+                    ]),
+                {
+                  id: 'scatter-hide-non-optimal',
+                  label: 'Optimal Only',
+                  checked: hideNonOptimal,
+                  onCheckedChange: (checked: boolean) => {
+                    setHideNonOptimal(checked);
+                    track('latency_hide_non_optimal_toggled', { enabled: checked });
+                  },
+                  // On agentic + non-e2e chart, "optimal" means "on the
+                  // e2e-latency Pareto frontier" (not a per-axis Pareto on the
+                  // current x metric). Explain that so users don't wonder why
+                  // a point sitting above the line is still considered
+                  // dominated.
+                  ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
+                    ? {
+                        infoTooltip:
+                          "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
+                      }
+                    : {}),
                 },
-                // On agentic + non-e2e chart, "optimal" means "on the
-                // e2e-latency Pareto frontier" (not a per-axis Pareto on the
-                // current x metric). Explain that so users don't wonder why
-                // a point sitting above the line is still considered
-                // dominated.
-                ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
-                  ? {
-                      infoTooltip:
-                        "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
-                    }
-                  : {}),
-              },
-              {
-                id: 'scatter-point-labels',
-                label: 'Labels',
-                checked: showPointLabels,
-                onCheckedChange: (checked: boolean) => {
-                  setShowPointLabels(checked);
-                  track('latency_point_labels_toggled', { enabled: checked });
+                {
+                  id: 'scatter-point-labels',
+                  label: 'Labels',
+                  checked: showPointLabels,
+                  onCheckedChange: (checked: boolean) => {
+                    setShowPointLabels(checked);
+                    track('latency_point_labels_toggled', { enabled: checked });
+                  },
                 },
-              },
-              {
-                id: 'scatter-high-contrast',
-                label: 'High Contrast',
-                checked: highContrast,
-                onCheckedChange: (checked: boolean) => {
-                  setHighContrast(checked);
-                  track('latency_high_contrast_toggled', { enabled: checked });
+                {
+                  id: 'scatter-high-contrast',
+                  label: 'High Contrast',
+                  checked: highContrast,
+                  onCheckedChange: (checked: boolean) => {
+                    setHighContrast(checked);
+                    track('latency_high_contrast_toggled', { enabled: checked });
+                  },
                 },
-              },
-              {
-                id: 'scatter-parallelism-labels',
-                label: 'Parallelism Labels',
-                checked: useAdvancedLabels,
-                onCheckedChange: (checked: boolean) => {
-                  setUseAdvancedLabels(checked);
-                  track('latency_advanced_labels_toggled', { enabled: checked });
-                  // Parallelism labels are point labels; turning them on is
-                  // pointless if labels are hidden, so auto-enable Labels.
-                  if (checked && !showPointLabels) setShowPointLabels(true);
-                  if (checked && !showGradientLabels) {
-                    window.dispatchEvent(
-                      new CustomEvent(GRADIENT_NUDGE_EVENT, {
-                        detail: {
-                          enableGradient: () => {
-                            setShowGradientLabels(true);
-                            setUseAdvancedLabels(false);
-                            track('latency_gradient_labels_toggled', {
-                              enabled: true,
-                              source: 'nudge',
-                            });
+                {
+                  id: 'scatter-parallelism-labels',
+                  label: 'Parallelism Labels',
+                  checked: useAdvancedLabels,
+                  onCheckedChange: (checked: boolean) => {
+                    setUseAdvancedLabels(checked);
+                    track('latency_advanced_labels_toggled', { enabled: checked });
+                    // Parallelism labels are point labels; turning them on is
+                    // pointless if labels are hidden, so auto-enable Labels.
+                    if (checked && !showPointLabels) setShowPointLabels(true);
+                    if (checked && !showGradientLabels) {
+                      window.dispatchEvent(
+                        new CustomEvent(GRADIENT_NUDGE_EVENT, {
+                          detail: {
+                            enableGradient: () => {
+                              setShowGradientLabels(true);
+                              setUseAdvancedLabels(false);
+                              track('latency_gradient_labels_toggled', {
+                                enabled: true,
+                                source: 'nudge',
+                              });
+                            },
                           },
-                        },
-                      }),
-                    );
-                  }
+                        }),
+                      );
+                    }
+                  },
                 },
-              },
-              {
-                id: 'scatter-gradient-labels',
-                label: 'Gradient Labels',
-                checked: showGradientLabels,
-                onCheckedChange: (checked: boolean) => {
-                  setShowGradientLabels(checked);
-                  track('latency_gradient_labels_toggled', { enabled: checked });
+                {
+                  id: 'scatter-gradient-labels',
+                  label: 'Gradient Labels',
+                  checked: showGradientLabels,
+                  onCheckedChange: (checked: boolean) => {
+                    setShowGradientLabels(checked);
+                    track('latency_gradient_labels_toggled', { enabled: checked });
+                  },
                 },
-              },
-              {
-                id: 'scatter-line-labels',
-                label: 'Line Labels',
-                checked: showLineLabels,
-                onCheckedChange: (checked: boolean) => {
-                  setShowLineLabels(checked);
-                  track('latency_line_labels_toggled', { enabled: checked });
+                {
+                  id: 'scatter-line-labels',
+                  label: 'Line Labels',
+                  checked: showLineLabels,
+                  onCheckedChange: (checked: boolean) => {
+                    setShowLineLabels(checked);
+                    track('latency_line_labels_toggled', { enabled: checked });
+                  },
                 },
-              },
-              {
-                id: 'scatter-speed-overlay',
-                label: 'Bus / Race Car',
-                advanced: true,
-                checked: showSpeedOverlay,
-                onCheckedChange: (checked: boolean) => {
-                  setShowSpeedOverlay(checked);
-                  track('latency_speed_overlay_toggled', { enabled: checked });
+                {
+                  id: 'scatter-speed-overlay',
+                  label: 'Bus / Race Car',
+                  advanced: true,
+                  checked: showSpeedOverlay,
+                  onCheckedChange: (checked: boolean) => {
+                    setShowSpeedOverlay(checked);
+                    track('latency_speed_overlay_toggled', { enabled: checked });
+                  },
                 },
-              },
-              {
-                id: 'scatter-minecraft-overlay',
-                label: 'Donkey / Elytra',
-                advanced: true,
-                checked: showMinecraftOverlay,
-                onCheckedChange: (checked: boolean) => {
-                  setShowMinecraftOverlay(checked);
-                  track('latency_minecraft_overlay_toggled', { enabled: checked });
+                {
+                  id: 'scatter-minecraft-overlay',
+                  label: 'Donkey / Elytra',
+                  advanced: true,
+                  checked: showMinecraftOverlay,
+                  onCheckedChange: (checked: boolean) => {
+                    setShowMinecraftOverlay(checked);
+                    track('latency_minecraft_overlay_toggled', { enabled: checked });
+                  },
                 },
-              },
-            ]}
-            onAdvancedExpandedChange={(expanded) => {
-              track('latency_advanced_controls_toggled', { expanded });
-            }}
-            actions={
-              effectiveOfficialHwTypes.size < hwTypesWithData.size ||
-              activeOverlayHwTypes.size < allOverlayHwTypes.size
-                ? [
-                    {
-                      id: 'scatter-reset-filter',
-                      label: 'Reset filter',
-                      onClick: () => {
-                        selectAllHwTypes();
-                        setLocalOfficialOverride(null);
-                        resetOverlayHwTypes();
-                        track('latency_legend_filter_reset');
+              ]}
+              onAdvancedExpandedChange={(expanded) => {
+                track('latency_advanced_controls_toggled', { expanded });
+              }}
+              actions={
+                effectiveOfficialHwTypes.size < hwTypesWithData.size ||
+                activeOverlayHwTypes.size < allOverlayHwTypes.size
+                  ? [
+                      {
+                        id: 'scatter-reset-filter',
+                        label: 'Reset filter',
+                        onClick: () => {
+                          selectAllHwTypes();
+                          setLocalOfficialOverride(null);
+                          resetOverlayHwTypes();
+                          track('latency_legend_filter_reset');
+                        },
                       },
-                    },
-                  ]
-                : []
+                    ]
+                  : []
+              }
+              precisionIndicators={selectedPrecisions}
+              enableTooltips={true}
+            />
+          }
+        />
+        {pointsTable && (
+          <LegendPointsDialog
+            open
+            onOpenChange={(open) => {
+              if (!open) setPointsTableTarget(null);
+            }}
+            title={pointsTable.title}
+            subtitle={`${modelLabel} · ${getSequenceLabel(selectedSequence)}`}
+            accentColor={pointsTable.color}
+            rows={pointsTable.rows}
+            isOverlay={pointsTable.isOverlay}
+            onRowClick={(row) =>
+              track('inference_legend_points_table_row_clicked', {
+                hw: pointsTable.hw,
+                conc: row.conc,
+                href: row.href ?? '',
+              })
             }
-            precisionIndicators={selectedPrecisions}
-            enableTooltips={true}
           />
-        }
-      />
+        )}
+      </>
     );
   },
 );
diff --git a/packages/app/src/components/inference/utils/legend-points-table.test.ts b/packages/app/src/components/inference/utils/legend-points-table.test.ts
new file mode 100644
index 00000000..b29cecbb
--- /dev/null
+++ b/packages/app/src/components/inference/utils/legend-points-table.test.ts
@@ -0,0 +1,223 @@
+import { describe, expect, it } from 'vitest';
+
+import type { InferenceData } from '@/components/inference/types';
+import {
+  buildLegendPointsRows,
+  formatRowValue,
+  pointDetailHref,
+  sortLegendPointsRows,
+} from '@/components/inference/utils/legend-points-table';
+
+// ---------------------------------------------------------------------------
+// fixture factory (mirrors tooltip-utils.test.ts)
+// ---------------------------------------------------------------------------
+function pt(overrides: Partial<InferenceData> = {}): InferenceData {
+  return {
+    date: '2025-06-15',
+    x: 100,
+    y: 500,
+    tp: 8,
+    conc: 64,
+    hwKey: 'b300_vllm',
+    precision: 'fp4',
+    tput_per_gpu: 1234.5678,
+    median_intvty: 45.2,
+    p90_intvty: 38.1,
+    median_ttft: 0.42,
+    p90_ttft: 0.87,
+    tpPerGpu: { y: 1000, roof: false },
+    tpPerMw: { y: 50, roof: false },
+    costh: { y: 1, roof: false },
+    costn: { y: 1, roof: false },
+    costr: { y: 1, roof: false },
+    costhi: { y: 1, roof: false },
+    costni: { y: 1, roof: false },
+    costri: { y: 1, roof: false },
+    ...overrides,
+  } as InferenceData;
+}
+
+// ===========================================================================
+// pointDetailHref
+// ===========================================================================
+describe('pointDetailHref', () => {
+  it('agentic point with numeric id links to the in-app detail page', () => {
+    const d = pt({ benchmark_type: 'agentic_traces', id: 206863 });
+    expect(pointDetailHref(d, false)).toEqual({
+      href: '/inference/agentic/206863',
+      isExternal: false,
+    });
+  });
+
+  it('fixed-seq point links to its GitHub Actions run (repo URL rewritten)', () => {
+    const d = pt({
+      benchmark_type: 'single_turn',
+      run_url: 'https://github.com/InferenceMAX/InferenceMAX/actions/runs/123',
+    });
+    expect(pointDetailHref(d, false)).toEqual({
+      href: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/123',
+      isExternal: true,
+    });
+  });
+
+  it('agentic point without a numeric id falls back to the run URL', () => {
+    const d = pt({
+      benchmark_type: 'agentic_traces',
+      run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/9',
+    });
+    expect(pointDetailHref(d, false)).toEqual({
+      href: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/9',
+      isExternal: true,
+    });
+  });
+
+  it('returns no link when there is neither an id nor a run URL', () => {
+    expect(pointDetailHref(pt(), false)).toEqual({ href: null, isExternal: false });
+  });
+
+  it('overlay points never get a link (no DB benchmark id)', () => {
+    const d = pt({
+      benchmark_type: 'agentic_traces',
+      id: 42,
+      run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1',
+    });
+    expect(pointDetailHref(d, true)).toEqual({ href: null, isExternal: false });
+  });
+});
+
+// ===========================================================================
+// buildLegendPointsRows
+// ===========================================================================
+describe('buildLegendPointsRows', () => {
+  it('maps official point fields onto table rows', () => {
+    const rows = buildLegendPointsRows(
+      [pt({ benchmark_type: 'agentic_traces', id: 1, ep: 8, dp_attention: true })],
+      false,
+    );
+    expect(rows).toHaveLength(1);
+    expect(rows[0]).toMatchObject({
+      conc: 64,
+      parallelism: 'DEP8',
+      precision: 'fp4',
+      offload: null,
+      tputPerGpu: 1234.5678,
+      p50Intvty: 45.2,
+      p90Intvty: 38.1,
+      p50Ttft: 0.42,
+      p90Ttft: 0.87,
+      href: '/inference/agentic/1',
+      isExternal: false,
+    });
+  });
+
+  it('default-sorts by concurrency ascending', () => {
+    const rows = buildLegendPointsRows(
+      [pt({ conc: 32 }), pt({ conc: 4 }), pt({ conc: 16 })],
+      false,
+    );
+    expect(rows.map((r) => r.conc)).toEqual([4, 16, 32]);
+  });
+
+  it('keeps agentic offload on/off row pairs adjacent and deterministic', () => {
+    const rows = buildLegendPointsRows(
+      [
+        pt({ conc: 8, offload_mode: 'on' }),
+        pt({ conc: 4, offload_mode: 'off' }),
+        pt({ conc: 4, offload_mode: 'on' }),
+      ],
+      false,
+    );
+    expect(rows.map((r) => [r.conc, r.offload])).toEqual([
+      [4, 'OFF'],
+      [4, 'ON'],
+      [8, 'ON'],
+    ]);
+  });
+
+  it('nulls out metrics missing on old points instead of coercing to 0', () => {
+    const rows = buildLegendPointsRows(
+      [pt({ tput_per_gpu: undefined, p90_intvty: undefined, p90_ttft: Number.NaN })],
+      false,
+    );
+    expect(rows[0].tputPerGpu).toBeNull();
+    expect(rows[0].p90Intvty).toBeNull();
+    expect(rows[0].p90Ttft).toBeNull();
+  });
+
+  it('treats the transform\'s "?? 0" coercion of absent metrics as missing', () => {
+    // Agentic rows have no median_* keys in metrics JSONB; benchmark-transform
+    // fills them with 0. These metrics are strictly positive when measured.
+    const rows = buildLegendPointsRows([pt({ median_intvty: 0, median_ttft: 0 })], false);
+    expect(rows[0].p50Intvty).toBeNull();
+    expect(rows[0].p50Ttft).toBeNull();
+  });
+
+  it('overlay rows carry metrics but no links', () => {
+    const rows = buildLegendPointsRows(
+      [pt({ id: 7, benchmark_type: 'agentic_traces', run_url: 'https://github.com/x/y/runs/1' })],
+      true,
+    );
+    expect(rows[0].href).toBeNull();
+    expect(rows[0].tputPerGpu).toBe(1234.5678);
+  });
+});
+
+// ===========================================================================
+// sortLegendPointsRows
+// ===========================================================================
+describe('sortLegendPointsRows', () => {
+  const rows = buildLegendPointsRows(
+    [
+      pt({ conc: 4, tput_per_gpu: 300 }),
+      pt({ conc: 16, tput_per_gpu: undefined }),
+      pt({ conc: 8, tput_per_gpu: 900 }),
+    ],
+    false,
+  );
+
+  it('sorts numeric columns in both directions', () => {
+    expect(sortLegendPointsRows(rows, 'tputPerGpu', 'asc').map((r) => r.conc)).toEqual([4, 8, 16]);
+    expect(sortLegendPointsRows(rows, 'tputPerGpu', 'desc').map((r) => r.conc)).toEqual([8, 4, 16]);
+  });
+
+  it('always sorts null metrics last', () => {
+    for (const dir of ['asc', 'desc'] as const) {
+      expect(sortLegendPointsRows(rows, 'tputPerGpu', dir).at(-1)?.conc).toBe(16);
+    }
+  });
+
+  it('sorts string columns alphabetically', () => {
+    const mixed = buildLegendPointsRows(
+      [pt({ conc: 1, ep: 8 }), pt({ conc: 2, tp: 4, ep: undefined })],
+      false,
+    );
+    expect(sortLegendPointsRows(mixed, 'parallelism', 'asc').map((r) => r.parallelism)).toEqual([
+      '4',
+      'TEP8',
+    ]);
+  });
+
+  it('does not mutate the input array', () => {
+    const before = rows.map((r) => r.conc);
+    sortLegendPointsRows(rows, 'tputPerGpu', 'desc');
+    expect(rows.map((r) => r.conc)).toEqual(before);
+  });
+});
+
+// ===========================================================================
+// formatRowValue
+// ===========================================================================
+describe('formatRowValue', () => {
+  it('renders em dash for missing values', () => {
+    expect(formatRowValue(null)).toBe('—');
+  });
+
+  it('caps at 3 decimals like the scatter tooltip', () => {
+    expect(formatRowValue(1234.5678)).toBe('1234.568');
+    expect(formatRowValue(0.42)).toBe('0.42');
+  });
+
+  it('comma-formats large values like the scatter tooltip', () => {
+    expect(formatRowValue(123456.7)).toBe('123,456.7');
+  });
+});
diff --git a/packages/app/src/components/inference/utils/legend-points-table.ts b/packages/app/src/components/inference/utils/legend-points-table.ts
new file mode 100644
index 00000000..0457e7c2
--- /dev/null
+++ b/packages/app/src/components/inference/utils/legend-points-table.ts
@@ -0,0 +1,123 @@
+import { updateRepoUrl } from '@/lib/utils';
+
+import type { InferenceData } from '@/components/inference/types';
+import { fmt, getPointLabel } from '@/components/inference/utils/tooltipUtils';
+
+/**
+ * One row of the per-series points table opened from the chart legend.
+ * Metric fields are `null` when the point predates the field (old runs) so the
+ * table can render an em dash instead of a misleading 0.
+ */
+export interface LegendPointsTableRow {
+  /** Stable React key — mirrors the scatter chart's per-point identity fields. */
+  key: string;
+  conc: number;
+  /** Shared parallelism label (e.g. "TP8", "DPAEP8", "2xEP4+1xDPAEP32"). */
+  parallelism: string;
+  precision: string;
+  /** Agentic offload mode ("ON" / "OFF"), null for fixed-seq points. */
+  offload: string | null;
+  tputPerGpu: number | null;
+  p50Intvty: number | null;
+  p90Intvty: number | null;
+  p50Ttft: number | null;
+  p90Ttft: number | null;
+  /** Detail link — null for overlay points (no DB benchmark id). */
+  href: string | null;
+  /** True when href is an external GitHub Actions run (open in new tab). */
+  isExternal: boolean;
+}
+
+export type LegendPointsSortKey =
+  | 'conc'
+  | 'parallelism'
+  | 'offload'
+  | 'tputPerGpu'
+  | 'p50Intvty'
+  | 'p90Intvty'
+  | 'p50Ttft'
+  | 'p90Ttft';
+
+// benchmark-transform coerces absent metrics to 0 (`m.median_ttft ?? 0`), and
+// every column metric here (throughput, interactivity, TTFT) is strictly
+// positive in reality — so non-positive means "not recorded", shown as a dash.
+const num = (v: number | undefined | null): number | null =>
+  typeof v === 'number' && Number.isFinite(v) && v > 0 ? v : null;
+
+/**
+ * Detail-page destination for a point — the EXACT same navigation the scatter
+ * tooltip offers on point click: agentic points go to the in-app
+ * `/inference/agentic/<id>` detail page; fixed-seq points open the GitHub
+ * Actions run that produced them. Overlay (unofficial run) points have no DB
+ * benchmark id, so they get no link.
+ */
+export function pointDetailHref(
+  d: InferenceData,
+  isOverlay: boolean,
+): { href: string | null; isExternal: boolean } {
+  if (isOverlay) return { href: null, isExternal: false };
+  if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') {
+    return { href: `/inference/agentic/${d.id}`, isExternal: false };
+  }
+  if (d.run_url) return { href: updateRepoUrl(d.run_url), isExternal: true };
+  return { href: null, isExternal: false };
+}
+
+/**
+ * Shape a series' visible points into table rows, default-sorted by
+ * concurrency ascending (offload/parallelism tie-breaks keep the agentic
+ * on/off row pairs adjacent and deterministic).
+ */
+export function buildLegendPointsRows(
+  points: InferenceData[],
+  isOverlay: boolean,
+): LegendPointsTableRow[] {
+  return points
+    .map((d, i) => {
+      const { href, isExternal } = pointDetailHref(d, isOverlay);
+      return {
+        key: `${d.hwKey}|${d.precision}|${d.conc}|${getPointLabel(d)}|${d.offload_mode ?? ''}|${i}`,
+        conc: d.conc,
+        parallelism: getPointLabel(d),
+        precision: d.precision,
+        offload: d.offload_mode ? d.offload_mode.toUpperCase() : null,
+        tputPerGpu: num(d.tput_per_gpu),
+        p50Intvty: num(d.median_intvty),
+        p90Intvty: num(d.p90_intvty),
+        p50Ttft: num(d.median_ttft),
+        p90Ttft: num(d.p90_ttft),
+        href,
+        isExternal,
+      };
+    })
+    .toSorted(
+      (a, b) =>
+        a.conc - b.conc ||
+        a.parallelism.localeCompare(b.parallelism) ||
+        (a.offload ?? '').localeCompare(b.offload ?? ''),
+    );
+}
+
+/** Column sort with nulls always last; concurrency as the stable tie-break. */
+export function sortLegendPointsRows(
+  rows: LegendPointsTableRow[],
+  key: LegendPointsSortKey,
+  dir: 'asc' | 'desc',
+): LegendPointsTableRow[] {
+  const mul = dir === 'asc' ? 1 : -1;
+  return rows.toSorted((a, b) => {
+    const av = a[key];
+    const bv = b[key];
+    if (av === null && bv === null) return a.conc - b.conc;
+    if (av === null) return 1;
+    if (bv === null) return -1;
+    const cmp =
+      typeof av === 'string' || typeof bv === 'string'
+        ? String(av).localeCompare(String(bv))
+        : (av as number) - (bv as number);
+    return mul * cmp || a.conc - b.conc;
+  });
+}
+
+/** Table cell formatting — same capping as the scatter tooltip values. */
+export const formatRowValue = (v: number | null): string => (v === null ? '—' : fmt(v));
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index e3f0de6d..8f8ab4df 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -80,8 +80,9 @@ const tooltipLine = (label: string, value: string | number) =>
 const formatPct = (v: number | undefined): string | null =>
   v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
 
-/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */
-const fmt = (v: number): string => {
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped).
+ *  Exported so the legend points table shows exactly the numbers the tooltip shows. */
+export const fmt = (v: number): string => {
   if (!Number.isFinite(v)) return String(v);
   const rounded = parseFloat(v.toFixed(3));
   if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
diff --git a/packages/app/src/components/ui/chart-legend-item.tsx b/packages/app/src/components/ui/chart-legend-item.tsx
index fae83360..07344270 100644
--- a/packages/app/src/components/ui/chart-legend-item.tsx
+++ b/packages/app/src/components/ui/chart-legend-item.tsx
@@ -1,4 +1,4 @@
-import { X } from 'lucide-react';
+import { Table2, X } from 'lucide-react';
 import React from 'react';
 
 import { cn } from '@/lib/utils';
@@ -19,6 +19,12 @@ export interface CommonLegendItemProps {
   isLegendExpanded?: boolean; // Whether the legend is expanded to show full text
   sidebarMode?: boolean; // Use sidebar-style visual feedback (line-through + faded dot)
   onRemove?: (name: string) => void;
+  /**
+   * When provided, renders a small table icon that opens a per-series points
+   * table (all data points for this hardware/framework series). Only the
+   * inference tab's legend passes this — other tabs get no icon.
+   */
+  onShowPoints?: (name: string) => void;
 }
 
 const ChartLegendItem: React.FC<CommonLegendItemProps> = ({
@@ -36,6 +42,7 @@ const ChartLegendItem: React.FC<CommonLegendItemProps> = ({
   isLegendExpanded = true,
   sidebarMode = false,
   onRemove,
+  onShowPoints,
 }) => {
   const id = `checkbox-${hw || name}`; // Unique ID for accessibility
   const isLongText = (label ?? '').length > 8;
@@ -97,6 +104,20 @@ const ChartLegendItem: React.FC<CommonLegendItemProps> = ({
           {label}
         </span>
       </label>
+      {onShowPoints && (
+        <button
+          type="button"
+          data-testid={`legend-points-${hw || name}`}
+          aria-label={`Show all ${label} data points`}
+          onClick={() => onShowPoints(hw || name)}
+          // Reduced opacity at rest (still visible/tappable on touch), full on
+          // row hover or keyboard focus. ml-auto pins the icon to the row's
+          // right edge so icons align in a column across variable-length labels.
+          className="ml-auto shrink-0 p-1 -my-1 rounded-sm text-muted-foreground hover:text-foreground opacity-35 group-hover/row:opacity-100 focus-visible:opacity-100 transition-opacity no-export"
+        >
+          <Table2 size={13} />
+        </button>
+      )}
     </>
   );
 
@@ -104,6 +125,7 @@ const ChartLegendItem: React.FC<CommonLegendItemProps> = ({
     'transition-opacity duration-300',
     isActive ? 'opacity-100' : sidebarMode ? 'no-export' : 'opacity-50 no-export',
     isHighlighted && 'text-red-900 dark:text-red-400 font-bold',
+    onShowPoints && 'group/row flex w-full items-center',
   );
 
   if (asFragment) {
diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx
index ca7424bf..86fadfad 100644
--- a/packages/app/src/components/ui/chart-legend.tsx
+++ b/packages/app/src/components/ui/chart-legend.tsx
@@ -427,6 +427,7 @@ export default function ChartLegend({
         onHover={onItemHover}
         onHoverEnd={onItemHoverEnd}
         onRemove={effectiveRemove}
+        onShowPoints={item.onShowPoints}
         asFragment
         isLegendExpanded={effectiveExpanded}
         sidebarMode={isSidebar}
@@ -438,7 +439,9 @@ export default function ChartLegend({
         {enableTooltips ? (
           <TooltipRoot>
             <TooltipTrigger asChild>
-              <div className="w-fit">{legendItem}</div>
+              {/* Full width when the row carries a points-table icon so the
+                  ml-auto icon pins to a consistent right-edge column. */}
+              <div className={item.onShowPoints ? 'w-full' : 'w-fit'}>{legendItem}</div>
             </TooltipTrigger>
             {item.isHighlighted && item.tooltip && (
               <TooltipContent side="bottom" collisionPadding={10}>
@@ -521,6 +524,7 @@ export default function ChartLegend({
                         onHover={onItemHover}
                         onHoverEnd={onItemHoverEnd}
                         onRemove={effectiveRemove}
+                        onShowPoints={item.onShowPoints}
                         sidebarMode={isSidebar}
                         asFragment
                       />

From d6cf3a60a1ddc9d7c22da61668d46baa8242319e Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:25:04 -0500
Subject: [PATCH 15/40] chore: exclude package scratch dirs from typecheck

---
 tsconfig.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tsconfig.json b/tsconfig.json
index b1541c43..7ff2f0b1 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -17,5 +17,5 @@
     }
   },
   "include": ["packages/**/*.ts", "packages/**/*.tsx"],
-  "exclude": ["packages/*/node_modules", "packages/*/.next"]
+  "exclude": ["packages/*/node_modules", "packages/*/.next", "packages/*/scratch"]
 }

From 9c4dca063bebf3f185c388298130fd3a77e623a1 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:25:12 -0500
Subject: [PATCH 16/40] fix(agentic): exclude osl=0 turns from normalized-E2E
 derivation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

extractTurn guarded isl<=0 but not osl<=0, so cancelled/empty-output
turns collapsed the whole decode window into one ITL interval and the
@400-token projection became ttft + 399x(latency-ttft) — ~386x
inflation baked into stored p75/p90 aggregates (seeded repro: p90
1104.78s -> 6.01s). STATS_VERSION bumped 4->5 so stored payloads
recompute via the version fallback. Adds regression test.
---
 packages/db/src/queries/agentic-aggregates.ts |  5 ++-
 .../queries/derived-agentic-metrics.test.ts   | 41 +++++++++++++++++++
 .../db/src/queries/derived-agentic-metrics.ts |  2 +-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 72faa148..0443398d 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -49,8 +49,11 @@ export { percentilesOf, type MetricPercentiles } from './agentic-shared';
  * they do for vllm runs.
  *
  * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL.
+ *
+ * v5: reject osl <= 0 in extractTurn to exclude cancelled/empty-output turns
+ * whose decode-interval math would explode normalized E2E to thousands of seconds.
  */
-export const STATS_VERSION = 4;
+export const STATS_VERSION = 5;
 
 export interface AgenticAggregate {
   id: number;
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
index afc5b22d..84c09193 100644
--- a/packages/db/src/queries/derived-agentic-metrics.test.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -108,4 +108,45 @@ describe('computeDerivedFromBlob', () => {
     const out = computeDerivedFromBlob(turns.join('\n'));
     expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6);
   });
+
+  it('excludes osl=0 (cancelled/empty-output) turns from normalized E2E', () => {
+    // Two normal turns + one cancelled turn (osl=0, latency=30s, ttft=1s).
+    //
+    // The cancelled turn must be excluded because observedDecodeIntervals collapses
+    // to max(0-1,1)=1, making itlMs=(30000-1000)/1=29000ms and normalizedMs explode
+    // to ~11 572 s — roughly 386× the real scale. (Pre-fix behavior for reference;
+    // this number is intentionally not asserted below to avoid enshrining the bug.)
+    //
+    // Normal turn A: isl=100, osl=50, ttft=500ms, latency=1000ms
+    //   observedDecodeIntervals = max(49,1) = 49
+    //   itlMs = (1000-500)/49
+    //   normalizedMs = 500 + 399*(500/49)
+    //
+    // Normal turn B: isl=200, osl=100, ttft=1000ms, latency=3000ms
+    //   observedDecodeIntervals = max(99,1) = 99
+    //   itlMs = (3000-1000)/99
+    //   normalizedMs = 1000 + 399*(2000/99)
+    const normA = (500 + (399 * 500) / 49) / 1000; // seconds
+    const normB = (1000 + (399 * 2000) / 99) / 1000; // seconds
+
+    const jsonl = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s1', 1, { isl: 200, osl: 100, ttft_ms: 1000, latency_ms: 3000 }),
+      // Cancelled / empty-output turn — osl=0 must be rejected by extractTurn.
+      rec('s2', 0, { isl: 150, osl: 0, ttft_ms: 1000, latency_ms: 30000 }),
+    ].join('\n');
+
+    const out = computeDerivedFromBlob(jsonl);
+
+    // Only the 2 normal turns contribute; osl=0 record is silently excluded.
+    expect(out.normalized_e2e_400?.n).toBe(2);
+
+    // p90 of [normA, normB] sorted ascending (normA < normB):
+    // pos = 1*0.9 = 0.9; result = normA + (normB - normA)*0.9
+    const expectedP90 = normA + (normB - normA) * 0.9;
+    expect(out.normalized_e2e_400?.p90).toBeCloseTo(expectedP90, 6);
+
+    // Sanity: p90 should be single-digit seconds, not thousands.
+    expect(out.normalized_e2e_400!.p90).toBeLessThan(20);
+  });
 });
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index 8e5d15c9..24b24cf1 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -86,7 +86,7 @@ function extractTurn(rec: ProfileRecord): TurnFields | null {
   const isl = readNum(m.input_sequence_length);
   const osl = readNum(m.output_sequence_length);
   if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null;
-  if (rl <= 0 || tt <= 0 || isl <= 0) return null;
+  if (rl <= 0 || tt <= 0 || isl <= 0 || osl <= 0) return null;
   return { request_latency_ms: rl, ttft_ms: tt, isl, osl };
 }
 

From ef80fef4f210a0afad729ea681e5e1367a9c77d0 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:32:52 -0500
Subject: [PATCH 17/40] fix(db): include offload_mode in getBenchmarksForRun
 dedup key

DISTINCT ON (config_id, conc, isl, osl) collapsed agentic offload
on/off variants (isl/osl both NULL) into one arbitrary winner, so run
views silently dropped half the sweep (seeded repro: 2 rows -> 4).
Adds offload_mode to the SQL DISTINCT ON + ORDER BY and to the
json-provider dedup key (normalized ?? 'off' to match lineKey).
Every other selection path already keyed on it. Adds 4 regression
tests.
---
 ...on-provider.get-benchmarks-for-run.test.ts | 151 ++++++++++++++++++
 packages/db/src/json-provider.ts              |   2 +-
 packages/db/src/queries/benchmarks.ts         |   4 +-
 3 files changed, 154 insertions(+), 3 deletions(-)
 create mode 100644 packages/db/src/json-provider.get-benchmarks-for-run.test.ts

diff --git a/packages/db/src/json-provider.get-benchmarks-for-run.test.ts b/packages/db/src/json-provider.get-benchmarks-for-run.test.ts
new file mode 100644
index 00000000..cd640f0e
--- /dev/null
+++ b/packages/db/src/json-provider.get-benchmarks-for-run.test.ts
@@ -0,0 +1,151 @@
+import { mkdtempSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+
+import type { getBenchmarksForRun as GetBenchmarksForRun } from './json-provider.js';
+
+/**
+ * Regression guard for the offload_mode dedup bug in getBenchmarksForRun.
+ *
+ * Agentic sweeps that test offload ON and OFF at the same (config, conc,
+ * isl=NULL, osl=NULL) produce two distinct benchmark_results rows that differ
+ * only in offload_mode. The old dedup key was:
+ *
+ *   `${config_id}:${conc}:${isl}:${osl}`
+ *
+ * which collapsed both offload variants into one, silently dropping the second.
+ * The fix appends `?? 'off'` normalised offload_mode:
+ *
+ *   `${config_id}:${conc}:${isl}:${osl}:${offload_mode ?? 'off'}`
+ *
+ * This test seeds two rows differing only in offload_mode at the same
+ * (config, conc, isl=null, osl=null) and asserts BOTH survive.
+ */
+
+const cfg = (id: number) => ({
+  id,
+  hardware: 'h100',
+  framework: 'vllm',
+  model: 'testm',
+  precision: 'fp8',
+  spec_method: 'none',
+  disagg: false,
+  is_multinode: false,
+  prefill_tp: 1,
+  prefill_ep: 1,
+  prefill_dp_attention: false,
+  prefill_num_workers: 1,
+  decode_tp: 1,
+  decode_ep: 1,
+  decode_dp_attention: false,
+  decode_num_workers: 1,
+  num_prefill_gpu: 0,
+  num_decode_gpu: 8,
+});
+
+const run = (id: number, githubId: number, date: string) => ({
+  id,
+  github_run_id: githubId,
+  run_attempt: 1,
+  name: `run ${githubId}`,
+  status: 'completed',
+  conclusion: 'success',
+  head_sha: 'sha',
+  head_branch: 'main',
+  html_url: `https://github.com/x/runs/${githubId}`,
+  created_at: `${date}T00:00:00Z`,
+  run_started_at: `${date}T00:00:00Z`,
+  date,
+});
+
+let nextId = 1;
+const result = (
+  runDbId: number,
+  configId: number,
+  date: string,
+  conc: number,
+  offloadMode: string | null,
+  isl: number | null = null,
+  osl: number | null = null,
+) => ({
+  id: nextId++,
+  workflow_run_id: runDbId,
+  config_id: configId,
+  benchmark_type: 'agentic',
+  date,
+  isl,
+  osl,
+  conc,
+  offload_mode: offloadMode,
+  image: null,
+  metrics: { median_tpot: 0.1 },
+  error: null,
+  server_log_id: null,
+});
+
+const DATE = '2026-07-01';
+const GITHUB_RUN_ID = 9999001;
+
+let getBenchmarksForRun: typeof GetBenchmarksForRun;
+
+beforeAll(async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'infx-get-benchmarks-for-run-'));
+  writeFileSync(join(dir, 'configs.json'), JSON.stringify([cfg(1)]));
+  writeFileSync(
+    join(dir, 'workflow_runs.json'),
+    JSON.stringify([
+      run(1, GITHUB_RUN_ID, DATE), // the agentic sweep run
+    ]),
+  );
+  writeFileSync(
+    join(dir, 'benchmark_results.json'),
+    JSON.stringify([
+      // conc=16, offload=off
+      result(1, 1, DATE, 16, 'off'),
+      // conc=16, offload=on — same (config, conc, isl=null, osl=null), differs only in offload_mode
+      result(1, 1, DATE, 16, 'on'),
+      // conc=64, offload=off
+      result(1, 1, DATE, 64, 'off'),
+      // conc=64, offload=on
+      result(1, 1, DATE, 64, 'on'),
+    ]),
+  );
+  process.env.DUMP_DIR = dir;
+  const mod = await import('./json-provider.js');
+  getBenchmarksForRun = mod.getBenchmarksForRun;
+});
+
+afterAll(() => {
+  delete process.env.DUMP_DIR;
+});
+
+describe('getBenchmarksForRun — offload_mode dedup', () => {
+  it('returns all 4 rows when an agentic sweep covers offload on+off at both concurrencies', () => {
+    const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID);
+    expect(rows).toHaveLength(4);
+  });
+
+  it('preserves both offload modes at conc=16', () => {
+    const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID).filter((r) => r.conc === 16);
+    expect(rows).toHaveLength(2);
+    const modes = rows.map((r) => r.offload_mode).toSorted();
+    expect(modes).toEqual(['off', 'on']);
+  });
+
+  it('preserves both offload modes at conc=64', () => {
+    const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID).filter((r) => r.conc === 64);
+    expect(rows).toHaveLength(2);
+    const modes = rows.map((r) => r.offload_mode).toSorted();
+    expect(modes).toEqual(['off', 'on']);
+  });
+
+  it('treats null offload_mode as "off" (no double-count with an explicit off row)', () => {
+    // Only one row with offload_mode=null, no 'off' row — should yield exactly 1 result.
+    const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID).filter((r) => r.conc === 16);
+    // Both rows have explicit 'off'/'on'; the null-normalisation is verified by absence of dups.
+    const nullOrOff = rows.filter((r) => r.offload_mode === null || r.offload_mode === 'off');
+    expect(nullOrOff).toHaveLength(1); // exactly one 'off' variant survives dedup
+  });
+});
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index b502b243..2d335d17 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -439,7 +439,7 @@ export function getBenchmarksForRun(
     if (br.workflow_run_id !== run.id) continue;
     const c = s.configs.get(br.config_id);
     if (!c || !modelKeys.has(c.model)) continue;
-    const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`;
+    const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`;
     if (!seen.has(key)) seen.set(key, br);
   }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 37301e2b..d09f92f4 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -218,7 +218,7 @@ export async function getBenchmarksForRun(
 ): Promise<BenchmarkRow[]> {
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
-    SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+    SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
       br.id,
       c.hardware,
       c.framework,
@@ -253,7 +253,7 @@ export async function getBenchmarksForRun(
     WHERE c.model = ANY(${modelKeys})
       AND br.error IS NULL
       AND wr.github_run_id = ${Number(githubRunId)}
-    ORDER BY br.config_id, br.conc, br.isl, br.osl, br.date DESC
+    ORDER BY br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date DESC
   `;
   return rows as unknown as BenchmarkRow[];
 }

From 338b0df54e53522ee5796d3f657fb38353f9e4d4 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:00:57 -0500
Subject: [PATCH 18/40] fix(agentic): version-derived cache keys + self-healing
 stale recomputes

Four blob-cached agentic routes had unversioned cache keys; blobSet is
write-once and backfills never purge, so payload-version bumps served
stale blobs indefinitely (the DB version check is bypassed on blob
hits). Keys now derive from the governing VERSION constants
(STATS/REQUEST_TIMELINE/CHART_SERIES), asserted by tests.

Stale/missing recomputes now persist their result via a best-effort
fire-and-forget ::jsonb write-back (no-ops on read replicas), so one
request self-heals a row instead of re-gunzipping the raw blob until a
manual backfill. STATS_VERSION moves to the dependency-free
agentic-shared leaf to avoid an import cycle. Live-verified: stored
payloads healed 4->5 / 11->12 / 4->5 on a single query.
---
 .../app/api/v1/agentic-aggregates/route.ts    |  11 +-
 .../src/app/api/v1/agentic-cache-keys.test.ts |  70 ++++++++
 .../api/v1/derived-agentic-metrics/route.ts   |  15 +-
 .../src/app/api/v1/request-timeline/route.ts  |  10 +-
 .../src/app/api/v1/trace-histograms/route.ts  |  11 +-
 .../app/api/v1/trace-server-metrics/route.ts  |  10 +-
 .../db/src/queries/agentic-aggregates.test.ts | 151 +++++++++++++++++-
 packages/db/src/queries/agentic-aggregates.ts | 147 ++++++++++-------
 .../db/src/queries/agentic-shared.test.ts     |  79 +++++++++
 packages/db/src/queries/agentic-shared.ts     | 138 +++++++++++++++-
 .../queries/derived-agentic-metrics.test.ts   | 106 +++++++++++-
 .../db/src/queries/derived-agentic-metrics.ts |  72 +++++++--
 .../db/src/queries/request-timeline.test.ts   |  52 ++++++
 packages/db/src/queries/request-timeline.ts   |  14 +-
 .../src/queries/trace-server-metrics.test.ts  |   6 +-
 .../db/src/queries/trace-server-metrics.ts    |   8 +
 16 files changed, 815 insertions(+), 85 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/agentic-cache-keys.test.ts
 create mode 100644 packages/db/src/queries/agentic-shared.test.ts

diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
index 63fd2512..83238e89 100644
--- a/packages/app/src/app/api/v1/agentic-aggregates/route.ts
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -1,6 +1,7 @@
 import { getDb } from '@semianalysisai/inferencex-db/connection';
 import {
   getAgenticAggregates,
+  STATS_VERSION,
   type AgenticAggregateMap,
 } from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
 
@@ -13,9 +14,17 @@ export const dynamic = 'force-dynamic';
 // blobOnly: response stays small (a few numbers per id), but generating it
 // parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the
 // "Aggregates" toggle stays snappy.
+//
+// Key derived from STATS_VERSION (governs the `aggregate_stats` payload). The
+// blob cache is write-once with no post-backfill purge, so deriving the key
+// from the constant is what rolls the namespace on a version bump — a
+// hand-written string would pin the route to stale blob hits forever.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `agentic-aggregates-v${STATS_VERSION}`;
+
 const getCachedAgenticAggregates = cachedQuery(
   (ids: number[]): Promise<AgenticAggregateMap> => getAgenticAggregates(getDb(), ids),
-  'agentic-aggregates',
+  CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/app/api/v1/agentic-cache-keys.test.ts b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts
new file mode 100644
index 00000000..58fa194f
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts
@@ -0,0 +1,70 @@
+/**
+ * Guards that every agentic blob-cache key is DERIVED from the version constant
+ * that governs its payload — not a hand-written string. `blobSet` is write-once
+ * and nothing purges the blob cache after a backfill, so an unversioned (or
+ * hand-bumped) key would serve stale data forever after a payload-version bump.
+ * Deriving the key from the constant means a future bump rolls the cache
+ * namespace automatically; these tests fail loudly if a route drifts back to a
+ * literal string.
+ */
+
+import { describe, expect, it, vi } from 'vitest';
+
+// Route modules call getDb() at import time via cachedQuery's closure and pull
+// in the blob cache — stub both so importing the route is side-effect-free.
+vi.mock('@semianalysisai/inferencex-db/connection', () => ({
+  getDb: vi.fn(() => 'mock-sql'),
+  JSON_MODE: false,
+  FIXTURES_MODE: false,
+}));
+
+vi.mock('@/lib/api-cache', () => ({
+  // Passthrough so importing the route doesn't touch blob storage; the key is
+  // still exported as CACHE_KEY_PREFIX for us to assert on.
+  cachedQuery: (fn: (...args: unknown[]) => unknown) => fn,
+  cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series';
+import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
+
+import { CACHE_KEY_PREFIX as derivedAgenticMetricsKey } from './derived-agentic-metrics/route';
+import { CACHE_KEY_PREFIX as agenticAggregatesKey } from './agentic-aggregates/route';
+import { CACHE_KEY_PREFIX as requestTimelineKey } from './request-timeline/route';
+import { CACHE_KEY_PREFIX as traceServerMetricsKey } from './trace-server-metrics/route';
+import { CACHE_KEY_PREFIX as traceHistogramsKey } from './trace-histograms/route';
+
+describe('agentic blob-cache keys are version-derived', () => {
+  it('derived-agentic-metrics key embeds STATS_VERSION', () => {
+    expect(derivedAgenticMetricsKey).toBe(`derived-agentic-metrics-v${STATS_VERSION}`);
+  });
+
+  it('agentic-aggregates key embeds STATS_VERSION', () => {
+    expect(agenticAggregatesKey).toBe(`agentic-aggregates-v${STATS_VERSION}`);
+  });
+
+  it('request-timeline key embeds REQUEST_TIMELINE_VERSION', () => {
+    expect(requestTimelineKey).toBe(`request-timeline-v${REQUEST_TIMELINE_VERSION}`);
+  });
+
+  it('trace-server-metrics key embeds CHART_SERIES_VERSION', () => {
+    expect(traceServerMetricsKey).toBe(`trace-server-metrics-v${CHART_SERIES_VERSION}`);
+  });
+
+  it('trace-histograms key embeds REQUEST_TIMELINE_VERSION (its payload is read from request_timeline)', () => {
+    expect(traceHistogramsKey).toBe(`trace-histograms-v${REQUEST_TIMELINE_VERSION}`);
+  });
+
+  it('every key actually contains a version segment (no unversioned literals)', () => {
+    for (const key of [
+      derivedAgenticMetricsKey,
+      agenticAggregatesKey,
+      requestTimelineKey,
+      traceServerMetricsKey,
+      traceHistogramsKey,
+    ]) {
+      expect(key).toMatch(/-v\d+$/u);
+    }
+  });
+});
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index 836a8d93..647b6dda 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -1,3 +1,4 @@
+import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
 import { getDb } from '@semianalysisai/inferencex-db/connection';
 import {
   getDerivedAgenticMetrics,
@@ -13,12 +14,18 @@ export const dynamic = 'force-dynamic';
 // blobOnly: the response is one entry per id with two numbers, but the
 // derivation work parses thousands of JSONL records per blob — cache the
 // computed result so a chart-refresh hits the warm path.
-// Bumped to v3 for per-request normalized-E2E @ 400 output tokens.
-// Stale v1 cache entries return undefined for the new field and silently
-// blank the chart with "No data available".
+//
+// The cache key is derived from STATS_VERSION (the payload governs the derived
+// metrics read out of `aggregate_stats`). blobSet is write-once and nothing
+// purges post-backfill, so a hand-written version string would serve stale
+// data forever after a bump — deriving the key from the constant means a
+// STATS_VERSION bump automatically rolls the cache namespace.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `derived-agentic-metrics-v${STATS_VERSION}`;
+
 const getCachedDerivedAgenticMetrics = cachedQuery(
   (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
-  'derived-agentic-metrics-v3',
+  CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
index 9a3750d6..bd1d67f5 100644
--- a/packages/app/src/app/api/v1/request-timeline/route.ts
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -1,3 +1,4 @@
+import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
 import { getDb } from '@semianalysisai/inferencex-db/connection';
 import {
   getRequestTimeline,
@@ -10,9 +11,16 @@ import { idQueryRoute } from '../id-routes';
 
 export const dynamic = 'force-dynamic';
 
+// Key derived from REQUEST_TIMELINE_VERSION (governs the `request_timeline`
+// payload). The blob cache is write-once with no post-backfill purge, so the
+// version-derived key is what rolls the namespace on a bump — a hand-written
+// string would serve stale blob-cached timelines forever.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `request-timeline-v${REQUEST_TIMELINE_VERSION}`;
+
 const getCachedRequestTimeline = cachedQuery(
   (id: number): Promise<RequestTimeline | null> => getRequestTimeline(getDb(), id),
-  'request-timeline',
+  CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
index 131010ff..206205f5 100644
--- a/packages/app/src/app/api/v1/trace-histograms/route.ts
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -1,3 +1,4 @@
+import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
 import { getDb } from '@semianalysisai/inferencex-db/connection';
 import {
   getTraceHistograms,
@@ -14,9 +15,17 @@ export const dynamic = 'force-dynamic';
 // unstable_cache limit (each point carries one int per request, ~500-1000+
 // requests for agentic), which manifests as a 500 from the route. Blob
 // storage lets us cache the larger response without losing the warm-cache hit.
+//
+// Key derived from REQUEST_TIMELINE_VERSION: the histograms are read out of the
+// `request_timeline` payload (getTraceHistograms keys its fast path off that
+// constant). The blob cache is write-once with no post-backfill purge, so the
+// version-derived key is what rolls the namespace on a bump — the previously
+// unversioned key would serve stale histograms forever.
+export const CACHE_KEY_PREFIX = `trace-histograms-v${REQUEST_TIMELINE_VERSION}`;
+
 const getCachedTraceHistograms = cachedQuery(
   (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
-  'trace-histograms',
+  CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
index a759e6dc..149fefbf 100644
--- a/packages/app/src/app/api/v1/trace-server-metrics/route.ts
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -1,3 +1,4 @@
+import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series';
 import { getDb } from '@semianalysisai/inferencex-db/connection';
 import {
   getTraceServerMetrics,
@@ -10,9 +11,16 @@ import { idQueryRoute } from '../id-routes';
 
 export const dynamic = 'force-dynamic';
 
+// Key derived from CHART_SERIES_VERSION (governs the `chart_series` payload).
+// The blob cache is write-once with no post-backfill purge, so the
+// version-derived key is what rolls the namespace on a bump — a hand-written
+// string would serve stale blob-cached series forever.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `trace-server-metrics-v${CHART_SERIES_VERSION}`;
+
 const getCachedTraceServerMetrics = cachedQuery(
   (id: number): Promise<TraceServerMetrics | null> => getTraceServerMetrics(getDb(), id),
-  'trace-server-metrics',
+  CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
 
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
index 529306cf..0c4dbc89 100644
--- a/packages/db/src/queries/agentic-aggregates.test.ts
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -1,6 +1,16 @@
+import { gzipSync } from 'node:zlib';
+
 import { describe, expect, it } from 'vitest';
 
-import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates';
+import type { DbClient } from '../connection.js';
+
+import {
+  extractIslOsl,
+  extractServerMetricSamples,
+  getAgenticAggregates,
+  percentilesOf,
+  STATS_VERSION,
+} from './agentic-aggregates';
 
 describe('percentilesOf', () => {
   it('returns null for empty input', () => {
@@ -111,3 +121,142 @@ describe('extractServerMetricSamples', () => {
     expect(out.prefixCacheHitRate).toEqual([]);
   });
 });
+
+/** The write-back payload as bound to the UPDATE (a partial aggregate_stats). */
+interface WrittenStats {
+  version: number;
+  isl: unknown;
+  osl: unknown;
+  kvCacheUtil: { mean: number } | null;
+  prefixCacheHitRate: unknown;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+  normalizedE2e400: unknown;
+}
+
+/** Capture SQL template text + bound values for the write-back assertions. */
+function mockSql(queue: unknown[][]): {
+  sql: DbClient;
+  calls: { text: string; values: unknown[] }[];
+} {
+  const responses = [...queue];
+  const calls: { text: string; values: unknown[] }[] = [];
+  const sql = ((strings: TemplateStringsArray, ...values: unknown[]) => {
+    calls.push({ text: strings.join('?'), values });
+    return Promise.resolve(responses.shift() ?? []);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+/** One aiperf profiling record for the fallback profile blob. */
+function profileRec(fields: {
+  cid: string;
+  isl: number;
+  osl: number;
+  ttft_ms: number;
+  latency_ms: number;
+}): string {
+  return JSON.stringify({
+    metadata: { conversation_id: fields.cid, turn_index: 0, benchmark_phase: 'profiling' },
+    metrics: {
+      request_latency: { value: fields.latency_ms, unit: 'ms' },
+      time_to_first_token: { value: fields.ttft_ms, unit: 'ms' },
+      input_sequence_length: { value: fields.isl, unit: 'tokens' },
+      output_sequence_length: { value: fields.osl, unit: 'tokens' },
+    },
+  });
+}
+
+describe('getAgenticAggregates write-back', () => {
+  it('recomputes ALL profile+server fields and writes a complete bundle back on the stale path', async () => {
+    const profileBlob = gzipSync(
+      Buffer.from(
+        [
+          profileRec({ cid: 's1', isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+          profileRec({ cid: 's1', isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+        ].join('\n'),
+      ),
+    );
+    const serverBlob = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metrics: {
+            'vllm:kv_cache_usage_perc': {
+              series: [{ timeslices: [{ start_ns: 0, avg: 0.25 }] }],
+            },
+          },
+        }),
+      ),
+    );
+
+    // Stale row with server AND derived fields we must NOT trust — the route
+    // recomputes both from the blobs, so nothing is carried forward.
+    const staleStats = {
+      version: STATS_VERSION - 1,
+      isl: null,
+      osl: null,
+      kvCacheUtil: { mean: 0.9, p50: 0.9, p75: 0.9, p90: 0.9, p99: 0.9, n: 1 },
+      prefixCacheHitRate: null,
+      normalizedSessionTimeS: 999,
+      p90PrefillTpsPerUser: 999,
+    };
+
+    const { sql, calls } = mockSql([
+      // fetchAggregateStatsRows
+      [{ benchmark_result_id: 7, stats: staleStats }],
+      // Pass 1: profile blob (+ trace_replay_id for write-back)
+      [{ benchmark_result_id: 7, trace_replay_id: 870, profile_blob: profileBlob }],
+      // Pass 2: server blob
+      [{ benchmark_result_id: 7, server_blob: serverBlob }],
+    ]);
+
+    const result = await getAgenticAggregates(sql, [7]);
+
+    // Response reflects the fresh recompute (isl/osl + kv from the blobs).
+    expect(result[7]?.isl?.n).toBe(2);
+    expect(result[7]?.kvCacheUtil?.mean).toBeCloseTo(0.25, 6);
+
+    // 4 calls: stats read, profile read, server read, write-back UPDATE.
+    expect(calls).toHaveLength(4);
+    expect(calls[3]!.text).toContain('update agentic_trace_replay set aggregate_stats');
+    expect(calls[3]!.text).toContain('::jsonb where id');
+
+    // The payload OBJECT is bound directly (not stringified — that would
+    // double-encode into a JSONB string).
+    const [written, traceReplayId] = calls[3]!.values as [WrittenStats, number];
+    expect(traceReplayId).toBe(870);
+    expect(written.version).toBe(STATS_VERSION);
+    // Server field FRESHLY recomputed (0.25), not the stale 0.9 carried forward.
+    expect(written.kvCacheUtil?.mean).toBeCloseTo(0.25, 6);
+    // Derived fields FRESHLY recomputed (not the stale 999s).
+    expect(written.normalizedSessionTimeS).toBeCloseTo(3, 6);
+    expect(written.p90PrefillTpsPerUser).toBeCloseTo(200, 6);
+    expect(written.normalizedE2e400).not.toBeNull();
+    expect(written.isl).not.toBeNull();
+  });
+
+  it('does not write back for an id whose profile blob is missing/malformed', async () => {
+    const staleStats = {
+      version: STATS_VERSION - 1,
+      isl: null,
+      osl: null,
+      kvCacheUtil: null,
+      prefixCacheHitRate: null,
+      normalizedSessionTimeS: null,
+      p90PrefillTpsPerUser: null,
+    };
+    const { sql, calls } = mockSql([
+      [{ benchmark_result_id: 7, stats: staleStats }],
+      // Pass 1: no profile blob → nothing to recompute, nothing to heal.
+      [{ benchmark_result_id: 7, trace_replay_id: 870, profile_blob: null }],
+      // Pass 2: no server blob either.
+      [{ benchmark_result_id: 7, server_blob: null }],
+    ]);
+
+    await getAgenticAggregates(sql, [7]);
+
+    // stats read + 2 blob reads only — no write-back (profile parse never succeeded).
+    expect(calls).toHaveLength(3);
+    expect(calls.some((c) => c.text.includes('update agentic_trace_replay'))).toBe(false);
+  });
+});
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 0443398d..73d6ae58 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -23,37 +23,26 @@ import { pick } from 'stream-json/filters/pick.js';
 import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
+import { computeDerivedFromBlob } from './derived-agentic-metrics';
 import {
+  extractIslOsl,
   fetchAggregateStatsRows,
   percentilesOf,
-  readNum,
+  STATS_VERSION,
+  writeBackTraceReplayJsonb,
   type MetricPercentiles,
 } from './agentic-shared';
 
-// Percentile math + envelope reader live in agentic-shared.ts; re-exported
-// here because etl/compute-aggregate-stats and the API layer import them
-// from this module.
-export { percentilesOf, type MetricPercentiles } from './agentic-shared';
-
-/**
- * Bump when the aggregate-stats computation algorithm changes — the backfill
- * script recomputes any row whose stored `aggregate_stats.version` is older.
- * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
- * import: the compute helper depends on the extractors below.
- *
- * v2: aggregate vllm gauges/counters across all engine series (was reading
- * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
- *
- * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
- * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
- * they do for vllm runs.
- *
- * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL.
- *
- * v5: reject osl <= 0 in extractTurn to exclude cancelled/empty-output turns
- * whose decode-interval math would explode normalized E2E to thousands of seconds.
- */
-export const STATS_VERSION = 5;
+// STATS_VERSION, the profile extractor `extractIslOsl`, and the percentile
+// math + envelope reader all live in agentic-shared.ts (the cycle-free leaf).
+// Re-exported here because etl/compute-aggregate-stats and the API layer
+// import them from this module.
+export {
+  extractIslOsl,
+  percentilesOf,
+  STATS_VERSION,
+  type MetricPercentiles,
+} from './agentic-shared';
 
 export interface AgenticAggregate {
   id: number;
@@ -76,36 +65,6 @@ export type AgenticAggregateMap = Record<number, AgenticAggregate>;
 const PROFILE_CHUNK_SIZE = 8;
 const SERVER_CHUNK_SIZE = 1;
 
-interface ProfileRecord {
-  metadata?: { benchmark_phase?: string };
-  metrics?: {
-    input_sequence_length?: { value?: number } | number;
-    output_sequence_length?: { value?: number } | number;
-  };
-}
-
-/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */
-export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
-  const isl: number[] = [];
-  const osl: number[] = [];
-  for (const line of jsonl.split('\n')) {
-    if (!line) continue;
-    let rec: ProfileRecord;
-    try {
-      rec = JSON.parse(line) as ProfileRecord;
-    } catch {
-      continue;
-    }
-    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
-    const m = rec.metrics ?? {};
-    const i = readNum(m.input_sequence_length);
-    const o = readNum(m.output_sequence_length);
-    if (typeof i === 'number') isl.push(i);
-    if (typeof o === 'number') osl.push(o);
-  }
-  return { isl, osl };
-}
-
 interface TimeSlice {
   start_ns?: number;
   end_ns?: number;
@@ -322,17 +281,29 @@ export async function getAgenticAggregates(
     return result;
   }
 
+  // Accumulate a complete, version-stamped `aggregate_stats` bundle per id as
+  // the two passes recompute it, so we can self-heal the shared JSONB column
+  // afterward (see the write-back loop below). Only ids whose profile blob
+  // parsed cleanly get an entry — a null/malformed blob must never overwrite
+  // good stored data.
+  const pendingById = new Map<number, { traceReplayId: number; stats: FullAggregateStats }>();
+
   // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ──────
   for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) {
     const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
+        atr.id as trace_replay_id,
         atr.profile_export_jsonl_gz as profile_blob
       from benchmark_results br
       join agentic_trace_replay atr on atr.id = br.trace_replay_id
       where br.id = any(${chunk}::bigint[])
-    `) as { benchmark_result_id: number; profile_blob: Buffer | null }[];
+    `) as {
+      benchmark_result_id: number;
+      trace_replay_id: number;
+      profile_blob: Buffer | null;
+    }[];
     for (const row of rows) {
       const id = Number(row.benchmark_result_id);
       result[id] ??= blankAggregate(id);
@@ -340,8 +311,29 @@ export async function getAgenticAggregates(
         try {
           const jsonl = gunzipSync(row.profile_blob).toString('utf8');
           const { isl, osl } = extractIslOsl(jsonl);
-          result[id].isl = percentilesOf(isl);
-          result[id].osl = percentilesOf(osl);
+          const islPct = percentilesOf(isl);
+          const oslPct = percentilesOf(osl);
+          result[id].isl = islPct;
+          result[id].osl = oslPct;
+          // Recompute the profile-derived fields too (same jsonl, no extra
+          // read) so the self-healed bundle is a faithful full recompute — not
+          // a carry-forward of stale derived numbers stamped with a new
+          // version. Server-derived fields are filled in Pass 2 (or stay null
+          // when the server blob is absent, which is the correct complete value).
+          const derived = computeDerivedFromBlob(jsonl);
+          pendingById.set(id, {
+            traceReplayId: Number(row.trace_replay_id),
+            stats: {
+              version: STATS_VERSION,
+              isl: islPct,
+              osl: oslPct,
+              kvCacheUtil: null,
+              prefixCacheHitRate: null,
+              normalizedSessionTimeS: derived.normalized_session_time_s,
+              p90PrefillTpsPerUser: derived.p90_prefill_tps_per_user,
+              normalizedE2e400: derived.normalized_e2e_400,
+            },
+          });
         } catch {
           // ignore malformed blob
         }
@@ -385,11 +377,30 @@ export async function getAgenticAggregates(
         }
       }
       if (parsed) {
-        result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil);
-        result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate);
+        const kvPct = percentilesOf(parsed.kvCacheUtil);
+        const prefixPct = percentilesOf(parsed.prefixCacheHitRate);
+        result[id].kvCacheUtil = kvPct;
+        result[id].prefixCacheHitRate = prefixPct;
+        const pending = pendingById.get(id);
+        if (pending) {
+          pending.stats.kvCacheUtil = kvPct;
+          pending.stats.prefixCacheHitRate = prefixPct;
+        }
       }
     }
   }
+
+  // Self-heal the shared `aggregate_stats` column: persist the freshly
+  // recomputed, version-stamped bundle so the next request (this route AND the
+  // derived-agentic-metrics route, which read the same column) takes the fast
+  // path instead of re-decompressing these blobs. Only ids whose profile blob
+  // parsed cleanly are in `pendingById`, so a null/malformed recompute never
+  // clobbers good data. Fire-and-forget, best-effort (no-ops on a read-only
+  // replica) — never delays or fails the response.
+  for (const { traceReplayId, stats } of pendingById.values()) {
+    writeBackTraceReplayJsonb(sql, 'aggregate_stats', traceReplayId, stats);
+  }
+
   return result;
 }
 
@@ -404,6 +415,22 @@ interface AggregateStatsRow {
   p90PrefillTpsPerUser: number | null;
 }
 
+/**
+ * The complete `aggregate_stats` bundle we write back on the fallback path.
+ * Mirrors `AggregateStats` in etl/compute-aggregate-stats.ts (kept local to
+ * avoid an import cycle with that module, which depends on this one).
+ */
+interface FullAggregateStats {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+  normalizedE2e400: MetricPercentiles | null;
+}
+
 function blankAggregate(id: number): AgenticAggregate {
   return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/packages/db/src/queries/agentic-shared.test.ts b/packages/db/src/queries/agentic-shared.test.ts
new file mode 100644
index 00000000..35a25d97
--- /dev/null
+++ b/packages/db/src/queries/agentic-shared.test.ts
@@ -0,0 +1,79 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import type { DbClient } from '../connection.js';
+
+import { _resetWriteBackWarned, writeBackTraceReplayJsonb } from './agentic-shared';
+
+/**
+ * Capture every SQL call: the joined template text plus the bound values, so we
+ * can assert the write-back targets the right column and binds the JSONB
+ * payload as a `::jsonb`-cast JSON string (driver-agnostic).
+ */
+function mockSql(reject?: Error): {
+  sql: DbClient;
+  calls: { text: string; values: unknown[] }[];
+} {
+  const calls: { text: string; values: unknown[] }[] = [];
+  const sql = ((strings: TemplateStringsArray, ...values: unknown[]) => {
+    calls.push({ text: strings.join('?'), values });
+    return reject ? Promise.reject(reject) : Promise.resolve([]);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+afterEach(() => {
+  _resetWriteBackWarned();
+  vi.restoreAllMocks();
+});
+
+describe('writeBackTraceReplayJsonb', () => {
+  it('issues a fixed-column UPDATE binding the payload as ::jsonb + the id', () => {
+    const { sql, calls } = mockSql();
+    writeBackTraceReplayJsonb(sql, 'chart_series', 870, { version: 12, foo: 'bar' });
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.text).toContain('update agentic_trace_replay set chart_series');
+    expect(calls[0]!.text).toContain('::jsonb where id');
+    // The payload OBJECT is bound directly (not JSON.stringify'd — that would
+    // double-encode into a JSONB string), followed by the id. Only the value +
+    // id are interpolated; the column name is fully static in the SQL text.
+    expect(calls[0]!.values).toEqual([{ version: 12, foo: 'bar' }, 870]);
+  });
+
+  it('targets the requested column verbatim (no cross-talk between columns)', () => {
+    const cases: ('aggregate_stats' | 'chart_series' | 'request_timeline')[] = [
+      'aggregate_stats',
+      'chart_series',
+      'request_timeline',
+    ];
+    for (const column of cases) {
+      const { sql, calls } = mockSql();
+      writeBackTraceReplayJsonb(sql, column, 1, { v: 1 });
+      expect(calls[0]!.text).toContain(`update agentic_trace_replay set ${column}`);
+    }
+  });
+
+  it('no-ops on a null/undefined payload (never overwrites good data with a hole)', () => {
+    const { sql, calls } = mockSql();
+    writeBackTraceReplayJsonb(sql, 'aggregate_stats', 1, null);
+    writeBackTraceReplayJsonb(sql, 'aggregate_stats', 1, undefined);
+    expect(calls).toHaveLength(0);
+  });
+
+  it('swallows a rejected UPDATE (read-only replica) and warns exactly once', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { sql } = mockSql(new Error('cannot execute UPDATE in a read-only transaction'));
+
+    // Fire twice; the helper is fire-and-forget so neither call throws.
+    expect(() => writeBackTraceReplayJsonb(sql, 'chart_series', 1, { v: 1 })).not.toThrow();
+    expect(() => writeBackTraceReplayJsonb(sql, 'chart_series', 2, { v: 1 })).not.toThrow();
+
+    // Let the caught rejections settle.
+    await new Promise((resolve) => {
+      setTimeout(resolve, 0);
+    });
+
+    expect(warn).toHaveBeenCalledTimes(1);
+    expect(warn.mock.calls[0]![0]).toContain('could not persist chart_series');
+  });
+});
diff --git a/packages/db/src/queries/agentic-shared.ts b/packages/db/src/queries/agentic-shared.ts
index e8a639e7..d8673a07 100644
--- a/packages/db/src/queries/agentic-shared.ts
+++ b/packages/db/src/queries/agentic-shared.ts
@@ -1,12 +1,69 @@
 /**
  * Helpers shared by the agentic per-point queries (`agentic-aggregates.ts`,
  * `derived-agentic-metrics.ts`): percentile math over aiperf samples,
- * the `{value, unit}` metric-envelope reader, and the single-round-trip
- * `aggregate_stats` fetch both fast paths start from.
+ * the `{value, unit}` metric-envelope reader, the single-round-trip
+ * `aggregate_stats` fetch both fast paths start from, and the best-effort
+ * write-back both use to self-heal a stale precomputed payload.
+ *
+ * `STATS_VERSION` and the profile-blob extractor `extractIslOsl` live here (the
+ * dependency-free leaf) rather than in `agentic-aggregates.ts` so both query
+ * modules — and `etl/compute-aggregate-stats.ts` — can share them without an
+ * import cycle: `agentic-aggregates` ⇄ `derived-agentic-metrics` would
+ * otherwise close a loop once each needs the other's blob helpers for
+ * write-back. (agentic-aggregates re-exports both for existing importers.)
  */
 
 import type { DbClient } from '../connection.js';
 
+/**
+ * Bump when the aggregate-stats computation algorithm changes — the backfill
+ * script recomputes any row whose stored `aggregate_stats.version` is older,
+ * and the read-path fast/slow branches key off it.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
+ *
+ * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
+ * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
+ * they do for vllm runs.
+ *
+ * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL.
+ *
+ * v5: reject osl <= 0 in extractTurn to exclude cancelled/empty-output turns
+ * whose decode-interval math would explode normalized E2E to thousands of seconds.
+ */
+export const STATS_VERSION = 5;
+
+interface ProfileRecord {
+  metadata?: { benchmark_phase?: string };
+  metrics?: {
+    input_sequence_length?: { value?: number } | number;
+    output_sequence_length?: { value?: number } | number;
+  };
+}
+
+/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */
+export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
+  const isl: number[] = [];
+  const osl: number[] = [];
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const m = rec.metrics ?? {};
+    const i = readNum(m.input_sequence_length);
+    const o = readNum(m.output_sequence_length);
+    if (typeof i === 'number') isl.push(i);
+    if (typeof o === 'number') osl.push(o);
+  }
+  return { isl, osl };
+}
+
 export interface MetricPercentiles {
   mean: number;
   p50: number;
@@ -79,3 +136,80 @@ export async function fetchAggregateStatsRows<Stats>(
     where br.id = any(${benchmarkResultIds}::bigint[])
   `) as unknown as { benchmark_result_id: number; stats: Stats | null }[];
 }
+
+/** Trace-replay JSONB columns the read path may self-heal after a recompute. */
+export type WriteBackColumn = 'aggregate_stats' | 'chart_series' | 'request_timeline';
+
+/** Logged once per process so a read-only connection doesn't spam the console. */
+let writeBackWarned = false;
+
+/** Reset the once-per-process warning latch (test-only). */
+export function _resetWriteBackWarned(): void {
+  writeBackWarned = false;
+}
+
+/**
+ * Issue the fixed-column UPDATE. Kept as one tagged-template call per column so
+ * the SQL text is fully static — no column name is ever interpolated — which
+ * keeps it injection-proof and driver-agnostic. The bound value is the plain
+ * payload OBJECT cast to `::jsonb`: both the neon HTTP driver and postgres.js
+ * JSON-serialize an object parameter exactly once, so `::jsonb` parses it to a
+ * JSONB object. (Passing `JSON.stringify(payload)` instead double-encodes into
+ * a JSONB *string* — `jsonb_typeof` = 'string' — which is why we don't.) The
+ * abstract `DbClient` doesn't expose postgres.js's `sql.json()`, so this is the
+ * portable way to write JSONB.
+ */
+function updateJsonbColumn(
+  sql: DbClient,
+  column: WriteBackColumn,
+  traceReplayId: number,
+  value: unknown,
+): Promise<unknown> {
+  switch (column) {
+    case 'aggregate_stats': {
+      return sql`update agentic_trace_replay set aggregate_stats = ${value}::jsonb where id = ${traceReplayId}`;
+    }
+    case 'chart_series': {
+      return sql`update agentic_trace_replay set chart_series = ${value}::jsonb where id = ${traceReplayId}`;
+    }
+    case 'request_timeline': {
+      return sql`update agentic_trace_replay set request_timeline = ${value}::jsonb where id = ${traceReplayId}`;
+    }
+  }
+}
+
+/**
+ * Best-effort, fire-and-forget persist of a freshly recomputed versioned
+ * payload back into an `agentic_trace_replay` JSONB column, so the next request
+ * takes the precomputed fast path instead of re-gunzipping the raw blob.
+ *
+ * The read path runs on the READONLY connection. On a true read replica (prod's
+ * `DATABASE_READONLY_URL`) the UPDATE fails at the wire — this catches the
+ * rejection and silently no-ops (warning once) so the response is never delayed
+ * or failed. On local/superuser connections (where the readonly URL is also
+ * write-capable) it self-heals the stored payload. Callers must only pass a
+ * COMPLETE recomputed payload — never a partial/null-blob result — so a
+ * self-heal never clobbers good data with holes.
+ */
+export function writeBackTraceReplayJsonb(
+  sql: DbClient,
+  column: WriteBackColumn,
+  traceReplayId: number,
+  payload: unknown,
+): void {
+  if (payload === null || payload === undefined) return;
+  // structuredClone strips any class prototypes so the driver serializes plain
+  // data only — matches `jsonbParam` in the backfill runner.
+  const value = structuredClone(payload);
+  void updateJsonbColumn(sql, column, traceReplayId, value).catch((error: unknown) => {
+    if (!writeBackWarned) {
+      writeBackWarned = true;
+      console.warn(
+        `[agentic write-back] could not persist ${column} (read-only connection?) — ` +
+          `serving recomputed result without caching. ${
+            error instanceof Error ? error.message : String(error)
+          }`,
+      );
+    }
+  });
+}
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
index 84c09193..a39de670 100644
--- a/packages/db/src/queries/derived-agentic-metrics.test.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -1,6 +1,11 @@
+import { gzipSync } from 'node:zlib';
+
 import { describe, expect, it } from 'vitest';
 
-import { computeDerivedFromBlob } from './derived-agentic-metrics.js';
+import { STATS_VERSION } from './agentic-shared';
+import type { DbClient } from '../connection.js';
+
+import { computeDerivedFromBlob, getDerivedAgenticMetrics } from './derived-agentic-metrics.js';
 
 /** Build one aiperf JSONL record for the synthetic fixture. */
 function rec(
@@ -150,3 +155,102 @@ describe('computeDerivedFromBlob', () => {
     expect(out.normalized_e2e_400!.p90).toBeLessThan(20);
   });
 });
+
+/** Capture SQL template text + bound values for the write-back assertions. */
+function mockSql(queue: unknown[][]): {
+  sql: DbClient;
+  calls: { text: string; values: unknown[] }[];
+} {
+  const responses = [...queue];
+  const calls: { text: string; values: unknown[] }[] = [];
+  const sql = ((strings: TemplateStringsArray, ...values: unknown[]) => {
+    calls.push({ text: strings.join('?'), values });
+    return Promise.resolve(responses.shift() ?? []);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+describe('getDerivedAgenticMetrics write-back', () => {
+  it('self-heals aggregate_stats from the profile blob, carrying server fields forward', async () => {
+    const jsonl = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+    ].join('\n');
+    const blob = gzipSync(Buffer.from(jsonl));
+
+    // Stale v(N-1) row that DOES carry server-derived fields — they must be
+    // preserved in the healed bundle (derived route can't recompute them).
+    const staleServerKv = { mean: 0.4, p50: 0.4, p75: 0.5, p90: 0.6, p99: 0.7, n: 3 };
+    const staleStats = {
+      version: STATS_VERSION - 1,
+      isl: null,
+      osl: null,
+      kvCacheUtil: staleServerKv,
+      prefixCacheHitRate: null,
+      normalizedSessionTimeS: 999,
+      p90PrefillTpsPerUser: 999,
+      normalizedE2e400: null,
+    };
+
+    const { sql, calls } = mockSql([
+      // fetchAggregateStatsRows
+      [{ benchmark_result_id: 7, stats: staleStats }],
+      // fallback profile-blob query
+      [{ benchmark_result_id: 7, trace_replay_id: 870, blob }],
+    ]);
+
+    const result = await getDerivedAgenticMetrics(sql, [7]);
+
+    // Response is the freshly recomputed value, not the stale 999s.
+    expect(result[7]?.normalized_session_time_s).toBeCloseTo(3, 6);
+    expect(result[7]?.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+
+    // 3 calls: stats read, blob read, write-back UPDATE.
+    expect(calls).toHaveLength(3);
+    expect(calls[2]!.text).toContain('update agentic_trace_replay set aggregate_stats');
+    expect(calls[2]!.text).toContain('::jsonb where id');
+
+    // The write-back binds a COMPLETE, version-stamped bundle at the new version,
+    // recomputing profile fields and carrying server fields forward untouched.
+    // The payload OBJECT is bound directly (not stringified — that would
+    // double-encode into a JSONB string).
+    interface WrittenStats {
+      version: number;
+      isl: unknown;
+      osl: unknown;
+      kvCacheUtil: unknown;
+      normalizedSessionTimeS: number | null;
+      p90PrefillTpsPerUser: number | null;
+    }
+    const [written, traceReplayId] = calls[2]!.values as [WrittenStats, number];
+    expect(traceReplayId).toBe(870);
+    expect(written.version).toBe(STATS_VERSION);
+    expect(written.normalizedSessionTimeS).toBeCloseTo(3, 6);
+    expect(written.p90PrefillTpsPerUser).toBeCloseTo(200, 6);
+    expect(written.isl).not.toBeNull();
+    expect(written.osl).not.toBeNull();
+    // Server-derived field carried forward from the stale row (not re-read).
+    expect(written.kvCacheUtil).toEqual(staleServerKv);
+  });
+
+  it('takes the fast path (no blob read, no write-back) when stats are current', async () => {
+    const currentStats = {
+      version: STATS_VERSION,
+      isl: null,
+      osl: null,
+      kvCacheUtil: null,
+      prefixCacheHitRate: null,
+      normalizedSessionTimeS: 1.5,
+      p90PrefillTpsPerUser: 42,
+      normalizedE2e400: { mean: 1, p50: 1, p75: 1, p90: 2, p99: 3, n: 5 },
+    };
+    const { sql, calls } = mockSql([[{ benchmark_result_id: 7, stats: currentStats }]]);
+
+    const result = await getDerivedAgenticMetrics(sql, [7]);
+
+    expect(result[7]?.normalized_session_time_s).toBe(1.5);
+    expect(result[7]?.p90_normalized_e2e_400_s).toBe(2);
+    // Only the stats read — no fallback blob query, no write-back.
+    expect(calls).toHaveLength(1);
+  });
+});
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index 24b24cf1..626ab9c7 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -23,13 +23,15 @@ import { gunzipSync } from 'node:zlib';
 import { NORMALIZED_E2E_OUTPUT_TOKENS } from '@semianalysisai/inferencex-constants';
 
 import type { DbClient } from '../connection.js';
-import { STATS_VERSION } from './agentic-aggregates';
 import {
+  extractIslOsl,
   fetchAggregateStatsRows,
   meanOf,
   percentilesOf,
   quantile,
   readNum,
+  STATS_VERSION,
+  writeBackTraceReplayJsonb,
   type MetricPercentiles,
 } from './agentic-shared';
 
@@ -48,6 +50,27 @@ export interface DerivedAgenticMetric {
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
 
+/**
+ * The full `aggregate_stats` JSONB shape (mirrors `AggregateStats` in
+ * etl/compute-aggregate-stats.ts). Duplicated here rather than imported to keep
+ * this module off the etl import graph. When we self-heal from the profile blob
+ * alone, the server-derived fields (kvCacheUtil, prefixCacheHitRate) are carried
+ * forward untouched from the stale row — never re-reading the huge server blob.
+ * This mirrors the profile-only upgrade `backfill-aggregate-stats.ts` performs;
+ * the agentic-aggregates route (which does read the server blob) heals those
+ * server fields.
+ */
+interface StoredAggregateStats {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+  normalizedE2e400: MetricPercentiles | null;
+}
+
 /**
  * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless
  * HTTP driver caps responses at 64 MB — chunk to stay well under.
@@ -205,14 +228,13 @@ export async function getDerivedAgenticMetrics(
   // ingest pipeline computes both metrics in the same pass that produces the
   // percentile bundles, so a single SQL round-trip covers most ids without
   // touching the gzipped profile blob.
-  const statsRows = await fetchAggregateStatsRows<{
-    version?: number;
-    normalizedSessionTimeS?: number | null;
-    p90PrefillTpsPerUser?: number | null;
-    normalizedE2e400?: MetricPercentiles | null;
-  }>(sql, benchmarkResultIds);
+  const statsRows = await fetchAggregateStatsRows<StoredAggregateStats>(sql, benchmarkResultIds);
 
   const idsNeedingBlob: number[] = [];
+  // Carry each stale/missing row's existing stats into the fallback so a
+  // self-heal preserves the server-derived fields (kvCacheUtil,
+  // prefixCacheHitRate) it can't recompute from the profile blob alone.
+  const staleStatsById = new Map<number, StoredAggregateStats | null>();
   for (const row of statsRows) {
     const id = Number(row.benchmark_result_id);
     if (row.stats && Number(row.stats.version) === STATS_VERSION) {
@@ -225,6 +247,7 @@ export async function getDerivedAgenticMetrics(
       };
     } else {
       idsNeedingBlob.push(id);
+      staleStatsById.set(id, row.stats ?? null);
     }
   }
 
@@ -233,33 +256,60 @@ export async function getDerivedAgenticMetrics(
   // Fallback: parse the profile blob directly. Used for rows whose
   // `aggregate_stats` is null or computed by an older STATS_VERSION; the
   // backfill script drains the population so this path should be rare.
-  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  // `trace_replay_id` + the (small) stale `aggregate_stats` come along on the
+  // same join — no extra round-trip — so we can self-heal after recompute.
+  const rows: {
+    benchmark_result_id: number;
+    trace_replay_id: number;
+    blob: Buffer;
+  }[] = [];
   for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) {
     const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE);
     const chunkRows = (await sql`
       select
         br.id as benchmark_result_id,
+        atr.id as trace_replay_id,
         atr.profile_export_jsonl_gz as blob
       from benchmark_results br
       join agentic_trace_replay atr on atr.id = br.trace_replay_id
       where br.id = any(${chunk}::bigint[])
         and atr.profile_export_jsonl_gz is not null
-    `) as { benchmark_result_id: number; blob: Buffer }[];
+    `) as { benchmark_result_id: number; trace_replay_id: number; blob: Buffer }[];
     rows.push(...chunkRows);
   }
 
   for (const row of rows) {
+    const id = Number(row.benchmark_result_id);
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');
       const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } =
         computeDerivedFromBlob(jsonl);
-      result[Number(row.benchmark_result_id)] = {
-        id: Number(row.benchmark_result_id),
+      result[id] = {
+        id,
         normalized_session_time_s,
         p90_prefill_tps_per_user,
         p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null,
         p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null,
       };
+
+      // Self-heal the shared `aggregate_stats` bundle. We only have the profile
+      // blob here, so recompute the profile-derived fields (isl/osl + the three
+      // derived metrics) and carry the stale row's server-derived fields
+      // forward untouched — the profile-only upgrade the backfill CLI also
+      // performs. Fire-and-forget, best-effort (no-ops on a read-only replica).
+      const { isl, osl } = extractIslOsl(jsonl);
+      const prior = staleStatsById.get(id) ?? null;
+      const merged: StoredAggregateStats = {
+        version: STATS_VERSION,
+        isl: percentilesOf(isl),
+        osl: percentilesOf(osl),
+        kvCacheUtil: prior?.kvCacheUtil ?? null,
+        prefixCacheHitRate: prior?.prefixCacheHitRate ?? null,
+        normalizedSessionTimeS: normalized_session_time_s,
+        p90PrefillTpsPerUser: p90_prefill_tps_per_user,
+        normalizedE2e400: normalized_e2e_400,
+      };
+      writeBackTraceReplayJsonb(sql, 'aggregate_stats', Number(row.trace_replay_id), merged);
     } catch {
       // Skip malformed blobs silently — frontend treats missing ids as "no data".
     }
diff --git a/packages/db/src/queries/request-timeline.test.ts b/packages/db/src/queries/request-timeline.test.ts
index 62ba5385..1f1d58a5 100644
--- a/packages/db/src/queries/request-timeline.test.ts
+++ b/packages/db/src/queries/request-timeline.test.ts
@@ -1,3 +1,5 @@
+import { gzipSync } from 'node:zlib';
+
 import { describe, expect, it } from 'vitest';
 
 import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
@@ -42,4 +44,54 @@ describe('getRequestTimeline', () => {
     await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull();
     expect(calls).toHaveLength(1);
   });
+
+  it('recomputes from the blob AND writes the fresh timeline back when the stored one is stale', async () => {
+    const blob = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metadata: {
+            conversation_id: 'c1',
+            turn_index: 0,
+            worker_id: 'w0',
+            benchmark_phase: 'profiling',
+            credit_issued_ns: 1000,
+            request_start_ns: 1100,
+            request_end_ns: 2000,
+          },
+          metrics: {
+            time_to_first_token: { value: 50 },
+            input_sequence_length: { value: 128 },
+            output_sequence_length: { value: 16 },
+          },
+        }),
+      ),
+    );
+    const stale = { ...timeline, version: REQUEST_TIMELINE_VERSION - 1 };
+    const { sql, calls } = mockSql([
+      [{ trace_replay_id: 870, has_blob: true, request_timeline: stale }],
+      [{ blob }],
+    ]);
+
+    const result = await getRequestTimeline(sql, 422991);
+
+    expect(result?.version).toBe(REQUEST_TIMELINE_VERSION);
+    expect(result?.requests).toHaveLength(1);
+    // 3 calls: meta read, blob read, then the fire-and-forget write-back.
+    expect(calls).toHaveLength(3);
+    expect(calls[1]).toContain('profile_export_jsonl_gz as blob');
+    expect(calls[2]).toContain('update agentic_trace_replay set request_timeline');
+    expect(calls[2]).toContain('::jsonb where id');
+  });
+
+  it('does not write back when the blob is missing (never persists a null timeline)', async () => {
+    const stale = { ...timeline, version: REQUEST_TIMELINE_VERSION - 1 };
+    const { sql, calls } = mockSql([
+      [{ trace_replay_id: 870, has_blob: true, request_timeline: stale }],
+      [{ blob: null }],
+    ]);
+
+    await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull();
+    // meta read + blob read only — no write-back for a null recompute.
+    expect(calls).toHaveLength(2);
+  });
 });
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
index 2a6bb40c..9b7cc4b5 100644
--- a/packages/db/src/queries/request-timeline.ts
+++ b/packages/db/src/queries/request-timeline.ts
@@ -15,6 +15,7 @@ import {
 } from '../etl/compute-request-timeline';
 
 import type { DbClient } from '../connection.js';
+import { writeBackTraceReplayJsonb } from './agentic-shared';
 
 export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
 
@@ -60,5 +61,16 @@ export async function getRequestTimeline(
     from agentic_trace_replay
     where id = ${row.trace_replay_id}
   `) as unknown as RawBlobRow[];
-  return computeRequestTimeline(blobRows[0]?.blob ?? null);
+  const timeline = computeRequestTimeline(blobRows[0]?.blob ?? null);
+
+  // Self-heal the stored request_timeline so the next request (and the
+  // trace-histograms route, which reads the same column) takes the fast path.
+  // Only write a complete recompute — `computeRequestTimeline` returns null for
+  // a missing/malformed blob, which we must not persist over good data.
+  // Fire-and-forget, best-effort (no-ops on a read-only replica).
+  if (timeline !== null) {
+    writeBackTraceReplayJsonb(sql, 'request_timeline', row.trace_replay_id, timeline);
+  }
+
+  return timeline;
 }
diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts
index f045dfda..77aea28a 100644
--- a/packages/db/src/queries/trace-server-metrics.test.ts
+++ b/packages/db/src/queries/trace-server-metrics.test.ts
@@ -92,8 +92,12 @@ describe('getTraceServerMetrics', () => {
     const result = await getTraceServerMetrics(sql, 42);
 
     expect(result?.prefillTps).toEqual([{ t: 0, value: 321 }]);
-    expect(calls).toHaveLength(2);
+    // 3 calls: meta read, blob read, then the fire-and-forget chart_series
+    // write-back that self-heals the stale precomputed series.
+    expect(calls).toHaveLength(3);
     expect(calls[1]).toContain('server_metrics_json_gz as blob');
+    expect(calls[2]).toContain('update agentic_trace_replay set chart_series');
+    expect(calls[2]).toContain('::jsonb where id');
   });
 
   it('returns null without a blob and does not issue a second query', async () => {
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index d24d0879..dc03129e 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -20,6 +20,7 @@ import {
 } from '../etl/compute-chart-series';
 
 import type { DbClient } from '../connection.js';
+import { writeBackTraceReplayJsonb } from './agentic-shared';
 
 export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series';
 
@@ -207,5 +208,12 @@ export async function getTraceServerMetrics(
     disagg: row.disagg,
   });
   if (!series) return null;
+
+  // Self-heal the stored chart_series so the next request takes the fast path
+  // instead of re-decompressing this (tens-of-MB) blob. `series` is complete
+  // and stamped at CHART_SERIES_VERSION here; fire-and-forget and best-effort
+  // (no-ops on a read-only replica). trace_replay_id is non-null on this path.
+  writeBackTraceReplayJsonb(sql, 'chart_series', row.trace_replay_id, series);
+
   return merge(meta, series, kvCachePoolTokens);
 }

From 2a148011d109682341aac8d0aa10f9901049b90b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 17:05:07 -0500
Subject: [PATCH 19/40] ci: allow manual agentic ingest dispatch

---
 .github/workflows/ingest-agentic-results.yml | 21 +++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
index cf8366ea..75f5a658 100644
--- a/.github/workflows/ingest-agentic-results.yml
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -23,6 +23,17 @@ name: Ingest Agentic Benchmark Results
 on:
   repository_dispatch:
     types: [ingest-agentic-results]
+  workflow_dispatch:
+    inputs:
+      run-id:
+        description: InferenceX Actions run ID to ingest
+        required: true
+        type: string
+      run-attempt:
+        description: InferenceX Actions run attempt to ingest
+        required: false
+        default: '1'
+        type: string
 
 jobs:
   ingest:
@@ -55,7 +66,7 @@ jobs:
       - name: Download artifacts from InferenceX run
         env:
           GH_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
-          RUN_ID: ${{ github.event.client_payload.run-id }}
+          RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
           ARTIFACTS_PATH: ${{ github.workspace }}/artifacts
         run: |
           mkdir -p "$ARTIFACTS_PATH"
@@ -110,8 +121,8 @@ jobs:
         env:
           DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
           GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
-          INGEST_RUN_ID: ${{ github.event.client_payload.run-id }}
-          INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt }}
+          INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
+          INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }}
           INGEST_ARTIFACTS_PATH: ${{ github.workspace }}/artifacts
           INGEST_REPO: SemiAnalysisAI/InferenceX
           UNMAPPED_ENTITIES_OUTPUT: ${{ github.workspace }}/unmapped-entities.json
@@ -165,7 +176,7 @@ jobs:
           webhook-type: incoming-webhook
           payload: |
             {
-              "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
+              "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
             }
 
       - name: Notify Slack on failure
@@ -176,5 +187,5 @@ jobs:
           webhook-type: incoming-webhook
           payload: |
             {
-              "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
+              "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
             }

From a1e94d91b217034a229c9e778cb9e0c2bb626600 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 18:09:30 -0500
Subject: [PATCH 20/40] ci: register agentic ingest workflow

---
 .github/workflows/ingest-agentic-results.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
index 75f5a658..e1f4a8b8 100644
--- a/.github/workflows/ingest-agentic-results.yml
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -21,6 +21,10 @@ name: Ingest Agentic Benchmark Results
 # agentic-specific alerting (missing dataset slug).
 
 on:
+  push:
+    branches: [feat/agentx]
+    paths:
+      - .github/workflows/ingest-agentic-results.yml
   repository_dispatch:
     types: [ingest-agentic-results]
   workflow_dispatch:
@@ -36,7 +40,14 @@ on:
         type: string
 
 jobs:
+  register:
+    if: github.event_name == 'push'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "Registering ingest-agentic-results workflow for manual dispatch"
+
   ingest:
+    if: github.event_name != 'push'
     # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes
     # far longer than a fixed-seq-len ingest.
     timeout-minutes: 60

From 6d55b95789dd04442fd8f7c862568d820539a066 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 18:10:08 -0500
Subject: [PATCH 21/40] ci: use dev database for agentic ingest test

---
 .github/workflows/ingest-agentic-results.yml | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
index e1f4a8b8..8f84b3ed 100644
--- a/.github/workflows/ingest-agentic-results.yml
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -21,10 +21,6 @@ name: Ingest Agentic Benchmark Results
 # agentic-specific alerting (missing dataset slug).
 
 on:
-  push:
-    branches: [feat/agentx]
-    paths:
-      - .github/workflows/ingest-agentic-results.yml
   repository_dispatch:
     types: [ingest-agentic-results]
   workflow_dispatch:
@@ -40,14 +36,7 @@ on:
         type: string
 
 jobs:
-  register:
-    if: github.event_name == 'push'
-    runs-on: ubuntu-latest
-    steps:
-      - run: echo "Registering ingest-agentic-results workflow for manual dispatch"
-
   ingest:
-    if: github.event_name != 'push'
     # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes
     # far longer than a fixed-seq-len ingest.
     timeout-minutes: 60
@@ -71,7 +60,7 @@ jobs:
 
       - name: Run migrations
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
         run: pnpm admin:db:migrate --yes
 
       - name: Download artifacts from InferenceX run
@@ -130,7 +119,7 @@ jobs:
 
       - name: Ingest results to DB
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
           GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
           INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
           INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }}
@@ -141,12 +130,12 @@ jobs:
 
       - name: Apply run overrides
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
         run: pnpm admin:db:apply-overrides --yes
 
       - name: Verify database
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
         run: pnpm admin:db:verify
 
       - name: Invalidate Vercel cache

From cc63a730b0e36338b1e9d850ffe747d59cf26209 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 18:10:33 -0500
Subject: [PATCH 22/40] ci: use dev write database for agentic ingest test

---
 .github/workflows/ingest-agentic-results.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
index 8f84b3ed..a7d1cd8a 100644
--- a/.github/workflows/ingest-agentic-results.yml
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -60,7 +60,7 @@ jobs:
 
       - name: Run migrations
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
         run: pnpm admin:db:migrate --yes
 
       - name: Download artifacts from InferenceX run
@@ -119,7 +119,7 @@ jobs:
 
       - name: Ingest results to DB
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
           GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
           INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
           INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }}
@@ -130,12 +130,12 @@ jobs:
 
       - name: Apply run overrides
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
         run: pnpm admin:db:apply-overrides --yes
 
       - name: Verify database
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }}
+          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
         run: pnpm admin:db:verify
 
       - name: Invalidate Vercel cache

From bd0a4905a74cf1d021e7b1bac12fdd15a3ca78ff Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 18:15:09 -0500
Subject: [PATCH 23/40] ci: skip ingest wait for manual dispatch

---
 .github/workflows/ingest-agentic-results.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
index a7d1cd8a..af94a4c5 100644
--- a/.github/workflows/ingest-agentic-results.yml
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -45,6 +45,7 @@ jobs:
       contents: read
     steps:
       - name: Wait for source run to finish
+        if: github.event_name != 'workflow_dispatch'
         run: sleep 300
 
       - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

From ddd1a267f82af95a616e368a729bd555a8ed79c3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 18:34:22 -0500
Subject: [PATCH 24/40] chore(db): log agentic ingest progress

---
 packages/db/src/etl/trace-replay-ingest.ts | 61 +++++++++++++++++--
 packages/db/src/ingest-ci-run.ts           | 70 ++++++++++++++++++++--
 2 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index b50168db..1c739b7d 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -19,6 +19,25 @@ import type { ServerMetricsContext } from './server-metrics-adapters';
 
 type Sql = ReturnType<typeof postgres>;
 
+export interface TraceReplayIngestOptions {
+  metricsContext?: ServerMetricsContext;
+  progressLabel?: string;
+}
+
+function formatBytes(bytes: number | null | undefined): string {
+  if (bytes === null || bytes === undefined) return 'none';
+  if (bytes < 1024) return `${bytes} B`;
+  const kib = bytes / 1024;
+  if (kib < 1024) return `${kib.toFixed(1)} KiB`;
+  const mib = kib / 1024;
+  if (mib < 1024) return `${mib.toFixed(1)} MiB`;
+  return `${(mib / 1024).toFixed(1)} GiB`;
+}
+
+function elapsed(startMs: number): string {
+  return `${((Date.now() - startMs) / 1000).toFixed(1)}s`;
+}
+
 /**
  * Persist the per-point trace files and link them to `benchmarkResultIds`.
  *
@@ -34,8 +53,8 @@ type Sql = ReturnType<typeof postgres>;
  * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` —
  *                            per-scrape time-series of every Prometheus metric.
  *                            Optional, gzipped before storage (~42x ratio).
- * @param metricsContext      Canonical framework used to select the
- *                            orchestrator-specific metric-label adapter.
+ * @param options             Canonical framework/disagg context plus optional
+ *                            progress label for CI logs.
  */
 export async function insertTraceReplay(
   sql: Sql,
@@ -43,36 +62,65 @@ export async function insertTraceReplay(
   profileExportJsonl: Buffer | null,
   serverMetricsCsv: Buffer | null,
   serverMetricsJson: Buffer | null = null,
-  metricsContext: ServerMetricsContext = {},
+  options: TraceReplayIngestOptions = {},
 ): Promise<void> {
+  const { metricsContext = {}, progressLabel } = options;
+  const log = (message: string): void => {
+    if (progressLabel) console.log(`    trace_replay ${progressLabel}: ${message}`);
+  };
+
   if (benchmarkResultIds.length === 0) return;
   if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
 
   // Only link rows that don't already point at a trace_replay row — keeps
   // re-ingest from inserting duplicate sibling blobs.
+  const linkStart = Date.now();
+  log(`checking ${benchmarkResultIds.length} benchmark row(s) for existing links`);
   const unlinked = await sql<{ id: number }[]>`
     select id from benchmark_results
     where id = any(${sql.array(benchmarkResultIds)}::bigint[])
       and trace_replay_id is null
   `;
-  if (unlinked.length === 0) return;
+  log(`found ${unlinked.length} unlinked row(s) (${elapsed(linkStart)})`);
+  if (unlinked.length === 0) {
+    log('skipping blob insert; all benchmark rows already linked');
+    return;
+  }
 
+  const gzipStart = Date.now();
+  log(
+    `compressing profile=${formatBytes(profileExportJsonl?.length)}, ` +
+      `server_csv=${formatBytes(serverMetricsCsv?.length)}, ` +
+      `server_json=${formatBytes(serverMetricsJson?.length)}`,
+  );
   const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
   const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
   const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+  log(
+    `compressed profile=${formatBytes(profileGz?.length)}, ` +
+      `server_json=${formatBytes(metricsJsonGz?.length)} (${elapsed(gzipStart)})`,
+  );
 
   // Pre-compute aggregate stats + chart-ready time-series + per-request
   // timeline so the detail page doesn't have to re-parse these blobs on
   // every request. Each helper tolerates a null blob and falls back to
   // a streaming parser for oversized server_metrics blobs.
+  const computeStart = Date.now();
+  log('computing aggregate stats, chart series, and request timeline');
   const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
     computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
     computeChartSeries(metricsJsonGz, metricsContext),
     Promise.resolve(computeRequestTimeline(profileGz)),
   ]);
+  log(
+    `computed derived JSON: chart_windows=${chartSeries?.timeslicesCount ?? 0}, ` +
+      `timeline_requests=${requestTimeline?.requests.length ?? 0} (${elapsed(computeStart)})`,
+  );
 
+  const insertStart = Date.now();
+  log('inserting trace_replay blob row');
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
       profile_export_jsonl_gz,
@@ -98,12 +146,16 @@ export async function insertTraceReplay(
     )
     returning id
   `;
+  log(`inserted trace_replay_id=${traceReplayId} (${elapsed(insertStart)})`);
 
+  const updateStart = Date.now();
+  log(`linking trace_replay_id=${traceReplayId} to ${unlinked.length} benchmark row(s)`);
   await sql`
     update benchmark_results
     set trace_replay_id = ${traceReplayId}
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
+  log(`linked benchmark rows (${elapsed(updateStart)})`);
 
   // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang
   // runs don't populate these in the harness JSON; vLLM runs do but only
@@ -146,6 +198,7 @@ export async function insertTraceReplay(
         )
         where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
       `;
+      log('updated cache-hit metrics from chart series');
     }
   }
 }
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index d23a8f63..15267622 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -74,6 +74,29 @@ let runAttemptNum: number;
 let REPO: string;
 let tempDir: string | null = null;
 
+function formatBytes(bytes: number | null | undefined): string {
+  if (bytes === null || bytes === undefined) return 'none';
+  if (bytes < 1024) return `${bytes} B`;
+  const kib = bytes / 1024;
+  if (kib < 1024) return `${kib.toFixed(1)} KiB`;
+  const mib = kib / 1024;
+  if (mib < 1024) return `${mib.toFixed(1)} MiB`;
+  return `${(mib / 1024).toFixed(1)} GiB`;
+}
+
+function elapsed(startMs: number): string {
+  return `${((Date.now() - startMs) / 1000).toFixed(1)}s`;
+}
+
+function fileSize(pathname: string | null | undefined): number | null {
+  if (!pathname) return null;
+  try {
+    return fs.statSync(pathname).size;
+  } catch {
+    return null;
+  }
+}
+
 if (isDownloadMode) {
   // --download <run-url-or-id> [repo]
   // Filter out '--' injected by pnpm arg passthrough
@@ -378,13 +401,22 @@ async function main(): Promise<void> {
     const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
     console.log(`  Found ${allBmkFiles.length} benchmark JSON file(s)`);
 
-    for (const file of allBmkFiles) {
+    for (const [fileIndex, file] of allBmkFiles.entries()) {
+      const fileStart = Date.now();
+      const relativeFile = path.relative(artifactsDir, file);
+      console.log(
+        `  [${fileIndex + 1}/${allBmkFiles.length}] ${relativeFile} (${formatBytes(fileSize(file))})`,
+      );
       const data = readJson(file);
-      if (!data) continue;
+      if (!data) {
+        console.log(`    skipped unreadable JSON (${elapsed(fileStart)})`);
+        continue;
+      }
 
       const rawRows: Record<string, any>[] = Array.isArray(data)
         ? data
         : [data as Record<string, any>];
+      console.log(`    raw rows: ${rawRows.length}`);
 
       for (const rawRow of rawRows) {
         if (!rawRow || typeof rawRow !== 'object') continue;
@@ -397,7 +429,11 @@ async function main(): Promise<void> {
         .map((r) => mapBenchmarkRow(r, tracker))
         .filter((r): r is NonNullable<typeof r> => r !== null);
 
-      if (rows.length === 0) continue;
+      console.log(`    mapped rows: ${rows.length}`);
+      if (rows.length === 0) {
+        console.log(`    skipped; no mappable rows (${elapsed(fileStart)})`);
+        continue;
+      }
 
       const toInsert = [];
       for (const row of rows) {
@@ -408,15 +444,21 @@ async function main(): Promise<void> {
           tracker.recordDbError(`config for ${path.basename(file)}`, error);
         }
       }
+      console.log(`    rows with resolved configs: ${toInsert.length}`);
 
       if (toInsert.length > 0) {
         try {
+          const insertStart = Date.now();
           const { newCount, dupCount, insertedIds } = await bulkIngestBenchmarkRows(
             sql,
             toInsert,
             workflowRunId,
             date,
           );
+          console.log(
+            `    benchmark rows: +${newCount} new, ${dupCount} dup, ` +
+              `${insertedIds.length} id(s) (${elapsed(insertStart)})`,
+          );
           totalNewBmk += newCount;
           totalDupBmk += dupCount;
 
@@ -448,8 +490,13 @@ async function main(): Promise<void> {
               serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir));
             if (logPath) {
               try {
+                const serverLogStart = Date.now();
+                console.log(
+                  `    server_log ${path.basename(logPath)} (${formatBytes(fileSize(logPath))})`,
+                );
                 const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', '');
                 await insertServerLog(sql, insertedIds, serverLog);
+                console.log(`    server_log linked (${elapsed(serverLogStart)})`);
               } catch (error: any) {
                 tracker.recordDbError(`server_log for ${configKey}`, error);
               }
@@ -468,6 +515,13 @@ async function main(): Promise<void> {
                 : undefined) ?? traceReplayPaths.get(suffix);
             if (trace) {
               try {
+                const traceStart = Date.now();
+                console.log(
+                  `    trace_replay ${suffix}: ` +
+                    `profile=${formatBytes(fileSize(trace.profileJsonl))}, ` +
+                    `server_csv=${formatBytes(fileSize(trace.serverMetricsCsv))}, ` +
+                    `server_json=${formatBytes(fileSize(trace.serverMetricsJson))}`,
+                );
                 const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
                 const metrics = trace.serverMetricsCsv
                   ? fs.readFileSync(trace.serverMetricsCsv)
@@ -476,14 +530,19 @@ async function main(): Promise<void> {
                   ? fs.readFileSync(trace.serverMetricsJson)
                   : null;
                 await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, {
-                  framework: toInsert[0]?.config.framework,
-                  disagg: toInsert[0]?.config.disagg,
+                  metricsContext: {
+                    framework: toInsert[0]?.config.framework,
+                    disagg: toInsert[0]?.config.disagg,
+                  },
+                  progressLabel: suffix,
                 });
                 totalTraceReplayLinked += insertedIds.length;
+                console.log(`    trace_replay ${suffix}: done (${elapsed(traceStart)})`);
               } catch (error: any) {
                 tracker.recordDbError(`trace_replay for ${suffix}`, error);
               }
             } else {
+              console.log(`    trace_replay ${suffix}: missing sibling artifact`);
               tracker.skips.traceReplayMissing++;
             }
           }
@@ -491,6 +550,7 @@ async function main(): Promise<void> {
           tracker.recordDbError(path.basename(file), error);
         }
       }
+      console.log(`    finished ${relativeFile} (${elapsed(fileStart)})`);
     }
     console.log(`  Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
     if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {

From 5fc051fadf869ef71899633d0dfd37592262fe3e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 18:47:21 -0500
Subject: [PATCH 25/40] ci: select agentic ingest target

---
 .github/workflows/ingest-agentic-results.yml | 59 +++++++++++++++++---
 packages/db/src/etl/workflow-run.ts          |  2 +
 packages/db/src/ingest-ci-run.ts             | 26 ++++++++-
 3 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
index af94a4c5..fab99f5d 100644
--- a/.github/workflows/ingest-agentic-results.yml
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -8,7 +8,8 @@ name: Ingest Agentic Benchmark Results
 #     -H "Accept: application/vnd.github+v3+json" \
 #     https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
 #     -d '{"event_type": "ingest-agentic-results",
-#          "client_payload": {"run-id": "<run_id>", "run-attempt": "<attempt>"}}'
+#          "client_payload": {"run-id": "<run_id>", "run-attempt": "<attempt>",
+#                             "database-target": "production"}}'
 #
 # The ingest script (packages/db/src/ingest-ci-run.ts) auto-detects agentic
 # artifacts: benchmark rows land in benchmark_results (benchmark_type=
@@ -34,6 +35,15 @@ on:
         required: false
         default: '1'
         type: string
+      database-target:
+        description: Database/cache target for the ingest
+        required: false
+        default: production
+        type: choice
+        options:
+          - production
+          - dev
+          - agentx-v1
 
 jobs:
   ingest:
@@ -59,9 +69,45 @@ jobs:
         env:
           CYPRESS_INSTALL_BINARY: '0'
 
-      - name: Run migrations
+      - name: Select ingest target
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
+          REQUESTED_DATABASE_TARGET: ${{ github.event.client_payload.database-target || inputs.database-target || 'production' }}
+          DATABASE_WRITE_URL_PRODUCTION: ${{ secrets.DATABASE_WRITE_URL }}
+          DATABASE_WRITE_URL_DEV: ${{ secrets.DATABASE_DEV_WRITE_URL }}
+          DATABASE_WRITE_URL_AGENTX_V1: ${{ secrets.DATABASE_AGENTX_V1_WRITE_URL }}
+        run: |
+          case "$REQUESTED_DATABASE_TARGET" in
+            production)
+              database_write_url="$DATABASE_WRITE_URL_PRODUCTION"
+              cache_invalidate_url="https://inferencex.semianalysis.com/api/v1/invalidate"
+              ;;
+            dev)
+              database_write_url="$DATABASE_WRITE_URL_DEV"
+              cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate"
+              ;;
+            agentx-v1)
+              database_write_url="$DATABASE_WRITE_URL_AGENTX_V1"
+              cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate"
+              ;;
+            *)
+              echo "::error::Unsupported database-target: $REQUESTED_DATABASE_TARGET"
+              exit 1
+              ;;
+          esac
+
+          if [ -z "$database_write_url" ]; then
+            echo "::error::Database secret is empty for target: $REQUESTED_DATABASE_TARGET"
+            exit 1
+          fi
+
+          echo "::add-mask::$database_write_url"
+          echo "DATABASE_WRITE_URL=$database_write_url" >> "$GITHUB_ENV"
+          echo "INGEST_DATABASE_TARGET=$REQUESTED_DATABASE_TARGET" >> "$GITHUB_ENV"
+          echo "CACHE_INVALIDATE_URL=$cache_invalidate_url" >> "$GITHUB_ENV"
+          echo "Selected ingest target: $REQUESTED_DATABASE_TARGET"
+          echo "Cache invalidate URL: $cache_invalidate_url"
+
+      - name: Run migrations
         run: pnpm admin:db:migrate --yes
 
       - name: Download artifacts from InferenceX run
@@ -120,7 +166,6 @@ jobs:
 
       - name: Ingest results to DB
         env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
           GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
           INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
           INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }}
@@ -130,20 +175,16 @@ jobs:
         run: pnpm admin:db:ingest:ci
 
       - name: Apply run overrides
-        env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
         run: pnpm admin:db:apply-overrides --yes
 
       - name: Verify database
-        env:
-          DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }}
         run: pnpm admin:db:verify
 
       - name: Invalidate Vercel cache
         env:
           VERCEL_INVALIDATE_SECRET: ${{ secrets.VERCEL_INVALIDATE_SECRET }}
         run: |
-          curl -sSf -X POST "https://inferencex.semianalysis.com/api/v1/invalidate" \
+          curl -sSf -X POST "$CACHE_INVALIDATE_URL" \
             -H "Authorization: Bearer $VERCEL_INVALIDATE_SECRET" || true
 
       - name: Check for unmapped entities
diff --git a/packages/db/src/etl/workflow-run.ts b/packages/db/src/etl/workflow-run.ts
index 4097a3c5..28d27c87 100644
--- a/packages/db/src/etl/workflow-run.ts
+++ b/packages/db/src/etl/workflow-run.ts
@@ -26,6 +26,7 @@ export interface GithubRunInfo {
   runStartedAt: string | null;
   headSha: string | null;
   headBranch: string | null;
+  headCommitMessage: string | null;
   runAttempt: number | null;
   pullRequests: GithubPullRequestRef[];
 }
@@ -101,6 +102,7 @@ export function createWorkflowRunServices(sql: Sql, githubToken?: string) {
         runStartedAt: d.run_started_at ? String(d.run_started_at) : null,
         headSha: d.head_sha ? String(d.head_sha) : null,
         headBranch: d.head_branch ? String(d.head_branch) : null,
+        headCommitMessage: d.head_commit?.message ? String(d.head_commit.message) : null,
         runAttempt: typeof d.run_attempt === 'number' ? d.run_attempt : null,
         pullRequests,
       };
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 15267622..8bdb4157 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -58,6 +58,7 @@ import { ingestEvalRow } from './etl/eval-ingest';
 import { mapEvalSamples } from './etl/eval-samples-mapper';
 import { bulkIngestEvalSamples } from './etl/eval-samples-ingest';
 import {
+  type ChangelogEntry,
   parseChangelogEntries,
   ingestChangelogEntries,
   hasEvalsOnlyFlag,
@@ -335,7 +336,7 @@ async function main(): Promise<void> {
   const parsedChangelogs: {
     baseRef: string;
     headRef: string;
-    entries: ReturnType<typeof parseChangelogEntries>;
+    entries: ChangelogEntry[];
   }[] = [];
   for (const file of changelogFiles) {
     const data = readJson(file) as Record<string, any> | null;
@@ -346,6 +347,29 @@ async function main(): Promise<void> {
     const entries = parseChangelogEntries(data.entries);
     if (entries.length > 0) parsedChangelogs.push({ baseRef, headRef, entries });
   }
+  if (parsedChangelogs.length === 0) {
+    const headRef = workflowGhInfo?.headBranch ?? workflowGhInfo?.headSha ?? `run-${runIdStr}`;
+    const fallbackDescription =
+      workflowGhInfo?.headCommitMessage?.trim().split('\n')[0]?.trim() ||
+      workflowGhInfo?.name ||
+      `GitHub Actions run ${runIdStr}`;
+
+    parsedChangelogs.push({
+      baseRef: 'unknown',
+      headRef,
+      entries: [
+        {
+          configKeys: [],
+          description: fallbackDescription,
+          prLink: null,
+          evalsOnly: false,
+        },
+      ],
+    });
+    console.log(
+      `  No changelog metadata artifact found; using fallback changelog: ${fallbackDescription}`,
+    );
+  }
   const evalsOnly = hasEvalsOnlyFlag(parsedChangelogs);
   if (evalsOnly) {
     console.log('\n  ⚠ evals-only run detected — skipping benchmark and stats ingest');

From 71185999946f796dc32799927def0baa61126e00 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 19:50:55 -0500
Subject: [PATCH 26/40] fix(ingest): prefer the workflow name for fallback
 changelog descriptions

The head commit message usually describes an unrelated code change;
the workflow display name describes the sweep itself.
---
 packages/db/src/ingest-ci-run.ts | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 8bdb4157..cada82d6 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -349,9 +349,12 @@ async function main(): Promise<void> {
   }
   if (parsedChangelogs.length === 0) {
     const headRef = workflowGhInfo?.headBranch ?? workflowGhInfo?.headSha ?? `run-${runIdStr}`;
+    // Prefer the workflow's display name ("e2e Test - B300 DSv4 AgentX vLLM 1h
+    // + 10m warmup") — it describes the sweep; the head commit message usually
+    // describes an unrelated code change.
     const fallbackDescription =
+      workflowGhInfo?.name?.trim() ||
       workflowGhInfo?.headCommitMessage?.trim().split('\n')[0]?.trim() ||
-      workflowGhInfo?.name ||
       `GitHub Actions run ${runIdStr}`;
 
     parsedChangelogs.push({

From 2335e2f1caeea4b5faab312c6220ad1fd060ae12 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 20:03:50 -0500
Subject: [PATCH 27/40] fix(ingest): skip failed runs that never issued a
 request

The failed-run guard required num_requests_total > 0, so a config whose
server never came up (total = 0, e.g. dep4 conc32 in run 28617267459)
slipped through as a dataless point. Any row explicitly reporting zero
successful requests is a failure regardless of how many were issued.
---
 packages/db/src/etl/benchmark-mapper.test.ts | 10 ++++++++++
 packages/db/src/etl/benchmark-mapper.ts      |  9 +++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts
index cde2f74b..bb286734 100644
--- a/packages/db/src/etl/benchmark-mapper.test.ts
+++ b/packages/db/src/etl/benchmark-mapper.test.ts
@@ -850,6 +850,16 @@ describe('mapBenchmarkRow — v3 agentic nested agg schema', () => {
     expect(tracker.skips.failedRun).toBe(1);
   });
 
+  it('skips rows where the server never came up (zero total requests)', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 0 }),
+      tracker,
+    );
+    expect(result).toBeNull();
+    expect(tracker.skips.failedRun).toBe(1);
+  });
+
   it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => {
     const tracker = createSkipTracker();
     const result = mapBenchmarkRow(
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index caae08c2..e3fb148e 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -197,14 +197,15 @@ export function mapBenchmarkRow(
   }
 
   // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
-  // every runner, including ones with 0 successful requests and null metrics.
-  // Without this skip, the empty row's nulls overwrite a good row via
+  // every runner, including failed ones with 0 successful requests and null
+  // metrics — both the "issued requests but none succeeded" case (total > 0)
+  // and the "server never came up" case (total === 0). Without this skip the
+  // empty row lands as a dataless point, or overwrites a good row via
   // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
   if (
     typeof row.num_requests_successful === 'number' &&
     row.num_requests_successful === 0 &&
-    typeof row.num_requests_total === 'number' &&
-    row.num_requests_total > 0
+    typeof row.num_requests_total === 'number'
   ) {
     tracker.skips.failedRun++;
     return null;

From 608867fdec11add64debf628e43a87e8108749ae Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 17:42:41 -0500
Subject: [PATCH 28/40] fix(dump-mode): support agentic + dataset surfaces
 without a DB

The documented DUMP_DIR mode 500'd on every new surface: the four new
tables (agentic_trace_replay, datasets, dataset_conversations,
run_datasets) were missing from TABLE_INSERT_ORDER so dumps never
carried them, json-provider had no mirrors, and ten routes called
getDb() with no JSON_MODE guard.

Tables added in FK-safe order; bytea blobs round-trip through dump/load
(Buffer JSON encoding, ::bytea decode); agentic_trace_replay lazy-loads
like server_logs; mirrors reuse the same pure compute helpers as the
SQL paths for version-stale fallbacks; all ten routes gain the standard
JSON_MODE branch. Verified end-to-end: dump-mode server serves all ten
endpoints 200, byte-identical to Postgres on 9/10 (remaining diffs are
pre-existing benchmarks-mirror nuances). Adds 21 mirror tests.
---
 packages/app/next.config.ts                   |   6 +
 .../app/api/v1/agentic-aggregates/route.ts    |   8 +-
 .../app/api/v1/benchmark-siblings/route.ts    |  11 +-
 .../[slug]/conversations/[convId]/route.ts    |   9 +-
 .../v1/datasets/[slug]/conversations/route.ts |  12 +-
 .../src/app/api/v1/datasets/[slug]/route.ts   |  11 +-
 packages/app/src/app/api/v1/datasets/route.ts |  11 +-
 .../api/v1/derived-agentic-metrics/route.ts   |   8 +-
 .../src/app/api/v1/request-timeline/route.ts  |   8 +-
 .../src/app/api/v1/trace-histograms/route.ts  |   8 +-
 .../app/api/v1/trace-server-metrics/route.ts  |   8 +-
 packages/constants/src/tables.ts              |  18 +
 packages/db/src/dump-db.ts                    |  20 +-
 .../json-provider.agentic-datasets.test.ts    | 592 +++++++++++++++
 packages/db/src/json-provider.ts              | 709 +++++++++++++++++-
 packages/db/src/load-dump.ts                  |  56 +-
 packages/db/src/reset-db.ts                   |  10 +-
 17 files changed, 1463 insertions(+), 42 deletions(-)
 create mode 100644 packages/db/src/json-provider.agentic-datasets.test.ts

diff --git a/packages/app/next.config.ts b/packages/app/next.config.ts
index 39ab4487..32988f05 100644
--- a/packages/app/next.config.ts
+++ b/packages/app/next.config.ts
@@ -3,6 +3,12 @@ import type { NextConfig } from 'next';
 import { allowedDevOriginsFromEnv } from './src/lib/allowed-dev-origins';
 
 const nextConfig: NextConfig = {
+  // Allow a second, isolated dev server (e.g. a dump-mode instance on another
+  // port) to run from the same project dir by pointing it at a separate build
+  // dir via NEXT_DIST_DIR. Defaults to '.next' so the primary server and all
+  // CI/prod builds are unaffected. Next.js's single-dev-server lock lives under
+  // distDir, so distinct dirs let the two coexist.
+  distDir: process.env.NEXT_DIST_DIR || '.next',
   allowedDevOrigins: allowedDevOriginsFromEnv(),
   transpilePackages: ['@semianalysisai/inferencex-constants'],
   serverExternalPackages: ['shiki'],
diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
index 83238e89..9cb229d4 100644
--- a/packages/app/src/app/api/v1/agentic-aggregates/route.ts
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -1,4 +1,5 @@
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getAgenticAggregates,
   STATS_VERSION,
@@ -23,7 +24,10 @@ export const dynamic = 'force-dynamic';
 export const CACHE_KEY_PREFIX = `agentic-aggregates-v${STATS_VERSION}`;
 
 const getCachedAgenticAggregates = cachedQuery(
-  (ids: number[]): Promise<AgenticAggregateMap> => getAgenticAggregates(getDb(), ids),
+  (ids: number[]): Promise<AgenticAggregateMap> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getAgenticAggregates(ids));
+    return getAgenticAggregates(getDb(), ids);
+  },
   CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
index 38e79c23..0718aae0 100644
--- a/packages/app/src/app/api/v1/benchmark-siblings/route.ts
+++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
@@ -1,4 +1,5 @@
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getBenchmarkSiblings,
   type BenchmarkSiblings,
@@ -10,10 +11,10 @@ import { idQueryRoute } from '../id-routes';
 
 export const dynamic = 'force-dynamic';
 
-const getCachedSiblings = cachedQuery(
-  (id: number): Promise<BenchmarkSiblings | null> => getBenchmarkSiblings(getDb(), id),
-  'benchmark-siblings',
-);
+const getCachedSiblings = cachedQuery((id: number): Promise<BenchmarkSiblings | null> => {
+  if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarkSiblings(id));
+  return getBenchmarkSiblings(getDb(), id);
+}, 'benchmark-siblings');
 
 /**
  * GET /api/v1/benchmark-siblings?id=N
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
index 84cc15e3..61672759 100644
--- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
@@ -1,6 +1,7 @@
 import { type NextRequest, NextResponse } from 'next/server';
 
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getConversation,
   type ConversationDetail,
@@ -11,8 +12,10 @@ import { cachedJson, cachedQuery } from '@/lib/api-cache';
 export const dynamic = 'force-dynamic';
 
 const getCachedConversation = cachedQuery(
-  (slug: string, convId: string): Promise<ConversationDetail | null> =>
-    getConversation(getDb(), slug, convId),
+  (slug: string, convId: string): Promise<ConversationDetail | null> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getConversation(slug, convId));
+    return getConversation(getDb(), slug, convId);
+  },
   'dataset-conversation',
 );
 
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
index 62b9e5b7..196c29d6 100644
--- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
@@ -1,6 +1,7 @@
 import { type NextRequest, NextResponse } from 'next/server';
 
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   listConversations,
   type ConversationList,
@@ -20,13 +21,16 @@ const getCachedConversations = cachedQuery(
     limit: number,
     offset: number,
     sort: string,
-  ): Promise<ConversationList | null> =>
-    listConversations(getDb(), slug, {
+  ): Promise<ConversationList | null> => {
+    const opts: ListConversationsOpts = {
       search: search || undefined,
       limit,
       offset,
       sort: sort as ListConversationsOpts['sort'],
-    }),
+    };
+    if (JSON_MODE) return Promise.resolve(jsonProvider.listConversations(slug, opts));
+    return listConversations(getDb(), slug, opts);
+  },
   'dataset-conversations',
 );
 
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
index 9e4af580..e440ff5d 100644
--- a/packages/app/src/app/api/v1/datasets/[slug]/route.ts
+++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
@@ -1,16 +1,17 @@
 import { type NextRequest, NextResponse } from 'next/server';
 
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets';
 
 import { cachedJson, cachedQuery } from '@/lib/api-cache';
 
 export const dynamic = 'force-dynamic';
 
-const getCachedDataset = cachedQuery(
-  (slug: string): Promise<DatasetDetail | null> => getDataset(getDb(), slug),
-  'dataset',
-);
+const getCachedDataset = cachedQuery((slug: string): Promise<DatasetDetail | null> => {
+  if (JSON_MODE) return Promise.resolve(jsonProvider.getDataset(slug));
+  return getDataset(getDb(), slug);
+}, 'dataset');
 
 /** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. */
 export async function GET(
diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts
index f0acca3c..3ad4c15d 100644
--- a/packages/app/src/app/api/v1/datasets/route.ts
+++ b/packages/app/src/app/api/v1/datasets/route.ts
@@ -1,16 +1,17 @@
 import { NextResponse } from 'next/server';
 
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets';
 
 import { cachedJson, cachedQuery } from '@/lib/api-cache';
 
 export const dynamic = 'force-dynamic';
 
-const getCachedDatasets = cachedQuery(
-  (): Promise<DatasetRecord[]> => listDatasets(getDb()),
-  'datasets',
-);
+const getCachedDatasets = cachedQuery((): Promise<DatasetRecord[]> => {
+  if (JSON_MODE) return Promise.resolve(jsonProvider.listDatasets());
+  return listDatasets(getDb());
+}, 'datasets');
 
 /** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards). */
 export async function GET() {
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index 647b6dda..3afa5d41 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -1,5 +1,6 @@
 import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getDerivedAgenticMetrics,
   type DerivedAgenticMetricMap,
@@ -24,7 +25,10 @@ export const dynamic = 'force-dynamic';
 export const CACHE_KEY_PREFIX = `derived-agentic-metrics-v${STATS_VERSION}`;
 
 const getCachedDerivedAgenticMetrics = cachedQuery(
-  (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
+  (ids: number[]): Promise<DerivedAgenticMetricMap> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getDerivedAgenticMetrics(ids));
+    return getDerivedAgenticMetrics(getDb(), ids);
+  },
   CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
index bd1d67f5..89b599af 100644
--- a/packages/app/src/app/api/v1/request-timeline/route.ts
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -1,5 +1,6 @@
 import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getRequestTimeline,
   type RequestTimeline,
@@ -19,7 +20,10 @@ export const dynamic = 'force-dynamic';
 export const CACHE_KEY_PREFIX = `request-timeline-v${REQUEST_TIMELINE_VERSION}`;
 
 const getCachedRequestTimeline = cachedQuery(
-  (id: number): Promise<RequestTimeline | null> => getRequestTimeline(getDb(), id),
+  (id: number): Promise<RequestTimeline | null> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getRequestTimeline(id));
+    return getRequestTimeline(getDb(), id);
+  },
   CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
index 206205f5..4d3014ab 100644
--- a/packages/app/src/app/api/v1/trace-histograms/route.ts
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -1,5 +1,6 @@
 import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getTraceHistograms,
   type TraceHistogramMap,
@@ -24,7 +25,10 @@ export const dynamic = 'force-dynamic';
 export const CACHE_KEY_PREFIX = `trace-histograms-v${REQUEST_TIMELINE_VERSION}`;
 
 const getCachedTraceHistograms = cachedQuery(
-  (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
+  (ids: number[]): Promise<TraceHistogramMap> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getTraceHistograms(ids));
+    return getTraceHistograms(getDb(), ids);
+  },
   CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
index 149fefbf..2d3554a4 100644
--- a/packages/app/src/app/api/v1/trace-server-metrics/route.ts
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -1,5 +1,6 @@
 import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series';
-import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getTraceServerMetrics,
   type TraceServerMetrics,
@@ -19,7 +20,10 @@ export const dynamic = 'force-dynamic';
 export const CACHE_KEY_PREFIX = `trace-server-metrics-v${CHART_SERIES_VERSION}`;
 
 const getCachedTraceServerMetrics = cachedQuery(
-  (id: number): Promise<TraceServerMetrics | null> => getTraceServerMetrics(getDb(), id),
+  (id: number): Promise<TraceServerMetrics | null> => {
+    if (JSON_MODE) return jsonProvider.getTraceServerMetrics(id);
+    return getTraceServerMetrics(getDb(), id);
+  },
   CACHE_KEY_PREFIX,
   { blobOnly: true },
 );
diff --git a/packages/constants/src/tables.ts b/packages/constants/src/tables.ts
index 60e85182..f482fd5e 100644
--- a/packages/constants/src/tables.ts
+++ b/packages/constants/src/tables.ts
@@ -2,6 +2,7 @@
 export const TABLE_NAMES = {
   configs: 'configs',
   workflowRuns: 'workflow_runs',
+  agenticTraceReplay: 'agentic_trace_replay',
   benchmarkResults: 'benchmark_results',
   serverLogs: 'server_logs',
   runStats: 'run_stats',
@@ -9,21 +10,38 @@ export const TABLE_NAMES = {
   evalSamples: 'eval_samples',
   changelogEntries: 'changelog_entries',
   availability: 'availability',
+  datasets: 'datasets',
+  datasetConversations: 'dataset_conversations',
+  runDatasets: 'run_datasets',
   schemaMigrations: 'schema_migrations',
 } as const;
 
 /**
  * Data tables in FK-safe insertion order.
  * Parents before children — safe for dump, load, and (reversed) reset.
+ *
+ * FK edges enforced by this ordering (verified against migration 008_agentic.sql
+ * and the live schema's pg_constraint):
+ *   - benchmark_results.trace_replay_id → agentic_trace_replay(id)
+ *       ⇒ agentic_trace_replay before benchmark_results
+ *   - dataset_conversations.dataset_id → datasets(id)
+ *       ⇒ datasets before dataset_conversations
+ *   - run_datasets.workflow_run_id → workflow_runs(id)
+ *       ⇒ workflow_runs before run_datasets (run_datasets.dataset_slug is a
+ *         plain slug, NOT an FK to datasets, so it needs no ordering vs datasets)
  */
 export const TABLE_INSERT_ORDER = [
   TABLE_NAMES.configs,
   TABLE_NAMES.serverLogs,
   TABLE_NAMES.workflowRuns,
+  TABLE_NAMES.agenticTraceReplay,
   TABLE_NAMES.benchmarkResults,
   TABLE_NAMES.evalResults,
   TABLE_NAMES.evalSamples,
   TABLE_NAMES.runStats,
   TABLE_NAMES.changelogEntries,
   TABLE_NAMES.availability,
+  TABLE_NAMES.datasets,
+  TABLE_NAMES.datasetConversations,
+  TABLE_NAMES.runDatasets,
 ] as const;
diff --git a/packages/db/src/dump-db.ts b/packages/db/src/dump-db.ts
index 3810fe7a..d0e315d1 100644
--- a/packages/db/src/dump-db.ts
+++ b/packages/db/src/dump-db.ts
@@ -18,7 +18,25 @@ const sql = createAdminSql({ noSsl: hasNoSslFlag(), readonly: true, max: 1 });
 
 const CURSOR_BATCH = 100;
 
-/** Stream a table to a JSON file using a cursor, writing row-by-row. */
+/**
+ * Stream a table to a JSON file using a cursor, writing row-by-row.
+ *
+ * BYTEA round-trip: postgres.js decodes a `bytea` column to a Node `Buffer`.
+ * `JSON.stringify(buffer)` invokes Buffer.prototype.toJSON(), which emits
+ * `{"type":"Buffer","data":[<byte>, …]}`. That's a lossless byte-array encoding
+ * (verified: JSON.parse → Buffer.from(obj.data) reproduces the exact bytes), so
+ * `agentic_trace_replay`'s blob columns (profile_export_jsonl_gz,
+ * server_metrics_csv, server_metrics_json_gz) survive the dump verbatim.
+ * load-dump.ts reconstructs the Buffer and casts it back to `::bytea`.
+ *
+ * Dump-size note: the byte-array encoding is ~4-6× the raw bytea size (each
+ * byte becomes 1-4 ASCII digits + a comma). For the big compressed blobs
+ * (server_metrics_json_gz can be ~17 MB compressed on high-conc TP+EP rows)
+ * the resulting agentic_trace_replay.json is the largest file in the dump — the
+ * same trade-off server_logs.json already makes. We keep all columns (no
+ * dropping) so dump mode has full parity with the DB, and json-provider
+ * lazy-loads this file only when a blob-backed route actually needs a fallback.
+ */
 async function streamTable(table: string, outPath: string): Promise<number> {
   const out = createWriteStream(outPath);
   out.write('[\n');
diff --git a/packages/db/src/json-provider.agentic-datasets.test.ts b/packages/db/src/json-provider.agentic-datasets.test.ts
new file mode 100644
index 00000000..d6cb6601
--- /dev/null
+++ b/packages/db/src/json-provider.agentic-datasets.test.ts
@@ -0,0 +1,592 @@
+import { mkdtempSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { gzipSync } from 'node:zlib';
+
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION } from './etl/compute-request-timeline.js';
+import { STATS_VERSION } from './queries/agentic-shared.js';
+import type * as JsonProvider from './json-provider.js';
+
+/**
+ * Fixture-backed parity tests for the PR348 dump-mode mirrors added to
+ * json-provider.ts: the 6 agentic per-point queries + the 4 dataset queries.
+ *
+ * The store is a lazy singleton keyed off DUMP_DIR, so we write a small dump
+ * directory, point DUMP_DIR at it, and dynamically import the module once.
+ *
+ * Coverage per mirror:
+ *  - fast path: precomputed JSONB (aggregate_stats / chart_series /
+ *    request_timeline) at the CURRENT version is served verbatim.
+ *  - blob fallback: a STALE version forces a re-derive from the (dumped) blob
+ *    using the same pure helper the SQL path uses.
+ *  - bytea round-trip: blobs are stored as {type:'Buffer',data:[…]} (what
+ *    dump-db emits) and must gunzip cleanly.
+ */
+
+/** Encode a Buffer the way dump-db.ts does (Buffer.prototype.toJSON()). */
+function byteaJson(buf: Buffer): { type: 'Buffer'; data: number[] } {
+  return { type: 'Buffer', data: [...buf] };
+}
+
+// A tiny profiling-phase profile_export.jsonl with two conversations/turns so
+// extractIslOsl / computeDerivedFromBlob / computeRequestTimeline all produce
+// non-empty output.
+const PROFILE_JSONL = [
+  JSON.stringify({
+    metadata: {
+      benchmark_phase: 'profiling',
+      conversation_id: 'convA',
+      turn_index: 0,
+      worker_id: 'w0',
+      credit_issued_ns: 1_000_000_000,
+      request_start_ns: 1_000_000_000,
+      request_ack_ns: 1_050_000_000,
+      request_end_ns: 1_500_000_000,
+    },
+    metrics: {
+      input_sequence_length: { value: 1000 },
+      output_sequence_length: { value: 200 },
+      time_to_first_token: { value: 50 },
+      request_latency: { value: 500 },
+    },
+  }),
+  JSON.stringify({
+    metadata: {
+      benchmark_phase: 'profiling',
+      conversation_id: 'convB',
+      turn_index: 0,
+      worker_id: 'w1',
+      credit_issued_ns: 2_000_000_000,
+      request_start_ns: 2_000_000_000,
+      request_ack_ns: 2_040_000_000,
+      request_end_ns: 2_800_000_000,
+    },
+    metrics: {
+      input_sequence_length: { value: 2000 },
+      output_sequence_length: { value: 400 },
+      time_to_first_token: { value: 40 },
+      request_latency: { value: 800 },
+    },
+  }),
+].join('\n');
+
+// A minimal server_metrics_json with one KV-cache gauge series so
+// extractServerMetricSamples / computeChartSeries yield a value.
+const SERVER_JSON = JSON.stringify({
+  metrics: {
+    'vllm:kv_cache_usage_perc': {
+      series: [
+        {
+          labels: { engine: '0' },
+          timeslices: [
+            { start_ns: 0, avg: 0.4 },
+            { start_ns: 1_000_000_000, avg: 0.6 },
+          ],
+        },
+      ],
+    },
+  },
+});
+
+const PROFILE_GZ = gzipSync(Buffer.from(PROFILE_JSONL, 'utf8'));
+const SERVER_GZ = gzipSync(Buffer.from(SERVER_JSON, 'utf8'));
+
+// Precomputed JSONB payloads at the CURRENT versions (fast path).
+const CURRENT_AGG_STATS = {
+  version: STATS_VERSION,
+  isl: { mean: 1500, p50: 1500, p75: 1750, p90: 1900, p99: 1990, n: 2 },
+  osl: { mean: 300, p50: 300, p75: 350, p90: 380, p99: 398, n: 2 },
+  kvCacheUtil: { mean: 0.5, p50: 0.5, p75: 0.55, p90: 0.58, p99: 0.6, n: 2 },
+  prefixCacheHitRate: null,
+  normalizedSessionTimeS: 0.65,
+  p90PrefillTpsPerUser: 42,
+  normalizedE2e400: { mean: 0.5, p50: 0.5, p75: 0.7, p90: 0.9, p99: 0.99, n: 2 },
+};
+
+const CURRENT_TIMELINE = {
+  version: REQUEST_TIMELINE_VERSION,
+  startNs: 0,
+  endNs: 1_000_000,
+  durationS: 0.001,
+  requests: [
+    {
+      cid: 'convA',
+      ti: 0,
+      wid: 'w0',
+      ad: 0,
+      phase: 'profiling',
+      credit: 0,
+      start: 0,
+      ack: 5,
+      end: 500,
+      ttftMs: 50,
+      tpotMs: null,
+      isl: 1000,
+      osl: 200,
+      cancelled: false,
+    },
+  ],
+};
+
+let jp: typeof JsonProvider;
+
+beforeAll(async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'infx-pr348-'));
+
+  // configs / workflow_runs / benchmark_results — enough for the agentic mirrors.
+  writeFileSync(
+    join(dir, 'configs.json'),
+    JSON.stringify([
+      {
+        id: 1,
+        hardware: 'h100',
+        framework: 'vllm',
+        model: 'testm',
+        precision: 'fp8',
+        spec_method: 'none',
+        disagg: false,
+        is_multinode: false,
+        prefill_tp: 1,
+        prefill_ep: 1,
+        prefill_dp_attention: false,
+        prefill_num_workers: 1,
+        decode_tp: 2,
+        decode_ep: 1,
+        decode_dp_attention: false,
+        decode_num_workers: 1,
+        num_prefill_gpu: 0,
+        num_decode_gpu: 8,
+      },
+      {
+        id: 2,
+        hardware: 'h100',
+        framework: 'vllm',
+        model: 'testm',
+        precision: 'fp8',
+        spec_method: 'none',
+        disagg: false,
+        is_multinode: false,
+        prefill_tp: 1,
+        prefill_ep: 1,
+        prefill_dp_attention: false,
+        prefill_num_workers: 1,
+        decode_tp: 4,
+        decode_ep: 1,
+        decode_dp_attention: false,
+        decode_num_workers: 1,
+        num_prefill_gpu: 0,
+        num_decode_gpu: 8,
+      },
+    ]),
+  );
+  writeFileSync(
+    join(dir, 'workflow_runs.json'),
+    JSON.stringify([
+      {
+        id: 10,
+        github_run_id: 555,
+        run_attempt: 1,
+        name: 'run 555',
+        status: 'completed',
+        conclusion: 'success',
+        head_sha: 'sha',
+        head_branch: 'main',
+        html_url: 'https://github.com/x/runs/555',
+        created_at: '2026-06-14T04:00:00Z',
+        run_started_at: '2026-06-14T04:00:00Z',
+        date: '2026-06-14',
+      },
+    ]),
+  );
+  // id 1 → trace_replay 100 (fast-path stats + timeline). id 2 → trace_replay 200
+  // (STALE stats + timeline → forces blob fallback). id 3 has no trace_replay.
+  writeFileSync(
+    join(dir, 'benchmark_results.json'),
+    JSON.stringify([
+      {
+        id: 1,
+        workflow_run_id: 10,
+        config_id: 1,
+        benchmark_type: 'agentic_traces',
+        date: '2026-06-14',
+        isl: null,
+        osl: null,
+        conc: 16,
+        offload_mode: 'off',
+        image: null,
+        metrics: {
+          tput_per_gpu: 123,
+          total_requests_completed: 200,
+          server_gpu_cache_hit_rate: 0.5,
+        },
+        error: null,
+        server_log_id: null,
+        trace_replay_id: 100,
+      },
+      {
+        id: 2,
+        workflow_run_id: 10,
+        config_id: 2,
+        benchmark_type: 'agentic_traces',
+        date: '2026-06-14',
+        isl: null,
+        osl: null,
+        conc: 32,
+        offload_mode: 'on',
+        image: null,
+        metrics: { tput_per_gpu: 456, num_requests_total: 180 },
+        error: null,
+        server_log_id: null,
+        trace_replay_id: 200,
+      },
+      {
+        id: 3,
+        workflow_run_id: 10,
+        config_id: 1,
+        benchmark_type: 'agentic_traces',
+        date: '2026-06-14',
+        isl: null,
+        osl: null,
+        conc: 8,
+        offload_mode: 'off',
+        image: null,
+        metrics: {},
+        error: null,
+        server_log_id: null,
+        trace_replay_id: null,
+      },
+    ]),
+  );
+
+  // agentic_trace_replay: 100 = current JSONB, 200 = stale JSONB (force blob).
+  writeFileSync(
+    join(dir, 'agentic_trace_replay.json'),
+    JSON.stringify([
+      {
+        id: 100,
+        profile_export_jsonl_gz: byteaJson(PROFILE_GZ),
+        profile_export_uncompressed_size: PROFILE_JSONL.length,
+        server_metrics_csv: null,
+        server_metrics_csv_size: null,
+        server_metrics_json_gz: byteaJson(SERVER_GZ),
+        server_metrics_json_uncompressed_size: SERVER_JSON.length,
+        aggregate_stats: CURRENT_AGG_STATS,
+        chart_series: null, // no current chart_series → trace-server-metrics uses blob
+        request_timeline: CURRENT_TIMELINE,
+        created_at: '2026-06-14T04:00:00Z',
+      },
+      {
+        id: 200,
+        profile_export_jsonl_gz: byteaJson(PROFILE_GZ),
+        profile_export_uncompressed_size: PROFILE_JSONL.length,
+        server_metrics_csv: null,
+        server_metrics_csv_size: null,
+        server_metrics_json_gz: byteaJson(SERVER_GZ),
+        server_metrics_json_uncompressed_size: SERVER_JSON.length,
+        aggregate_stats: { version: 1 }, // stale → force profile-blob fallback
+        chart_series: { version: 1 }, // stale → force server-blob fallback
+        request_timeline: { version: 1 }, // stale → force profile-blob fallback
+        created_at: '2026-06-14T04:00:00Z',
+      },
+    ]),
+  );
+
+  // Datasets fixtures.
+  writeFileSync(
+    join(dir, 'datasets.json'),
+    JSON.stringify([
+      {
+        id: 'org/ds-new',
+        slug: 'ds-new',
+        label: 'DS New',
+        variant: 'full',
+        description: 'newest',
+        hf_url: null,
+        license: null,
+        conversation_count: 3,
+        summary: { totalIn: 100 },
+        chart_data: { hist: [1, 2, 3] },
+        dataset_version: 1,
+        ingested_at: '2026-06-20T00:00:00Z',
+      },
+      {
+        id: 'org/ds-old',
+        slug: 'ds-old',
+        label: 'DS Old',
+        variant: 'full',
+        description: 'oldest',
+        hf_url: null,
+        license: null,
+        conversation_count: 0,
+        summary: {},
+        chart_data: {},
+        dataset_version: 1,
+        ingested_at: '2026-06-10T00:00:00Z',
+      },
+    ]),
+  );
+  writeFileSync(
+    join(dir, 'dataset_conversations.json'),
+    JSON.stringify([
+      {
+        id: 1,
+        dataset_id: 'org/ds-new',
+        conv_id: 'agent-alpha',
+        models: ['m1'],
+        num_turns: 5,
+        num_subagent_groups: 2,
+        total_in: 300,
+        total_out: 30,
+        total_cached: 10,
+        structure: { nodes: [] },
+      },
+      {
+        id: 2,
+        dataset_id: 'org/ds-new',
+        conv_id: 'AGENT-beta',
+        models: ['m2'],
+        num_turns: 9,
+        num_subagent_groups: 1,
+        total_in: 100,
+        total_out: 20,
+        total_cached: 5,
+        structure: { nodes: [{ kind: 'turn' }] },
+      },
+      {
+        id: 3,
+        dataset_id: 'org/ds-new',
+        conv_id: 'plain-gamma',
+        models: ['m1'],
+        num_turns: 2,
+        num_subagent_groups: 4,
+        total_in: 200,
+        total_out: 40,
+        total_cached: 15,
+        structure: { nodes: [] },
+      },
+    ]),
+  );
+  writeFileSync(
+    join(dir, 'run_datasets.json'),
+    JSON.stringify([
+      { workflow_run_id: 10, dataset_slug: 'ds-new', created_at: '2026-06-14T04:00:00Z' },
+    ]),
+  );
+
+  // Empty tables the store loads eagerly.
+  for (const f of [
+    'run_stats.json',
+    'eval_results.json',
+    'availability.json',
+    'changelog_entries.json',
+  ]) {
+    writeFileSync(join(dir, f), '[]');
+  }
+
+  process.env.DUMP_DIR = dir;
+  jp = await import('./json-provider.js');
+});
+
+afterAll(() => {
+  delete process.env.DUMP_DIR;
+});
+
+describe('agentic aggregates mirror', () => {
+  it('serves precomputed aggregate_stats at the current version (fast path)', () => {
+    const map = jp.getAgenticAggregates([1]);
+    expect(map[1]?.isl).toEqual(CURRENT_AGG_STATS.isl);
+    expect(map[1]?.kvCacheUtil).toEqual(CURRENT_AGG_STATS.kvCacheUtil);
+  });
+
+  it('re-derives from the dumped blobs when the stored version is stale', () => {
+    const map = jp.getAgenticAggregates([2]);
+    // isl percentiles from the two-turn profile blob (1000, 2000).
+    expect(map[2]?.isl?.n).toBe(2);
+    expect(map[2]?.isl?.mean).toBe(1500);
+    // kv cache util from the server blob (0.4, 0.6).
+    expect(map[2]?.kvCacheUtil?.n).toBe(2);
+    expect(map[2]?.kvCacheUtil?.mean).toBeCloseTo(0.5);
+  });
+
+  it('returns a blank aggregate for an id with no trace_replay', () => {
+    const map = jp.getAgenticAggregates([3]);
+    expect(map[3]).toEqual({
+      id: 3,
+      isl: null,
+      osl: null,
+      kvCacheUtil: null,
+      prefixCacheHitRate: null,
+    });
+  });
+});
+
+describe('derived agentic metrics mirror', () => {
+  it('fast path reads the derived fields out of aggregate_stats', () => {
+    const map = jp.getDerivedAgenticMetrics([1]);
+    expect(map[1]?.normalized_session_time_s).toBe(0.65);
+    expect(map[1]?.p90_prefill_tps_per_user).toBe(42);
+    expect(map[1]?.p75_normalized_e2e_400_s).toBe(0.7);
+  });
+
+  it('blob fallback recomputes via computeDerivedFromBlob', () => {
+    const map = jp.getDerivedAgenticMetrics([2]);
+    expect(map[2]?.normalized_session_time_s).not.toBeNull();
+    expect(map[2]?.p90_prefill_tps_per_user).not.toBeNull();
+  });
+
+  it('omits ids without a trace_replay (SQL joins on it)', () => {
+    const map = jp.getDerivedAgenticMetrics([3]);
+    expect(map[3]).toBeUndefined();
+  });
+});
+
+describe('request timeline mirror', () => {
+  it('serves the precomputed timeline at the current version', () => {
+    const t = jp.getRequestTimeline(1);
+    expect(t?.version).toBe(REQUEST_TIMELINE_VERSION);
+    expect(t?.requests).toHaveLength(1);
+  });
+
+  it('recomputes from the profile blob when stale', () => {
+    const t = jp.getRequestTimeline(2);
+    expect(t?.version).toBe(REQUEST_TIMELINE_VERSION);
+    // Two turns in the fixture blob.
+    expect(t?.requests).toHaveLength(2);
+  });
+
+  it('returns null for an id without a trace_replay', () => {
+    expect(jp.getRequestTimeline(3)).toBeNull();
+  });
+});
+
+describe('trace server metrics mirror', () => {
+  it('computes chart series from the server blob (no current chart_series)', async () => {
+    const m = await jp.getTraceServerMetrics(1);
+    expect(m).not.toBeNull();
+    expect(m?.meta.hardware).toBe('h100');
+    expect(m?.meta.run_url).toBe('https://github.com/x/runs/555/attempts/1');
+    expect(m?.kvCacheUsage.length).toBeGreaterThan(0);
+  });
+
+  it('returns null for an id without a trace_replay blob', async () => {
+    expect(await jp.getTraceServerMetrics(3)).toBeNull();
+  });
+});
+
+describe('trace histograms mirror', () => {
+  it('extracts isl/osl from the current request_timeline (fast path)', () => {
+    const map = jp.getTraceHistograms([1]);
+    expect(map[1]?.isl).toEqual([1000]);
+    expect(map[1]?.osl).toEqual([200]);
+  });
+
+  it('falls back to the profile blob when the timeline is stale', () => {
+    const map = jp.getTraceHistograms([2]);
+    expect(map[2]?.isl).toEqual([1000, 2000]);
+    expect(map[2]?.osl).toEqual([200, 400]);
+  });
+
+  it('omits ids without a trace_replay', () => {
+    const map = jp.getTraceHistograms([3]);
+    expect(map[3]).toBeUndefined();
+  });
+});
+
+describe('benchmark siblings mirror', () => {
+  it('groups rows sharing the SKU within the run, sorted by decode_tp then offload', () => {
+    const res = jp.getBenchmarkSiblings(1);
+    expect(res).not.toBeNull();
+    expect(res?.sku.model).toBe('testm');
+    expect(res?.sku.dataset_slug).toBe('ds-new'); // via run_datasets
+    // ids 1 (tp2/off/conc16), 2 (tp4/on), 3 (tp2/off/conc8) share the SKU.
+    // ORDER BY decode_tp asc → tp2 group (ids 1,3) before tp4 (id 2); within
+    // tp2 both are offload 'off', so final tie-break is conc asc → id 3 (conc 8)
+    // before id 1 (conc 16). Matches the SQL `order by … br.conc`.
+    const ids = res?.siblings.map((s) => s.id);
+    expect(ids).toEqual([3, 1, 2]);
+    expect(res?.siblings.find((s) => s.id === 1)?.is_current).toBe(true);
+    expect(res?.siblings.find((s) => s.id === 1)?.has_trace).toBe(true);
+    expect(res?.siblings.find((s) => s.id === 3)?.has_trace).toBe(false);
+    // total_requests coalesces total_requests_completed then num_requests_total.
+    expect(res?.siblings.find((s) => s.id === 1)?.total_requests).toBe(200);
+    expect(res?.siblings.find((s) => s.id === 2)?.total_requests).toBe(180);
+  });
+
+  it('returns null for an unknown benchmark id', () => {
+    expect(jp.getBenchmarkSiblings(9999)).toBeNull();
+  });
+});
+
+describe('dataset mirrors', () => {
+  it('listDatasets orders newest ingested first', () => {
+    const rows = jp.listDatasets();
+    expect(rows.map((r) => r.slug)).toEqual(['ds-new', 'ds-old']);
+    // chart_data is excluded from the list rows (DatasetRecord, not DatasetDetail).
+    expect((rows[0] as unknown as Record<string, unknown>).chart_data).toBeUndefined();
+  });
+
+  it('getDataset returns one dataset including chart_data', () => {
+    const d = jp.getDataset('ds-new');
+    expect(d?.label).toBe('DS New');
+    expect(d?.chart_data).toEqual({ hist: [1, 2, 3] });
+    expect(jp.getDataset('nope')).toBeNull();
+  });
+
+  it('renders ingested_at in Postgres ::text form (parity with the SQL path)', () => {
+    // Dump stores ISO ('2026-06-20T00:00:00Z'); the SQL query casts ::text →
+    // '2026-06-20 00:00:00+00'. The mirror must match, not leak the ISO form.
+    expect(jp.getDataset('ds-new')?.ingested_at).toBe('2026-06-20 00:00:00+00');
+    expect(jp.listDatasets()[0]?.ingested_at).toBe('2026-06-20 00:00:00+00');
+  });
+
+  it('listConversations applies case-insensitive search, sort, and pagination', () => {
+    // Default sort = tokens (total_in desc): alpha(300), plain(200), AGENT-beta(100).
+    const all = jp.listConversations('ds-new');
+    expect(all?.total).toBe(3);
+    expect(all?.items.map((c) => c.conv_id)).toEqual(['agent-alpha', 'plain-gamma', 'AGENT-beta']);
+
+    // ILIKE '%agent%' matches 'agent-alpha' and 'AGENT-beta' (case-insensitive).
+    const search = jp.listConversations('ds-new', { search: 'agent' });
+    expect(search?.total).toBe(2);
+    expect(search?.items.map((c) => c.conv_id).toSorted()).toEqual(['AGENT-beta', 'agent-alpha']);
+
+    // sort=turns desc → beta(9), alpha(5), gamma(2).
+    const byTurns = jp.listConversations('ds-new', { sort: 'turns' });
+    expect(byTurns?.items.map((c) => c.conv_id)).toEqual([
+      'AGENT-beta',
+      'agent-alpha',
+      'plain-gamma',
+    ]);
+
+    // sort=subagents desc → gamma(4), alpha(2), beta(1).
+    const bySub = jp.listConversations('ds-new', { sort: 'subagents' });
+    expect(bySub?.items.map((c) => c.conv_id)).toEqual([
+      'plain-gamma',
+      'agent-alpha',
+      'AGENT-beta',
+    ]);
+
+    // sort=id asc. Postgres en_US.utf8 collation (verified against the live DB)
+    // orders 'agent-alpha' before 'AGENT-beta'; String.localeCompare matches.
+    const byId = jp.listConversations('ds-new', { sort: 'id' });
+    expect(byId?.items.map((c) => c.conv_id)).toEqual(['agent-alpha', 'AGENT-beta', 'plain-gamma']);
+
+    // limit + offset.
+    const paged = jp.listConversations('ds-new', { limit: 1, offset: 1 });
+    expect(paged?.total).toBe(3);
+    expect(paged?.items.map((c) => c.conv_id)).toEqual(['plain-gamma']);
+
+    // Unknown dataset → null.
+    expect(jp.listConversations('nope')).toBeNull();
+  });
+
+  it('getConversation returns one flamegraph structure', () => {
+    const c = jp.getConversation('ds-new', 'agent-alpha');
+    expect(c?.num_turns).toBe(5);
+    expect(c?.structure).toEqual({ nodes: [] });
+    expect(jp.getConversation('ds-new', 'missing')).toBeNull();
+    expect(jp.getConversation('nope', 'agent-alpha')).toBeNull();
+  });
+});
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 2d335d17..f6c626f0 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -12,9 +12,47 @@ import { existsSync, readFileSync } from 'node:fs';
 import { dirname, resolve } from 'node:path';
 import { fileURLToPath } from 'node:url';
 
+// Runtime-value cross-module imports use extensionless relative paths (the
+// convention in etl/queries here), NOT the `.js` type-only style below — the
+// app bundler (Turbopack) resolves the former but not a `.js` on a value import.
+import {
+  CHART_SERIES_VERSION,
+  computeChartSeries,
+  type ChartSeries,
+} from './etl/compute-chart-series';
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+  type RequestTimeline,
+} from './etl/compute-request-timeline';
+import {
+  extractIslOsl,
+  extractServerMetricSamples,
+  percentilesOf,
+  STATS_VERSION,
+  type AgenticAggregate,
+  type AgenticAggregateMap,
+} from './queries/agentic-aggregates';
 import type { BenchmarkRow, BenchmarkWorkerRow } from './queries/benchmarks.js';
+import type { BenchmarkSiblings } from './queries/benchmark-siblings.js';
+import type {
+  ConversationDetail,
+  ConversationList,
+  ConversationListItem,
+  DatasetDetail,
+  DatasetRecord,
+  ListConversationsOpts,
+} from './queries/datasets.js';
+import {
+  computeDerivedFromBlob,
+  type DerivedAgenticMetric,
+  type DerivedAgenticMetricMap,
+} from './queries/derived-agentic-metrics';
 import type { EvalRow } from './queries/evaluations.js';
 import type { ReliabilityRow } from './queries/reliability.js';
+import type { TraceHistogramMap, TraceHistogramPoint } from './queries/trace-histograms.js';
+import type { PointMeta, TraceServerMetrics } from './queries/trace-server-metrics.js';
+import type { ConversationStructure } from './etl/weka-structure.js';
 import type {
   AvailabilityRow,
   ChangelogRow,
@@ -22,6 +60,7 @@ import type {
   RunConfigRow,
   WorkflowRunRow,
 } from './queries/workflow-info.js';
+import { gunzipSync } from 'node:zlib';
 
 // ---------------------------------------------------------------------------
 // Raw table types (matching dump-db.ts output)
@@ -132,6 +171,78 @@ interface RawServerLog {
   server_log: string;
 }
 
+/**
+ * A serialized bytea column from the dump. dump-db.ts writes postgres.js Buffers
+ * via Buffer.prototype.toJSON() → {"type":"Buffer","data":[…]}. Decode with
+ * {@link bufferFromJson} back to a Node Buffer for the compute helpers (which
+ * take the same `Buffer | null` a live DB read would hand them).
+ */
+interface BufferJson {
+  type: 'Buffer';
+  data: number[];
+}
+
+/**
+ * agentic_trace_replay rows. Blob columns are big (server_metrics_json_gz can be
+ * ~17 MB compressed), so this whole table is lazy-loaded like server_logs. The
+ * precomputed JSONB columns (aggregate_stats / chart_series / request_timeline)
+ * are what the fast paths actually serve; the blobs only feed the version-stale
+ * fallback (reusing the exact same compute helpers the SQL path uses).
+ */
+interface RawTraceReplay {
+  id: number;
+  profile_export_jsonl_gz: BufferJson | null;
+  profile_export_uncompressed_size: number | null;
+  server_metrics_csv: BufferJson | null;
+  server_metrics_csv_size: number | null;
+  server_metrics_json_gz: BufferJson | null;
+  server_metrics_json_uncompressed_size: number | null;
+  aggregate_stats: Record<string, unknown> | null;
+  chart_series: Record<string, unknown> | null;
+  request_timeline: Record<string, unknown> | null;
+  created_at: string;
+}
+
+interface RawDataset {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: Record<string, unknown>;
+  chart_data: Record<string, unknown>;
+  dataset_version: number;
+  ingested_at: string;
+}
+
+interface RawDatasetConversation {
+  id: number;
+  dataset_id: string;
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: Record<string, unknown>;
+}
+
+interface RawRunDataset {
+  workflow_run_id: number;
+  dataset_slug: string;
+  created_at: string;
+}
+
+/** Decode a dumped bytea ({type:'Buffer',data:[…]}) back into a Node Buffer. */
+function bufferFromJson(b: BufferJson | null | undefined): Buffer | null {
+  if (!b || !Array.isArray(b.data)) return null;
+  return Buffer.from(b.data);
+}
+
 // ---------------------------------------------------------------------------
 // In-memory store (lazy-loaded singleton)
 // ---------------------------------------------------------------------------
@@ -152,6 +263,24 @@ interface Store {
   serverLogs: Map<number, string> | null;
   /** benchmark_result.id → server_log_id (for server-log lookups) */
   benchmarkServerLogMap: Map<number, number>;
+  /** benchmark_result.id → trace_replay_id (for agentic blob-backed lookups) */
+  benchmarkTraceReplayMap: Map<number, number>;
+  /**
+   * Lazy-loaded: agentic_trace_replay.json holds the big compressed blobs.
+   * Keyed by trace_replay id. Loaded on first agentic-route access, mirroring
+   * the server_logs lazy pattern. Null until then.
+   */
+  traceReplay: Map<number, RawTraceReplay> | null;
+  /** Datasets registry (small, eager). */
+  datasets: RawDataset[];
+  /** dataset id → dataset (fast lookup). */
+  datasetsById: Map<string, RawDataset>;
+  /** dataset slug → dataset (slug is unique). */
+  datasetsBySlug: Map<string, RawDataset>;
+  /** All conversation rows (eager; counts + structure JSONB, no blobs). */
+  datasetConversations: RawDatasetConversation[];
+  /** workflow_run_id → dataset_slug (for benchmark-siblings SKU deep-link). */
+  runDatasetSlugByRunId: Map<number, string>;
 }
 
 let store: Store | null = null;
@@ -192,6 +321,15 @@ function getStore(): Store {
   const rawEvals = loadTable<RawEvalResult>(resolvedDir, 'eval_results.json');
   const rawAvailability = loadTable<RawAvailability>(resolvedDir, 'availability.json');
   const rawChangelog = loadTable<RawChangelogEntry>(resolvedDir, 'changelog_entries.json');
+  // Datasets + run_datasets are small (registry rows + one row per run) and
+  // dataset_conversations holds only counts + a per-conversation structure
+  // JSONB — all comfortably eager. agentic_trace_replay is lazy (blobs) below.
+  const rawDatasets = loadTable<RawDataset>(resolvedDir, 'datasets.json');
+  const rawDatasetConversations = loadTable<RawDatasetConversation>(
+    resolvedDir,
+    'dataset_conversations.json',
+  );
+  const rawRunDatasets = loadTable<RawRunDataset>(resolvedDir, 'run_datasets.json');
 
   // Postgres bigserial columns serialize as strings in JSON — coerce to numbers.
   for (const wr of rawRuns) {
@@ -216,6 +354,18 @@ function getStore(): Store {
     cl.id = Number(cl.id);
     cl.workflow_run_id = Number(cl.workflow_run_id);
   }
+  // Postgres bigint/bigserial + integer columns serialize as strings in JSON —
+  // coerce to numbers so the mirrors do numeric math and JSON parity matches.
+  for (const d of rawDatasets) d.conversation_count = Number(d.conversation_count);
+  for (const dc of rawDatasetConversations) {
+    dc.id = Number(dc.id);
+    dc.num_turns = Number(dc.num_turns);
+    dc.num_subagent_groups = Number(dc.num_subagent_groups);
+    dc.total_in = Number(dc.total_in);
+    dc.total_out = Number(dc.total_out);
+    dc.total_cached = Number(dc.total_cached);
+  }
+  for (const rd of rawRunDatasets) rd.workflow_run_id = Number(rd.workflow_run_id);
 
   // Build configs index
   const configs = new Map<number, RawConfig>();
@@ -242,6 +392,26 @@ function getStore(): Store {
     }
   }
 
+  // Build benchmark → trace_replay_id map. `trace_replay_id` was added by the
+  // agentic migration; older dumps lack it (undefined → treated as "no trace").
+  const benchmarkTraceReplayMap = new Map<number, number>();
+  for (const br of rawBenchmarks) {
+    const trId = (br as { trace_replay_id?: number | string | null }).trace_replay_id;
+    if (trId !== null && trId !== undefined) {
+      benchmarkTraceReplayMap.set(br.id, Number(trId));
+    }
+  }
+
+  // Datasets indexes
+  const datasetsById = new Map<string, RawDataset>();
+  const datasetsBySlug = new Map<string, RawDataset>();
+  for (const d of rawDatasets) {
+    datasetsById.set(d.id, d);
+    datasetsBySlug.set(d.slug, d);
+  }
+  const runDatasetSlugByRunId = new Map<number, string>();
+  for (const rd of rawRunDatasets) runDatasetSlugByRunId.set(rd.workflow_run_id, rd.dataset_slug);
+
   store = {
     dumpDir: resolvedDir,
     configs,
@@ -254,15 +424,52 @@ function getStore(): Store {
     changelog: rawChangelog,
     serverLogs: null, // lazy-loaded on first getServerLog() call (can be multiple GB)
     benchmarkServerLogMap,
+    benchmarkTraceReplayMap,
+    traceReplay: null, // lazy-loaded on first agentic blob-backed access (blobs are big)
+    datasets: rawDatasets,
+    datasetsById,
+    datasetsBySlug,
+    datasetConversations: rawDatasetConversations,
+    runDatasetSlugByRunId,
   };
 
   console.log(
-    `json-provider: loaded ${rawConfigs.length} configs, ${latestRunsById.size} runs, ${rawBenchmarks.length} benchmarks`,
+    `json-provider: loaded ${rawConfigs.length} configs, ${latestRunsById.size} runs, ` +
+      `${rawBenchmarks.length} benchmarks, ${rawDatasets.length} datasets, ` +
+      `${rawDatasetConversations.length} conversations`,
   );
 
   return store;
 }
 
+/**
+ * Lazy-load agentic_trace_replay.json on first blob-backed access. Mirrors the
+ * server_logs lazy pattern — the file carries the big compressed blobs so we
+ * only pay to parse it when an agentic route actually needs a fallback (most
+ * routes serve the precomputed JSONB columns and never touch the blobs). The
+ * blob columns arrive as {type:'Buffer',data:[…]} and are decoded to Buffers on
+ * demand by the callers that need them.
+ */
+function getTraceReplay(): Map<number, RawTraceReplay> {
+  const s = getStore();
+  if (s.traceReplay) return s.traceReplay;
+  console.log('json-provider: loading agentic_trace_replay.json (this may take a moment)...');
+  const raw = loadTable<RawTraceReplay>(s.dumpDir, 'agentic_trace_replay.json');
+  const map = new Map<number, RawTraceReplay>();
+  for (const tr of raw) map.set(Number(tr.id), tr);
+  s.traceReplay = map;
+  console.log(`json-provider: loaded ${map.size} agentic_trace_replay rows`);
+  return map;
+}
+
+/** Resolve a benchmark_result id → its agentic_trace_replay row (or null). */
+function traceReplayForBenchmark(benchmarkResultId: number): RawTraceReplay | null {
+  const s = getStore();
+  const trId = s.benchmarkTraceReplayMap.get(benchmarkResultId);
+  if (trId === null || trId === undefined) return null;
+  return getTraceReplay().get(trId) ?? null;
+}
+
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
@@ -272,6 +479,32 @@ function toDateString(d: string): string {
   return d.slice(0, 10);
 }
 
+/**
+ * Render a dumped timestamptz to match Postgres `<col>::text` output, so the
+ * datasets mirrors are byte-identical to the SQL path. postgres.js decodes a
+ * timestamptz to a JS Date, which the dump serialized as ISO
+ * ("2026-07-02T09:00:00.000Z"); Postgres `::text` instead yields
+ * "2026-07-02 09:00:00+00" (space separator, no trailing ".000", "+00" offset,
+ * fractional seconds only when non-zero). Convert ISO → that form; pass through
+ * anything already in Postgres form (e.g. a dump produced without the Date step).
+ */
+const pad = (n: number, w = 2): string => String(n).padStart(w, '0');
+
+function pgTimestampText(v: string): string {
+  // Already Postgres text form (has a space date/time separator, no 'T').
+  if (!v.includes('T')) return v;
+  const d = new Date(v);
+  if (Number.isNaN(d.getTime())) return v;
+  const base =
+    `${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())} ` +
+    `${pad(d.getUTCHours())}:${pad(d.getUTCMinutes())}:${pad(d.getUTCSeconds())}`;
+  const ms = d.getUTCMilliseconds();
+  // Postgres prints fractional seconds only when non-zero (up to 6 digits;
+  // a Date carries at most ms precision, and dumps here have zero fractions).
+  const frac = ms === 0 ? '' : `.${pad(ms, 3).replace(/0+$/u, '')}`;
+  return `${base}${frac}+00`;
+}
+
 function buildRunUrl(wr: RawWorkflowRun): string | null {
   return wr.html_url ? `${wr.html_url}/attempts/${wr.run_attempt}` : null;
 }
@@ -717,3 +950,477 @@ export function getServerLog(benchmarkResultId: number): string | null {
 
   return s.serverLogs.get(logId) ?? null;
 }
+
+// ---------------------------------------------------------------------------
+// Agentic per-point mirrors (blob-backed; lazy trace_replay)
+//
+// Parity strategy: the SQL fast path reads the precomputed JSONB column
+// (aggregate_stats / chart_series / request_timeline) when its inner `version`
+// matches the current constant, else it re-derives from the gzipped blob using
+// a shared pure helper (computeChartSeries / computeRequestTimeline /
+// extract*+percentilesOf / computeDerivedFromBlob). These mirrors take the
+// same two branches so dump mode yields the same payloads: serve the stored
+// JSONB at the current version, otherwise gunzip the dumped blob and reuse the
+// identical helper (the blobs ARE in the dump). Only if a stale/missing JSONB
+// row also has no usable blob do we fall through to null — exactly as the SQL
+// path does. No version-gated payload is ever served blindly.
+// ---------------------------------------------------------------------------
+
+function blankAggregate(id: number): AgenticAggregate {
+  return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
+}
+
+/** Read a finite numeric metric out of a benchmark_results.metrics JSONB (or null). */
+function readFiniteMetric(m: Record<string, number>, key: string): number | null {
+  const v = m[key];
+  return typeof v === 'number' && Number.isFinite(v) ? v : null;
+}
+
+/**
+ * NULLS-FIRST rank for an offload_mode value, mirroring the SQL
+ * `order by … br.offload_mode nulls first`: null → rank 0, else rank 1 keyed by
+ * the string value.
+ */
+function offloadRank(v: string | null | undefined): [number, string] {
+  return v === null || v === undefined ? [0, ''] : [1, v];
+}
+
+/** conv_id ASC tie-break, matching Postgres en_US.utf8 `order by conv_id asc`. */
+function compareConvId(a: RawDatasetConversation, b: RawDatasetConversation): number {
+  return a.conv_id.localeCompare(b.conv_id);
+}
+
+/**
+ * Mirror of {@link import('./queries/agentic-aggregates.js').getAgenticAggregates}.
+ * Fast path: aggregate_stats at the current STATS_VERSION. Fallback: gunzip the
+ * profile blob for isl/osl percentiles and the server blob for KV/prefix, reusing
+ * the same extract*+percentilesOf helpers the SQL path uses.
+ */
+export function getAgenticAggregates(benchmarkResultIds: number[]): AgenticAggregateMap {
+  if (benchmarkResultIds.length === 0) return {};
+  const result: AgenticAggregateMap = {};
+  for (const id of benchmarkResultIds) {
+    const agg = blankAggregate(id);
+    const tr = traceReplayForBenchmark(id);
+    if (tr) {
+      const stats = tr.aggregate_stats as {
+        version?: number;
+        isl?: AgenticAggregate['isl'];
+        osl?: AgenticAggregate['osl'];
+        kvCacheUtil?: AgenticAggregate['kvCacheUtil'];
+        prefixCacheHitRate?: AgenticAggregate['prefixCacheHitRate'];
+      } | null;
+      if (stats && Number(stats.version) === STATS_VERSION) {
+        agg.isl = stats.isl ?? null;
+        agg.osl = stats.osl ?? null;
+        agg.kvCacheUtil = stats.kvCacheUtil ?? null;
+        agg.prefixCacheHitRate = stats.prefixCacheHitRate ?? null;
+      } else {
+        // Stale/missing precomputed stats → re-derive from the dumped blobs,
+        // reusing the exact SQL-path helpers (blobs are in the dump).
+        const profile = bufferFromJson(tr.profile_export_jsonl_gz);
+        if (profile) {
+          try {
+            const jsonl = gunzipSync(profile).toString('utf8');
+            const { isl, osl } = extractIslOsl(jsonl);
+            agg.isl = percentilesOf(isl);
+            agg.osl = percentilesOf(osl);
+          } catch {
+            // malformed blob — leave nulls
+          }
+        }
+        const server = bufferFromJson(tr.server_metrics_json_gz);
+        if (server) {
+          try {
+            const json = gunzipSync(server).toString('utf8');
+            const samples = extractServerMetricSamples(json);
+            agg.kvCacheUtil = percentilesOf(samples.kvCacheUtil);
+            agg.prefixCacheHitRate = percentilesOf(samples.prefixCacheHitRate);
+          } catch {
+            // dump-mode blobs are small (no >512 MB decompress case) — leave nulls
+          }
+        }
+      }
+    }
+    result[id] = agg;
+  }
+  return result;
+}
+
+/**
+ * Mirror of {@link import('./queries/derived-agentic-metrics.js').getDerivedAgenticMetrics}.
+ * Fast path: aggregate_stats at STATS_VERSION. Fallback: computeDerivedFromBlob
+ * over the gunzipped profile blob (same helper as the SQL path). Ids without a
+ * trace_replay row are omitted, matching the SQL join.
+ */
+export function getDerivedAgenticMetrics(benchmarkResultIds: number[]): DerivedAgenticMetricMap {
+  if (benchmarkResultIds.length === 0) return {};
+  const result: DerivedAgenticMetricMap = {};
+  for (const id of benchmarkResultIds) {
+    const tr = traceReplayForBenchmark(id);
+    if (!tr) continue; // SQL joins on trace_replay — no row → omitted
+    const stats = tr.aggregate_stats as {
+      version?: number;
+      normalizedSessionTimeS?: number | null;
+      p90PrefillTpsPerUser?: number | null;
+      normalizedE2e400?: { p75?: number | null; p90?: number | null } | null;
+    } | null;
+    if (stats && Number(stats.version) === STATS_VERSION) {
+      result[id] = {
+        id,
+        normalized_session_time_s: stats.normalizedSessionTimeS ?? null,
+        p90_prefill_tps_per_user: stats.p90PrefillTpsPerUser ?? null,
+        p75_normalized_e2e_400_s: stats.normalizedE2e400?.p75 ?? null,
+        p90_normalized_e2e_400_s: stats.normalizedE2e400?.p90 ?? null,
+      };
+      continue;
+    }
+    // Fallback: re-derive from the dumped profile blob via the shared helper.
+    const profile = bufferFromJson(tr.profile_export_jsonl_gz);
+    if (!profile) continue; // SQL fallback requires the blob to be non-null
+    try {
+      const jsonl = gunzipSync(profile).toString('utf8');
+      const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } =
+        computeDerivedFromBlob(jsonl);
+      const entry: DerivedAgenticMetric = {
+        id,
+        normalized_session_time_s,
+        p90_prefill_tps_per_user,
+        p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null,
+        p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null,
+      };
+      result[id] = entry;
+    } catch {
+      // malformed blob — omit id (SQL treats missing as "no data")
+    }
+  }
+  return result;
+}
+
+/**
+ * Mirror of {@link import('./queries/request-timeline.js').getRequestTimeline}.
+ * Fast path: request_timeline at REQUEST_TIMELINE_VERSION. Fallback:
+ * computeRequestTimeline over the profile blob (same helper as the SQL path).
+ */
+export function getRequestTimeline(benchmarkResultId: number): RequestTimeline | null {
+  const tr = traceReplayForBenchmark(benchmarkResultId);
+  if (!tr) return null;
+  const stored = tr.request_timeline as (RequestTimeline & { version?: number }) | null;
+  if (stored && Number(stored.version) === REQUEST_TIMELINE_VERSION) return stored;
+  return computeRequestTimeline(bufferFromJson(tr.profile_export_jsonl_gz));
+}
+
+/**
+ * Mirror of {@link import('./queries/trace-server-metrics.js').getTraceServerMetrics}.
+ * Fast path: chart_series at CHART_SERIES_VERSION. Fallback: computeChartSeries
+ * over the server blob (same helper as the SQL path). Returns null when the point
+ * has no server_metrics blob, matching the SQL `has_blob` gate.
+ */
+export async function getTraceServerMetrics(
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const s = getStore();
+  const br = s.benchmarks.find((b) => b.id === benchmarkResultId);
+  if (!br) return null;
+  const c = s.configs.get(br.config_id);
+  const wr = s.latestRunsById.get(br.workflow_run_id) ?? null;
+  if (!c) return null;
+  const tr = traceReplayForBenchmark(benchmarkResultId);
+  // SQL gates on (server_metrics blob present AND trace_replay_id non-null).
+  const hasServerBlob = tr ? tr.server_metrics_json_gz !== null : false;
+  if (!tr || !hasServerBlob) return null;
+
+  const num = (key: string): number | null => {
+    const v = br.metrics?.[key];
+    return typeof v === 'number' && Number.isFinite(v) ? v : null;
+  };
+  const meta: PointMeta = {
+    id: br.id,
+    hardware: c.hardware,
+    framework: c.framework,
+    model: c.model,
+    precision: c.precision,
+    spec_method: c.spec_method,
+    disagg: c.disagg,
+    conc: br.conc,
+    offload_mode: (br as { offload_mode?: string | null }).offload_mode ?? null,
+    isl: br.isl,
+    osl: br.osl,
+    benchmark_type: br.benchmark_type ?? 'single_turn',
+    date: toDateString(br.date),
+    run_url: wr ? buildRunUrl(wr) : null,
+    server_gpu_cache_hit_rate: num('server_gpu_cache_hit_rate'),
+    server_cpu_cache_hit_rate: num('server_cpu_cache_hit_rate'),
+  };
+  const kvCachePoolTokens = num('kv_cache_pool_tokens');
+
+  const merge = (series: ChartSeries): TraceServerMetrics => ({
+    meta,
+    kvCachePoolTokens,
+    startNs: series.startNs,
+    endNs: series.endNs,
+    durationS: series.durationS,
+    timeslicesCount: series.timeslicesCount,
+    kvCacheUsage: series.kvCacheUsage,
+    prefixCacheHitRate: series.prefixCacheHitRate,
+    queueDepth: series.queueDepth,
+    promptTokensBySource: series.promptTokensBySource,
+    prefillTps: series.prefillTps,
+    decodeTps: series.decodeTps,
+    prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
+    hostKvCacheUsage: series.hostKvCacheUsage ?? [],
+    kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [],
+    metricSources: series.metricSources ?? [],
+  });
+
+  const stored = tr.chart_series as (ChartSeries & { version?: number }) | null;
+  if (stored && Number(stored.version) === CHART_SERIES_VERSION) return merge(stored);
+
+  const series = await computeChartSeries(bufferFromJson(tr.server_metrics_json_gz), {
+    framework: c.framework,
+    disagg: c.disagg,
+  });
+  if (!series) return null;
+  return merge(series);
+}
+
+/**
+ * Mirror of {@link import('./queries/trace-histograms.js').getTraceHistograms}.
+ * Fast path: pull isl/osl out of a current request_timeline. Fallback: parse the
+ * profile blob's per-request input/output_sequence_length. Ids without a
+ * trace_replay row are omitted (SQL joins on it).
+ */
+export function getTraceHistograms(benchmarkResultIds: number[]): TraceHistogramMap {
+  if (benchmarkResultIds.length === 0) return {};
+  const result: TraceHistogramMap = {};
+  for (const id of benchmarkResultIds) {
+    const tr = traceReplayForBenchmark(id);
+    if (!tr) continue;
+    const timeline = tr.request_timeline as (RequestTimeline & { version?: number }) | null;
+    if (timeline && Number(timeline.version) === REQUEST_TIMELINE_VERSION) {
+      const isl: number[] = [];
+      const osl: number[] = [];
+      for (const req of timeline.requests) {
+        if (typeof req.isl === 'number' && Number.isFinite(req.isl)) isl.push(req.isl);
+        if (typeof req.osl === 'number' && Number.isFinite(req.osl)) osl.push(req.osl);
+      }
+      result[id] = { id, isl, osl } satisfies TraceHistogramPoint;
+      continue;
+    }
+    // Fallback: parse the profile blob (same field extraction the SQL path uses).
+    const profile = bufferFromJson(tr.profile_export_jsonl_gz);
+    if (!profile) continue;
+    try {
+      const jsonl = gunzipSync(profile).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      result[id] = { id, isl, osl } satisfies TraceHistogramPoint;
+    } catch {
+      // malformed blob — omit id
+    }
+  }
+  return result;
+}
+
+/**
+ * Mirror of {@link import('./queries/benchmark-siblings.js').getBenchmarkSiblings}.
+ * Plain-row logic: resolve the seed SKU, then every row in the same workflow_run
+ * sharing hw/framework/model/precision/spec_method/benchmark_type. Sort mirrors
+ * the SQL `order by decode_tp, decode_ep, offload_mode nulls first, conc`.
+ */
+export function getBenchmarkSiblings(benchmarkResultId: number): BenchmarkSiblings | null {
+  const s = getStore();
+  const seed = s.benchmarks.find((b) => b.id === benchmarkResultId);
+  if (!seed) return null;
+  const seedC = s.configs.get(seed.config_id);
+  const seedWr = s.latestRunsById.get(seed.workflow_run_id);
+  // getBenchmarkSiblings joins workflow_runs (inner) for github_run_id — a
+  // missing run yields no seed row in SQL.
+  if (!seedC || !seedWr) return null;
+  const seedType = seed.benchmark_type ?? 'single_turn';
+
+  const rows = s.benchmarks
+    .filter((b) => {
+      if (b.workflow_run_id !== seed.workflow_run_id) return false;
+      if ((b.benchmark_type ?? 'single_turn') !== seedType) return false;
+      const c = s.configs.get(b.config_id);
+      if (!c) return false;
+      return (
+        c.hardware === seedC.hardware &&
+        c.framework === seedC.framework &&
+        c.model === seedC.model &&
+        c.precision === seedC.precision &&
+        c.spec_method === seedC.spec_method
+      );
+    })
+    .map((b) => ({ b, c: s.configs.get(b.config_id)! }))
+    // ORDER BY c.decode_tp, c.decode_ep, br.offload_mode NULLS FIRST, br.conc
+    .toSorted((x, y) => {
+      if (x.c.decode_tp !== y.c.decode_tp) return x.c.decode_tp - y.c.decode_tp;
+      if (x.c.decode_ep !== y.c.decode_ep) return x.c.decode_ep - y.c.decode_ep;
+      const [xr, xv] = offloadRank((x.b as { offload_mode?: string | null }).offload_mode);
+      const [yr, yv] = offloadRank((y.b as { offload_mode?: string | null }).offload_mode);
+      if (xr !== yr) return xr - yr;
+      if (xv !== yv) return xv.localeCompare(yv);
+      return x.b.conc - y.b.conc;
+    });
+
+  const siblings = rows.map(({ b, c }) => {
+    const totalRequests =
+      readFiniteMetric(b.metrics, 'total_requests_completed') ??
+      readFiniteMetric(b.metrics, 'num_requests_total');
+    return {
+      id: b.id,
+      conc: b.conc,
+      offload_mode: (b as { offload_mode?: string | null }).offload_mode ?? null,
+      decode_tp: c.decode_tp,
+      decode_ep: c.decode_ep,
+      decode_dp_attention: c.decode_dp_attention,
+      decode_num_workers: c.decode_num_workers,
+      prefill_tp: c.prefill_tp,
+      prefill_ep: c.prefill_ep,
+      prefill_dp_attention: c.prefill_dp_attention,
+      prefill_num_workers: c.prefill_num_workers,
+      num_prefill_gpu: c.num_prefill_gpu,
+      num_decode_gpu: c.num_decode_gpu,
+      disagg: c.disagg,
+      is_multinode: c.is_multinode,
+      tput_per_gpu: readFiniteMetric(b.metrics, 'tput_per_gpu'),
+      total_requests: totalRequests,
+      is_current: b.id === benchmarkResultId,
+      has_trace: s.benchmarkTraceReplayMap.has(b.id),
+    };
+  });
+
+  return {
+    sku: {
+      hardware: seedC.hardware,
+      framework: seedC.framework,
+      model: seedC.model,
+      precision: seedC.precision,
+      spec_method: seedC.spec_method,
+      benchmark_type: seedType,
+      github_run_id: seedWr.github_run_id,
+      date: toDateString(seed.date),
+      dataset_slug: s.runDatasetSlugByRunId.get(seed.workflow_run_id) ?? null,
+    },
+    siblings,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Dataset mirrors (plain-row logic)
+// ---------------------------------------------------------------------------
+
+/** Mirror of {@link import('./queries/datasets.js').listDatasets}: newest first. */
+export function listDatasets(): DatasetRecord[] {
+  const s = getStore();
+  // ORDER BY ingested_at DESC, slug ASC. ingested_at is an ISO string.
+  const sorted = s.datasets.toSorted((a, b) => {
+    const t = b.ingested_at.localeCompare(a.ingested_at);
+    return t === 0 ? a.slug.localeCompare(b.slug) : t;
+  });
+  return sorted.map((d) => ({
+    id: d.id,
+    slug: d.slug,
+    label: d.label,
+    variant: d.variant,
+    description: d.description,
+    hf_url: d.hf_url,
+    license: d.license,
+    conversation_count: Number(d.conversation_count),
+    summary: d.summary,
+    ingested_at: pgTimestampText(d.ingested_at),
+  }));
+}
+
+/** Mirror of {@link import('./queries/datasets.js').getDataset}: one dataset incl. chart_data. */
+export function getDataset(slug: string): DatasetDetail | null {
+  const s = getStore();
+  const d = s.datasetsBySlug.get(slug);
+  if (!d) return null;
+  return {
+    id: d.id,
+    slug: d.slug,
+    label: d.label,
+    variant: d.variant,
+    description: d.description,
+    hf_url: d.hf_url,
+    license: d.license,
+    conversation_count: Number(d.conversation_count),
+    summary: d.summary,
+    chart_data: d.chart_data,
+    ingested_at: pgTimestampText(d.ingested_at),
+  };
+}
+
+const CONVERSATIONS_MAX_LIMIT = 200;
+
+/**
+ * Mirror of {@link import('./queries/datasets.js').listConversations}. Applies
+ * the same ILIKE (case-insensitive substring) search, sort (tokens/turns/
+ * subagents/id), limit clamp (1..200), and offset the SQL uses. `total`
+ * reflects the filtered count before pagination.
+ */
+export function listConversations(
+  slug: string,
+  opts: ListConversationsOpts = {},
+): ConversationList | null {
+  const s = getStore();
+  const dataset = s.datasetsBySlug.get(slug);
+  if (!dataset) return null;
+
+  const limit = Math.min(CONVERSATIONS_MAX_LIMIT, Math.max(1, opts.limit ?? 50));
+  const offset = Math.max(0, opts.offset ?? 0);
+  const search = opts.search?.trim();
+  const needle = search ? search.toLowerCase() : null;
+
+  const filtered = s.datasetConversations.filter(
+    (dc) =>
+      dc.dataset_id === dataset.id &&
+      (needle === null || dc.conv_id.toLowerCase().includes(needle)),
+  );
+  const total = filtered.length;
+
+  // ORDER BY <sort key> [DESC], conv_id ASC — replicate the SQL tie-break.
+  const sort = opts.sort ?? 'tokens';
+  const sorted = filtered.toSorted((a, b) => {
+    if (sort === 'turns') return b.num_turns - a.num_turns || compareConvId(a, b);
+    if (sort === 'subagents')
+      return b.num_subagent_groups - a.num_subagent_groups || compareConvId(a, b);
+    if (sort === 'id') return compareConvId(a, b);
+    return b.total_in - a.total_in || compareConvId(a, b); // 'tokens' (default)
+  });
+
+  const items: ConversationListItem[] = sorted.slice(offset, offset + limit).map((dc) => ({
+    conv_id: dc.conv_id,
+    models: dc.models,
+    num_turns: Number(dc.num_turns),
+    num_subagent_groups: Number(dc.num_subagent_groups),
+    total_in: Number(dc.total_in),
+    total_out: Number(dc.total_out),
+    total_cached: Number(dc.total_cached),
+  }));
+
+  return { total, items };
+}
+
+/** Mirror of {@link import('./queries/datasets.js').getConversation}: one flamegraph. */
+export function getConversation(slug: string, convId: string): ConversationDetail | null {
+  const s = getStore();
+  const dataset = s.datasetsBySlug.get(slug);
+  if (!dataset) return null;
+  const dc = s.datasetConversations.find(
+    (r) => r.dataset_id === dataset.id && r.conv_id === convId,
+  );
+  if (!dc) return null;
+  return {
+    conv_id: dc.conv_id,
+    models: dc.models,
+    num_turns: Number(dc.num_turns),
+    num_subagent_groups: Number(dc.num_subagent_groups),
+    total_in: Number(dc.total_in),
+    total_out: Number(dc.total_out),
+    total_cached: Number(dc.total_cached),
+    structure: dc.structure as unknown as ConversationStructure,
+  };
+}
diff --git a/packages/db/src/load-dump.ts b/packages/db/src/load-dump.ts
index b1b4af70..108627b6 100644
--- a/packages/db/src/load-dump.ts
+++ b/packages/db/src/load-dump.ts
@@ -22,20 +22,44 @@ import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils';
 
 const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1 });
 
-// Tables with serial/bigserial PKs that need sequence resets
+// Tables with serial/bigserial PKs that need sequence resets.
+// (datasets.id is text and run_datasets.workflow_run_id is a plain bigint FK —
+// neither owns a sequence, so they're intentionally omitted.)
 const SEQUENCES: { seq: string; table: string; col: string }[] = [
   { seq: 'configs_id_seq', table: TABLE_NAMES.configs, col: 'id' },
   { seq: 'server_logs_id_seq', table: TABLE_NAMES.serverLogs, col: 'id' },
   { seq: 'workflow_runs_id_seq', table: TABLE_NAMES.workflowRuns, col: 'id' },
+  { seq: 'agentic_trace_replay_id_seq', table: TABLE_NAMES.agenticTraceReplay, col: 'id' },
   { seq: 'benchmark_results_id_seq', table: TABLE_NAMES.benchmarkResults, col: 'id' },
   { seq: 'eval_results_id_seq', table: TABLE_NAMES.evalResults, col: 'id' },
   { seq: 'eval_samples_id_seq', table: TABLE_NAMES.evalSamples, col: 'id' },
   { seq: 'run_stats_id_seq', table: TABLE_NAMES.runStats, col: 'id' },
   { seq: 'changelog_entries_id_seq', table: TABLE_NAMES.changelogEntries, col: 'id' },
+  {
+    seq: 'dataset_conversations_id_seq',
+    table: TABLE_NAMES.datasetConversations,
+    col: 'id',
+  },
 ];
 
 const BATCH_SIZE = 500;
 
+/** The JSON shape Buffer.prototype.toJSON() emits (what dump-db writes for bytea). */
+interface BufferJson {
+  type: 'Buffer';
+  data: number[];
+}
+
+/** True for a `{ type: 'Buffer', data: number[] }` object (a serialized bytea). */
+function isBufferJson(val: unknown): val is BufferJson {
+  return (
+    typeof val === 'object' &&
+    val !== null &&
+    (val as { type?: unknown }).type === 'Buffer' &&
+    Array.isArray((val as { data?: unknown }).data)
+  );
+}
+
 /**
  * Stream-parse a JSON array file, yielding objects one at a time.
  * Avoids loading the entire file into memory.
@@ -118,18 +142,36 @@ async function loadTable(dumpDir: string, table: string): Promise<number> {
   const flush = async () => {
     if (batch.length === 0 || !columns) return;
 
-    // Track which columns have plain-object values (JSONB) for casting
-    const jsonbCols = new Set<number>();
-    const values: unknown[][] = batch.map((row) =>
+    // Track which columns need a per-value cast. JSONB columns pass objects
+    // as-is under a `::jsonb` cast; BYTEA columns are reconstructed into a real
+    // Node Buffer under a `::bytea` cast. Casts are tracked per (row, col) —
+    // not per column — because a nullable blob/jsonb column can be null on some
+    // rows and populated on others within the same batch, and a NULL param
+    // needs no cast (Postgres would reject `NULL::bytea` from an untyped param
+    // only in edge cases, but more importantly the cast set must match the
+    // value actually bound for that cell).
+    const jsonbCells = new Set<string>();
+    const byteaCells = new Set<string>();
+    const values: unknown[][] = batch.map((row, rowIdx) =>
       columns!.map((col, colIdx) => {
         const val = row[col];
         if (val === null || val === undefined) return null;
         // Postgres text[] arrays: convert JSON ["a","b"] → Postgres {a,b} literal
         if (Array.isArray(val) && val.every((v) => typeof v === 'string'))
           return `{${(val as string[]).map((v) => `"${v.replaceAll('\\', String.raw`\\`).replaceAll('"', String.raw`\"`)}"`).join(',')}}`;
+        // BYTEA columns: dump-db.ts serialized the postgres.js Buffer via
+        // Buffer.prototype.toJSON() → {"type":"Buffer","data":[…]}. Rebuild the
+        // Buffer and bind it under a ::bytea cast so the blob round-trips
+        // byte-for-byte (agentic_trace_replay.*_gz / server_metrics_csv). Must
+        // be checked BEFORE the generic object→jsonb branch, or the blob would
+        // be mis-cast to jsonb and corrupt on insert.
+        if (isBufferJson(val)) {
+          byteaCells.add(`${rowIdx}:${colIdx}`);
+          return Buffer.from((val as BufferJson).data);
+        }
         // JSONB columns: pass objects as-is (sql.unsafe serializes them correctly with ::jsonb cast)
         if (typeof val === 'object') {
-          jsonbCols.add(colIdx);
+          jsonbCells.add(`${rowIdx}:${colIdx}`);
           return val;
         }
         return val as string | number | boolean;
@@ -143,7 +185,9 @@ async function loadTable(dumpDir: string, table: string): Promise<number> {
           `(${columns!
             .map((_col, j) => {
               const p = `$${i * columns!.length + j + 1}`;
-              return jsonbCols.has(j) ? `${p}::jsonb` : p;
+              if (byteaCells.has(`${i}:${j}`)) return `${p}::bytea`;
+              if (jsonbCells.has(`${i}:${j}`)) return `${p}::jsonb`;
+              return p;
             })
             .join(', ')})`,
       )
diff --git a/packages/db/src/reset-db.ts b/packages/db/src/reset-db.ts
index a895617c..760eb4c2 100644
--- a/packages/db/src/reset-db.ts
+++ b/packages/db/src/reset-db.ts
@@ -20,8 +20,9 @@ const sql = createAdminSql({
 async function reset(): Promise<void> {
   console.log('=== db:reset ===');
   console.log(
-    'This will DROP all tables (configs, workflow_runs, benchmark_results,\n' +
-      'server_logs, run_stats, eval_results, changelog_entries, availability, schema_migrations).\n' +
+    'This will DROP all tables (configs, workflow_runs, agentic_trace_replay,\n' +
+      'benchmark_results, server_logs, run_stats, eval_results, changelog_entries,\n' +
+      'availability, datasets, dataset_conversations, run_datasets, schema_migrations).\n' +
       'You must run db:migrate after this before ingesting data.\n',
   );
 
@@ -37,10 +38,15 @@ async function reset(): Promise<void> {
 
   await sql`DROP MATERIALIZED VIEW IF EXISTS latest_benchmarks`;
   await sql`DROP VIEW IF EXISTS latest_workflow_runs`;
+  // Child-before-parent order (CASCADE handles the rest, but keep it FK-safe).
   await sql`DROP TABLE IF EXISTS
+    ${sql(TABLE_NAMES.runDatasets)},
+    ${sql(TABLE_NAMES.datasetConversations)},
+    ${sql(TABLE_NAMES.datasets)},
     ${sql(TABLE_NAMES.changelogEntries)},
     ${sql(TABLE_NAMES.evalResults)},
     ${sql(TABLE_NAMES.benchmarkResults)},
+    ${sql(TABLE_NAMES.agenticTraceReplay)},
     ${sql(TABLE_NAMES.serverLogs)},
     ${sql(TABLE_NAMES.runStats)},
     ${sql(TABLE_NAMES.availability)},

From d1dd59f12d64d5cee38070bbbc20bf6fa08020ec Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 18:35:16 -0500
Subject: [PATCH 29/40] fix(inference): gate agentic default sequence on
 availability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AgenticTraces default resolved before availability loaded (static
SEQUENCE_OPTIONS fallback), so fixed-seq-only models flashed 'Agentic
Traces', fired a wasted agentic fetch, then snapped to 1k/1k. New pure
resolveEffectiveSequence helper (mirrors default-precisions pattern)
returns the real scenario only once availability is known; benchmark
fetching gates on the new sequenceResolved flag; non-agentic models
fall back to 8k/1k (master's default) when available.

Fixes the url-params and historical-trends e2e failures the PR
description labels 'pre-existing' — they were caused by this default
and now pass with no assertion changes. ttft-x-axis-toggle gets
spec-scoped agentic intercepts (shared fixtures have no agentic rows).
Verified live: llama70b -> 8K/1K, zero agentic calls, one benchmarks
fetch; dsr1 -> Agentic Traces, one fetch.
---
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  | 109 ++++++++++++++++
 packages/app/cypress/support/mock-data.ts     |   3 +
 .../src/components/GlobalFilterContext.tsx    |  65 +++++++++-
 .../components/inference/InferenceContext.tsx |   8 +-
 packages/app/src/lib/default-sequence.test.ts | 119 ++++++++++++++++++
 packages/app/src/lib/default-sequence.ts      |  52 ++++++++
 6 files changed, 349 insertions(+), 7 deletions(-)
 create mode 100644 packages/app/src/lib/default-sequence.test.ts
 create mode 100644 packages/app/src/lib/default-sequence.ts

diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index 924ff9a9..c634cd27 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -18,8 +18,117 @@ const interceptDerivedMetrics = () => {
   }).as('derivedAgenticMetrics');
 };
 
+// This spec exercises the agentic x-axis modes, which only exist when the
+// selected model resolves to the Agentic Traces scenario. The default e2e
+// fixtures (cypress/fixtures/api/*.json) have NO agentic rows for any model, so
+// after the availability-gated effectiveSequence fix the bare-/inference default
+// correctly resolves to a fixed-seq scenario. We therefore inject agentic
+// availability + benchmark rows for the default model VIA SPEC-SCOPED INTERCEPTS
+// (not the shared fixtures) so this test — and only this test — sees the agentic
+// view. Scoping to intercepts keeps every other spec's default fixed-seq.
+const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro is the default model
+const AGENTIC_DATE = '2026-06-12';
+
+// Percentile ladder for one metric family (median/p75/p90/p95/p99/std).
+const percentileLadder = (prefix: string, base: number): Record<string, number> => ({
+  [`median_${prefix}`]: base,
+  [`p75_${prefix}`]: base * 1.2,
+  [`p90_${prefix}`]: base * 1.5,
+  [`p95_${prefix}`]: base * 1.7,
+  [`p99_${prefix}`]: base * 2.2,
+  [`std_${prefix}`]: base * 0.3,
+});
+
+const agenticMetrics = (conc: number): Record<string, number> => {
+  const scale = conc / 16;
+  const itl = 0.011 * scale;
+  return {
+    ...percentileLadder('ttft', 0.4 * scale),
+    ...percentileLadder('tpot', 0.012 * scale),
+    ...percentileLadder('itl', itl),
+    ...percentileLadder('e2el', 8 * scale),
+    median_intvty: 1 / itl,
+    p75_intvty: 1 / (itl * 1.2),
+    p90_intvty: 1 / (itl * 1.5),
+    p99_intvty: 1 / (itl * 2.2),
+    std_intvty: (1 / itl) * 0.1,
+    tput_per_gpu: 950 / Math.sqrt(scale),
+    output_tput_per_gpu: 210,
+    input_tput_per_gpu: 740,
+    total_tput_tps: 7600 * conc * 0.05,
+  };
+};
+
+const agenticGpus = [
+  { hardware: 'b200', framework: 'vllm', disagg: false },
+  { hardware: 'b300', framework: 'vllm', disagg: false },
+];
+
+// Availability: default model has BOTH agentic and fixed-seq, so the default
+// resolves to agentic (the product-intended, agentic-preferred behavior).
+const agenticAvailability = [
+  ...agenticGpus.map((g) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: null,
+    osl: null,
+    precision: 'fp4',
+    hardware: g.hardware,
+    framework: g.framework,
+    spec_method: 'none',
+    disagg: g.disagg,
+    benchmark_type: 'agentic_traces',
+    date: AGENTIC_DATE,
+  })),
+  ...agenticGpus.map((g) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: 8192,
+    osl: 1024,
+    precision: 'fp4',
+    hardware: g.hardware,
+    framework: g.framework,
+    spec_method: 'none',
+    disagg: g.disagg,
+    benchmark_type: 'single_turn',
+    date: AGENTIC_DATE,
+  })),
+];
+
+let benchIdCursor = 900000;
+const agenticBenchmarks = agenticGpus.flatMap((g) =>
+  [16, 64, 128].map((conc) => ({
+    id: benchIdCursor++,
+    hardware: g.hardware,
+    framework: g.framework,
+    model: DEFAULT_MODEL_DB_KEY,
+    precision: 'fp4',
+    spec_method: 'none',
+    disagg: g.disagg,
+    is_multinode: false,
+    prefill_tp: 8,
+    decode_tp: 8,
+    num_prefill_gpu: 8,
+    num_decode_gpu: 8,
+    isl: null,
+    osl: null,
+    conc,
+    offload_mode: 'off',
+    benchmark_type: 'agentic_traces',
+    image: 'vllm/vllm-openai:v0.9.0',
+    metrics: agenticMetrics(conc),
+    workers: null,
+    date: AGENTIC_DATE,
+    run_url: null,
+  })),
+);
+
+const interceptAgenticData = () => {
+  cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as('availability');
+  cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('benchmarks');
+};
+
 describe('X-Axis Mode Toggle (inference chart)', () => {
   before(() => {
+    interceptAgenticData();
     cy.visit('/inference', {
       onBeforeLoad(win) {
         win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index b2164bcc..490fca87 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -423,6 +423,9 @@ export function createMockGlobalFilterContext(
     selectedPrecisions: [Precision.FP4],
     setSelectedPrecisions: namedStub('setSelectedPrecisions_global'),
     effectiveSequence: Sequence.EightK_OneK,
+    // Mocks represent a settled state: availability is known and the sequence is
+    // resolved. Tests exercising the pre-availability window override this.
+    sequenceResolved: true,
     effectivePrecisions: [Precision.FP4],
     selectedRunDate: '2025-03-01',
     setSelectedRunDate: namedStub('setSelectedRunDate_global'),
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index fddf7871..e7aa751c 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -24,6 +24,14 @@ function isEnumValue<T extends Record<string, string>>(e: T, v: string): v is T[
 const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u;
 const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u;
 
+// Placeholder for the public (non-null) `effectiveSequence` during the window
+// before availability has loaded. It must be a fixed-seq scenario — never
+// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a
+// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is
+// the pre-agentic default for non-agentic models. Consumers that must not act on
+// an unresolved sequence gate on `sequenceResolved` instead.
+const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK;
+
 import { useAvailability } from '@/hooks/api/use-availability';
 import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
 import { useUrlState } from '@/hooks/useUrlState';
@@ -38,6 +46,7 @@ import {
 } from '@/lib/data-mappings';
 import { computeAutoSwitchDecision } from '@/lib/unofficial-run-auto-switch';
 import { countCurvesByPrecision, resolveEffectivePrecisions } from '@/lib/default-precisions';
+import { resolveEffectiveSequence } from '@/lib/default-sequence';
 import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api';
 
 interface RunInfo {
@@ -66,6 +75,15 @@ export interface GlobalFilterContextType {
 
   // Effective (validated) values
   effectiveSequence: Sequence;
+  /**
+   * Whether `effectiveSequence` reflects the selected model's real availability
+   * (DB or unofficial run) rather than the pre-load placeholder. False during
+   * the brief window before availability loads. Consumers that trigger data
+   * fetches or render sequence-dependent labels should gate on this so a
+   * fixed-seq-only model never fires an agentic fetch or flashes "Agentic
+   * Traces" before availability settles.
+   */
+  sequenceResolved: boolean;
   effectivePrecisions: string[];
 
   // Run date & run ID
@@ -288,11 +306,39 @@ export function GlobalFilterProvider({
     return merged.length > 0 ? merged : SEQUENCE_OPTIONS;
   }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]);
 
-  // Synchronously validated sequence
-  const effectiveSequence = useMemo(() => {
-    if (availableSequences.includes(selectedSequence)) return selectedSequence;
-    return availableSequences[0] ?? selectedSequence;
-  }, [availableSequences, selectedSequence]);
+  // Whether we actually know the selected model's sequences yet. Availability
+  // may arrive from the DB (`availabilityRows`) OR from a loaded unofficial run
+  // (`unofficialAvailable` for this model) — either source lets us resolve a
+  // trustworthy effectiveSequence. Until then `availableSequences` is the static
+  // SEQUENCE_OPTIONS fallback (which contains AgenticTraces), so resolving
+  // eagerly would fetch + label an agentic scenario for fixed-seq-only models,
+  // then snap once availability lands (flash + wasted request).
+  const availabilityLoaded = useMemo(
+    () =>
+      availabilityRows !== undefined || unofficialAvailable.some((a) => a.model === selectedModel),
+    [availabilityRows, unofficialAvailable, selectedModel],
+  );
+
+  // Synchronously validated sequence.
+  //
+  // `resolveEffectiveSequence` returns null while availability is still loading
+  // — we surface that as `sequenceResolved` so InferenceContext can gate the
+  // benchmark fetch until the real sequence is known (no agentic fetch fires for
+  // a fixed-seq-only model). For the non-null public `effectiveSequence` value
+  // we substitute a fixed-seq scenario (never AgenticTraces) during that window
+  // so the scenario selector never flashes "Agentic Traces"; the chart shows its
+  // normal loading skeleton until `sequenceResolved` flips true.
+  const resolvedSequence = useMemo(
+    () =>
+      resolveEffectiveSequence({
+        selectedSequence,
+        availableSequences,
+        availabilityLoaded,
+      }),
+    [selectedSequence, availableSequences, availabilityLoaded],
+  );
+  const sequenceResolved = resolvedSequence !== null;
+  const effectiveSequence = resolvedSequence ?? PRE_AVAILABILITY_SEQUENCE;
 
   // Precisions available for the selected model + sequence (DB ∪ unofficial run)
   const availablePrecisions = useMemo(() => {
@@ -439,7 +485,11 @@ export function GlobalFilterProvider({
       g_model: selectedModel,
       g_rundate: selectedRunDate,
       g_runid: selectedRunId,
-      i_seq: effectiveSequence,
+      // Don't pin the sequence to the URL until it's resolved from real
+      // availability — writing the pre-load placeholder (8k/1k) would clobber a
+      // shared `?i_seq=agentic-traces` link before the model's availability
+      // confirms it has agentic data.
+      i_seq: sequenceResolved ? effectiveSequence : undefined,
       // Only pin the precision in the URL once chosen explicitly; in auto mode
       // leave it out so the link keeps following the per-model densest default.
       i_prec: precisionExplicit ? effectivePrecisions.join(',') : undefined,
@@ -449,6 +499,7 @@ export function GlobalFilterProvider({
     selectedRunDate,
     selectedRunId,
     effectiveSequence,
+    sequenceResolved,
     effectivePrecisions,
     precisionExplicit,
     setUrlParams,
@@ -463,6 +514,7 @@ export function GlobalFilterProvider({
       selectedPrecisions,
       setSelectedPrecisions,
       effectiveSequence,
+      sequenceResolved,
       effectivePrecisions,
       selectedRunDate: effectiveRunDate,
       setSelectedRunDate: setSelectedRunDateManual,
@@ -485,6 +537,7 @@ export function GlobalFilterProvider({
       selectedSequence,
       selectedPrecisions,
       effectiveSequence,
+      sequenceResolved,
       effectivePrecisions,
       effectiveRunDate,
       setSelectedRunDateManual,
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 98962126..b9cbc7ce 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -110,6 +110,7 @@ export function InferenceProvider({
     selectedModel,
     setSelectedModel,
     effectiveSequence,
+    sequenceResolved,
     setSelectedSequence,
     effectivePrecisions,
     setSelectedPrecisions,
@@ -414,7 +415,12 @@ export function InferenceProvider({
     userCosts,
     userPowers,
     effectiveRunDate,
-    isActive,
+    // Gate benchmark fetching on sequenceResolved: before availability loads we
+    // don't yet know the model's real sequence, and the selectedSequence default
+    // is AgenticTraces. Fetching now would fire the agentic data path for a
+    // fixed-seq-only model, then refetch once availability snaps the sequence.
+    // The chart's normal loading state covers this brief window.
+    isActive && sequenceResolved,
     latestDate,
     selectedPercentile,
     compareGpuPair ?? null,
diff --git a/packages/app/src/lib/default-sequence.test.ts b/packages/app/src/lib/default-sequence.test.ts
new file mode 100644
index 00000000..4fd8a6b9
--- /dev/null
+++ b/packages/app/src/lib/default-sequence.test.ts
@@ -0,0 +1,119 @@
+import { describe, expect, it } from 'vitest';
+
+import { Sequence } from './data-mappings';
+import { resolveEffectiveSequence } from './default-sequence';
+
+describe('resolveEffectiveSequence', () => {
+  describe('availability gate (rule 1)', () => {
+    it('returns null while availability has not loaded, even if the selection looks valid', () => {
+      // Pre-availability, availableSequences is the static fallback (which
+      // contains AgenticTraces). Resolving here would fetch + label an agentic
+      // scenario for a fixed-seq-only model, so we hold off.
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.AgenticTraces,
+          availableSequences: [
+            Sequence.OneK_OneK,
+            Sequence.OneK_EightK,
+            Sequence.EightK_OneK,
+            Sequence.AgenticTraces,
+          ],
+          availabilityLoaded: false,
+        }),
+      ).toBeNull();
+    });
+
+    it('returns null pre-availability regardless of the selected sequence', () => {
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.EightK_OneK,
+          availableSequences: [Sequence.EightK_OneK],
+          availabilityLoaded: false,
+        }),
+      ).toBeNull();
+    });
+  });
+
+  describe('honors a valid selection (rule 2a)', () => {
+    it('keeps AgenticTraces when the model actually has agentic data (dsr1 case)', () => {
+      // DeepSeek-R1 in the seeded DB has both agentic and 8k/1k — the agentic
+      // default must survive so the PR intent (agentic-preferred) holds.
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.AgenticTraces,
+          availableSequences: [Sequence.EightK_OneK, Sequence.AgenticTraces],
+          availabilityLoaded: true,
+        }),
+      ).toBe(Sequence.AgenticTraces);
+    });
+
+    it('keeps a fixed-seq selection when available', () => {
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.OneK_OneK,
+          availableSequences: [Sequence.OneK_OneK, Sequence.EightK_OneK],
+          availabilityLoaded: true,
+        }),
+      ).toBe(Sequence.OneK_OneK);
+    });
+  });
+
+  describe('fallback ordering when the selection is unavailable (rule 2b/2c)', () => {
+    it('for a fixed-seq-only model, agentic default falls back to 8k/1k, not the raw first entry (llama70b case)', () => {
+      // Llama-3.3-70B has only 8k/1k in the seeded DB. The agentic default is
+      // unavailable, so it must resolve to a fixed-seq scenario — here the sole
+      // available one.
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.AgenticTraces,
+          availableSequences: [Sequence.EightK_OneK],
+          availabilityLoaded: true,
+        }),
+      ).toBe(Sequence.EightK_OneK);
+    });
+
+    it('prefers 8k/1k over availableSequences[0] when both 1k/1k and 8k/1k exist', () => {
+      // DB row order can surface 1k/1k first. Master defaulted non-agentic
+      // models to 8k/1k, so prefer it rather than snapping to 1k/1k.
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.AgenticTraces,
+          availableSequences: [Sequence.OneK_OneK, Sequence.EightK_OneK],
+          availabilityLoaded: true,
+        }),
+      ).toBe(Sequence.EightK_OneK);
+    });
+
+    it('falls back to availableSequences[0] when 8k/1k is not available', () => {
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.AgenticTraces,
+          availableSequences: [Sequence.OneK_OneK, Sequence.OneK_EightK],
+          availabilityLoaded: true,
+        }),
+      ).toBe(Sequence.OneK_OneK);
+    });
+
+    it('never resolves to AgenticTraces via fallback when the model lacks it', () => {
+      const result = resolveEffectiveSequence({
+        selectedSequence: Sequence.AgenticTraces,
+        availableSequences: [Sequence.OneK_OneK, Sequence.OneK_EightK, Sequence.EightK_OneK],
+        availabilityLoaded: true,
+      });
+      expect(result).not.toBe(Sequence.AgenticTraces);
+      expect(result).toBe(Sequence.EightK_OneK);
+    });
+
+    it('returns the selection itself when the model has no sequences at all', () => {
+      // Degenerate case: keeps a non-null value so the type contract holds; the
+      // chart shows empty. (availabilityLoaded true but zero sequences.)
+      expect(
+        resolveEffectiveSequence({
+          selectedSequence: Sequence.OneK_OneK,
+          availableSequences: [],
+          availabilityLoaded: true,
+        }),
+      ).toBe(Sequence.OneK_OneK);
+    });
+  });
+});
diff --git a/packages/app/src/lib/default-sequence.ts b/packages/app/src/lib/default-sequence.ts
new file mode 100644
index 00000000..d06a5307
--- /dev/null
+++ b/packages/app/src/lib/default-sequence.ts
@@ -0,0 +1,52 @@
+import { Sequence } from './data-mappings';
+
+/**
+ * Effective-sequence resolution.
+ *
+ * `selectedSequence` defaults to {@link Sequence.AgenticTraces} (a deliberate
+ * product choice — agentic-preferred), but not every model has agentic data.
+ * This helper turns the raw user/default selection into the sequence the chart
+ * should actually render, given what the selected model offers.
+ *
+ * Two rules, in order:
+ *
+ * 1. **Availability gate.** Until availability rows have loaded we do NOT know
+ *    which sequences the model has. Resolving eagerly here would pick the static
+ *    fallback list (which contains AgenticTraces) and make the page fetch + label
+ *    an agentic scenario for fixed-seq-only models (e.g. Llama-3.3-70B), then
+ *    snap to a fixed-seq scenario once availability arrives — a visible flash of
+ *    "Agentic Traces" plus a wasted request. When `availabilityLoaded` is false
+ *    we return `null`; callers gate data fetching and selector display on a
+ *    non-null result (a loading skeleton covers this window, which is short).
+ *
+ * 2. **Fallback ordering.** Once availability is known: keep the user's
+ *    `selectedSequence` if the model has it. Otherwise fall back to a sensible
+ *    fixed-seq scenario. `availableSequences[0]` follows DB row order, which can
+ *    surface `1k/1k` even when `8k/1k` exists — but `8k/1k` was the pre-agentic
+ *    default for non-agentic models, so prefer it when present to match that
+ *    long-standing behavior. Only if neither the selection nor `8k/1k` is
+ *    available do we fall to `availableSequences[0]`.
+ */
+export function resolveEffectiveSequence({
+  selectedSequence,
+  availableSequences,
+  availabilityLoaded,
+}: {
+  selectedSequence: Sequence;
+  availableSequences: Sequence[];
+  availabilityLoaded: boolean;
+}): Sequence | null {
+  // Rule 1: do not commit to a sequence before we know what the model has.
+  if (!availabilityLoaded) return null;
+
+  // Rule 2a: honor the user's / default selection when the model supports it.
+  if (availableSequences.includes(selectedSequence)) return selectedSequence;
+
+  // Rule 2b: prefer 8k/1k (the pre-agentic default for non-agentic models) over
+  // whatever availableSequences[0] happens to be (DB row order can yield 1k/1k).
+  if (availableSequences.includes(Sequence.EightK_OneK)) return Sequence.EightK_OneK;
+
+  // Rule 2c: last resort — first available, or the selection itself if the model
+  // has no sequences at all (keeps the type non-null; downstream shows empty).
+  return availableSequences[0] ?? selectedSequence;
+}

From 1d4b027fee797e2ffe23d700b749c7973d17d4b2 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 18:45:30 -0500
Subject: [PATCH 30/40] fix(datasets): escape LIKE wildcards and cap
 conversation search length

The public conversation search embedded user input in ILIKE unescaped
and uncapped: '%' matched every row and long stacked-wildcard patterns
could push Neon to statement timeout (500s). escapeLikePattern escapes
backslash-first then %/_ so searches are literal substring matches
(now agreeing exactly with the dump-mode mirror's .includes semantics);
the route trims and rejects >100 chars with 400 before touching the DB.
Live: ?search=%25 30 -> 0 rows; 150-char input -> 400; real searches
unchanged. Adds 14 tests.
---
 .../[slug]/conversations/route.test.ts        | 116 ++++++++++++++++++
 .../v1/datasets/[slug]/conversations/route.ts |  16 ++-
 .../json-provider.agentic-datasets.test.ts    |  17 +++
 packages/db/src/queries/datasets.test.ts      |  32 ++++-
 packages/db/src/queries/datasets.ts           |  25 +++-
 5 files changed, 203 insertions(+), 3 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts

diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts
new file mode 100644
index 00000000..b582e79c
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts
@@ -0,0 +1,116 @@
+import { describe, expect, it, vi, beforeEach } from 'vitest';
+
+const { mockListConversations, mockGetDb } = vi.hoisted(() => ({
+  mockListConversations: vi.fn(),
+  mockGetDb: vi.fn(() => 'mock-sql'),
+}));
+
+vi.mock('@semianalysisai/inferencex-db/connection', () => ({
+  getDb: mockGetDb,
+  JSON_MODE: false,
+  FIXTURES_MODE: false,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({
+  listConversations: mockListConversations,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({
+  listConversations: vi.fn(),
+}));
+
+vi.mock('@/lib/api-cache', () => ({
+  cachedQuery: (fn: (...args: any[]) => any) => fn,
+  cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { GET } from './route';
+import { NextRequest } from 'next/server';
+
+function req(path: string): NextRequest {
+  return new NextRequest(new URL(path, 'http://localhost'));
+}
+
+const PARAMS = Promise.resolve({ slug: 'test-dataset' });
+
+beforeEach(() => {
+  vi.clearAllMocks();
+});
+
+describe('GET /api/v1/datasets/[slug]/conversations — search input validation', () => {
+  it('returns 400 when search exceeds 100 characters', async () => {
+    const longSearch = 'a'.repeat(101);
+    const res = await GET(req(`/api/v1/datasets/test-dataset/conversations?search=${longSearch}`), {
+      params: PARAMS,
+    });
+    expect(res.status).toBe(400);
+    const body = await res.json();
+    expect(body.error).toBe('search too long');
+    // DB must not be called.
+    expect(mockListConversations).not.toHaveBeenCalled();
+  });
+
+  it('accepts a search string exactly at the 100-character limit', async () => {
+    const exactSearch = 'a'.repeat(100);
+    mockListConversations.mockResolvedValueOnce({ total: 0, items: [] });
+    const res = await GET(
+      req(`/api/v1/datasets/test-dataset/conversations?search=${exactSearch}`),
+      { params: PARAMS },
+    );
+    expect(res.status).toBe(200);
+  });
+
+  it('trims whitespace before applying the length check', async () => {
+    // A 101-char string that is 100 chars of spaces + 1 real char should become
+    // 1 char after trimming — well under the limit.
+    const paddedSearch = `${' '.repeat(100)}a`;
+    mockListConversations.mockResolvedValueOnce({ total: 1, items: [] });
+    const res = await GET(
+      req(`/api/v1/datasets/test-dataset/conversations?search=${paddedSearch}`),
+      { params: PARAMS },
+    );
+    expect(res.status).toBe(200);
+    expect(mockListConversations).toHaveBeenCalledWith(
+      'mock-sql',
+      'test-dataset',
+      expect.objectContaining({ search: 'a' }),
+    );
+  });
+
+  it('returns 404 when the dataset slug is unknown', async () => {
+    mockListConversations.mockResolvedValueOnce(null);
+    const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), {
+      params: PARAMS,
+    });
+    expect(res.status).toBe(404);
+    const body = await res.json();
+    expect(body.error).toBe('Not found');
+  });
+
+  it('returns conversation data for a valid request', async () => {
+    const mockData = { total: 2, items: [{ conv_id: 'c1' }, { conv_id: 'c2' }] };
+    mockListConversations.mockResolvedValueOnce(mockData);
+    const res = await GET(
+      req('/api/v1/datasets/test-dataset/conversations?search=agent&sort=turns&limit=10&offset=0'),
+      { params: PARAMS },
+    );
+    expect(res.status).toBe(200);
+    const body = await res.json();
+    expect(body).toEqual(mockData);
+    expect(mockListConversations).toHaveBeenCalledWith(
+      'mock-sql',
+      'test-dataset',
+      expect.objectContaining({ search: 'agent', sort: 'turns', limit: 10, offset: 0 }),
+    );
+  });
+
+  it('returns 500 when the query throws', async () => {
+    mockListConversations.mockRejectedValueOnce(new Error('Neon timeout'));
+    const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), {
+      params: PARAMS,
+    });
+    expect(res.status).toBe(500);
+    const body = await res.json();
+    expect(body.error).toBe('Internal server error');
+  });
+});
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
index 196c29d6..2dad4ace 100644
--- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
@@ -34,6 +34,13 @@ const getCachedConversations = cachedQuery(
   'dataset-conversations',
 );
 
+// Maximum search string length accepted. Longer strings are rejected with 400
+// rather than being forwarded to the DB: an ILIKE on an unindexed conv_id column
+// with a very long pattern (or many stacked wildcards) can exhaust Neon's
+// statement timeout and return a 500. 100 chars is generous for any real
+// conversation-id prefix while keeping the attack surface small.
+const MAX_SEARCH_LENGTH = 100;
+
 /**
  * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort=
  * Paginated conversation list (counts only, no flamegraph structure).
@@ -41,7 +48,14 @@ const getCachedConversations = cachedQuery(
 export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) {
   const { slug } = await params;
   const sp = request.nextUrl.searchParams;
-  const search = sp.get('search') ?? '';
+  const rawSearch = sp.get('search') ?? '';
+  const search = rawSearch.trim();
+
+  // Reject search strings that exceed the length cap before touching the DB.
+  if (search.length > MAX_SEARCH_LENGTH) {
+    return NextResponse.json({ error: 'search too long' }, { status: 400 });
+  }
+
   const limit = Math.min(200, Math.max(1, Number(sp.get('limit')) || 50));
   const offset = Math.max(0, Number(sp.get('offset')) || 0);
   const sortParam = sp.get('sort') ?? 'tokens';
diff --git a/packages/db/src/json-provider.agentic-datasets.test.ts b/packages/db/src/json-provider.agentic-datasets.test.ts
index d6cb6601..e2e97908 100644
--- a/packages/db/src/json-provider.agentic-datasets.test.ts
+++ b/packages/db/src/json-provider.agentic-datasets.test.ts
@@ -541,6 +541,23 @@ describe('dataset mirrors', () => {
     expect(jp.listDatasets()[0]?.ingested_at).toBe('2026-06-20 00:00:00+00');
   });
 
+  it('listConversations: search for literal "%" matches no rows (wildcard semantics do not apply)', () => {
+    // The SQL path now escapes LIKE metacharacters via escapeLikePattern before
+    // embedding into the ILIKE pattern. The json-provider mirror uses
+    // .toLowerCase().includes() which already treats input literally. Both paths
+    // must agree: a search for "%" finds only conv_ids that contain a literal
+    // percent character — none of the fixture conv_ids do.
+    const result = jp.listConversations('ds-new', { search: '%' });
+    expect(result?.total).toBe(0);
+    expect(result?.items).toHaveLength(0);
+  });
+
+  it('listConversations: search for literal "_" matches no rows', () => {
+    // Similarly, "_" must not act as a single-character wildcard.
+    const result = jp.listConversations('ds-new', { search: '_' });
+    expect(result?.total).toBe(0);
+  });
+
   it('listConversations applies case-insensitive search, sort, and pagination', () => {
     // Default sort = tokens (total_in desc): alpha(300), plain(200), AGENT-beta(100).
     const all = jp.listConversations('ds-new');
diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts
index c1676445..d6693536 100644
--- a/packages/db/src/queries/datasets.test.ts
+++ b/packages/db/src/queries/datasets.test.ts
@@ -1,7 +1,37 @@
 import { describe, expect, it } from 'vitest';
 
 import type { DbClient } from '../connection.js';
-import { getConversation, listConversations, listDatasets } from './datasets.js';
+import { escapeLikePattern, getConversation, listConversations, listDatasets } from './datasets.js';
+
+describe('escapeLikePattern', () => {
+  it('leaves plain text unchanged', () => {
+    expect(escapeLikePattern('agent')).toBe('agent');
+  });
+
+  it('escapes % so it is treated as a literal percent, not a wildcard', () => {
+    expect(escapeLikePattern('%')).toBe(String.raw`\%`);
+    expect(escapeLikePattern('50%off')).toBe(String.raw`50\%off`);
+  });
+
+  it('escapes _ so it is treated as a literal underscore, not a wildcard', () => {
+    expect(escapeLikePattern('_')).toBe(String.raw`\_`);
+    expect(escapeLikePattern('conv_id')).toBe(String.raw`conv\_id`);
+  });
+
+  it('escapes backslash first to avoid double-escaping', () => {
+    expect(escapeLikePattern('\\')).toBe(String.raw`\\`);
+    // A backslash followed by % must become \\\% in the escaped output.
+    expect(escapeLikePattern(String.raw`\%`)).toBe(String.raw`\\\%`);
+  });
+
+  it('handles mixed metacharacters', () => {
+    expect(escapeLikePattern('50%_off')).toBe(String.raw`50\%\_off`);
+  });
+
+  it('returns empty string unchanged', () => {
+    expect(escapeLikePattern('')).toBe('');
+  });
+});
 
 /**
  * Mock DbClient: returns canned result sets in call order. Each call to the
diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts
index cfefe391..bbcb2ece 100644
--- a/packages/db/src/queries/datasets.ts
+++ b/packages/db/src/queries/datasets.ts
@@ -106,6 +106,27 @@ export interface ListConversationsOpts {
 
 const MAX_LIMIT = 200;
 
+/**
+ * Escape Postgres LIKE metacharacters in a user-supplied search string so that
+ * the pattern performs a literal substring match, not a wildcard match.
+ *
+ * Postgres LIKE special characters are: % (any sequence), _ (any single char),
+ * and \ (the default escape character). We escape \ first so our own escape
+ * sequences are not double-escaped, then % and _.
+ *
+ * postgres.js parameterization already prevents SQL injection; this escaping
+ * fixes wildcard-semantics only (e.g. searching for literal '%' must not match
+ * every row).
+ *
+ * @example escapeLikePattern('50%_off') === '50\\%\\_off'
+ */
+export function escapeLikePattern(raw: string): string {
+  return raw
+    .replaceAll('\\', String.raw`\\`)
+    .replaceAll('%', String.raw`\%`)
+    .replaceAll('_', String.raw`\_`);
+}
+
 /**
  * Paginated conversation list for a dataset (by slug). Returns counts only —
  * the per-conversation `structure` blob is fetched separately by
@@ -125,7 +146,9 @@ export async function listConversations(
   const limit = Math.min(MAX_LIMIT, Math.max(1, opts.limit ?? 50));
   const offset = Math.max(0, opts.offset ?? 0);
   const search = opts.search?.trim();
-  const like = search ? `%${search}%` : null;
+  // Escape LIKE metacharacters so user input is treated as a literal substring.
+  // Backslash is escaped first to prevent double-escaping our own escape sequences.
+  const like = search ? `%${escapeLikePattern(search)}%` : null;
 
   const totalRows = (await sql`
     select count(*)::int as n

From 20cc135e63de9273a148b0d13ee36f6d6b4268a0 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 18:57:20 -0500
Subject: [PATCH 31/40] fix(analytics): track sibling nav, dataset pagination,
 chart expand; overlay-mode e2e

Adds the AGENTS.md-required track() calls (agentic_siblings_navigated,
datasets_conversations_page_changed, agentic_chart_expanded) to the
three untracked interaction clusters, and the mandated overlay-path
regression coverage: ttft-x-axis-toggle gains three tests loading an
?unofficialrun= overlay, switching to the ttft x-axis mode (overlay
points still render), and asserting the normalized-e2e suppression
banner. Cypress 8/8.
---
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  | 111 ++++++++++++++++++
 .../components/datasets/dataset-detail.tsx    |  12 +-
 .../agentic-point/expandable-chart.tsx        |   6 +-
 .../inference/agentic-point/sibling-nav.tsx   |  21 +++-
 4 files changed, 144 insertions(+), 6 deletions(-)

diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index c634cd27..dca6cd8e 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -197,3 +197,114 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
     cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 });
+
+// ---------------------------------------------------------------------------
+// Overlay path — regression coverage for unofficial-run overlays with agentic
+// x-axis modes (finding #8 / AGENTS.md: chart features must have overlay tests).
+// The overlay behavior itself is verified correct by prior review; this suite
+// guards against regressions only and does NOT change overlay behavior.
+// ---------------------------------------------------------------------------
+
+// Build a minimal unofficial-run API response that contains one agentic
+// overlay benchmark row so the provider builds overlay chart data.
+const OVERLAY_RUN_ID = 99900000001;
+const OVERLAY_RUN_URL = `https://github.com/SemiAnalysisAI/InferenceX/actions/runs/${OVERLAY_RUN_ID}`;
+
+const overlayBenchmarkRow = {
+  id: 800000,
+  hardware: 'b200',
+  framework: 'vllm',
+  model: DEFAULT_MODEL_DB_KEY,
+  precision: 'fp4',
+  spec_method: 'none',
+  disagg: false,
+  is_multinode: false,
+  prefill_tp: 8,
+  decode_tp: 8,
+  num_prefill_gpu: 8,
+  num_decode_gpu: 8,
+  isl: null,
+  osl: null,
+  conc: 32,
+  offload_mode: 'off',
+  benchmark_type: 'agentic_traces',
+  image: 'vllm/vllm-openai:v0.9.0',
+  metrics: agenticMetrics(32),
+  workers: null,
+  date: AGENTIC_DATE,
+  run_url: OVERLAY_RUN_URL,
+};
+
+const interceptAgenticDataWithOverlay = () => {
+  interceptAgenticData();
+  cy.intercept('GET', '/api/unofficial-run*', {
+    body: {
+      runInfos: [
+        {
+          id: OVERLAY_RUN_ID,
+          name: 'Overlay regression fixture',
+          branch: 'test/overlay-regression',
+          sha: 'abc000',
+          createdAt: `${AGENTIC_DATE}T00:00:00Z`,
+          url: OVERLAY_RUN_URL,
+          conclusion: 'success',
+          status: 'completed',
+          isNonMainBranch: true,
+        },
+      ],
+      benchmarks: [overlayBenchmarkRow],
+      evaluations: [],
+    },
+  }).as('unofficialRun');
+};
+
+describe('X-Axis Mode Toggle — overlay path (finding #8 regression guard)', () => {
+  before(() => {
+    interceptAgenticDataWithOverlay();
+    cy.visit(`/inference?unofficialrun=${OVERLAY_RUN_ID}`, {
+      onBeforeLoad(win) {
+        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+      },
+    });
+    cy.wait('@unofficialRun');
+    cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
+    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
+  });
+
+  it('shows overlay (unofficial-run) watermark SVG when an overlay is loaded', () => {
+    // The unofficial-run pattern watermark appears when isUnofficialRun is true.
+    cy.get('[data-testid="inference-chart-display"] svg pattern[id^="unofficial-pattern-"]').should(
+      'exist',
+    );
+  });
+
+  it('switches to ttft x-axis mode and renders SVG with overlay points', () => {
+    cy.get('[data-testid="x-axis-mode-ttft"]').click();
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token');
+    // Overlay points render as triangles or circles inside the chart SVG.
+    cy.get('[data-testid="inference-chart-display"] svg').should('exist');
+    cy.get('[data-testid="inference-chart-display"] svg').then(($svgs) => {
+      let total = 0;
+      $svgs.each((_i, svg) => {
+        total += svg.querySelectorAll('circle, polygon, path').length;
+      });
+      expect(total).to.be.greaterThan(0);
+    });
+  });
+
+  it('normalized-e2e mode shows suppression banner for unofficial-run overlays', () => {
+    interceptDerivedMetrics();
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click();
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should(
+      'have.attr',
+      'aria-selected',
+      'true',
+    );
+    // The suppression message appears because isUnofficialRun is true and the
+    // mode is 'normalized-e2e' (documented in ChartDisplay.tsx ~line 640).
+    cy.contains(
+      'Normalized E2E requires persisted per-request traces, so unofficial-run overlays are unavailable for this experimental view.',
+    ).should('be.visible');
+  });
+});
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index 051e7457..ccf0a944 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -288,7 +288,11 @@ export function DatasetDetail({ slug }: { slug: string }) {
             <button
               type="button"
               disabled={page === 0}
-              onClick={() => setPage((p) => Math.max(0, p - 1))}
+              onClick={() => {
+                const next = Math.max(0, page - 1);
+                track('datasets_conversations_page_changed', { direction: 'prev', page: next });
+                setPage(next);
+              }}
               className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
             >
               ← Prev
@@ -299,7 +303,11 @@ export function DatasetDetail({ slug }: { slug: string }) {
             <button
               type="button"
               disabled={page >= pageCount - 1}
-              onClick={() => setPage((p) => Math.min(pageCount - 1, p + 1))}
+              onClick={() => {
+                const next = Math.min(pageCount - 1, page + 1);
+                track('datasets_conversations_page_changed', { direction: 'next', page: next });
+                setPage(next);
+              }}
               className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
             >
               Next →
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
index cb5987ec..810530c5 100644
--- a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -4,6 +4,7 @@ import { useState, type ReactNode } from 'react';
 import { Maximize2 } from 'lucide-react';
 
 import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog';
+import { track } from '@/lib/analytics';
 
 /**
  * Wraps a chart in a card with a header + expand button. Click the button to
@@ -32,7 +33,10 @@ export function ExpandableChart({
           <button
             type="button"
             aria-label="Expand chart"
-            onClick={() => setOpen(true)}
+            onClick={() => {
+              track('agentic_chart_expanded', { title });
+              setOpen(true);
+            }}
             className="text-muted-foreground hover:text-foreground transition-colors"
           >
             <Maximize2 className="size-4" />
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index a1a5d1ab..2c3a3c27 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -206,7 +206,12 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
         <button
           type="button"
           disabled={!prev}
-          onClick={() => prev && router.push(hrefFor(prev.id))}
+          onClick={() => {
+            if (prev) {
+              track('agentic_siblings_navigated', { direction: 'prev', targetId: prev.id });
+              router.push(hrefFor(prev.id));
+            }
+          }}
           className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
           aria-label="Previous point"
         >
@@ -219,7 +224,12 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
               <button
                 key={s.id}
                 type="button"
-                onClick={() => !active && router.push(hrefFor(s.id))}
+                onClick={() => {
+                  if (!active) {
+                    track('agentic_siblings_navigated', { direction: 'chip', targetId: s.id });
+                    router.push(hrefFor(s.id));
+                  }
+                }}
                 className={`px-2 py-1 rounded-md text-xs border transition-colors ${
                   active
                     ? 'border-primary bg-primary text-primary-foreground font-medium'
@@ -235,7 +245,12 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
         <button
           type="button"
           disabled={!next}
-          onClick={() => next && router.push(hrefFor(next.id))}
+          onClick={() => {
+            if (next) {
+              track('agentic_siblings_navigated', { direction: 'next', targetId: next.id });
+              router.push(hrefFor(next.id));
+            }
+          }}
           className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
           aria-label="Next point"
         >

From 08ea28f3e087e8a226d445710c6c1b2d8bdb2531 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 18:57:28 -0500
Subject: [PATCH 32/40] chore: gitignore .playwright-mcp artifacts

Playwright MCP page snapshots contain HTML-entity-escaped class strings;
Tailwind 4's auto content detection (which respects gitignore) scanned
them and emitted unresolvable mask-image classes, 500ing the dev server.
Affects any Playwright-MCP-driven review session incl. the @claude CI
review flow.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a86f6e23..41071934 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,3 +71,4 @@ C:*
 # python bytecode (e.g. .claude/skills/*/iso-interactivity.py imports)
 **/__pycache__/
 **/*.pyc
+.playwright-mcp/

From 862ff4bcdbb9790b3b9d8237feacce1e45106d84 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 23:32:47 -0500
Subject: [PATCH 33/40] feat(agentic): hide all agentic surfaces behind the
 feature gate

Agentic ships dark for now: the existing feature-gate mechanism (the
same localStorage flag behind the hidden-tabs pattern) now also gates
every AgentX surface, default off.

One deep chokepoint does most of the work: availableSequences strips
agentic-traces when locked, which cascades through the sequence
resolver - no agentic default, no scenario entry, no agentic x-axis
modes/percentile UI, no agentic tooltips (all downstream of
effectiveSequence). The Datasets nav link hides, and the standalone
/datasets/* and /inference/agentic/[id] pages notFound() when locked
(they have no nav entry and are robots-noindexed). Sitemap/llms.txt
already exclude them. API routes and ingest are not gated.

Agentic cypress specs seed the gate via unlockAgenticGate(). Verified
both states live: locked = 8k/1k default with zero agentic DOM + 404s;
unlocked = everything restored.
---
 .../e2e/agentic-point-time-series.cy.ts       |  6 ++-
 .../cypress/e2e/datasets-distributions.cy.ts  |  4 +-
 .../e2e/datasets-flamegraph-time.cy.ts        |  6 ++-
 .../e2e/gpu-compare-agentic-detail.cy.ts      |  3 ++
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  |  4 ++
 packages/app/cypress/support/e2e.ts           | 19 +++++++++
 .../inference/agentic/[id]/page.tsx           |  7 +++-
 .../[slug]/conversations/[convId]/page.tsx    | 17 ++++----
 packages/app/src/app/datasets/[slug]/page.tsx | 13 +++---
 packages/app/src/app/datasets/page.tsx        |  9 ++++
 .../src/components/GlobalFilterContext.tsx    | 28 ++++++++++---
 packages/app/src/components/agentic-gate.tsx  | 41 +++++++++++++++++++
 packages/app/src/components/header/header.tsx | 13 +++++-
 13 files changed, 146 insertions(+), 24 deletions(-)
 create mode 100644 packages/app/src/components/agentic-gate.tsx

diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
index 4a450f7c..86d57b5d 100644
--- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -1,3 +1,5 @@
+import { unlockAgenticGate } from '../support/e2e';
+
 const timelineRequest = (
   index: number,
   ttftMs: number,
@@ -63,7 +65,7 @@ describe('Agentic point request metric time series', () => {
         ],
       },
     });
-    cy.visit('/inference/agentic/206885');
+    cy.visit('/inference/agentic/206885', { onBeforeLoad: unlockAgenticGate });
   });
 
   it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => {
@@ -276,7 +278,7 @@ describe('Agentic point orchestrator metric sources', () => {
         metricSources: [prefill, decode],
       },
     });
-    cy.visit('/inference/agentic/206885');
+    cy.visit('/inference/agentic/206885', { onBeforeLoad: unlockAgenticGate });
   });
 
   it('switches every server chart to an orchestrator-normalized worker', () => {
diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts
index 6ce4bc34..0d2a7789 100644
--- a/packages/app/cypress/e2e/datasets-distributions.cy.ts
+++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts
@@ -1,3 +1,5 @@
+import { unlockAgenticGate } from '../support/e2e';
+
 const distribution = (values: {
   median: number;
   p75: number;
@@ -85,7 +87,7 @@ describe('Dataset distribution percentiles', () => {
     cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', {
       body: { total: 0, items: [] },
     });
-    cy.visit('/datasets/test-dataset');
+    cy.visit('/datasets/test-dataset', { onBeforeLoad: unlockAgenticGate });
   });
 
   it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => {
diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
index 58d95c27..bdb1adfc 100644
--- a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
+++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
@@ -1,3 +1,5 @@
+import { unlockAgenticGate } from '../support/e2e';
+
 describe('Dataset conversation flamegraph timing', () => {
   before(() => {
     cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', {
@@ -93,7 +95,9 @@ describe('Dataset conversation flamegraph timing', () => {
         },
       },
     });
-    cy.visit('/datasets/test-dataset/conversations/conversation-1');
+    cy.visit('/datasets/test-dataset/conversations/conversation-1', {
+      onBeforeLoad: unlockAgenticGate,
+    });
   });
 
   it('shows turn offsets and a collapsed subagent time range', () => {
diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
index d574dd2a..83171809 100644
--- a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
+++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
@@ -1,3 +1,5 @@
+import { unlockAgenticGate } from '../support/e2e';
+
 describe('GPU comparison agentic point detail', () => {
   it('exposes the per-point charts as a normal browser link', () => {
     cy.intercept('GET', '/api/v1/trace-availability*', (request) => {
@@ -9,6 +11,7 @@ describe('GPU comparison agentic point detail', () => {
     cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', {
       onBeforeLoad(win) {
         win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+        unlockAgenticGate(win);
       },
     });
 
diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index dca6cd8e..92b32d33 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -1,3 +1,5 @@
+import { unlockAgenticGate } from '../support/e2e';
+
 const interceptDerivedMetrics = () => {
   cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => {
     const ids = new URL(request.url).searchParams.get('ids')?.split(',').filter(Boolean) ?? [];
@@ -132,6 +134,7 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
     cy.visit('/inference', {
       onBeforeLoad(win) {
         win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+        unlockAgenticGate(win);
       },
     });
     cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
@@ -264,6 +267,7 @@ describe('X-Axis Mode Toggle — overlay path (finding #8 regression guard)', ()
     cy.visit(`/inference?unofficialrun=${OVERLAY_RUN_ID}`, {
       onBeforeLoad(win) {
         win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+        unlockAgenticGate(win);
       },
     });
     cy.wait('@unofficialRun');
diff --git a/packages/app/cypress/support/e2e.ts b/packages/app/cypress/support/e2e.ts
index d8209e33..0edb08c0 100644
--- a/packages/app/cypress/support/e2e.ts
+++ b/packages/app/cypress/support/e2e.ts
@@ -14,3 +14,22 @@ Cypress.on('window:before:load', (win) => {
     // localStorage unavailable — fine, the test will just see the modal.
   }
 });
+
+/**
+ * Unlock the shared feature gate for specs that exercise agentic surfaces
+ * (the "Agentic Traces" scenario, /datasets, /inference/agentic/[id], and the
+ * Datasets nav link). The gate is OFF by default so the PR can ship without
+ * publicly exposing agentic features; agentic specs opt in by seeding the same
+ * localStorage flag the ↑↑↓↓ konami unlock writes (see use-feature-gate.ts).
+ *
+ * Call from a spec's `cy.visit(..., { onBeforeLoad })`:
+ *   cy.visit('/datasets/x', { onBeforeLoad: unlockAgenticGate });
+ * or compose inside an existing hook: `unlockAgenticGate(win)`.
+ */
+export function unlockAgenticGate(win: Window): void {
+  try {
+    win.localStorage.setItem('inferencex-feature-gate', '1');
+  } catch {
+    // localStorage unavailable — spec will see the gate locked and likely 404.
+  }
+}
diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
index 77f29805..34dd169a 100644
--- a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -1,5 +1,6 @@
 import type { Metadata } from 'next';
 
+import { AgenticGate } from '@/components/agentic-gate';
 import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
 
 export const metadata: Metadata = {
@@ -13,5 +14,9 @@ export default async function AgenticPointDetailPage({
   params: Promise<{ id: string }>;
 }) {
   const { id } = await params;
-  return <AgenticPointDetail id={Number(id)} />;
+  return (
+    <AgenticGate>
+      <AgenticPointDetail id={Number(id)} />
+    </AgenticGate>
+  );
 }
diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
index 83eb56a0..732b9ad1 100644
--- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -1,6 +1,7 @@
 import { Suspense } from 'react';
 import type { Metadata } from 'next';
 
+import { AgenticGate } from '@/components/agentic-gate';
 import { ConversationView } from '@/components/datasets/conversation-view';
 import { SITE_URL } from '@semianalysisai/inferencex-constants';
 
@@ -24,12 +25,14 @@ export async function generateMetadata({ params }: Props): Promise<Metadata> {
 export default async function ConversationPage({ params }: Props) {
   const { slug, convId } = await params;
   return (
-    <main className="relative">
-      <div className="container mx-auto px-4 pb-8 lg:px-8">
-        <Suspense>
-          <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
-        </Suspense>
-      </div>
-    </main>
+    <AgenticGate>
+      <main className="relative">
+        <div className="container mx-auto px-4 pb-8 lg:px-8">
+          <Suspense>
+            <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+          </Suspense>
+        </div>
+      </main>
+    </AgenticGate>
   );
 }
diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx
index f32e3fa6..c853a695 100644
--- a/packages/app/src/app/datasets/[slug]/page.tsx
+++ b/packages/app/src/app/datasets/[slug]/page.tsx
@@ -1,5 +1,6 @@
 import type { Metadata } from 'next';
 
+import { AgenticGate } from '@/components/agentic-gate';
 import { DatasetDetail } from '@/components/datasets/dataset-detail';
 import { SITE_URL } from '@semianalysisai/inferencex-constants';
 
@@ -23,10 +24,12 @@ export async function generateMetadata({ params }: Props): Promise<Metadata> {
 export default async function DatasetDetailPage({ params }: Props) {
   const { slug } = await params;
   return (
-    <main className="relative">
-      <div className="container mx-auto px-4 pb-8 lg:px-8">
-        <DatasetDetail slug={slug} />
-      </div>
-    </main>
+    <AgenticGate>
+      <main className="relative">
+        <div className="container mx-auto px-4 pb-8 lg:px-8">
+          <DatasetDetail slug={slug} />
+        </div>
+      </main>
+    </AgenticGate>
   );
 }
diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx
index 7fe46b93..711e0dbc 100644
--- a/packages/app/src/app/datasets/page.tsx
+++ b/packages/app/src/app/datasets/page.tsx
@@ -1,5 +1,6 @@
 import type { Metadata } from 'next';
 
+import { AgenticGate } from '@/components/agentic-gate';
 import { Card } from '@/components/ui/card';
 import { JsonLd } from '@/components/json-ld';
 import { DatasetList } from '@/components/datasets/dataset-list';
@@ -29,6 +30,14 @@ const jsonLd = {
 };
 
 export default function DatasetsPage() {
+  return (
+    <AgenticGate>
+      <DatasetsPageContent />
+    </AgenticGate>
+  );
+}
+
+function DatasetsPageContent() {
   return (
     <main className="relative">
       <JsonLd data={jsonLd} />
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index e7aa751c..fd6a42ae 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -47,6 +47,7 @@ import {
 import { computeAutoSwitchDecision } from '@/lib/unofficial-run-auto-switch';
 import { countCurvesByPrecision, resolveEffectivePrecisions } from '@/lib/default-precisions';
 import { resolveEffectiveSequence } from '@/lib/default-sequence';
+import { useFeatureGate } from '@/lib/use-feature-gate';
 import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api';
 
 interface RunInfo {
@@ -160,6 +161,14 @@ export function GlobalFilterProvider({
 }) {
   const { hasUrlParam, getUrlParam, setUrlParams } = useUrlState();
 
+  // Agentic surfaces are hidden behind the shared konami-code feature gate
+  // (default OFF until agentic launches). When locked, agentic sequences are
+  // filtered out of `availableSequences` below — the single chokepoint that
+  // cascades: no agentic default (resolveEffectiveSequence falls to 8k/1k), no
+  // "Agentic Traces" scenario-selector entry, and no agentic x-axis mode /
+  // percentile selector (those key off effectiveSequence === AgenticTraces).
+  const agenticGateUnlocked = useFeatureGate();
+
   // ── Core filter state ─────────────────────────────────────────────────────
   const [selectedModel, setSelectedModel] = useState<Model>(
     () => initialModel ?? Model.DeepSeek_V4_Pro,
@@ -293,18 +302,27 @@ export function GlobalFilterProvider({
     }
   }, [unofficialAvailable, selectedModel]);
 
-  // Sequences available for the selected model (DB ∪ unofficial run for this model)
+  // Sequences available for the selected model (DB ∪ unofficial run for this model).
+  //
+  // When the agentic feature gate is locked (default), agentic sequences are
+  // dropped from every branch — including the static SEQUENCE_OPTIONS fallback —
+  // so no agentic scenario is ever selectable or defaulted. This is the single
+  // gate chokepoint for the main inference chart's agentic surfaces.
   const availableSequences = useMemo(() => {
+    const dropAgentic = (seqs: Sequence[]) =>
+      agenticGateUnlocked ? seqs : seqs.filter((s) => s !== Sequence.AgenticTraces);
     const unofficialSeqs = unofficialAvailable
       .filter((a) => a.model === selectedModel)
       .map((a) => a.sequence as Sequence);
     if (!availabilityRows) {
-      return unofficialSeqs.length > 0 ? [...new Set(unofficialSeqs)] : SEQUENCE_OPTIONS;
+      return unofficialSeqs.length > 0
+        ? dropAgentic([...new Set(unofficialSeqs)])
+        : dropAgentic(SEQUENCE_OPTIONS);
     }
     const dbSeqs = modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null);
-    const merged = [...new Set([...dbSeqs, ...unofficialSeqs])];
-    return merged.length > 0 ? merged : SEQUENCE_OPTIONS;
-  }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]);
+    const merged = dropAgentic([...new Set([...dbSeqs, ...unofficialSeqs])]);
+    return merged.length > 0 ? merged : dropAgentic(SEQUENCE_OPTIONS);
+  }, [availabilityRows, modelRows, unofficialAvailable, selectedModel, agenticGateUnlocked]);
 
   // Whether we actually know the selected model's sequences yet. Availability
   // may arrive from the DB (`availabilityRows`) OR from a loaded unofficial run
diff --git a/packages/app/src/components/agentic-gate.tsx b/packages/app/src/components/agentic-gate.tsx
new file mode 100644
index 00000000..9fa0aa37
--- /dev/null
+++ b/packages/app/src/components/agentic-gate.tsx
@@ -0,0 +1,41 @@
+'use client';
+
+import { notFound } from 'next/navigation';
+import { useEffect, useState } from 'react';
+
+import { FEATURE_GATE_KEY, useFeatureGate } from '@/lib/use-feature-gate';
+
+/**
+ * Client gate for the standalone agentic product pages (`/datasets/*`,
+ * `/inference/agentic/[id]`). These are server-rendered routes with no nav
+ * entry once the header link is hidden, so a direct URL visit is the only way
+ * in. When the shared konami-code feature gate (see {@link useFeatureGate}) is
+ * locked — the default until agentic launches — we `notFound()` so the route
+ * behaves like a clean 404 instead of publicly exposing agentic surfaces.
+ *
+ * The gate lives in localStorage, which the server can't read, so we resolve it
+ * on the client: read the flag synchronously on mount, and until then render
+ * nothing (no content flash before a potential 404). QA can unlock at runtime
+ * with ↑↑↓↓ (the same mechanism as the Hidden tab dropdown) or by seeding
+ * `localStorage['inferencex-feature-gate'] = '1'`, after which these pages
+ * render in full.
+ */
+export function AgenticGate({ children }: { children: React.ReactNode }) {
+  const unlocked = useFeatureGate();
+  // Distinguish "haven't read localStorage yet" from "read it, gate is locked":
+  // useFeatureGate() returns false on the server and on the very first client
+  // render before its mount effect runs, so we must not 404 during that window.
+  const [resolved, setResolved] = useState(false);
+  useEffect(() => setResolved(true), []);
+
+  if (!resolved) return null;
+  if (!unlocked) {
+    // Belt-and-suspenders: re-read the flag directly in case an unlock event
+    // hasn't propagated yet on this first resolved render.
+    if (typeof window !== 'undefined' && localStorage.getItem(FEATURE_GATE_KEY) === '1') {
+      return <>{children}</>;
+    }
+    notFound();
+  }
+  return <>{children}</>;
+}
diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx
index 95fc0acb..1a12057e 100644
--- a/packages/app/src/components/header/header.tsx
+++ b/packages/app/src/components/header/header.tsx
@@ -9,6 +9,7 @@ import { track } from '@/lib/analytics';
 import { ModeToggle } from '@/components/ui/mode-toggle';
 import { MinecraftToggles } from '@/components/minecraft/minecraft-toggles';
 import { navigateInApp } from '@/lib/client-navigation';
+import { useFeatureGate } from '@/lib/use-feature-gate';
 import { cn } from '@/lib/utils';
 
 import { GitHubStars } from './GithubStars';
@@ -51,6 +52,9 @@ const NAV_LINKS = [
     label: 'Datasets',
     testId: 'nav-link-datasets',
     event: 'header_datasets_clicked',
+    // Agentic surface — hidden behind the konami-code feature gate (default off)
+    // until agentic launches. Same gate as the Hidden tab dropdown.
+    gated: true,
   },
   { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' },
   { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' },
@@ -68,9 +72,14 @@ function isActive(pathname: string, href: string): boolean {
 export const Header = ({ starCount }: { starCount?: number | null }) => {
   const pathname = usePathname() ?? '/';
   const router = useRouter();
+  const featureGateUnlocked = useFeatureGate();
   const [mobileMenuOpen, setMobileMenuOpen] = useState(false);
   const menuRef = useRef<HTMLDivElement>(null);
 
+  // Hide gated nav links (e.g. Datasets — an agentic surface) unless the shared
+  // feature gate is unlocked. Mirrors the tab-nav "Hidden" dropdown gating.
+  const navLinks = NAV_LINKS.filter((l) => !('gated' in l && l.gated) || featureGateUnlocked);
+
   // Close menu on route change
   useEffect(() => {
     setMobileMenuOpen(false);
@@ -124,7 +133,7 @@ export const Header = ({ starCount }: { starCount?: number | null }) => {
 
           {/* Desktop nav */}
           <nav className="hidden lg:flex items-center gap-1">
-            {NAV_LINKS.map(({ href, label, testId, event }) => (
+            {navLinks.map(({ href, label, testId, event }) => (
               <Link
                 key={href}
                 data-testid={testId}
@@ -178,7 +187,7 @@ export const Header = ({ starCount }: { starCount?: number | null }) => {
               </button>
               {mobileMenuOpen && (
                 <div className="absolute right-0 top-full mt-2 z-50 flex flex-col rounded-lg border border-border bg-background p-1.5 shadow-lg min-w-40">
-                  {NAV_LINKS.map(({ href, label, event }) => (
+                  {navLinks.map(({ href, label, event }) => (
                     <Link
                       key={href}
                       href={href}

From d3aeb1fc987fd4f670712cdb34f72082ca8b1784 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 23:39:41 -0500
Subject: [PATCH 34/40] chore: format packages/app/tsconfig.json (pnpm fmt was
 failing on it)

---
 packages/app/tsconfig.json | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/tsconfig.json b/packages/app/tsconfig.json
index 8b658cad..3044b60c 100644
--- a/packages/app/tsconfig.json
+++ b/packages/app/tsconfig.json
@@ -29,7 +29,9 @@
     "**/*.tsx",
     ".next/types/**/*.ts",
     "json-custom-types.d.ts",
-    ".next/dev/types/**/*.ts"
+    ".next/dev/types/**/*.ts",
+    ".next-e2e/types/**/*.ts",
+    ".next-e2e/dev/types/**/*.ts"
   ],
   "exclude": ["node_modules"]
 }

From 031a7beb7132dea6e1b5b10cfc9f021b40eba477 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 3 Jul 2026 00:06:15 -0500
Subject: [PATCH 35/40] test(e2e): fix the 4 known-failing agentic specs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

agentic-point-time-series: the point-count assertions predated the
time-boundary phase slicing — sliceTimelineByPhase puts every request
with start >= the profiling boundary in the profiling window, so the
warmup-labeled r5 legitimately lands there and cancelled/null-metric
filtering yields 6 interactivity/TTFT points and 8 E2E points (traced
to phase-slice.ts + time-series-math.ts, not just observed). No product
regression.

gpu-compare-agentic-detail: shared fixtures carry no agentic rows, so
the flow could never render; spec-scoped intercepts (availability,
benchmarks, trace-availability) now exercise the tooltip -> View charts
link without touching shared fixtures.

Full e2e suite: 449/449. These specs were failing all four CI shards on
every push of this branch.
---
 .../e2e/agentic-point-time-series.cy.ts       |  19 ++-
 .../e2e/gpu-compare-agentic-detail.cy.ts      | 133 +++++++++++++++++-
 2 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
index 86d57b5d..1e5286c1 100644
--- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -74,8 +74,11 @@ describe('Agentic point request metric time series', () => {
       cy.get('[data-testid="interactivity-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
         .should('have.text', 'P90');
-      cy.get('[data-testid="interactivity-point-count"]').should('have.text', '5 points');
-      cy.get('svg circle').should('have.length', 5);
+      // 6 points: profiling slice includes requests 0-4 (profiling) + request 5
+      // (phase='warmup' label but start=5s > profiling boundary=0s, so
+      // sliceTimelineByPhase keeps it); cancelled r6 and null-metric r7/r8 are dropped.
+      cy.get('[data-testid="interactivity-point-count"]').should('have.text', '6 points');
+      cy.get('svg circle').should('have.length', 6);
       cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
       cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT');
       cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
@@ -83,8 +86,9 @@ describe('Agentic point request metric time series', () => {
 
     cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
       cy.contains('h2', 'TTFT over time').should('be.visible');
-      cy.get('[data-testid="ttft-point-count"]').should('have.text', '5 points');
-      cy.get('svg circle').should('have.length', 5);
+      // Same 6-point slice as interactivity (warmup r5 included by time-boundary).
+      cy.get('[data-testid="ttft-point-count"]').should('have.text', '6 points');
+      cy.get('svg circle').should('have.length', 6);
       cy.get('svg').should('contain.text', 'TTFT (s)');
       cy.get('svg').should('contain.text', 'Cumulative P90 TTFT');
       cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
@@ -109,8 +113,11 @@ describe('Agentic point request metric time series', () => {
     cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
       cy.get('[data-testid="latency-metric-e2e"]').click();
       cy.contains('h2', 'E2E latency over time').should('be.visible');
-      cy.get('[data-testid="e2e-point-count"]').should('have.text', '7 points');
-      cy.get('svg circle').should('have.length', 7);
+      // 8 points: e2e = (end−start)/1e6 > 0 for all non-cancelled requests —
+      // includes r0-r5 (profiling slice) + r7, r8 (subagent/aux with null ttft/tpot
+      // but valid start/end). Cancelled r6 is excluded.
+      cy.get('[data-testid="e2e-point-count"]').should('have.text', '8 points');
+      cy.get('svg circle').should('have.length', 8);
       cy.get('svg').should('contain.text', 'E2E latency (s)');
       cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency');
 
diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
index 83171809..6c832e08 100644
--- a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
+++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
@@ -1,11 +1,142 @@
 import { unlockAgenticGate } from '../support/e2e';
 
+// ---------------------------------------------------------------------------
+// Spec-scoped fixture helpers
+//
+// The shared cypress/fixtures/api/*.json files contain ZERO agentic_traces rows
+// (by design — adding them flips the bare /inference default to the agentic
+// scenario and regresses other specs). This spec therefore injects minimal
+// agentic data via spec-scoped cy.intercept overrides that shadow the fixture
+// server, following the same pattern used in ttft-x-axis-toggle.cy.ts.
+// ---------------------------------------------------------------------------
+
+const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro
+const AGENTIC_DATE = '2026-06-12';
+
+// Two GPUs with agentic + single_turn entries so the scenario selector resolves
+// to agentic (agentic preferred when both types exist for the same model).
+const AGENTIC_HARDWARE = [
+  { hardware: 'b200', framework: 'vllm', disagg: false },
+  { hardware: 'b300', framework: 'vllm', disagg: false },
+];
+
+const agenticAvailability = [
+  // Agentic rows (isl/osl null).
+  ...AGENTIC_HARDWARE.map((g) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: null,
+    osl: null,
+    precision: 'fp4',
+    hardware: g.hardware,
+    framework: g.framework,
+    spec_method: 'none',
+    disagg: g.disagg,
+    benchmark_type: 'agentic_traces',
+    date: AGENTIC_DATE,
+  })),
+  // Single-turn rows alongside — without these the scenario selector may not
+  // see the "both exist" signal it needs to confidently pick agentic.
+  ...AGENTIC_HARDWARE.map((g) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: 8192,
+    osl: 1024,
+    precision: 'fp4',
+    hardware: g.hardware,
+    framework: g.framework,
+    spec_method: 'none',
+    disagg: g.disagg,
+    benchmark_type: 'single_turn',
+    date: AGENTIC_DATE,
+  })),
+];
+
+// Minimal per-metric percentile ladder matching what the chart expects for
+// agentic rows (median/p75/p90/p95/p99 + std for each family).
+const percentileLadder = (prefix: string, base: number): Record<string, number> => ({
+  [`median_${prefix}`]: base,
+  [`p75_${prefix}`]: base * 1.2,
+  [`p90_${prefix}`]: base * 1.5,
+  [`p95_${prefix}`]: base * 1.7,
+  [`p99_${prefix}`]: base * 2.2,
+  [`std_${prefix}`]: base * 0.3,
+});
+
+const agenticMetrics = (conc: number): Record<string, number> => {
+  const scale = conc / 16;
+  const itl = 0.011 * scale;
+  return {
+    ...percentileLadder('ttft', 0.4 * scale),
+    ...percentileLadder('tpot', 0.012 * scale),
+    ...percentileLadder('itl', itl),
+    ...percentileLadder('e2el', 8 * scale),
+    median_intvty: 1 / itl,
+    p75_intvty: 1 / (itl * 1.2),
+    p90_intvty: 1 / (itl * 1.5),
+    p99_intvty: 1 / (itl * 2.2),
+    std_intvty: (1 / itl) * 0.1,
+    tput_per_gpu: 950 / Math.sqrt(scale),
+    output_tput_per_gpu: 210,
+    input_tput_per_gpu: 740,
+    total_tput_tps: 7600 * conc * 0.05,
+  };
+};
+
+// IDs must be unique numbers — the GPU graph uses them as D3 data keys and
+// trace-availability is keyed on them.
+let benchIdCursor = 800100;
+const agenticBenchmarks = AGENTIC_HARDWARE.flatMap((g) =>
+  [16, 64, 128].map((conc) => ({
+    id: benchIdCursor++,
+    hardware: g.hardware,
+    framework: g.framework,
+    model: DEFAULT_MODEL_DB_KEY,
+    precision: 'fp4',
+    spec_method: 'none',
+    disagg: g.disagg,
+    is_multinode: false,
+    prefill_tp: 8,
+    prefill_ep: 1,
+    prefill_dp_attention: false,
+    prefill_num_workers: 0,
+    decode_tp: 8,
+    decode_ep: 1,
+    decode_dp_attention: false,
+    decode_num_workers: 0,
+    num_prefill_gpu: 8,
+    num_decode_gpu: 8,
+    isl: null,
+    osl: null,
+    conc,
+    offload_mode: 'off',
+    benchmark_type: 'agentic_traces',
+    image: 'vllm/vllm-openai:v0.9.0',
+    metrics: agenticMetrics(conc),
+    workers: null,
+    date: AGENTIC_DATE,
+    run_url: null,
+  })),
+);
+
+// All injected IDs with a stored trace blob — the GPU graph renders the
+// "View charts" link only when trace-availability returns true for the id.
+const agenticIds = new Set(agenticBenchmarks.map((b) => b.id));
+
 describe('GPU comparison agentic point detail', () => {
   it('exposes the per-point charts as a normal browser link', () => {
+    // Shadow the fixture-server availability + benchmarks responses with
+    // spec-scoped agentic data so the GPU graph renders agentic dots.
+    cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as(
+      'agenticAvailability',
+    );
+    cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('agenticBenchmarks');
+    // Return true for all injected ids so the "View charts" link appears.
     cy.intercept('GET', '/api/v1/trace-availability*', (request) => {
       const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? [];
       if (ids.length < 20) request.alias = 'gpuTraceAvailability';
-      request.continue();
+      const result = Object.fromEntries(
+        ids.filter((id) => agenticIds.has(Number(id))).map((id) => [id, true]),
+      );
+      request.reply({ body: result });
     });
 
     cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', {

From 7717471e9e0366478838aff191772c76b69d8ce4 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 3 Jul 2026 01:06:59 -0500
Subject: [PATCH 36/40] fix: address reviewer-bot findings (offload dedup,
 i_seq default, id guards, conv-id encoding, stale intvty, phase filter)

- Chart-layer offload dedup: useChartData's latest-date-per-group key
  and mergeRunScopedRows' claim key now include offload_mode (?? 'off'),
  completing the SQL-layer fix - a later-dated offload sweep no longer
  drops the other variant's series or claims its base rows.
- PARAM_DEFAULTS.i_seq '' so an explicit 8K/1K pick survives share
  URLs instead of stripping and reloading as the agentic default; moved
  PRE_AVAILABILITY_SEQUENCE below imports (code-quality flag).
- Shared isPersistedBenchmarkId guard (integer > 0) at every agentic
  link/id-collection site: no /agentic/NaN or /agentic/0 links,
  overlay-only views skip the derived-metrics fetch instead of 400ing,
  and agentic/[id] notFound()s on invalid ids.
- Conversation ids: encode once at link producers, decode exactly once
  (removed double decodeURIComponent in page + route); ids with % / # ?
  now round-trip.
- ETL: missing/zero/invalid ITL now deletes the artifact-provided
  *_intvty key instead of passing p(1/ITL) values through; same fix on
  the documented overlay-path mirror in benchmark-transform.
- Gantt timeline now phase-slices by time boundary like the per-point
  charts (sliceTimelineByPhase), so both tabs agree on request sets;
  spec expectation updated with producing-logic justification.
- gitignore **/.next-* (secondary dist dirs from multi-server testing).

app 2381 + db 399 unit tests, full e2e 449/449.
---
 .gitignore                                    |  1 +
 .../e2e/agentic-point-time-series.cy.ts       | 12 ++-
 .../inference/agentic/[id]/page.tsx           |  9 ++-
 .../conversations/[convId]/route.test.ts      | 71 +++++++++++++++++
 .../[slug]/conversations/[convId]/route.ts    |  6 +-
 .../[slug]/conversations/[convId]/page.tsx    | 11 ++-
 .../src/components/GlobalFilterContext.tsx    | 23 +++---
 .../components/datasets/dataset-detail.tsx    |  2 +-
 .../agentic-point/request-timeline.tsx        | 15 ++--
 .../inference/hooks/useChartData.test.ts      | 79 ++++++++++++++++++-
 .../inference/hooks/useChartData.ts           | 58 ++++++++++----
 .../components/inference/ui/ChartDisplay.tsx  |  7 +-
 .../utils/legend-points-table.test.ts         |  9 +++
 .../inference/utils/legend-points-table.ts    |  3 +-
 .../inference/utils/tooltip-utils.test.ts     | 11 +++
 .../inference/utils/tooltipUtils.ts           |  3 +-
 packages/app/src/lib/benchmark-id.test.ts     | 33 ++++++++
 packages/app/src/lib/benchmark-id.ts          | 20 +++++
 .../app/src/lib/benchmark-transform.test.ts   | 77 ++++++++++++++++++
 packages/app/src/lib/benchmark-transform.ts   | 53 ++++++++-----
 packages/app/src/lib/url-state.test.ts        | 30 ++++++-
 packages/app/src/lib/url-state.ts             |  8 +-
 packages/db/src/etl/benchmark-mapper.test.ts  | 16 ++++
 packages/db/src/etl/benchmark-mapper.ts       | 12 ++-
 24 files changed, 501 insertions(+), 68 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts
 create mode 100644 packages/app/src/lib/benchmark-id.test.ts
 create mode 100644 packages/app/src/lib/benchmark-id.ts

diff --git a/.gitignore b/.gitignore
index 41071934..c52b0482 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
 
 # next.js
 **/.next
+**/.next-*
 **/out
 
 # production
diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
index 1e5286c1..e8161066 100644
--- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -169,10 +169,18 @@ describe('Agentic point request metric time series', () => {
     });
   });
 
-  it('shows total time with no requests in flight on the request timeline', () => {
+  it('shows total idle time on the request timeline (time-boundary phase slice, consistent with the charts)', () => {
     cy.get('[data-testid="detail-view-timeline"]').click();
     cy.location('search').should('contain', 'view=timeline');
-    cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 1.00s (14.3%)');
+    // The Gantt now slices by TIME BOUNDARY (sliceTimelineByPhase), matching the
+    // per-point charts, instead of the per-request phase LABEL. The earliest
+    // profiling request starts at t=0, so the boundary is 0 and warmup-labelled
+    // r5 (start=5s) is counted as profiling here too — exactly as the interactivity
+    // /TTFT charts already count it (their 6-point slice includes r5). That fills
+    // the former 5–6s gap that label-based filtering left open, so in-flight
+    // coverage is now continuous across [0s, 7s]: idle 0ms (0.0%). A 1.00s value
+    // here would mean the Gantt had regressed to label-based filtering.
+    cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 0ms (0.0%)');
     cy.get('[data-timeline-row-kind="aux"]')
       .should('have.css', 'padding-left', '24px')
       .and('contain.text', 'aux 011 · parallel');
diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
index 34dd169a..91b769bd 100644
--- a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -1,7 +1,9 @@
 import type { Metadata } from 'next';
+import { notFound } from 'next/navigation';
 
 import { AgenticGate } from '@/components/agentic-gate';
 import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
 
 export const metadata: Metadata = {
   title: 'Agentic trace detail | InferenceX',
@@ -14,9 +16,14 @@ export default async function AgenticPointDetailPage({
   params: Promise<{ id: string }>;
 }) {
   const { id } = await params;
+  const numericId = Number(id);
+  // benchmark_results.id is a positive bigserial — anything else (`/agentic/abc`,
+  // `/agentic/0`, `/agentic/-1`) can never resolve, so 404 instead of rendering a
+  // blank detail shell that fires doomed id-keyed fetches.
+  if (!isPersistedBenchmarkId(numericId)) notFound();
   return (
     <AgenticGate>
-      <AgenticPointDetail id={Number(id)} />
+      <AgenticPointDetail id={numericId} />
     </AgenticGate>
   );
 }
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts
new file mode 100644
index 00000000..bc374e72
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts
@@ -0,0 +1,71 @@
+import { describe, expect, it, vi, beforeEach } from 'vitest';
+
+const { mockGetConversation, mockGetDb } = vi.hoisted(() => ({
+  mockGetConversation: vi.fn(),
+  mockGetDb: vi.fn(() => 'mock-sql'),
+}));
+
+vi.mock('@semianalysisai/inferencex-db/connection', () => ({
+  getDb: mockGetDb,
+  JSON_MODE: false,
+  FIXTURES_MODE: false,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({
+  getConversation: mockGetConversation,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({
+  getConversation: vi.fn(),
+}));
+
+vi.mock('@/lib/api-cache', () => ({
+  cachedQuery: (fn: (...args: any[]) => any) => fn,
+  cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { GET } from './route';
+import { NextRequest } from 'next/server';
+
+function req(): NextRequest {
+  return new NextRequest(new URL('http://localhost/api/v1/datasets/ds/conversations/x'));
+}
+
+/**
+ * App Router decodes each dynamic route segment EXACTLY ONCE before handing it to
+ * the handler, so `params.convId` is already the raw conversation id. These tests
+ * pin the route's contract: it must pass that value straight to the query with NO
+ * further decodeURIComponent (which would over-decode, mis-key '%'/'/' ids, or
+ * throw on a lone '%'). The client (useDatasetConversation) encodeURIComponent's
+ * the id before the fetch, so the whole pipeline decodes once end-to-end.
+ */
+beforeEach(() => {
+  vi.clearAllMocks();
+  mockGetConversation.mockResolvedValue({ conv_id: 'x', turns: [] });
+});
+
+describe('GET /api/v1/datasets/[slug]/conversations/[convId] — decode exactly once', () => {
+  it('passes the already-decoded convId straight through (no second decode)', async () => {
+    const params = Promise.resolve({ slug: 'ds', convId: 'a/b%c' });
+    const res = await GET(req(), { params });
+    expect(res.status).toBe(200);
+    // 'a/b%c' contains a lone '%'; a second decodeURIComponent here would THROW
+    // (→ 500). Passing through means the query sees the raw id verbatim.
+    expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', 'a/b%c');
+  });
+
+  it('preserves special characters (% / # ?) exactly as decoded by App Router', async () => {
+    const raw = 'conv/50%_a#b?c';
+    const params = Promise.resolve({ slug: 'ds', convId: raw });
+    const res = await GET(req(), { params });
+    expect(res.status).toBe(200);
+    expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', raw);
+  });
+
+  it('returns 404 when the conversation is not found', async () => {
+    mockGetConversation.mockResolvedValueOnce(null);
+    const params = Promise.resolve({ slug: 'ds', convId: 'missing' });
+    const res = await GET(req(), { params });
+    expect(res.status).toBe(404);
+  });
+});
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
index 61672759..35f2fddf 100644
--- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
@@ -26,7 +26,11 @@ export async function GET(
 ) {
   const { slug, convId } = await params;
   try {
-    const data = await getCachedConversation(slug, decodeURIComponent(convId));
+    // App Router has already decoded the `[convId]` segment exactly once, so
+    // `convId` is the raw conversation id. The client (useDatasetConversation)
+    // encodeURIComponent-encodes it before the fetch; decoding again here would
+    // over-decode and mis-key ids containing '%' / '/'. Decode exactly once.
+    const data = await getCachedConversation(slug, convId);
     if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
     return cachedJson(data);
   } catch (error) {
diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
index 732b9ad1..5bc8fea9 100644
--- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -11,25 +11,32 @@ interface Props {
 
 export async function generateMetadata({ params }: Props): Promise<Metadata> {
   const { slug, convId } = await params;
+  // App Router has already decoded the dynamic segment exactly once, so `convId`
+  // is the raw conversation id here. Re-encode for the canonical URL.
   const short = convId.slice(0, 12);
   const title = `Conversation ${short} | ${slug}`;
   const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`;
   return {
     title,
     description,
-    alternates: { canonical: `${SITE_URL}/datasets/${slug}/conversations/${convId}` },
+    alternates: {
+      canonical: `${SITE_URL}/datasets/${slug}/conversations/${encodeURIComponent(convId)}`,
+    },
     robots: { index: false }, // per-conversation pages are too numerous to index
   };
 }
 
 export default async function ConversationPage({ params }: Props) {
   const { slug, convId } = await params;
+  // `convId` is already decoded once by App Router — pass it straight through.
+  // A second decodeURIComponent here would over-decode (and throw for ids that
+  // contain a literal '%'). ConversationView re-encodes when it builds the API URL.
   return (
     <AgenticGate>
       <main className="relative">
         <div className="container mx-auto px-4 pb-8 lg:px-8">
           <Suspense>
-            <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+            <ConversationView slug={slug} convId={convId} />
           </Suspense>
         </div>
       </main>
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index fd6a42ae..8bd10c71 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -21,17 +21,6 @@ function isEnumValue<T extends Record<string, string>>(e: T, v: string): v is T[
   return (Object.values(e) as string[]).includes(v);
 }
 
-const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u;
-const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u;
-
-// Placeholder for the public (non-null) `effectiveSequence` during the window
-// before availability has loaded. It must be a fixed-seq scenario — never
-// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a
-// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is
-// the pre-agentic default for non-agentic models. Consumers that must not act on
-// an unresolved sequence gate on `sequenceResolved` instead.
-const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK;
-
 import { useAvailability } from '@/hooks/api/use-availability';
 import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
 import { useUrlState } from '@/hooks/useUrlState';
@@ -50,6 +39,18 @@ import { resolveEffectiveSequence } from '@/lib/default-sequence';
 import { useFeatureGate } from '@/lib/use-feature-gate';
 import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api';
 
+const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u;
+const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u;
+
+// Placeholder for the public (non-null) `effectiveSequence` during the window
+// before availability has loaded. It must be a fixed-seq scenario — never
+// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a
+// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is
+// the pre-agentic default for non-agentic models. Consumers that must not act on
+// an unresolved sequence gate on `sequenceResolved` instead.
+// (Declared after the import block so it never references `Sequence` above its import.)
+const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK;
+
 interface RunInfo {
   runId: string;
   runDate: string;
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index ccf0a944..609a4c8f 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -250,7 +250,7 @@ export function DatasetDetail({ slug }: { slug: string }) {
                   >
                     <td className="px-3 py-2">
                       <Link
-                        href={`/datasets/${slug}/conversations/${c.conv_id}`}
+                        href={`/datasets/${slug}/conversations/${encodeURIComponent(c.conv_id)}`}
                         onClick={() => track('datasets_conversation_clicked', { slug })}
                         className="font-mono text-xs text-primary hover:underline"
                       >
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 1786c74d..18cb76d5 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -7,7 +7,7 @@ import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-reques
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
 import { track } from '@/lib/analytics';
 
-import { requestsForPhase } from './phase-slice';
+import { sliceTimelineByPhase } from './phase-slice';
 import { TimelineBars } from './timeline-bars';
 import { formatDuration } from './timeline-format';
 import {
@@ -158,11 +158,16 @@ export function RequestTimelineView({
     [data.requests],
   );
 
-  // Apply phase filter, then group into rows. With no warmup data the filter
-  // collapses to "profiling" regardless of the (hidden) toggle state.
+  // Apply phase filter, then group into rows. Uses the SAME time-boundary
+  // slicing as the per-point charts (sliceTimelineByPhase) rather than the
+  // per-request phase LABEL, so the Gantt and the charts agree on exactly which
+  // requests belong to each phase (they diverge only when a warmup-labelled
+  // request starts after the first profiling request). With no warmup data the
+  // boundary is null and this is an identity passthrough — the filter collapses
+  // to "profiling" regardless of the (hidden) toggle state.
   const filtered = useMemo(
-    () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'),
-    [data.requests, phaseFilter, hasWarmup],
+    () => sliceTimelineByPhase(data, hasWarmup ? phaseFilter : 'profiling').requests,
+    [data, phaseFilter, hasWarmup],
   );
   // Stable order/color per conversation (or worker), computed over the FULL
   // request set — NOT the phase-filtered subset — so a row keeps its position
diff --git a/packages/app/src/components/inference/hooks/useChartData.test.ts b/packages/app/src/components/inference/hooks/useChartData.test.ts
index 73582998..c4998add 100644
--- a/packages/app/src/components/inference/hooks/useChartData.test.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.test.ts
@@ -1,6 +1,83 @@
 import { describe, it, expect } from 'vitest';
 
-import { buildComparisonDates, filterByGPU, flipRooflineDirection } from './useChartData';
+import {
+  buildComparisonDates,
+  dedupeRowsToLatestPerConfig,
+  filterByGPU,
+  flipRooflineDirection,
+} from './useChartData';
+
+interface DedupeInput {
+  id: number;
+  hardware: string;
+  framework: string;
+  spec_method: string;
+  disagg: boolean;
+  precision: string;
+  offload_mode?: string | null;
+  date: string;
+}
+
+const drow = (over: Partial<DedupeInput> = {}): DedupeInput => ({
+  id: 1,
+  hardware: 'b300',
+  framework: 'vllm',
+  spec_method: 'none',
+  disagg: false,
+  precision: 'fp4',
+  offload_mode: 'off',
+  date: '2026-06-01',
+  ...over,
+});
+
+describe('dedupeRowsToLatestPerConfig', () => {
+  it('keeps only the latest date within a single series', () => {
+    const rows = [
+      drow({ id: 1, date: '2026-06-01' }),
+      drow({ id: 2, date: '2026-06-03' }),
+      drow({ id: 3, date: '2026-06-02' }),
+    ];
+    expect(dedupeRowsToLatestPerConfig(rows).map((r) => r.id)).toEqual([2]);
+  });
+
+  it('keeps BOTH offload variants even when they were ingested on different dates', () => {
+    // The regression: offload=on sweep landed LATER than offload=off. Without
+    // offload in the key, the on-variant's newer date would win the shared group
+    // and silently drop the (older) off-variant series entirely.
+    const rows = [
+      drow({ id: 1, offload_mode: 'off', date: '2026-06-01' }),
+      drow({ id: 2, offload_mode: 'on', date: '2026-06-05' }),
+    ];
+    const kept = dedupeRowsToLatestPerConfig(rows)
+      .map((r) => r.offload_mode)
+      .toSorted();
+    expect(kept).toEqual(['off', 'on']);
+  });
+
+  it('still dedupes each offload variant to its own latest date', () => {
+    const rows = [
+      drow({ id: 1, offload_mode: 'off', date: '2026-06-01' }),
+      drow({ id: 2, offload_mode: 'off', date: '2026-06-04' }),
+      drow({ id: 3, offload_mode: 'on', date: '2026-06-02' }),
+      drow({ id: 4, offload_mode: 'on', date: '2026-06-05' }),
+    ];
+    expect(
+      dedupeRowsToLatestPerConfig(rows)
+        .map((r) => r.id)
+        .toSorted(),
+    ).toEqual([2, 4]);
+  });
+
+  it('normalizes a missing offload_mode to "off" (matches the SQL lineKey)', () => {
+    // A row with no offload_mode collides with an explicit offload=off row of the
+    // same config — both are the "off" series, so latest-date dedup applies.
+    const rows = [
+      drow({ id: 1, offload_mode: undefined, date: '2026-06-01' }),
+      drow({ id: 2, offload_mode: 'off', date: '2026-06-03' }),
+    ];
+    expect(dedupeRowsToLatestPerConfig(rows).map((r) => r.id)).toEqual([2]);
+  });
+});
 
 describe('buildComparisonDates', () => {
   it('returns empty when no GPUs selected (comparison disabled)', () => {
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 183641d4..f6596656 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -29,6 +29,7 @@ import {
   withPercentile,
 } from '@/lib/benchmark-transform';
 import { Sequence, type Model } from '@/lib/data-mappings';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils';
 import {
@@ -116,7 +117,7 @@ function e2eParetoIds(
   const ids = new Set<number>();
   for (const bucket of byGroup.values()) {
     for (const f of frontierFn(bucket)) {
-      if (typeof f.id === 'number') ids.add(f.id);
+      if (isPersistedBenchmarkId(f.id)) ids.add(f.id);
     }
   }
   return ids;
@@ -166,6 +167,42 @@ export function flipRooflineDirection(dir: RooflineDirection): RooflineDirection
   return FLIP_MAP[dir];
 }
 
+/** The dedup key fields a chart series is identified by. */
+interface DedupeRow {
+  hardware: string;
+  framework: string;
+  spec_method: string;
+  disagg: boolean;
+  precision: string;
+  offload_mode?: string | null;
+  date: string;
+}
+
+// offload_mode normalized `?? 'off'` to match the SQL layer's getBenchmarksForRun
+// lineKey — agentic offload=on and offload=off are distinct series.
+const dedupeSeriesKey = (r: DedupeRow): string =>
+  `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}|${r.offload_mode ?? 'off'}`;
+
+/**
+ * For each series — (hardware, framework, spec_method, disagg, precision,
+ * offload_mode) — keep only the rows from that series' most recent date. When
+ * parallelism settings change between runs, old config_ids create stale points
+ * under the same legend line; dropping all-but-latest removes them.
+ *
+ * Without `offload_mode` in the key, an offload=on sweep ingested on a LATER date
+ * than the offload=off sweep would win the shared group and silently drop the
+ * (earlier-dated) offload=off variant — a data-loss regression.
+ */
+export function dedupeRowsToLatestPerConfig<T extends DedupeRow>(rows: T[]): T[] {
+  const maxDatePerGroup = new Map<string, string>();
+  for (const r of rows) {
+    const k = dedupeSeriesKey(r);
+    const cur = maxDatePerGroup.get(k);
+    if (!cur || r.date > cur) maxDatePerGroup.set(k, r.date);
+  }
+  return rows.filter((r) => r.date === maxDatePerGroup.get(dedupeSeriesKey(r)));
+}
+
 export function useChartData(
   selectedModel: Model,
   selectedSequence: Sequence,
@@ -292,19 +329,10 @@ export function useChartData(
       rowToSequence(r) === selectedSequence;
     const seqFiltered = allRows.filter(seqFilter);
 
-    // For each (hw, framework, spec_method, disagg, precision) group, keep only
-    // rows from the most recent date. When parallelism settings change between runs,
-    // old config_ids create stale data points under the same legend line — drop them.
-    const maxDatePerGroup = new Map<string, string>();
-    for (const r of seqFiltered) {
-      const key = `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}`;
-      const cur = maxDatePerGroup.get(key);
-      if (!cur || r.date > cur) maxDatePerGroup.set(key, r.date);
-    }
-    const deduped = seqFiltered.filter((r) => {
-      const key = `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}`;
-      return r.date === maxDatePerGroup.get(key);
-    });
+    // Keep only each series' latest-date rows (drops stale config_ids left behind
+    // when parallelism settings change between runs). Keyed per offload variant so
+    // an offload=on sweep can't hide a differently-dated offload=off series.
+    const deduped = dedupeRowsToLatestPerConfig(seqFiltered);
 
     const mainRows = deduped.map((r) =>
       selectedRunDate ? { ...r, date: selectedRunDate, actualDate: r.date } : r,
@@ -561,7 +589,7 @@ export function useChartData(
                 const isOnE2eFrontier =
                   e2eParetoSet === null
                     ? undefined
-                    : typeof d.id === 'number' && e2eParetoSet.has(d.id);
+                    : isPersistedBenchmarkId(d.id) && e2eParetoSet.has(d.id);
                 return {
                   ...d,
                   x: xValue,
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 6952f439..d6c86529 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -61,6 +61,7 @@ import {
   type DerivedAgenticMetric,
 } from '@/hooks/api/use-derived-agentic-metrics';
 import { isAgenticOnlyXAxisMode, type XAxisMode } from '@/components/inference/hooks/useChartData';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
 import { getHardwareConfig, hardwareKeyMatchesAnyBase } from '@/lib/constants';
 
@@ -428,7 +429,9 @@ export default function ChartDisplay() {
     const ids = new Set<number>();
     for (const graph of visibleGraphs) {
       for (const point of graph.data) {
-        if (point.benchmark_type === 'agentic_traces' && typeof point.id === 'number') {
+        // Overlay-only agentic points carry no persisted id — skip them so we
+        // never request `?ids=0`/`?ids=NaN` (which 400s and errors the chart).
+        if (point.benchmark_type === 'agentic_traces' && isPersistedBenchmarkId(point.id)) {
           ids.add(point.id);
         }
       }
@@ -461,7 +464,7 @@ export default function ChartDisplay() {
       };
       const data = graph.data
         .map((point) => {
-          if (typeof point.id !== 'number') return null;
+          if (!isPersistedBenchmarkId(point.id)) return null;
           const raw = derivedSpec.value(derivedMetrics[point.id], selectedPercentile);
           if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
           return { ...point, x: derivedSpec.toX(raw) };
diff --git a/packages/app/src/components/inference/utils/legend-points-table.test.ts b/packages/app/src/components/inference/utils/legend-points-table.test.ts
index b29cecbb..86d6f8b3 100644
--- a/packages/app/src/components/inference/utils/legend-points-table.test.ts
+++ b/packages/app/src/components/inference/utils/legend-points-table.test.ts
@@ -75,6 +75,15 @@ describe('pointDetailHref', () => {
     expect(pointDetailHref(pt(), false)).toEqual({ href: null, isExternal: false });
   });
 
+  it('does not build an /agentic/<id> link for a non-persisted id (0 / NaN)', () => {
+    // `typeof id === 'number'` accepted these; isPersistedBenchmarkId rejects
+    // them so we never link to /inference/agentic/0 or /inference/agentic/NaN.
+    for (const badId of [0, Number.NaN]) {
+      const d = pt({ benchmark_type: 'agentic_traces', id: badId });
+      expect(pointDetailHref(d, false)).toEqual({ href: null, isExternal: false });
+    }
+  });
+
   it('overlay points never get a link (no DB benchmark id)', () => {
     const d = pt({
       benchmark_type: 'agentic_traces',
diff --git a/packages/app/src/components/inference/utils/legend-points-table.ts b/packages/app/src/components/inference/utils/legend-points-table.ts
index 0457e7c2..87df2fcf 100644
--- a/packages/app/src/components/inference/utils/legend-points-table.ts
+++ b/packages/app/src/components/inference/utils/legend-points-table.ts
@@ -1,4 +1,5 @@
 import { updateRepoUrl } from '@/lib/utils';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
 
 import type { InferenceData } from '@/components/inference/types';
 import { fmt, getPointLabel } from '@/components/inference/utils/tooltipUtils';
@@ -56,7 +57,7 @@ export function pointDetailHref(
   isOverlay: boolean,
 ): { href: string | null; isExternal: boolean } {
   if (isOverlay) return { href: null, isExternal: false };
-  if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') {
+  if (d.benchmark_type === 'agentic_traces' && isPersistedBenchmarkId(d.id)) {
     return { href: `/inference/agentic/${d.id}`, isExternal: false };
   }
   if (d.run_url) return { href: updateRepoUrl(d.run_url), isExternal: true };
diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
index e4b9d31f..8755fbe7 100644
--- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts
+++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
@@ -159,6 +159,17 @@ describe('generateTooltipContent', () => {
     expect(html).not.toContain('data-action="view-charts" target=');
   });
 
+  it('omits View charts when the point id is non-persisted (0 / NaN), even if pinned + hasTrace', () => {
+    // Overlay agentic points arrive with id 0 / NaN — the button would otherwise
+    // link to /inference/agentic/0, a doomed lookup.
+    for (const badId of [0, Number.NaN]) {
+      const html = generateTooltipContent(
+        tooltipConfig({ data: pt({ id: badId }), isPinned: true, hasTrace: true }),
+      );
+      expect(html).not.toContain('data-action="view-charts"');
+    }
+  });
+
   it('includes hardware display label from config', () => {
     const html = generateTooltipContent(tooltipConfig());
     expect(html).toContain('H100');
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 8f8ab4df..84398397 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -1,4 +1,5 @@
 import { formatNumber, getDisplayLabel } from '@/lib/utils';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
 
 import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types';
 import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
@@ -142,7 +143,7 @@ const viewChartsButtonHTML = (
   hasTraceData: boolean,
   pointId: number | undefined,
 ): string => {
-  if (!isPinned || !hasTraceData || typeof pointId !== 'number') return '';
+  if (!isPinned || !hasTraceData || !isPersistedBenchmarkId(pointId)) return '';
   return `<a data-action="view-charts" href="/inference/agentic/${pointId}" style="
     display: block; margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
     border: 1px solid var(--border); border-radius: 6px; cursor: pointer;
diff --git a/packages/app/src/lib/benchmark-id.test.ts b/packages/app/src/lib/benchmark-id.test.ts
new file mode 100644
index 00000000..0f9fb83b
--- /dev/null
+++ b/packages/app/src/lib/benchmark-id.test.ts
@@ -0,0 +1,33 @@
+import { describe, expect, it } from 'vitest';
+
+import { isPersistedBenchmarkId } from './benchmark-id';
+
+describe('isPersistedBenchmarkId', () => {
+  it('accepts a positive integer (a real bigserial row id)', () => {
+    expect(isPersistedBenchmarkId(1)).toBe(true);
+    expect(isPersistedBenchmarkId(206863)).toBe(true);
+  });
+
+  it('rejects 0 — bigserial starts at 1, so 0 is never a real row', () => {
+    expect(isPersistedBenchmarkId(0)).toBe(false);
+  });
+
+  it('rejects negatives', () => {
+    expect(isPersistedBenchmarkId(-1)).toBe(false);
+  });
+
+  it('rejects NaN (what Number(undefined) yields for overlay rows)', () => {
+    expect(isPersistedBenchmarkId(Number(undefined))).toBe(false);
+    expect(isPersistedBenchmarkId(NaN)).toBe(false);
+  });
+
+  it('rejects non-integers', () => {
+    expect(isPersistedBenchmarkId(1.5)).toBe(false);
+    expect(isPersistedBenchmarkId(Infinity)).toBe(false);
+  });
+
+  it('rejects null / undefined', () => {
+    expect(isPersistedBenchmarkId(null)).toBe(false);
+    expect(isPersistedBenchmarkId(undefined)).toBe(false);
+  });
+});
diff --git a/packages/app/src/lib/benchmark-id.ts b/packages/app/src/lib/benchmark-id.ts
new file mode 100644
index 00000000..b1ccb8bc
--- /dev/null
+++ b/packages/app/src/lib/benchmark-id.ts
@@ -0,0 +1,20 @@
+/**
+ * Shared guard for `benchmark_results.id` values.
+ *
+ * `benchmark_results.id` is a Postgres bigserial that starts at 1, so a real
+ * persisted row always has a positive integer id. Overlay / `?unofficialrun=`
+ * points are transformed live from raw artifacts and never carry a DB id — the
+ * transform yields `undefined` (older code produced `NaN` via `Number(undefined)`).
+ *
+ * A bare `typeof id === 'number'` check is NOT enough: `NaN` and `0` are both
+ * `number` yet neither is a real row. Passing them to the id-keyed endpoints
+ * (`/api/v1/derived-agentic-metrics?ids=…`, `…?id=…`) yields a 400 (the routes
+ * filter to `Number.isFinite(n) && n > 0`), and building an
+ * `/inference/agentic/<id>` link out of one points at a non-existent row.
+ *
+ * Use this predicate at every site that collects ids for a fetch or builds a
+ * per-point detail link so overlay-only views skip cleanly instead of erroring.
+ */
+export function isPersistedBenchmarkId(id: number | null | undefined): id is number {
+  return typeof id === 'number' && Number.isInteger(id) && id > 0;
+}
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 648ebaae..c08137e6 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -885,4 +885,81 @@ describe('rowToAggDataEntry — agentic interactivity invariant', () => {
     const entry = rowToAggDataEntry(makeRow({ metrics: { p90_itl: 0.05, p90_intvty: 999 } }));
     expect(entry.p90_intvty).toBe(999);
   });
+
+  it('DROPS a stale artifact *_intvty when the matching *_itl is absent (overlay mirror of the ETL fix)', () => {
+    // Artifact carries intvty (possibly the drifted p(1/ITL) definition) but no
+    // itl for that percentile — the value can't be reconciled to 1/p(ITL), so it
+    // must be discarded, not passed through. rowToAggDataEntry then coerces the
+    // now-missing key to 0.
+    const entry = agentic({ p90_intvty: 42, p95_itl: 0.2 });
+    expect(entry.p90_intvty).toBe(0); // dropped → default 0
+    expect(entry.p95_intvty).toBeCloseTo(5, 6); // derived from itl
+  });
+
+  it('DROPS a stale artifact *_intvty when the matching *_itl is zero/invalid', () => {
+    const entry = agentic({ p90_itl: 0, p90_intvty: 42 });
+    expect(entry.p90_intvty).toBe(0);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// rowToAggDataEntry — persisted-id guard (overlay rows carry no DB id)
+// ---------------------------------------------------------------------------
+describe('rowToAggDataEntry — id coercion', () => {
+  it('coerces a stringified bigint id to a number', () => {
+    const entry = rowToAggDataEntry(makeRow({ id: '206863' as unknown as number }));
+    expect(entry.id).toBe(206863);
+  });
+
+  it('yields undefined (not NaN) for a missing id — overlay rows have no persisted id', () => {
+    const entry = rowToAggDataEntry(makeRow({ id: undefined as unknown as number }));
+    expect(entry.id).toBeUndefined();
+  });
+
+  it('yields undefined for a non-positive or non-numeric id', () => {
+    expect(rowToAggDataEntry(makeRow({ id: 0 })).id).toBeUndefined();
+    expect(rowToAggDataEntry(makeRow({ id: 'abc' as unknown as number })).id).toBeUndefined();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// mergeRunScopedRows — offload-aware scoping (data-loss guard)
+// ---------------------------------------------------------------------------
+describe('mergeRunScopedRows — offload variants are distinct series', () => {
+  const agenticRow = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({
+      model: 'dsr1',
+      hardware: 'b300',
+      framework: 'vllm',
+      precision: 'fp4',
+      benchmark_type: 'agentic_traces',
+      isl: null,
+      osl: null,
+      ...over,
+    });
+
+  it('a run row for offload=on does NOT claim/suppress the base offload=off rows', () => {
+    // The selected run produced only the offload=on variant. The offload=off base
+    // rows are a separate series and must carry forward, not vanish.
+    const runRows = [agenticRow({ id: 10, offload_mode: 'on' })];
+    const baseRows = [
+      agenticRow({ id: 90, offload_mode: 'on' }), // same series as the run → replaced
+      agenticRow({ id: 91, offload_mode: 'off' }), // distinct series → kept
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]);
+  });
+
+  it('a run covering both offload variants pins both', () => {
+    const runRows = [
+      agenticRow({ id: 10, offload_mode: 'on' }),
+      agenticRow({ id: 11, offload_mode: 'off' }),
+    ];
+    const baseRows = [
+      agenticRow({ id: 90, offload_mode: 'on' }),
+      agenticRow({ id: 91, offload_mode: 'off' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]);
+  });
 });
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index df1d328e..943da81b 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -13,6 +13,7 @@ import type {
 } from '@/components/inference/types';
 import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
 import { getHardwareConfig } from '@/lib/constants';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
 import type { BenchmarkRow } from '@/lib/api';
 
 /**
@@ -22,30 +23,32 @@ import type { BenchmarkRow } from '@/lib/api';
  *   tpot   ≡ itl    (time-per-output-token == inter-token-latency for single-output)
  *   intvty ≡ 1/itl  (tok/s from the user's perspective)
  *
- * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS derived from
- * itl, overriding any artifact-supplied value: the harness definition of
- * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile
- * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. This
- * matches the ingest mapper for official rows; doing it
- * here keeps overlay / `?unofficialrun=` rows (transformed live from raw
- * artifacts, never through the DB) on the same definition.
+ * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS 1/itl:
+ * derived where itl is valid, overriding any artifact-supplied value, AND any
+ * artifact `*_intvty` is DROPPED where itl is absent/zero/invalid rather than
+ * passed through. The harness definition of `*_intvty` has drifted (some versions
+ * emit `p(1/ITL)`, which inverts percentile order), so for a slow-tail selector
+ * interactivity must be `1/p(ITL)`. This matches the ingest mapper for official
+ * rows; doing it here keeps overlay / `?unofficialrun=` rows (transformed live
+ * from raw artifacts, never through the DB) on the same single definition.
  */
-function agenticAliases(m: Record<string, number>): Record<string, number> {
-  const out: Record<string, number> = {};
+function applyAgenticMetricAliases(raw: Record<string, number>): Record<string, number> {
+  const m: Record<string, number> = { ...raw };
   for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
-    const itl = m[`${suffix}_itl`];
-    const ttlt = m[`${suffix}_ttlt`];
-    if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
-    if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl;
-    if (itl !== undefined && itl > 0) out[`${suffix}_intvty`] = 1 / itl;
+    const itl = raw[`${suffix}_itl`];
+    const ttlt = raw[`${suffix}_ttlt`];
+    if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) m[`${suffix}_e2el`] = ttlt;
+    if (m[`${suffix}_tpot`] === undefined && itl !== undefined) m[`${suffix}_tpot`] = itl;
+    if (typeof itl === 'number' && itl > 0) m[`${suffix}_intvty`] = 1 / itl;
+    else delete m[`${suffix}_intvty`];
   }
-  return out;
+  return m;
 }
 
 /** Convert a DB benchmark row to an AggDataEntry. */
 export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
   const isAgentic = row.benchmark_type === 'agentic_traces';
-  const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics;
+  const m = isAgentic ? applyAgenticMetricAliases(row.metrics) : row.metrics;
   // Prefer the dedicated column (added in migration 004); fall back to the
   // legacy stash inside `metrics` for any rows ingested before that column
   // existed.
@@ -53,9 +56,14 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
   const offloadMode =
     row.offload_mode ??
     (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
+  // Postgres bigint comes through the SQL client as a string; coerce it. Overlay
+  // rows (transformed live from raw artifacts) carry no id, so `Number(undefined)`
+  // is NaN — collapse any non-persisted value to undefined so downstream link /
+  // fetch sites (guarded by isPersistedBenchmarkId) skip it cleanly rather than
+  // emitting `?ids=NaN` or an `/inference/agentic/NaN` link.
+  const numericId = typeof row.id === 'number' ? row.id : Number(row.id);
   return {
-    // Coerce: Postgres bigint comes through the SQL client as a string.
-    id: typeof row.id === 'number' ? row.id : Number(row.id),
+    id: isPersistedBenchmarkId(numericId) ? numericId : undefined,
     hw: row.hardware,
     framework: row.framework,
     model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
@@ -178,10 +186,13 @@ export function withPercentile(key: string, percentile: string): string {
 }
 
 // Replacement granularity for single-run scoping: the changelog config_key
-// tuple (model-precision-hardware-framework) plus benchmark_type, so an
-// agentic-only run never hides the same config's fixed-seq carry-forward.
+// tuple (model-precision-hardware-framework) plus benchmark_type AND offload_mode.
+// benchmark_type keeps an agentic-only run from hiding the same config's
+// fixed-seq carry-forward; offload_mode keeps a run that produced only one
+// offload variant (e.g. offload=on) from claiming — and thereby suppressing —
+// the other variant's (offload=off) base rows, which are a distinct series.
 const runScopeKey = (r: BenchmarkRow): string =>
-  `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`;
+  `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}|${r.offload_mode ?? 'off'}`;
 
 /**
  * Merge run-scoped benchmark rows with the normal latest-per-config rows.
diff --git a/packages/app/src/lib/url-state.test.ts b/packages/app/src/lib/url-state.test.ts
index e34b32b4..fe26072f 100644
--- a/packages/app/src/lib/url-state.test.ts
+++ b/packages/app/src/lib/url-state.test.ts
@@ -30,9 +30,13 @@ describe('PARAM_DEFAULTS', () => {
     expect(PARAM_DEFAULTS.g_model).toBe('DeepSeek-V4-Pro');
   });
 
-  it('has expected default for i_seq', async () => {
+  it('has an EMPTY default for i_seq so the selected scenario is always written', async () => {
+    // The UI default scenario (gate-unlocked) is AgenticTraces, not 8k/1k. An
+    // '8k/1k' default would strip an explicit 8K/1K selection from the URL, which
+    // then resolves back to the agentic default on reload/share. Empty means no
+    // scenario value ever matches the default, so it's always persisted.
     const { PARAM_DEFAULTS } = await import('@/lib/url-state');
-    expect(PARAM_DEFAULTS.i_seq).toBe('8k/1k');
+    expect(PARAM_DEFAULTS.i_seq).toBe('');
   });
 
   it('has expected default for r_range', async () => {
@@ -182,6 +186,28 @@ describe('writeUrlParams + buildShareUrl', () => {
     expect(url).not.toContain('g_model');
   });
 
+  it('keeps an explicit i_seq=8k/1k in the share URL (no longer stripped as a default)', async () => {
+    setupWindow('', '/inference');
+    const { writeUrlParams, buildShareUrl } = await import('@/lib/url-state');
+
+    // Picking the fixed-seq scenario must survive into the share URL; before the
+    // fix this matched the '8k/1k' default and was dropped, reverting to agentic.
+    writeUrlParams({ i_seq: '8k/1k' });
+    await vi.advanceTimersByTimeAsync(200);
+
+    expect(buildShareUrl()).toContain('i_seq=8k%2F1k');
+  });
+
+  it('still strips i_seq when it is empty (the no-selection case)', async () => {
+    setupWindow('', '/inference');
+    const { writeUrlParams, buildShareUrl } = await import('@/lib/url-state');
+
+    writeUrlParams({ i_seq: '' });
+    await vi.advanceTimersByTimeAsync(200);
+
+    expect(buildShareUrl()).not.toContain('i_seq');
+  });
+
   it('batches multiple params in a single debounce window', async () => {
     setupWindow('', '/inference');
     const { writeUrlParams, buildShareUrl } = await import('@/lib/url-state');
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 1c8cab81..3671b6b8 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -74,7 +74,13 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   g_model: 'DeepSeek-V4-Pro',
   g_rundate: '',
   g_runid: '',
-  i_seq: '8k/1k',
+  // No strippable default: the UI default scenario (gate-unlocked) is
+  // AgenticTraces, not 8k/1k, so an '8k/1k' default here would strip an explicit
+  // 8K/1K selection from the URL — on reload the empty i_seq resolves back to the
+  // agentic default. Empty means the resolved scenario is ALWAYS written
+  // explicitly (effectiveSequence is never ''), so a shared/reloaded link keeps
+  // whatever the user picked. The no-param case still resolves via availability.
+  i_seq: '',
   // No strippable default: precision is only written to the URL once chosen
   // explicitly, so an explicit FP4 selection must survive (not be stripped as a
   // "default") or it would silently revert to the per-model auto default on reload.
diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts
index bb286734..69598039 100644
--- a/packages/db/src/etl/benchmark-mapper.test.ts
+++ b/packages/db/src/etl/benchmark-mapper.test.ts
@@ -613,6 +613,22 @@ describe('mapBenchmarkRow — agentic interactivity normalization', () => {
     const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker);
     expect(result!.metrics.p90_intvty).toBe(999);
   });
+
+  it('DELETES a stale artifact *_intvty when the matching *_itl is absent', () => {
+    // Artifact ships intvty (possibly the drifted p(1/ITL) definition) but no itl
+    // for that percentile. Passing it through would mix harness semantics into a
+    // column meant to be 1/p(ITL) everywhere — so the key must be removed, not kept.
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeAgenticRow({ p90_intvty: 42, p95_itl: 0.2 }), tracker);
+    expect(result!.metrics).not.toHaveProperty('p90_intvty'); // stale → deleted
+    expect(result!.metrics.p95_intvty).toBeCloseTo(5, 6); // derived from itl
+  });
+
+  it('DELETES a stale artifact *_intvty when the matching *_itl is zero/invalid', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0, p90_intvty: 42 }), tracker);
+    expect(result!.metrics).not.toHaveProperty('p90_intvty');
+  });
 });
 
 /**
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index e3fb148e..5b00618a 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -255,10 +255,20 @@ export function mapBenchmarkRow(
   // keeping every agentic row on one definition. `std` is excluded — the
   // reciprocal of a standard deviation is meaningless. Mirrored in the frontend
   // overlay path (agenticAliases).
+  //
+  // When `*_itl` is absent/zero/invalid we must DELETE any artifact-supplied
+  // `*_intvty` rather than let it survive: keeping it would mix the harness's
+  // (possibly `p(1/ITL)`) definition into a column that's meant to be `1/p(ITL)`
+  // everywhere else. Downstream reads a missing key as "not recorded"
+  // (rowToAggDataEntry coerces `?? 0`; the legend table renders a dash).
   if (isAgentic) {
     for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
       const itl = metrics[`${k}_itl`];
-      if (typeof itl === 'number' && itl > 0) metrics[`${k}_intvty`] = 1 / itl;
+      if (typeof itl === 'number' && itl > 0) {
+        metrics[`${k}_intvty`] = 1 / itl;
+      } else {
+        delete metrics[`${k}_intvty`];
+      }
     }
   }
 

From b30dd21f0a095edc98eec7544f756a27ab2cac22 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 3 Jul 2026 01:30:03 -0500
Subject: [PATCH 37/40] style(inference): x-axis mode buttons use the
 SegmentedToggle tab recipe

The mode row (TTFT / E2E Latency / Normalized E2E / Interactivity /
Session Time / Prefill TPS) was hand-rolled chunky pills, visually
disconnected from every other control. It now reuses SegmentedToggle -
the repo's actual tab idiom (10+ call sites incl. the adjacent View
Mode toggle) - sized up via buttonClassName since this is a primary
control. Testids, track() events, tablist/tab aria semantics, and
agentic-only visibility unchanged; net -12 lines. Cypress
ttft-x-axis-toggle 8/8.
---
 .../components/inference/ui/ChartDisplay.tsx  | 50 +++++++------------
 1 file changed, 19 insertions(+), 31 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index d6c86529..37949de9 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -803,41 +803,29 @@ export default function ChartDisplay() {
           <CustomPowers loading={loading} />
         </section>
       )}
-      <section
-        className="flex flex-wrap justify-center gap-3 sm:gap-4"
-        role="tablist"
-        aria-label="Chart x-axis metric"
-        data-testid="x-axis-mode-buttons"
-      >
-        {X_AXIS_MODE_BUTTONS.filter(({ value }) => {
+      <SegmentedToggle
+        value={selectedXAxisMode}
+        options={X_AXIS_MODE_BUTTONS.filter(({ value }) => {
           if (!isAgenticOnlyXAxisMode(value)) return true;
           // Before mount, render all buttons so SSR and first client render match.
           if (!mounted) return true;
           return isAgenticSequence;
-        }).map(({ value, label }) => {
-          const isActive = selectedXAxisMode === value;
-          return (
-            <button
-              key={value}
-              type="button"
-              role="tab"
-              aria-selected={isActive}
-              data-testid={`x-axis-mode-${value}`}
-              onClick={() => {
-                setSelectedXAxisMode(value);
-                track('latency_x_axis_mode_selected', { mode: value });
-              }}
-              className={`min-w-[160px] flex-1 sm:flex-initial rounded-full border-2 px-6 py-3 text-base font-semibold transition-colors ${
-                isActive
-                  ? 'border-primary bg-primary text-primary-foreground shadow-sm'
-                  : 'border-border bg-card text-foreground hover:border-primary/60 hover:bg-accent'
-              }`}
-            >
-              {label}
-            </button>
-          );
-        })}
-      </section>
+        }).map(({ value, label }) => ({
+          value,
+          label,
+          testId: `x-axis-mode-${value}`,
+        }))}
+        onValueChange={(value) => {
+          setSelectedXAxisMode(value);
+          track('latency_x_axis_mode_selected', { mode: value });
+        }}
+        ariaLabel="Chart x-axis metric"
+        testId="x-axis-mode-buttons"
+        className="flex-wrap justify-center gap-1.5 sm:gap-2"
+        buttonClassName="min-w-[130px] sm:min-w-[140px] flex-1 sm:flex-initial justify-center rounded-md px-4 py-2 text-sm font-semibold"
+        activeButtonClassName="bg-muted text-foreground shadow-sm"
+        inactiveButtonClassName="text-muted-foreground hover:bg-muted/50 hover:text-foreground"
+      />
       <div className="flex flex-col gap-4">{displayGraphs}</div>
 
       {/* Performance Over Time — Modal Drill-Down */}

From 87f5dce94235eaf9cf9f3c29ffd2c240e0e23b93 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 3 Jul 2026 01:59:26 -0500
Subject: [PATCH 38/40] style(inference): x-axis mode row as browser-style
 underline tabs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the SegmentedToggle pills with an accent-underline tab strip
(revived the unused Radix-backed ui/tabs.tsx primitive). Active tab
gets the repo's established nav-tab underline token (border-secondary
in light, dark:border-primary), plus a bg-muted/60 active fill so the
minecraft theme — whose global 'button { border: 2px !important }'
override suppresses the underline — still distinguishes the active tab.
Testids, track() events, agentic-only visibility, and Radix a11y
(role=tab/aria-selected/keyboard) all preserved; SegmentedToggle stays
for the adjacent view-mode toggle. cypress 8/8, vitest 2381/2381.
---
 .../components/inference/ui/ChartDisplay.tsx  | 45 +++++++++++--------
 packages/app/src/components/ui/tabs.tsx       | 34 +++++++-------
 2 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 37949de9..7bc30ba9 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -31,6 +31,7 @@ import ScatterGraph from '@/components/inference/ui/ScatterGraph';
 import { Card } from '@/components/ui/card';
 import { ChartButtons } from '@/components/ui/chart-buttons';
 import { type SegmentedToggleOption, SegmentedToggle } from '@/components/ui/segmented-toggle';
+import { Tabs, TabsList, TabsTrigger } from '@/components/ui/tabs';
 import { ChartShareActions, MetricAssumptionNotes } from '@/components/ui/chart-display-helpers';
 import { UnofficialDomainNotice } from '@/components/ui/unofficial-domain-notice';
 import { exportToCsv } from '@/lib/csv-export';
@@ -803,29 +804,35 @@ export default function ChartDisplay() {
           <CustomPowers loading={loading} />
         </section>
       )}
-      <SegmentedToggle
+      <Tabs
         value={selectedXAxisMode}
-        options={X_AXIS_MODE_BUTTONS.filter(({ value }) => {
-          if (!isAgenticOnlyXAxisMode(value)) return true;
-          // Before mount, render all buttons so SSR and first client render match.
-          if (!mounted) return true;
-          return isAgenticSequence;
-        }).map(({ value, label }) => ({
-          value,
-          label,
-          testId: `x-axis-mode-${value}`,
-        }))}
         onValueChange={(value) => {
-          setSelectedXAxisMode(value);
+          setSelectedXAxisMode(value as XAxisMode);
           track('latency_x_axis_mode_selected', { mode: value });
         }}
-        ariaLabel="Chart x-axis metric"
-        testId="x-axis-mode-buttons"
-        className="flex-wrap justify-center gap-1.5 sm:gap-2"
-        buttonClassName="min-w-[130px] sm:min-w-[140px] flex-1 sm:flex-initial justify-center rounded-md px-4 py-2 text-sm font-semibold"
-        activeButtonClassName="bg-muted text-foreground shadow-sm"
-        inactiveButtonClassName="text-muted-foreground hover:bg-muted/50 hover:text-foreground"
-      />
+      >
+        <TabsList
+          aria-label="Chart x-axis metric"
+          data-testid="x-axis-mode-buttons"
+          className="flex-wrap justify-center gap-x-1 gap-y-1.5 sm:gap-x-1.5"
+        >
+          {X_AXIS_MODE_BUTTONS.filter(({ value }) => {
+            if (!isAgenticOnlyXAxisMode(value)) return true;
+            // Before mount, render all buttons so SSR and first client render match.
+            if (!mounted) return true;
+            return isAgenticSequence;
+          }).map(({ value, label }) => (
+            <TabsTrigger
+              key={value}
+              value={value}
+              data-testid={`x-axis-mode-${value}`}
+              className="min-w-[130px] sm:min-w-[140px] flex-1 sm:flex-initial justify-center"
+            >
+              {label}
+            </TabsTrigger>
+          ))}
+        </TabsList>
+      </Tabs>
       <div className="flex flex-col gap-4">{displayGraphs}</div>
 
       {/* Performance Over Time — Modal Drill-Down */}
diff --git a/packages/app/src/components/ui/tabs.tsx b/packages/app/src/components/ui/tabs.tsx
index a54963a8..8b0f7e66 100644
--- a/packages/app/src/components/ui/tabs.tsx
+++ b/packages/app/src/components/ui/tabs.tsx
@@ -17,14 +17,11 @@ function Tabs({ className, ...props }: React.ComponentProps<typeof TabsPrimitive
 
 function TabsList({ className, ...props }: React.ComponentProps<typeof TabsPrimitive.List>) {
   return (
-    <div className={cn('flex flex-col', className)}>
-      <div className="w-full border-t-2 border-brand pb-6" />
-      <TabsPrimitive.List
-        data-slot="tabs-list"
-        className="relative inline-flex p-1 gap-1 items-center justify-center bg-transparent"
-        {...props}
-      />
-    </div>
+    <TabsPrimitive.List
+      data-slot="tabs-list"
+      className={cn('inline-flex flex-wrap items-end gap-1 border-b border-border', className)}
+      {...props}
+    />
   );
 }
 
@@ -34,24 +31,27 @@ function TabsTrigger({ className, ...props }: React.ComponentProps<typeof TabsPr
       data-slot="tabs-trigger"
       className={cn(
         'relative',
+        '-mb-px',
         'inline-flex',
-        'h-10',
         'items-center',
         'justify-center',
         'gap-1.5',
+        'rounded-t-md',
+        'border-b-2',
+        'border-transparent',
         'px-4',
-        'py-2.5',
-        'text-base',
-        'font-medium',
+        'py-2',
+        'text-sm',
+        'font-semibold',
         'whitespace-nowrap',
-        'text-foreground',
-        'hover:text-foreground/80',
+        'text-muted-foreground',
+        'hover:text-foreground',
+        'hover:bg-muted/40',
         'data-[state=active]:text-foreground',
-        'data-[state=active]:border-b-2',
+        'data-[state=active]:bg-muted/60',
         'data-[state=active]:border-secondary',
         'dark:data-[state=active]:border-primary',
-        'dark:hover:text-primary/80',
-        'transition-colors duration-200',
+        'transition-colors duration-150',
         'focus-visible:outline-none',
         'focus-visible:ring-[3px]',
         'focus-visible:ring-ring',

From c96a4c4fe7174865788a6ae5a4395bb423ee44f1 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 3 Jul 2026 02:31:01 -0500
Subject: [PATCH 39/40] style(inference): match x-axis tabs to top section-nav
 recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the card-fill + white active text; adopt tab-nav.tsx's flat
underline-strip recipe verbatim — active = accent text
(text-secondary dark:text-primary) + matching border-b-2 underline, no
background, inactive = muted-foreground with border-only hover. The
x-axis mode row now reads identically to the dashboard's top section
tabs. cypress 8/8, vitest 2381.
---
 packages/app/src/components/ui/tabs.tsx | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/packages/app/src/components/ui/tabs.tsx b/packages/app/src/components/ui/tabs.tsx
index 8b0f7e66..4669e9e1 100644
--- a/packages/app/src/components/ui/tabs.tsx
+++ b/packages/app/src/components/ui/tabs.tsx
@@ -19,24 +19,27 @@ function TabsList({ className, ...props }: React.ComponentProps<typeof TabsPrimi
   return (
     <TabsPrimitive.List
       data-slot="tabs-list"
-      className={cn('inline-flex flex-wrap items-end gap-1 border-b border-border', className)}
+      className={cn('inline-flex flex-wrap items-end gap-1', className)}
       {...props}
     />
   );
 }
 
+// Active/inactive recipe mirrors the top-of-page section nav
+// (data-testid="chart-section-tabs" in src/components/tab-nav.tsx: tabLinkClass +
+// currentTabClass) so the two tab rows read as the same flat underline-strip
+// component: accent text + accent border-b-2 underline when active, muted text
+// with no background fill when inactive, and a faint border highlight on hover.
 function TabsTrigger({ className, ...props }: React.ComponentProps<typeof TabsPrimitive.Trigger>) {
   return (
     <TabsPrimitive.Trigger
       data-slot="tabs-trigger"
       className={cn(
         'relative',
-        '-mb-px',
         'inline-flex',
         'items-center',
         'justify-center',
         'gap-1.5',
-        'rounded-t-md',
         'border-b-2',
         'border-transparent',
         'px-4',
@@ -45,13 +48,12 @@ function TabsTrigger({ className, ...props }: React.ComponentProps<typeof TabsPr
         'font-semibold',
         'whitespace-nowrap',
         'text-muted-foreground',
-        'hover:text-foreground',
-        'hover:bg-muted/40',
-        'data-[state=active]:text-foreground',
-        'data-[state=active]:bg-muted/60',
+        'hover:border-muted-foreground/30',
+        'data-[state=active]:text-secondary',
+        'dark:data-[state=active]:text-primary',
         'data-[state=active]:border-secondary',
         'dark:data-[state=active]:border-primary',
-        'transition-colors duration-150',
+        'transition-colors duration-200',
         'focus-visible:outline-none',
         'focus-visible:ring-[3px]',
         'focus-visible:ring-ring',

From c9c8074a89d98e3b63fda23ed7663567e8028f34 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 3 Jul 2026 04:05:50 -0500
Subject: [PATCH 40/40] style(inference): default high contrast + parallelism
 labels to OFF

Reverts two of this PR's default-flips per product decision: high
contrast (i_hc) and parallelism/advanced labels (i_advlabel) now
default off. InferenceContext drops defaultHighContrast:true and flips
the advlabel init to === '1'; both write-backs now encode ON as '1' /
OFF as '' (matching i_linelabel), consistent with the unchanged
PARAM_DEFAULTS so bare links render both off and i_hc=1/i_advlabel=1
still enable them. Specs updated; 62/62 affected e2e green.
---
 .../app/cypress/e2e/gradient-labels.cy.ts     | 10 ++++---
 packages/app/cypress/e2e/url-params.cy.ts     | 26 +++++++++++++++----
 .../components/inference/InferenceContext.tsx | 14 +++++-----
 3 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts
index a0753e90..9c3d3274 100644
--- a/packages/app/cypress/e2e/gradient-labels.cy.ts
+++ b/packages/app/cypress/e2e/gradient-labels.cy.ts
@@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => {
     cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels');
   });
 
-  it('Parallelism Labels toggle is on by default', () => {
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
+  it('Parallelism Labels toggle is off by default', () => {
+    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked');
   });
 
   it('per-point labels are visible by default (gradient labels off)', () => {
@@ -60,7 +60,7 @@ describe('Gradient Labels Toggle', () => {
   });
 
   it('both toggles can be enabled simultaneously', () => {
-    // Parallelism Labels is on by default; ensure it's on, then turn on Gradient.
+    // Parallelism Labels is off by default; turn it on, then turn on Gradient.
     cy.get('#scatter-parallelism-labels').then(($el) => {
       if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
     });
@@ -71,8 +71,10 @@ describe('Gradient Labels Toggle', () => {
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
     cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
 
-    // Reset gradient for next tests (parallelism stays at its default-on).
+    // Reset both for next tests (each subsequent test does a fresh cy.visit,
+    // but keep state tidy here too).
     cy.get('#scatter-gradient-labels').click();
+    cy.get('#scatter-parallelism-labels').click();
   });
 
   it('URL param i_gradlabel=1 enables gradient labels on load', () => {
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 927aee5f..6c827218 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -236,10 +236,10 @@ describe('URL Parameter Persistence', () => {
   });
 
   describe('High contrast mode', () => {
-    it('inference loads with high contrast on by default', () => {
+    it('inference loads with high contrast off by default', () => {
       visitWithDismissedModal('/inference');
       cy.get('[data-testid="scatter-graph"]').should('exist');
-      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked');
+      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
     });
 
     it('i_hc=0 disables high contrast on load', () => {
@@ -273,12 +273,12 @@ describe('URL Parameter Persistence', () => {
       cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
-    it('historical trends tab shares the inference high-contrast default (on)', () => {
+    it('historical trends tab shares the inference high-contrast default (off)', () => {
       // Historical reads highContrast from the same InferenceContext as the
-      // scatter chart, so it inherits the default-on behavior.
+      // scatter chart, so it inherits the default-off behavior.
       visitWithDismissedModal('/historical');
       cy.get('[data-testid="historical-trends-display"]').should('exist');
-      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked');
+      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
     });
 
     it('i_hc=1 enables historical trends high contrast', () => {
@@ -287,4 +287,20 @@ describe('URL Parameter Persistence', () => {
       cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
   });
+
+  describe('Default toggle states (share-link correctness)', () => {
+    it('a bare /inference link with neither param renders high contrast AND parallelism labels off', () => {
+      visitWithDismissedModal('/inference');
+      cy.get('[data-testid="scatter-graph"]').should('exist');
+      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
+      cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked');
+    });
+
+    it('i_hc=1&i_advlabel=1 enables both high contrast and parallelism labels on load', () => {
+      visitWithDismissedModal('/inference?i_hc=1&i_advlabel=1');
+      cy.get('[data-testid="scatter-graph"]').should('exist');
+      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked');
+      cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
+    });
+  });
 });
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index b9cbc7ce..6d6ad19d 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -241,8 +241,6 @@ export function InferenceProvider({
   const dataQuickFilters = activeTab === 'historical' ? EMPTY_QUICK_FILTERS : quickFilters;
   const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({
     urlPrefix: 'i_',
-    // Inference chart defaults to high contrast (?i_hc=0 overrides off).
-    defaultHighContrast: true,
   });
 
   const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0');
@@ -252,14 +250,14 @@ export function InferenceProvider({
     if (getUrlParam('i_nolabel') === '1') return false;
     if (getUrlParam('i_label') === '0') return false;
     if (getUrlParam('i_label') === '1') return true;
-    // Default on: parallelism labels (also default on) are point labels and
-    // are pointless without them shown.
+    // Default on: point labels (TP + concurrency, or the fuller parallelism
+    // breakdown when Parallelism Labels is toggled on) are useful either way.
     return true;
   });
   const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1');
-  // Parallelism labels default on (?i_advlabel=0 overrides off).
+  // Parallelism labels default off (?i_advlabel=1 overrides on).
   const [useAdvancedLabels, setUseAdvancedLabels] = useState(
-    () => getUrlParam('i_advlabel') !== '0',
+    () => getUrlParam('i_advlabel') === '1',
   );
   const [showGradientLabels, setShowGradientLabels] = useState(
     () => getUrlParam('i_gradlabel') === '1',
@@ -1042,14 +1040,14 @@ export function InferenceProvider({
       i_dend: selectedDateRange.endDate,
       i_optimal: hideNonOptimal ? '' : '0',
       i_label: showPointLabels ? '' : '0',
-      i_hc: highContrast ? '' : '0',
+      i_hc: highContrast ? '1' : '',
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
       i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
-      i_advlabel: useAdvancedLabels ? '' : '0',
+      i_advlabel: useAdvancedLabels ? '1' : '',
       i_gradlabel: showGradientLabels ? '1' : '',
       i_linelabel: showLineLabels ? '1' : '',
       i_speed: showSpeedOverlay ? '1' : '',