diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
new file mode 100644
index 00000000..10e37d6c
--- /dev/null
+++ b/.claude/agents/ingest.md
@@ -0,0 +1,196 @@
+---
+name: ingest
+description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL.
+tools: Bash, Read, Edit, Write
+---
+
+You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`.
+
+## Environment
+
+- **Repo root**: `/Users/quilicic/InferenceX-app`
+- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements:
+ - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname.
+ - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`.
+- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000)
+- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app`
+- **INVALIDATE_SECRET** lives in repo root `.env` under that key.
+- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var.
+
+## Standard ingest
+
+```bash
+cd /Users/quilicic/InferenceX-app/packages/db
+DATABASE_WRITE_URL='<write-url-from-invoker>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts --download SemiAnalysisAI/InferenceX
+```
+
+Then refresh the materialized view (the script's auto-refresh sometimes races):
+`REFRESH MATERIALIZED VIEW latest_benchmarks;`
+
+## Cache purge (always do after any DB mutation)
+
+```bash
+# Read INVALIDATE_SECRET from the repo-root .env. `-f2-` keeps everything after the
+# first "=" so a secret that itself contains "=" (e.g. base64 padding) is not truncated.
+SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2- | tr -d '"')
+# Localhost (port 3002, NOT 3000)
+curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate
+# Preview
+mkdir -p /tmp/vp && cd /tmp/vp \
+ && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \
+ && vercel curl /api/v1/invalidate \
+ --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \
+ --yes -- -sS -X POST -H "Authorization: Bearer $SECRET"
+rm -rf /tmp/vp
+```
+
+## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision))
+
+```sql
+BEGIN;
+DELETE FROM benchmark_results br USING configs c
+WHERE c.id = br.config_id
+ AND c.model = '<model>' AND c.hardware = '<hardware>' AND c.framework = '<framework>'
+ AND c.precision = '<precision>' AND br.benchmark_type = '<benchmark_type>';
+DELETE FROM availability
+WHERE model = '<model>' AND hardware = '<hardware>' AND framework = '<framework>'
+ AND precision = '<precision>' AND benchmark_type = '<benchmark_type>';
+COMMIT;
+```
+
+If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. Don't do this unless asked.
+
+## AIPerf tagging — DO NOT use by default
+
+AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision).
+
+Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`.
+
+
+Explicit-request-only: how to tag a run as `spec_decoding='aiperf'`
+
+```bash
+RID=<run-id>
+TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX)
+cd $TMPDIR
+
+# 1. Logical-name dedup + download
+gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \
+ --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \
+ | python3 -c "
+import sys, re, collections
+seen = collections.OrderedDict()
+for line in sys.stdin:
+ name, url, created = line.rstrip('\n').split('\t')
+ key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name)
+ if key not in seen or seen[key][2] < created:
+ seen[key] = (name, url, created)
+for _, (name, url, _) in seen.items():
+ print(f'{name}\t{url}')
+" > artifacts.tsv
+while IFS=$'\t' read -r name url; do
+ mkdir -p "$name"
+ gh api "$url" > "$name/a.zip" 2>/dev/null
+ unzip -oq "$name/a.zip" -d "$name" 2>/dev/null
+ rm "$name/a.zip"
+done < artifacts.tsv
+
+# 2. Patch every benchmark JSON to set spec_decoding=aiperf
+find $TMPDIR -name "*.json" | python3 -c "
+import sys, json
+for fn in (l.strip() for l in sys.stdin):
+ try:
+ with open(fn) as f: d = json.load(f)
+ except Exception: continue
+ rows = d if isinstance(d, list) else [d]
+ if not rows or not isinstance(rows[0], dict): continue
+ changed = False
+ for row in rows:
+ if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row):
+ row['spec_decoding'] = 'aiperf'
+ changed = True
+ if changed:
+ with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f)
+"
+
+# 3. Ingest in CI mode (reads INGEST_* env vars)
+cd /Users/quilicic/InferenceX-app/packages/db
+INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \
+DATABASE_WRITE_URL='<write-url-from-invoker>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts
+rm -rf $TMPDIR
+```
+
+The `spec_method` column has a lowercase check constraint — always lowercase.
+
+
+
+## Don't auto-mention "AIPerf" in changelog entries
+
+Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`.
+
+## Adding a perf changelog entry — MANDATORY for every ingest
+
+**You ALWAYS MUST add a changelog entry for every run you ingest. This is not optional.** Every standard ingest, delete+reingest, and partial ingest gets exactly one changelog entry. Never finish an ingest without one.
+
+- If the user gave changelog text, use it verbatim (substitute a hardware placeholder such as `<HW>` with the run's hardware SKU when the text contains one).
+- If the user did NOT specify text, DO NOT skip the changelog — derive a sensible description from the run name (see convention below) and add it anyway, then tell the user what you used so they can adjust.
+
+Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key whose second `-`-separated segment is the precision, e.g. `<model>-<precision>-<hw>-<framework>` (matches what the user actually sees in the filter chain).
+
+```sql
+INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link)
+SELECT id, date, '', '', ARRAY['<model>-<precision>-<hw>-<framework>'], '<description>', NULL
+FROM latest_workflow_runs WHERE github_run_id = <run-id>
+RETURNING id, workflow_run_id, date::text, description;
+```
+
+Description convention from prior entries: `<HW> <Model> Ingest #<N> (<optional note>)` — e.g.
+
+- `B200 Kimi Ingest #1`
+- `MI355X Kimi Ingest #2`
+- `H200 Kimi Ingest #1 (mmap cache)`
+
+If the user doesn't specify a description, DO NOT skip the entry and DO NOT block on asking — derive a description from the run name, add the entry, and report what you used so the user can adjust.
+
+## Common gotchas
+
+- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <workflow_run_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`.
+- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT.
+- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites.
+- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the trailing `_<runner>_<n>` suffix.
+- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection.
+- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with prior full sweep via `UPDATE benchmark_results SET date = '<prior-sweep-date>' WHERE …` so the frontend's max-date-per-group dedup doesn't drop the older sweep.
+- **Agentic interactivity normalization (`*_intvty`)**: for `agentic_traces` runs, interactivity MUST be the slow-tail reciprocal of the ITL percentile — `*_intvty = 1/*_itl` (so `p90_intvty = 1/p90_itl`). Some harness versions emit `*_intvty` as `p(1/ITL)` instead (fast-tail — inverts percentile order, e.g. p90 shows ~`1/p10(ITL)`), which silently contaminates cross-run Pareto comparisons. The ingest mapper (`benchmark-mapper.ts`) now **derives `*_intvty` from `*_itl` and discards the artifact's value** for agentic rows, so a normal ingest is self-correcting — no manual step needed. The frontend `agenticAliases` does the same for overlay / `?unofficialrun=` rows. If you ever load agentic data through a path that bypasses the mapper, run `pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes` (idempotent; rewrites `mean/p75/p90/p95 _intvty = 1/_itl`) then refresh the MV + purge cache. `std_intvty` is intentionally left alone (the reciprocal of a std is meaningless; the API strips it anyway).
+
+## Process
+
+1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<run-id> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips).
+2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding.
+3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line.
+4. **Refresh materialized view**.
+5. **Add changelog entry — ALWAYS, MANDATORY.** Every ingest gets exactly one changelog entry (see "Adding a perf changelog entry — MANDATORY"). Use the user's text if given (substituting any hardware placeholder such as `<HW>` with the run's hardware SKU); otherwise derive one from the run name and add it anyway. Never skip this step.
+6. **Purge both caches** (localhost 3002 + preview — never port 3000).
+7. **Report** the row count, date, hardware, run id, and the changelog id (always present).
+
+## Related: ingesting agentic _datasets_ (not benchmark runs)
+
+This agent ingests **benchmark runs**. The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow:
+
+```bash
+cd packages/db && DATABASE_WRITE_URL='<write-url-from-invoker>' \
+ pnpm exec tsx src/ingest-weka-dataset.ts \
+ [--label "…"] [--variant full|256k] [--description "…"] [--limit N]
+```
+
+It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
+
+New agentic benchmark artifacts preserve AIPerf's `metadata.dataset` provenance as a top-level `dataset` object. Standard benchmark ingest automatically derives the dataset slug from `dataset.hf_dataset_name` and upserts `run_datasets`; do not manually backfill that mapping for new-format runs. Manual mapping is only needed for legacy artifacts that do not contain dataset provenance.
+
+## Don't
+
+- Don't push to git unless the user asked.
+- Don't ingest without permission if it's a delete+reingest of existing data.
+- Don't hit port 3000 for cache purge — it's a different project.
+- Don't capitalize `spec_method` values (DB has a lowercase check constraint).
diff --git a/.eslintignore b/.eslintignore
new file mode 100644
index 00000000..513a873e
--- /dev/null
+++ b/.eslintignore
@@ -0,0 +1,3 @@
+# Stale agent worktrees produced by parallel Claude Code sessions — they
+# hold their own branches and are linted as part of their own runs.
+.claude/worktrees/
diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml
new file mode 100644
index 00000000..fab99f5d
--- /dev/null
+++ b/.github/workflows/ingest-agentic-results.yml
@@ -0,0 +1,233 @@
+name: Ingest Agentic Benchmark Results
+
+# Dispatched from the main InferenceX repo at the end of an agentic (AgentX
+# trace-replay) sweep, mirroring the fixed-seq-len `ingest-results` dispatch:
+#
+# curl -sSf -X POST \
+# -H "Authorization: Bearer $INFX_FRONTEND_PAT" \
+# -H "Accept: application/vnd.github+v3+json" \
+# https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
+# -d '{"event_type": "ingest-agentic-results",
+# "client_payload": {"run-id": "<run-id>", "run-attempt": "<run-attempt>",
+# "database-target": "production"}}'
+#
+# The ingest script (packages/db/src/ingest-ci-run.ts) auto-detects agentic
+# artifacts: benchmark rows land in benchmark_results (benchmark_type=
+# 'agentic_traces'), raw profile exports + server metrics land in the
+# agentic_trace_replay sidecar with precomputed chart/timeline JSONBs, the
+# run is linked to its dataset in run_datasets, and changelog-metadata is
+# ingested when present. This is a separate workflow from ingest-results.yml
+# because agentic ingests are blob-heavy (100MB+ gzipped profile exports per
+# high-concurrency point) and need a much longer timeout, plus
+# agentic-specific alerting (missing dataset slug).
+
+on:
+ repository_dispatch:
+ types: [ingest-agentic-results]
+ workflow_dispatch:
+ inputs:
+ run-id:
+ description: InferenceX Actions run ID to ingest
+ required: true
+ type: string
+ run-attempt:
+ description: InferenceX Actions run attempt to ingest
+ required: false
+ default: '1'
+ type: string
+ database-target:
+ description: Database/cache target for the ingest
+ required: false
+ default: production
+ type: choice
+ options:
+ - production
+ - dev
+ - agentx-v1
+
+jobs:
+ ingest:
+ # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes
+ # far longer than a fixed-seq-len ingest.
+ timeout-minutes: 60
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ steps:
+ - name: Wait for source run to finish
+ if: github.event_name != 'workflow_dispatch'
+ run: sleep 300
+
+ - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+ - uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8
+ - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+ with:
+ node-version: '24'
+ cache: pnpm
+ - name: Install dependencies
+ run: pnpm install --filter @semianalysisai/inferencex-db...
+ env:
+ CYPRESS_INSTALL_BINARY: '0'
+
+ - name: Select ingest target
+ env:
+ REQUESTED_DATABASE_TARGET: ${{ github.event.client_payload.database-target || inputs.database-target || 'production' }}
+ DATABASE_WRITE_URL_PRODUCTION: ${{ secrets.DATABASE_WRITE_URL }}
+ DATABASE_WRITE_URL_DEV: ${{ secrets.DATABASE_DEV_WRITE_URL }}
+ DATABASE_WRITE_URL_AGENTX_V1: ${{ secrets.DATABASE_AGENTX_V1_WRITE_URL }}
+ run: |
+ case "$REQUESTED_DATABASE_TARGET" in
+ production)
+ database_write_url="$DATABASE_WRITE_URL_PRODUCTION"
+ cache_invalidate_url="https://inferencex.semianalysis.com/api/v1/invalidate"
+ ;;
+ dev)
+ database_write_url="$DATABASE_WRITE_URL_DEV"
+ cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate"
+ ;;
+ agentx-v1)
+ database_write_url="$DATABASE_WRITE_URL_AGENTX_V1"
+ cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate"
+ ;;
+ *)
+ echo "::error::Unsupported database-target: $REQUESTED_DATABASE_TARGET"
+ exit 1
+ ;;
+ esac
+
+ if [ -z "$database_write_url" ]; then
+ echo "::error::Database secret is empty for target: $REQUESTED_DATABASE_TARGET"
+ exit 1
+ fi
+
+ echo "::add-mask::$database_write_url"
+ echo "DATABASE_WRITE_URL=$database_write_url" >> "$GITHUB_ENV"
+ echo "INGEST_DATABASE_TARGET=$REQUESTED_DATABASE_TARGET" >> "$GITHUB_ENV"
+ echo "CACHE_INVALIDATE_URL=$cache_invalidate_url" >> "$GITHUB_ENV"
+ echo "Selected ingest target: $REQUESTED_DATABASE_TARGET"
+ echo "Cache invalidate URL: $cache_invalidate_url"
+
+ - name: Run migrations
+ run: pnpm admin:db:migrate --yes
+
+ - name: Download artifacts from InferenceX run
+ env:
+ GH_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
+ RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
+ ARTIFACTS_PATH: ${{ github.workspace }}/artifacts
+ run: |
+ mkdir -p "$ARTIFACTS_PATH"
+
+ # Download all artifacts for the run, deduplicated by name (keep latest); --paginate emits one JSON doc per page, so slurp (-s) them to dedup across pages.
+ gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/${RUN_ID}/artifacts" --paginate \
+ | jq -rs '
+ [.[].artifacts[]]
+ | group_by(.name) | map(sort_by(.created_at) | last)[]
+ | "\(.name)\t\(.archive_download_url)"' \
+ | while IFS=$'\t' read -r name url; do
+ echo "Downloading artifact: ${name}"
+ ok=false
+ for attempt in 1 2 3; do
+ if gh api "${url}" > artifact.zip; then
+ ok=true
+ break
+ fi
+ echo " Attempt ${attempt}/3 failed, retrying in ${attempt}s..."
+ sleep "$attempt"
+ done
+ if [ "$ok" = false ]; then
+ echo "::warning::Failed to download artifact after 3 attempts: ${name} — skipping"
+ rm -f artifact.zip
+ echo "$name" >> "$ARTIFACTS_PATH/.failures"
+ continue
+ fi
+ mkdir -p "${ARTIFACTS_PATH}/${name}"
+ if ! unzip -o artifact.zip -d "${ARTIFACTS_PATH}/${name}"; then
+ echo "::warning::Failed to extract artifact: ${name} — skipping"
+ rm -rf "${ARTIFACTS_PATH:?}/${name}"
+ echo "$name" >> "$ARTIFACTS_PATH/.failures"
+ fi
+ rm -f artifact.zip
+ done
+
+ if [ -f "$ARTIFACTS_PATH/.failures" ]; then
+ count=$(wc -l < "$ARTIFACTS_PATH/.failures")
+ rm "$ARTIFACTS_PATH/.failures"
+ echo "::warning::${count} artifact(s) failed to download; ingesting what's available"
+ fi
+
+ echo "Downloaded artifacts:"
+ ls "$ARTIFACTS_PATH/"
+
+ if [ -z "$(ls -A "$ARTIFACTS_PATH")" ]; then
+ echo "::error::No artifacts could be downloaded from run ${RUN_ID}"
+ exit 1
+ fi
+
+ - name: Ingest results to DB
+ env:
+ GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }}
+ INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }}
+ INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }}
+ INGEST_ARTIFACTS_PATH: ${{ github.workspace }}/artifacts
+ INGEST_REPO: SemiAnalysisAI/InferenceX
+ UNMAPPED_ENTITIES_OUTPUT: ${{ github.workspace }}/unmapped-entities.json
+ run: pnpm admin:db:ingest:ci
+
+ - name: Apply run overrides
+ run: pnpm admin:db:apply-overrides --yes
+
+ - name: Verify database
+ run: pnpm admin:db:verify
+
+ - name: Invalidate Vercel cache
+ env:
+ VERCEL_INVALIDATE_SECRET: ${{ secrets.VERCEL_INVALIDATE_SECRET }}
+ run: |
+ curl -sSf -X POST "$CACHE_INVALIDATE_URL" \
+ -H "Authorization: Bearer $VERCEL_INVALIDATE_SECRET" || true
+
+ - name: Check for unmapped entities
+ if: always()
+ id: unmapped
+ run: |
+ f="${{ github.workspace }}/unmapped-entities.json"
+ if [ -f "$f" ]; then
+ echo "found=true" >> "$GITHUB_OUTPUT"
+ models=$(jq -r '.models // [] | join(", ")' "$f")
+ hardware=$(jq -r '.hardware // [] | join(", ")' "$f")
+ precisions=$(jq -r '.precisions // [] | join(", ")' "$f")
+ datasets=$(jq -r '.datasets // [] | join(", ")' "$f")
+ msg=""
+ [ -n "$models" ] && msg="${msg}Models: ${models}\n"
+ [ -n "$hardware" ] && msg="${msg}Hardware: ${hardware}\n"
+ [ -n "$precisions" ] && msg="${msg}Precisions: ${precisions}\n"
+ [ -n "$datasets" ] && msg="${msg}Datasets missing from datasets table (run ingest-weka-dataset): ${datasets}\n"
+ {
+ echo 'summary<<EOF'
+ echo -e "$msg"
+ echo 'EOF'
+ } >> "$GITHUB_OUTPUT"
+ fi
+
+ - name: Notify Slack on unmapped entities
+ if: steps.unmapped.outputs.found == 'true'
+ uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3
+ with:
+ webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+ webhook-type: incoming-webhook
+ payload: |
+ {
+ "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
+ }
+
+ - name: Notify Slack on failure
+ if: failure()
+ uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3
+ with:
+ webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+ webhook-type: incoming-webhook
+ payload: |
+ {
+ "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
+ }
diff --git a/.gitignore b/.gitignore
index a86f6e23..c52b0482 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
# next.js
**/.next
+**/.next-*
**/out
# production
@@ -71,3 +72,4 @@ C:*
# python bytecode (e.g. .claude/skills/*/iso-interactivity.py imports)
**/__pycache__/
**/*.pyc
+.playwright-mcp/
diff --git a/.oxlintrc.json b/.oxlintrc.json
index ff610e51..5a03a5a0 100644
--- a/.oxlintrc.json
+++ b/.oxlintrc.json
@@ -28,6 +28,7 @@
"no-undef": "off",
"no-underscore-dangle": "off",
"no-useless-undefined": "off",
+ "require-unicode-regexp": "off",
"no-warning-comments": "off",
"prefer-destructuring": "off",
"sort-imports": "off",
diff --git a/docs/data-pipeline.md b/docs/data-pipeline.md
index 38e7d471..bc439e8a 100644
--- a/docs/data-pipeline.md
+++ b/docs/data-pipeline.md
@@ -62,6 +62,18 @@ Configs are preloaded into an in-memory Map at ingest start. `getOrCreateConfig(
Unmapped models/hardware are tracked (not silently dropped) so operators can see what new GPU or model names appeared in CI artifacts. This is how new GPUs get added to the system — the skip tracker acts as a change detection mechanism.
+### Server-Metric Orchestrator Adapters
+
+AIPerf defines the `server_metrics_export.json` envelope, but labels such as worker role and rank belong to the serving orchestrator. The chart-series ETL therefore normalizes raw series through an orchestrator-specific adapter before exposing per-worker metrics. For example, the Dynamo adapter maps `dynamo_component=prefill|backend` to canonical `prefill|decode` roles and uses the endpoint, worker ID, DP rank, and engine together as the source identity.
+
+Adapters are selected from the benchmark's canonical framework, and per-worker series are only emitted for disaggregated configs with a recognized adapter. Unknown orchestrators and non-disaggregated configs retain their aggregate-only series; roles are never guessed from ports or metric names. The frontend only consumes the canonical source identity and never interprets orchestrator-native labels.
+
+### Agentic Dataset Provenance
+
+AIPerf exports public-dataset provenance in `metadata.dataset`, including the Hugging Face dataset ID. InferenceX preserves that object as `dataset` on each agentic aggregate benchmark row. During benchmark ingest, `ingest-ci-run.ts` derives the dashboard slug from `hf_dataset_name` (for example, `semianalysisai/cc-traces-weka-062126` becomes `cc-traces-weka-062126`) and upserts `run_datasets` for the workflow run.
+
+Legacy artifacts without provenance leave any existing mapping untouched. A workflow run can map to only one dataset; conflicting dataset IDs fail ingest rather than silently linking the run to an arbitrary dataset.
+
## Frontend Transform Pipeline
### Why transformBenchmarkRows Exists
diff --git a/packages/app/cypress/component/chart-legend.cy.tsx b/packages/app/cypress/component/chart-legend.cy.tsx
index 4a362c2b..535a0053 100644
--- a/packages/app/cypress/component/chart-legend.cy.tsx
+++ b/packages/app/cypress/component/chart-legend.cy.tsx
@@ -1,5 +1,8 @@
import { useState } from 'react';
+import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog';
+import type { InferenceData } from '@/components/inference/types';
+import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table';
import ChartLegend, { type CommonLegendItemProps } from '@/components/ui/chart-legend';
const MOCK_ITEMS: CommonLegendItemProps[] = [
@@ -119,4 +122,146 @@ describe('ChartLegend (sidebar variant)', () => {
.click();
cy.get('.sidebar-legend').should('not.have.class', 'bg-accent');
});
+
+ it('renders no points-table icon when items have no onShowPoints handler', () => {
+ cy.get('[data-testid^="legend-points-"]').should('not.exist');
+ });
+});
+
+// ---------------------------------------------------------------------------
+// Per-series points table (inference legend drill-down)
+// ---------------------------------------------------------------------------
+
+/**
+ * Builds an `InferenceData` fixture for the legend points-table tests.
+ *
+ * @param overrides - fields that replace the defaults below (spread last, so they win).
+ * @returns the default point merged with `overrides`; asserted to `InferenceData`
+ * because the fixture only fills the fields listed here.
+ */
+function mockPoint(overrides: Partial<InferenceData> = {}): InferenceData {
+ return {
+ date: '2025-06-15',
+ x: 100,
+ y: 500,
+ tp: 8,
+ conc: 16,
+ hwKey: 'b300-sxm',
+ precision: 'fp4',
+ tput_per_gpu: 1500.5,
+ median_intvty: 45.2,
+ p90_intvty: 38.1,
+ median_ttft: 0.42,
+ p90_ttft: 0.87,
+ tpPerGpu: { y: 1500.5, roof: false },
+ tpPerMw: { y: 50, roof: false },
+ costh: { y: 1, roof: false },
+ costn: { y: 1, roof: false },
+ costr: { y: 1, roof: false },
+ costhi: { y: 1, roof: false },
+ costni: { y: 1, roof: false },
+ costri: { y: 1, roof: false },
+ ...overrides,
+ } as InferenceData;
+}
+
+const OFFICIAL_POINTS: InferenceData[] = [
+ mockPoint({ conc: 32, benchmark_type: 'agentic_traces', id: 206863, offload_mode: 'on' }),
+ mockPoint({ conc: 4, benchmark_type: 'agentic_traces', id: 206860, offload_mode: 'off' }),
+];
+
+const OVERLAY_POINTS: InferenceData[] = [
+ mockPoint({ conc: 8, run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1' }),
+];
+
+/** Mirrors ScatterGraph's wiring: legend rows with onShowPoints → dialog. */
+function LegendWithPointsTable() {
+ const [openSeries, setOpenSeries] = useState<'official' | 'overlay' | null>(null);
+
+ const items: CommonLegendItemProps[] = [
+ {
+ name: 'b300-sxm',
+ hw: 'b300-sxm',
+ label: 'B300 (vLLM)',
+ color: '#2b83ba',
+ isActive: true,
+ onClick: () => {},
+ onShowPoints: () => setOpenSeries('official'),
+ },
+ {
+ name: '✕ unofficial-run-99',
+ hw: 'overlay-run-99',
+ label: '✕ my-branch',
+ color: '#dc2626',
+ isActive: true,
+ onClick: () => {},
+ onShowPoints: () => setOpenSeries('overlay'),
+ },
+ ];
+
+ const isOverlay = openSeries === 'overlay';
+ return (
+ <>
+ {}}
+ variant="sidebar"
+ />
+ {openSeries && (
+ {
+ if (!open) setOpenSeries(null);
+ }}
+ title={isOverlay ? '✕ my-branch' : 'B300 (vLLM)'}
+ subtitle="DeepSeek V4 Pro · Agentic Traces"
+ accentColor={isOverlay ? '#dc2626' : '#2b83ba'}
+ rows={buildLegendPointsRows(isOverlay ? OVERLAY_POINTS : OFFICIAL_POINTS, isOverlay)}
+ isOverlay={isOverlay}
+ />
+ )}
+ >
+ );
+}
+
+describe('ChartLegend points-table icon + dialog', () => {
+ beforeEach(() => {
+ cy.mount( );
+ });
+
+ it('renders the icon only for rows with an onShowPoints handler', () => {
+ cy.get('[data-testid="legend-points-b300-sxm"]').should('exist');
+ cy.get('[data-testid="legend-points-overlay-run-99"]').should('exist');
+ });
+
+ it('opens the dialog with the series points sorted by concurrency, with row links', () => {
+ cy.get('[data-testid="legend-points-b300-sxm"]').click();
+ cy.get('[data-testid="legend-points-dialog"]').should('be.visible');
+ cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'B300 (vLLM)');
+ cy.get('[data-testid="legend-points-dialog"]').should(
+ 'contain.text',
+ 'DeepSeek V4 Pro · Agentic Traces',
+ );
+ // Two rows, conc ascending, linked to the agentic detail pages
+ cy.get('[data-testid="legend-points-row"]').should('have.length', 2);
+ cy.get('a[data-testid="legend-points-row"]')
+ .first()
+ .should('have.attr', 'href', '/inference/agentic/206860');
+ cy.get('a[data-testid="legend-points-row"]').first().should('contain.text', '4');
+ // Offload column present for agentic rows
+ cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'Offload');
+ });
+
+ it('overlay series opens a link-free table with the metrics-only caption', () => {
+ cy.get('[data-testid="legend-points-overlay-run-99"]').click();
+ cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '✕ my-branch');
+ cy.get('a[data-testid="legend-points-row"]').should('not.exist');
+ cy.get('div[data-testid="legend-points-row"]').should('have.length', 1);
+ cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'metrics only');
+ // Metrics still render
+ cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '1500.5');
+ });
+
+ it('dialog closes and can be reopened', () => {
+ cy.get('[data-testid="legend-points-b300-sxm"]').click();
+ cy.get('[data-testid="legend-points-dialog"]').should('be.visible');
+ cy.get('body').type('{esc}');
+ cy.get('[data-testid="legend-points-dialog"]').should('not.exist');
+ cy.get('[data-testid="legend-points-overlay-run-99"]').click();
+ cy.get('[data-testid="legend-points-dialog"]').should('be.visible');
+ });
});
diff --git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx
new file mode 100644
index 00000000..f7cfcb9a
--- /dev/null
+++ b/packages/app/cypress/component/dataset-list.cy.tsx
@@ -0,0 +1,93 @@
+import { QueryClient, QueryClientProvider } from '@tanstack/react-query';
+import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime';
+
+import { DatasetList } from '@/components/datasets/dataset-list';
+import type { DatasetRecord } from '@/hooks/api/use-datasets';
+
+const datasets: DatasetRecord[] = [
+ {
+ id: 'ds-1',
+ slug: 'cc-traces-weka-full',
+ label: 'cc-traces-weka (full)',
+ variant: 'full',
+ description: 'Every captured request, unmodified.',
+ hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full',
+ license: 'apache-2.0',
+ conversation_count: 1234,
+ summary: {
+ totalIn: 5_000_000,
+ totalOut: 250_000,
+ cachedPct: 0.82,
+ mainTurns: 9800,
+ subagentGroups: 540,
+ },
+ ingested_at: '2026-06-20T00:00:00Z',
+ },
+ {
+ id: 'ds-2',
+ slug: 'cc-traces-weka-256k',
+ label: 'cc-traces-weka (256k)',
+ variant: '256k',
+ description: 'Turns trimmed to a 256k context window.',
+ hf_url: null,
+ license: 'apache-2.0',
+ conversation_count: 980,
+ summary: {
+ totalIn: 3_200_000,
+ totalOut: 180_000,
+ cachedPct: 0.79,
+ mainTurns: 7600,
+ subagentGroups: 410,
+ },
+ ingested_at: '2026-06-19T00:00:00Z',
+ },
+];
+
+/**
+ * Returns an object of fresh Cypress stubs for the router methods
+ * push, replace, refresh, back, forward and prefetch (prefetch resolves).
+ */
+function createMockRouter() {
+ const stub = () => cy.stub();
+ const router = {
+ push: stub(),
+ replace: stub(),
+ refresh: stub(),
+ back: stub(),
+ forward: stub(),
+ prefetch: stub().resolves(),
+ };
+ return router;
+}
+
+function mountList() {
+ const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } });
+ cy.mount(
+
+
+
+
+ ,
+ );
+}
+
+describe('DatasetList', () => {
+ it('renders a card per dataset with its summary stats', () => {
+ cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list');
+ mountList();
+ cy.wait('@list');
+ cy.contains('cc-traces-weka (full)').should('be.visible');
+ cy.contains('cc-traces-weka (256k)').should('be.visible');
+ cy.contains('1,234').should('be.visible'); // conversation_count, localized
+ cy.contains('82%').should('be.visible'); // cachedPct
+ cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist');
+ });
+
+ it('shows the empty state when no datasets are ingested', () => {
+ cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty');
+ mountList();
+ cy.wait('@empty');
+ cy.contains('No datasets ingested yet.').should('be.visible');
+ });
+
+ it('shows the error state when the request fails', () => {
+ cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err');
+ mountList();
+ cy.wait('@err');
+ cy.contains('Failed to load datasets.').should('be.visible');
+ });
+});
diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx
new file mode 100644
index 00000000..511505b9
--- /dev/null
+++ b/packages/app/cypress/component/distribution-card.cy.tsx
@@ -0,0 +1,82 @@
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import type { Distribution } from '@/hooks/api/use-datasets';
+
+const distribution: Distribution = {
+ bins: [
+ { x0: 0, x1: 100, count: 5 },
+ { x0: 100, x1: 200, count: 20 },
+ { x0: 200, x1: 300, count: 12 },
+ { x0: 300, x1: 400, count: 3 },
+ ],
+ stats: {
+ count: 40,
+ min: 10,
+ max: 390,
+ mean: 180,
+ median: 175,
+ p75: 250,
+ p90: 320,
+ p95: 360,
+ },
+};
+
+describe('DistributionCard', () => {
+ it('renders the title, summary stats, and one bar per bin', () => {
+ cy.mount(
+ ,
+ );
+ cy.contains('Input tokens per turn').should('be.visible');
+ cy.contains('n=40').should('be.visible');
+ cy.contains('p50 175').should('be.visible');
+ cy.contains('p75 250').should('be.visible');
+ cy.contains('p90 320').should('be.visible');
+ cy.contains('p95 360').should('be.visible');
+ cy.get(
+ 'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]',
+ ).should('have.length', 8);
+ // One filled bar rect per bin (ChartHover may add a transparent overlay rect).
+ cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length);
+ });
+
+ it('shows a "No data" placeholder when no distribution is provided', () => {
+ cy.mount( );
+ cy.contains('Empty metric').should('be.visible');
+ cy.contains('No data').should('be.visible');
+ cy.get('rect[class*="fill-primary"]').should('not.exist');
+ });
+
+ it('marks the chart as log scale when scale="log"', () => {
+ cy.mount(
+ ,
+ );
+ cy.contains('log scale').should('be.visible');
+ });
+
+ it('renders older v1 stats without unavailable percentile guides', () => {
+ cy.mount(
+ ,
+ );
+ cy.contains('p50 175').should('be.visible');
+ cy.contains('p90 320').should('be.visible');
+ cy.contains('NaN').should('not.exist');
+ });
+});
diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx
index 03e6a50c..5a6311f4 100644
--- a/packages/app/cypress/component/inference-chart-controls.cy.tsx
+++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx
@@ -14,8 +14,8 @@ describe('Inference ChartControls', () => {
it('renders the sequence selector with the current sequence', () => {
// Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K"
- cy.get('#sequence-select').should('be.visible');
- cy.get('#sequence-select').should('contain.text', '8K / 1K');
+ cy.get('#scenario-select').should('be.visible');
+ cy.get('#scenario-select').should('contain.text', '8K / 1K');
});
it('renders the precision multi-select with the current precision', () => {
diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx
new file mode 100644
index 00000000..1be90e0c
--- /dev/null
+++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx
@@ -0,0 +1,86 @@
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import type { ConversationStructure } from '@/hooks/api/use-datasets';
+
+// Two main turns followed by one subagent group with two child turns.
+// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`).
+const structure: ConversationStructure = {
+ blockSize: 64,
+ nodes: [
+ { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 },
+ {
+ kind: 'turn',
+ turnIndex: 1,
+ model: 'claude',
+ in: 2000,
+ out: 300,
+ cached: 1500,
+ uncached: 500,
+ },
+ {
+ kind: 'subagent',
+ label: 'Subagent: search',
+ agentId: 'agent-1',
+ durationMs: 12000,
+ in: 5000,
+ out: 800,
+ cached: 3000,
+ uncached: 2000,
+ children: [
+ {
+ kind: 'turn',
+ turnIndex: 0,
+ model: 'claude',
+ in: 2500,
+ out: 400,
+ cached: 1500,
+ uncached: 1000,
+ },
+ {
+ kind: 'turn',
+ turnIndex: 1,
+ model: 'claude',
+ in: 2500,
+ out: 400,
+ cached: 1500,
+ uncached: 1000,
+ },
+ ],
+ },
+ ],
+ totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 },
+};
+
+describe('TraceFlamegraph', () => {
+ it('renders the legend, main-turn rows, and the subagent group header', () => {
+ cy.mount( );
+ cy.contains('Cached prefix').should('be.visible');
+ cy.contains('Uncached input').should('be.visible');
+ cy.contains('Output').should('be.visible');
+ cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1');
+ cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2');
+ cy.contains('Subagent: search').should('be.visible');
+ });
+
+ it('keeps subagent children collapsed until the group is expanded', () => {
+ cy.mount( );
+ cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+ cy.contains('button', 'Subagent: search').click();
+ cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+ cy.get('[data-rowkey="g-2-c-1"]').should('be.visible');
+ });
+
+ it('expand all / collapse all toggles every subagent group', () => {
+ cy.mount( );
+ cy.contains('button', 'Expand all').click();
+ cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+ cy.contains('button', 'Collapse all').click();
+ cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+ });
+
+ it('auto-expands and highlights the target group child for a request-timeline deep link', () => {
+ cy.mount(
+ ,
+ );
+ cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary');
+ });
+});
diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
new file mode 100644
index 00000000..e8161066
--- /dev/null
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -0,0 +1,337 @@
+import { unlockAgenticGate } from '../support/e2e';
+
+/**
+ * Builds one request row for the mocked `/api/v1/request-timeline` payload.
+ *
+ * Timing is derived from `index`: the request is credited and started at
+ * `index * 1e9` and ends at `(index + 1) * 1e9`, i.e. one unit-second later on
+ * the same scale as the payload's `startNs` / `endNs` fields.
+ *
+ * @param index - Turn index (`ti`); also positions the row on the time axis.
+ * @param ttftMs - Value stored in the row's `ttftMs` field.
+ * @param tpotMs - Value stored in the row's `tpotMs` field.
+ * @param overrides - Fields merged over the defaults (e.g. `phase`, `cancelled`,
+ *   or `null` metrics).
+ * @returns A plain object: the defaults below with `overrides` applied.
+ */
+const timelineRequest = (
+  index: number,
+  ttftMs: number,
+  tpotMs: number,
+  // `Record` requires both type arguments; overrides mix strings, numbers,
+  // booleans and null, so the value type is `unknown`.
+  overrides: Record<string, unknown> = {},
+) => ({
+  cid: 'conversation-1',
+  ti: index,
+  wid: 'worker-1',
+  ad: 0,
+  phase: 'profiling',
+  credit: index * 1_000_000_000,
+  start: index * 1_000_000_000,
+  ack: null,
+  end: (index + 1) * 1_000_000_000,
+  ttftMs,
+  tpotMs,
+  isl: 1024,
+  osl: 128,
+  cancelled: false,
+  // Spread last so a caller can replace any default, including with null.
+  ...overrides,
+});
+
+describe('Agentic point request metric time series', () => {
+ before(() => {
+ cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} });
+ cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null });
+ cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+ cy.intercept('GET', '/api/v1/request-timeline*', {
+ body: {
+ version: 3,
+ startNs: 0,
+ endNs: 7_000_000_000,
+ durationS: 7,
+ requests: [
+ timelineRequest(0, 100, 10),
+ timelineRequest(1, 200, 20),
+ timelineRequest(2, 400, 25),
+ timelineRequest(3, 800, 40),
+ timelineRequest(4, 1600, 80),
+ timelineRequest(5, 3200, 160, { phase: 'warmup' }),
+ timelineRequest(6, 6400, 320, { cancelled: true }),
+ timelineRequest(7, 0, 0, {
+ cid: 'conversation-1::sa:subagent_001_abcd',
+ credit: 1_100_000_000,
+ start: 1_100_000_000,
+ end: 1_900_000_000,
+ ttftMs: null,
+ tpotMs: null,
+ isl: null,
+ osl: null,
+ }),
+ timelineRequest(8, 0, 0, {
+ cid: 'conversation-1::sa:subagent_001_abcd:aux:011',
+ credit: 1_200_000_000,
+ start: 1_200_000_000,
+ end: 1_800_000_000,
+ ttftMs: null,
+ tpotMs: null,
+ isl: null,
+ osl: null,
+ }),
+ ],
+ },
+ });
+ cy.visit('/inference/agentic/206885', { onBeforeLoad: unlockAgenticGate });
+ });
+
+ it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => {
+ cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+ cy.contains('h2', 'Interactivity over time').should('be.visible');
+ cy.get('[data-testid="interactivity-percentile-toggle"]')
+ .find('[role="tab"][aria-selected="true"]')
+ .should('have.text', 'P90');
+ // 6 points: profiling slice includes requests 0-4 (profiling) + request 5
+ // (phase='warmup' label but start=5s > profiling boundary=0s, so
+ // sliceTimelineByPhase keeps it); cancelled r6 and null-metric r7/r8 are dropped.
+ cy.get('[data-testid="interactivity-point-count"]').should('have.text', '6 points');
+ cy.get('svg circle').should('have.length', 6);
+ cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+ cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT');
+ cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+ });
+
+ cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+ cy.contains('h2', 'TTFT over time').should('be.visible');
+ // Same 6-point slice as interactivity (warmup r5 included by time-boundary).
+ cy.get('[data-testid="ttft-point-count"]').should('have.text', '6 points');
+ cy.get('svg circle').should('have.length', 6);
+ cy.get('svg').should('contain.text', 'TTFT (s)');
+ cy.get('svg').should('contain.text', 'Cumulative P90 TTFT');
+ cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+ });
+ });
+
+ it('switches ISL and OSL cards from distributions to in-flight averages', () => {
+ cy.get('[data-testid="isl-metric-chart"]').within(() => {
+ cy.get('[data-testid="isl-metric-inflight"]').click();
+ cy.contains('h2', 'Average ISL in flight').should('be.visible');
+ cy.get('svg').should('contain.text', 'Average ISL in flight (30s avg)');
+ });
+ cy.get('[data-testid="osl-metric-chart"]').within(() => {
+ cy.get('[data-testid="osl-metric-inflight"]').click();
+ cy.contains('h2', 'Average OSL in flight').should('be.visible');
+ cy.contains('Retrospective: final observed OSL').should('be.visible');
+ cy.get('svg').should('contain.text', 'Average OSL in flight (30s avg)');
+ });
+ });
+
+ it('switches the TTFT chart to E2E request latency over time', () => {
+ cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+ cy.get('[data-testid="latency-metric-e2e"]').click();
+ cy.contains('h2', 'E2E latency over time').should('be.visible');
+ // 8 points: e2e = (end−start)/1e6 > 0 for all non-cancelled requests —
+ // includes r0-r5 (profiling slice) + r7, r8 (subagent/aux with null ttft/tpot
+ // but valid start/end). Cancelled r6 is excluded.
+ cy.get('[data-testid="e2e-point-count"]').should('have.text', '8 points');
+ cy.get('svg circle').should('have.length', 8);
+ cy.get('svg').should('contain.text', 'E2E latency (s)');
+ cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency');
+
+ cy.get('[data-testid="latency-metric-ttft"]').click();
+ cy.contains('h2', 'TTFT over time').should('be.visible');
+ });
+ });
+
+ it('switches each chart independently from P90 to P75', () => {
+ cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+ cy.contains('svg', 'P90 (rolling 50 req)')
+ .find('path')
+ .first()
+ .invoke('attr', 'd')
+ .as('p90Path');
+ cy.contains('button', 'P75').click();
+ cy.get('[data-testid="interactivity-percentile-toggle"]')
+ .find('[role="tab"][aria-selected="true"]')
+ .should('have.text', 'P75');
+ cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT');
+ cy.contains('svg', 'P75 (rolling 50 req)')
+ .find('path')
+ .first()
+ .invoke('attr', 'd')
+ .then(function (p75Path) {
+ expect(p75Path).not.to.equal(this.p90Path);
+ });
+ });
+
+ cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+ cy.get('[data-testid="ttft-percentile-toggle"]')
+ .find('[role="tab"][aria-selected="true"]')
+ .should('have.text', 'P90');
+ cy.contains('button', 'P75').click();
+ cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
+ cy.get('svg').should('contain.text', 'Cumulative P75 TTFT');
+ });
+ });
+
+ it('switches the request activity card from queue depth to cumulative completions', () => {
+ cy.get('[data-testid="request-activity-chart"]').within(() => {
+ cy.contains('h2', 'Request queue depth').should('be.visible');
+ cy.get('[data-testid="request-activity-completed"]').click();
+ cy.contains('h2', 'Cumulative completed requests').should('be.visible');
+ cy.get('svg').should('contain.text', 'Completed requests');
+ cy.get('svg').should('contain.text', 'Requests');
+ cy.get('[data-testid="request-activity-queue"]').click();
+ cy.contains('h2', 'Request queue depth').should('be.visible');
+ });
+ });
+
+ it('shows total idle time on the request timeline (time-boundary phase slice, consistent with the charts)', () => {
+ cy.get('[data-testid="detail-view-timeline"]').click();
+ cy.location('search').should('contain', 'view=timeline');
+ // The Gantt now slices by TIME BOUNDARY (sliceTimelineByPhase), matching the
+ // per-point charts, instead of the per-request phase LABEL. The earliest
+ // profiling request starts at t=0, so the boundary is 0 and warmup-labelled
+ // r5 (start=5s) is counted as profiling here too — exactly as the interactivity
+ // /TTFT charts already count it (their 6-point slice includes r5). That fills
+ // the former 5–6s gap that label-based filtering left open, so in-flight
+ // coverage is now continuous across [0s, 7s]: idle 0ms (0.0%). A 1.00s value
+ // here would mean the Gantt had regressed to label-based filtering.
+ cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 0ms (0.0%)');
+ cy.get('[data-timeline-row-kind="aux"]')
+ .should('have.css', 'padding-left', '24px')
+ .and('contain.text', 'aux 011 · parallel');
+ });
+
+ it('restores the request timeline view after browser Back from a dataset route', () => {
+ cy.window().then((win) => {
+ win.history.pushState({}, '', '/datasets/test-dataset/conversations/conversation-1');
+ });
+ cy.go('back');
+ cy.location('pathname').should('eq', '/inference/agentic/206885');
+ cy.location('search').should('contain', 'view=timeline');
+ cy.get('[data-testid="detail-view-timeline"]').should('have.attr', 'aria-selected', 'true');
+ cy.get('[data-testid="timeline-total-idle-time"]').should('be.visible');
+ });
+
+ it('shows a cumulative average for unique input tokens in flight', () => {
+ cy.get('[data-testid="detail-view-point"]').click();
+ cy.get('[data-testid="unique-input-inflight-chart"]').within(() => {
+ cy.get('svg').should('contain.text', 'Cumulative average');
+ cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+ });
+ });
+});
+
+const pointMeta = {
+ id: 206885,
+ hardware: 'gb200',
+ framework: 'dynamo-vllm',
+ model: 'deepseek-r1-0528',
+ precision: 'fp8',
+ spec_method: 'none',
+ disagg: true,
+ conc: 128,
+ offload_mode: 'off',
+ isl: null,
+ osl: null,
+ benchmark_type: 'agentic_traces',
+ date: '2026-06-23',
+ run_url: null,
+ server_gpu_cache_hit_rate: 0.5,
+ server_cpu_cache_hit_rate: null,
+};
+
+/**
+ * Builds one `metricSources` entry for the mocked trace-server-metrics payload.
+ *
+ * Most series are fixed; only the throughput-related ones vary per source:
+ * `prompt` feeds `promptTokensBySource.miss` and `promptTps`, half of it feeds
+ * `prefixCacheHitsTps`, and `generation` feeds `generationTps`.
+ *
+ * @param source - Descriptor of the source (id, adapter, role, endpoint, …),
+ *   passed through untouched. `Record` requires both type arguments, so it is
+ *   typed as `Record<string, unknown>`.
+ * @param prompt - Prompt-side throughput value at t=0.
+ * @param generation - Generation-side throughput value at t=0.
+ * @returns The source descriptor together with its metric series.
+ */
+const sourceSeries = (source: Record<string, unknown>, prompt: number, generation: number) => ({
+  source,
+  kvCacheUsage: [
+    { t: 0, value: 0.25 },
+    { t: 1, value: 0.5 },
+  ],
+  prefixCacheHitRate: [{ t: 0, value: 0.5 }],
+  queueDepth: [{ t: 0, running: 2, waiting: 1, total: 3 }],
+  promptTokensBySource: { miss: [{ t: 0, value: prompt }] },
+  promptTps: [{ t: 0, value: prompt }],
+  generationTps: [{ t: 0, value: generation }],
+  prefixCacheHitsTps: [{ t: 0, value: prompt / 2 }],
+  hostKvCacheUsage: [],
+  kvCacheUsageByEngine: [],
+});
+
+describe('Agentic point orchestrator metric sources', () => {
+ beforeEach(() => {
+ const prefill = sourceSeries(
+ {
+ id: 'dynamo|prefill|10.30.1.56:7500|prefill-a|0|0',
+ adapter: 'dynamo',
+ role: 'prefill',
+ endpointUrl: '10.30.1.56:7500',
+ nativeRole: 'prefill',
+ workerId: 'prefill-a',
+ dpRank: '0',
+ engine: '0',
+ },
+ 100,
+ 1,
+ );
+ const decode = sourceSeries(
+ {
+ id: 'dynamo|decode|10.30.1.206:7516|decode-a|0|0',
+ adapter: 'dynamo',
+ role: 'decode',
+ endpointUrl: '10.30.1.206:7516',
+ nativeRole: 'backend',
+ workerId: 'decode-a',
+ dpRank: '0',
+ engine: '0',
+ },
+ 300,
+ 400,
+ );
+ cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} });
+ cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+ cy.intercept('GET', '/api/v1/request-timeline*', { statusCode: 404 });
+ cy.intercept('GET', '/api/v1/trace-server-metrics*', {
+ body: {
+ meta: pointMeta,
+ startNs: 0,
+ endNs: 2_000_000_000,
+ durationS: 2,
+ timeslicesCount: 2,
+ kvCacheUsage: prefill.kvCacheUsage,
+ prefixCacheHitRate: prefill.prefixCacheHitRate,
+ queueDepth: prefill.queueDepth,
+ promptTokensBySource: prefill.promptTokensBySource,
+ prefillTps: prefill.promptTps,
+ decodeTps: decode.generationTps,
+ prefixCacheHitsTps: prefill.prefixCacheHitsTps,
+ hostKvCacheUsage: [],
+ kvCacheUsageByEngine: [],
+ metricSources: [prefill, decode],
+ },
+ });
+ cy.visit('/inference/agentic/206885', { onBeforeLoad: unlockAgenticGate });
+ });
+
+ it('switches every server chart to an orchestrator-normalized worker', () => {
+ cy.get('[data-testid="metric-source-toolbar"]')
+ .should('have.css', 'position', 'sticky')
+ .and('have.css', 'top', '64px');
+ cy.get('[data-testid="metric-source-select"]').should('contain.text', 'All endpoints').click();
+ cy.contains('[role="option"]', 'Decode · decode-a').click();
+
+ cy.get('[data-testid="metric-source-select"]').should('contain.text', 'Decode · decode-a');
+ cy.contains('h2', 'Throughput · Decode · decode-a').should('be.visible');
+ cy.contains('svg', 'Decode (avg n=50)').should('be.visible');
+
+ cy.get('[data-testid="metric-source-select"]').click();
+ cy.contains('[role="option"]', 'Prefill · prefill-a').click();
+ cy.contains('h2', 'Throughput · Prefill · prefill-a').should('be.visible');
+ });
+
+ it('toggles input and decode independently while keeping one visible', () => {
+ cy.get('[data-testid="throughput-series-input"]')
+ .should('have.attr', 'aria-pressed', 'true')
+ .and('not.be.disabled');
+ cy.get('[data-testid="throughput-series-decode"]')
+ .should('have.attr', 'aria-pressed', 'true')
+ .and('not.be.disabled');
+ cy.contains('svg', 'Input (avg n=50)').should('be.visible');
+ cy.contains('svg', 'Decode (avg n=50)').should('be.visible');
+ cy.contains('svg', 'Total running avg (60s burn-in)').should('be.visible');
+
+ cy.get('[data-testid="throughput-series-input"]').click();
+ cy.get('[data-testid="throughput-series-input"]').should('have.attr', 'aria-pressed', 'false');
+ cy.get('[data-testid="throughput-series-decode"]').should('be.disabled');
+ cy.contains('svg', 'Input (avg n=50)').should('not.exist');
+ cy.contains('svg', 'Total running avg (60s burn-in)').should('not.exist');
+
+ cy.get('[data-testid="throughput-series-input"]').click();
+ cy.get('[data-testid="throughput-series-decode"]').click();
+ cy.get('[data-testid="throughput-series-input"]').should('be.disabled');
+ cy.get('[data-testid="throughput-series-decode"]').should('have.attr', 'aria-pressed', 'false');
+ });
+});
diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts
new file mode 100644
index 00000000..0d2a7789
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts
@@ -0,0 +1,135 @@
+import { unlockAgenticGate } from '../support/e2e';
+
+/**
+ * Builds a distribution fixture (histogram bins plus summary stats).
+ *
+ * The bins and the `count` / `min` / `mean` stats are constant; the caller
+ * provides the percentile values and `max`, which are merged in after them.
+ *
+ * @param values - Median, p75, p90, p95 and max for this distribution.
+ * @returns `{ bins, stats }` ready to drop into a mocked `chart_data` entry.
+ */
+const distribution = (values: {
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+  max: number;
+}) => {
+  const bins = [
+    { x0: 0, x1: 10, count: 5 },
+    { x0: 10, x1: 100, count: 15 },
+  ];
+  const stats = { count: 20, min: 0, mean: 40, ...values };
+  return { bins, stats };
+};
+
+describe('Dataset distribution percentiles', () => {
+ before(() => {
+ cy.intercept('GET', '/api/v1/datasets/test-dataset', {
+ body: {
+ id: 'test-dataset',
+ slug: 'test-dataset',
+ label: 'Test dataset',
+ variant: 'full',
+ description: null,
+ hf_url: null,
+ license: 'apache-2.0',
+ conversation_count: 1,
+ summary: {
+ mainTurns: 20,
+ subagentGroups: 0,
+ subagentTurns: 0,
+ medianRequestsPerConversation: 12,
+ meanRequestsPerConversation: 14.6,
+ medianSubagentsPerTrace: 3,
+ meanSubagentsPerTrace: 4.8,
+ cachedPct: 0.5,
+ totalIn: 1000,
+ totalOut: 200,
+ },
+ chart_data: {
+ version: 2,
+ inputTokensPerTurn: distribution({
+ median: 100,
+ p75: 200,
+ p90: 300,
+ p95: 400,
+ max: 500,
+ }),
+ outputTokensPerTurn: distribution({
+ median: 10,
+ p75: 20,
+ p90: 30,
+ p95: 40,
+ max: 50,
+ }),
+ uncachedInputTokensPerTurn: distribution({
+ median: 0,
+ p75: 64,
+ p90: 128,
+ p95: 256,
+ max: 512,
+ }),
+ subagentInputTokensPerRequest: distribution({
+ median: 1000,
+ p75: 2000,
+ p90: 3000,
+ p95: 4000,
+ max: 5000,
+ }),
+ subagentOutputTokensPerRequest: distribution({
+ median: 100,
+ p75: 200,
+ p90: 300,
+ p95: 400,
+ max: 500,
+ }),
+ },
+ ingested_at: '2026-06-23T00:00:00Z',
+ },
+ });
+ cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', {
+ body: { total: 0, items: [] },
+ });
+ cy.visit('/datasets/test-dataset', { onBeforeLoad: unlockAgenticGate });
+ });
+
+ it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => {
+ const expected = [
+ ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+ ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']],
+ ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']],
+ ] as const;
+
+ for (const [title, percentiles] of expected) {
+ cy.contains('[data-slot="card"]', title).within(() => {
+ for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+ cy.get('svg line[stroke="#3b82f6"]').should('exist');
+ cy.get('svg line[stroke="#22c55e"]').should('exist');
+ cy.get('svg line[stroke="#f59e0b"]').should('exist');
+ cy.get('svg line[stroke="#ef4444"]').should('exist');
+ });
+ }
+ });
+
+ it('shows median and mean model requests per conversation', () => {
+ cy.contains('dt', 'Median requests / convo').next('dd').should('have.text', '12');
+ cy.contains('dt', 'Mean requests / convo').next('dd').should('have.text', '14.6');
+ });
+
+ it('summarizes subagents per trace instead of charting group counts', () => {
+ cy.contains('dt', 'Median subagents / trace').next('dd').should('have.text', '3');
+ cy.contains('dt', 'Mean subagents / trace').next('dd').should('have.text', '4.8');
+ cy.contains('Subagent groups per conversation').should('not.exist');
+ });
+
+ it('shows ISL and OSL distributions for inner subagent requests only', () => {
+ const expected = [
+ ['Subagent request ISL', ['p50 1.0k', 'p75 2.0k', 'p90 3.0k', 'p95 4.0k']],
+ ['Subagent request OSL', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+ ] as const;
+
+ for (const [title, percentiles] of expected) {
+ cy.contains('[data-slot="card"]', title).within(() => {
+ cy.contains('Inner subagent requests only').should('be.visible');
+ for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+ });
+ }
+ });
+});
diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
new file mode 100644
index 00000000..bdb1adfc
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
@@ -0,0 +1,131 @@
+import { unlockAgenticGate } from '../support/e2e';
+
+describe('Dataset conversation flamegraph timing', () => {
+ before(() => {
+ cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', {
+ body: {
+ conv_id: 'conversation-1',
+ models: ['model-a'],
+ num_turns: 2,
+ num_subagent_groups: 1,
+ total_in: 1000,
+ total_out: 100,
+ total_cached: 500,
+ structure: {
+ blockSize: 64,
+ totals: {
+ in: 1000,
+ out: 100,
+ cached: 500,
+ uncached: 500,
+ numTurns: 2,
+ numSubagentGroups: 1,
+ },
+ nodes: [
+ {
+ kind: 'turn',
+ turnIndex: 0,
+ startS: 0,
+ endS: 1.2,
+ model: 'model-a',
+ in: 100,
+ out: 10,
+ cached: 0,
+ uncached: 100,
+ },
+ {
+ kind: 'subagent',
+ label: 'Explore',
+ agentId: 'agent-1',
+ startS: 3661.2,
+ endS: 3782.6,
+ durationMs: 121_400,
+ in: 800,
+ out: 80,
+ cached: 500,
+ uncached: 300,
+ children: [
+ {
+ kind: 'turn',
+ turnIndex: 1,
+ startS: 3661.2,
+ endS: 3668.2,
+ model: 'model-a',
+ in: 300,
+ out: 30,
+ cached: 150,
+ uncached: 150,
+ },
+ {
+ kind: 'turn',
+ turnIndex: 2,
+ startS: 3665.2,
+ endS: 3671.2,
+ model: 'model-a',
+ in: 300,
+ out: 30,
+ cached: 200,
+ uncached: 100,
+ },
+ {
+ kind: 'turn',
+ turnIndex: 3,
+ startS: 3670.2,
+ endS: 3675.2,
+ model: 'model-a',
+ in: 200,
+ out: 20,
+ cached: 150,
+ uncached: 50,
+ },
+ ],
+ },
+ {
+ kind: 'turn',
+ turnIndex: 2,
+ startS: 65.4,
+ endS: 67.4,
+ model: 'model-a',
+ in: 100,
+ out: 10,
+ cached: 0,
+ uncached: 100,
+ },
+ ],
+ },
+ },
+ });
+ cy.visit('/datasets/test-dataset/conversations/conversation-1', {
+ onBeforeLoad: unlockAgenticGate,
+ });
+ });
+
+ it('shows turn offsets and a collapsed subagent time range', () => {
+ cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00–00:01');
+ cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05–01:07');
+ cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03');
+ cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist');
+ });
+
+ it('shows subturn offsets when the subagent group is expanded', () => {
+ cy.contains('button', 'Explore').click();
+ cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01–1:01:08');
+ // Parallel groups render as left-gutter brackets; each member row carries
+ // one bracket segment per group it belongs to (non-transitive chains keep
+ // their own segments/lanes).
+ cy.get('[data-testid="flamegraph-overlap-g-1-c-0"]')
+ .should('have.length', 1)
+ .and('have.attr', 'data-overlap-group', 'subagent-1-1');
+ cy.get('[data-testid="flamegraph-overlap-g-1-c-1"]')
+ .should('have.length', 2)
+ .then(($segs) => {
+ expect([...$segs].map((seg) => seg.dataset.overlapGroup).toSorted()).to.deep.equal([
+ 'subagent-1-1',
+ 'subagent-1-2',
+ ]);
+ });
+ cy.get('[data-testid="flamegraph-overlap-g-1-c-2"]')
+ .should('have.length', 1)
+ .and('have.attr', 'data-overlap-group', 'subagent-1-2');
+ });
+});
diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts
index 34d95ec3..93658af0 100644
--- a/packages/app/cypress/e2e/dropdown-switching.cy.ts
+++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts
@@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => {
cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true');
cy.get('[role="option"]').should('have.length.greaterThan', 0);
- cy.get('[data-testid="sequence-selector"]').click();
+ cy.get('[data-testid="scenario-selector"]').click();
cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false');
- cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true');
+ cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true');
cy.get('[role="option"]').should('have.length.greaterThan', 0);
});
diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
new file mode 100644
index 00000000..6c832e08
--- /dev/null
+++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
@@ -0,0 +1,188 @@
+import { unlockAgenticGate } from '../support/e2e';
+
+// ---------------------------------------------------------------------------
+// Spec-scoped fixture helpers
+//
+// The shared cypress/fixtures/api/*.json files contain ZERO agentic_traces rows
+// (by design — adding them flips the bare /inference default to the agentic
+// scenario and regresses other specs). This spec therefore injects minimal
+// agentic data via spec-scoped cy.intercept overrides that shadow the fixture
+// server, following the same pattern used in ttft-x-axis-toggle.cy.ts.
+// ---------------------------------------------------------------------------
+
+const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro
+const AGENTIC_DATE = '2026-06-12';
+
+// Two GPUs with agentic + single_turn entries so the scenario selector resolves
+// to agentic (agentic preferred when both types exist for the same model).
+const AGENTIC_HARDWARE = [
+ { hardware: 'b200', framework: 'vllm', disagg: false },
+ { hardware: 'b300', framework: 'vllm', disagg: false },
+];
+
+// Availability rows for every GPU in AGENTIC_HARDWARE, emitted once per scenario
+// and in this order:
+//  1. agentic rows (isl/osl null);
+//  2. single-turn rows alongside — without these the scenario selector may not
+//     see the "both exist" signal it needs to confidently pick agentic.
+const agenticAvailability = [
+  { isl: null, osl: null, benchmark_type: 'agentic_traces' },
+  { isl: 8192, osl: 1024, benchmark_type: 'single_turn' },
+].flatMap((scenario) =>
+  AGENTIC_HARDWARE.map((gpu) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: scenario.isl,
+    osl: scenario.osl,
+    precision: 'fp4',
+    hardware: gpu.hardware,
+    framework: gpu.framework,
+    spec_method: 'none',
+    disagg: gpu.disagg,
+    benchmark_type: scenario.benchmark_type,
+    date: AGENTIC_DATE,
+  })),
+);
+
+/**
+ * Minimal per-metric percentile ladder matching what the chart expects for
+ * agentic rows: median / p75 / p90 / p95 / p99 plus std for one metric family.
+ *
+ * Each rung is a fixed multiple of `base` (1, 1.2, 1.5, 1.7, 2.2; std is 0.3).
+ *
+ * @param prefix - Metric family suffix used in the keys, e.g. `ttft` yields
+ *   `median_ttft`, `p75_ttft`, ….
+ * @param base - Median value the other rungs are scaled from.
+ * @returns Metric-name → value map. `Record` requires both type arguments, so
+ *   the return type is spelled out as `Record<string, number>`.
+ */
+const percentileLadder = (prefix: string, base: number): Record<string, number> => ({
+  [`median_${prefix}`]: base,
+  [`p75_${prefix}`]: base * 1.2,
+  [`p90_${prefix}`]: base * 1.5,
+  [`p95_${prefix}`]: base * 1.7,
+  [`p99_${prefix}`]: base * 2.2,
+  [`std_${prefix}`]: base * 0.3,
+});
+
+/**
+ * Synthesizes the `metrics` object for one agentic benchmark fixture row.
+ *
+ * Latency families (ttft, tpot, itl, e2el) scale linearly with concurrency
+ * relative to `conc = 16`; interactivity values are reciprocals of the matching
+ * ITL rungs, so they shrink as concurrency grows.
+ * NOTE(review): interactivity has median/p75/p90/p99/std but no `p95_intvty`,
+ * while every `percentileLadder` family carries p95 — confirm the omission is
+ * intentional.
+ *
+ * @param conc - Concurrency level of the benchmark row.
+ * @returns Flat metric-name → value map. `Record` requires both type
+ *   arguments, so the return type is spelled out as `Record<string, number>`.
+ */
+const agenticMetrics = (conc: number): Record<string, number> => {
+  const scale = conc / 16;
+  const itl = 0.011 * scale;
+  return {
+    ...percentileLadder('ttft', 0.4 * scale),
+    ...percentileLadder('tpot', 0.012 * scale),
+    ...percentileLadder('itl', itl),
+    ...percentileLadder('e2el', 8 * scale),
+    // Interactivity mirrors the ITL ladder factors (1, 1.2, 1.5, 2.2) inverted.
+    median_intvty: 1 / itl,
+    p75_intvty: 1 / (itl * 1.2),
+    p90_intvty: 1 / (itl * 1.5),
+    p99_intvty: 1 / (itl * 2.2),
+    std_intvty: (1 / itl) * 0.1,
+    tput_per_gpu: 950 / Math.sqrt(scale),
+    output_tput_per_gpu: 210,
+    input_tput_per_gpu: 740,
+    total_tput_tps: 7600 * conc * 0.05,
+  };
+};
+
+// IDs must be unique numbers — the GPU graph uses them as D3 data keys and
+// trace-availability is keyed on them.
+let benchIdCursor = 800100;
+const agenticBenchmarks = AGENTIC_HARDWARE.flatMap((g) =>
+ [16, 64, 128].map((conc) => ({
+ id: benchIdCursor++,
+ hardware: g.hardware,
+ framework: g.framework,
+ model: DEFAULT_MODEL_DB_KEY,
+ precision: 'fp4',
+ spec_method: 'none',
+ disagg: g.disagg,
+ is_multinode: false,
+ prefill_tp: 8,
+ prefill_ep: 1,
+ prefill_dp_attention: false,
+ prefill_num_workers: 0,
+ decode_tp: 8,
+ decode_ep: 1,
+ decode_dp_attention: false,
+ decode_num_workers: 0,
+ num_prefill_gpu: 8,
+ num_decode_gpu: 8,
+ isl: null,
+ osl: null,
+ conc,
+ offload_mode: 'off',
+ benchmark_type: 'agentic_traces',
+ image: 'vllm/vllm-openai:v0.9.0',
+ metrics: agenticMetrics(conc),
+ workers: null,
+ date: AGENTIC_DATE,
+ run_url: null,
+ })),
+);
+
+// All injected IDs with a stored trace blob — the GPU graph renders the
+// "View charts" link only when trace-availability returns true for the id.
+const agenticIds = new Set(agenticBenchmarks.map((b) => b.id));
+
+describe('GPU comparison agentic point detail', () => {
+ it('exposes the per-point charts as a normal browser link', () => {
+ // Shadow the fixture-server availability + benchmarks responses with
+ // spec-scoped agentic data so the GPU graph renders agentic dots.
+ cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as(
+ 'agenticAvailability',
+ );
+ cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('agenticBenchmarks');
+ // Return true for all injected ids so the "View charts" link appears.
+ cy.intercept('GET', '/api/v1/trace-availability*', (request) => {
+ const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? [];
+ if (ids.length < 20) request.alias = 'gpuTraceAvailability';
+ const result = Object.fromEntries(
+ ids.filter((id) => agenticIds.has(Number(id))).map((id) => [id, true]),
+ );
+ request.reply({ body: result });
+ });
+
+ cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', {
+ onBeforeLoad(win) {
+ win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+ unlockAgenticGate(win);
+ },
+ });
+
+ cy.get('[data-testid="gpu-multiselect"] [role="combobox"]').click({ force: true });
+ cy.get('[role="option"]').first().click();
+ cy.contains('button', 'Select date range').click();
+ cy.get('body').then(($body) => {
+ if ($body.text().includes('View anyway')) {
+ cy.contains('button', 'View anyway').click();
+ } else {
+ cy.contains('button', 'Max Range').click();
+ cy.contains('button', 'Apply').click();
+ }
+ });
+
+ cy.get('[data-testid="gpu-graph"]').first().should('be.visible');
+ cy.wait('@gpuTraceAvailability');
+ cy.wait(100);
+ cy.get('[data-testid="gpu-graph"]')
+ .first()
+ .find('svg .dot-group')
+ .should('have.length.greaterThan', 0)
+ .first()
+ .then(($point) => {
+ const point = $point[0] as unknown as SVGElement & {
+ __data__: { benchmark_type?: string; id?: number };
+ };
+ expect(point.__data__.benchmark_type).to.equal('agentic_traces');
+ expect(point.__data__.id).to.be.a('number');
+ cy.wrap($point).find('.visible-shape').click({ force: true });
+ });
+
+ cy.get('[data-chart-tooltip]:visible').should('have.length', 1);
+ cy.get('[data-chart-tooltip]:visible [data-action="view-charts"]')
+ .should('be.visible')
+ .then(($link) => {
+ expect($link).to.match('a');
+ expect($link).not.to.have.attr('target');
+ expect($link.attr('href')).to.match(/^\/inference\/agentic\/\d+$/u);
+ });
+ cy.location('pathname').should('eq', '/inference');
+ });
+});
diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts
index 333baa6d..9c3d3274 100644
--- a/packages/app/cypress/e2e/gradient-labels.cy.ts
+++ b/packages/app/cypress/e2e/gradient-labels.cy.ts
@@ -60,19 +60,19 @@ describe('Gradient Labels Toggle', () => {
});
it('both toggles can be enabled simultaneously', () => {
- // Turn on Gradient Labels (off by default)
+ // Parallelism Labels is off by default; turn it on, then turn on Gradient.
+ cy.get('#scatter-parallelism-labels').then(($el) => {
+ if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+ });
cy.get('#scatter-gradient-labels').click();
cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
- // Turn on Parallelism Labels
- cy.get('#scatter-parallelism-labels').click();
- cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
-
// Both should be checked
cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
- // Reset for next tests
+ // Reset both for next tests (each subsequent test does a fresh cy.visit,
+ // but keep state tidy here too).
cy.get('#scatter-gradient-labels').click();
cy.get('#scatter-parallelism-labels').click();
});
diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts
index f0a70a56..55b0e274 100644
--- a/packages/app/cypress/e2e/historical-trends.cy.ts
+++ b/packages/app/cypress/e2e/historical-trends.cy.ts
@@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => {
delete doc.body.dataset.scrollLocked;
doc.body.style.removeProperty('pointer-events');
});
- cy.get('[data-testid="sequence-selector"]').should('be.visible');
- cy.get('[data-testid="sequence-selector"]').click();
+ cy.get('[data-testid="scenario-selector"]').should('be.visible');
+ cy.get('[data-testid="scenario-selector"]').click();
cy.get('[role="option"]').should('have.length.greaterThan', 0);
cy.get('body').type('{esc}');
});
diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts
index 84e655f8..23b372df 100644
--- a/packages/app/cypress/e2e/line-labels.cy.ts
+++ b/packages/app/cypress/e2e/line-labels.cy.ts
@@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => {
cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels');
});
- it('Line Labels toggle is on by default', () => {
- cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
-
- // Line labels render without any interaction
- cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
- });
-
- it('toggling Line Labels off then back on removes and restores label elements', () => {
- // On by default — turn it off first.
- cy.get('#scatter-line-labels').click();
+ it('Line Labels toggle is off by default', () => {
cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+
+ // No line labels render without interaction
cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
+ });
- // Turn it back on — labels return.
+ it('toggling Line Labels on then back off adds and removes label elements', () => {
+ // Off by default — turn it on first.
cy.get('#scatter-line-labels').click();
cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
+
+ // Turn it back off — labels disappear.
+ cy.get('#scatter-line-labels').click();
+ cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+ cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
});
it('line labels have colored background rects and text', () => {
+ // Off by default — ensure on (idempotent; prior test left them off).
+ cy.get('#scatter-line-labels').then(($el) => {
+ if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+ });
// Each line label group should contain a background rect and text
cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should(
'have.length.greaterThan',
@@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => {
});
it('line labels render in the foreground, after the scatter points', () => {
- // Labels were toggled on in the test above and remain on here.
+ // Off by default — ensure on (idempotent; previous test leaves them on).
+ cy.get('#scatter-line-labels').then(($el) => {
+ if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+ });
cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
cy.get('[data-testid="scatter-graph"] svg').then(($svg) => {
diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index e17a4aff..92b32d33 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -1,46 +1,314 @@
-describe('TTFT X-Axis Toggle (E2E chart)', () => {
+import { unlockAgenticGate } from '../support/e2e';
+
+const interceptDerivedMetrics = () => {
+  // Reply with one synthetic derived-metrics row per id listed in `?ids=`.
+  cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => {
+    const rawIds = new URL(request.url).searchParams.get('ids');
+    const ids = rawIds?.split(',').filter(Boolean) ?? [];
+    const rows = ids.map((id, offset) => {
+      const row = {
+        id: Number(id),
+        normalized_session_time_s: 60 + offset,
+        p90_prefill_tps_per_user: 100 + offset,
+        p75_normalized_e2e_400_s: 8 + offset,
+        p90_normalized_e2e_400_s: 12 + offset,
+      };
+      // Keyed by the id string exactly as it appeared in the query.
+      return [id, row];
+    });
+    request.reply({ body: Object.fromEntries(rows) });
+  }).as('derivedAgenticMetrics');
+};
+
+// This spec exercises the agentic x-axis modes, which only exist when the
+// selected model resolves to the Agentic Traces scenario. The default e2e
+// fixtures (cypress/fixtures/api/*.json) have NO agentic rows for any model, so
+// after the availability-gated effectiveSequence fix the bare-/inference default
+// correctly resolves to a fixed-seq scenario. We therefore inject agentic
+// availability + benchmark rows for the default model VIA SPEC-SCOPED INTERCEPTS
+// (not the shared fixtures) so this test — and only this test — sees the agentic
+// view. Scoping to intercepts keeps every other spec's default fixed-seq.
+const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro is the default model
+const AGENTIC_DATE = '2026-06-12';
+
+// Percentile ladder for one metric family: median/p75/p90/p95/p99/std_<prefix>, each a multiple of base.
+const percentileLadder = (prefix: string, base: number): Record<string, number> => ({
+  [`median_${prefix}`]: base,
+  [`p75_${prefix}`]: base * 1.2,
+  [`p90_${prefix}`]: base * 1.5,
+  [`p95_${prefix}`]: base * 1.7,
+  [`p99_${prefix}`]: base * 2.2,
+  [`std_${prefix}`]: base * 0.3,
+});
+
+const agenticMetrics = (conc: number): Record<string, number> => {
+  const scale = conc / 16; // 1.0 at conc=16; latencies below grow linearly with it
+  const itl = 0.011 * scale;
+  return {
+    ...percentileLadder('ttft', 0.4 * scale),
+    ...percentileLadder('tpot', 0.012 * scale),
+    ...percentileLadder('itl', itl),
+    ...percentileLadder('e2el', 8 * scale),
+    median_intvty: 1 / itl, // intvty values are reciprocals of the matching ITL rung
+    p75_intvty: 1 / (itl * 1.2),
+    p90_intvty: 1 / (itl * 1.5),
+    p99_intvty: 1 / (itl * 2.2),
+    std_intvty: (1 / itl) * 0.1,
+    tput_per_gpu: 950 / Math.sqrt(scale),
+    output_tput_per_gpu: 210,
+    input_tput_per_gpu: 740,
+    total_tput_tps: 7600 * conc * 0.05,
+  };
+};
+
+const agenticGpus = [
+ { hardware: 'b200', framework: 'vllm', disagg: false },
+ { hardware: 'b300', framework: 'vllm', disagg: false },
+];
+
+// Availability: the default model is listed under BOTH agentic and fixed-seq, so
+// the default resolves to agentic (the product-intended, agentic-preferred behavior).
+const agenticAvailability = [
+  ...agenticGpus.map(({ hardware, framework, disagg }) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: null,
+    osl: null,
+    precision: 'fp4',
+    hardware,
+    framework,
+    spec_method: 'none',
+    disagg,
+    benchmark_type: 'agentic_traces',
+    date: AGENTIC_DATE,
+  })),
+  ...agenticGpus.map(({ hardware, framework, disagg }) => ({
+    model: DEFAULT_MODEL_DB_KEY,
+    isl: 8192,
+    osl: 1024,
+    precision: 'fp4',
+    hardware,
+    framework,
+    spec_method: 'none',
+    disagg,
+    benchmark_type: 'single_turn',
+    date: AGENTIC_DATE,
+  })),
+];
+
+// One agentic row per GPU x concurrency; ids count up from 900000 in GPU-major order.
+const agenticBenchmarks = agenticGpus.flatMap(({ hardware, framework, disagg }, gpuIndex) =>
+  [16, 64, 128].map((conc, concIndex, concs) => ({
+    id: 900000 + gpuIndex * concs.length + concIndex,
+    hardware,
+    framework,
+    model: DEFAULT_MODEL_DB_KEY,
+    precision: 'fp4',
+    spec_method: 'none',
+    disagg,
+    is_multinode: false,
+    prefill_tp: 8,
+    decode_tp: 8,
+    num_prefill_gpu: 8,
+    num_decode_gpu: 8,
+    isl: null,
+    osl: null,
+    conc,
+    offload_mode: 'off',
+    benchmark_type: 'agentic_traces',
+    image: 'vllm/vllm-openai:v0.9.0',
+    metrics: agenticMetrics(conc),
+    workers: null,
+    date: AGENTIC_DATE,
+    run_url: null,
+  })),
+);
+
+const interceptAgenticData = () => {
+ cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as('availability');
+ cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('benchmarks');
+};
+
+describe('X-Axis Mode Toggle (inference chart)', () => {
before(() => {
- cy.window().then((win) => {
- win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+ interceptAgenticData();
+ cy.visit('/inference', {
+ onBeforeLoad(win) {
+ win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+ unlockAgenticGate(win);
+ },
});
- cy.visit('/inference');
- cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2);
+ cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
+ cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
});
- it('shows the x-axis dropdown in the e2e chart heading', () => {
- cy.get('[data-testid="chart-figure"]')
- .eq(1)
- .find('h2 button')
- .should('contain.text', 'vs.')
- .and('contain.text', 'Latency');
+ it('shows Interactivity by default for the agentic view', () => {
+ cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces');
+ cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
+ cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
+ cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should('be.visible');
+ cy.get('[data-testid="x-axis-mode-interactivity"]')
+ .should('be.visible')
+ .and('have.attr', 'aria-selected', 'true');
+ cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
});
- it('opens popover with three x-axis options', () => {
- cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
- cy.get('[data-slot="popover-content"]').within(() => {
- cy.contains('End-to-end Latency').should('exist');
- cy.contains('P99 TTFT').should('exist');
- cy.contains('Median TTFT').should('exist');
+ it('switches the x-axis to TTFT and updates the heading', () => {
+ cy.get('[data-testid="x-axis-mode-ttft"]').click();
+ cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true');
+ cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token');
+ });
+
+ it('switches the x-axis to E2E Latency and updates the heading', () => {
+ cy.get('[data-testid="x-axis-mode-e2e"]').click();
+ cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true');
+ cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency');
+ });
+
+ it('switches to request-level normalized E2E at 400 output tokens', () => {
+ interceptDerivedMetrics();
+ cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click();
+ cy.wait('@derivedAgenticMetrics');
+ cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should(
+ 'have.attr',
+ 'aria-selected',
+ 'true',
+ );
+ cy.get('[data-testid="chart-figure"] h2').should(
+ 'contain.text',
+ 'P90 Normalized E2E @ 400 output tokens',
+ );
+ cy.get('[data-testid="chart-figure"] svg').should(
+ 'contain.text',
+ 'P90 Normalized E2E @ 400 output tokens (s)',
+ );
+
+ cy.get('[data-testid="percentile-selector"]').click();
+ cy.contains('[role="option"]', 'p75').click();
+ cy.get('[data-testid="chart-figure"] h2').should(
+ 'contain.text',
+ 'P75 Normalized E2E @ 400 output tokens',
+ );
+ });
+
+ it('switches back to Interactivity', () => {
+ cy.get('[data-testid="x-axis-mode-interactivity"]').click();
+ cy.get('[data-testid="x-axis-mode-interactivity"]').should(
+ 'have.attr',
+ 'aria-selected',
+ 'true',
+ );
+ cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
+ });
+});
+
+// ---------------------------------------------------------------------------
+// Overlay path — regression coverage for unofficial-run overlays with agentic
+// x-axis modes (finding #8 / AGENTS.md: chart features must have overlay tests).
+// The overlay behavior itself is verified correct by prior review; this suite
+// guards against regressions only and does NOT change overlay behavior.
+// ---------------------------------------------------------------------------
+
+// Build a minimal unofficial-run API response that contains one agentic
+// overlay benchmark row so the provider builds overlay chart data.
+const OVERLAY_RUN_ID = 99900000001;
+const OVERLAY_RUN_URL = `https://github.com/SemiAnalysisAI/InferenceX/actions/runs/${OVERLAY_RUN_ID}`;
+
+const overlayBenchmarkRow = {
+ id: 800000,
+ hardware: 'b200',
+ framework: 'vllm',
+ model: DEFAULT_MODEL_DB_KEY,
+ precision: 'fp4',
+ spec_method: 'none',
+ disagg: false,
+ is_multinode: false,
+ prefill_tp: 8,
+ decode_tp: 8,
+ num_prefill_gpu: 8,
+ num_decode_gpu: 8,
+ isl: null,
+ osl: null,
+ conc: 32,
+ offload_mode: 'off',
+ benchmark_type: 'agentic_traces',
+ image: 'vllm/vllm-openai:v0.9.0',
+ metrics: agenticMetrics(32),
+ workers: null,
+ date: AGENTIC_DATE,
+ run_url: OVERLAY_RUN_URL,
+};
+
+const interceptAgenticDataWithOverlay = () => {
+ interceptAgenticData();
+ cy.intercept('GET', '/api/unofficial-run*', {
+ body: {
+ runInfos: [
+ {
+ id: OVERLAY_RUN_ID,
+ name: 'Overlay regression fixture',
+ branch: 'test/overlay-regression',
+ sha: 'abc000',
+ createdAt: `${AGENTIC_DATE}T00:00:00Z`,
+ url: OVERLAY_RUN_URL,
+ conclusion: 'success',
+ status: 'completed',
+ isNonMainBranch: true,
+ },
+ ],
+ benchmarks: [overlayBenchmarkRow],
+ evaluations: [],
+ },
+ }).as('unofficialRun');
+};
+
+describe('X-Axis Mode Toggle — overlay path (finding #8 regression guard)', () => {
+ before(() => {
+ interceptAgenticDataWithOverlay();
+ cy.visit(`/inference?unofficialrun=${OVERLAY_RUN_ID}`, {
+ onBeforeLoad(win) {
+ win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+ unlockAgenticGate(win);
+ },
});
+ cy.wait('@unofficialRun');
+ cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
+ cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
});
- it('switches x-axis to P99 TTFT and updates the heading', () => {
- cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click();
- cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT');
+ it('shows overlay (unofficial-run) watermark SVG when an overlay is loaded', () => {
+ // The unofficial-run pattern watermark appears when isUnofficialRun is true.
+ cy.get('[data-testid="inference-chart-display"] svg pattern[id^="unofficial-pattern-"]').should(
+ 'exist',
+ );
});
- it('switches x-axis to Median TTFT and updates the heading', () => {
- cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
- cy.get('[data-slot="popover-content"]').contains('Median TTFT').click();
- cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT');
+ it('switches to ttft x-axis mode and renders SVG with overlay points', () => {
+ cy.get('[data-testid="x-axis-mode-ttft"]').click();
+ cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true');
+ cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token');
+ // Overlay points render as triangles or circles inside the chart SVG.
+ cy.get('[data-testid="inference-chart-display"] svg').should('exist');
+ cy.get('[data-testid="inference-chart-display"] svg').then(($svgs) => {
+ let total = 0;
+ $svgs.each((_i, svg) => {
+ total += svg.querySelectorAll('circle, polygon, path').length;
+ });
+ expect(total).to.be.greaterThan(0);
+ });
});
- it('switches back to End-to-end Latency', () => {
- cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
- cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click();
- cy.get('[data-testid="chart-figure"]')
- .eq(1)
- .find('h2')
- .should('contain.text', 'End-to-end Latency');
+ it('normalized-e2e mode shows suppression banner for unofficial-run overlays', () => {
+ interceptDerivedMetrics();
+ cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click();
+ cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should(
+ 'have.attr',
+ 'aria-selected',
+ 'true',
+ );
+ // The suppression message appears because isUnofficialRun is true and the
+ // mode is 'normalized-e2e' (documented in ChartDisplay.tsx ~line 640).
+ cy.contains(
+ 'Normalized E2E requires persisted per-request traces, so unofficial-run overlays are unavailable for this experimental view.',
+ ).should('be.visible');
});
});
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 33282b9c..6c827218 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => {
};
const assertNoHydrationMismatch = () => {
- cy.get('[data-testid="sequence-selector"]').should('be.visible');
+ cy.get('[data-testid="scenario-selector"]').should('be.visible');
cy.get('@consoleError').then((spy) => {
const calls = (spy as unknown as { args: unknown[][] }).args;
const hydration = calls.filter((args) =>
@@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => {
it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => {
visitWithErrorSpy('/inference?i_seq=1k/1k');
- cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+ cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
assertNoHydrationMismatch();
});
@@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => {
// Visit the canonical model-prefixed slug so the assertion is directly
// about the rendered page, not about a bare-slug redirect interleaving.
visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k');
- cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+ cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
assertNoHydrationMismatch();
});
it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => {
visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk');
- cy.get('[data-testid="sequence-selector"]')
+ cy.get('[data-testid="scenario-selector"]')
.invoke('text')
.should('not.contain', 'junk')
.and('match', /[18]K . [18]K/u);
@@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => {
// `effectivePrecisions` intersects the selection with available precisions
// and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported.
visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8');
- cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+ cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek');
cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8');
assertNoHydrationMismatch();
@@ -236,12 +236,18 @@ describe('URL Parameter Persistence', () => {
});
describe('High contrast mode', () => {
- it('page loads without high contrast by default', () => {
+ it('inference loads with high contrast off by default', () => {
visitWithDismissedModal('/inference');
cy.get('[data-testid="scatter-graph"]').should('exist');
cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
});
+ it('i_hc=0 disables high contrast on load', () => {
+ visitWithDismissedModal('/inference?i_hc=0');
+ cy.get('[data-testid="scatter-graph"]').should('exist');
+ cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
+ });
+
it('i_hc=1 applies high contrast on load', () => {
visitWithDismissedModal('/inference?i_hc=1');
cy.get('[data-testid="scatter-graph"]').should('exist');
@@ -267,7 +273,9 @@ describe('URL Parameter Persistence', () => {
cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked');
});
- it('historical trends tab has high contrast switch off by default', () => {
+ it('historical trends tab shares the inference high-contrast default (off)', () => {
+ // Historical reads highContrast from the same InferenceContext as the
+ // scatter chart, so it inherits the default-off behavior.
visitWithDismissedModal('/historical');
cy.get('[data-testid="historical-trends-display"]').should('exist');
cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
@@ -279,4 +287,20 @@ describe('URL Parameter Persistence', () => {
cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked');
});
});
+
+ describe('Default toggle states (share-link correctness)', () => {
+ it('a bare /inference link with neither param renders high contrast AND parallelism labels off', () => {
+ visitWithDismissedModal('/inference');
+ cy.get('[data-testid="scatter-graph"]').should('exist');
+ cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
+ cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked');
+ });
+
+ it('i_hc=1&i_advlabel=1 enables both high contrast and parallelism labels on load', () => {
+ visitWithDismissedModal('/inference?i_hc=1&i_advlabel=1');
+ cy.get('[data-testid="scatter-graph"]').should('exist');
+ cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked');
+ cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
+ });
+ });
});
diff --git a/packages/app/cypress/support/e2e.ts b/packages/app/cypress/support/e2e.ts
index d8209e33..0edb08c0 100644
--- a/packages/app/cypress/support/e2e.ts
+++ b/packages/app/cypress/support/e2e.ts
@@ -14,3 +14,22 @@ Cypress.on('window:before:load', (win) => {
// localStorage unavailable — fine, the test will just see the modal.
}
});
+
+/**
+ * Unlock the shared feature gate for specs that exercise agentic surfaces
+ * (the "Agentic Traces" scenario, /datasets, /inference/agentic/[id], and the
+ * Datasets nav link). The gate is OFF by default so the PR can ship without
+ * publicly exposing agentic features; agentic specs opt in by seeding the same
+ * localStorage flag the ↑↑↓↓ konami unlock writes (see use-feature-gate.ts).
+ *
+ * Call from a spec's `cy.visit(..., { onBeforeLoad })`:
+ * cy.visit('/datasets/x', { onBeforeLoad: unlockAgenticGate });
+ * or compose inside an existing hook: `unlockAgenticGate(win)`.
+ */
+export function unlockAgenticGate(win: Window): void {
+ try {
+ win.localStorage.setItem('inferencex-feature-gate', '1');
+ } catch {
+ // localStorage unavailable — spec will see the gate locked and likely 404.
+ }
+}
diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index bcdfe21b..490fca87 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,10 +189,14 @@ export function createMockInferenceContext(
workflowInfo: null,
selectedYAxisMetric: 'y_tpPerGpu',
setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
+ selectedPercentile: 'p90',
+ setSelectedPercentile: namedStub('setSelectedPercentile'),
selectedXAxisMetric: null,
setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
selectedE2eXAxisMetric: null,
setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
+ selectedXAxisMode: 'interactivity' as const,
+ setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
scaleType: 'auto',
setScaleType: namedStub('setScaleType'),
quickFilters: { vendors: [], frameworks: [], disagg: [], spec: [] },
@@ -419,6 +423,9 @@ export function createMockGlobalFilterContext(
selectedPrecisions: [Precision.FP4],
setSelectedPrecisions: namedStub('setSelectedPrecisions_global'),
effectiveSequence: Sequence.EightK_OneK,
+ // Mocks represent a settled state: availability is known and the sequence is
+ // resolved. Tests exercising the pre-availability window override this.
+ sequenceResolved: true,
effectivePrecisions: [Precision.FP4],
selectedRunDate: '2025-03-01',
setSelectedRunDate: namedStub('setSelectedRunDate_global'),
diff --git a/packages/app/next.config.ts b/packages/app/next.config.ts
index 39ab4487..32988f05 100644
--- a/packages/app/next.config.ts
+++ b/packages/app/next.config.ts
@@ -3,6 +3,12 @@ import type { NextConfig } from 'next';
import { allowedDevOriginsFromEnv } from './src/lib/allowed-dev-origins';
const nextConfig: NextConfig = {
+ // Allow a second, isolated dev server (e.g. a dump-mode instance on another
+ // port) to run from the same project dir by pointing it at a separate build
+ // dir via NEXT_DIST_DIR. Defaults to '.next' so the primary server and all
+ // CI/prod builds are unaffected. Next.js's single-dev-server lock lives under
+ // distDir, so distinct dirs let the two coexist.
+ distDir: process.env.NEXT_DIST_DIR || '.next',
allowedDevOrigins: allowedDevOriginsFromEnv(),
transpilePackages: ['@semianalysisai/inferencex-constants'],
serverExternalPackages: ['shiki'],
diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
new file mode 100644
index 00000000..91b769bd
--- /dev/null
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -0,0 +1,29 @@
+import type { Metadata } from 'next';
+import { notFound } from 'next/navigation';
+
+import { AgenticGate } from '@/components/agentic-gate';
+import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
+
+export const metadata: Metadata = {
+ title: 'Agentic trace detail | InferenceX',
+ robots: { index: false },
+};
+
+export default async function AgenticPointDetailPage({
+ params,
+}: {
+ params: Promise<{ id: string }>;
+}) {
+ const { id } = await params;
+ const numericId = Number(id);
+ // benchmark_results.id is a positive bigserial — anything else (`/agentic/abc`,
+ // `/agentic/0`, `/agentic/-1`) can never resolve, so 404 instead of rendering a
+ // blank detail shell that fires doomed id-keyed fetches.
+ if (!isPersistedBenchmarkId(numericId)) notFound();
+ return (
+
+
+
+ );
+}
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 072c99f1..304ccb0b 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -33,6 +33,10 @@ export function normalizeArtifactRows(
if (!params) continue;
const { config } = params;
results.push({
+ // Synthetic id — overlay rows aren't persisted, so trace_replay lookups
+ // (keyed on benchmark_results.id) will always miss, which is the
+ // intended behaviour: overlays never have stored trace_replay blobs.
+ id: 0,
hardware: config.hardware,
framework: config.framework,
model: config.model,
@@ -50,6 +54,8 @@ export function normalizeArtifactRows(
decode_num_workers: config.decodeNumWorkers,
num_prefill_gpu: config.numPrefillGpu,
num_decode_gpu: config.numDecodeGpu,
+ benchmark_type: params.benchmarkType,
+ offload_mode: params.offloadMode,
isl: params.isl,
osl: params.osl,
conc: params.conc,
diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
new file mode 100644
index 00000000..9cb229d4
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -0,0 +1,47 @@
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getAgenticAggregates,
+ STATS_VERSION,
+ type AgenticAggregateMap,
+} from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: response stays small (a few numbers per id), but generating it
+// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the
+// "Aggregates" toggle stays snappy.
+//
+// Key derived from STATS_VERSION (governs the `aggregate_stats` payload). The
+// blob cache is write-once with no post-backfill purge, so deriving the key
+// from the constant is what rolls the namespace on a version bump — a
+// hand-written string would pin the route to stale blob hits forever.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `agentic-aggregates-v${STATS_VERSION}`;
+
+const getCachedAgenticAggregates = cachedQuery(
+  (ids: number[]): Promise<AgenticAggregateMap> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getAgenticAggregates(ids)); // sync provider
+    return getAgenticAggregates(getDb(), ids);
+  },
+  CACHE_KEY_PREFIX,
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/agentic-aggregates?ids=1,2,3
+ *
+ * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization,
+ * and prefix cache hit rate — computed live from the stored aiperf
+ * profile_export.jsonl + server_metrics_json blobs. Ids without a
+ * trace_replay blob (or with no usable samples) get nulls.
+ */
+export const GET = idsQueryRoute({
+ maxIds: 200,
+ logLabel: 'agentic aggregates',
+ fetch: getCachedAgenticAggregates,
+});
diff --git a/packages/app/src/app/api/v1/agentic-cache-keys.test.ts b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts
new file mode 100644
index 00000000..58fa194f
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts
@@ -0,0 +1,70 @@
+/**
+ * Guards that every agentic blob-cache key is DERIVED from the version constant
+ * that governs its payload — not a hand-written string. `blobSet` is write-once
+ * and nothing purges the blob cache after a backfill, so an unversioned (or
+ * hand-bumped) key would serve stale data forever after a payload-version bump.
+ * Deriving the key from the constant means a future bump rolls the cache
+ * namespace automatically; these tests fail loudly if a route drifts back to a
+ * literal string.
+ */
+
+import { describe, expect, it, vi } from 'vitest';
+
+// Route modules import the DB connection and the blob cache when they load (getDb()
+// is expected to run only inside the cachedQuery loader) — stub both so importing is side-effect-free.
+vi.mock('@semianalysisai/inferencex-db/connection', () => ({
+ getDb: vi.fn(() => 'mock-sql'),
+ JSON_MODE: false,
+ FIXTURES_MODE: false,
+}));
+
+vi.mock('@/lib/api-cache', () => ({
+ // Passthrough so importing the route doesn't touch blob storage; the key is
+ // still exported as CACHE_KEY_PREFIX for us to assert on.
+ cachedQuery: (fn: (...args: unknown[]) => unknown) => fn,
+ cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series';
+import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
+
+import { CACHE_KEY_PREFIX as derivedAgenticMetricsKey } from './derived-agentic-metrics/route';
+import { CACHE_KEY_PREFIX as agenticAggregatesKey } from './agentic-aggregates/route';
+import { CACHE_KEY_PREFIX as requestTimelineKey } from './request-timeline/route';
+import { CACHE_KEY_PREFIX as traceServerMetricsKey } from './trace-server-metrics/route';
+import { CACHE_KEY_PREFIX as traceHistogramsKey } from './trace-histograms/route';
+
+describe('agentic blob-cache keys are version-derived', () => {
+ it('derived-agentic-metrics key embeds STATS_VERSION', () => {
+ expect(derivedAgenticMetricsKey).toBe(`derived-agentic-metrics-v${STATS_VERSION}`);
+ });
+
+ it('agentic-aggregates key embeds STATS_VERSION', () => {
+ expect(agenticAggregatesKey).toBe(`agentic-aggregates-v${STATS_VERSION}`);
+ });
+
+ it('request-timeline key embeds REQUEST_TIMELINE_VERSION', () => {
+ expect(requestTimelineKey).toBe(`request-timeline-v${REQUEST_TIMELINE_VERSION}`);
+ });
+
+ it('trace-server-metrics key embeds CHART_SERIES_VERSION', () => {
+ expect(traceServerMetricsKey).toBe(`trace-server-metrics-v${CHART_SERIES_VERSION}`);
+ });
+
+ it('trace-histograms key embeds REQUEST_TIMELINE_VERSION (its payload is read from request_timeline)', () => {
+ expect(traceHistogramsKey).toBe(`trace-histograms-v${REQUEST_TIMELINE_VERSION}`);
+ });
+
+ it('every key actually contains a version segment (no unversioned literals)', () => {
+ for (const key of [ // all five exported key prefixes
+ derivedAgenticMetricsKey,
+ agenticAggregatesKey,
+ requestTimelineKey,
+ traceServerMetricsKey,
+ traceHistogramsKey,
+ ]) {
+ expect(key).toMatch(/-v\d+$/u); // key must end in "-v<digits>"
+ }
+ });
+});
diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
new file mode 100644
index 00000000..0718aae0
--- /dev/null
+++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
@@ -0,0 +1,29 @@
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getBenchmarkSiblings,
+ type BenchmarkSiblings,
+} from '@semianalysisai/inferencex-db/queries/benchmark-siblings';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic'; // Next.js route segment config: always render dynamically
+
+const getCachedSiblings = cachedQuery((id: number): Promise<BenchmarkSiblings | null> => {
+  if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarkSiblings(id)); // JSON_MODE source
+  return getBenchmarkSiblings(getDb(), id); // null → GET answers 404
+}, 'benchmark-siblings');
+
+/**
+ * GET /api/v1/benchmark-siblings?id=N
+ *
+ * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the
+ * benchmark_result + all sibling rows that share that SKU within the same
+ * workflow_run. Used by the agentic detail page to render a navigator.
+ */
+export const GET = idQueryRoute({
+ logLabel: 'benchmark siblings', // logged as "Error fetching benchmark siblings:" on the 500 path
+ fetch: getCachedSiblings, // a falsy result is answered with 404 "Not found"
+});
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts
new file mode 100644
index 00000000..bc374e72
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts
@@ -0,0 +1,71 @@
+import { describe, expect, it, vi, beforeEach } from 'vitest';
+
+const { mockGetConversation, mockGetDb } = vi.hoisted(() => ({
+ mockGetConversation: vi.fn(),
+ mockGetDb: vi.fn(() => 'mock-sql'), // sentinel the route forwards as the query's first argument
+})); // vi.hoisted: created before the hoisted vi.mock factories below run
+
+vi.mock('@semianalysisai/inferencex-db/connection', () => ({
+ getDb: mockGetDb,
+ JSON_MODE: false, // exercise the DB branch, not the json-provider one
+ FIXTURES_MODE: false,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({
+ getConversation: mockGetConversation,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({
+ getConversation: vi.fn(),
+}));
+
+vi.mock('@/lib/api-cache', () => ({
+ cachedQuery: (fn: (...args: any[]) => any) => fn, // passthrough: no caching layer in tests
+ cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { GET } from './route';
+import { NextRequest } from 'next/server';
+
+// The request URL is a fixed placeholder: the handler reads its ids from `params` only.
+const req = (): NextRequest =>
+  new NextRequest(new URL('http://localhost/api/v1/datasets/ds/conversations/x'));
+
+/**
+ * Contract pinned by this suite: App Router decodes each dynamic route segment
+ * EXACTLY ONCE before calling the handler, so `params.convId` already is the raw
+ * conversation id. The route must hand it to the query untouched — a further
+ * decodeURIComponent would over-decode, mis-key '%'/'/' ids, or throw on a lone
+ * '%'. The client (useDatasetConversation) encodeURIComponent's the id before the
+ * fetch, so the whole pipeline decodes once end-to-end.
+ */
+beforeEach(() => {
+  vi.clearAllMocks();
+  mockGetConversation.mockResolvedValue({ conv_id: 'x', turns: [] });
+});
+
+describe('GET /api/v1/datasets/[slug]/conversations/[convId] — decode exactly once', () => {
+ it('passes the already-decoded convId straight through (no second decode)', async () => {
+ const params = Promise.resolve({ slug: 'ds', convId: 'a/b%c' });
+ const res = await GET(req(), { params });
+ expect(res.status).toBe(200);
+ // 'a/b%c' contains a lone '%'; a second decodeURIComponent here would THROW
+ // (→ 500). Passing through means the query sees the raw id verbatim.
+ expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', 'a/b%c');
+ });
+
+ it('preserves special characters (% / # ?) exactly as decoded by App Router', async () => {
+ const raw = 'conv/50%_a#b?c';
+ const params = Promise.resolve({ slug: 'ds', convId: raw });
+ const res = await GET(req(), { params });
+ expect(res.status).toBe(200);
+ expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', raw);
+ });
+
+ it('returns 404 when the conversation is not found', async () => {
+ mockGetConversation.mockResolvedValueOnce(null); // overrides the beforeEach default for this call
+ const params = Promise.resolve({ slug: 'ds', convId: 'missing' });
+ const res = await GET(req(), { params });
+ expect(res.status).toBe(404);
+ });
+});
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
new file mode 100644
index 00000000..35f2fddf
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getConversation,
+ type ConversationDetail,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic'; // Next.js route segment config: always render dynamically
+
+const getCachedConversation = cachedQuery(
+  (slug: string, convId: string): Promise<ConversationDetail | null> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getConversation(slug, convId));
+    return getConversation(getDb(), slug, convId); // null → GET answers 404
+  },
+  'dataset-conversation', // cache-key namespace
+);
+
+/** GET /api/v1/datasets/[slug]/conversations/[convId] — flamegraph structure. */
+export async function GET(
+  _request: NextRequest,
+  context: { params: Promise<{ slug: string; convId: string }> },
+) {
+  // `convId` arrives already decoded exactly once by App Router (the client
+  // encodeURIComponent-encodes it before the fetch), so it is forwarded verbatim;
+  // another decode here would over-decode and mis-key ids containing '%' / '/'.
+  const { slug, convId } = await context.params;
+  try {
+    const conversation = await getCachedConversation(slug, convId);
+    return conversation
+      ? cachedJson(conversation)
+      : NextResponse.json({ error: 'Not found' }, { status: 404 });
+  } catch (err) {
+    console.error('Error fetching dataset conversation:', err);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts
new file mode 100644
index 00000000..b582e79c
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts
@@ -0,0 +1,116 @@
+import { describe, expect, it, vi, beforeEach } from 'vitest';
+
+const { mockListConversations, mockGetDb } = vi.hoisted(() => ({
+ mockListConversations: vi.fn(),
+ mockGetDb: vi.fn(() => 'mock-sql'), // sentinel the route forwards as the query's first argument
+})); // vi.hoisted: created before the hoisted vi.mock factories below run
+
+vi.mock('@semianalysisai/inferencex-db/connection', () => ({
+ getDb: mockGetDb,
+ JSON_MODE: false, // exercise the DB branch, not the json-provider one
+ FIXTURES_MODE: false,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({
+ listConversations: mockListConversations,
+}));
+
+vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({
+ listConversations: vi.fn(),
+}));
+
+vi.mock('@/lib/api-cache', () => ({
+ cachedQuery: (fn: (...args: any[]) => any) => fn, // passthrough: no caching layer in tests
+ cachedJson: (data: unknown) => Response.json(data),
+}));
+
+import { GET } from './route';
+import { NextRequest } from 'next/server';
+
+/** Build a NextRequest for `path`, resolved against a dummy localhost origin. */
+const req = (path: string): NextRequest =>
+  new NextRequest(new URL(path, 'http://localhost'));
+
+const PARAMS = Promise.resolve({ slug: 'test-dataset' }); // route params shared by every case
+
+beforeEach(() => {
+  vi.clearAllMocks();
+});
+
+describe('GET /api/v1/datasets/[slug]/conversations — search input validation', () => {
+ it('returns 400 when search exceeds 100 characters', async () => {
+ const longSearch = 'a'.repeat(101); // one past the route's MAX_SEARCH_LENGTH of 100
+ const res = await GET(req(`/api/v1/datasets/test-dataset/conversations?search=${longSearch}`), {
+ params: PARAMS,
+ });
+ expect(res.status).toBe(400);
+ const body = await res.json();
+ expect(body.error).toBe('search too long');
+ // DB must not be called.
+ expect(mockListConversations).not.toHaveBeenCalled();
+ });
+
+ it('accepts a search string exactly at the 100-character limit', async () => {
+ const exactSearch = 'a'.repeat(100);
+ mockListConversations.mockResolvedValueOnce({ total: 0, items: [] });
+ const res = await GET(
+ req(`/api/v1/datasets/test-dataset/conversations?search=${exactSearch}`),
+ { params: PARAMS },
+ );
+ expect(res.status).toBe(200);
+ });
+
+ it('trims whitespace before applying the length check', async () => {
+ // A 101-char string that is 100 chars of spaces + 1 real char should become
+ // 1 char after trimming — well under the limit.
+ const paddedSearch = `${' '.repeat(100)}a`;
+ mockListConversations.mockResolvedValueOnce({ total: 1, items: [] });
+ const res = await GET(
+ req(`/api/v1/datasets/test-dataset/conversations?search=${paddedSearch}`),
+ { params: PARAMS },
+ );
+ expect(res.status).toBe(200);
+ expect(mockListConversations).toHaveBeenCalledWith(
+ 'mock-sql',
+ 'test-dataset',
+ expect.objectContaining({ search: 'a' }), // the trimmed value is what reaches the query
+ );
+ });
+
+ it('returns 404 when the dataset slug is unknown', async () => {
+ mockListConversations.mockResolvedValueOnce(null); // the route maps a null result to 404
+ const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), {
+ params: PARAMS,
+ });
+ expect(res.status).toBe(404);
+ const body = await res.json();
+ expect(body.error).toBe('Not found');
+ });
+
+ it('returns conversation data for a valid request', async () => {
+ const mockData = { total: 2, items: [{ conv_id: 'c1' }, { conv_id: 'c2' }] };
+ mockListConversations.mockResolvedValueOnce(mockData);
+ const res = await GET(
+ req('/api/v1/datasets/test-dataset/conversations?search=agent&sort=turns&limit=10&offset=0'),
+ { params: PARAMS },
+ );
+ expect(res.status).toBe(200);
+ const body = await res.json();
+ expect(body).toEqual(mockData);
+ expect(mockListConversations).toHaveBeenCalledWith(
+ 'mock-sql',
+ 'test-dataset',
+ expect.objectContaining({ search: 'agent', sort: 'turns', limit: 10, offset: 0 }),
+ );
+ });
+
+ it('returns 500 when the query throws', async () => {
+ mockListConversations.mockRejectedValueOnce(new Error('Neon timeout'));
+ const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), {
+ params: PARAMS,
+ });
+ expect(res.status).toBe(500);
+ const body = await res.json();
+ expect(body.error).toBe('Internal server error'); // generic message; the thrown text is not echoed
+ });
+});
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
new file mode 100644
index 00000000..2dad4ace
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
@@ -0,0 +1,71 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ listConversations,
+ type ConversationList,
+ type ListConversationsOpts,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const SORTS = new Set(['tokens', 'turns', 'subagents', 'id']); // accepted `?sort=` values
+
+const getCachedConversations = cachedQuery(
+  (
+    slug: string,
+    search: string,
+    limit: number,
+    offset: number,
+    sort: string,
+  ): Promise<ConversationList | null> => {
+    const opts: ListConversationsOpts = {
+      search: search || undefined, // '' means "no filter"
+      limit,
+      offset,
+      sort: sort as ListConversationsOpts['sort'], // GET only passes members of SORTS
+    };
+    if (JSON_MODE) return Promise.resolve(jsonProvider.listConversations(slug, opts));
+    return listConversations(getDb(), slug, opts); // null → GET answers 404
+  },
+  'dataset-conversations',
+);
+
+// Maximum search string length accepted. Longer strings are rejected with 400
+// rather than being forwarded to the DB: an ILIKE on an unindexed conv_id column
+// with a very long pattern (or many stacked wildcards) can exhaust Neon's
+// statement timeout and return a 500. 100 chars is generous for any real
+// conversation-id prefix while keeping the attack surface small.
+const MAX_SEARCH_LENGTH = 100;
+
+/**
+ * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort=
+ * Paginated conversation list (counts only, no flamegraph structure).
+ * `search` is trimmed and must fit MAX_SEARCH_LENGTH (else 400); `limit` is a whole
+ * number in 1..200 (default 50), `offset` a whole number >= 0; unknown `sort` → `tokens`.
+ */
+export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) {
+  const { slug } = await params;
+  const sp = request.nextUrl.searchParams;
+  const search = (sp.get('search') ?? '').trim();
+  if (search.length > MAX_SEARCH_LENGTH) {
+    return NextResponse.json({ error: 'search too long' }, { status: 400 });
+  }
+  // Number() accepts `1.5` and `Infinity`; truncate/cap so the query only sees whole, finite values.
+  const limit = Math.trunc(Math.min(200, Math.max(1, Number(sp.get('limit')) || 50)));
+  const rawOffset = Math.max(0, Number(sp.get('offset')) || 0);
+  const offset = Math.trunc(Math.min(Number.MAX_SAFE_INTEGER, rawOffset));
+  const sortParam = sp.get('sort') ?? 'tokens';
+  const sort = SORTS.has(sortParam) ? sortParam : 'tokens';
+  try {
+    const data = await getCachedConversations(slug, search, limit, offset, sort);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversations:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
new file mode 100644
index 00000000..e440ff5d
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
@@ -0,0 +1,30 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic'; // Next.js route segment config: always render dynamically
+
+const getCachedDataset = cachedQuery((slug: string): Promise<DatasetDetail | null> => {
+  if (JSON_MODE) return Promise.resolve(jsonProvider.getDataset(slug)); // JSON_MODE source
+  return getDataset(getDb(), slug); // null → GET answers 404
+}, 'dataset');
+
+/**
+ * GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data; 404 on a miss.
+ */
+export async function GET(_request: NextRequest, context: { params: Promise<{ slug: string }> }) {
+  const { slug } = await context.params;
+  try {
+    const dataset = await getCachedDataset(slug);
+    return dataset
+      ? cachedJson(dataset)
+      : NextResponse.json({ error: 'Not found' }, { status: 404 });
+  } catch (err) {
+    console.error('Error fetching dataset:', err);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts
new file mode 100644
index 00000000..3ad4c15d
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/route.ts
@@ -0,0 +1,25 @@
+import { NextResponse } from 'next/server';
+
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic'; // Next.js route segment config: always render dynamically
+
+const getCachedDatasets = cachedQuery((): Promise<DatasetRecord[]> => {
+  if (JSON_MODE) return Promise.resolve(jsonProvider.listDatasets()); // JSON_MODE source
+  return listDatasets(getDb()); // otherwise the DB query
+}, 'datasets');
+
+/** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards). */
+export async function GET() {
+  try {
+    // No 404 branch: whatever the list query returns (even an empty list) is sent as-is.
+    return cachedJson(await getCachedDatasets());
+  } catch (err) {
+    console.error('Error fetching datasets:', err);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
new file mode 100644
index 00000000..3afa5d41
--- /dev/null
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -0,0 +1,54 @@
+import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getDerivedAgenticMetrics,
+ type DerivedAgenticMetricMap,
+} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: the response is one small entry per id (a handful of numbers), but
+// the derivation work parses thousands of JSONL records per blob — cache the
+// computed result so a chart-refresh hits the warm path.
+//
+// The cache key is derived from STATS_VERSION, which governs the
+// `aggregate_stats` payload the derived metrics are read from. blobSet is
+// write-once and nothing purges post-backfill, so a hand-written version string
+// would serve stale data forever after a bump — deriving the key from the
+// constant means a STATS_VERSION bump automatically rolls the cache namespace.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `derived-agentic-metrics-v${STATS_VERSION}`;
+
+const getCachedDerivedAgenticMetrics = cachedQuery(
+  (ids: number[]): Promise<DerivedAgenticMetricMap> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getDerivedAgenticMetrics(ids));
+    return getDerivedAgenticMetrics(getDb(), ids);
+  },
+  CACHE_KEY_PREFIX,
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/derived-agentic-metrics?ids=1,2,3
+ *
+ * Returns per-id derived metrics computed live from the stored aiperf
+ * profile_export.jsonl blobs:
+ * - normalized_session_time_s: mean across sessions of session e2e time
+ * (Σ per-turn request_latency) rescaled by mean_load / session_load.
+ * - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT)
+ * across every turn in every session.
+ * - p75/p90_normalized_e2e_400_s: percentile of per-request
+ * TTFT + 399 × observed ITL.
+ *
+ * Ids without a trace_replay blob or with unparseable records are omitted.
+ */
+export const GET = idsQueryRoute({
+ maxIds: 200, // more than 200 distinct ids → 400 "too many ids (max 200)"
+ logLabel: 'derived agentic metrics', // logged as "Error fetching derived agentic metrics:" on the 500 path
+ fetch: getCachedDerivedAgenticMetrics,
+});
diff --git a/packages/app/src/app/api/v1/id-routes.test.ts b/packages/app/src/app/api/v1/id-routes.test.ts
new file mode 100644
index 00000000..32499e99
--- /dev/null
+++ b/packages/app/src/app/api/v1/id-routes.test.ts
@@ -0,0 +1,136 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('@/lib/api-cache', () => ({
+ cachedJson: (data: unknown) => Response.json(data), // plain JSON response stands in for the real helper
+}));
+
+import { NextRequest, NextResponse } from 'next/server';
+
+import { idQueryRoute, idsQueryRoute, parseIdsParam } from './id-routes';
+
+/** Build a NextRequest for a path + query, resolved against a dummy localhost origin. */
+const req = (url: string): NextRequest =>
+  new NextRequest(new URL(url, 'http://localhost'));
+
+beforeEach(() => {
+  vi.clearAllMocks();
+});
+
+describe('parseIdsParam', () => {
+ it('parses, dedupes, and sorts ids ascending', () => {
+ const result = parseIdsParam(req('/x?ids=3, 1,2,3'), 200); // unordered, duplicated, with a stray space
+ expect(result).toEqual([1, 2, 3]);
+ });
+
+ it('drops non-finite and non-positive ids', () => {
+ const result = parseIdsParam(req('/x?ids=abc,-1,0,5'), 200);
+ expect(result).toEqual([5]);
+ });
+
+ it('returns 400 when the param is missing', async () => {
+ const result = parseIdsParam(req('/x'), 200);
+ expect(result).toBeInstanceOf(NextResponse);
+ const res = result as NextResponse;
+ expect(res.status).toBe(400);
+ const body = await res.json();
+ expect(body.error).toBe('ids query param is required');
+ });
+
+ it('returns 400 when no valid ids remain', async () => {
+ const result = parseIdsParam(req('/x?ids=abc,-2'), 200);
+ expect(result).toBeInstanceOf(NextResponse);
+ const res = result as NextResponse;
+ expect(res.status).toBe(400);
+ const body = await res.json();
+ expect(body.error).toBe('no valid ids provided');
+ });
+
+ it('returns 400 when the id count exceeds maxIds', async () => {
+ const result = parseIdsParam(req('/x?ids=1,2,3'), 2); // three distinct ids against a cap of two
+ expect(result).toBeInstanceOf(NextResponse);
+ const res = result as NextResponse;
+ expect(res.status).toBe(400);
+ const body = await res.json();
+ expect(body.error).toBe('too many ids (max 2)');
+ });
+});
+
+describe('idsQueryRoute', () => {
+ it('fetches with sorted deduped ids and returns the payload', async () => {
+ const fetch = vi.fn().mockResolvedValue({ 1: 'a', 2: 'b' });
+ const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch });
+
+ const res = await GET(req('/x?ids=2,1,2'));
+ expect(res.status).toBe(200);
+ expect(await res.json()).toEqual({ 1: 'a', 2: 'b' });
+ expect(fetch).toHaveBeenCalledWith([1, 2]); // normalized before reaching the loader
+ });
+
+ it('returns 400 without calling fetch when ids are invalid', async () => {
+ const fetch = vi.fn();
+ const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch });
+
+ const res = await GET(req('/x'));
+ expect(res.status).toBe(400);
+ expect(fetch).not.toHaveBeenCalled();
+ });
+
+ it('returns 500 and logs when the fetch throws', async () => {
+ const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); // keep test output quiet
+ const fetch = vi.fn().mockRejectedValue(new Error('boom'));
+ const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch });
+
+ const res = await GET(req('/x?ids=1'));
+ expect(res.status).toBe(500);
+ const body = await res.json();
+ expect(body.error).toBe('Internal server error');
+ expect(consoleSpy).toHaveBeenCalledWith('Error fetching things:', expect.any(Error));
+ consoleSpy.mockRestore();
+ });
+});
+
+describe('idQueryRoute', () => {
+ it('fetches by id and returns the payload', async () => {
+ const fetch = vi.fn().mockResolvedValue({ value: 42 });
+ const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+ const res = await GET(req('/x?id=7'));
+ expect(res.status).toBe(200);
+ expect(await res.json()).toEqual({ value: 42 });
+ expect(fetch).toHaveBeenCalledWith(7); // parsed to a number, not the raw string
+ });
+
+ it.each(['/x', '/x?id=abc', '/x?id=0'])('returns 400 for %s', async (url) => { // missing, non-numeric, zero
+ const fetch = vi.fn();
+ const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+ const res = await GET(req(url));
+ expect(res.status).toBe(400);
+ const body = await res.json();
+ expect(body.error).toBe('id is required (benchmark_result_id)');
+ expect(fetch).not.toHaveBeenCalled();
+ });
+
+ it('returns 404 when the fetch yields null', async () => {
+ const fetch = vi.fn().mockResolvedValue(null);
+ const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+ const res = await GET(req('/x?id=7'));
+ expect(res.status).toBe(404);
+ const body = await res.json();
+ expect(body.error).toBe('Not found');
+ });
+
+ it('returns 500 and logs when the fetch throws', async () => {
+ const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); // keep test output quiet
+ const fetch = vi.fn().mockRejectedValue(new Error('boom'));
+ const GET = idQueryRoute({ logLabel: 'thing', fetch });
+
+ const res = await GET(req('/x?id=7'));
+ expect(res.status).toBe(500);
+ const body = await res.json();
+ expect(body.error).toBe('Internal server error');
+ expect(consoleSpy).toHaveBeenCalledWith('Error fetching thing:', expect.any(Error));
+ consoleSpy.mockRestore();
+ });
+});
diff --git a/packages/app/src/app/api/v1/id-routes.ts b/packages/app/src/app/api/v1/id-routes.ts
new file mode 100644
index 00000000..fea9221b
--- /dev/null
+++ b/packages/app/src/app/api/v1/id-routes.ts
@@ -0,0 +1,85 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { cachedJson } from '@/lib/api-cache';
+
+/**
+ * Shared GET-handler factories for the agentic benchmark routes, which all
+ * key off `benchmark_results.id`. Two shapes exist:
+ * - bulk `?ids=1,2,3` routes returning a map keyed by id
+ * - single `?id=N` routes returning one payload or 404
+ *
+ * Both preserve the v1 error contract: 400 with `{error}` for bad params,
+ * 404 `{error: 'Not found'}` when a single-id lookup misses, and 500
+ * `{error: 'Internal server error'}` (with a console.error) on query failure.
+ * Success payloads go through `cachedJson` for CDN caching + gzip.
+ */
+
+/**
+ * Parse the comma-separated `ids` query param into a deduped, ascending list.
+ * Tokens that are not positive safe integers (`abc`, `0`, `-1`, `1.5`, …) are
+ * dropped — presumably `benchmark_results.id` is an integer key, so they could
+ * never match a row. Sorted so the same set of ids in any order hits the same
+ * cache entry.
+ *
+ * @returns the ids, or a 400 NextResponse when the param is missing, holds no
+ *   valid id, or holds more than `maxIds` distinct ids.
+ */
+export function parseIdsParam(request: NextRequest, maxIds: number): number[] | NextResponse {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  // Number('') is 0, so empty tokens ("1,,2") drop out with the other invalid ones.
+  const parsed = raw.split(',').map((token) => Number(token.trim()));
+  const ids = [...new Set(parsed.filter((n) => Number.isSafeInteger(n) && n > 0))];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > maxIds) { // counted after dedupe
+    return NextResponse.json({ error: `too many ids (max ${maxIds})` }, { status: 400 });
+  }
+  return ids.toSorted((a, b) => a - b);
+}
+
+/** Build a GET handler for a bulk `?ids=…` route: 400 on bad ids, 500 (logged) if `fetch` rejects. */
+export function idsQueryRoute<T>(options: {
+  maxIds: number;
+  /** Human-readable name used in the 500-path console.error. */
+  logLabel: string;
+  fetch: (ids: number[]) => Promise<T>; // receives the deduped, ascending ids
+}): (request: NextRequest) => Promise<Response> {
+  const { maxIds, logLabel, fetch } = options;
+  return async (request: NextRequest) => {
+    const ids = parseIdsParam(request, maxIds);
+    if (ids instanceof NextResponse) return ids;
+    try {
+      return cachedJson(await fetch(ids));
+    } catch (error) {
+      console.error(`Error fetching ${logLabel}:`, error);
+      return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+    }
+  };
+}
+
+/** Build a GET handler for a single `?id=N` route: 400 on a bad id, 404 when the fetch misses. */
+export function idQueryRoute<T>(options: {
+  logLabel: string;
+  fetch: (id: number) => Promise<T | null>;
+}): (request: NextRequest) => Promise<Response> {
+  const { logLabel, fetch } = options;
+  return async (request: NextRequest) => {
+    const id = Number(request.nextUrl.searchParams.get('id')); // missing param → Number(null) === 0
+    if (!Number.isSafeInteger(id) || id <= 0) { // ids are positive whole numbers
+      return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+    }
+    try {
+      const data = await fetch(id);
+      if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+      return cachedJson(data);
+    } catch (error) {
+      console.error(`Error fetching ${logLabel}:`, error);
+      return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+    }
+  };
+}
diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
new file mode 100644
index 00000000..89b599af
--- /dev/null
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -0,0 +1,42 @@
+import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getRequestTimeline,
+ type RequestTimeline,
+} from '@semianalysisai/inferencex-db/queries/request-timeline';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idQueryRoute } from '../id-routes';
+
+export const dynamic = 'force-dynamic';
+
+// Key derived from REQUEST_TIMELINE_VERSION (governs the `request_timeline`
+// payload). The blob cache is write-once with no post-backfill purge, so the
+// version-derived key is what rolls the namespace on a bump — a hand-written
+// string would serve stale blob-cached timelines forever.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `request-timeline-v${REQUEST_TIMELINE_VERSION}`;
+
+const getCachedRequestTimeline = cachedQuery(
+  (id: number): Promise<RequestTimeline | null> => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getRequestTimeline(id));
+    return getRequestTimeline(getDb(), id); // null → GET answers 404
+  },
+  CACHE_KEY_PREFIX,
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/request-timeline?id=N
+ *
+ * Returns the per-request Gantt timeline for one agentic benchmark point.
+ * Each request entry has ns-from-start offsets for credit/start/ack/end,
+ * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 404 if the
+ * point has no stored profile_export.jsonl blob.
+ */
+export const GET = idQueryRoute({
+ logLabel: 'request timeline', // logged as "Error fetching request timeline:" on the 500 path
+ fetch: getCachedRequestTimeline, // a falsy result is answered with 404 "Not found"
+});
diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts
new file mode 100644
index 00000000..45eafef4
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-availability/route.ts
@@ -0,0 +1,29 @@
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+ getTraceAvailability,
+ type TraceAvailabilityMap,
+} from '@semianalysisai/inferencex-db/queries/trace-availability';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+// Next.js route segment config: opt this handler out of static rendering.
+export const dynamic = 'force-dynamic';
+
+/**
+ * Cached lookup of trace availability for a batch of ids, stored under the
+ * fixed `'trace-availability'` cache key.
+ *
+ * NOTE(review): unlike the request-timeline, trace-histograms and
+ * trace-server-metrics routes, this loader has no JSON_MODE branch and always
+ * calls `getDb()` — confirm that is intended when the app runs in JSON mode.
+ */
+const getCachedTraceAvailability = cachedQuery(
+ (ids: number[]): Promise => getTraceAvailability(getDb(), ids),
+ 'trace-availability',
+);
+
+/**
+ * GET /api/v1/trace-availability?ids=1,2,3
+ *
+ * Returns `{[id]: true}` for ids that have a stored trace_replay blob.
+ * Lightweight presence check used by the scatter tooltip to decide whether
+ * to render the "View charts" button — see queries/trace-availability.ts.
+ * The route is configured with `maxIds: 500`.
+ */
+export const GET = idsQueryRoute({
+ maxIds: 500,
+ logLabel: 'trace availability',
+ fetch: getCachedTraceAvailability,
+});
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
new file mode 100644
index 00000000..4d3014ab
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -0,0 +1,47 @@
+import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getTraceHistograms,
+ type TraceHistogramMap,
+} from '@semianalysisai/inferencex-db/queries/trace-histograms';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idsQueryRoute } from '../id-routes';
+
+// Next.js route segment config: opt this handler out of static rendering.
+export const dynamic = 'force-dynamic';
+
+// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB
+// unstable_cache limit (each point carries one int per request, ~500-1000+
+// requests for agentic), which manifests as a 500 from the route. Blob
+// storage lets us cache the larger response without losing the warm-cache hit.
+//
+// Key derived from REQUEST_TIMELINE_VERSION: the histograms are read out of the
+// `request_timeline` payload (getTraceHistograms keys its fast path off that
+// constant). The blob cache is write-once with no post-backfill purge, so the
+// version-derived key is what rolls the namespace on a bump — the previously
+// unversioned key would serve stale histograms forever.
+/** Version-derived blob-cache key namespace for the histogram responses. */
+export const CACHE_KEY_PREFIX = `trace-histograms-v${REQUEST_TIMELINE_VERSION}`;
+
+/**
+ * Cached loader for the histograms of a batch of ids.
+ *
+ * In JSON_MODE the value comes from the JSON provider and is wrapped in
+ * `Promise.resolve` so both branches return a promise; otherwise it is queried
+ * through `getDb()`. `cachedQuery` stores results under `CACHE_KEY_PREFIX`
+ * with `blobOnly: true`.
+ */
+const getCachedTraceHistograms = cachedQuery(
+ (ids: number[]): Promise => {
+ if (JSON_MODE) return Promise.resolve(jsonProvider.getTraceHistograms(ids));
+ return getTraceHistograms(getDb(), ids);
+ },
+ CACHE_KEY_PREFIX,
+ { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-histograms?ids=1,2,3
+ *
+ * Returns per-request ISL/OSL arrays parsed from the stored aiperf
+ * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`.
+ * Ids without a trace_replay blob are omitted from the response.
+ * The route is configured with `maxIds: 200`.
+ */
+export const GET = idsQueryRoute({
+ maxIds: 200,
+ logLabel: 'trace histograms',
+ fetch: getCachedTraceHistograms,
+});
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
new file mode 100644
index 00000000..2d3554a4
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -0,0 +1,42 @@
+import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series';
+import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
+import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
+import {
+ getTraceServerMetrics,
+ type TraceServerMetrics,
+} from '@semianalysisai/inferencex-db/queries/trace-server-metrics';
+
+import { cachedQuery } from '@/lib/api-cache';
+
+import { idQueryRoute } from '../id-routes';
+
+// Next.js route segment config: opt this handler out of static rendering.
+export const dynamic = 'force-dynamic';
+
+// Key derived from CHART_SERIES_VERSION (governs the `chart_series` payload).
+// The blob cache is write-once with no post-backfill purge, so the
+// version-derived key is what rolls the namespace on a bump — a hand-written
+// string would serve stale blob-cached series forever.
+/** Version-derived blob-cache key namespace (exported for the key-derivation test). */
+export const CACHE_KEY_PREFIX = `trace-server-metrics-v${CHART_SERIES_VERSION}`;
+
+/**
+ * Cached loader for one point's server-metrics series.
+ *
+ * In JSON_MODE the JSON provider's result is returned directly (presumably it
+ * is already a promise — confirm against json-provider); otherwise the data
+ * is queried through `getDb()`. `cachedQuery` stores results under
+ * `CACHE_KEY_PREFIX` with `blobOnly: true`.
+ */
+const getCachedTraceServerMetrics = cachedQuery(
+ (id: number): Promise => {
+ if (JSON_MODE) return jsonProvider.getTraceServerMetrics(id);
+ return getTraceServerMetrics(getDb(), id);
+ },
+ CACHE_KEY_PREFIX,
+ { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-server-metrics?id=N
+ *
+ * Returns parsed time-series for the agentic detail view: KV cache usage,
+ * prefix cache hit rate per interval, queue depth, and per-source prompt
+ * token rates. Times are in seconds from benchmark start. 404 if the point
+ * has no stored server_metrics_export.json blob.
+ */
+export const GET = idQueryRoute({
+ logLabel: 'trace server metrics',
+ fetch: getCachedTraceServerMetrics,
+});
diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
new file mode 100644
index 00000000..5bc8fea9
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -0,0 +1,45 @@
+import { Suspense } from 'react';
+import type { Metadata } from 'next';
+
+import { AgenticGate } from '@/components/agentic-gate';
+import { ConversationView } from '@/components/datasets/conversation-view';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+/** Route props: the dynamic `[slug]` and `[convId]` segments, delivered as a promise. */
+interface Props {
+  params: Promise<{ slug: string; convId: string }>;
+}
+
+/**
+ * Build the metadata for a single conversation page.
+ *
+ * The title and description use the first 12 characters of the conversation
+ * id. Both dynamic segments are percent-encoded when building the canonical
+ * URL so an id or slug containing reserved characters cannot produce a
+ * malformed link. The page is marked `index: false`.
+ *
+ * @param params - Promise resolving to the route's `slug` and `convId`.
+ * @returns Metadata with title, description, canonical URL and robots rule.
+ */
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug, convId } = await params;
+  // App Router has already decoded the dynamic segments exactly once, so
+  // `slug` and `convId` are raw values here. Re-encode both for the canonical URL.
+  const short = convId.slice(0, 12);
+  const title = `Conversation ${short} | ${slug}`;
+  const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`;
+  const canonical = `${SITE_URL}/datasets/${encodeURIComponent(slug)}/conversations/${encodeURIComponent(convId)}`;
+  return {
+    title,
+    description,
+    alternates: { canonical },
+    robots: { index: false }, // per-conversation pages are too numerous to index
+  };
+}
+
+/**
+ * Page component for `/datasets/[slug]/conversations/[convId]`.
+ *
+ * Awaits the route params and hands `slug` / `convId` to the rendered tree
+ * without decoding them again.
+ */
+export default async function ConversationPage({ params }: Props) {
+ const { slug, convId } = await params;
+ // `convId` is already decoded once by App Router — pass it straight through.
+ // A second decodeURIComponent here would over-decode (and throw for ids that
+ // contain a literal '%'). ConversationView re-encodes when it builds the API URL.
+ return (
+
+
+
+
+
+
+
+
+
+ );
+}
diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx
new file mode 100644
index 00000000..c853a695
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/page.tsx
@@ -0,0 +1,35 @@
+import type { Metadata } from 'next';
+
+import { AgenticGate } from '@/components/agentic-gate';
+import { DatasetDetail } from '@/components/datasets/dataset-detail';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+/** Route props: the dynamic `[slug]` segment, delivered as a promise. */
+interface Props {
+  params: Promise<{ slug: string }>;
+}
+
+/**
+ * Build the metadata for a dataset detail page.
+ *
+ * The slug is percent-encoded when building the page URL so a slug containing
+ * reserved characters cannot produce a malformed canonical or OpenGraph link.
+ * The same URL and social title are reused for every field that needs them.
+ *
+ * @param params - Promise resolving to the route's `slug`.
+ * @returns Metadata with title, description, canonical URL, OpenGraph and Twitter fields.
+ */
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug } = await params;
+  const title = `${slug} | Agentic Datasets`;
+  const description = `Distributions, token statistics, and per-conversation flamegraphs for the ${slug} agentic trace dataset.`;
+  const url = `${SITE_URL}/datasets/${encodeURIComponent(slug)}`;
+  const socialTitle = `${title} | InferenceX`;
+  return {
+    title,
+    description,
+    alternates: { canonical: url },
+    openGraph: { title: socialTitle, description, url },
+    twitter: { title: socialTitle, description },
+  };
+}
+
+/**
+ * Page component for `/datasets/[slug]`: awaits the route params and renders
+ * the detail view for that slug.
+ */
+export default async function DatasetDetailPage({ params }: Props) {
+ const { slug } = await params;
+ return (
+
+
+
+
+
+
+
+ );
+}
diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx
new file mode 100644
index 00000000..711e0dbc
--- /dev/null
+++ b/packages/app/src/app/datasets/page.tsx
@@ -0,0 +1,108 @@
+import type { Metadata } from 'next';
+
+import { AgenticGate } from '@/components/agentic-gate';
+import { Card } from '@/components/ui/card';
+import { JsonLd } from '@/components/json-ld';
+import { DatasetList } from '@/components/datasets/dataset-list';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+/** Shared description used by the page metadata and the JSON-LD payload. */
+const DESCRIPTION =
+  'The real Claude Code agentic conversation traces that the InferenceX agentic benchmark replays — methodology, distributions, and per-conversation flamegraphs.';
+
+/** Absolute URL of the datasets index page. */
+const DATASETS_URL = `${SITE_URL}/datasets`;
+
+/** Title used for social cards (OpenGraph and Twitter). */
+const SOCIAL_TITLE = 'Agentic Datasets | InferenceX';
+
+/** Static metadata for the datasets index page. */
+export const metadata: Metadata = {
+  title: 'Agentic Datasets',
+  description: DESCRIPTION,
+  alternates: { canonical: DATASETS_URL },
+  openGraph: { title: SOCIAL_TITLE, description: DESCRIPTION, url: DATASETS_URL },
+  twitter: { title: SOCIAL_TITLE, description: DESCRIPTION },
+};
+
+/** schema.org `CollectionPage` description of the datasets index. */
+const jsonLd = {
+  '@context': 'https://schema.org',
+  '@type': 'CollectionPage',
+  name: 'InferenceX Agentic Datasets',
+  description: DESCRIPTION,
+  url: DATASETS_URL,
+};
+
+/** Default export for the `/datasets` route. */
+export default function DatasetsPage() {
+ return (
+
+
+
+ );
+}
+
+/**
+ * Static body of the datasets index page: the methodology copy (how traces
+ * are captured, cached prefix vs uncached suffix, dataset variants).
+ */
+function DatasetsPageContent() {
+ return (
+
+
+
+
+
+
+ Agentic Benchmark Datasets
+
+
+ InferenceX's agentic benchmark doesn't replay synthetic prompts — it replays
+ real Claude Code coding sessions captured as conversation traces .
+ Each trace is a full multi-turn session: the main agent's turns plus any
+ subagents it spawned, with per-turn input/output token counts and the 64-token
+ KV-cache block hashes needed to reconstruct prefix-cache reuse. The traces are
+ published openly on HuggingFace under semianalysisai/cc-traces-weka-*{' '}
+ (apache-2.0).
+
+
+
+ How traces are captured
+
+
+ Production Claude Code sessions are recorded through a logging proxy that captures
+ every API request: its input and output token counts, the model used, timing (TTFT,
+ inter-token latency), and a list of hash_ids — one per 64-token KV block
+ of the request's input. Subagent invocations are grouped under their parent turn.
+ No prompt or completion text is stored; only token counts and block hashes, so the
+ corpus is shareable while remaining a faithful workload for replay.
+
+
+
+ Cached prefix vs uncached suffix
+
+
+ Agentic workloads are dominated by prefix reuse: each turn resends the growing
+ conversation, so most of its input is already in the KV cache from prior turns. We
+ reconstruct this exactly. Walking a conversation in order under an idealized infinite
+ cache, a turn's cached prefix is its longest run of leading{' '}
+ hash_ids already seen; the rest is the uncached suffix {' '}
+ that must be (re)computed. Blocks are 64 tokens; the split is clamped so cached +
+ uncached equals the turn's effective input even on a partial final block.
+ Subagents run against a snapshot of the parent cache at spawn (their context is
+ separate and is not folded back into the parent).
+
+
+ Dataset variants
+
+
+ full — every captured request, unmodified.
+
+
+ 256k — requests whose input + output exceeds 256,000 tokens are
+ dropped so every turn fits a 256k context window (used when benchmarking engines
+ configured for a 256k max context).
+
+
+
+
+
+
+
+
+ );
+}
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 6e7afb0b..8bd10c71 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -12,6 +12,8 @@ import {
useState,
} from 'react';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
+
// useLayoutEffect warns during SSR; alias to useEffect on the server (no-op there anyway).
const useIsomorphicLayoutEffect = typeof window === 'undefined' ? useEffect : useLayoutEffect;
@@ -19,11 +21,6 @@ function isEnumValue>(e: T, v: string): v is T[
return (Object.values(e) as string[]).includes(v);
}
-const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u;
-const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u;
-
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
-
import { useAvailability } from '@/hooks/api/use-availability';
import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
import { useUrlState } from '@/hooks/useUrlState';
@@ -38,8 +35,22 @@ import {
} from '@/lib/data-mappings';
import { computeAutoSwitchDecision } from '@/lib/unofficial-run-auto-switch';
import { countCurvesByPrecision, resolveEffectivePrecisions } from '@/lib/default-precisions';
+import { resolveEffectiveSequence } from '@/lib/default-sequence';
+import { useFeatureGate } from '@/lib/use-feature-gate';
import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api';
+const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u;
+const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u;
+
+// Placeholder for the public (non-null) `effectiveSequence` during the window
+// before availability has loaded. It must be a fixed-seq scenario — never
+// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a
+// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is
+// the pre-agentic default for non-agentic models. Consumers that must not act on
+// an unresolved sequence gate on `sequenceResolved` instead.
+// (Declared after the import block so it never references `Sequence` above its import.)
+const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK;
+
interface RunInfo {
runId: string;
runDate: string;
@@ -66,6 +77,15 @@ export interface GlobalFilterContextType {
// Effective (validated) values
effectiveSequence: Sequence;
+ /**
+ * Whether `effectiveSequence` reflects the selected model's real availability
+ * (DB or unofficial run) rather than the pre-load placeholder. False during
+ * the brief window before availability loads. Consumers that trigger data
+ * fetches or render sequence-dependent labels should gate on this so a
+ * fixed-seq-only model never fires an agentic fetch or flashes "Agentic
+ * Traces" before availability settles.
+ */
+ sequenceResolved: boolean;
effectivePrecisions: string[];
// Run date & run ID
@@ -100,7 +120,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record {
const runs: Record = {};
for (const run of data.runs) {
const runId = String(run.github_run_id);
- const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id);
+ const runChangelogs = data.changelogs.filter(
+ (c) => String(c.workflow_run_id) === String(run.github_run_id),
+ );
runs[runId] = {
runId,
runDate: run.created_at,
@@ -140,6 +162,14 @@ export function GlobalFilterProvider({
}) {
const { hasUrlParam, getUrlParam, setUrlParams } = useUrlState();
+ // Agentic surfaces are hidden behind the shared konami-code feature gate
+ // (default OFF until agentic launches). When locked, agentic sequences are
+ // filtered out of `availableSequences` below — the single chokepoint that
+ // cascades: no agentic default (resolveEffectiveSequence falls to 8k/1k), no
+ // "Agentic Traces" scenario-selector entry, and no agentic x-axis mode /
+ // percentile selector (those key off effectiveSequence === AgenticTraces).
+ const agenticGateUnlocked = useFeatureGate();
+
// ── Core filter state ─────────────────────────────────────────────────────
const [selectedModel, setSelectedModel] = useState(
() => initialModel ?? Model.DeepSeek_V4_Pro,
@@ -147,7 +177,11 @@ export function GlobalFilterProvider({
const [selectedSequence, setSelectedSequence] = useState(() => {
if (initialSequence) return initialSequence;
- return Sequence.EightK_OneK;
+ const urlSeq = getUrlParam('i_seq');
+ if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence;
+ // Prefer Agentic Traces by default when the selected model has it; the
+ // effectiveSequence fallback below handles models without agentic data.
+ return Sequence.AgenticTraces;
});
const initialValidPrecisions = useMemo(
@@ -269,26 +303,61 @@ export function GlobalFilterProvider({
}
}, [unofficialAvailable, selectedModel]);
- // Sequences available for the selected model (DB ∪ unofficial run for this model)
+ // Sequences available for the selected model (DB ∪ unofficial run for this model).
+ //
+ // When the agentic feature gate is locked (default), agentic sequences are
+ // dropped from every branch — including the static SEQUENCE_OPTIONS fallback —
+ // so no agentic scenario is ever selectable or defaulted. This is the single
+ // gate chokepoint for the main inference chart's agentic surfaces.
const availableSequences = useMemo(() => {
+ const dropAgentic = (seqs: Sequence[]) =>
+ agenticGateUnlocked ? seqs : seqs.filter((s) => s !== Sequence.AgenticTraces);
const unofficialSeqs = unofficialAvailable
.filter((a) => a.model === selectedModel)
.map((a) => a.sequence as Sequence);
if (!availabilityRows) {
- return unofficialSeqs.length > 0 ? [...new Set(unofficialSeqs)] : SEQUENCE_OPTIONS;
+ return unofficialSeqs.length > 0
+ ? dropAgentic([...new Set(unofficialSeqs)])
+ : dropAgentic(SEQUENCE_OPTIONS);
}
- const dbSeqs = modelRows
- .map((r) => islOslToSequence(r.isl, r.osl))
- .filter((s): s is Sequence => s !== null);
- const merged = [...new Set([...dbSeqs, ...unofficialSeqs])];
- return merged.length > 0 ? merged : SEQUENCE_OPTIONS;
- }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]);
-
- // Synchronously validated sequence
- const effectiveSequence = useMemo(() => {
- if (availableSequences.includes(selectedSequence)) return selectedSequence;
- return availableSequences[0] ?? selectedSequence;
- }, [availableSequences, selectedSequence]);
+ const dbSeqs = modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null);
+ const merged = dropAgentic([...new Set([...dbSeqs, ...unofficialSeqs])]);
+ return merged.length > 0 ? merged : dropAgentic(SEQUENCE_OPTIONS);
+ }, [availabilityRows, modelRows, unofficialAvailable, selectedModel, agenticGateUnlocked]);
+
+ // Whether we actually know the selected model's sequences yet. Availability
+ // may arrive from the DB (`availabilityRows`) OR from a loaded unofficial run
+ // (`unofficialAvailable` for this model) — either source lets us resolve a
+ // trustworthy effectiveSequence. Until then `availableSequences` is the static
+ // SEQUENCE_OPTIONS fallback (which contains AgenticTraces), so resolving
+ // eagerly would fetch + label an agentic scenario for fixed-seq-only models,
+ // then snap once availability lands (flash + wasted request).
+ const availabilityLoaded = useMemo(
+ () =>
+ availabilityRows !== undefined || unofficialAvailable.some((a) => a.model === selectedModel),
+ [availabilityRows, unofficialAvailable, selectedModel],
+ );
+
+ // Synchronously validated sequence.
+ //
+ // `resolveEffectiveSequence` returns null while availability is still loading
+ // — we surface that as `sequenceResolved` so InferenceContext can gate the
+ // benchmark fetch until the real sequence is known (no agentic fetch fires for
+ // a fixed-seq-only model). For the non-null public `effectiveSequence` value
+ // we substitute a fixed-seq scenario (never AgenticTraces) during that window
+ // so the scenario selector never flashes "Agentic Traces"; the chart shows its
+ // normal loading skeleton until `sequenceResolved` flips true.
+ const resolvedSequence = useMemo(
+ () =>
+ resolveEffectiveSequence({
+ selectedSequence,
+ availableSequences,
+ availabilityLoaded,
+ }),
+ [selectedSequence, availableSequences, availabilityLoaded],
+ );
+ const sequenceResolved = resolvedSequence !== null;
+ const effectiveSequence = resolvedSequence ?? PRE_AVAILABILITY_SEQUENCE;
// Precisions available for the selected model + sequence (DB ∪ unofficial run)
const availablePrecisions = useMemo(() => {
@@ -298,7 +367,7 @@ export function GlobalFilterProvider({
if (!availabilityRows) {
return unofficialPrecs.length > 0 ? [...new Set(unofficialPrecs)].toSorted() : ['fp4'];
}
- const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+ const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
const dbPrecs = rows.map((r) => r.precision);
const merged = [...new Set([...dbPrecs, ...unofficialPrecs])].toSorted();
return merged.length > 0 ? merged : ['fp4'];
@@ -307,10 +376,7 @@ export function GlobalFilterProvider({
// Curve count per precision (distinct hw/framework/spec/disagg series) for the
// selected model + sequence — drives the auto default toward the densest one.
const precisionCurveCounts = useMemo(
- () =>
- countCurvesByPrecision(
- modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence),
- ),
+ () => countCurvesByPrecision(modelRows.filter((r) => rowToSequence(r) === effectiveSequence)),
[modelRows, effectiveSequence],
);
@@ -346,7 +412,7 @@ export function GlobalFilterProvider({
// Dates available for selected model + sequence + precisions
const availableDates = useMemo(() => {
if (!availabilityRows) return [];
- const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+ const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision));
if (rows.length === 0) {
return [...new Set(seqRows.map((r) => r.date))].toSorted();
@@ -438,7 +504,11 @@ export function GlobalFilterProvider({
g_model: selectedModel,
g_rundate: selectedRunDate,
g_runid: selectedRunId,
- i_seq: effectiveSequence,
+ // Don't pin the sequence to the URL until it's resolved from real
+ // availability — writing the pre-load placeholder (8k/1k) would clobber a
+ // shared `?i_seq=agentic-traces` link before the model's availability
+ // confirms it has agentic data.
+ i_seq: sequenceResolved ? effectiveSequence : undefined,
// Only pin the precision in the URL once chosen explicitly; in auto mode
// leave it out so the link keeps following the per-model densest default.
i_prec: precisionExplicit ? effectivePrecisions.join(',') : undefined,
@@ -448,6 +518,7 @@ export function GlobalFilterProvider({
selectedRunDate,
selectedRunId,
effectiveSequence,
+ sequenceResolved,
effectivePrecisions,
precisionExplicit,
setUrlParams,
@@ -462,6 +533,7 @@ export function GlobalFilterProvider({
selectedPrecisions,
setSelectedPrecisions,
effectiveSequence,
+ sequenceResolved,
effectivePrecisions,
selectedRunDate: effectiveRunDate,
setSelectedRunDate: setSelectedRunDateManual,
@@ -484,6 +556,7 @@ export function GlobalFilterProvider({
selectedSequence,
selectedPrecisions,
effectiveSequence,
+ sequenceResolved,
effectivePrecisions,
effectiveRunDate,
setSelectedRunDateManual,
diff --git a/packages/app/src/components/agentic-gate.tsx b/packages/app/src/components/agentic-gate.tsx
new file mode 100644
index 00000000..9fa0aa37
--- /dev/null
+++ b/packages/app/src/components/agentic-gate.tsx
@@ -0,0 +1,41 @@
+'use client';
+
+import { notFound } from 'next/navigation';
+import { useEffect, useState } from 'react';
+
+import { FEATURE_GATE_KEY, useFeatureGate } from '@/lib/use-feature-gate';
+
+/**
+ * Client gate for the standalone agentic product pages (`/datasets/*`,
+ * `/inference/agentic/[id]`). These are server-rendered routes with no nav
+ * entry once the header link is hidden, so a direct URL visit is the only way
+ * in. When the shared konami-code feature gate (see {@link useFeatureGate}) is
+ * locked — the default until agentic launches — we `notFound()` so the route
+ * behaves like a clean 404 instead of publicly exposing agentic surfaces.
+ *
+ * The gate lives in localStorage, which the server can't read, so we resolve it
+ * on the client: read the flag synchronously on mount, and until then render
+ * nothing (no content flash before a potential 404). QA can unlock at runtime
+ * with ↑↑↓↓ (the same mechanism as the Hidden tab dropdown) or by seeding
+ * `localStorage['inferencex-feature-gate'] = '1'`, after which these pages
+ * render in full.
+ *
+ * @param children - Content rendered only once the gate has resolved as unlocked.
+ * @returns `null` before the mount effect has run, otherwise the children.
+ */
+export function AgenticGate({ children }: { children: React.ReactNode }) {
+ const unlocked = useFeatureGate();
+ // Distinguish "haven't read localStorage yet" from "read it, gate is locked":
+ // useFeatureGate() returns false on the server and on the very first client
+ // render before its mount effect runs, so we must not 404 during that window.
+ const [resolved, setResolved] = useState(false);
+ useEffect(() => setResolved(true), []);
+
+ if (!resolved) return null;
+ if (!unlocked) {
+ // Belt-and-suspenders: re-read the flag directly in case an unlock event
+ // hasn't propagated yet on this first resolved render.
+ if (typeof window !== 'undefined' && localStorage.getItem(FEATURE_GATE_KEY) === '1') {
+ return <>{children}>;
+ }
+ // notFound() throws to hand control to the not-found boundary, so execution
+ // does not fall through to the return below.
+ notFound();
+ }
+ return <>{children}>;
+}
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
new file mode 100644
index 00000000..415a430d
--- /dev/null
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -0,0 +1,109 @@
+'use client';
+
+import Link from 'next/link';
+import { useSearchParams } from 'next/navigation';
+
+import { Card } from '@/components/ui/card';
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import { useDatasetConversation } from '@/hooks/api/use-datasets';
+import { compact, formatShare } from './format';
+import { Stat } from './stat';
+
+/**
+ * Client view for one conversation of a dataset.
+ *
+ * Loads the conversation with `useDatasetConversation(slug, convId)` and
+ * renders a loading state, a not-found state (on error or missing data), or
+ * the conversation header and flamegraph section.
+ *
+ * @param slug - Dataset slug the conversation belongs to.
+ * @param convId - Conversation id to load.
+ */
+export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
+ const { data, isLoading, isError } = useDatasetConversation(slug, convId);
+
+ // Deep-link target from a request-timeline click, read from the query string:
+ // `turn`, `raw` and `inner` are accepted only when they are plain decimal
+ // digits (anything else becomes null); `sa` is passed through unchanged.
+ // useSearchParams (not a one-shot window.location read) so the params are
+ // present on the very first client-side navigation, not just after a reload.
+ const params = useSearchParams();
+ const turnRaw = params.get('turn');
+ const sourceRaw = params.get('raw');
+ const sourceInner = params.get('inner');
+ const highlight = {
+ turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null,
+ raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null,
+ inner: sourceInner !== null && /^\d+$/u.test(sourceInner) ? Number(sourceInner) : null,
+ agent: params.get('sa'),
+ };
+
+ if (isLoading) {
+ return (
+ Loading conversation…
+ );
+ }
+ if (isError || !data) {
+ return (
+
+ Conversation not found.{' '}
+
+ Back to dataset
+
+
+ );
+ }
+
+ const cachedPct = formatShare(data.total_cached, data.total_in);
+
+ return (
+
+
+
+
+ Datasets
+
+ /
+
+ {slug}
+
+ /
+ conversation
+
+
+ {data.conv_id}
+
+ {data.models.length > 0 && (
+
+ {data.models.map((m) => (
+
+ {m}
+
+ ))}
+
+ )}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Flamegraph
+
+ One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
+ click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
+ plus generated output. Timestamps are elapsed from conversation start; subagent headers
+ show their full active range. A colored bracket on the left groups requests in the same
+ main-agent or subagent scope whose original execution intervals overlapped (ran in
+ parallel).
+
+
+
+
+ );
+}
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
new file mode 100644
index 00000000..609a4c8f
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -0,0 +1,320 @@
+'use client';
+
+import { useState } from 'react';
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from '@/components/ui/select';
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import {
+ useDataset,
+ useDatasetConversations,
+ type ConversationSort,
+} from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+import { compact, formatPct, formatShare, perConversation } from './format';
+import { Stat } from './stat';
+
+/** Conversations per page: used as the query `limit` and to derive `offset` and the page count. */
+const PAGE = 50;
+
+/** Sort choices offered for the conversation list (query value + display label). */
+const SORTS: { value: ConversationSort; label: string }[] = [
+ { value: 'tokens', label: 'Total input ↓' },
+ { value: 'turns', label: 'Turns ↓' },
+ { value: 'subagents', label: 'Subagent groups ↓' },
+ { value: 'id', label: 'Conversation ID' },
+];
+
+/**
+ * Client detail view for one dataset.
+ *
+ * Loads the dataset with `useDataset(slug)` and one page of conversations with
+ * `useDatasetConversations` (search, sort, `limit: PAGE`, `offset: page * PAGE`).
+ * Changing the search text or the sort mode resets the list to page 0, and the
+ * pager is shown only when there is more than one page.
+ *
+ * NOTE(review): the search text is handed to the conversations query on every
+ * change event; no debounce is visible here — confirm the hook throttles.
+ *
+ * @param slug - Dataset slug to display.
+ */
+export function DatasetDetail({ slug }: { slug: string }) {
+ const { data: dataset, isLoading, isError } = useDataset(slug);
+ const [search, setSearch] = useState('');
+ const [sort, setSort] = useState('tokens');
+ const [page, setPage] = useState(0);
+
+ const { data: convs, isFetching } = useDatasetConversations({
+ slug,
+ search,
+ sort,
+ limit: PAGE,
+ offset: page * PAGE,
+ });
+
+ if (isLoading) {
+ return Loading dataset…
;
+ }
+ if (isError || !dataset) {
+ return (
+
+ Dataset not found.{' '}
+
+ Back to datasets
+
+
+ );
+ }
+
+ const s = dataset.summary ?? {};
+ const cd = dataset.chart_data ?? {};
+ const total = convs?.total ?? 0;
+ const pageCount = Math.ceil(total / PAGE);
+
+ return (
+
+ {/* header */}
+
+
+
+ ← Datasets
+
+
+
+ {dataset.description && (
+
{dataset.description}
+ )}
+
+
+ {/* summary stats */}
+
+
+
+
+
+
+
+
+
+
+
+ {s.modelMix && Object.keys(s.modelMix).length > 0 && (
+
+
+ Model mix (turns)
+
+
+ {Object.entries(s.modelMix)
+ .toSorted((a, b) => b[1] - a[1])
+ .map(([model, count]) => (
+
+ {model} {compact(count)}
+
+ ))}
+
+
+ )}
+
+
+ {/* distribution cards */}
+
+ Distributions
+
+
+
+
+
+
+
+
+
+
+
+ {/* conversation list */}
+
+
+
+ Conversations{' '}
+ ({total})
+
+
+ {
+ setSearch(e.target.value);
+ setPage(0);
+ }}
+ placeholder="Search by ID…"
+ className="h-8 w-40 rounded-md border border-border/40 bg-background px-2 text-xs outline-none focus:border-primary"
+ />
+ {
+ setSort(v as ConversationSort);
+ setPage(0);
+ track('datasets_conversations_sorted', { mode: v });
+ }}
+ >
+
+
+
+
+ {SORTS.map((o) => (
+
+ {o.label}
+
+ ))}
+
+
+
+
+
+
+
+
+
+ Conversation
+ Turns
+ Subagents
+ Input
+ Output
+ Cached
+
+
+
+ {(convs?.items ?? []).map((c) => {
+ const cachedPct = formatShare(c.total_cached, c.total_in);
+ return (
+
+
+ track('datasets_conversation_clicked', { slug })}
+ className="font-mono text-xs text-primary hover:underline"
+ >
+ {c.conv_id.slice(0, 20)}…
+
+ {c.models.length > 0 && (
+
+ {c.models.length} model{c.models.length === 1 ? '' : 's'}
+
+ )}
+
+ {c.num_turns}
+ {c.num_subagent_groups}
+ {compact(c.total_in)}
+ {compact(c.total_out)}
+
+ {cachedPct}
+
+
+ );
+ })}
+ {!isFetching && (convs?.items.length ?? 0) === 0 && (
+
+
+ No conversations match.
+
+
+ )}
+
+
+
+
+ {pageCount > 1 && (
+
+ {
+ const next = Math.max(0, page - 1);
+ track('datasets_conversations_page_changed', { direction: 'prev', page: next });
+ setPage(next);
+ }}
+ className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+ >
+ ← Prev
+
+
+ Page {page + 1} of {pageCount}
+
+ = pageCount - 1}
+ onClick={() => {
+ const next = Math.min(pageCount - 1, page + 1);
+ track('datasets_conversations_page_changed', { direction: 'next', page: next });
+ setPage(next);
+ }}
+ className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+ >
+ Next →
+
+
+ )}
+
+
+ );
+}
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
new file mode 100644
index 00000000..d85d7eaa
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -0,0 +1,86 @@
+'use client';
+
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+import { compact, formatPct, perConversation } from './format';
+
+/**
+ * Card for one dataset record. Shows the record's label, variant and (when
+ * present) description, and tracks a `datasets_card_clicked` analytics event
+ * carrying the dataset slug.
+ */
+function DatasetCard({ d }: { d: DatasetRecord }) {
+ // A record without a summary falls back to an empty object so the field reads below are safe.
+ const s = d.summary ?? {};
+ // formatPct yields an em dash when cachedPct is not a number.
+ const cachedPct = formatPct(s.cachedPct);
+ return (
+ track('datasets_card_clicked', { slug: d.slug })}
+ className="block transition-colors hover:[&_*]:border-primary/40"
+ >
+
+
+
{d.label}
+
+ {d.variant}
+
+
+ {d.description && (
+ {d.description}
+ )}
+
+
+
+
+
+
+
+
+
+
+ View dataset →
+
+
+ );
+}
+
+/**
+ * Label/value pair; both props are rendered as given, so `value` must already
+ * be a formatted string.
+ * NOTE(review): ./stat.tsx exports a `Stat` with the same signature — confirm
+ * whether this local copy is still needed.
+ */
+function Stat({ label, value }: { label: string; value: string }) {
+ return (
+
+
{label}
+ {value}
+
+ );
+}
+
+/**
+ * Lists the datasets returned by `useDatasets`, with dedicated states for
+ * loading, failure (error flag set or no data), and an empty result.
+ */
+export function DatasetList() {
+ const { data, isLoading, isError } = useDatasets();
+
+ if (isLoading) {
+ return Loading datasets…
;
+ }
+ // Missing data is treated the same as an explicit error.
+ if (isError || !data) {
+ return (
+ Failed to load datasets.
+ );
+ }
+ if (data.length === 0) {
+ return (
+
+ No datasets ingested yet.
+
+ );
+ }
+
+ return (
+
+ {data.map((d) => (
+
+ ))}
+
+ );
+}
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
new file mode 100644
index 00000000..8adc02ee
--- /dev/null
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -0,0 +1,220 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { Card } from '@/components/ui/card';
+import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover';
+import type { Distribution } from '@/hooks/api/use-datasets';
+import { compact } from './format';
+
+interface DistributionCardProps {
+ title: string;
+ subtitle?: string;
+ unit: string;
+ distribution?: Distribution;
+ scale?: 'log' | 'linear';
+ /** Format the x value (defaults to compact). e.g. percent for cached fraction. */
+ formatValue?: (v: number) => string;
+}
+
+const W = 720;
+const H = 240;
+const PAD = { top: 12, right: 16, bottom: 48, left: 52 };
+
+/**
+ * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a
+ * themeable bar chart with p50/p75/p90/p95 guide lines and a hover tooltip. Bars are
+ * drawn at equal visual width; for log-scaled bins the edge labels are already
+ * log-spaced so the shape reads as a log histogram.
+ */
+export function DistributionCard({
+ title,
+ subtitle,
+ unit,
+ distribution,
+ scale = 'linear',
+ formatValue = compact,
+}: DistributionCardProps) {
+ // Layout derived from the bins; null when there are no bins to draw.
+ const computed = useMemo(() => {
+ const bins = distribution?.bins ?? [];
+ if (bins.length === 0) return null;
+ // Floor of 1 keeps the bar-height ratio (count / maxCount) from dividing by zero when every bin is empty.
+ const maxCount = Math.max(1, ...bins.map((b) => b.count));
+ const innerW = W - PAD.left - PAD.right;
+ const innerH = H - PAD.top - PAD.bottom;
+ const n = bins.length;
+ // Every bin gets the same pixel width regardless of its value range.
+ const barW = innerW / n;
+ // Map a data value to an x pixel by locating its bin (positional — works for
+ // both linear and log bins since the edges are precomputed at ingest).
+ // Out-of-range values clamp to the first/last bin.
+ const valueToX = (v: number): number => {
+ for (let i = 0; i < n; i++) {
+ // The last bin is treated as closed on the right so values at or past its x1 still land in it.
+ if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) {
+ return PAD.left + (i + 0.5) * barW;
+ }
+ }
+ if (v <= bins[0].x0) return PAD.left + 0.5 * barW;
+ return PAD.left + (n - 0.5) * barW;
+ };
+ return { bins, maxCount, innerW, innerH, n, barW, valueToX };
+ }, [distribution]);
+
+ if (!computed) {
+ return (
+
+ {title}
+
+ No data
+
+
+ );
+ }
+
+ const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed;
+ const stats = distribution?.stats;
+
+ // Percentile guide lines; p75 and p95 are included only when present as numbers.
+ const guides: { label: string; value: number; color: string }[] = stats
+ ? [
+ { label: 'p50', value: stats.median, color: '#3b82f6' },
+ ...(typeof stats.p75 === 'number'
+ ? [{ label: 'p75', value: stats.p75, color: '#22c55e' }]
+ : []),
+ { label: 'p90', value: stats.p90, color: '#f59e0b' },
+ ...(typeof stats.p95 === 'number'
+ ? [{ label: 'p95', value: stats.p95, color: '#ef4444' }]
+ : []),
+ ]
+ : [];
+
+ // X tick labels from a few bin edges.
+ // NOTE(review): with fewer than 4 bins these indices repeat (n = 1 gives [0, 0, 0, 0]) — confirm
+ // the duplicated ticks render acceptably.
+ const tickIdxs = [0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1];
+
+ // Turns a horizontal fraction into the hover items for the bin at that position, clamped to a
+ // valid bin index. Presumably the fraction is 0–1 across the plot — confirm against ChartHover.
+ const resolve = (fraction: number) => {
+ const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n)));
+ const b = bins[i];
+ const items: HoverItem[] = [
+ {
+ color: 'currentColor',
+ label: 'Range',
+ value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`,
+ },
+ { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() },
+ ];
+ return { items };
+ };
+
+ return (
+
+
+ {title}
+ {scale === 'log' && (
+
+ log scale
+
+ )}
+
+ {subtitle && {subtitle}
}
+ {stats && (
+
+ n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)}
+ {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}>} · p90{' '}
+ {formatValue(stats.p90)}
+ {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}>} · max{' '}
+ {formatValue(stats.max)} {unit}
+
+ )}
+
+
+ {/* bars */}
+ {bins.map((b, i) => {
+ const h = (b.count / maxCount) * innerH;
+ const x = PAD.left + i * barW;
+ const y = PAD.top + (innerH - h);
+ return (
+
+ );
+ })}
+
+ {/* guide lines */}
+ {guides.map((g) => {
+ const x = valueToX(g.value);
+ return (
+
+ );
+ })}
+
+ {/* x axis */}
+
+ {tickIdxs.map((i, k) => {
+ const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle';
+ const x = PAD.left + (i + 0.5) * barW;
+ return (
+
+ {formatValue(bins[i].x0)}
+
+ );
+ })}
+
+ {unit}
+
+
+ {/* guide legend */}
+ {guides.map((g, i) => (
+
+
+
+ {g.label} {formatValue(g.value)}
+
+
+ ))}
+
+
+
+ );
+}
diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts
new file mode 100644
index 00000000..fd526d12
--- /dev/null
+++ b/packages/app/src/components/datasets/format.ts
@@ -0,0 +1,28 @@
+/**
+ * Compact number formatter for dataset token/count displays:
+ * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82".
+ *
+ * Magnitudes of 1000 and above are scaled to one decimal with a k/M/B suffix,
+ * nonzero magnitudes below 1 keep two decimals, and everything else is rounded
+ * to an integer. A value whose one-decimal rounding would read "1000.0" in its
+ * own unit is promoted to the next unit (999_990 → "1.0M", not "1000.0k").
+ *
+ * @param n value to format; may be negative.
+ * @returns the compact display string.
+ */
+export function compact(n: number): string {
+  const abs = Math.abs(n);
+  const units = [
+    { divisor: 1e9, suffix: 'B' },
+    { divisor: 1e6, suffix: 'M' },
+    { divisor: 1e3, suffix: 'k' },
+  ];
+  // The unit one step larger than the one being tested (none above "B").
+  let larger: { divisor: number; suffix: string } | undefined;
+  for (const unit of units) {
+    if (abs >= unit.divisor) {
+      const scaled = (n / unit.divisor).toFixed(1);
+      // toFixed can round e.g. 999.99 up to "1000.0"; re-express that in the next unit.
+      if (larger !== undefined && Math.abs(Number(scaled)) >= 1000) {
+        return `${(n / larger.divisor).toFixed(1)}${larger.suffix}`;
+      }
+      return `${scaled}${unit.suffix}`;
+    }
+    larger = unit;
+  }
+  if (abs > 0 && abs < 1) return n.toFixed(2);
+  return String(Math.round(n));
+}
+
+/**
+ * Render a per-conversation mean with at most one fractional digit, so a
+ * meaningful fraction is not rounded away. Missing or non-finite input yields
+ * an em dash.
+ */
+export function perConversation(n: number | undefined): string {
+  return typeof n === 'number' && Number.isFinite(n)
+    ? n.toLocaleString(undefined, { maximumFractionDigits: 1 })
+    : '—';
+}
+
+/**
+ * Format a 0–1 fraction as a whole percent ("42%").
+ * Returns an em dash when the value is absent or not a finite number (the same
+ * guard perConversation uses), so NaN never renders as "NaN%".
+ */
+export function formatPct(fraction: number | undefined): string {
+  if (typeof fraction !== 'number' || !Number.isFinite(fraction)) return '—';
+  return `${(fraction * 100).toFixed(0)}%`;
+}
+
+/**
+ * Whole-percent share that `part` makes up of `total` ("42%").
+ * Yields an em dash unless `total` is greater than 0.
+ */
+export function formatShare(part: number, total: number): string {
+  if (!(total > 0)) return '—';
+  const percent = (part / total) * 100;
+  return `${percent.toFixed(0)}%`;
+}
diff --git a/packages/app/src/components/datasets/stat.tsx b/packages/app/src/components/datasets/stat.tsx
new file mode 100644
index 00000000..3fb6a32a
--- /dev/null
+++ b/packages/app/src/components/datasets/stat.tsx
@@ -0,0 +1,9 @@
+/**
+ * Label/value pair for the summary grids on dataset and conversation pages.
+ * Both props are rendered as given, so `value` must already be a formatted string.
+ */
+export function Stat({ label, value }: { label: string; value: string }) {
+ return (
+
+
{label}
+ {value}
+
+ );
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph-model.ts b/packages/app/src/components/datasets/trace-flamegraph-model.ts
new file mode 100644
index 00000000..2aff9ac3
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph-model.ts
@@ -0,0 +1,422 @@
+/**
+ * Pure logic for the trace flamegraph: overlap detection, deep-link resolution,
+ * visible-row construction, and bracket-lane layout. No React/DOM — everything
+ * here is unit-testable directly. Rendering lives in trace-flamegraph.tsx.
+ */
+
+import type { StructureNode } from '@/hooks/api/use-datasets';
+
+// Kept distinct from token-segment colors. A row can carry multiple rails when
+// it overlaps different requests during different parts of its lifetime.
+export const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] as const;
+
+// Cap on simultaneously-drawn bracket lanes. A pathological conversation (e.g. a
+// long-running session whose subagent fans out into hundreds of children with
+// 15+ concurrent requests) can require dozens of lanes; left unbounded the
+// gutter grows wide enough to push the bars off-screen AND emits one DOM node
+// per lane per row (tens of thousands of empty divs). We bound it: lanes beyond
+// the cap fold into the last "dense" lane, which stays readable for the common
+// case (≤6 concurrent) and degrades gracefully for the outliers.
+export const MAX_LANES = 6;
+
+/** One request's identity and in-flight window, as consumed by findRequestOverlapGroups. */
+export interface TimedRequest {
+ /** Caller-chosen identifier; echoed back in RequestOverlapGroup.requestKeys. */
+ key: string;
+ /** Window start. A request without a finite start and end is ignored by findRequestOverlapGroups. */
+ startS?: number;
+ /** Window end; must be strictly greater than startS for the request to be considered. */
+ endS?: number;
+}
+
+/** A set of requests that findRequestOverlapGroups found in flight at the same time. */
+export interface RequestOverlapGroup {
+ /** `${scopeKey}-${n}`, numbered from 1 in the order the groups are returned. */
+ id: string;
+ /** Keys of the member requests, sorted. */
+ requestKeys: string[];
+ /** Earliest boundary at which exactly this set of requests was in flight. */
+ startS: number;
+ /** Latest boundary at which exactly this set of requests was in flight. */
+ endS: number;
+}
+
+/**
+ * Find maximal sets of requests that were simultaneously in flight.
+ * Intervals are half-open, so one request ending exactly when another begins
+ * is serialized rather than parallel. Maximal-set filtering prevents a nested
+ * A/B pair from duplicating an A/B/C marker, while preserving A/B and B/C as
+ * separate groups when their overlaps happen at different times.
+ *
+ * @param requests candidate requests; any without a finite start and end, or
+ *   whose end is not after its start, are ignored.
+ * @param scopeKey prefix for the generated group ids (`${scopeKey}-1`, …).
+ * @returns groups ordered by start, then end, then member keys.
+ */
+export function findRequestOverlapGroups(
+  requests: TimedRequest[],
+  scopeKey = 'scope',
+): RequestOverlapGroup[] {
+  const valid = requests.filter(
+    (request): request is TimedRequest & { startS: number; endS: number } =>
+      typeof request.startS === 'number' &&
+      typeof request.endS === 'number' &&
+      Number.isFinite(request.startS) &&
+      Number.isFinite(request.endS) &&
+      request.endS > request.startS,
+  );
+  // Every distinct start/end instant; the set of active requests can only
+  // change at one of these, so each adjacent pair is a slice with a fixed set.
+  const boundaries = [
+    ...new Set(valid.flatMap((request) => [request.startS, request.endS])),
+  ].toSorted((a, b) => a - b);
+  // Keyed by the joined member keys so slices with the same active set merge.
+  const candidates = new Map<string, Omit<RequestOverlapGroup, 'id'>>();
+
+  for (let i = 0; i < boundaries.length - 1; i++) {
+    const startS = boundaries[i]!;
+    const endS = boundaries[i + 1]!;
+    if (endS <= startS) continue;
+    const requestKeys = valid
+      .filter((request) => request.startS <= startS && request.endS >= endS)
+      .map((request) => request.key)
+      .toSorted();
+    if (requestKeys.length < 2) continue;
+    const key = requestKeys.join('\u0000');
+    const existing = candidates.get(key);
+    candidates.set(key, {
+      requestKeys,
+      startS: existing ? Math.min(existing.startS, startS) : startS,
+      endS: existing ? Math.max(existing.endS, endS) : endS,
+    });
+  }
+
+  // Drop any candidate that is a strict subset of a larger candidate.
+  const maximal = [...candidates.values()].filter(
+    (candidate, _, all) =>
+      !all.some(
+        (other) =>
+          other.requestKeys.length > candidate.requestKeys.length &&
+          candidate.requestKeys.every((key) => other.requestKeys.includes(key)),
+      ),
+  );
+
+  return maximal
+    .toSorted(
+      (a, b) =>
+        a.startS - b.startS ||
+        a.endS - b.endS ||
+        a.requestKeys.join(',').localeCompare(b.requestKeys.join(',')),
+    )
+    .map((group, index) => ({ ...group, id: `${scopeKey}-${index + 1}` }));
+}
+
+/** One overlap group as attached to a flamegraph row by buildRowOverlaps. */
+export interface RowOverlap {
+ /** Id of the underlying RequestOverlapGroup. */
+ id: string;
+ /** Display label `P<n>`, numbered from 1 across all groups (main-agent groups first). */
+ label: string;
+ /** Taken from OVERLAP_COLORS, cycling when there are more groups than colors. */
+ color: string;
+ /** Start of the group's overlap window. */
+ startS: number;
+ /** End of the group's overlap window. */
+ endS: number;
+ /** Number of other requests in the group (member count minus one). */
+ peerCount: number;
+}
+
+/** One rendered flamegraph row, as produced by buildVisibleRows. */
+export interface VisibleRow {
+ /** `t-{i}` for a main turn, `g-{i}` for a subagent header, `g-{i}-c-{ci}` for a subagent child. */
+ key: string;
+ /** "Turn N", the subagent's label, or "↳ subturn N". */
+ label: string;
+ /** Model name for turns and children; turn count (plus duration when known) for headers. */
+ sublabel?: string;
+ /** Elapsed-interval text from timeLabel(); undefined when the row has no start time. */
+ timeLabel?: string;
+ /** Copied from the node's `cached` field. */
+ cached: number;
+ /** Copied from the node's `uncached` field. */
+ uncached: number;
+ /** Copied from the node's `out` field. */
+ output: number;
+ /** The node's `in + out`. */
+ total: number;
+ /** 0 for top-level rows, 1 for subagent children. */
+ indent: number;
+ /** True only for subagent header rows. */
+ isGroup: boolean;
+ /** Whether a header's children are currently shown; always false for non-header rows. */
+ isExpanded: boolean;
+ /** Node index of the subagent; set on header rows only. */
+ groupIndex?: number;
+ /** Overlap groups this row belongs to; always empty for header rows. */
+ overlaps: RowOverlap[];
+}
+
+/**
+ * Render an offset in seconds from the conversation start as `MM:SS`, or
+ * `H:MM:SS` once it reaches an hour. The input is rounded to whole seconds and
+ * negative offsets are clamped to zero.
+ */
+export function formatElapsedTime(seconds: number): string {
+  const pad = (value: number): string => String(value).padStart(2, '0');
+  const wholeSeconds = Math.max(0, Math.round(seconds));
+  const hours = Math.floor(wholeSeconds / 3600);
+  const remainder = wholeSeconds - hours * 3600;
+  const clock = `${pad(Math.floor(remainder / 60))}:${pad(remainder % 60)}`;
+  return hours > 0 ? `${hours}:${clock}` : clock;
+}
+
+/**
+ * Elapsed-interval label for a row: "+MM:SS–MM:SS" when the end is a finite
+ * value after the start, "+MM:SS" when only the start is usable, and undefined
+ * when the start is missing or non-finite.
+ */
+export function timeLabel(startS?: number, endS?: number): string | undefined {
+  const isUsable = (value: number | undefined): value is number =>
+    value !== undefined && Number.isFinite(value);
+  if (!isUsable(startS)) return undefined;
+  const from = `+${formatElapsedTime(startS)}`;
+  return isUsable(endS) && endS > startS ? `${from}–${formatElapsedTime(endS)}` : from;
+}
+
+/** Coordinates carried by a deep link, as read by resolveDeepLinkTarget. */
+export interface DeepLinkHighlight {
+ /** Main-turn ordinal, or the child position within the subagent when `agent` is set. Used only when `raw` is absent. */
+ turn?: number | null;
+ /** Raw source index of the top-level entry; takes precedence over `turn`/`agent` when present. */
+ raw?: number | null;
+ /** Index of the child inside the subagent entry selected by `raw`. */
+ inner?: number | null;
+ /** agentId of the subagent group to search when resolving by `turn`. */
+ agent?: string | null;
+}
+
+/** Result of resolveDeepLinkTarget: which row to focus and what to expand first. */
+export interface DeepLinkTarget {
+ /** Key of the target row (`t-{i}` or `g-{i}-c-{ci}`). */
+ rowKey: string;
+ /** Node index of the subagent group that must be expanded to show the row; null for main turns. */
+ expandGroup: number | null;
+}
+
+/**
+ * Resolve a request-timeline deep link to a flamegraph row key (+ the subagent
+ * group that must be expanded to show it). Raw Weka source coordinates are
+ * exact and take precedence:
+ *   raw=N          -> top-level Weka request
+ *   raw=N&inner=M  -> subagent child inside that top-level marker
+ * Otherwise main turns match by main-turn ordinal and subagent turns match the
+ * group by agentId, then the child at position `turn`.
+ *
+ * `buildConversationStructure` emits exactly one node per raw Weka entry (and
+ * one child per nested entry), so a node's array position IS its raw index.
+ * Structures ingested before rawIndex/innerIndex were stored omit the explicit
+ * fields — fall back to the array position so deep links keep resolving against
+ * those older rows instead of silently doing nothing.
+ *
+ * @param nodes conversation structure nodes, in stored order.
+ * @param highlight coordinates taken from the deep link.
+ * @returns the target row, or null when the coordinates match nothing.
+ */
+export function resolveDeepLinkTarget(
+  nodes: readonly StructureNode[],
+  highlight: DeepLinkHighlight,
+): DeepLinkTarget | null {
+  const { turn, raw, inner, agent } = highlight;
+  if (typeof raw === 'number' && raw >= 0) {
+    if (typeof inner === 'number' && inner >= 0) {
+      const gi = nodes.findIndex(
+        (node, i) => node.kind === 'subagent' && (node.rawIndex ?? i) === raw,
+      );
+      const group = nodes[gi];
+      // A miss (gi === -1) yields undefined here; checking the discriminant
+      // again narrows the node to the subagent variant without a cast.
+      if (group === undefined || group.kind !== 'subagent') return null;
+      const ci = group.children.findIndex((child, i) => (child.innerIndex ?? i) === inner);
+      if (ci === -1) return null;
+      return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi };
+    }
+    // raw without inner only ever matches a main turn, never a subagent marker.
+    const i = nodes.findIndex(
+      (node, idx) => node.kind === 'turn' && (node.rawIndex ?? idx) === raw,
+    );
+    if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null };
+    return null;
+  }
+  if (typeof turn !== 'number' || turn < 0) return null;
+  if (agent) {
+    const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === agent);
+    const group = nodes[gi];
+    if (group === undefined || group.kind !== 'subagent') return null;
+    if (turn >= group.children.length) return null;
+    return { rowKey: `g-${gi}-c-${turn}`, expandGroup: gi };
+  }
+  // No raw index and no agent: count main turns only, skipping subagent nodes.
+  let ordinal = 0;
+  for (let i = 0; i < nodes.length; i++) {
+    if (nodes[i]!.kind === 'turn') {
+      if (ordinal === turn) return { rowKey: `t-${i}`, expandGroup: null };
+      ordinal += 1;
+    }
+  }
+  return null;
+}
+
+/**
+ * Overlap groups per row key. Main-agent turns and each subagent's children are
+ * separate scopes — parallelism is only meaningful within one agent's stream.
+ *
+ * @param nodes conversation structure nodes, in stored order.
+ * @returns a map from row key (`t-{i}` or `g-{i}-c-{ci}`) to the overlap
+ *   groups that row belongs to; rows in no group are absent from the map.
+ */
+export function buildRowOverlaps(nodes: readonly StructureNode[]): Map<string, RowOverlap[]> {
+  const mainGroups = findRequestOverlapGroups(
+    nodes.flatMap((node, i) =>
+      node.kind === 'turn' ? [{ key: `t-${i}`, startS: node.startS, endS: node.endS }] : [],
+    ),
+    'main',
+  );
+  const subagentGroups = nodes.flatMap((node, i) =>
+    node.kind === 'subagent'
+      ? findRequestOverlapGroups(
+          node.children.map((child, ci) => ({
+            key: `g-${i}-c-${ci}`,
+            startS: child.startS,
+            endS: child.endS,
+          })),
+          `subagent-${i}`,
+        )
+      : [],
+  );
+  const groups: RequestOverlapGroup[] = [...mainGroups, ...subagentGroups];
+
+  const byRow = new Map<string, RowOverlap[]>();
+  groups.forEach((group, groupIndex) => {
+    // Labels and colors are numbered across all scopes, main-agent groups first.
+    const overlap: RowOverlap = {
+      id: group.id,
+      label: `P${groupIndex + 1}`,
+      color: OVERLAP_COLORS[groupIndex % OVERLAP_COLORS.length]!,
+      startS: group.startS,
+      endS: group.endS,
+      peerCount: group.requestKeys.length - 1,
+    };
+    for (const key of group.requestKeys) {
+      const existing = byRow.get(key);
+      // Append in place rather than rebuilding the row's array on every insert.
+      if (existing) existing.push(overlap);
+      else byRow.set(key, [overlap]);
+    }
+  });
+  return byRow;
+}
+
+/**
+ * Flatten structure nodes into the rows currently visible: one row per main
+ * turn, one header per subagent group, plus indented children for expanded
+ * groups. Row keys (`t-{i}`, `g-{i}`, `g-{i}-c-{ci}`) index by node position so
+ * they stay stable across expand/collapse.
+ *
+ * @param nodes conversation structure nodes, in stored order.
+ * @param expanded node indices of the subagent groups that are expanded.
+ * @param overlapsByRow overlap groups per row key; rows missing from the map get none.
+ * @returns the visible rows in display order.
+ */
+export function buildVisibleRows(
+  nodes: readonly StructureNode[],
+  expanded: ReadonlySet<number>,
+  overlapsByRow: ReadonlyMap<string, RowOverlap[]>,
+): VisibleRow[] {
+  const out: VisibleRow[] = [];
+  // Turn labels count main turns only, so subagent nodes do not consume a number.
+  let turnNo = 0;
+  nodes.forEach((node: StructureNode, i) => {
+    if (node.kind === 'turn') {
+      turnNo += 1;
+      out.push({
+        key: `t-${i}`,
+        label: `Turn ${turnNo}`,
+        sublabel: node.model ?? undefined,
+        timeLabel: timeLabel(node.startS, node.endS),
+        cached: node.cached,
+        uncached: node.uncached,
+        output: node.out,
+        total: node.in + node.out,
+        indent: 0,
+        isGroup: false,
+        isExpanded: false,
+        overlaps: overlapsByRow.get(`t-${i}`) ?? [],
+      });
+      return;
+    }
+
+    const isExpanded = expanded.has(i);
+    const childCount = node.children.length;
+    const turnsText = `${childCount} turn${childCount === 1 ? '' : 's'}`;
+    const durationText = node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '';
+    out.push({
+      key: `g-${i}`,
+      label: `${node.label}`,
+      sublabel: `${turnsText}${durationText}`,
+      timeLabel: timeLabel(node.startS, node.endS),
+      cached: node.cached,
+      uncached: node.uncached,
+      output: node.out,
+      total: node.in + node.out,
+      indent: 0,
+      isGroup: true,
+      isExpanded,
+      groupIndex: i,
+      // Header rows never carry overlap markers; only their children do.
+      overlaps: [],
+    });
+    if (!isExpanded) return;
+    node.children.forEach((child, ci) => {
+      out.push({
+        key: `g-${i}-c-${ci}`,
+        label: `↳ subturn ${ci + 1}`,
+        sublabel: child.model ?? undefined,
+        timeLabel: timeLabel(child.startS, child.endS),
+        cached: child.cached,
+        uncached: child.uncached,
+        output: child.out,
+        total: child.in + child.out,
+        indent: 1,
+        isGroup: false,
+        isExpanded: false,
+        overlaps: overlapsByRow.get(`g-${i}-c-${ci}`) ?? [],
+      });
+    });
+  });
+  return out;
+}
+
+/** One bracket segment drawn on a single row for a single overlap group. */
+export interface BraceSeg {
+ /**
+ * Position within the group's row span: `first`/`last` are the span's end
+ * rows, `middle` is a member row in between, `through` is a non-member row
+ * inside the span.
+ */
+ role: 'first' | 'middle' | 'last' | 'through';
+ /** Whether this row is itself a member of the group. */
+ isMember: boolean;
+ /** The group's color. */
+ color: string;
+ /** Id of the overlap group this segment belongs to. */
+ groupId: string;
+ /** The group's peer count, copied from its RowOverlap. */
+ peerCount: number;
+ /** Start of the group's overlap window. */
+ startS: number;
+ /** End of the group's overlap window. */
+ endS: number;
+}
+
+/** Output of computeBraceLayout. */
+export interface BraceLayout {
+ /** Number of lanes to draw, capped at MAX_LANES. */
+ laneCount: number;
+ /** How many assigned lanes exceeded the cap and were folded into the last drawn lane. */
+ overflowLanes: number;
+ /** Per visible row: only the lanes that actually carry a bracket segment. */
+ rowSegs: { lane: number; seg: BraceSeg }[][];
+}
+
+/**
+ * Geometry for the parallel-group brackets drawn in the left gutter. Each
+ * overlap group becomes a vertical bracket spanning from its first to its last
+ * visible member row, with a right-pointing tick on the exact member rows.
+ * Non-transitive chains (a row in two groups) get separate lanes so their
+ * brackets sit side by side. `through` = a row inside a group's span that is
+ * NOT itself a member (the aux-stream edge case) — drawn as a faint connector
+ * with no tick.
+ *
+ * @param rows the currently visible rows, in display order.
+ * @returns the lane count (capped at MAX_LANES), the number of folded lanes,
+ *   and the sparse per-row bracket segments.
+ */
+export function computeBraceLayout(rows: readonly VisibleRow[]): BraceLayout {
+  const groupMap = new Map<
+    string,
+    { id: string; color: string; peerCount: number; startS: number; endS: number; idxs: number[] }
+  >();
+  rows.forEach((r, idx) => {
+    for (const ov of r.overlaps) {
+      const g = groupMap.get(ov.id) ?? {
+        id: ov.id,
+        color: ov.color,
+        peerCount: ov.peerCount,
+        startS: ov.startS,
+        endS: ov.endS,
+        idxs: [],
+      };
+      g.idxs.push(idx);
+      groupMap.set(ov.id, g);
+    }
+  });
+  const groups = [...groupMap.values()]
+    .filter((g) => g.idxs.length >= 2) // need ≥2 visible members to bracket
+    .map((g) => ({
+      ...g,
+      // idxs is filled while walking rows in order, so it is already ascending:
+      // its ends are the span bounds, with no need to spread it into Math.min/max.
+      min: g.idxs[0]!,
+      max: g.idxs[g.idxs.length - 1]!,
+      members: new Set(g.idxs),
+    }))
+    .toSorted((a, b) => a.min - b.min || a.max - b.max);
+
+  // Greedy lane assignment: a group reuses a lane whose previous group ended
+  // before this one starts.
+  const laneEnd: number[] = [];
+  const laneOf = new Map<string, number>();
+  for (const g of groups) {
+    let lane = laneEnd.findIndex((end) => end < g.min);
+    if (lane === -1) {
+      lane = laneEnd.length;
+      laneEnd.push(g.max);
+    } else {
+      laneEnd[lane] = g.max;
+    }
+    laneOf.set(g.id, lane);
+  }
+  const rawLaneCount = laneEnd.length;
+  // Bound the gutter (see MAX_LANES). Lanes past the cap collapse onto the last
+  // visible lane, so every parallel row still carries a marker but the gutter
+  // width and DOM-node count stay bounded regardless of how parallel the
+  // conversation is.
+  const laneCount = Math.min(rawLaneCount, MAX_LANES);
+  const displayLane = (lane: number) => Math.min(lane, laneCount - 1);
+
+  // Sparse per-row segments: only lanes that actually carry a bracket on a row
+  // are stored (and later rendered), keeping the node count proportional to
+  // the brackets actually drawn rather than lanes × rows.
+  const rowSegs: { lane: number; seg: BraceSeg }[][] = rows.map(() => []);
+  for (const g of groups) {
+    const lane = displayLane(laneOf.get(g.id)!);
+    for (let idx = g.min; idx <= g.max; idx++) {
+      const isMember = g.members.has(idx);
+      const role =
+        idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through';
+      const seg: BraceSeg = {
+        role,
+        isMember,
+        color: g.color,
+        groupId: g.id,
+        peerCount: g.peerCount,
+        startS: g.startS,
+        endS: g.endS,
+      };
+      const cell = rowSegs[idx]!;
+      const existing = cell.find((c) => c.lane === lane);
+      // Collisions only happen in the folded overflow lane. Prefer a real
+      // member marker over a faint pass-through connector.
+      if (!existing) cell.push({ lane, seg });
+      else if (seg.isMember && !existing.seg.isMember) existing.seg = seg;
+    }
+  }
+  return { laneCount, overflowLanes: rawLaneCount - laneCount, rowSegs };
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts
new file mode 100644
index 00000000..0af344f1
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts
@@ -0,0 +1,246 @@
+import { describe, expect, it } from 'vitest';
+
+import type {
+ StructureNode,
+ SubagentNode,
+ TurnNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+import {
+ buildRowOverlaps,
+ buildVisibleRows,
+ computeBraceLayout,
+ findRequestOverlapGroups,
+ formatElapsedTime,
+ resolveDeepLinkTarget,
+ timeLabel,
+} from './trace-flamegraph-model';
+
+describe('formatElapsedTime', () => {
+ it('formats elapsed seconds below and above one hour', () => {
+ expect(formatElapsedTime(0)).toBe('00:00');
+ expect(formatElapsedTime(65.4)).toBe('01:05');
+ expect(formatElapsedTime(3661.6)).toBe('1:01:02');
+ expect(formatElapsedTime(86_541.149)).toBe('24:02:21');
+ });
+
+ it('clamps negative offsets to the conversation origin', () => {
+ expect(formatElapsedTime(-5)).toBe('00:00');
+ });
+});
+
+describe('timeLabel', () => {
+ it('renders a range when the end is after the start, a point otherwise', () => {
+ expect(timeLabel(65, 130)).toBe('+01:05–02:10');
+ expect(timeLabel(65)).toBe('+01:05');
+ expect(timeLabel(65, 65)).toBe('+01:05');
+ expect(timeLabel(undefined, 130)).toBeUndefined();
+ expect(timeLabel(Number.NaN, 130)).toBeUndefined();
+ });
+});
+
+describe('findRequestOverlapGroups', () => {
+ it('keeps non-transitive overlap chains as separate groups', () => {
+ const groups = findRequestOverlapGroups([
+ { key: 'A', startS: 1, endS: 8 },
+ { key: 'B', startS: 5, endS: 11 },
+ { key: 'C', startS: 9, endS: 15 },
+ ]);
+
+ expect(groups.map((group) => group.requestKeys)).toEqual([
+ ['A', 'B'],
+ ['B', 'C'],
+ ]);
+ expect(groups.map(({ startS, endS }) => [startS, endS])).toEqual([
+ [5, 8],
+ [9, 11],
+ ]);
+ });
+
+ it('does not consider touching or invalid intervals parallel', () => {
+ expect(
+ findRequestOverlapGroups([
+ { key: 'A', startS: 1, endS: 5 },
+ { key: 'B', startS: 5, endS: 8 },
+ { key: 'missing-end', startS: 3 },
+ { key: 'zero-duration', startS: 4, endS: 4 },
+ ]),
+ ).toEqual([]);
+ });
+
+ it('returns only the maximal simultaneous set for nested intervals', () => {
+ const groups = findRequestOverlapGroups([
+ { key: 'A', startS: 1, endS: 10 },
+ { key: 'B', startS: 2, endS: 8 },
+ { key: 'C', startS: 3, endS: 7 },
+ ]);
+ expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]);
+ });
+});
+
+/** Test factory: a main-turn node with fixed token counts; `extra` overrides any field. */
+const turn = (turnIndex: number, extra: Partial<TurnNode> = {}): TurnNode => ({
+  kind: 'turn',
+  turnIndex,
+  in: 100,
+  out: 10,
+  cached: 0,
+  uncached: 100,
+  ...extra,
+});
+/** Test factory: a subagent node wrapping `children`, with fixed token counts; `extra` overrides any field. */
+const subagent = (children: TurnNode[], extra: Partial<SubagentNode> = {}): SubagentNode => ({
+  kind: 'subagent',
+  label: 'Subagent',
+  in: 100,
+  out: 10,
+  cached: 0,
+  uncached: 100,
+  children,
+  ...extra,
+});
+
+describe('resolveDeepLinkTarget', () => {
+ // Node layout mirroring a real Weka conversation: raw entries
+ // 0: turn, 1: subagent (2 children), 2: turn
+ const withRawIndexes: StructureNode[] = [
+ turn(0, { rawIndex: 0 }),
+ subagent([turn(1, { rawIndex: 1, innerIndex: 0 }), turn(2, { rawIndex: 1, innerIndex: 1 })], {
+ agentId: 'subagent_001_abcd1234',
+ rawIndex: 1,
+ }),
+ turn(3, { rawIndex: 2 }),
+ ];
+ // The same conversation as stored by the pre-rawIndex ingest (fields absent).
+ const legacy: StructureNode[] = [
+ turn(0),
+ subagent([turn(1), turn(2)], { agentId: 'subagent_001_abcd1234' }),
+ turn(3),
+ ];
+
+ it('resolves raw source coordinates against explicit rawIndex fields', () => {
+ expect(resolveDeepLinkTarget(withRawIndexes, { raw: 2 })).toEqual({
+ rowKey: 't-2',
+ expandGroup: null,
+ });
+ expect(resolveDeepLinkTarget(withRawIndexes, { raw: 1, inner: 1 })).toEqual({
+ rowKey: 'g-1-c-1',
+ expandGroup: 1,
+ });
+ });
+
+ it('falls back to node array position for structures ingested before rawIndex existed', () => {
+ // One node per raw entry means position === raw index, so the deep link
+ // must still resolve exactly (regression: it previously returned null and
+ // the flamegraph neither scrolled nor highlighted anything).
+ expect(resolveDeepLinkTarget(legacy, { raw: 2, turn: 1 })).toEqual({
+ rowKey: 't-2',
+ expandGroup: null,
+ });
+ expect(resolveDeepLinkTarget(legacy, { raw: 0, turn: 0 })).toEqual({
+ rowKey: 't-0',
+ expandGroup: null,
+ });
+ });
+
+ it('resolves subagent children positionally when innerIndex is absent', () => {
+ expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 1, turn: 1 })).toEqual({
+ rowKey: 'g-1-c-1',
+ expandGroup: 1,
+ });
+ });
+
+ it('returns null for out-of-range raw coordinates instead of guessing', () => {
+ expect(resolveDeepLinkTarget(legacy, { raw: 9 })).toBeNull();
+ expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 5 })).toBeNull();
+ // raw pointing at a subagent marker without inner does not match a turn.
+ expect(resolveDeepLinkTarget(legacy, { raw: 1 })).toBeNull();
+ });
+
+ it('keeps the positional turn/agent fallback for links without raw coordinates', () => {
+ expect(resolveDeepLinkTarget(legacy, { turn: 1 })).toEqual({
+ rowKey: 't-2',
+ expandGroup: null,
+ });
+ expect(resolveDeepLinkTarget(legacy, { turn: 1, agent: 'subagent_001_abcd1234' })).toEqual({
+ rowKey: 'g-1-c-1',
+ expandGroup: 1,
+ });
+ expect(resolveDeepLinkTarget(legacy, {})).toBeNull();
+ });
+});
+
+describe('buildVisibleRows', () => {
+ const nodes: StructureNode[] = [
+ turn(0, { startS: 0, endS: 10, model: 'claude' }),
+ subagent([turn(1), turn(2)], { label: 'Subagent: search', durationMs: 12_000 }),
+ turn(3),
+ ];
+
+ it('hides collapsed subagent children and keys rows by node position', () => {
+ const rows = buildVisibleRows(nodes, new Set(), new Map());
+ expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 't-2']);
+ expect(rows[0]).toMatchObject({
+ label: 'Turn 1',
+ sublabel: 'claude',
+ timeLabel: '+00:00–00:10',
+ total: 110,
+ isGroup: false,
+ });
+ expect(rows[1]).toMatchObject({
+ label: 'Subagent: search',
+ sublabel: '2 turns · 12s',
+ isGroup: true,
+ isExpanded: false,
+ groupIndex: 1,
+ });
+ });
+
+ it('inserts indented child rows for expanded groups and attaches overlaps', () => {
+ const overlap = {
+ id: 'main-1',
+ label: 'P1',
+ color: '#06b6d4',
+ startS: 0,
+ endS: 1,
+ peerCount: 1,
+ };
+ const rows = buildVisibleRows(nodes, new Set([1]), new Map([['g-1-c-0', [overlap]]]));
+ expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 'g-1-c-0', 'g-1-c-1', 't-2']);
+ expect(rows[2]).toMatchObject({ label: '↳ subturn 1', indent: 1, overlaps: [overlap] });
+ expect(rows[3]!.overlaps).toEqual([]);
+ });
+});
+
+describe('buildRowOverlaps and computeBraceLayout', () => {
+ it('brackets parallel main turns and spans a non-member row as pass-through', () => {
+ const nodes: StructureNode[] = [
+ turn(0, { startS: 0, endS: 10 }),
+ turn(1), // untimed — sits inside the bracket span without being a member
+ turn(2, { startS: 5, endS: 30 }), // overlaps turn 0 and turn 3
+ turn(3, { startS: 28, endS: 40 }),
+ ];
+ const overlaps = buildRowOverlaps(nodes);
+ expect([...overlaps.keys()].toSorted()).toEqual(['t-0', 't-2', 't-3']);
+
+ const rows = buildVisibleRows(nodes, new Set(), overlaps);
+ const layout = computeBraceLayout(rows);
+ // Two overlap groups sharing rows 0–2 and 2–3 need two side-by-side lanes.
+ expect(layout.laneCount).toBe(2);
+ expect(layout.overflowLanes).toBe(0);
+ const roles = layout.rowSegs.map((segs) =>
+ segs.map(({ lane, seg }) => `${lane}:${seg.role}${seg.isMember ? '' : ':nonmember'}`),
+ );
+ expect(roles[0]).toEqual(['0:first']);
+ expect(roles[1]).toEqual(['0:through:nonmember']);
+ expect(roles[2]!.toSorted()).toEqual(['0:last', '1:first']);
+ expect(roles[3]).toEqual(['1:last']);
+ });
+
+ it('reports no lanes for a fully serial conversation', () => {
+ const nodes: StructureNode[] = [
+ turn(0, { startS: 0, endS: 5 }),
+ turn(1, { startS: 5, endS: 9 }),
+ ];
+ const rows = buildVisibleRows(nodes, new Set(), buildRowOverlaps(nodes));
+ expect(computeBraceLayout(rows)).toEqual({ laneCount: 0, overflowLanes: 0, rowSegs: [[], []] });
+ });
+});
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
new file mode 100644
index 00000000..d63cc691
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -0,0 +1,439 @@
+'use client';
+
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+import type { ConversationStructure } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+import { compact, formatShare } from './format';
+import {
+ buildRowOverlaps,
+ buildVisibleRows,
+ computeBraceLayout,
+ formatElapsedTime,
+ MAX_LANES,
+ OVERLAP_COLORS,
+ resolveDeepLinkTarget,
+ type VisibleRow,
+} from './trace-flamegraph-model';
+
+// Pure logic lives in trace-flamegraph-model.ts; re-exported here so this file
+// stays the module entry point for the flamegraph's public API.
+export {
+ findRequestOverlapGroups,
+ formatElapsedTime,
+ resolveDeepLinkTarget,
+} from './trace-flamegraph-model';
+export type {
+ DeepLinkHighlight,
+ DeepLinkTarget,
+ RequestOverlapGroup,
+ TimedRequest,
+} from './trace-flamegraph-model';
+
+// Stacked-bar segment colors. Cached prefix vs uncached input vs output —
+// fixed hues (theme-independent) so the meaning is stable in light/dark.
+const SEG = {
+ cached: '#10b981', // emerald-500 — input served from prefix cache
+ uncached: '#f59e0b', // amber-500 — input that must be (re)computed
+ output: '#8b5cf6', // violet-500 — generated tokens
+} as const;
+
+const LEGEND = [ // one legend entry per SEG color; `label` is the text shown to the user
+ { key: 'cached', label: 'Cached prefix', color: SEG.cached },
+ { key: 'uncached', label: 'Uncached input', color: SEG.uncached },
+ { key: 'output', label: 'Output', color: SEG.output },
+] as const;
+
+// Width (px) of one parallel-group bracket lane in the left gutter. Overlapping
+// groups (non-transitive chains) get their own lane so their brackets sit
+// side-by-side instead of stacking visually.
+const LANE_W = 14;
+
+interface TooltipState { // hover tooltip: cursor position plus the row being described
+ x: number; // cursor clientX at the last mouse move
+ y: number; // cursor clientY at the last mouse move
+ row: VisibleRow; // the row currently hovered
+}
+
+/**
+ * Per-conversation flamegraph driven by the precomputed `structure` JSONB.
+ * One row per turn; subagent groups render a collapsible header with indented
+ * children (collapsed by default). Bars stack cached-prefix + uncached input +
+ * output; turns scale to the widest visible turn, group headers to the widest group.
+ */
+export function TraceFlamegraph({
+ structure,
+ highlightTurn,
+ highlightRawIndex,
+ highlightInnerIndex,
+ highlightAgentId,
+}: {
+ structure: ConversationStructure;
+ /** Turn index to scroll to / highlight (from a request-timeline deep link). */
+ highlightTurn?: number | null;
+ /** Raw Weka top-level request index to scroll to / highlight. */
+ highlightRawIndex?: number | null;
+ /** Raw Weka nested request index under highlightRawIndex, for subagent children. */
+ highlightInnerIndex?: number | null;
+ /** Subagent id when the highlighted turn is inside a subagent group. */
+ highlightAgentId?: string | null;
+}) {
+ const nodes = structure.nodes;
+
+ // Resolve the deep-link target to a row key (+ the group that must be open to
+ // show it). See resolveDeepLinkTarget for the matching rules.
+ const target = useMemo(
+ () =>
+ resolveDeepLinkTarget(nodes, {
+ turn: highlightTurn,
+ raw: highlightRawIndex,
+ inner: highlightInnerIndex,
+ agent: highlightAgentId,
+ }),
+ [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId],
+ );
+
+ // Subagent groups collapsed by default — except the deep-link target's group.
+ const [expanded, setExpanded] = useState>(() =>
+ typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(),
+ );
+ const [tooltip, setTooltip] = useState(null);
+ const scrollRef = useRef(null);
+
+ // Portal target only exists after mount (the tooltip is portaled to body so
+ // its position:fixed is viewport-relative, immune to ancestor transforms).
+ const [mounted, setMounted] = useState(false);
+ useEffect(() => setMounted(true), []);
+
+ // The deep-link target row gets a state-driven highlight (ring + bg flash)
+ // that fades out — state-driven so a re-render can't clobber it, and so the
+ // fade is a real CSS transition rather than an abrupt classList removal.
+ const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null);
+
+ // When the deep-link target resolves/changes: expand its subagent group, then
+ // (after the row renders) scroll it into view and flash the highlight. Runs on
+ // first load and on any later target change (e.g. clicking another bar into
+ // the same conversation). The row query/scroll is deferred to the next frame
+ // so the just-expanded child row exists in the DOM.
+ useEffect(() => {
+ if (!target) return;
+ if (typeof target.expandGroup === 'number') {
+ const gi = target.expandGroup;
+ setExpanded((prev) => (prev.has(gi) ? prev : new Set(prev).add(gi)));
+ }
+ setHighlightKey(target.rowKey);
+ const raf = requestAnimationFrame(() => {
+ scrollRef.current
+ ?.querySelector(`[data-rowkey="${target.rowKey}"]`)
+ ?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+ });
+ const t = setTimeout(() => setHighlightKey(null), 2200); // clear the highlight after 2.2 s
+ return () => {
+ cancelAnimationFrame(raf);
+ clearTimeout(t);
+ };
+ }, [target]);
+
+ const groupIndexes = useMemo(() => {
+ const out: number[] = [];
+ nodes.forEach((node, i) => {
+ if (node.kind === 'subagent') out.push(i); // collect the node index of every subagent group
+ });
+ return out;
+ }, [nodes]);
+
+ const toggle = useCallback((i: number) => {
+ setExpanded((prev) => {
+ const next = new Set(prev); // copy — the previous state Set is never mutated
+ if (next.has(i)) next.delete(i);
+ else next.add(i);
+ return next;
+ });
+ }, []);
+
+ const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]);
+ const collapseAll = useCallback(() => setExpanded(new Set()), []);
+
+ const overlapsByRow = useMemo(() => buildRowOverlaps(nodes), [nodes]);
+
+ const rows = useMemo(
+ () => buildVisibleRows(nodes, expanded, overlapsByRow),
+ [nodes, expanded, overlapsByRow],
+ );
+
+ // Two scales: leaf turns/subturns share a per-turn axis (the primary signal —
+ // how cached/uncached evolves), while subagent group headers carry aggregates
+ // orders of magnitude larger, so they get their own axis to stay comparable to
+ // each other. Group bars render slim + muted, so the mixed scale reads as a
+ // distinct "group summary" track rather than a contradiction.
+ const maxTotal = useMemo(
+ () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)),
+ [rows],
+ );
+ const maxGroupTotal = useMemo(
+ () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)),
+ [rows],
+ );
+
+ const braces = useMemo(() => computeBraceLayout(rows), [rows]);
+
+ const onMove = (e: React.MouseEvent, row: VisibleRow) => {
+ setTooltip({ x: e.clientX, y: e.clientY, row }); // remember cursor position + hovered row
+ };
+
+ return (
+
+
+
+ {LEGEND.map((l) => (
+
+
+ {l.label}
+
+ ))}
+
+
+ Bracketed rows ran in parallel
+
+
+ {groupIndexes.length > 0 && (
+
+ {
+ track('datasets_flamegraph_expand_all');
+ expandAll();
+ }}
+ className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+ >
+ Expand all
+
+ {
+ track('datasets_flamegraph_collapse_all');
+ collapseAll();
+ }}
+ className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+ >
+ Collapse all
+
+
+ )}
+
+
+ {braces.overflowLanes > 0 && (
+
+ Dense parallel region — bracket lanes capped at {MAX_LANES}; {braces.overflowLanes}{' '}
+ further overlapping {braces.overflowLanes === 1 ? 'group is' : 'groups are'} folded into
+ the last lane.
+
+ )}
+
+
+ {/* gap-0 so the per-row bracket segments connect into a continuous
+ vertical rail across the rows of a parallel group. */}
+
+ {rows.map((row, idx) => {
+ // Group headers use the group axis; turns/subturns use the per-turn
+ // axis. Clamp to [0.5%, 100%] of the track width either way.
+ const denom = row.isGroup ? maxGroupTotal : maxTotal;
+ const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100));
+ const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; // cached share of this row's total, in %
+ const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; // uncached share, in %
+ const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; // output share, in %
+ const isHighlighted = row.key === highlightKey;
+ const segs = braces.rowSegs[idx]!;
+ return (
+
+ {/* Parallel-group bracket gutter (only rendered when the
+ conversation has any overlaps, so non-overlap traces keep a
+ flush-left layout with no dead space). Segments are sparse and
+ absolutely positioned per lane so a row only pays for the
+ lanes it actually touches. */}
+ {braces.laneCount > 0 && (
+
+ {segs.map(({ lane, seg }) => {
+ const top = seg.role === 'first' ? '50%' : '0';
+ const bottom = seg.role === 'last' ? '50%' : '0';
+ return (
+
+ {/* vertical rail */}
+
+ {/* right-pointing tick marking an actual member row */}
+ {seg.isMember && (
+
+ )}
+
+ );
+ })}
+
+ )}
+
+ {/* row content (indented for subagent children) */}
+
+ {/* label / group toggle */}
+
+ {row.isGroup ? (
+ {
+ track('datasets_flamegraph_group_toggled', {
+ expanded: !row.isExpanded,
+ });
+ if (row.groupIndex !== undefined) toggle(row.groupIndex);
+ }}
+ className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+ >
+
+ {row.isExpanded ? '▾' : '▸'}
+
+ {row.label}
+
+ ) : (
+ {row.label}
+ )}
+
+
+ {/* Original interval, measured from conversation start. */}
+
+ {row.timeLabel ?? '—'}
+
+
+ {/* stacked bar — group headers render as a slim muted summary
+ strip so they read as aggregates, not individual turns. */}
+
onMove(e, row)}
+ onMouseLeave={() => setTooltip(null)}
+ >
+
+
+
+ {/* total */}
+
+ {compact(row.total)}
+
+
+
+ );
+ })}
+
+
+
+ {tooltip &&
+ mounted &&
+ createPortal(
+
+
+ {tooltip.row.label}
+ {tooltip.row.sublabel ? (
+
+ {tooltip.row.sublabel}
+
+ ) : null}
+
+
+ Cached prefix
+
+ {compact(tooltip.row.cached)}
+
+ Uncached input
+
+ {compact(tooltip.row.uncached)}
+
+ Output
+
+ {compact(tooltip.row.output)}
+
+ Cached %
+
+ {formatShare(tooltip.row.cached, tooltip.row.cached + tooltip.row.uncached)}
+
+ From start
+
+ {tooltip.row.timeLabel ?? '—'}
+
+
+
,
+ document.body,
+ )}
+
+ );
+}
diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx
index 8fbf52ac..1a12057e 100644
--- a/packages/app/src/components/header/header.tsx
+++ b/packages/app/src/components/header/header.tsx
@@ -9,6 +9,7 @@ import { track } from '@/lib/analytics';
import { ModeToggle } from '@/components/ui/mode-toggle';
import { MinecraftToggles } from '@/components/minecraft/minecraft-toggles';
import { navigateInApp } from '@/lib/client-navigation';
+import { useFeatureGate } from '@/lib/use-feature-gate';
import { cn } from '@/lib/utils';
import { GitHubStars } from './GithubStars';
@@ -46,6 +47,15 @@ const NAV_LINKS = [
testId: 'nav-link-supporters',
event: 'header_supporters_clicked',
},
+ {
+ href: '/datasets',
+ label: 'Datasets',
+ testId: 'nav-link-datasets',
+ event: 'header_datasets_clicked',
+ // Agentic surface — hidden behind the konami-code feature gate (default off)
+ // until agentic launches. Same gate as the Hidden tab dropdown.
+ gated: true,
+ },
{ href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' },
{ href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' },
] as const;
@@ -62,9 +72,14 @@ function isActive(pathname: string, href: string): boolean {
export const Header = ({ starCount }: { starCount?: number | null }) => {
const pathname = usePathname() ?? '/';
const router = useRouter();
+ const featureGateUnlocked = useFeatureGate();
const [mobileMenuOpen, setMobileMenuOpen] = useState(false);
const menuRef = useRef(null);
+ // Hide gated nav links (e.g. Datasets — an agentic surface) unless the shared
+ // feature gate is unlocked. Mirrors the tab-nav "Hidden" dropdown gating.
+ const navLinks = NAV_LINKS.filter((l) => !('gated' in l && l.gated) || featureGateUnlocked);
+
// Close menu on route change
useEffect(() => {
setMobileMenuOpen(false);
@@ -118,7 +133,7 @@ export const Header = ({ starCount }: { starCount?: number | null }) => {
{/* Desktop nav */}
- {NAV_LINKS.map(({ href, label, testId, event }) => (
+ {navLinks.map(({ href, label, testId, event }) => (
{
{mobileMenuOpen && (
- {NAV_LINKS.map(({ href, label, event }) => (
+ {navLinks.map(({ href, label, event }) => (
getUrlParam('i_metric') || initialYAxisMetric || 'y_tpPerGpu',
);
const [selectedXAxisMetric, setSelectedXAxisMetric] = useState
(
- () => getUrlParam('i_xmetric') || 'p99_ttft',
+ () => getUrlParam('i_xmetric') || 'p90_ttft',
);
const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState(
- () => getUrlParam('i_e2e_xmetric') || null,
+ () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
+ );
+ // Selected chart variant (x-axis mode). SSR has no URL access and cannot
+ // know the scenario kind, so seeding the state from either would make the
+ // server and client first renders diverge and cause a hydration mismatch
+ // ("didn't match" warnings when the URL holds a non-default mode). Instead:
+ // seed with a fixed default ('ttft') so both first renders are identical,
+ // apply a valid `i_xmode` URL value (if any) in the post-mount effect just
+ // below, and apply the scenario-kind default in a later post-mount effect.
+ // xAxisModeFromUrlRef records that the mode was set explicitly — restored
+ // from the URL here, or chosen through handleSetXAxisMode.
+ const [selectedXAxisMode, setSelectedXAxisMode] = useState('ttft');
+ const xAxisModeFromUrlRef = useRef(false);
+ useEffect(() => {
+ if (xAxisModeFromUrlRef.current) return;
+ const v = getUrlParam('i_xmode');
+ if (v && (X_AXIS_MODES as readonly string[]).includes(v)) {
+ xAxisModeFromUrlRef.current = true;
+ setSelectedXAxisMode(v as XAxisMode);
+ }
+ // eslint-disable-next-line react-hooks/exhaustive-deps
+ }, []);
+ // Wrap the setter so every mode change made through it also sets
+ // xAxisModeFromUrlRef. It does NOT touch selectedE2eXAxisMetric itself.
+ const handleSetXAxisMode = useCallback((mode: XAxisMode) => {
+ xAxisModeFromUrlRef.current = true;
+ setSelectedXAxisMode(mode);
+ // The e2e chart's x-axis metric is reconciled in a separate effect below,
+ // because it depends on sequence kind (fixed-seq has no p90_* metrics) and
+ // the agentic percentile, both of which can change independently.
+ }, []);
+ // Latency percentile applied to the chart x-axis for agentic scenarios.
+ // Values: 'p90' | 'p99'. Non-agentic charts ignore.
+ const [selectedPercentile, setSelectedPercentile] = useState(
+ () => getUrlParam('i_pctl') || 'p90',
);
const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
() => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
@@ -208,21 +248,22 @@ export function InferenceProvider({
// Legacy `?i_nolabel=1` from before the rename: keep hiding point labels
// explicitly so the share link's intent survives future default changes.
if (getUrlParam('i_nolabel') === '1') return false;
+ if (getUrlParam('i_label') === '0') return false;
if (getUrlParam('i_label') === '1') return true;
- // Old share links set `?i_advlabel=1` while keeping the labels default
- // (shown). Mirror the toggle's auto-enable side-effect on load so those
- // links still render advanced labels under the new default-off behavior.
- if (getUrlParam('i_advlabel') === '1') return true;
- return false;
+ // Default on: point labels (TP + concurrency, or the fuller parallelism
+ // breakdown when Parallelism Labels is toggled on) are useful either way.
+ return true;
});
const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1');
+ // Parallelism labels default off (?i_advlabel=1 overrides on).
const [useAdvancedLabels, setUseAdvancedLabels] = useState(
() => getUrlParam('i_advlabel') === '1',
);
const [showGradientLabels, setShowGradientLabels] = useState(
() => getUrlParam('i_gradlabel') === '1',
);
- const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0');
+ // Line labels default off (?i_linelabel=1 overrides on).
+ const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1');
const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1');
const [showMinecraftOverlay, setShowMinecraftOverlay] = useState(
() => getUrlParam('i_mc') === '1',
@@ -291,13 +332,68 @@ export function InferenceProvider({
return ids.length > 0 ? ids.reduce((max, id) => (id > max ? id : max), ids[0]) : '';
}, [filteredAvailableRuns]);
- // Only constrain the query when an earlier-than-latest run is selected; otherwise
- // the chart shows the full latest view (and reuses the materialized-view fast path).
+ // Only constrain the base query when an earlier-than-latest run is selected.
const asOfRunId =
effectiveSelectedRunId && latestRunIdForModel && effectiveSelectedRunId !== latestRunIdForModel
? effectiveSelectedRunId
: undefined;
+ // Run-selector scoping: only constrain benchmark data to a specific run when
+ // there's actually a disambiguation to make for the CURRENT model. The
+ // raw `availableRuns` is across ALL models on the date, so the picker may
+ // auto-select a run that produced nothing for the current model — passing
+ // that runId would return zero rows and hide the chart entirely.
+ // Compute the set of runs whose CHANGELOG explicitly mentions this model +
+ // precision. We can't reuse `filterRunsByModel` here because it has a
+ // fallback that returns all runs when nothing matches (so the picker still
+ // renders) — which would make us pass a runId that produced no rows for
+ // the current model, hiding the chart.
+ // Map each FULL config_key (model-precision-hardware-framework) a run's
+ // changelog claims to the set of runs claiming it. Single-run scoping should
+ // only kick in when two runs contest the SAME full key — e.g. a same-day
+ // re-run of one hardware — because then a DISTINCT ON merge could mix them
+ // and the user needs to pick which run wins. Runs covering DIFFERENT hardware
+ // of the same model (e.g. a B300 run and a B200 run on the same date) are
+ // complementary: both must render via carry-forward. Matching on model+
+ // precision alone (the old behavior) wrongly treated those as alternatives
+ // and scoped the chart to one run, hiding the other GPU's curve.
+ const contestedRunIds = useMemo(() => {
+ const runsByConfigKey = new Map>();
+ if (availableRuns) {
+ for (const [runId, runInfo] of Object.entries(availableRuns)) {
+ if (!runInfo.changelog) continue;
+ for (const entry of runInfo.changelog.entries) {
+ for (const key of entry.config_keys) {
+ const parts = key.split('-');
+ if (modelPrefixes.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) {
+ let runs = runsByConfigKey.get(key);
+ if (!runs) {
+ runs = new Set();
+ runsByConfigKey.set(key, runs);
+ }
+ runs.add(runId);
+ }
+ }
+ }
+ }
+ }
+ // A run is "contested" only if some full config_key it claims is also claimed
+ // by another run. Only then does picking a run disambiguate anything.
+ // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the
+ // WHOLE chart to the run: only the configs the run actually produced are
+ // pinned to it, and every other config (e.g. another framework's same-day
+ // run) still carries forward from the normal latest-per-config rows.
+ const contested = new Set();
+ for (const runs of runsByConfigKey.values()) {
+ if (runs.size > 1) for (const r of runs) contested.add(r);
+ }
+ return contested;
+ }, [availableRuns, modelPrefixes, effectivePrecisions]);
+ const benchmarkRunId =
+ effectiveSelectedRunId && contestedRunIds.has(String(effectiveSelectedRunId))
+ ? String(effectiveSelectedRunId)
+ : undefined;
+
const {
graphs,
loading: chartDataLoading,
@@ -317,9 +413,17 @@ export function InferenceProvider({
userCosts,
userPowers,
effectiveRunDate,
- isActive,
+ // Gate benchmark fetching on sequenceResolved: before availability loads we
+ // don't yet know the model's real sequence, and the selectedSequence default
+ // is AgenticTraces. Fetching now would fire the agentic data path for a
+ // fixed-seq-only model, then refetch once availability snaps the sequence.
+ // The chart's normal loading state covers this brief window.
+ isActive && sequenceResolved,
latestDate,
+ selectedPercentile,
compareGpuPair ?? null,
+ benchmarkRunId,
+ selectedXAxisMode,
asOfRunId,
dataQuickFilters,
);
@@ -335,7 +439,7 @@ export function InferenceProvider({
if (!availabilityRows) return availableDates;
const rows = availabilityRows.filter((r) => {
if (!dbModelKeys.includes(r.model)) return false;
- if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false;
+ if (rowToSequence(r) !== effectiveSequence) return false;
if (!effectivePrecisions.includes(r.precision)) return false;
if (!r.hardware) return false;
const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -360,7 +464,7 @@ export function InferenceProvider({
const hwKeys = new Set();
for (const r of availabilityRows) {
if (!dbModelKeys.includes(r.model)) continue;
- if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue;
+ if (rowToSequence(r) !== effectiveSequence) continue;
if (!effectivePrecisions.includes(r.precision)) continue;
if (!r.hardware) continue;
const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -432,6 +536,60 @@ export function InferenceProvider({
setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev));
}, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]);
+ // Reconcile the x-axis mode with the scenario kind:
+ // - On mount with no `i_xmode` URL param: snap to the kind's natural default
+ // (interactivity for both agentic and fixed-sequence scenarios). The state was initialized
+ // to a SSR-stable constant so server and client render the same DOM; this
+ // effect fixes it up after hydration.
+ // - When the user later switches sequence kinds: snap to the new kind's
+ // natural default (the prior selection was for a different kind, so it
+ // doesn't carry over).
+ const lastSeqKindRef = useRef | null>(null);
+ useEffect(() => {
+ const kind = sequenceKind(effectiveSequence);
+ const isInitialMount = lastSeqKindRef.current === null;
+ const isAgenticOnlyMode = isAgenticOnlyXAxisMode(selectedXAxisMode);
+ // On a re-run where the kind hasn't changed (e.g. only the mode did), bail —
+ // unless the current mode is agentic-only while the scenario is fixed-seq. In
+ // that case force the snap so the chart doesn't try to plot trace-derived
+ // metrics against rows that have no trace_replay.
+ if (!isInitialMount && lastSeqKindRef.current === kind) {
+ if (kind === 'fixed-seq' && isAgenticOnlyMode) {
+ handleSetXAxisMode('interactivity');
+ }
+ return;
+ }
+ lastSeqKindRef.current = kind;
+ if (
+ isInitialMount &&
+ xAxisModeFromUrlRef.current &&
+ !(kind === 'fixed-seq' && isAgenticOnlyMode)
+ ) {
+ // A mode restored from the URL wins on first mount, so keep it. The one
+ // excluded case (agentic-only mode on fixed-seq) falls through to the snap below.
+ return;
+ }
+ handleSetXAxisMode('interactivity');
+ }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
+
+ // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
+ // agentic percentile changes. For fixed-seq the JSONB only carries
+ // median_* / p99_* (no p90_*), so the TTFT button there has to point at
+ // median_ttft — otherwise the chart goes blank. For agentic, we point at
+ // the user's chosen percentile so the dropdown actually drives the axis.
+ useEffect(() => {
+ const isAgentic = sequenceKind(effectiveSequence) === 'agentic';
+ if (selectedXAxisMode === 'ttft') {
+ setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft');
+ } else if (selectedXAxisMode === 'e2e') {
+ // null = use the chart-config natural x (median_e2el), which useChartData
+ // rewrites to `{percentile}_e2el` for agentic via withPercentile().
+ setSelectedE2eXAxisMetric(null);
+ }
+ // 'interactivity' mode renders the interactivity chart, which keys off
+ // selectedXAxisMetric (not the e2e one), so nothing to do here.
+ }, [selectedXAxisMode, effectiveSequence, selectedPercentile]);
+
// Ref guard: when true, filter changes don't clear the active preset.
// FavoritePresetsDropdown sets this while applying a preset so its own
// programmatic setter calls don't accidentally deactivate it.
@@ -875,21 +1033,23 @@ export function InferenceProvider({
useUrlStateSync(
{
i_metric: selectedYAxisMetric,
+ i_pctl: selectedPercentile,
i_gpus: selectedGPUs.join(','),
i_dates: selectedDates.join(','),
i_dstart: selectedDateRange.startDate,
i_dend: selectedDateRange.endDate,
i_optimal: hideNonOptimal ? '' : '0',
- i_label: showPointLabels ? '1' : '',
+ i_label: showPointLabels ? '' : '0',
i_hc: highContrast ? '1' : '',
i_log: logScale ? '1' : '',
i_xmetric: selectedXAxisMetric || '',
i_e2e_xmetric: selectedE2eXAxisMetric || '',
+ i_xmode: selectedXAxisMode,
i_scale: scaleType,
i_legend: isLegendExpanded ? '' : '0',
i_advlabel: useAdvancedLabels ? '1' : '',
i_gradlabel: showGradientLabels ? '1' : '',
- i_linelabel: showLineLabels ? '' : '0',
+ i_linelabel: showLineLabels ? '1' : '',
i_speed: showSpeedOverlay ? '1' : '',
i_mc: showMinecraftOverlay ? '1' : '',
i_active: iActiveStr,
@@ -902,6 +1062,7 @@ export function InferenceProvider({
selectedYAxisMetric,
selectedXAxisMetric,
selectedE2eXAxisMetric,
+ selectedXAxisMode,
scaleType,
selectedGPUs,
selectedDates,
@@ -1066,6 +1227,8 @@ export function InferenceProvider({
setSelectedXAxisMetric,
selectedE2eXAxisMetric,
setSelectedE2eXAxisMetric,
+ selectedXAxisMode,
+ setSelectedXAxisMode: handleSetXAxisMode,
scaleType,
setScaleType,
quickFilters,
@@ -1079,6 +1242,8 @@ export function InferenceProvider({
workflowInfo,
selectedYAxisMetric,
setSelectedYAxisMetric: setSelectedYAxisMetricAndClear,
+ selectedPercentile,
+ setSelectedPercentile,
selectedGPUs,
setSelectedGPUs: setSelectedGPUsAndClear,
availableGPUs,
@@ -1143,6 +1308,7 @@ export function InferenceProvider({
selectedYAxisMetric,
selectedXAxisMetric,
selectedE2eXAxisMetric,
+ selectedXAxisMode,
scaleType,
quickFilters,
availableQuickFilters,
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
new file mode 100644
index 00000000..64742acd
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -0,0 +1,334 @@
+'use client';
+
+import Link from 'next/link';
+import { usePathname, useRouter, useSearchParams } from 'next/navigation';
+import { useCallback, useMemo, useState } from 'react';
+import { ArrowLeft } from 'lucide-react';
+
+import { useAgenticAggregates } from '@/hooks/api/use-agentic-aggregates';
+import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
+import { useTraceServerMetrics } from '@/hooks/api/use-trace-server-metrics';
+import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { AggregatesGrid } from './aggregates-grid';
+import { MetricSourceToolbar } from './metric-source-toolbar';
+import {
+ phaseBoundarySec,
+ sliceServerSeriesByPhase,
+ sliceTimelineByPhase,
+ timelineHasWarmup,
+ type ServerSeriesLike,
+ type StagePhase,
+} from './phase-slice';
+import { PointSummary } from './point-summary';
+import { RequestMetricOverTime, SequenceMetricCard } from './request-metric-cards';
+import { RequestTimelineView } from './request-timeline';
+import {
+ CumulativeUniqueInputTokensCard,
+ InflightUniqueTokensCard,
+ KvCacheUtilizationCard,
+ PrefixCacheHitRateCard,
+ PromptTokenSourceCard,
+ RequestActivityCard,
+ ThroughputCard,
+ type RequestActivityView,
+} from './server-metric-cards';
+import { SiblingNav } from './sibling-nav';
+import type { ThroughputSeriesKey } from './time-series-math';
+
+interface Props {
+ id: number;
+}
+
+type DetailView = 'point' | 'timeline' | 'aggregates';
+
+/** Segmented-toggle entries for the detail views; each `value` is one of the `DetailView` literals. */
+const VIEW_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+ { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' },
+ { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
+];
+
+/** Type guard: narrows a raw `?view=` query value to one of the known detail views. */
+const isDetailView = (value: string | null): value is DetailView => {
+  switch (value) {
+    case 'point':
+    case 'timeline':
+    case 'aggregates':
+      return true;
+    default:
+      return false;
+  }
+};
+
+/**
+ * Detail view kept in the URL's `view` query param.
+ *
+ * Reading: any missing or unrecognised value resolves to `'point'`.
+ * Writing: `'point'` removes the param (it is the unadorned default), any other
+ * view sets it; all other query params are carried over, the route is replaced
+ * without scrolling, and the change is reported to analytics.
+ */
+function useDetailView(): [DetailView, (nextView: DetailView) => void] {
+  const router = useRouter();
+  const pathname = usePathname();
+  const searchParams = useSearchParams();
+
+  const fromUrl = searchParams.get('view');
+  const view: DetailView = isDetailView(fromUrl) ? fromUrl : 'point';
+
+  const setView = useCallback(
+    (nextView: DetailView) => {
+      // Copy the current params so unrelated query state survives the switch.
+      const params = new URLSearchParams(searchParams.toString());
+      if (nextView === 'point') {
+        params.delete('view');
+      } else {
+        params.set('view', nextView);
+      }
+      const qs = params.toString();
+      const href = qs ? `${pathname}?${qs}` : pathname;
+      router.replace(href, { scroll: false });
+      track('inference_agentic_detail_view_changed', { view: nextView });
+    },
+    [pathname, router, searchParams],
+  );
+
+  return [view, setView];
+}
+
+export function AgenticPointDetail({ id }: Props) {
+ const router = useRouter();
+ const metricsQuery = useTraceServerMetrics(id, true);
+ const siblingsQuery = useBenchmarkSiblings(id);
+
+ const metrics = metricsQuery.data;
+ const siblingsData = siblingsQuery.data;
+
+ const [view, setView] = useDetailView();
+ const [metricSourceId, setMetricSourceId] = useState('all');
+ const [requestActivityView, setRequestActivityView] = useState('queue');
+ const [throughputSeries, setThroughputSeries] = useState>(
+ () => new Set(['input', 'decode']),
+ );
+ // Fetch aggregates only when the aggregates view is active. Uses the full
+ // sibling set (across parallelism + concurrency configs) so each chart
+ // shows how the metric varies across the SKU.
+ const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
+ const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+ // Per-request timeline used by the timeline view AND every per-point
+ // request-derived chart (ISL/OSL, latency-over-time, in-flight), so fetch
+ // whenever we're on either view.
+ const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
+ const timeline = timelineQuery.data;
+
+ // Warmup vs profiling stage. Only meaningful when the point actually has a
+ // warmup phase (older runs are profiling-only) — when absent the toggle is
+ // hidden and everything falls back to the full (profiling) run.
+ const [phase, setPhase] = useState('profiling');
+ const hasWarmup = useMemo(() => timelineHasWarmup(timeline), [timeline]);
+ const effectivePhase: StagePhase = hasWarmup ? phase : 'profiling';
+
+ // Server-metric boundary on the chart's own t-axis (rebased through absolute
+ // ns — see phase-slice header for the origin-gap invariant). Request charts
+ // get a phase-scoped timeline (filtered + rebased) so they share a 0-based
+ // axis with the server charts for the selected phase.
+ const boundarySec = useMemo(() => phaseBoundarySec(metrics, timeline), [metrics, timeline]);
+ const phaseTimeline = useMemo(
+ () => (timeline ? sliceTimelineByPhase(timeline, effectivePhase) : null),
+ [timeline, effectivePhase],
+ );
+
+ const metricSources = metrics?.metricSources ?? [];
+ const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId);
+ const baseServerSeries: ServerSeriesLike | undefined = useMemo(() => {
+ const src = metrics?.metricSources?.find((m) => m.source.id === metricSourceId);
+ if (src) {
+ return {
+ kvCacheUsage: src.kvCacheUsage,
+ prefixCacheHitRate: src.prefixCacheHitRate,
+ queueDepth: src.queueDepth,
+ promptTokensBySource: src.promptTokensBySource,
+ prefillTps: src.promptTps,
+ decodeTps: src.generationTps,
+ prefixCacheHitsTps: src.prefixCacheHitsTps,
+ hostKvCacheUsage: src.hostKvCacheUsage,
+ kvCacheUsageByEngine: src.kvCacheUsageByEngine,
+ };
+ }
+ return metrics ?? undefined;
+ }, [metrics, metricSourceId]);
+ // Phase-sliced server series (+ matching durationS) consumed by every server
+ // chart. Null only when there are no server metrics at all.
+ const sliced = useMemo(
+ () =>
+ baseServerSeries
+ ? sliceServerSeriesByPhase(
+ baseServerSeries,
+ effectivePhase,
+ boundarySec,
+ metrics?.durationS ?? 0,
+ )
+ : null,
+ [baseServerSeries, effectivePhase, boundarySec, metrics?.durationS],
+ );
+ // Some runs only scrape server metrics during profiling — `chart_series`
+ // starts at the profiling boundary, so the warmup slice collapses to ~0–1
+ // points (just the t=0 origin) even though request-level warmup data exists.
+ // Require ≥2 points in some series to count as real warmup coverage; otherwise
+ // show an explanatory note instead of six silently-blank charts.
+ const slicedHasServerData =
+ (sliced?.series.kvCacheUsage.length ?? 0) > 1 ||
+ (sliced?.series.queueDepth.length ?? 0) > 1 ||
+ (sliced?.series.prefillTps.length ?? 0) > 1 ||
+ (sliced?.series.prefixCacheHitRate.length ?? 0) > 1;
+
+ return (
+
+
+
router.back()}
+ className="inline-flex items-center gap-1 text-sm text-muted-foreground hover:text-foreground"
+ >
+ Back
+
+
·
+
+ Inference chart
+
+
+
+ {siblingsData ? (
+
+ ) : siblingsQuery.isLoading ? (
+
Loading SKU navigator…
+ ) : null}
+
+ {metrics ? (
+
+ ) : metricsQuery.isLoading ? (
+
Loading point metadata…
+ ) : null}
+
+ {metricsQuery.isError && (
+
+ Failed to load trace data for benchmark point #{id}.
+
+ )}
+ {metricsQuery.data === null && !metricsQuery.isLoading && (
+
+ No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf
+ time-series capture, or its source artifacts have expired on GitHub.
+
+ )}
+
+
+
+ {view === 'aggregates' && (
+
+ {siblingIds.length} configs in SKU
+ {aggregatesQuery.isLoading ? ' · loading…' : ''}
+
+ )}
+ {view === 'timeline' && timelineQuery.data && (
+
+ {timelineQuery.data.requests.length} requests
+
+ )}
+
+
+ {view === 'point' && (metricSources.length > 1 || hasWarmup) && (
+
+ )}
+
+ {view === 'aggregates' ? (
+
+ ) : view === 'timeline' ? (
+ timelineQuery.isLoading ? (
+
+ Loading request timeline…
+
+ ) : timelineQuery.data ? (
+
+ ) : (
+
+ No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
+ isn't stored for this row.
+
+ )
+ ) : (
+ <>
+ {effectivePhase === 'warmup' && (
+
+ Showing the warmup phase — a
+ cache-warming pass whose outputs are capped at 1 token. Warmup OSL ≈ 1, and
+ interactivity/decode are blank (single-token outputs have no inter-token latency).
+ {!slicedHasServerData &&
+ ' Warmup server-side metrics aren’t available for this point, so the server charts below are empty — the request-level charts above still reflect warmup.'}
+
+ )}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ >
+ )}
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
new file mode 100644
index 00000000..d4526d24
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -0,0 +1,286 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+import { ChartEmpty, PERCENTILE_COLORS } from './chart-shared';
+
+export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99';
+
+interface PercentileLine {
+ key: PercentileKey;
+ /** Display label in legend / tooltip. */
+ label: string;
+ color: string;
+}
+
+/**
+ * Series drawn by the aggregate chart, in the order they appear in the legend
+ * and hover tooltip: the mean plus p50/p75/p90/p99, colored from the shared
+ * `PERCENTILE_COLORS` palette.
+ */
+const PERCENTILE_LINES: PercentileLine[] = [
+ { key: 'mean', label: 'Mean', color: PERCENTILE_COLORS.mean },
+ { key: 'p50', label: 'P50', color: PERCENTILE_COLORS.p50 },
+ { key: 'p75', label: 'P75', color: PERCENTILE_COLORS.p75 },
+ { key: 'p90', label: 'P90', color: PERCENTILE_COLORS.p90 },
+ { key: 'p99', label: 'P99', color: PERCENTILE_COLORS.p99 },
+];
+
+// Wider bottom/left padding than CHART_PAD — the x-axis carries rotated
+// per-config labels instead of time ticks.
+const PAD = { top: 16, right: 16, bottom: 90, left: 64 };
+
+export interface AggregatePoint {
+ /** Sibling label rendered on x-axis (e.g. "TP8 • c=8"). */
+ label: string;
+ /** Per-percentile value; missing percentiles are dropped from the plot. */
+ values: Partial>;
+ /** Sibling id — purely informational, used in the tooltip title. */
+ id?: number;
+}
+
+/**
+ * Multi-line chart: one x-position per sibling config, one line per
+ * percentile (mean/p50/p75/p90/p99). Designed for the "Aggregates across
+ * configs" view on the agentic detail page.
+ */
+export function AggregateChart({
+ points,
+ unit,
+ yMax,
+ yFmt,
+ width = 720,
+ height = 320,
+}: {
+ points: readonly AggregatePoint[];
+ unit: string;
+ /** Optional fixed y-axis upper bound (e.g. 1 for percentages). */
+ yMax?: number;
+ /** Optional value formatter (e.g. percentage → "30%"). */
+ yFmt?: (v: number) => string;
+ width?: number;
+ height?: number;
+}) {
+ const W = width;
+ const H = height;
+ const fmt = (v: number) =>
+ yFmt
+ ? yFmt(v)
+ : v >= 10000
+ ? new Intl.NumberFormat('en-US').format(Math.round(v))
+ : v.toFixed(v < 10 ? 2 : 0);
+
+ const computed = useMemo(() => {
+ if (points.length === 0) return null;
+ let yMaxComputed = 0;
+ for (const p of points) {
+ for (const line of PERCENTILE_LINES) {
+ const v = p.values[line.key];
+ if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v;
+ }
+ }
+ const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05);
+ const innerW = W - PAD.left - PAD.right;
+ const innerH = H - PAD.top - PAD.bottom;
+ return { yTop, innerW, innerH };
+ }, [points, W, H, yMax]);
+
+ if (!computed) {
+ return ;
+ }
+ const { yTop, innerW, innerH } = computed;
+
+ // X positions: evenly spaced across the inner width.
+ const xOf = (i: number) =>
+ points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW;
+ const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH;
+
+ // 5 y-axis ticks evenly between 0 and yTop.
+ const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4);
+
+ // Resolve hover: snap to nearest sibling index and emit all percentiles
+ // that have data at that x.
+ const resolve = (fraction: number) => {
+ const idx = Math.round(fraction * (points.length - 1));
+ const p = points[Math.max(0, Math.min(points.length - 1, idx))];
+ if (!p) return null;
+ const items: HoverItem[] = [];
+ for (const line of PERCENTILE_LINES) {
+ const v = p.values[line.key];
+ if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+ items.push({ color: line.color, label: line.label, value: fmt(v) });
+ }
+ return { items, title: p.label };
+ };
+
+ return (
+
+
+ {PERCENTILE_LINES.map((line) => (
+
+
+ {line.label}
+
+ ))}
+
+ {points.length} configs · units: {unit}
+
+
+
+ {/* y-axis ticks + gridlines */}
+ {yTicks.map((v, i) => {
+ const y = yOf(v);
+ return (
+
+
+
+ {fmt(v)}
+
+
+ );
+ })}
+
+ {/* X-axis tick labels — one per sibling, rotated 30° to fit. */}
+ {points.map((p, i) => {
+ const x = xOf(i);
+ return (
+
+
+
+ {p.label}
+
+
+ );
+ })}
+
+ {/* X axis baseline */}
+
+
+ {/* Horizontal connecting lines per percentile — faint backdrop so the
+ eye can follow how each percentile changes across configs. */}
+ {PERCENTILE_LINES.map((line) => {
+ const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
+ let prev: { x: number; y: number } | null = null;
+ for (let i = 0; i < points.length; i++) {
+ const v = points[i]!.values[line.key];
+ if (typeof v !== 'number' || !Number.isFinite(v)) {
+ prev = null;
+ continue;
+ }
+ const x = xOf(i);
+ const y = yOf(v);
+ if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
+ prev = { x, y };
+ }
+ return (
+
+ {segments.map((s, j) => (
+
+ ))}
+
+ );
+ })}
+
+ {/* Per-sibling vertical bar spanning the percentile range, with a
+ colored tick at each percentile level. Mean rendered as a small
+ diamond to distinguish from the percentile ticks. */}
+ {points.map((p, i) => {
+ const x = xOf(i);
+ // Collect percentile values present for this sibling.
+ const present = PERCENTILE_LINES.filter(
+ (line) =>
+ typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!),
+ ).map((line) => ({ ...line, value: p.values[line.key]! }));
+ if (present.length === 0) return null;
+ // Only the *percentile* values define the bar extent; mean might be
+ // outside the percentile span on weird distributions.
+ const pctlOnly = present.filter((p2) => p2.key !== 'mean');
+ const bandValues = pctlOnly.length > 0 ? pctlOnly : present;
+ const bandYs = bandValues.map((b) => yOf(b.value));
+ const yLo = Math.min(...bandYs);
+ const yHi = Math.max(...bandYs);
+ return (
+
+
+ {present.map((b) => {
+ const ty = yOf(b.value);
+ if (b.key === 'mean') {
+ // Diamond marker for mean.
+ const s = 4;
+ return (
+
+ );
+ }
+ // Horizontal tick at each percentile.
+ return (
+
+ );
+ })}
+
+ );
+ })}
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx b/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx
new file mode 100644
index 00000000..09252940
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx
@@ -0,0 +1,104 @@
+'use client';
+
+import type { AgenticAggregateMap, MetricPercentiles } from '@/hooks/api/use-agentic-aggregates';
+import type { BenchmarkSibling } from '@/hooks/api/use-benchmark-siblings';
+
+import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
+import { CHART_SIZES } from './chart-shared';
+import { ExpandableChart } from './expandable-chart';
+import { chipLabel } from './sibling-nav';
+
+/** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */
+function toAggPoint(
+ sibling: { id: number; label: string },
+ pct: MetricPercentiles | null | undefined,
+): AggregatePoint {
+ const values: Partial> = {};
+ if (pct) {
+ values.mean = pct.mean;
+ values.p50 = pct.p50;
+ values.p75 = pct.p75;
+ values.p90 = pct.p90;
+ values.p99 = pct.p99;
+ }
+ return { id: sibling.id, label: sibling.label, values };
+}
+
+/** "Aggregates across configs" view: ISL/OSL/KV/prefix stats per SKU sibling. */
+export function AggregatesGrid({
+ siblings,
+ aggregates,
+ isLoading,
+}: {
+ siblings: BenchmarkSibling[];
+ aggregates: AgenticAggregateMap | undefined;
+ isLoading: boolean;
+}) {
+ if (siblings.length === 0) {
+ return (
+
+ SKU sibling list not loaded yet — open a point to populate.
+
+ );
+ }
+ if (isLoading && !aggregates) {
+ return (
+
+ Computing aggregates across {siblings.length} configs… (parsing trace blobs)
+
+ );
+ }
+ const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s) }));
+ const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl));
+ const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl));
+ const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil));
+ const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate));
+ return (
+
+
(
+
+ )}
+ />
+ (
+
+ )}
+ />
+ (
+ `${(v * 100).toFixed(0)}%`}
+ {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+ />
+ )}
+ />
+ (
+ `${(v * 100).toFixed(0)}%`}
+ {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+ />
+ )}
+ />
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
new file mode 100644
index 00000000..24270122
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
@@ -0,0 +1,148 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+
+/**
+ * One row of the floating value tooltip shown by the crosshair overlay that the
+ * charts share: a color swatch, a label, and an already-formatted value.
+ */
+export interface HoverItem {
+ /** Color swatch to render next to the label. */
+ color: string;
+ label: string;
+ value: string;
+ /** Optional faint secondary line (e.g. timestamp under main values). */
+ hint?: string;
+}
+
+interface ChartHoverProps {
+ /** Padding inside the SVG; matches the chart's CHART_PAD. */
+ pad: { top: number; right: number; bottom: number; left: number };
+ /** SVG viewBox dimensions used to render the chart. */
+ width: number;
+ height: number;
+ /**
+ * Called with the cursor's normalized x in [0..1] across the plot area.
+ * Returns `null` to hide the tooltip (e.g. cursor outside data range).
+ */
+ resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null;
+ children: ReactNode;
+}
+
+/**
+ * Wrap a chart's render to add mouse-driven crosshair + tooltip.
+ *
+ * The chart owner renders its bars / lines / axes via `children`; this wrapper
+ * adds an invisible `<rect>` across the plot area to capture pointer events, a
+ * vertical line that follows the cursor, and a floating tooltip on the right
+ * of the cursor (auto-flipping to the left when it would overflow).
+ */
+export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) {
+ const [hover, setHover] = useState<{
+ xPx: number;
+ yPx: number;
+ fraction: number;
+ items: HoverItem[];
+ title?: string;
+ } | null>(null);
+
+ const innerW = width - pad.left - pad.right;
+ const innerH = height - pad.top - pad.bottom;
+
+ const onMove = (e: React.MouseEvent) => {
+ const svg = e.currentTarget.ownerSVGElement;
+ if (!svg) return;
+ const rect = svg.getBoundingClientRect();
+ // Convert client coords → SVG viewBox coords.
+ const sx = ((e.clientX - rect.left) * width) / rect.width;
+ const sy = ((e.clientY - rect.top) * height) / rect.height;
+ const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW));
+ const resolved = resolve(fraction);
+ if (!resolved) {
+ setHover(null);
+ return;
+ }
+ setHover({ xPx: sx, yPx: sy, fraction, items: resolved.items, title: resolved.title });
+ };
+
+ const onLeave = () => setHover(null);
+
+ return (
+
+
+ {children}
+ {hover && (
+
+ )}
+
+
+ {hover && hover.items.length > 0 && (
+
+ )}
+
+ );
+}
+
+function HoverTooltip({
+ xFraction,
+ containerWidth,
+ padLeft,
+ innerW,
+ title,
+ items,
+}: {
+ xFraction: number;
+ containerWidth: number;
+ padLeft: number;
+ innerW: number;
+ title?: string;
+ items: HoverItem[];
+}) {
+ // Position tooltip near the crosshair as a % of the container.
+ // We flip to the cursor's left side when it would overflow the right edge.
+ const xPx = padLeft + xFraction * innerW;
+ const onRight = xPx < containerWidth * 0.55;
+ const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto';
+ const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`;
+ return (
+
+ {title &&
{title}
}
+ {items.map((it, i) => (
+
+
+ {it.label}
+ {it.value}
+
+ ))}
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/chart-shared.tsx b/packages/app/src/components/inference/agentic-point/chart-shared.tsx
new file mode 100644
index 00000000..f00f4532
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-shared.tsx
@@ -0,0 +1,57 @@
+'use client';
+
+/**
+ * Shared presentational constants and helpers for the agentic point-detail
+ * charts (time-series, stacked-area, distribution, aggregate). These charts
+ * are hand-rolled SVG (not the d3-chart library) and share axis padding,
+ * tick formatting, and empty/loading states.
+ */
+
+/** Axis padding shared by the time-series, stacked-area, and distribution charts. */
+export const CHART_PAD = { top: 12, right: 16, bottom: 56, left: 60 } as const;
+
+/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */
+export const CHART_SIZES = {
+ inline: { width: 720, height: 260 },
+ expanded: { width: 1300, height: 520 },
+} as const;
+
+/**
+ * Guide-line colors per percentile, shared by the aggregate chart's lines and
+ * the distribution chart's vertical guides so the same percentile reads as the
+ * same color across the detail page.
+ *
+ * NOTE(review): `mean` and `p95` use the same red. The aggregate chart plots
+ * mean/p50/p75/p90/p99 and the distribution chart plots p50/p75/p90/p95, so the
+ * two never share a chart today — presumably intentional; confirm before adding
+ * a chart that draws both.
+ */
+export const PERCENTILE_COLORS = {
+ mean: '#ef4444',
+ p50: '#3b82f6',
+ p75: '#22c55e',
+ p90: '#f59e0b',
+ p95: '#ef4444',
+ p99: '#a855f7',
+} as const;
+
+/**
+ * Integer tick label: rounds to the nearest integer and adds en-US thousands
+ * separators once the rounded value reaches 10000.
+ *
+ * The threshold is applied to the rounded value, so an input such as 9999.6
+ * renders as "10,000" instead of an unseparated "10000".
+ */
+export const fmtCount = (n: number): string => {
+  const rounded = Math.round(n);
+  return rounded >= 10000 ? new Intl.NumberFormat('en-US').format(rounded) : String(rounded);
+};
+
+/** Seconds → "42s" / "3m 20s" tick label; rounds first so 59.6 → "1m 0s", never "60s" or "1m 60s". */
+export const fmtSeconds = (s: number): string => {
+  const total = Math.round(s);
+  if (total < 60) return `${total}s`;
+  const m = Math.floor(total / 60);
+  return `${m}m ${total % 60}s`;
+};
+
+/** "No data" placeholder sized to match the chart it replaces. */
+export function ChartEmpty({ height = 260 }: { height?: number }) {
+ return (
+
+ No data
+
+ );
+}
+
+/** Loading placeholder for a chart card. */
+export function ChartSkeleton() {
+ return
;
+}
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
new file mode 100644
index 00000000..f55d6131
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
@@ -0,0 +1,53 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetConvId, subagentIdOf } from './request-timeline';
+
+// datasetConvId: everything from the first `::` onward is dropped, leaving the
+// parent (dataset) conversation id.
+describe('datasetConvId', () => {
+  it('returns a plain conversation id unchanged', () => {
+    const cid = '002001296e8a8c38ad9d7cc436d691afc602';
+    expect(datasetConvId(cid)).toBe(cid);
+  });
+
+  it('strips a ::sa: subagent suffix to the parent conv id', () => {
+    const parent = datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7');
+    expect(parent).toBe('002001296e8a8c38ad9d7cc436d691afc602');
+  });
+
+  it('strips a ::fa: forked-agent suffix', () => {
+    const parent = datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007');
+    expect(parent).toBe('02bc0afb13f7a2d9efa86c28511261d85c0e');
+  });
+
+  it('strips at the first :: even with a trailing stream index', () => {
+    const parent = datasetConvId('abc::sa:agent_1:s2');
+    expect(parent).toBe('abc');
+  });
+});
+
+// subagentIdOf: only `::sa:` cids yield an id, and any trailing stream suffix
+// (`:s<n>` or `:aux:<n>`) is removed from it.
+describe('subagentIdOf', () => {
+  it('returns null for a main-conversation cid', () => {
+    const id = subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602');
+    expect(id).toBeNull();
+  });
+
+  it('extracts the subagent id from a ::sa: cid', () => {
+    const id = subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7');
+    expect(id).toBe('subagent_004_27c95af7');
+  });
+
+  it('drops a trailing :s index from the subagent id', () => {
+    const id = subagentIdOf('abc::sa:subagent_001_f552fe6f:s3');
+    expect(id).toBe('subagent_001_f552fe6f');
+  });
+
+  it('drops an :aux: stream suffix from the subagent id', () => {
+    const id = subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011');
+    expect(id).toBe('subagent_001_b00fdc12');
+  });
+
+  it('returns null for a ::fa: forked-agent cid (no matching subagent group)', () => {
+    const id = subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007');
+    expect(id).toBeNull();
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
new file mode 100644
index 00000000..6573d60c
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -0,0 +1,233 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+import { CHART_PAD, ChartEmpty, PERCENTILE_COLORS, fmtCount } from './chart-shared';
+import { quantile } from './time-series-math';
+
+const PAD = CHART_PAD;
+
+/**
+ * Percentile guides overlaid on the histogram. `q` is the fraction handed to
+ * `quantile()` to locate the guide; `label` is the text shown on its legend chip.
+ */
+const GUIDES = [
+ { label: 'p50', q: 0.5, color: PERCENTILE_COLORS.p50 },
+ { label: 'p75', q: 0.75, color: PERCENTILE_COLORS.p75 },
+ { label: 'p90', q: 0.9, color: PERCENTILE_COLORS.p90 },
+ { label: 'p95', q: 0.95, color: PERCENTILE_COLORS.p95 },
+] as const;
+
+/**
+ * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
+ * detail-page card — fills its container width via `viewBox` + 100% width.
+ * Hover shows the bin range + count + cumulative percentile.
+ */
+export function Distribution({
+ values,
+ unit,
+ width = 720,
+ height = 260,
+}: {
+ values: readonly number[];
+ unit: string;
+ width?: number;
+ height?: number;
+}) {
+ const W = width;
+ const H = height;
+
+ const computed = useMemo(() => {
+ if (values.length === 0) return null;
+ const sorted = [...values].toSorted((a, b) => a - b);
+ const min = sorted[0]!;
+ const max = sorted.at(-1)!;
+ const range = Math.max(1e-9, max - min);
+ const innerW = W - PAD.left - PAD.right;
+ const innerH = H - PAD.top - PAD.bottom;
+ const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
+ const counts: number[] = Array.from({ length: nBins }, () => 0);
+ for (const v of values) {
+ const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+ counts[i]!++;
+ }
+ return { sorted, min, max, range, innerW, innerH, nBins, counts };
+ }, [values, W, H]);
+
+ if (!computed) {
+ return ;
+ }
+ const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed;
+ const maxCount = Math.max(...counts, 1);
+ const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+ const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+ const barW = innerW / nBins;
+
+ const fmt = fmtCount;
+
+ // Hover: report the bin range under the cursor, its count, and the cumulative
+ // share of requests up to and including that bin (i.e. at its right edge).
+ const resolve = (fraction: number) => {
+ const v = min + fraction * range;
+ const binIdx = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+ const binLo = min + (binIdx * range) / nBins;
+ const binHi = min + ((binIdx + 1) * range) / nBins;
+ const count = counts[binIdx] ?? 0;
+ // Cumulative % at the bin's right edge.
+ let cumCount = 0;
+ for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 0;
+ const cumPct = (cumCount / values.length) * 100;
+ const items: HoverItem[] = [
+ { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` },
+ { color: 'currentColor', label: 'Count', value: count.toLocaleString() },
+ { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` },
+ ];
+ return { items };
+ };
+
+ const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+ const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
+
+ return (
+
+
+ {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit}
+
+
+ {/* y-axis gridlines + labels */}
+ {yTickVals.map((v, i) => {
+ const y = yScale(v);
+ return (
+
+
+
+ {fmt(v)}
+
+
+ );
+ })}
+
+ {/* Bars */}
+ {counts.map((c, i) => {
+ const h = (c / maxCount) * innerH;
+ const x = PAD.left + i * barW;
+ const y = PAD.top + (innerH - h);
+ return (
+
+ );
+ })}
+
+ {/* Percentile guide lines */}
+ {GUIDES.map(({ q, color }) => {
+ const v = quantile(sorted, q);
+ const x = xScale(v);
+ return (
+
+ );
+ })}
+
+ {/* X axis */}
+
+ {xTickVals.map((v, i) => {
+ const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+ return (
+
+ {fmt(v)}
+
+ );
+ })}
+
+ value ({unit})
+
+
+ count
+
+
+ {/* Percentile legend chips */}
+ {(() => {
+ const chipY = H - 8;
+ const chipW = innerW / GUIDES.length;
+ return GUIDES.map(({ label: ql, q, color }, i) => {
+ const v = quantile(sorted, q);
+ const x = PAD.left + i * chipW;
+ return (
+
+
+
+ {ql} {fmt(v)}
+
+
+ );
+ });
+ })()}
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
new file mode 100644
index 00000000..810530c5
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -0,0 +1,60 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+import { Maximize2 } from 'lucide-react';
+
+import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog';
+import { track } from '@/lib/analytics';
+
+/**
+ * Wraps a chart in a card with a header + expand button. Click the button to
+ * open the chart in a large dialog. The `render` prop receives `expanded:true`
+ * inside the dialog so charts can pick larger width/height.
+ */
+export function ExpandableChart({
+ title,
+ render,
+ controls,
+ testId,
+}: {
+ title: string;
+ render: (expanded: boolean) => ReactNode;
+ controls?: ReactNode;
+ testId?: string;
+}) {
+ const [open, setOpen] = useState(false);
+
+ return (
+
+
+
{title}
+
+ {controls}
+ {
+ track('agentic_chart_expanded', { title });
+ setOpen(true);
+ }}
+ className="text-muted-foreground hover:text-foreground transition-colors"
+ >
+
+
+
+
+ {render(false)}
+
+
+
+
+ {title}
+ {controls}
+
+
+ {render(true)}
+
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx b/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
new file mode 100644
index 00000000..e56ddeee
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
@@ -0,0 +1,130 @@
+'use client';
+
+import type { MetricSource, MetricSourceSeries } from '@/hooks/api/use-trace-server-metrics';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
+
+import type { StagePhase } from './phase-slice';
+
+// Human-readable display name for each server-metrics source role. Looked up by
+// `source.role` when building an endpoint label in `metricSourceLabel`.
+const SOURCE_ROLE_LABEL: Record = {
+ router: 'Router',
+ prefill: 'Prefill',
+ decode: 'Decode',
+ combined: 'Combined',
+ unknown: 'Unknown',
+};
+
+/**
+ * "Role · instance" label for one server-metrics endpoint.
+ *
+ * The instance part is the first available of: `workerId`, `DP <dpRank>`,
+ * `endpointUrl`, `engine <engine>`. When none yields a non-empty value the
+ * label is just the role name.
+ *
+ * @param source - The endpoint descriptor to label.
+ * @returns `"<Role> · <instance>"`, or `"<Role>"` when no instance is known.
+ */
+export function metricSourceLabel(source: MetricSource): string {
+  // Only absent or empty values are skipped. A plain truthiness test would also
+  // discard a numeric 0, which is a legitimate DP rank / engine index.
+  const isPresent = (value: unknown): boolean =>
+    value !== null && value !== undefined && value !== '';
+  const instance =
+    source.workerId ??
+    (isPresent(source.dpRank) ? `DP ${source.dpRank}` : null) ??
+    source.endpointUrl ??
+    (isPresent(source.engine) ? `engine ${source.engine}` : null);
+  const role = SOURCE_ROLE_LABEL[source.role];
+  // An empty-string instance (e.g. an empty workerId) still falls back to the bare role.
+  return instance ? `${role} · ${instance}` : role;
+}
+
+// Warmup vs profiling stage selector. Drives the server-metric charts AND the
+// request-derived charts (ISL/OSL, latency-over-time, in-flight). Only shown
+// when the point actually has a warmup phase.
+// Profiling is listed first; each option carries its own `testId`.
+const STAGE_PHASE_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'profiling', label: 'Profiling', testId: 'stage-phase-profiling' },
+ { value: 'warmup', label: 'Warmup', testId: 'stage-phase-warmup' },
+];
+
+/**
+ * Sticky per-point toolbar: warmup/profiling stage toggle (when the point has
+ * a warmup phase) and the server-metrics endpoint selector (when the point has
+ * more than one source). The parent decides when to render it at all.
+ *
+ * Analytics: changing the stage fires `inference_agentic_phase_changed`;
+ * changing the endpoint fires `inference_agentic_metric_source_changed` with the
+ * selected id, the matching source's role (or `'all'` when no source matches the
+ * id) and its adapter (falling back to `fallbackAdapter`, then `'unknown'`).
+ * In both handlers the parent callback runs before the event is tracked.
+ */
+export function MetricSourceToolbar({
+ hasWarmup,
+ phase,
+ onPhaseChange,
+ metricSources,
+ selectedSource,
+ onSourceChange,
+ fallbackAdapter,
+}: {
+ /** When true the stage toggle is rendered; otherwise the alternate branch is used. */
+ hasWarmup: boolean;
+ /** NOTE(review): presumably the toggle's current value — its use is in markup not visible here. */
+ phase: StagePhase;
+ /** Called with the newly chosen stage. */
+ onPhaseChange: (phase: StagePhase) => void;
+ /** Available endpoints; the selector is only rendered when there is more than one. */
+ metricSources: MetricSourceSeries[];
+ /** NOTE(review): presumably drives the selector's current value — confirm against the markup. */
+ selectedSource: MetricSourceSeries | undefined;
+ /** Called with the chosen selector value (a source id, or the "All endpoints" value). */
+ onSourceChange: (id: string) => void;
+ /** Adapter reported in analytics when the selected source lookup misses. */
+ fallbackAdapter: string | undefined;
+}) {
+ return (
+
+ {hasWarmup ? (
+
+ Stage
+ {
+ onPhaseChange(value);
+ track('inference_agentic_phase_changed', { phase: value });
+ }}
+ ariaLabel="Stage phase"
+ testId="stage-phase-toggle"
+ buttonClassName="px-2.5 py-1 text-xs"
+ />
+
+ ) : (
+
+ )}
+ {metricSources.length > 1 ? (
+
+ Server metrics
+ {
+ onSourceChange(value);
+ const source = metricSources.find((entry) => entry.source.id === value)?.source;
+ track('inference_agentic_metric_source_changed', {
+ source: value,
+ role: source?.role ?? 'all',
+ adapter: source?.adapter ?? fallbackAdapter ?? 'unknown',
+ });
+ }}
+ >
+
+
+
+
+ All endpoints
+ {metricSources.map(({ source }) => (
+
+ {metricSourceLabel(source)}
+
+ ))}
+
+
+
+ ) : null}
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.test.ts b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts
new file mode 100644
index 00000000..ef6cdaab
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts
@@ -0,0 +1,212 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline';
+import {
+ phaseBoundaryNs,
+ phaseBoundarySec,
+ requestsForPhase,
+ sliceServerSeriesByPhase,
+ sliceTimelineByPhase,
+ timelineHasWarmup,
+ type ServerSeriesLike,
+} from './phase-slice';
+
+/**
+ * Test fixture: a profiling-phase request spanning offsets 0..1 with every
+ * optional measurement set to null. `overrides` is spread last, so any field a
+ * test passes replaces the default.
+ */
+function req(overrides: Partial): RequestRecord {
+ return {
+ cid: 'c',
+ ti: 0,
+ wid: 'w',
+ ad: 0,
+ phase: 'profiling',
+ credit: 0,
+ start: 0,
+ ack: null,
+ end: 1,
+ ttftMs: null,
+ tpotMs: null,
+ isl: null,
+ osl: null,
+ cancelled: false,
+ ...overrides,
+ };
+}
+
+/**
+ * Test fixture: wraps `requests` in a version-3 timeline that starts at
+ * `startNs` (default 1000), ends one nanosecond later and reports a 1 s duration.
+ */
+function timeline(requests: RequestRecord[], startNs = 1_000): RequestTimeline {
+  const endNs = startNs + 1;
+  const fixture: RequestTimeline = { version: 3, startNs, endNs, durationS: 1, requests };
+  return fixture;
+}
+
+/**
+ * Test fixture: builds a full `ServerSeriesLike` from a list of timestamps.
+ * Every plain time-series field shares one point list with `value = t * 10`;
+ * queue-depth points use `running = t`, `waiting = t + 1`, `total = 2t + 1`.
+ */
+function makeSeries(ts: number[]): ServerSeriesLike {
+  const shared = ts.map((t) => ({ t, value: t * 10 }));
+  const queueDepth = ts.map((t) => {
+    const waiting = t + 1;
+    return { t, running: t, waiting, total: 2 * t + 1 };
+  });
+  const fixture: ServerSeriesLike = {
+    kvCacheUsage: shared,
+    prefixCacheHitRate: shared,
+    queueDepth,
+    promptTokensBySource: { src: shared },
+    prefillTps: shared,
+    decodeTps: shared,
+    prefixCacheHitsTps: shared,
+    hostKvCacheUsage: shared,
+    kvCacheUsageByEngine: [{ engineLabel: 'e0', points: shared }],
+  };
+  return fixture;
+}
+
+// phaseBoundaryNs: a boundary exists only when BOTH phases are present, and it is
+// the timeline origin plus the earliest profiling start offset (not the first
+// profiling request in array order).
+describe('phaseBoundaryNs', () => {
+ it('returns null when there are no profiling requests', () => {
+ expect(phaseBoundaryNs(timeline([req({ phase: 'warmup', start: 5 })]))).toBeNull();
+ });
+
+ it('returns null when there are no warmup requests', () => {
+ expect(phaseBoundaryNs(timeline([req({ phase: 'profiling', start: 5 })]))).toBeNull();
+ });
+
+ it('returns startNs + earliest profiling start when both phases present', () => {
+ // Profiling starts are deliberately out of order (900 before 700) to pin the min.
+ const t = timeline(
+ [
+ req({ phase: 'warmup', start: 0 }),
+ req({ phase: 'profiling', start: 900 }),
+ req({ phase: 'profiling', start: 700 }),
+ ],
+ 1_000,
+ );
+ expect(phaseBoundaryNs(t)).toBe(1_700);
+ });
+
+ it('returns null for nullish timeline', () => {
+ expect(phaseBoundaryNs(null)).toBeNull();
+ expect(phaseBoundaryNs(undefined)).toBeNull();
+ });
+});
+
+// phaseBoundarySec: the boundary expressed in seconds on the server-metrics axis.
+// Covers the origin-gap rebase, the clamp to 0, and the null fallbacks.
+describe('phaseBoundarySec', () => {
+ it('rebases through absolute ns by subtracting serverMetrics.startNs (origin gap)', () => {
+ // timeline origin and server-metrics origin differ — the classic ~124s gap.
+ const tl = timeline(
+ [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 600 * 1e9 })],
+ 200 * 1e9, // timeline.startNs
+ );
+ // boundaryNs = 200e9 + 600e9 = 800e9 ; serverMetrics origin = 124e9 earlier
+ const boundarySec = phaseBoundarySec({ startNs: 76 * 1e9 }, tl);
+ // (800e9 - 76e9)/1e9 = 724
+ expect(boundarySec).toBe(724);
+ });
+
+ it('clamps a negative mapping to 0', () => {
+ // Boundary at absolute 0 ns, server origin at 5 s → raw result would be -5 s.
+ const tl = timeline(
+ [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 0 })],
+ 0,
+ );
+ expect(phaseBoundarySec({ startNs: 5 * 1e9 }, tl)).toBe(0);
+ });
+
+ it('returns null when serverMetrics missing or no split', () => {
+ const tl = timeline(
+ [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 1e9 })],
+ 0,
+ );
+ expect(phaseBoundarySec(null, tl)).toBeNull();
+ expect(phaseBoundarySec({ startNs: 0 }, timeline([req({ phase: 'profiling' })]))).toBeNull();
+ });
+});
+
+// timelineHasWarmup: true only when a non-profiling request exists; a nullish
+// timeline counts as "no warmup".
+describe('timelineHasWarmup', () => {
+ it('detects warmup presence', () => {
+ expect(timelineHasWarmup(timeline([req({ phase: 'profiling' })]))).toBe(false);
+ expect(timelineHasWarmup(timeline([req({ phase: 'warmup' })]))).toBe(true);
+ expect(timelineHasWarmup(null)).toBe(false);
+ });
+});
+
+// sliceServerSeriesByPhase: pins the passthrough for a null boundary, the
+// half-open split (warmup t < boundary, profiling t >= boundary), the profiling
+// rebase to t = 0, coverage of the nested series shapes, and input immutability.
+describe('sliceServerSeriesByPhase', () => {
+ it('is an identity passthrough (full duration) when boundary is null', () => {
+ const s = makeSeries([0, 1, 2]);
+ const out = sliceServerSeriesByPhase(s, 'profiling', null, 99);
+ // Same object reference, not a copy.
+ expect(out.series).toBe(s);
+ expect(out.durationS).toBe(99);
+ });
+
+ it('warmup keeps t < boundary, no rebase, durationS = boundary', () => {
+ const s = makeSeries([0, 1, 2, 3, 4]);
+ const out = sliceServerSeriesByPhase(s, 'warmup', 2, 5);
+ expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1]); // excludes t===2
+ expect(out.durationS).toBe(2);
+ });
+
+ it('profiling keeps t >= boundary and rebases to start at 0', () => {
+ const s = makeSeries([0, 1, 2, 3, 4]);
+ const out = sliceServerSeriesByPhase(s, 'profiling', 2, 5);
+ expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1, 2]); // 2,3,4 -> 0,1,2
+ expect(out.series.kvCacheUsage.map((p) => p.value)).toEqual([20, 30, 40]); // values preserved
+ expect(out.durationS).toBe(3); // 5 - 2
+ });
+
+ it('slices queueDepth, promptTokensBySource, and kvCacheUsageByEngine; preserves queue fields', () => {
+ const s = makeSeries([0, 1, 2, 3]);
+ const out = sliceServerSeriesByPhase(s, 'profiling', 2, 4);
+ // Original points t=2,3 survive; only `t` is rebased, the other fields are untouched.
+ expect(out.series.queueDepth).toEqual([
+ { t: 0, running: 2, waiting: 3, total: 5 },
+ { t: 1, running: 3, waiting: 4, total: 7 },
+ ]);
+ expect(out.series.promptTokensBySource.src.map((p) => p.t)).toEqual([0, 1]);
+ expect(out.series.kvCacheUsageByEngine[0]!.points.map((p) => p.t)).toEqual([0, 1]);
+ expect(out.series.kvCacheUsageByEngine[0]!.engineLabel).toBe('e0');
+ });
+
+ it('does not mutate the input series', () => {
+ const s = makeSeries([0, 1, 2]);
+ const before = s.kvCacheUsage.map((p) => p.t);
+ sliceServerSeriesByPhase(s, 'profiling', 1, 3);
+ expect(s.kvCacheUsage.map((p) => p.t)).toEqual(before);
+ });
+});
+
+// requestsForPhase: filters by the phase TAG. `isl` doubles as a row id so the
+// assertions can tell which fixture rows survived.
+describe('requestsForPhase', () => {
+ const rs = [
+ req({ phase: 'warmup', isl: 1 }),
+ req({ phase: 'profiling', isl: 2 }),
+ req({ phase: 'unknown', isl: 3 }),
+ ];
+
+ it('profiling selects only profiling rows', () => {
+ expect(requestsForPhase(rs, 'profiling').map((r) => r.isl)).toEqual([2]);
+ });
+
+ it('warmup selects everything that is not profiling', () => {
+ // The 'unknown'-phase row is grouped with warmup.
+ expect(requestsForPhase(rs, 'warmup').map((r) => r.isl)).toEqual([1, 3]);
+ });
+});
+
+// sliceTimelineByPhase: pins the single-phase passthrough, the un-rebased warmup
+// slice, and the profiling slice whose offsets and origin are shifted by the
+// boundary offset. `isl` doubles as a row id for the fixtures.
+describe('sliceTimelineByPhase', () => {
+ // startNs origin = 1000; warmup request at offset 0..50, profiling at 100..300.
+ const tl = timeline(
+ [
+ req({ phase: 'warmup', credit: 0, start: 0, ack: 10, end: 50, isl: 1 }),
+ req({ phase: 'profiling', credit: 90, start: 100, ack: 120, end: 300, isl: 2 }),
+ ],
+ 1_000,
+ );
+ // tl.durationS default = 1 from helper; override for window math.
+ const tlDur: RequestTimeline = { ...tl, durationS: 3 };
+
+ it('returns the input unchanged for a single-phase timeline', () => {
+ const single = timeline([req({ phase: 'profiling', start: 5 })]);
+ expect(sliceTimelineByPhase(single, 'profiling')).toBe(single);
+ });
+
+ it('warmup keeps pre-boundary requests, no rebase, startNs unchanged', () => {
+ const out = sliceTimelineByPhase(tlDur, 'warmup');
+ expect(out.requests.map((r) => r.isl)).toEqual([1]);
+ expect(out.requests[0]!.start).toBe(0); // not rebased
+ expect(out.startNs).toBe(1_000);
+ });
+
+ it('profiling keeps post-boundary requests and rebases offsets + startNs', () => {
+ const out = sliceTimelineByPhase(tlDur, 'profiling');
+ expect(out.requests.map((r) => r.isl)).toEqual([2]);
+ // boundary offset = 100 → rebased: start 100→0, end 300→200, ack 120→20, credit 90→-10
+ expect(out.requests[0]!.start).toBe(0);
+ expect(out.requests[0]!.end).toBe(200);
+ expect(out.requests[0]!.ack).toBe(20);
+ // credit is issued before the boundary, so its rebased offset goes negative
+ expect(out.requests[0]!.credit).toBe(-10);
+ // startNs shifts forward by the boundary offset so absolute time is preserved
+ expect(out.startNs).toBe(1_100);
+ });
+});
diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.ts b/packages/app/src/components/inference/agentic-point/phase-slice.ts
new file mode 100644
index 00000000..e6e17719
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/phase-slice.ts
@@ -0,0 +1,188 @@
+/**
+ * Warmup vs profiling phase slicing for the agentic per-point detail page.
+ *
+ * Agentic trace-replay runs have two phases: a warmup (cache-warming) pass, then
+ * the measured profiling window. The server-metric time-series (`chart_series`)
+ * spans the whole run with no per-point phase label, but the per-request
+ * `request_timeline` IS phase-tagged. We derive the warmup→profiling boundary
+ * from the timeline and slice the server series at it.
+ *
+ * ⚠️ ORIGIN-GAP INVARIANT: the two payloads share the aiperf clock but have
+ * DIFFERENT zero origins — `serverMetrics.startNs` is the first server scrape,
+ * `timeline.startNs` is the first request's credit (observed ~124 s apart in
+ * real runs). The boundary must therefore be rebased through absolute ns by
+ * subtracting `serverMetrics.startNs`; a same-axis offset comparison would be
+ * off by the origin gap. This rebasing lives in `phaseBoundarySec` only.
+ */
+
+import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline';
+import type {
+ QueueDepthPoint,
+ TimeSeriesPoint,
+ TraceServerMetrics,
+} from '@/hooks/api/use-trace-server-metrics';
+
+/** The two stages a per-point view can be scoped to; selects which slice the helpers in this file return. */
+export type StagePhase = 'warmup' | 'profiling';
+
+/**
+ * The subset of server-metric series the per-point charts render. Both the
+ * top-level `TraceServerMetrics` and a per-source object (after the detail page
+ * remaps `promptTps`→`prefillTps`, `generationTps`→`decodeTps`) are assignable.
+ *
+ * Every field listed here is time-indexed and is filtered/rebased by
+ * `sliceServerSeriesByPhase`; fields outside this interface pass through that
+ * function unchanged.
+ */
+export interface ServerSeriesLike {
+ kvCacheUsage: TimeSeriesPoint[];
+ prefixCacheHitRate: TimeSeriesPoint[];
+ queueDepth: QueueDepthPoint[];
+ // Keyed series: each record value is sliced independently.
+ promptTokensBySource: Record;
+ prefillTps: TimeSeriesPoint[];
+ decodeTps: TimeSeriesPoint[];
+ prefixCacheHitsTps: TimeSeriesPoint[];
+ hostKvCacheUsage: TimeSeriesPoint[];
+ // One series per engine; `engineLabel` is carried through slicing as-is.
+ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+/**
+ * Reports whether the timeline holds any request whose phase is something other
+ * than `'profiling'` — every such request is treated as warmup. A null or
+ * undefined timeline yields `false`.
+ */
+export function timelineHasWarmup(timeline: RequestTimeline | null | undefined): boolean {
+  if (!timeline) return false;
+  for (const request of timeline.requests) {
+    if (request.phase !== 'profiling') return true;
+  }
+  return false;
+}
+
+/**
+ * Absolute-ns instant at which profiling begins: `timeline.startNs` plus the
+ * smallest `start` offset among the profiling requests.
+ *
+ * @returns The boundary in absolute ns, or null when the timeline is nullish or
+ *   does not contain BOTH a profiling request and a non-profiling (warmup)
+ *   request — with a single phase there is nothing to split.
+ */
+export function phaseBoundaryNs(timeline: RequestTimeline | null | undefined): number | null {
+  if (!timeline) return null;
+  const { requests, startNs } = timeline;
+  const profilingStarts = requests
+    .filter((request) => request.phase === 'profiling')
+    .map((request) => request.start);
+  // No profiling rows, or nothing but profiling rows (i.e. no warmup): no split.
+  const sawProfiling = profilingStarts.length > 0;
+  const sawWarmup = profilingStarts.length !== requests.length;
+  if (!sawProfiling || !sawWarmup) return null;
+  const earliestStart = profilingStarts.reduce((min, start) => (start < min ? start : min));
+  return startNs + earliestStart;
+}
+
+/**
+ * The profiling-start boundary expressed on the SERVER-METRIC chart's own t-axis
+ * (seconds from `serverMetrics.startNs`). See the origin-gap invariant at the top
+ * of the file — the `- serverMetrics.startNs` subtraction is mandatory.
+ *
+ * Returns null when there's no warmup/profiling split, or `serverMetrics` is
+ * absent (→ callers fall back to the full-run series).
+ *
+ * The result is never negative: a boundary that falls before the server-metrics
+ * origin is clamped to 0.
+ */
+export function phaseBoundarySec(
+ serverMetrics: Pick | null | undefined,
+ timeline: RequestTimeline | null | undefined,
+): number | null {
+ if (!serverMetrics) return null;
+ const boundaryNs = phaseBoundaryNs(timeline);
+ if (boundaryNs === null) return null;
+ // Absolute ns → seconds since the server-metrics origin, floored at 0.
+ return Math.max(0, (boundaryNs - serverMetrics.startNs) / 1e9);
+}
+
+/**
+ * Result of `sliceServerSeriesByPhase`: the (possibly sliced) series object plus
+ * the length in seconds of the window it now covers.
+ */
+export interface PhaseSlicedSeries {
+ series: S;
+ durationS: number;
+}
+
+/**
+ * Slice every server-metric series to one phase:
+ * - warmup: keep points with `t < boundary`, no rebase, `durationS = boundary`
+ * - profiling: keep points with `t >= boundary`, rebased so `t` starts at 0,
+ * `durationS = full - boundary`
+ *
+ * A point exactly at `t === boundary` belongs to profiling. Null boundary
+ * (single-phase point, or no server metrics) → identity passthrough with the
+ * full `durationS`. Pure — returns new objects, never mutates the input.
+ *
+ * NOTE: rebasing the profiling slice to start at 0 makes the cumulative charts
+ * (prompt-token source, unique-input-tokens) read as "since profiling start"
+ * rather than "since run start" — intended.
+ *
+ * The profiling `durationS` is floored at 1, so it never drops to zero or below
+ * when the boundary sits at or past the end of the run. The warmup `durationS`
+ * is the boundary itself and is therefore 0 when the boundary is 0.
+ */
+export function sliceServerSeriesByPhase(
+ series: S,
+ phase: StagePhase,
+ boundarySec: number | null,
+ fullDurationS: number,
+): PhaseSlicedSeries {
+ // No split available: hand back the very same object, not a copy.
+ if (boundarySec === null) return { series, durationS: fullDurationS };
+ const b = boundarySec;
+ // Half-open split at the boundary; only the profiling slice is shifted to t = 0.
+ const keep = phase === 'warmup' ? (t: number) => t < b : (t: number) => t >= b;
+ const rebase = phase === 'profiling' ? (t: number) => t - b : (t: number) => t;
+
+ // Per-shape slicers: each copies the surviving points and rewrites only `t`.
+ const sliceTs = (pts: TimeSeriesPoint[]): TimeSeriesPoint[] =>
+ pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) }));
+ const sliceQd = (pts: QueueDepthPoint[]): QueueDepthPoint[] =>
+ pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) }));
+ const sliceRecord = (
+ rec: Record,
+ ): Record => {
+ const out: Record = {};
+ for (const [k, v] of Object.entries(rec)) out[k] = sliceTs(v);
+ return out;
+ };
+
+ const slicedFields: ServerSeriesLike = {
+ kvCacheUsage: sliceTs(series.kvCacheUsage),
+ prefixCacheHitRate: sliceTs(series.prefixCacheHitRate),
+ queueDepth: sliceQd(series.queueDepth),
+ promptTokensBySource: sliceRecord(series.promptTokensBySource),
+ prefillTps: sliceTs(series.prefillTps),
+ decodeTps: sliceTs(series.decodeTps),
+ prefixCacheHitsTps: sliceTs(series.prefixCacheHitsTps),
+ hostKvCacheUsage: sliceTs(series.hostKvCacheUsage),
+ kvCacheUsageByEngine: series.kvCacheUsageByEngine.map((e) => ({
+ engineLabel: e.engineLabel,
+ points: sliceTs(e.points),
+ })),
+ };
+
+ const durationS = phase === 'warmup' ? b : Math.max(1, fullDurationS - b);
+ // Spread the input first so any extra fields on it survive as-is, then
+ // override the time-indexed fields with their sliced versions.
+ return { series: { ...series, ...slicedFields } as S, durationS };
+}
+
+/**
+ * Selects the request records belonging to one phase by their phase tag.
+ * `'warmup'` returns every record that is NOT tagged `'profiling'`; any other
+ * phase value returns exactly the records tagged `'profiling'`. The input array
+ * is left untouched.
+ */
+export function requestsForPhase(requests: RequestRecord[], phase: StagePhase): RequestRecord[] {
+  const wantProfiling = phase !== 'warmup';
+  return requests.filter((request) => (request.phase === 'profiling') === wantProfiling);
+}
+
+/**
+ * Scopes a whole request timeline to one phase window, mirroring
+ * `sliceServerSeriesByPhase` so request-derived charts and server charts share a
+ * 0-based axis for the same phase.
+ *
+ * The boundary is taken on the REQUEST clock: `phaseBoundaryNs` minus
+ * `timeline.startNs`, i.e. an ns offset from the timeline origin (not the
+ * server-axis `phaseBoundarySec`, which has a different origin — see the file
+ * header). Requests are selected by comparing their `start` offset with that
+ * boundary:
+ * - warmup: keeps requests with `start < boundary`; offsets and `startNs` are
+ *   left as they are; `durationS` is the boundary offset in seconds.
+ * - profiling: keeps requests with `start >= boundary`; `credit`, `start`,
+ *   `ack` (when non-null) and `end` are shifted back by the boundary offset and
+ *   `startNs` is shifted forward by it; `durationS` is the remainder of the
+ *   run, floored at 1.
+ *
+ * Returns the input object itself when there is no warmup/profiling split.
+ * Otherwise returns a new timeline; the original is not modified.
+ */
+export function sliceTimelineByPhase(
+  timeline: RequestTimeline,
+  phase: StagePhase,
+): RequestTimeline {
+  const boundaryNs = phaseBoundaryNs(timeline);
+  if (boundaryNs === null) return timeline;
+
+  const isWarmup = phase === 'warmup';
+  const boundaryOffsetNs = boundaryNs - timeline.startNs; // offset on the request clock
+  const boundaryOffsetS = boundaryOffsetNs / 1e9;
+  // Only the profiling slice is moved so that it starts at offset 0.
+  const shiftNs = phase === 'profiling' ? boundaryOffsetNs : 0;
+
+  const requests: RequestRecord[] = [];
+  for (const record of timeline.requests) {
+    const inWindow = isWarmup
+      ? record.start < boundaryOffsetNs
+      : record.start >= boundaryOffsetNs;
+    if (!inWindow) continue;
+    requests.push({
+      ...record,
+      credit: record.credit - shiftNs,
+      start: record.start - shiftNs,
+      ack: record.ack === null ? null : record.ack - shiftNs,
+      end: record.end - shiftNs,
+    });
+  }
+
+  let durationS: number;
+  if (isWarmup) {
+    durationS = boundaryOffsetS;
+  } else {
+    durationS = Math.max(1, timeline.durationS - boundaryOffsetS);
+  }
+
+  return { ...timeline, startNs: timeline.startNs + shiftNs, requests, durationS };
+}
diff --git a/packages/app/src/components/inference/agentic-point/point-summary.tsx b/packages/app/src/components/inference/agentic-point/point-summary.tsx
new file mode 100644
index 00000000..8a777baa
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/point-summary.tsx
@@ -0,0 +1,50 @@
+'use client';
+
+import type { ReactNode } from 'react';
+
+import type { PointMeta } from '@/hooks/api/use-trace-server-metrics';
+
+/**
+ * Formats a fraction as a percentage with two decimals (0.5 → "50.00%").
+ * null, undefined and NaN render as an em dash.
+ */
+const fmtPct = (v: number | null | undefined): string => {
+  if (v == null || Number.isNaN(v)) return '—';
+  const percent = v * 100;
+  return `${percent.toFixed(2)}%`;
+};
+
+/** One label/value pair in the point summary; `value` may be any renderable node. */
+function MetaLine({ label, value }: { label: string; value: ReactNode }) {
+ return (
+
+ {label}
+ {value}
+
+ );
+}
+
+/**
+ * Selected-point header: config facts (offload, concurrency, cache hit rates, ISL/OSL).
+ *
+ * Conditional parts visible in this component:
+ * - the heading gains " · disagg" when `meta.disagg` is truthy, and
+ *   " · spec=<method>" when `meta.spec_method` is set and is not `'none'`;
+ * - the "GitHub Actions run →" link is only rendered when `meta.run_url` is truthy;
+ * - the ISL and OSL lines are only rendered when the respective value is not null.
+ */
+export function PointSummary({ meta }: { meta: PointMeta }) {
+ return (
+
+
+
+ Selected point
+ {meta.disagg ? ' · disagg' : ''}
+ {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''}
+
+ {meta.run_url && (
+
+ GitHub Actions run →
+
+ )}
+
+
+
+
+
+
+ {meta.isl !== null && }
+ {meta.osl !== null && }
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx b/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx
new file mode 100644
index 00000000..8ca85ac9
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx
@@ -0,0 +1,223 @@
+'use client';
+
+import { useState } from 'react';
+
+import type { RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { CHART_SIZES, ChartEmpty, ChartSkeleton } from './chart-shared';
+import { Distribution } from './distribution';
+import { ExpandableChart } from './expandable-chart';
+import { TimeSeriesChart } from './time-series-chart';
+import {
+ averageSequenceLengthInFlight,
+ rollingRequestMetric,
+ timeRollingAverage,
+ type RequestMetric,
+ type RequestPercentile,
+} from './time-series-math';
+
+// Percentile choices offered by the over-time request-metric charts.
+const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'p75', label: 'P75' },
+ { value: 'p90', label: 'P90' },
+];
+
+// Latency metric choices for charts rendered with `latencySelector` enabled.
+const LATENCY_METRIC_OPTIONS: SegmentedToggleOption<'ttft' | 'e2e'>[] = [
+ { value: 'ttft', label: 'TTFT', testId: 'latency-metric-ttft' },
+ { value: 'e2e', label: 'E2E', testId: 'latency-metric-e2e' },
+];
+
+// Which chart a sequence-length card shows: the per-request value distribution
+// or the in-flight average over time.
+type SequenceMetricView = 'distribution' | 'inflight';
+
+// Options for the sequence-length view toggle; test ids are attached per card
+// in `SequenceMetricCard`.
+const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'distribution', label: 'Distribution' },
+ { value: 'inflight', label: 'In-flight avg' },
+];
+
+// Unofficial-run overlays cannot open this persisted point-detail route: they
+// have no benchmark_results id or stored request timeline. These charts are
+// therefore intentionally limited to DB-backed agentic points.
+//
+// RequestMetricOverTime: expandable chart card of a request metric over time.
+// - `title`: card title (NOTE(review): passed on in markup not visible here).
+// - `metric`: the metric to plot, unless `latencySelector` is set.
+// - `timeline`: source of the request records; when nullish no rolling result is
+//   computed and the point count reads "— points".
+// - `isLoading`: while the timeline is missing, selects between the two
+//   placeholder renderings (presumably skeleton vs empty — markup not visible).
+// - `latencySelector`: when true, the plotted metric is the locally selected
+//   TTFT/E2E choice (initially TTFT) and `metric` is ignored.
+// The percentile toggle starts at P90. Changing either toggle updates local
+// state and then fires an analytics event.
+export function RequestMetricOverTime({
+ title,
+ metric,
+ timeline,
+ isLoading,
+ latencySelector = false,
+}: {
+ title: string;
+ metric: RequestMetric;
+ timeline: RequestTimeline | null | undefined;
+ isLoading: boolean;
+ latencySelector?: boolean;
+}) {
+ const [percentile, setPercentile] = useState('p90');
+ const [latencyMetric, setLatencyMetric] = useState<'ttft' | 'e2e'>('ttft');
+ // The local latency choice overrides the `metric` prop only when the selector is enabled.
+ const selectedMetric = latencySelector ? latencyMetric : metric;
+ // NOTE(review): the trailing 50 is presumably a rolling-window size — confirm against rollingRequestMetric.
+ const result = timeline
+ ? rollingRequestMetric(timeline.requests, selectedMetric, percentile, 50)
+ : null;
+ // Any metric other than 'ttft' / 'e2e' is labelled and coloured as interactivity.
+ const metricLabel =
+ selectedMetric === 'ttft' ? 'TTFT' : selectedMetric === 'e2e' ? 'E2E latency' : 'Interactivity';
+ const color =
+ selectedMetric === 'ttft' ? '#f59e0b' : selectedMetric === 'e2e' ? '#a855f7' : '#06b6d4';
+ // Undefined while there is no timeline, which the header renders as "— points".
+ const pointCount = result?.raw.length;
+ // Latency metrics get a seconds axis and tick format; interactivity gets tok/s/user.
+ const isLatency = selectedMetric !== 'interactivity';
+
+ const controls = (
+
+ {latencySelector && (
+ {
+ setLatencyMetric(value);
+ track('inference_agentic_latency_metric_changed', { metric: value });
+ }}
+ ariaLabel="Latency metric"
+ testId="latency-metric-toggle"
+ />
+ )}
+
+ {pointCount === undefined
+ ? '— points'
+ : `${pointCount.toLocaleString()} ${pointCount === 1 ? 'point' : 'points'}`}
+
+ {
+ setPercentile(value);
+ track('inference_agentic_percentile_changed', {
+ metric: selectedMetric,
+ percentile: value,
+ });
+ }}
+ ariaLabel={`${metricLabel} percentile`}
+ testId={`${selectedMetric}-percentile-toggle`}
+ />
+
+ );
+
+ return (
+ {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!timeline) return isLoading ? : ;
+ return (
+ `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s`
+ : (value) => `${value.toFixed(0)}`
+ }
+ yAxisLabel={isLatency ? `${metricLabel} (s)` : 'Interactivity (tok/s/user)'}
+ {...size}
+ />
+ );
+ }}
+ />
+ );
+}
+
+/**
+ * Expandable card for one sequence-length metric (ISL or OSL) with a toggle
+ * between the per-request distribution (the initial view) and the in-flight
+ * average over time. Both views are derived from the given timeline. Switching
+ * view updates local state and fires
+ * `inference_agentic_sequence_metric_view_changed`.
+ *
+ * In the distribution view the chart is shown only when at least one finite
+ * numeric value exists; otherwise, and in the in-flight view when the timeline
+ * is missing, `timelineLoading` selects between the two placeholder renderings
+ * (presumably skeleton vs empty — markup not visible here).
+ */
+export function SequenceMetricCard({
+ metric,
+ timeline,
+ timelineLoading,
+}: {
+ metric: 'isl' | 'osl';
+ /** Phase-scoped timeline — distribution values + in-flight are both derived from it. */
+ timeline: RequestTimeline | null | undefined;
+ timelineLoading: boolean;
+}) {
+ const [view, setView] = useState('distribution');
+ const acronym = metric.toUpperCase();
+ const fullName = metric === 'isl' ? 'Input sequence length' : 'Output sequence length';
+ // Prefix shared by the toggle's test id and each option's test id.
+ const testPrefix = `${metric}-metric`;
+ // Per-request ISL/OSL for the selected phase (request_timeline carries both,
+ // so the distribution honours the warmup/profiling toggle for free).
+ // Null and non-finite entries are dropped; `undefined` means "no timeline yet".
+ const values = timeline
+ ? timeline.requests
+ .map((r) => r[metric])
+ .filter((v): v is number => typeof v === 'number' && Number.isFinite(v))
+ : undefined;
+ return (
+ ({
+ ...option,
+ testId: `${testPrefix}-${option.value}`,
+ }))}
+ onValueChange={(value) => {
+ setView(value);
+ track('inference_agentic_sequence_metric_view_changed', { metric, view: value });
+ }}
+ ariaLabel={`${acronym} chart view`}
+ testId={`${testPrefix}-toggle`}
+ buttonClassName="px-2 py-1 text-xs"
+ />
+ }
+ render={(expanded) => {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (view === 'distribution') {
+ if (values && values.length > 0)
+ return ;
+ return timelineLoading ? : ;
+ }
+ if (!timeline) return timelineLoading ? : ;
+ const raw = averageSequenceLengthInFlight(timeline.requests, metric);
+ return (
+
+ {metric === 'osl' && (
+
+ Retrospective: final observed OSL is assigned across each request's lifetime.
+
+ )}
+
+
+ );
+ }}
+ />
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
new file mode 100644
index 00000000..cf43f5ae
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -0,0 +1,378 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import {
+ buildRequestTimelineRows,
+ computeStableRowIndex,
+ conversationHref,
+ parseTimelineViewSnapshot,
+ requestIdleStats,
+ splitTimelineCid,
+ type TimelineViewSnapshot,
+} from './request-timeline';
+
+/**
+ * Test fixture: a profiling request on conversation `'conversation'` running
+ * from `start` to `end`. `ti` and `credit` both mirror `start`; every optional
+ * measurement is null. Tests override `cid` / `ti` / provenance via spread.
+ */
+const request = (start: number, end: number): RequestRecord => {
+  const record: RequestRecord = {
+    cid: 'conversation',
+    ti: start,
+    wid: 'worker',
+    ad: 0,
+    phase: 'profiling',
+    credit: start,
+    start,
+    ack: null,
+    end,
+    ttftMs: null,
+    tpotMs: null,
+    isl: null,
+    osl: null,
+    cancelled: false,
+  };
+  return record;
+};
+
+// requestIdleStats: as pinned by these cases, `spanNs` runs from the earliest
+// start to the latest end, and `idleNs` is the part of that span covered by no
+// request at all.
+describe('requestIdleStats', () => {
+ it('sums only gaps where no requests overlap', () => {
+ // Busy: 0–20, 30–50, 70–80 → gaps 20–30 (10) and 50–70 (20).
+ expect(
+ requestIdleStats([
+ request(0, 10),
+ request(5, 20),
+ request(30, 40),
+ request(35, 50),
+ request(70, 80),
+ ]),
+ ).toEqual({ idleNs: 30, spanNs: 80 });
+ });
+
+ it('handles unsorted and nested requests without double-counting busy time', () => {
+ // The 0–100 request covers the other two entirely.
+ expect(requestIdleStats([request(20, 30), request(0, 100), request(10, 40)])).toEqual({
+ idleNs: 0,
+ spanNs: 100,
+ });
+ });
+
+ it('does not count time before the first start or after the final end', () => {
+ expect(requestIdleStats([request(100, 200), request(300, 400)])).toEqual({
+ idleNs: 100,
+ spanNs: 300,
+ });
+ });
+
+ it('returns zeroes for an empty timeline', () => {
+ expect(requestIdleStats([])).toEqual({ idleNs: 0, spanNs: 0 });
+ });
+});
+
+// Row hierarchy built from conversation ids. Cid shapes exercised by these
+// fixtures: `<conv>`, `<conv>::sa:<subagent>`, `<conv>::sa:<subagent>:s<N>`
+// (stream), `<conv>::sa:<subagent>:aux:<id>` and `<conv>::aux:<id>`, where the
+// aux id may itself contain colons (e.g. `red:002`).
+describe('subagent timeline hierarchy', () => {
+ it('parses aux lanes separately from their parent subagent id', () => {
+ expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:011')).toEqual({
+ parent: 'conv',
+ subagentBase: 'subagent_001_abcd',
+ stream: null,
+ aux: '011',
+ });
+ });
+
+ it('renders aux requests as always-visible children of their subagent', () => {
+ const records = [
+ { ...request(0, 10), cid: 'conv' },
+ { ...request(10, 30), cid: 'conv::sa:subagent_001_abcd' },
+ { ...request(12, 20), cid: 'conv::sa:subagent_001_abcd:aux:011' },
+ { ...request(14, 24), cid: 'conv::sa:subagent_001_abcd:aux:012' },
+ { ...request(40, 50), cid: 'conv::sa:subagent_002_ef01' },
+ ];
+
+ const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+ expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([
+ { kind: 'parent', depth: 0 },
+ { kind: 'subagent', depth: 1 },
+ { kind: 'aux', depth: 2 },
+ { kind: 'aux', depth: 2 },
+ { kind: 'subagent', depth: 1 },
+ ]);
+ // The subagent row holds only its own request; aux requests live on their own rows.
+ expect(rows[1]!.requests.map((record) => record.cid)).toEqual(['conv::sa:subagent_001_abcd']);
+ expect(rows[1]!.auxCount).toBe(2);
+ expect(rows[2]!.label).toBe('aux 011 · parallel');
+ expect(rows[3]!.label).toBe('aux 012 · parallel');
+ });
+
+ it('keeps aux lanes visible while primary streams remain collapsed', () => {
+ const records = [
+ { ...request(10, 20), cid: 'conv::sa:subagent_001_abcd:s0' },
+ { ...request(12, 22), cid: 'conv::sa:subagent_001_abcd:s1' },
+ { ...request(14, 18), cid: 'conv::sa:subagent_001_abcd:aux:001' },
+ ];
+
+ const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+ // Both streams fold into the single subagent row; the aux lane keeps its own row.
+ expect(rows.map((row) => row.kind)).toEqual(['parent', 'subagent', 'aux']);
+ expect(rows[1]!.requests).toHaveLength(2);
+ expect(rows[2]!.requests).toHaveLength(1);
+ });
+
+ it('parses aux lanes hanging directly off the main conversation', () => {
+ expect(splitTimelineCid('conv::aux:000')).toEqual({
+ parent: 'conv',
+ subagentBase: null,
+ stream: null,
+ aux: '000',
+ });
+ expect(splitTimelineCid('conv::aux:red:002')).toEqual({
+ parent: 'conv',
+ subagentBase: null,
+ stream: null,
+ aux: 'red:002',
+ });
+ expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:red:002')).toEqual({
+ parent: 'conv',
+ subagentBase: 'subagent_001_abcd',
+ stream: null,
+ aux: 'red:002',
+ });
+ });
+
+ it('nests main-agent aux lanes under the parent conversation row', () => {
+ const records = [
+ { ...request(0, 10), cid: 'conv' },
+ { ...request(2, 8), cid: 'conv::aux:001' },
+ { ...request(4, 12), cid: 'conv::aux:red:002' },
+ { ...request(20, 30), cid: 'conv::sa:subagent_001_abcd' },
+ ];
+
+ const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+ expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([
+ { kind: 'parent', depth: 0 },
+ { kind: 'aux', depth: 1 },
+ { kind: 'aux', depth: 1 },
+ { kind: 'subagent', depth: 1 },
+ ]);
+ expect(rows[0]!.requests.map((record) => record.cid)).toEqual(['conv']);
+ expect(rows[1]!.label).toBe('aux 001 · parallel');
+ expect(rows[1]!.parentRowKey).toBe('conv');
+ expect(rows[2]!.label).toBe('aux red:002 · parallel');
+ // Aux lanes inherit the parent conversation's color.
+ expect(rows[1]!.color).toBe(rows[0]!.color);
+ expect(rows[2]!.color).toBe(rows[0]!.color);
+ });
+
+ it('groups main-agent aux requests with their parent for stable order/color', () => {
+ const records = [
+ { ...request(50, 60), cid: 'other' },
+ { ...request(0, 10), cid: 'conv::aux:000' },
+ { ...request(5, 15), cid: 'conv' },
+ ];
+ const index = computeStableRowIndex(records, 'conversation');
+ // 'conv' groups with its aux lane (earliest start 0) and sorts before 'other'.
+ expect([...index.keys()].toSorted()).toEqual(['conv', 'other']);
+ expect(index.get('conv')).toBe(0);
+ expect(index.get('other')).toBe(1);
+ });
+
+ it('deep-links a main-agent aux request to the parent conversation without sa', () => {
+ expect(conversationHref('slug', { ...request(0, 10), cid: 'abc123::aux:red:002', ti: 3 })).toBe(
+ '/datasets/slug/conversations/abc123?turn=3',
+ );
+ });
+});
+
+// conversationHref: link shape is `/datasets/<slug>/conversations/<conv>?turn=<ti>`.
+// As pinned below, a subagent cid adds `&sa=<subagent>` (stream suffix dropped),
+// while records carrying source provenance use `srcTrace` as the conversation id
+// and add `&raw=<srcOuter>` (plus `&inner=<srcInner>` for subagent children)
+// instead of `sa`.
+describe('conversationHref', () => {
+ it('builds a turn-carrying dataset link for a main-conversation request', () => {
+ expect(
+ conversationHref('cc-traces-weka-062126', { ...request(0, 10), cid: 'abc123', ti: 4 }),
+ ).toBe('/datasets/cc-traces-weka-062126/conversations/abc123?turn=4');
+ });
+
+ it('carries the subagent id and strips the ::sa suffix from the conv id', () => {
+ expect(
+ conversationHref('slug', {
+ ...request(0, 10),
+ cid: 'abc123::sa:subagent_001_bf1c5c16:s2',
+ ti: 7,
+ }),
+ ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16');
+ });
+
+ it('uses raw source provenance for flattened-agent dataset links', () => {
+ expect(
+ conversationHref('slug', {
+ ...request(0, 10),
+ cid: '02bc0afb13f7a2d9efa86c28511261d85c0e::fa:003',
+ ti: 3,
+ srcTrace: '02bc0afb13f7a2d9efa86c28511261d85c0e',
+ srcOuter: 204,
+ srcKind: 'weka_flat',
+ }),
+ ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204');
+ });
+
+ it('uses raw nested source provenance for subagent child links', () => {
+ expect(
+ conversationHref('slug', {
+ ...request(0, 10),
+ cid: '117ebe75819d050f308a0a81647893abd02d::sa:subagent_010_32ee2daa',
+ ti: 16,
+ srcTrace: '117ebe75819d050f308a0a81647893abd02d',
+ srcOuter: 39,
+ srcInner: 16,
+ srcKind: 'weka_subagent',
+ }),
+ ).toBe(
+ '/datasets/slug/conversations/117ebe75819d050f308a0a81647893abd02d?turn=16&raw=39&inner=16',
+ );
+ });
+});
+
+describe('stable row order + color across phase filters', () => {
+  // Fixture shape: conversations A and B run in both warmup and profiling,
+  // while C exists only in profiling. By global first start they read
+  // A (0) < B (10) < C (50). The bug under test: rows used to be re-sorted and
+  // re-colored from whichever phase subset was visible, so toggling the phase
+  // moved a conversation to another row and changed its color.
+  function rec(
+    cid: string,
+    phase: RequestRecord['phase'],
+    start: number,
+    end: number,
+  ): RequestRecord {
+    return { ...request(start, end), cid, phase };
+  }
+
+  type Rows = ReturnType<typeof buildRequestTimelineRows>;
+  // Records that belong to one phase.
+  const inPhase = (records: RequestRecord[], phase: RequestRecord['phase']) =>
+    records.filter((record) => record.phase === phase);
+  // Top-level (kind === 'parent') rows only.
+  const parentsOf = (rows: Rows) => rows.filter((row) => row.kind === 'parent');
+  const labelsOf = (rows: Rows) => rows.map((row) => row.label);
+  const colorsByLabel = (rows: Rows) =>
+    Object.fromEntries(rows.map((row) => [row.label, row.color]));
+
+  const full: RequestRecord[] = [
+    rec('A', 'warmup', 0, 5),
+    rec('A', 'profiling', 100, 110),
+    rec('B', 'warmup', 10, 15),
+    rec('B', 'profiling', 120, 130),
+    rec('C', 'profiling', 50, 60), // profiling-only; earliest profiling start
+  ];
+
+  it('keeps each conversation in the same position and color when the phase changes', () => {
+    const index = computeStableRowIndex(full, 'conversation');
+    const rowsFor = (phase: RequestRecord['phase']) =>
+      parentsOf(buildRequestTimelineRows(inPhase(full, phase), 'conversation', new Set(), index));
+    const warmupRows = rowsFor('warmup');
+    const profilingRows = rowsFor('profiling');
+
+    // Position: A precedes B in both phases. C appears only in profiling and is
+    // placed after A/B by its global index, not first by its profiling start.
+    expect(labelsOf(warmupRows)).toEqual(['A', 'B']);
+    expect(labelsOf(profilingRows)).toEqual(['A', 'B', 'C']);
+
+    // Color: the same per conversation in both phases, different between them.
+    const warmupColors = colorsByLabel(warmupRows);
+    const profilingColors = colorsByLabel(profilingRows);
+    expect(warmupColors.A).toBe(profilingColors.A);
+    expect(warmupColors.B).toBe(profilingColors.B);
+    expect(warmupColors.A).not.toBe(warmupColors.B);
+  });
+
+  it('phase-spanning conversations occupy the same ABSOLUTE row in both phase views', () => {
+    // W1/W2 are warmup-only and start earliest. Ordered purely by global start
+    // they would sit above A/B in the warmup view yet vanish from the profiling
+    // view, shifting every shared row upward on toggle. Conversations that span
+    // both phases must therefore lead, keeping the top block identical.
+    const data: RequestRecord[] = [
+      rec('W1', 'warmup', 0, 2),
+      rec('W2', 'warmup', 3, 4),
+      rec('A', 'warmup', 5, 8),
+      rec('A', 'profiling', 100, 110),
+      rec('B', 'warmup', 10, 15),
+      rec('B', 'profiling', 120, 130),
+      rec('P', 'profiling', 50, 60),
+    ];
+    const index = computeStableRowIndex(data, 'conversation');
+    const parentLabels = (phase: RequestRecord['phase']) =>
+      labelsOf(
+        parentsOf(buildRequestTimelineRows(inPhase(data, phase), 'conversation', new Set(), index)),
+      );
+    // The shared block [A, B] holds rows 0 and 1 in both views; phase-unique
+    // conversations follow.
+    expect(parentLabels('warmup')).toEqual(['A', 'B', 'W1', 'W2']);
+    expect(parentLabels('profiling')).toEqual(['A', 'B', 'P']);
+  });
+
+  it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => {
+    // Sanity check of the self-contained path (no index argument): ordering
+    // falls back to the subset's own first-start times, which is the behavior
+    // the shared index exists to override.
+    const profilingOnly = parentsOf(
+      buildRequestTimelineRows(inPhase(full, 'profiling'), 'conversation', new Set()),
+    );
+    // C (start 50) leads here, ahead of A (100) and B (120).
+    expect(labelsOf(profilingOnly)).toEqual(['C', 'A', 'B']);
+  });
+});
+
+describe('parseTimelineViewSnapshot', () => {
+  // Fully-populated valid snapshot that the individual cases perturb.
+  const baseline: TimelineViewSnapshot = {
+    viewStart: 1_000,
+    viewEnd: 5_000,
+    rowMode: 'worker',
+    phaseFilter: 'warmup',
+    expanded: ['conv::sa:subagent_001_abcd'],
+    scrollTop: 240,
+    scrollLeft: 80,
+  };
+  // Serialize a value to JSON and feed it to the parser.
+  const parseValue = (value: unknown) => parseTimelineViewSnapshot(JSON.stringify(value));
+
+  it('round-trips a full snapshot', () => {
+    expect(parseValue(baseline)).toEqual(baseline);
+  });
+
+  it('round-trips the profiling phase and rejects the removed "all" value', () => {
+    const phaseAfterParse = (phaseFilter: string) =>
+      parseValue({ ...baseline, phaseFilter })?.phaseFilter;
+    expect(phaseAfterParse('profiling')).toBe('profiling');
+    // 'all' is no longer a valid phase, so it falls back to the profiling default.
+    expect(phaseAfterParse('all')).toBe('profiling');
+  });
+
+  it('returns null for absent or unparseable input', () => {
+    for (const raw of [null, '', '{not json', '42']) {
+      expect(parseTimelineViewSnapshot(raw)).toBeNull();
+    }
+  });
+
+  it('preserves a null viewEnd (not zoomed) and rejects non-finite viewEnd', () => {
+    expect(parseValue({ ...baseline, viewEnd: null })?.viewEnd).toBeNull();
+    // NaN / Infinity cannot be expressed in JSON; a malformed string value stands
+    // in for them and must come back as null.
+    expect(parseTimelineViewSnapshot('{"viewEnd":"oops"}')?.viewEnd).toBeNull();
+  });
+
+  it('falls back to defaults for invalid enums and missing numbers', () => {
+    const defaults = {
+      viewStart: 0,
+      viewEnd: null,
+      rowMode: 'conversation',
+      phaseFilter: 'profiling',
+      expanded: [],
+      scrollTop: 0,
+      scrollLeft: 0,
+    };
+    expect(parseTimelineViewSnapshot('{}')).toEqual(defaults);
+
+    const bogus = parseValue({
+      rowMode: 'nope',
+      phaseFilter: 'nope',
+      viewStart: 'x',
+      scrollTop: null,
+    })!;
+    expect(bogus.rowMode).toBe('conversation');
+    expect(bogus.phaseFilter).toBe('profiling');
+    expect(bogus.viewStart).toBe(0);
+    expect(bogus.scrollTop).toBe(0);
+  });
+
+  it('drops non-string entries from the expanded list', () => {
+    const expandedOf = (raw: string) => parseTimelineViewSnapshot(raw)!.expanded;
+    expect(expandedOf('{"expanded":["a",1,null,"b"]}')).toEqual(['a', 'b']);
+    expect(expandedOf('{"expanded":"nope"}')).toEqual([]);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
new file mode 100644
index 00000000..18cb76d5
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -0,0 +1,586 @@
+'use client';
+
+import { useCallback, useLayoutEffect, useMemo, useRef, useState } from 'react';
+import { useRouter } from 'next/navigation';
+
+import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { sliceTimelineByPhase } from './phase-slice';
+import { TimelineBars } from './timeline-bars';
+import { formatDuration } from './timeline-format';
+import {
+ CHART_WIDTH,
+ HEADER_HEIGHT,
+ LABEL_WIDTH,
+ PADDING_RIGHT,
+ ROW_GAP,
+ ROW_HEIGHT,
+ TIMELINE_BODY_MAX_HEIGHT,
+ timelineSvgHeight,
+} from './timeline-layout';
+import {
+ buildRequestTimelineRows,
+ computeStableRowIndex,
+ conversationHref,
+ requestIdleStats,
+ type RequestTimelineRow,
+ type RowMode,
+} from './timeline-rows';
+import type { SortedRequestTimes } from './timeline-cursor-stats';
+import {
+ consumeTimelineViewSnapshot,
+ saveTimelineViewSnapshot,
+ type PhaseFilter,
+} from './timeline-view-snapshot';
+import {
+ CursorPopover,
+ TimelineTooltip,
+ type CursorState,
+ type TooltipData,
+} from './timeline-tooltips';
+
+// Stable public API: pure helpers and types live in focused modules, but
+// external consumers (detail page, tests) import them from here.
+export {
+ buildRequestTimelineRows,
+ computeStableRowIndex,
+ conversationHref,
+ datasetConvId,
+ requestIdleStats,
+ splitTimelineCid,
+ subagentIdOf,
+} from './timeline-rows';
+export type { RequestIdleStats, RequestTimelineRow } from './timeline-rows';
+export { parseTimelineViewSnapshot } from './timeline-view-snapshot';
+export type { TimelineViewSnapshot } from './timeline-view-snapshot';
+
+/**
+ * Gantt-style request timeline for one agentic benchmark point.
+ *
+ * Rows are conversations (or workers — toggle in the header). Bars are
+ * individual HTTP requests, drawn from request_start to request_end with a
+ * thin lead-in segment from credit_issued (load gen queue). Shift+scroll
+ * zooms, drag pans, hover shows per-request stats.
+ *
+ * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy.
+ */
+
+const ROW_MODE_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' },
+ { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' },
+];
+
+const PHASE_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' },
+ { value: 'warmup', label: 'Warmup', testId: 'timeline-phase-warmup' },
+];
+
+// Horizontal extent of the plot area (chart width minus the right padding).
+// The zoom, pan and cursor handlers divide pixel distances by this value to
+// convert between svg-local x positions and ns offsets.
+const PLOT_WIDTH = CHART_WIDTH - PADDING_RIGHT;
+
+export function RequestTimelineView({
+ data,
+ datasetSlug,
+ pointId,
+}: {
+ data: RequestTimeline;
+ /** Source dataset slug for this run; enables click-to-conversation deep links. */
+ datasetSlug?: string | null;
+ /** benchmark_results.id — keys the per-point view-state snapshot for restore. */
+ pointId: number;
+}) {
+ const router = useRouter();
+ const [rowMode, setRowMode] = useState('conversation');
+ const [phaseFilter, setPhaseFilter] = useState('profiling');
+ const [tooltip, setTooltip] = useState(null);
+
+ // The scroll container (vertical row scroll + horizontal chart scroll) and a
+ // ref mirror of the live view state, so click-through can snapshot the exact
+ // position without rebuilding openConversation on every zoom/pan tick.
+ const scrollRef = useRef(null);
+ const liveStateRef = useRef<{
+ viewStart: number;
+ viewEnd: number | null;
+ rowMode: RowMode;
+ phaseFilter: PhaseFilter;
+ expandedSubagents: ReadonlySet;
+ }>({
+ viewStart: 0,
+ viewEnd: null,
+ rowMode: 'conversation',
+ phaseFilter: 'profiling',
+ expandedSubagents: new Set(),
+ });
+
+  // Click-through to the dataset conversation page. A no-op when the run has
+  // no dataset slug. Before navigating, the live zoom/scroll/filter state is
+  // persisted per point so the browser back button can restore it.
+  const openConversation = useCallback(
+    (req: RequestRecord) => {
+      if (!datasetSlug) return;
+      const scroller = scrollRef.current;
+      if (scroller) {
+        const state = liveStateRef.current;
+        const expanded = Array.from(state.expandedSubagents);
+        saveTimelineViewSnapshot(pointId, {
+          viewStart: state.viewStart,
+          viewEnd: state.viewEnd,
+          rowMode: state.rowMode,
+          phaseFilter: state.phaseFilter,
+          expanded,
+          scrollTop: scroller.scrollTop,
+          scrollLeft: scroller.scrollLeft,
+        });
+      }
+      track('agentic_timeline_to_dataset', { slug: datasetSlug });
+      const href = conversationHref(datasetSlug, req);
+      router.push(href);
+    },
+    [datasetSlug, router, pointId],
+  );
+ // Which multi-stream subagents currently have their per-stream rows
+ // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
+ const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set());
+  const toggleSubagent = useCallback((key: string) => {
+    setExpandedSubagents((current) => {
+      // Copy first so React sees a new Set identity.
+      const updated = new Set(current);
+      // Set#delete returns false when the key was absent, so add it in that case.
+      if (!updated.delete(key)) updated.add(key);
+      return updated;
+    });
+  }, []);
+ const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
+
+ // The phase toggle only means something when warmup requests are actually
+ // present. aiperf's profile_export only contains profiling-phase requests, so
+ // in practice every record is `profiling` and the toggle is a no-op — hide it
+ // unless a non-profiling request exists (keeps it working if warmup is ever
+ // exported).
+ const hasWarmup = useMemo(
+ () => data.requests.some((r) => r.phase !== 'profiling'),
+ [data.requests],
+ );
+
+ // Apply phase filter, then group into rows. Uses the SAME time-boundary
+ // slicing as the per-point charts (sliceTimelineByPhase) rather than the
+ // per-request phase LABEL, so the Gantt and the charts agree on exactly which
+ // requests belong to each phase (they diverge only when a warmup-labelled
+ // request starts after the first profiling request). With no warmup data the
+ // boundary is null and this is an identity passthrough — the filter collapses
+ // to "profiling" regardless of the (hidden) toggle state.
+ const filtered = useMemo(
+ () => sliceTimelineByPhase(data, hasWarmup ? phaseFilter : 'profiling').requests,
+ [data, phaseFilter, hasWarmup],
+ );
+ // Stable order/color per conversation (or worker), computed over the FULL
+ // request set — NOT the phase-filtered subset — so a row keeps its position
+ // and color when the user toggles between warmup and profiling.
+ const stableRowIndex = useMemo(
+ () => computeStableRowIndex(data.requests, rowMode),
+ [data.requests, rowMode],
+ );
+ const rows = useMemo(
+ () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents, stableRowIndex),
+ [filtered, rowMode, expandedSubagents, stableRowIndex],
+ );
+ const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]);
+
+  // Ascending copies of the credit / start / end timestamp columns of the
+  // visible requests. The cursor-time stats popover uses them to count
+  // "running / waiting at time t" in O(log n), which keeps it smooth on very
+  // large runs.
+  const sortedTimes = useMemo(() => {
+    const ascending = (a: number, b: number) => a - b;
+    const sortedColumn = (pick: (r: RequestRecord) => number) =>
+      filtered.map(pick).toSorted(ascending);
+    return {
+      credits: sortedColumn((r) => r.credit),
+      starts: sortedColumn((r) => r.start),
+      ends: sortedColumn((r) => r.end),
+    };
+  }, [filtered]);
+
+ // Cursor state (vertical line + stats popover). null when the mouse
+ // isn't over the chart. xPx is svg-local; tNs is the ns offset from
+ // dataStart that the cursor is pointing at.
+ const [cursor, setCursor] = useState(null);
+
+  // Time extent of the visible requests: earliest credit to latest end. It is
+  // derived from the filtered set, so hiding warmup shrinks the window to the
+  // profiling phase. An empty set yields the placeholder extent [0, 1].
+  const { dataStart, dataEnd } = useMemo(() => {
+    if (filtered.length === 0) return { dataStart: 0, dataEnd: 1 };
+    let earliest = Number.POSITIVE_INFINITY;
+    let latest = Number.NEGATIVE_INFINITY;
+    for (const { credit, end } of filtered) {
+      if (credit < earliest) earliest = credit;
+      if (end > latest) latest = end;
+    }
+    return { dataStart: earliest, dataEnd: latest };
+  }, [filtered]);
+ const totalNs = Math.max(dataEnd - dataStart, 1);
+
+ // Visible window state (ns offsets, relative to dataStart).
+ const [viewStart, setViewStart] = useState(0);
+ const [viewEnd, setViewEnd] = useState(null);
+ const vStart = viewStart;
+ const vEnd = viewEnd ?? totalNs;
+ const visibleDur = Math.max(vEnd - vStart, 1);
+ const isZoomed = viewEnd !== null;
+
+ // Mirror the live view state into a ref so the click-through snapshot reads
+ // the latest values without rebuilding openConversation on every zoom tick.
+ liveStateRef.current = { viewStart, viewEnd, rowMode, phaseFilter, expandedSubagents };
+
+ // Restore the snapshot written on click-through (e.g. open a request in the
+ // dataset flamegraph, then hit the browser back button). Runs once per mount,
+ // keyed by point id; the snapshot is consumed so a later reload starts fresh.
+ // Scroll is applied after the restored filters/expansions re-render the rows
+ // (rAF fires after that synchronous commit, before paint — no visible jump).
+ useLayoutEffect(() => {
+ const snapshot = consumeTimelineViewSnapshot(pointId);
+ if (!snapshot) return;
+ setRowMode(snapshot.rowMode);
+ setPhaseFilter(snapshot.phaseFilter);
+ setExpandedSubagents(new Set(snapshot.expanded));
+ setViewStart(snapshot.viewStart);
+ setViewEnd(snapshot.viewEnd);
+ const target = { top: snapshot.scrollTop, left: snapshot.scrollLeft };
+ requestAnimationFrame(() => {
+ const el = scrollRef.current;
+ if (!el) return;
+ el.scrollTop = target.top;
+ el.scrollLeft = target.left;
+ });
+ // setState setters are stable; only re-run if the point itself changes.
+ // eslint-disable-next-line react-hooks/exhaustive-deps
+ }, [pointId]);
+
+ const svgHeight = timelineSvgHeight(rows.length);
+
+ // Native (non-passive) wheel handler: React's synthetic onWheel is attached
+ // passively, so preventDefault there is silently ignored and shift+scroll
+ // would zoom AND horizontally pan the scroll container.
+ const zoomSvgRef = useRef(null);
+  // Shift+wheel zoom around the pointer. Each tick scales the visible window
+  // by 1.2x (or 1/1.2x) while keeping the time under the cursor fixed, then
+  // shifts the window back inside [0, totalNs].
+  const handleWheel = useCallback(
+    (e: WheelEvent) => {
+      // Zoom only on shift+scroll so plain scrolling keeps its native meaning
+      // (page / row-container scroll) instead of being hijacked by the chart.
+      if (!e.shiftKey) return;
+      e.preventDefault();
+      const rect = (e.currentTarget as SVGSVGElement).getBoundingClientRect();
+      const mouseX = e.clientX - rect.left;
+      // Pointer position as a 0..1 fraction of the plot width.
+      const mouseRatio = Math.max(0, Math.min(1, mouseX / PLOT_WIDTH));
+      const curStart = vStart;
+      const curEnd = vEnd;
+      const curDur = curEnd - curStart;
+      // With shift held, most browsers report the wheel delta on deltaX.
+      const delta = e.deltaY || e.deltaX;
+      // Positive delta widens the window (zoom out); anything else narrows it.
+      const factor = delta > 0 ? 1.2 : 1 / 1.2;
+      // Window length is bounded to [0.1% of the data span, the full span].
+      const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs);
+      // Time under the cursor; it keeps the same screen fraction after zooming.
+      const pivot = curStart + mouseRatio * curDur;
+      let newStart = pivot - mouseRatio * newDur;
+      let newEnd = pivot + (1 - mouseRatio) * newDur;
+      // Shift right if the window fell off the left edge...
+      if (newStart < 0) {
+        newEnd -= newStart;
+        newStart = 0;
+      }
+      // ...and left if it fell off the right edge.
+      if (newEnd > totalNs) {
+        newStart -= newEnd - totalNs;
+        newEnd = totalNs;
+        if (newStart < 0) newStart = 0;
+      }
+      // A window covering >= 99% of the span counts as un-zoomed: store the
+      // null sentinel so isZoomed turns false and vEnd tracks totalNs.
+      if (newEnd - newStart >= totalNs * 0.99) {
+        setViewStart(0);
+        setViewEnd(null);
+      } else {
+        setViewStart(newStart);
+        setViewEnd(newEnd);
+      }
+    },
+    [vStart, vEnd, totalNs],
+  );
+
+ useLayoutEffect(() => {
+ const svg = zoomSvgRef.current;
+ if (!svg) return;
+ svg.addEventListener('wheel', handleWheel, { passive: false });
+ return () => svg.removeEventListener('wheel', handleWheel);
+ }, [handleWheel]);
+
+ const handleMouseDown = useCallback(
+ (e: React.MouseEvent) => {
+ if (e.button !== 0) return;
+ dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd };
+ },
+ [vStart, vEnd],
+ );
+
+  // Mouse-move handler for the chart svg. It has two modes:
+  //  - while a drag is active (dragRef set on mouse down) it pans the visible
+  //    window and clears any tooltip / cursor;
+  //  - otherwise it tracks the pointer for the crosshair + stats popover.
+  const handleMouseMove = useCallback(
+    (e: React.MouseEvent) => {
+      // Dragging takes precedence over cursor tracking — panning the view.
+      if (dragRef.current) {
+        const dx = e.clientX - dragRef.current.startX;
+        const nsPerPx = visibleDur / PLOT_WIDTH;
+        // Dragging right moves the window towards earlier times.
+        const delta = -dx * nsPerPx;
+        let ns = dragRef.current.vs + delta;
+        let ne = dragRef.current.ve + delta;
+        const dur = ne - ns;
+        // Clamp to [0, totalNs] while preserving the window length.
+        if (ns < 0) {
+          ns = 0;
+          ne = dur;
+        }
+        if (ne > totalNs) {
+          ne = totalNs;
+          ns = totalNs - dur;
+          if (ns < 0) ns = 0;
+        }
+        // Same normalization as handleWheel: a window covering (almost) the
+        // whole span is stored as the un-zoomed sentinel. Otherwise dragging an
+        // un-zoomed chart would pin viewEnd to the current totalNs, flipping
+        // isZoomed on with nothing to reset and leaving a stale window once
+        // the phase filter changes the span.
+        if (ne - ns >= totalNs * 0.99) {
+          setViewStart(0);
+          setViewEnd(null);
+        } else {
+          setViewStart(ns);
+          setViewEnd(ne);
+        }
+        setTooltip(null);
+        setCursor(null);
+        return;
+      }
+      // Track the cursor position in svg-local px and the matching ns offset
+      // so the crosshair + stats popover can render. Clamped to the chart
+      // plot area (don't show a cursor on the axis labels gutter).
+      const rect = e.currentTarget.getBoundingClientRect();
+      const xPx = Math.max(0, Math.min(PLOT_WIDTH, e.clientX - rect.left));
+      const nsPerPx = visibleDur / PLOT_WIDTH;
+      const tNs = vStart + xPx * nsPerPx;
+      setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY });
+    },
+    [visibleDur, totalNs, vStart],
+  );
+
+ const handleMouseUp = useCallback(() => {
+ dragRef.current = null;
+ }, []);
+
+ const handleMouseLeave = useCallback(() => {
+ dragRef.current = null;
+ setCursor(null);
+ }, []);
+
+ const resetZoom = useCallback(() => {
+ setViewStart(0);
+ setViewEnd(null);
+ }, []);
+
+ // Stable bar callbacks so TimelineBars' memo isn't defeated by fresh
+ // closures on every tooltip/cursor state change.
+ const handleBarHover = useCallback(
+ (e: React.MouseEvent, row: RequestTimelineRow, req: RequestRecord) => {
+ setTooltip({ x: e.clientX, y: e.clientY, row, req });
+ },
+ [],
+ );
+ const handleBarLeave = useCallback(() => setTooltip(null), []);
+  // Plain primary-button clicks open the conversation. Modified or
+  // non-primary clicks are left alone so the browser's default handling applies.
+  const handleBarClick = useCallback(
+    (e: React.MouseEvent, req: RequestRecord) => {
+      const hasModifier = e.metaKey || e.ctrlKey || e.shiftKey || e.altKey;
+      const isPrimaryButton = e.button === 0;
+      if (hasModifier || !isPrimaryButton) return;
+      e.preventDefault();
+      openConversation(req);
+    },
+    [openConversation],
+  );
+
+ if (rows.length === 0) {
+ return (
+
+ No requests in the current filter.
+
+ );
+ }
+
+ const totalRequests = filtered.length;
+
+ return (
+
+ {/* Controls */}
+
+
+ {hasWarmup && (
+
+ )}
+
+ {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
+ {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
+ {formatDuration((dataEnd - dataStart) / 1e6)} ·{' '}
+
+ idle {formatDuration(idleStats.idleNs / 1e6)}
+ {idleStats.spanNs > 0
+ ? ` (${((idleStats.idleNs / idleStats.spanNs) * 100).toFixed(1)}%)`
+ : ''}
+
+ {isZoomed && (
+ <>
+ {' · '}
+
+ reset zoom
+
+ >
+ )}
+
+
+
+ {/* Chart container */}
+
+ {/* Fixed-height window: rows scroll vertically and the chart scrolls
+ horizontally inside it, so the card doesn't grow to fit every
+ conversation/worker AND the horizontal scrollbar stays pinned to the
+ window's bottom edge (rather than the bottom of the tall content). */}
+
+
+ {/* Label column — pinned left (sticky) so it stays put during
+ horizontal scroll, while scrolling vertically with the rows. */}
+
+
+
+ {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+
+
+ {rows.map((row) => {
+ const isSubagentRow = row.kind === 'subagent';
+ const isChildRow = row.kind === 'stream' || row.kind === 'aux';
+ const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+ const isExpanded = isExpandable && expandedSubagents.has(row.key);
+ return (
+
+ {isExpandable ? (
+ toggleSubagent(row.key)}
+ className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+ aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+ title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+ >
+ {isExpanded ? '▾' : '▸'}
+
+ ) : (
+
+ )}
+
+
+ {row.label}
+ {isExpandable && (
+ ×{row.streamCount}
+ )}
+ {isSubagentRow && (row.auxCount ?? 0) > 0 && (
+ +{row.auxCount} aux
+ )}
+
+
+ {row.requests.length > 0 ? row.requests.length : '—'}
+
+
+ );
+ })}
+
+
+ {/* Chart column — horizontal scrolling is handled by the window
+ container above so its scrollbar stays pinned to the window's
+ bottom edge; double-click anywhere resets the zoom. */}
+
+
+
+
+ {/* Cursor crosshair — drawn on top of bars so it stays visible
+ through dense rows. Stats popover is rendered as fixed
+ HTML below the SVG block. */}
+ {cursor && (
+
+ )}
+
+
+
+
+
+
+ {/* Footer — interaction hint only. */}
+
+
+ shift+scroll to zoom · drag to pan · double-click to reset
+
+
+
+ {/* Cursor stats popover: count of in-flight / waiting at the cursor's
+ ns offset. Hidden when the user is hovering an individual bar
+ (per-request tooltip wins). */}
+ {cursor && !tooltip && (
+
+ )}
+
+ {/* Tooltip */}
+ {tooltip &&
}
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx b/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx
new file mode 100644
index 00000000..6eb109b7
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx
@@ -0,0 +1,474 @@
+'use client';
+
+import type { RequestTimeline } from '@/hooks/api/use-request-timeline';
+import type { MetricSourceSeries, QueueDepthPoint } from '@/hooks/api/use-trace-server-metrics';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+import { CHART_SIZES, ChartEmpty, ChartSkeleton } from './chart-shared';
+import { ExpandableChart } from './expandable-chart';
+import { metricSourceLabel } from './metric-source-toolbar';
+import type { PhaseSlicedSeries, ServerSeriesLike } from './phase-slice';
+import { StackedAreaChart, TimeSeriesChart } from './time-series-chart';
+import {
+ cumulativeCompletedRequests,
+ cumulativeDifferenceMonotonic,
+ cumulativeTimeAverage,
+ cumulativeUniqueInputTokens,
+ buildThroughputChartSeries,
+ inflightUniqueTokens,
+ rollingAverage,
+ timeRollingAverage,
+ toggleThroughputSeries,
+ type ThroughputSeriesKey,
+} from './time-series-math';
+
+/**
+ * Phase-sliced server series (+ matching durationS). Null while the trace
+ * blob is loading or absent — cards render a skeleton until it arrives.
+ */
+type SlicedServerSeries = PhaseSlicedSeries | null;
+
+export type RequestActivityView = 'queue' | 'completed';
+
+const REQUEST_ACTIVITY_OPTIONS: SegmentedToggleOption[] = [
+ { value: 'queue', label: 'Queue depth', testId: 'request-activity-queue' },
+ { value: 'completed', label: 'Completed', testId: 'request-activity-completed' },
+];
+
+/**
+ * Compact token count for chart labels: 306808 → "307K tok", 3.2e6 → "3.2M tok".
+ *
+ * Unit thresholds sit at the rounding boundary rather than the exact power of
+ * ten, so a value that rounds up into the next unit is shown in that unit
+ * (999_700 → "1.0M tok", not "1000K tok"; 999.6 → "1K tok", not "1000 tok").
+ */
+const fmtTokensCompact = (n: number): string => {
+  if (n >= 999_500) return `${(n / 1e6).toFixed(1)}M tok`;
+  if (n >= 999.5) return `${Math.round(n / 1e3)}K tok`;
+  return `${Math.round(n)} tok`;
+};
+
+// Per-DP-rank color palette for DEP runs (one distinct color per rank in
+// the KV cache utilization overlay). Mirrors the request-timeline row
+// palette so the same DP index reads as the same color across both views.
+// Wraps mod-N if more than 12 ranks ever land.
+const DP_RANK_PALETTE = [
+ '#3b82f6',
+ '#ef4444',
+ '#10b981',
+ '#f59e0b',
+ '#a855f7',
+ '#06b6d4',
+ '#f97316',
+ '#84cc16',
+ '#ec4899',
+ '#14b8a6',
+ '#8b5cf6',
+ '#eab308',
+];
+
+export function KvCacheUtilizationCard({ sliced }: { sliced: SlicedServerSeries }) {
+ return (
+ {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!sliced) return ;
+ const serverSeries = sliced.series;
+ // For SGLang hicache rows we have both GPU (HBM) util and
+ // host (CPU offload pool) util — overlay them as two lines.
+ const hasHost = serverSeries.hostKvCacheUsage.length > 0;
+        // DEP runs report one series per engine. When there's more
+        // than one, draw one line per rank in distinct colors so
+        // load skew is visible at a glance; the cluster average sits on
+        // top as a bold dark-red line so it stands out.
+ const perEngine = serverSeries.kvCacheUsageByEngine ?? [];
+ const hasPerEngine = perEngine.length > 1;
+ // Render order matters: per-engine first → average drawn on top.
+ const series = [
+ ...(hasPerEngine
+ ? perEngine.map((e, i) => ({
+ name: `DP ${e.engineLabel}`,
+ data: rollingAverage(e.points, 50),
+ color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
+ // Thin + translucent so the Avg line on top reads as
+ // the headline number, not just one more series.
+ strokeWidth: 1,
+ strokeOpacity: 0.5,
+ }))
+ : []),
+ {
+ name: hasHost ? 'GPU HBM (avg n=50)' : hasPerEngine ? 'Avg' : 'GPU KV cache (avg n=50)',
+ data: rollingAverage(serverSeries.kvCacheUsage, 50),
+ // Skip raw scatter when per-engine overlay is on — the
+ // DP-rank lines already convey the spread, dots would be noise.
+ rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage,
+ // Bold red Avg sits on top of the translucent per-DP lines.
+ // DP 1 in the palette is #ef4444 (lighter red); the darker
+ // #dc2626 here plus the heavier stroke keeps it distinct.
+ color: hasPerEngine ? '#dc2626' : '#3b82f6',
+ strokeWidth: hasPerEngine ? 3.5 : 2,
+ },
+ ...(hasHost
+ ? [
+ {
+ name: 'CPU offload pool (avg n=50)',
+ data: rollingAverage(serverSeries.hostKvCacheUsage, 50),
+ rawData: serverSeries.hostKvCacheUsage,
+ color: '#f97316',
+ strokeWidth: 2,
+ },
+ ]
+ : []),
+ ];
+ return (
+ `${(v * 100).toFixed(0)}%`}
+ yAxisLabel="KV cache (%)"
+ {...size}
+ />
+ );
+ }}
+ />
+ );
+}
+
+export function RequestActivityCard({
+ sliced,
+ phaseTimeline,
+ timelineLoading,
+ view,
+ onViewChange,
+}: {
+ sliced: SlicedServerSeries;
+ phaseTimeline: RequestTimeline | null;
+ timelineLoading: boolean;
+ view: RequestActivityView;
+ onViewChange: (view: RequestActivityView) => void;
+}) {
+ return (
+ {
+ onViewChange(value);
+ track('inference_agentic_request_activity_changed', { view: value });
+ }}
+ ariaLabel="Request activity metric"
+ testId="request-activity-toggle"
+ buttonClassName="px-2 py-1 text-xs"
+ />
+ }
+ render={(expanded) => {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (view === 'completed') {
+ if (!phaseTimeline) {
+ return timelineLoading ? : ;
+ }
+ return (
+
+ );
+ }
+ if (!sliced) return ;
+ const serverSeries = sliced.series;
+ return (
+ ({
+ t: p.t,
+ value: p.running,
+ })),
+ 50,
+ ),
+ color: '#22c55e',
+ strokeWidth: 2,
+ },
+ {
+ name: 'Waiting (avg n=50)',
+ data: rollingAverage(
+ serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+ t: p.t,
+ value: p.waiting,
+ })),
+ 50,
+ ),
+ color: '#ef4444',
+ strokeWidth: 2,
+ },
+ {
+ name: 'Total (avg n=50)',
+ data: rollingAverage(
+ serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+ t: p.t,
+ value: p.total,
+ })),
+ 50,
+ ),
+ color: '#3b82f6',
+ strokeWidth: 2,
+ },
+ ]}
+ durationS={sliced.durationS}
+ yAxisLabel="Requests"
+ {...size}
+ />
+ );
+ }}
+ />
+ );
+}
+
+export function PrefixCacheHitRateCard({ sliced }: { sliced: SlicedServerSeries }) {
+ return (
+ {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!sliced) return ;
+ const serverSeries = sliced.series;
+ return (
+ `${(v * 100).toFixed(0)}%`}
+ yAxisLabel="Hit rate (%)"
+ {...size}
+ />
+ );
+ }}
+ />
+ );
+}
+
+export function ThroughputCard({
+ sliced,
+ selectedSource,
+ selected,
+ onSelectedChange,
+}: {
+ sliced: SlicedServerSeries;
+ selectedSource: MetricSourceSeries | undefined;
+ selected: ReadonlySet;
+ onSelectedChange: (next: ReadonlySet) => void;
+}) {
+ return (
+
+ {(
+ [
+ ['input', 'Input'],
+ ['decode', 'Decode'],
+ ] as const
+ ).map(([key, label]) => {
+ const active = selected.has(key);
+ const isOnlyActive = active && selected.size === 1;
+ return (
+ {
+ const next = toggleThroughputSeries(selected, key);
+ if (next === selected) return;
+ onSelectedChange(next);
+ track('inference_agentic_throughput_series_toggled', {
+ series: key,
+ enabled: next.has(key),
+ });
+ }}
+ >
+ {label}
+
+ );
+ })}
+
+ }
+ render={(expanded) => {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!sliced) return ;
+ const serverSeries = sliced.series;
+ return (
+
+ );
+ }}
+ />
+ );
+}
+
+export function PromptTokenSourceCard({ sliced }: { sliced: SlicedServerSeries }) {
+ return (
+ {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!sliced) return ;
+ return (
+
+ );
+ }}
+ />
+ );
+}
+
+export function CumulativeUniqueInputTokensCard({ sliced }: { sliced: SlicedServerSeries }) {
+ return (
+ {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!sliced) return ;
+ const serverSeries = sliced.series;
+ // Unique = total prompt tokens received minus tokens served from
+ // any cache tier — i.e. the freshly prefill-computed tokens. Prefer
+ // the promptTokensBySource breakdown (its buckets sum to the real
+ // prompt-token total, so subtracting cache tiers is exact). Fall
+ // back to cumsum(prefillTps - prefixCacheHitsTps) only for older
+ // data without the breakdown: vllm:prefix_cache_hits re-counts
+ // tokens across scheduler passes, so its cumulative can exceed the
+ // prompt tokens received, driving the diff negative and freezing
+ // the monotonic-clamped line after a few seconds.
+ const uniqueFromBreakdown = cumulativeUniqueInputTokens(serverSeries.promptTokensBySource);
+ const uniqueData =
+ uniqueFromBreakdown.length > 0
+ ? uniqueFromBreakdown
+ : cumulativeDifferenceMonotonic(
+ serverSeries.prefillTps,
+ serverSeries.prefixCacheHitsTps,
+ );
+ return (
+
+ );
+ }}
+ />
+ );
+}
+
+export function InflightUniqueTokensCard({
+ phaseTimeline,
+ timelineLoading,
+ kvCachePoolTokens,
+}: {
+ phaseTimeline: RequestTimeline | null;
+ timelineLoading: boolean;
+ /** KV-cache pool size in tokens (vLLM only) — drawn as a constant ceiling. */
+ kvCachePoolTokens: number | null;
+}) {
+ return (
+ {
+ const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+ if (!phaseTimeline) {
+ return timelineLoading ? : ;
+ }
+ // Step function: at each request start/end, sum the ISLs of
+ // currently-active requests across distinct cids. Within one
+ // cid turns are sequential so each cid contributes at most
+ // one in-flight ISL; across cids we treat content as
+ // independent (cross-conv prefix sharing adds <1pp in
+ // practice). Smooth with a 30s time-weighted rolling average
+ // so brief turn-handoff dips don't dominate the chart.
+ const raw = inflightUniqueTokens(phaseTimeline.requests);
+ const smoothed = timeRollingAverage(raw, 30);
+ // KV-cache pool size (vLLM only) drawn as a constant ceiling so
+ // you can see how close the working set gets to eviction
+ // pressure. Phase-independent — it's a static config value.
+ const pool = kvCachePoolTokens;
+ return (
+ 0
+ ? [{ value: pool, label: `KV cache pool · ${fmtTokensCompact(pool)}` }]
+ : undefined
+ }
+ {...size}
+ />
+ );
+ }}
+ />
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
new file mode 100644
index 00000000..2c3a3c27
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -0,0 +1,262 @@
+'use client';
+
+import { useMemo, useState } from 'react';
+import { useRouter } from 'next/navigation';
+import { ChevronLeft, ChevronRight } from 'lucide-react';
+
+import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
+
+const HW_LABELS: Record = { // Display names keyed by lowercase hardware slug; hwLabel() upper-cases any slug not listed here.
+ b200: 'B200',
+ b300: 'B300',
+ gb200: 'GB200',
+ gb300: 'GB300',
+ h100: 'H100',
+ h200: 'H200',
+ mi300x: 'MI300X',
+ mi325x: 'MI325X',
+ mi355x: 'MI355X',
+};
+
+const MODEL_LABELS: Record = { // Display names keyed by model slug; modelLabel() shows any unlisted slug verbatim.
+ dsr1: 'DeepSeek R1',
+ dsv4: 'DeepSeek V4 Pro',
+ glm5: 'GLM-5',
+ 'glm5.1': 'GLM-5.1',
+ gptoss120b: 'gpt-oss 120B',
+ kimik2: 'Kimi K2',
+ 'kimik2.5': 'Kimi K2.5',
+ 'kimik2.6': 'Kimi K2.6',
+ llama70b: 'Llama 3.3 70B',
+ 'minimaxm2.5': 'MiniMax M2.5',
+ 'minimaxm2.7': 'MiniMax M2.7',
+ 'qwen3.5': 'Qwen 3.5',
+};
+
+/** Display name for a hardware slug; slugs missing from HW_LABELS are upper-cased. */
+function hwLabel(hw: string) {
+  const known = HW_LABELS[hw];
+  return known ?? hw.toUpperCase();
+}
+/** Display name for a model slug; slugs missing from MODEL_LABELS are shown verbatim. */
+function modelLabel(m: string) {
+  const known = MODEL_LABELS[m];
+  return known ?? m;
+}
+/**
+ * Human-readable framework name.
+ *
+ * Known slugs get a fixed spelling; any `dynamo-<x>` slug becomes
+ * `Dynamo <X>` (suffix upper-cased); everything else is returned unchanged.
+ */
+function frameworkLabel(fw: string) {
+  switch (fw) {
+    case 'vllm':
+      return 'vLLM';
+    case 'sglang':
+      return 'SGLang';
+    case 'trt':
+      return 'TRT';
+    case 'mori-sglang':
+      return 'Mori-SGLang';
+    default: {
+      const dynamoPrefix = 'dynamo-';
+      return fw.startsWith(dynamoPrefix)
+        ? `Dynamo ${fw.slice(dynamoPrefix.length).toUpperCase()}`
+        : fw;
+    }
+  }
+}
+
+/**
+ * Compact chip text for one sibling point: its parallelism label, then its
+ * concurrency, then an offload marker when `offload_mode` is `'on'`.
+ */
+export function chipLabel(s: BenchmarkSibling): string {
+  const {
+    decode_tp: decodeTp,
+    decode_ep: decodeEp,
+    decode_dp_attention: decodeDpAttention,
+    decode_num_workers: decodeNumWorkers,
+  } = s;
+  // Reuse the labeler the chart points use (TP/EP/TEP/DEP/DPA…); the
+  // top-level tp/ep/dpAttention fields are fed from the decode-side values.
+  const parallel = parallelismLabel({
+    tp: decodeTp,
+    ep: decodeEp,
+    dpAttention: decodeDpAttention,
+    disagg: s.disagg,
+    isMultinode: s.is_multinode,
+    prefillTp: s.prefill_tp,
+    prefillEp: s.prefill_ep,
+    prefillDpAttention: s.prefill_dp_attention,
+    prefillNumWorkers: s.prefill_num_workers,
+    decodeTp,
+    decodeEp,
+    decodeDpAttention,
+    decodeNumWorkers,
+  });
+  const base = `${parallel} • c=${s.conc}`;
+  return s.offload_mode === 'on' ? `${base} • off=ON` : base;
+}
+
+type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests';
+
+const SORT_OPTIONS: { value: SortMode; label: string }[] = [ // Sort selector entries; isSortMode() accepts exactly these values.
+ { value: 'default', label: 'Default' },
+ { value: 'conc', label: 'Concurrency ↑' },
+ { value: 'parallelism', label: 'Parallelism' },
+ { value: 'tput', label: 'Throughput/GPU ↓' },
+ { value: 'requests', label: 'Total requests ↓' },
+];
+
+/**
+ * Sort key for the "parallelism" order, compared element by element:
+ * decode EP first (so TP/EP1 sorts ahead of EP/TEP/DEP groups), then decode
+ * TP, then dp-attention (0/1), then disagg (0/1). Missing EP/TP count as 0.
+ * Configs sharing a key land together; the caller orders within the group.
+ */
+const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => {
+  const ep = s.decode_ep ?? 0;
+  const tp = s.decode_tp ?? 0;
+  const dpAttention = s.decode_dp_attention ? 1 : 0;
+  const disagg = s.disagg ? 1 : 0;
+  return [ep, tp, dpAttention, disagg];
+};
+
+/**
+ * Orders `siblings` for the given sort mode.
+ *
+ * - `default`: returns the input array itself (same reference, untouched).
+ * - `conc`: ascending concurrency.
+ * - `tput` / `requests`: descending by the metric; rows missing it go last.
+ * - anything else (`parallelism`): by parallelRank, then offload off before
+ *   on, then ascending concurrency.
+ *
+ * Every non-default mode sorts a copy; the input is never mutated.
+ */
+function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] {
+  if (mode === 'default') return siblings;
+
+  type Compare = (a: BenchmarkSibling, b: BenchmarkSibling) => number;
+
+  // Descending by an optional numeric metric; a missing metric compares as
+  // -Infinity so those rows sink to the end.
+  const descendingBy =
+    (metric: (s: BenchmarkSibling) => number | null | undefined): Compare =>
+    (a, b) =>
+      (metric(b) ?? -Infinity) - (metric(a) ?? -Infinity);
+
+  const byParallelism: Compare = (a, b) => {
+    const rankA = parallelRank(a);
+    const rankB = parallelRank(b);
+    // The first rank component that differs decides the order.
+    const firstDiff = rankA.findIndex((component, i) => component !== rankB[i]);
+    if (firstDiff !== -1) return rankA[firstDiff] - rankB[firstDiff];
+    // Same parallelism group: offload off before on, then concurrency.
+    const offloadA = a.offload_mode === 'on' ? 1 : 0;
+    const offloadB = b.offload_mode === 'on' ? 1 : 0;
+    return offloadA - offloadB || a.conc - b.conc;
+  };
+
+  const copy = [...siblings];
+  switch (mode) {
+    case 'conc':
+      return copy.sort((a, b) => a.conc - b.conc);
+    case 'tput':
+      return copy.sort(descendingBy((s) => s.tput_per_gpu));
+    case 'requests':
+      return copy.sort(descendingBy((s) => s.total_requests));
+    default:
+      return copy.sort(byParallelism);
+  }
+}
+
+/** Type guard: true when `v` is one of the values listed in SORT_OPTIONS. */
+const isSortMode = (v: string | null): v is SortMode => {
+  if (v === null) return false;
+  return SORT_OPTIONS.findIndex((option) => option.value === v) !== -1;
+};
+
+export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) { // SKU header, sort selector, prev/next buttons and one chip per sibling point.
+ const router = useRouter();
+ // Persist the sort in the URL so clicking a point (which remounts this
+ // component on the new route) keeps the chosen order instead of resetting.
+ // Read it once from the URL on mount — this component only renders after the
+ // client-side siblings query resolves, so `window` is always available here
+ // (no SSR/hydration mismatch). Matches the app's window-based url-state read.
+ const [sortMode, setSortMode] = useState(() => {
+ if (typeof window === 'undefined') return 'default'; // defensive: no window → default order
+ const v = new URLSearchParams(window.location.search).get('sort');
+ return isSortMode(v) ? v : 'default'; // missing or unrecognised ?sort= value → default order
+ });
+
+ const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]); // recomputed only when siblings or sortMode change
+
+ // prev/next follow the displayed (sorted) order so navigation matches the row.
+ const currentIdx = sorted.findIndex((s) => s.is_current); // -1 when no sibling is flagged current → prev and next are both null
+ const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null;
+ const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null;
+
+ // Carry the active sort through every point-to-point link.
+ const hrefFor = (id: number) =>
+ sortMode === 'default'
+ ? `/inference/agentic/${id}`
+ : `/inference/agentic/${id}?sort=${sortMode}`;
+
+ const currentId = siblings.find((s) => s.is_current)?.id; // undefined when no sibling is current; the sort handler then skips the URL update
+
+ const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`;
+
+ return (
+
+
+
{skuLabel}
+
+ {siblings.length} point{siblings.length === 1 ? '' : 's'} in this run · {sku.date}
+
+
+
+
+ Sort by
+ {
+ const mode = v as SortMode;
+ setSortMode(mode);
+ track('agentic_siblings_sorted', { mode });
+ // Mirror into the URL (replace, no history spam) so a refresh —
+ // and the next point's mount — keep the chosen order.
+ if (currentId !== undefined) {
+ const href =
+ mode === 'default'
+ ? `/inference/agentic/${currentId}`
+ : `/inference/agentic/${currentId}?sort=${mode}`;
+ router.replace(href, { scroll: false });
+ }
+ }}
+ >
+
+
+
+
+ {SORT_OPTIONS.map((o) => (
+
+ {o.label}
+
+ ))}
+
+
+
+
{
+ if (prev) {
+ track('agentic_siblings_navigated', { direction: 'prev', targetId: prev.id });
+ router.push(hrefFor(prev.id));
+ }
+ }}
+ className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+ aria-label="Previous point"
+ >
+ prev
+
+
+ {sorted.map((s) => {
+ const active = s.is_current;
+ return (
+ {
+ if (!active) {
+ track('agentic_siblings_navigated', { direction: 'chip', targetId: s.id });
+ router.push(hrefFor(s.id));
+ }
+ }}
+ className={`px-2 py-1 rounded-md text-xs border transition-colors ${
+ active
+ ? 'border-primary bg-primary text-primary-foreground font-medium'
+ : 'border-border/40 text-foreground hover:bg-accent'
+ } ${s.has_trace ? '' : 'opacity-60'}`}
+ title={s.has_trace ? undefined : 'No stored trace data'}
+ >
+ {chipLabel(s)}
+
+ );
+ })}
+
+
{
+ if (next) {
+ track('agentic_siblings_navigated', { direction: 'next', targetId: next.id });
+ router.push(hrefFor(next.id));
+ }
+ }}
+ className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+ aria-label="Next point"
+ >
+ next
+
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
new file mode 100644
index 00000000..2131c82e
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -0,0 +1,526 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+import { CHART_PAD, ChartEmpty, fmtCount, fmtSeconds } from './chart-shared';
+import { interpAt, type ChartSeries } from './time-series-math';
+
+// Historical entry point: the pure data-shaping helpers lived in this module
+// before being extracted; re-export them so both import paths stay valid.
+export * from './time-series-math';
+
+/** A constant horizontal reference line (e.g. a capacity ceiling). */
+export interface ReferenceLine {
+ value: number;
+ label: string;
+ /** Line + label color. Defaults to a muted emerald. */
+ color?: string;
+}
+
+interface TimeSeriesChartProps {
+ series: ChartSeries[];
+ durationS: number;
+ yMax?: number;
+ yFmt?: (v: number) => string;
+ yAxisLabel?: string;
+ width?: number;
+ height?: number;
+ /**
+ * Horizontal reference lines drawn across the plot. Their values are folded
+ * into the auto y-max so the line stays on-chart even when it exceeds the
+ * data (e.g. a KV-cache pool ceiling well above the working set).
+ */
+ refLines?: readonly ReferenceLine[];
+}
+
+const NO_REF_LINES: readonly ReferenceLine[] = [];
+
+const PAD = CHART_PAD;
+
+export function TimeSeriesChart({ // Multi-series line chart over time with optional raw scatter underlay, reference lines, hover resolver and legend.
+ series,
+ durationS,
+ yMax: yMaxOpt,
+ yFmt = fmtCount,
+ yAxisLabel,
+ width = 720,
+ height = 260,
+ refLines = NO_REF_LINES,
+}: TimeSeriesChartProps) {
+ const W = width;
+ const H = height;
+
+ const layout = useMemo(() => {
+ const innerW = W - PAD.left - PAD.right;
+ const innerH = H - PAD.top - PAD.bottom;
+ const xMax = Math.max(durationS, 1); // floor at 1 so t / xMax in xScale never divides by zero
+ // Fold reference-line values into the auto max so a ceiling above the data
+ // (e.g. KV-cache pool >> working set) still renders inside the plot.
+ const refMax = refLines.length > 0 ? Math.max(...refLines.map((r) => r.value)) : 0; // NOTE(review): non-finite ref values are skipped when drawing below but are not filtered here — confirm callers only pass finite values
+ const yMax =
+ yMaxOpt ?? Math.max(1e-9, refMax, ...series.flatMap((s) => s.data.map((d) => d.value))); // 1e-9 floor keeps the auto max positive for v / yMax; NOTE(review): spreading every point into Math.max may hit engine argument limits on very large series — confirm sizes
+ const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+ const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH; // v = 0 maps to the plot bottom, v = yMax to the top
+ return { innerW, innerH, xMax, yMax, xScale, yScale };
+ }, [series, durationS, yMaxOpt, refLines, W, H]);
+
+ const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
+
+ const subsample = (arr: TimeSeriesPoint[]) => {
+ if (arr.length === 0) return arr;
+ const stride = Math.max(1, Math.floor(arr.length / innerW)); // thin long series to roughly the plot's pixel width
+ return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr; // NOTE(review): striding keeps index 0 but can drop the final sample
+ };
+
+ // Pre-format axis ticks.
+ const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+ const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+
+ const resolve = (fraction: number) => { // hover resolver: fraction (presumably 0..1 across the plot) → tooltip items at that time
+ const t = fraction * xMax;
+ const items: HoverItem[] = [];
+ for (const s of series) {
+ if (s.hideFromHover) continue;
+ const v = interpAt(s.data, t);
+ if (v === null || !Number.isFinite(v)) continue; // no usable value at t → leave this series out of the tooltip
+ items.push({ color: s.color, label: s.name, value: yFmt(v) });
+ }
+ if (items.length === 0) return null;
+ return { items, title: fmtSeconds(t) };
+ };
+
+ if (series.every((s) => s.data.length === 0)) { // nothing to plot (also true for an empty series list)
+ return ;
+ }
+
+ return (
+
+ {/* y-axis gridlines + labels */}
+ {yTickVals.map((v, i) => {
+ const y = yScale(v);
+ return (
+
+
+
+ {yFmt(v)}
+
+
+ );
+ })}
+
+ {/* Raw scatter underlay */}
+ {series
+ .filter((s) => s.rawData && s.rawData.length > 0)
+ .map((s, si) =>
+ subsample(s.rawData!).map((d, i) => (
+
+ )),
+ )}
+
+ {/* Lines */}
+ {series.map((s, si) => {
+ if (s.data.length === 0) return null;
+ const sampled = subsample(s.data);
+ const path = sampled
+ .map(
+ (d, i) =>
+ `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`,
+ )
+ .join(' ');
+ return (
+
+ );
+ })}
+
+ {/* Horizontal reference lines (e.g. KV-cache pool ceiling). Drawn on top
+ of the data lines, with a label pinned to the right edge. */}
+ {refLines.map((ref, i) => {
+ if (!Number.isFinite(ref.value) || ref.value < 0 || ref.value > yMax) return null; // skip lines that would fall outside the plot
+ const y = yScale(ref.value);
+ const color = ref.color ?? '#16a34a';
+ return (
+
+
+
+ {ref.label}
+
+
+ );
+ })}
+
+ {/* X-axis */}
+
+ {xTickVals.map((v, i) => {
+ const x = xScale(v);
+ const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+ return (
+
+ {fmtSeconds(v)}
+
+ );
+ })}
+
+ time
+
+
+ {yAxisLabel && (
+
+ {yAxisLabel}
+
+ )}
+
+ {/* Legend — skip series flagged hideFromHover so per-engine
+ underlays don't clutter the chip row. */}
+ {(() => {
+ const visible = series.filter((s) => !s.hideFromHover);
+ const chipY = H - 8;
+ const chipW = innerW / Math.max(1, visible.length);
+ return visible.map((s, i) => {
+ const x = PAD.left + i * chipW;
+ return (
+
+
+
+ {s.name}
+
+
+ );
+ });
+ })()}
+
+ );
+}
+
+// Fixed colors for the token-source names the chart-series builder emits
+// (vLLM names first, then the SGLang names compute-chart-series produces).
+const KNOWN_SOURCE_COLORS: Record = {
+ local_compute: '#f97316',
+ local_cache_hit: '#3b82f6',
+ external_kv_transfer: '#22c55e',
+ miss: '#f97316',
+ 'cache hit (HBM)': '#3b82f6',
+ 'cache hit (CPU offload)': '#22c55e',
+ 'cache hit': '#3b82f6',
+ 'compute (miss)': '#f97316',
+};
+
+const SOURCE_LABELS: Record = {
+ local_compute: 'Prefill',
+ local_cache_hit: 'HBM Cache Hit',
+ external_kv_transfer: 'Offload Cache Hit',
+ miss: 'Miss',
+};
+
+// Fallback palette for any source name not in KNOWN_SOURCE_COLORS so we never
+// emit two layers in the same shade. Cycles by stack (insertion) order.
+const FALLBACK_PALETTE = [
+ '#3b82f6',
+ '#f97316',
+ '#22c55e',
+ '#a855f7',
+ '#ef4444',
+ '#06b6d4',
+ '#f59e0b',
+ '#ec4899',
+];
+
+/** Stacked-area chart of each token source's share of the cumulative token total over time (layers sum to 100% once any tokens are seen). */
+export function StackedAreaChart({
+ sourceSeries,
+ durationS,
+ width = 720,
+ height = 260,
+}: {
+ sourceSeries: Record;
+ durationS: number;
+ width?: number;
+ height?: number;
+}) {
+ const W = width;
+ const H = height;
+
+ const computed = useMemo(() => {
+ const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); // sources with no samples are dropped entirely
+ if (entries.length === 0) return null;
+
+ // Different sources can land on different scrape timestamps
+ // (SGLang's hits/misses fire on alternating ticks), so we MUST
+ // align across all sources before computing shares — otherwise the
+ // share calculation indexes into each source's own time axis and
+ // mixes values from different moments.
+ //
+ // Approach: union all timestamps across sources, then for each
+ // unique timestamp carry forward the cumulative sum for every
+ // source (a source that didn't report at time t holds its previous
+ // cumulative value rather than dropping to 0).
+ const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted(
+ (a, b) => a - b,
+ );
+
+ // For each source, look up its value at every union timestamp and produce
+ // a parallel cumulative-sum array indexed against `tValues` via carry-forward.
+ const cum: Record = {};
+ for (const [name, arr] of entries) {
+ const valByT = new Map(arr.map((p) => [p.t, p.value])); // NOTE(review): if a source repeats a timestamp only the last value survives the Map — confirm timestamps are unique per source
+ const out: number[] = Array.from({ length: tValues.length });
+ let acc = 0;
+ for (let i = 0; i < tValues.length; i++) {
+ const v = valByT.get(tValues[i]!);
+ if (v !== undefined) acc += v;
+ out[i] = acc;
+ }
+ cum[name] = out;
+ }
+
+ const shares: Record = {};
+ for (const name of Object.keys(cum)) shares[name] = [];
+ for (let i = 0; i < tValues.length; i++) {
+ const total = entries.reduce((s, [name]) => s + (cum[name]?.[i] ?? 0), 0);
+ for (const [name] of entries) {
+ shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0); // total still 0 → every share is 0 (avoids 0/0)
+ }
+ }
+ return { tValues, shares };
+ }, [sourceSeries]);
+
+ if (!computed) {
+ return ;
+ }
+ const { tValues, shares } = computed;
+
+ const stackOrder = Object.keys(shares); // insertion order of the non-empty sources; the first entry is the bottom layer
+
+ // Assign colors once per render in stack order so the layers and the hover
+ // tooltip always agree, including for unknown source names on the fallback
+ // palette.
+ const colorByName = new Map();
+ let fallbackIdx = 0;
+ for (const name of stackOrder) {
+ const known = KNOWN_SOURCE_COLORS[name];
+ colorByName.set(name, known ?? FALLBACK_PALETTE[fallbackIdx++ % FALLBACK_PALETTE.length]!);
+ }
+ const colorFor = (name: string): string => colorByName.get(name) ?? FALLBACK_PALETTE[0]!;
+
+ const innerW = W - PAD.left - PAD.right;
+ const innerH = H - PAD.top - PAD.bottom;
+ const xMax = Math.max(durationS, 1); // floor at 1 so t / xMax never divides by zero
+ const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+ const yScale = (v: number) => PAD.top + (1 - v) * innerH; // v is a share in 0..1; 0 maps to the plot bottom
+
+ const lower: number[] = Array.from({ length: tValues.length }, () => 0);
+ const layers = stackOrder.map((name) => {
+ const upper = shares[name]!.map((v, i) => lower[i]! + v);
+ const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+ const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+ const d = `${top
+ .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+ .join(' ')} ${[...bottom]
+ .toReversed()
+ .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
+ .join(' ')} Z`;
+ const color = colorFor(name);
+ for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; // this layer's top edge becomes the next layer's baseline
+ return { name, color, d };
+ });
+
+ const resolve = (fraction: number) => {
+ const t = fraction * xMax;
+ // Find the closest tValue index.
+ let idx = 0;
+ let bestDist = Infinity;
+ for (let i = 0; i < tValues.length; i++) {
+ const d = Math.abs(tValues[i]! - t);
+ if (d < bestDist) {
+ bestDist = d;
+ idx = i;
+ }
+ }
+ const items: HoverItem[] = stackOrder.map((name) => ({
+ color: colorFor(name),
+ label: SOURCE_LABELS[name] ?? name,
+ value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
+ }));
+ return { items, title: fmtSeconds(t) }; // title shows the hover time; values come from the nearest sample
+ };
+
+ const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+ const yTickVals = [0, 0.25, 0.5, 0.75, 1];
+
+ return (
+
+ {yTickVals.map((v, i) => {
+ const y = yScale(v);
+ return (
+
+
+
+ {(v * 100).toFixed(0)}%
+
+
+ );
+ })}
+ {layers.map((l, i) => (
+
+ ))}
+
+ {xTickVals.map((v, i) => {
+ const x = xScale(v);
+ const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+ return (
+
+ {fmtSeconds(v)}
+
+ );
+ })}
+
+ time
+
+
+ % of prefill tokens
+
+ {(() => {
+ const chipY = H - 8;
+ const chipW = innerW / Math.max(1, layers.length);
+ return layers.map((l, i) => {
+ const x = PAD.left + i * chipW;
+ return (
+
+
+
+ {SOURCE_LABELS[l.name] ?? l.name}
+
+
+ );
+ });
+ })()}
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-math.test.ts b/packages/app/src/components/inference/agentic-point/time-series-math.test.ts
new file mode 100644
index 00000000..d92fc9ba
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-math.test.ts
@@ -0,0 +1,457 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import {
+ averageSequenceLengthInFlight,
+ buildThroughputChartSeries,
+ cumulativeAverage,
+ cumulativeCompletedRequests,
+ cumulativeDifferenceMonotonic,
+ cumulativeTimeAverage,
+ cumulativeUniqueInputTokens,
+ inflightUniqueTokens,
+ interpAt,
+ rollingAverage,
+ rollingRequestMetric,
+ timeRollingAverage,
+ toggleThroughputSeries,
+} from './time-series-math';
+
+const request = ( // Test factory: a RequestRecord ending at endS seconds with the given latencies; `overrides` replaces any field.
+ endS: number,
+ ttftMs: number | null,
+ tpotMs: number | null,
+ overrides: Partial = {},
+): RequestRecord => ({
+ cid: 'conversation',
+ ti: endS,
+ wid: 'worker',
+ ad: 0,
+ phase: 'profiling',
+ credit: 0,
+ start: 0,
+ ack: null,
+ end: endS * 1e9, // seconds → nanoseconds (start/end in these fixtures are nanosecond values)
+ ttftMs,
+ tpotMs,
+ isl: 100,
+ osl: 10,
+ cancelled: false,
+ ...overrides, // spread last so per-test overrides win over the defaults above
+});
+
+describe('rollingRequestMetric', () => {
+ it('computes a trailing P75 TTFT over the requested window', () => {
+ const result = rollingRequestMetric(
+ [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)],
+ 'ttft',
+ 'p75',
+ 3,
+ );
+
+ expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 });
+ expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]);
+ expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.325]);
+ });
+
+ it('inverts the rolling TPOT percentile for interactivity', () => {
+ const result = rollingRequestMetric(
+ [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)],
+ 'interactivity',
+ 'p90',
+ 3,
+ );
+
+ expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]);
+ expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8);
+ expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]);
+ });
+
+ it('computes E2E latency from request start through request end', () => {
+ const result = rollingRequestMetric(
+ [request(2, 100, 10, { start: 500_000_000 }), request(4, 200, 20, { start: 1_000_000_000 })],
+ 'e2e',
+ 'p90',
+ 50,
+ );
+
+ expect(result.raw).toEqual([
+ { t: 2, value: 1.5 },
+ { t: 4, value: 3 },
+ ]);
+ expect(result.trend.at(-1)?.value).toBeCloseTo(2.85, 8);
+ expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8);
+ });
+
+ it('drops cancelled, missing, and non-positive samples (phase is the caller’s concern)', () => {
+ const result = rollingRequestMetric(
+ [
+ request(1, 100, 10),
+ request(2, 200, 20, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline
+ request(3, 300, 30, { cancelled: true }),
+ request(4, null, null),
+ request(5, 0, 0),
+ ],
+ 'ttft',
+ 'p90',
+ );
+
+ expect(result.raw).toEqual([
+ { t: 1, value: 0.1 },
+ { t: 2, value: 0.2 },
+ ]);
+ });
+});
+
+describe('timeRollingAverage', () => {
+ it('integrates the step function over the trailing window', () => {
+ const result = timeRollingAverage(
+ [
+ { t: 0, value: 10 },
+ { t: 2, value: 20 },
+ { t: 4, value: 40 },
+ ],
+ 4,
+ );
+
+ // t=0: zero-length window → raw value. t=2: 10 held on [0,2) → 10.
+ // t=4: 10 on [0,2) + 20 on [2,4) = 60 area / 4 s = 15.
+ expect(result).toEqual([
+ { t: 0, value: 10 },
+ { t: 2, value: 10 },
+ { t: 4, value: 15 },
+ ]);
+ });
+
+ it('carries the pre-window step value into a clipped window', () => {
+ const result = timeRollingAverage(
+ [
+ { t: 0, value: 10 },
+ { t: 2, value: 20 },
+ { t: 4, value: 40 },
+ ],
+ 2,
+ );
+
+ // Window [2,4): value 20 held throughout (the t=0 sample sets the step
+ // value at the window start via carry-forward of data[j-1]).
+ expect(result.at(-1)).toEqual({ t: 4, value: 20 });
+ });
+
+ it('passes through empty input and non-positive windows', () => {
+ expect(timeRollingAverage([], 30)).toEqual([]);
+ const data = [{ t: 0, value: 1 }];
+ expect(timeRollingAverage(data, 0)).toBe(data);
+ });
+});
+
+describe('rollingAverage', () => {
+ it('averages a centered window clipped at the edges', () => {
+ const data = [1, 2, 3, 4].map((value, i) => ({ t: i, value }));
+ expect(rollingAverage(data, 3).map((p) => p.value)).toEqual([1.5, 2, 3, 3.5]);
+ });
+
+ it('passes through window sizes of 1 or less', () => {
+ const data = [{ t: 0, value: 5 }];
+ expect(rollingAverage(data, 1)).toBe(data);
+ });
+});
+
+describe('cumulativeAverage', () => {
+ it('hides the startup interval without removing it from later averages', () => {
+ const result = cumulativeAverage(
+ [
+ { t: 0, value: 300 },
+ { t: 30, value: 0 },
+ { t: 60, value: 0 },
+ { t: 90, value: 100 },
+ ],
+ 60,
+ );
+
+ expect(result).toEqual([
+ { t: 60, value: 100 },
+ { t: 90, value: 100 },
+ ]);
+ });
+
+ it('preserves the original behavior when no burn-in is requested', () => {
+ expect(
+ cumulativeAverage([
+ { t: 0, value: 10 },
+ { t: 1, value: 20 },
+ ]),
+ ).toEqual([
+ { t: 0, value: 10 },
+ { t: 1, value: 15 },
+ ]);
+ });
+});
+
+describe('cumulativeTimeAverage', () => {
+ it('computes a run-to-date time-weighted average for a step series', () => {
+ expect(
+ cumulativeTimeAverage([
+ { t: 0, value: 100 },
+ { t: 1, value: 300 },
+ { t: 3, value: 100 },
+ { t: 4, value: 0 },
+ ]),
+ ).toEqual([
+ { t: 0, value: 100 },
+ { t: 1, value: 100 },
+ { t: 3, value: 700 / 3 },
+ { t: 4, value: 200 },
+ ]);
+ });
+
+ it('coalesces same-time request events to their final step value', () => {
+ expect(
+ cumulativeTimeAverage([
+ { t: 0, value: 0 },
+ { t: 0, value: 100 },
+ { t: 2, value: 0 },
+ ]),
+ ).toEqual([
+ { t: 0, value: 100 },
+ { t: 2, value: 100 },
+ ]);
+ });
+});
+
+describe('cumulativeCompletedRequests', () => {
+ it('sorts completions and excludes cancelled requests (phase is the caller’s concern)', () => {
+ expect(
+ cumulativeCompletedRequests([
+ request(4, 100, 10),
+ request(2, 100, 10),
+ request(1, 100, 10, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline
+ request(3, 100, 10, { cancelled: true }),
+ ]),
+ ).toEqual([
+ { t: 0, value: 0 },
+ { t: 1, value: 1 },
+ { t: 2, value: 2 },
+ { t: 4, value: 3 },
+ ]);
+ });
+
+ it('returns no series when there are no successful completions', () => {
+ expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]);
+ });
+});
+
+describe('averageSequenceLengthInFlight', () => {
+ it('computes the event-time average across overlapping profiling requests', () => {
+ expect(
+ averageSequenceLengthInFlight(
+ [
+ request(4, 100, 10, { start: 0, end: 4_000_000_000, isl: 100 }),
+ request(3, 100, 10, { start: 1_000_000_000, end: 3_000_000_000, isl: 300 }),
+ ],
+ 'isl',
+ ),
+ ).toEqual([
+ { t: 0, value: 100 },
+ { t: 1, value: 200 },
+ { t: 3, value: 100 },
+ { t: 4, value: 0 },
+ ]);
+ });
+
+ it('excludes cancelled and missing sequence lengths (phase is the caller’s concern)', () => {
+ // Only the null-osl and cancelled rows are dropped; the warmup row is kept
+ // (the caller passes a phase-scoped timeline), so it produces a step series.
+ expect(
+ averageSequenceLengthInFlight(
+ [
+ request(1, 100, 10, { osl: null }),
+ request(2, 100, 10, { osl: 20, cancelled: true }),
+ request(3, 100, 10, { osl: 30, phase: 'warmup', start: 0, end: 3_000_000_000 }),
+ ],
+ 'osl',
+ ),
+ ).toEqual([
+ { t: 0, value: 30 },
+ { t: 3, value: 0 },
+ ]);
+ });
+});
+
+describe('toggleThroughputSeries', () => {
+ it('allows either series to be hidden when both are selected', () => {
+ expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'input')]).toEqual(['decode']);
+ expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'decode')]).toEqual(['input']);
+ });
+
+ it('does not allow the final visible series to be hidden', () => {
+ const selected = new Set<'input' | 'decode'>(['decode']);
+ expect(toggleThroughputSeries(selected, 'decode')).toBe(selected);
+ });
+
+ it('allows the hidden series to be restored', () => {
+ expect([...toggleThroughputSeries(new Set(['decode']), 'input')]).toEqual(['decode', 'input']);
+ });
+
+ it('only includes the total running average when both series are visible', () => {
+ const input = [{ t: 0, value: 10 }];
+ const decode = [{ t: 0, value: 20 }];
+
+ expect(
+ buildThroughputChartSeries(input, decode, new Set(['input', 'decode'])).map(
+ ({ name }) => name,
+ ),
+ ).toEqual(['Input (avg n=50)', 'Decode (avg n=50)', 'Total running avg (60s burn-in)']);
+ expect(
+ buildThroughputChartSeries(input, decode, new Set(['input'])).map(({ name }) => name),
+ ).toEqual(['Input (avg n=50)']);
+ expect(
+ buildThroughputChartSeries(input, decode, new Set(['decode'])).map(({ name }) => name),
+ ).toEqual(['Decode (avg n=50)']);
+ });
+});
+
+describe('cumulativeUniqueInputTokens', () => {
+ it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => {
+ const out = cumulativeUniqueInputTokens({
+ local_compute: [
+ { t: 0, value: 100 },
+ { t: 1, value: 50 },
+ ],
+ local_cache_hit: [
+ { t: 0, value: 900 },
+ { t: 1, value: 950 },
+ ],
+ external_kv_transfer: [
+ { t: 0, value: 5000 },
+ { t: 1, value: 6000 },
+ ],
+ });
+ expect(out).toEqual([
+ { t: 0, value: 100 },
+ { t: 1, value: 150 },
+ ]);
+ });
+
+ it('recognizes the sglang compute/cache labels the builder emits', () => {
+ const out = cumulativeUniqueInputTokens({
+ 'compute (miss)': [
+ { t: 0, value: 10 },
+ { t: 2, value: 20 },
+ ],
+ 'cache hit (HBM)': [{ t: 0, value: 999 }],
+ 'cache hit (CPU offload)': [{ t: 2, value: 999 }],
+ });
+ expect(out).toEqual([
+ { t: 0, value: 10 },
+ { t: 2, value: 30 },
+ ]);
+ });
+
+ it('sums multiple non-cache buckets at the same timestamp', () => {
+ const out = cumulativeUniqueInputTokens({
+ local_compute: [{ t: 0, value: 100 }],
+ miss: [{ t: 0, value: 25 }],
+ });
+ expect(out).toEqual([{ t: 0, value: 125 }]);
+ });
+
+ it('is monotonic non-decreasing (no clamp needed — values are rates ≥ 0)', () => {
+ const out = cumulativeUniqueInputTokens({
+ local_compute: [
+ { t: 0, value: 300 },
+ { t: 1, value: 0 },
+ { t: 2, value: 10 },
+ ],
+ });
+ expect(out.map((p) => p.value)).toEqual([300, 300, 310]);
+ });
+
+ it('returns [] when there is no breakdown so the caller can fall back', () => {
+ expect(cumulativeUniqueInputTokens(undefined)).toEqual([]);
+ expect(cumulativeUniqueInputTokens({})).toEqual([]);
+ });
+
+ it('returns [] when every bucket is a cache tier (no computed signal)', () => {
+ expect(
+ cumulativeUniqueInputTokens({
+ local_cache_hit: [{ t: 0, value: 100 }],
+ 'cache hit': [{ t: 0, value: 100 }],
+ }),
+ ).toEqual([]);
+ });
+});
+
+describe('inflightUniqueTokens', () => {
+ it('sums active ISLs across cids as a step series (ends before starts on ties)', () => {
+ const out = inflightUniqueTokens([
+ { cid: 'a', start: 0, end: 2e9, isl: 100 },
+ { cid: 'a', start: 2e9, end: 4e9, isl: 150 }, // turn handoff at t=2
+ { cid: 'b', start: 1e9, end: 3e9, isl: 200 },
+ ]);
+ expect(out).toEqual([
+ { t: 0, value: 0 },
+ { t: 0, value: 100 },
+ { t: 1, value: 300 },
+ { t: 2, value: 200 }, // end of a's turn 1 processed first — no double count
+ { t: 2, value: 350 },
+ { t: 3, value: 150 },
+ { t: 4, value: 0 },
+ ]);
+ });
+
+ it('counts one in-flight ISL per cid even when its requests overlap', () => {
+ const out = inflightUniqueTokens([
+ { cid: 'a', start: 0, end: 3e9, isl: 100 },
+ { cid: 'a', start: 1e9, end: 2e9, isl: 50 },
+ ]);
+ expect(out).toEqual([
+ { t: 0, value: 0 },
+ { t: 0, value: 100 },
+ { t: 1, value: 100 }, // nested request folded into the cid's max ISL
+ { t: 2, value: 0 },
+ { t: 3, value: 0 },
+ ]);
+ });
+
+ it('skips requests without a positive ISL and empty input', () => {
+ expect(inflightUniqueTokens([])).toEqual([]);
+ expect(inflightUniqueTokens([{ cid: 'a', start: 0, end: 1e9, isl: null }])).toEqual([]);
+ expect(inflightUniqueTokens([{ cid: 'a', start: 0, end: 1e9, isl: 0 }])).toEqual([]);
+ });
+});
+
+describe('cumulativeDifferenceMonotonic', () => {
+ it('unions timestamps and clamps the difference to its running max', () => {
+ expect(
+ cumulativeDifferenceMonotonic(
+ [
+ { t: 0, value: 10 },
+ { t: 1, value: 10 },
+ ],
+ [
+ { t: 0, value: 5 },
+ { t: 2, value: 20 }, // drives the raw diff negative — clamp holds
+ ],
+ ),
+ ).toEqual([
+ { t: 0, value: 5 },
+ { t: 1, value: 15 },
+ { t: 2, value: 15 },
+ ]);
+ });
+});
+
+describe('interpAt', () => {
+ it('linearly interpolates between samples and clamps outside the range', () => {
+ const data = [
+ { t: 0, value: 0 },
+ { t: 10, value: 100 },
+ ];
+ expect(interpAt(data, 5)).toBe(50);
+ expect(interpAt(data, -1)).toBe(0);
+ expect(interpAt(data, 11)).toBe(100);
+ expect(interpAt([], 5)).toBeNull();
+ });
+});
diff --git a/packages/app/src/components/inference/agentic-point/time-series-math.ts b/packages/app/src/components/inference/agentic-point/time-series-math.ts
new file mode 100644
index 00000000..7242db4d
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-math.ts
@@ -0,0 +1,491 @@
+/**
+ * Pure data-shaping helpers behind the agentic point-detail time-series
+ * charts: rolling/cumulative aggregations over `TimeSeriesPoint[]` server
+ * scrapes and per-request timeline records. No React, no SVG — everything
+ * here is unit-testable in isolation (see time-series-math.test.ts).
+ */
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+
+/** One drawable line in a TimeSeriesChart: legend name, points, and stroke styling. */
+export interface ChartSeries {
+ name: string; // display label, e.g. 'Input (avg n=50)'
+ /** The line to draw (caller pre-smooths if desired). */
+ data: TimeSeriesPoint[];
+ /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */
+ rawData?: TimeSeriesPoint[];
+ color: string; // CSS color string, e.g. '#3b82f6'
+ /** Override default stroke width (1.8). Use higher values for emphasis lines. */
+ strokeWidth?: number;
+ /** Stroke opacity (0..1). Use < 1 for background/underlay lines. */
+ strokeOpacity?: number;
+ /** Hide from the hover legend (e.g. per-engine underlay lines that
+ * would clutter the tooltip). The path still renders. */
+ hideFromHover?: boolean;
+}
+
+export type RequestMetric = 'interactivity' | 'ttft' | 'e2e'; // per-request view built by rollingRequestMetric
+export type RequestPercentile = 'p75' | 'p90'; // mapped to q = 0.75 / 0.9 in rollingRequestMetric
+export type ThroughputSeriesKey = 'input' | 'decode'; // toggleable lines in buildThroughputChartSeries
+
+/** Toggle `key` in `selected` while preserving the at-least-one invariant: the same set
+ *  instance comes back when `key` is its only member, otherwise a fresh copy (no mutation). */
+export function toggleThroughputSeries(
+  selected: ReadonlySet<ThroughputSeriesKey>,
+  key: ThroughputSeriesKey,
+): ReadonlySet<ThroughputSeriesKey> {
+  if (selected.has(key) && selected.size === 1) return selected;
+  const next = new Set(selected);
+  if (!next.delete(key)) next.add(key); // delete() reports whether `key` was present
+  return next;
+}
+
+/** Linear-interpolated quantile `q` of an ascending-sorted array (matches numpy's default method). */
+export function quantile(sortedAsc: number[], q: number): number { // NOTE(review): empty input is not guarded
+ if (sortedAsc.length === 1) return sortedAsc[0]!; // single sample: nothing to interpolate
+ const pos = (sortedAsc.length - 1) * q; // fractional rank into the sorted array
+ const lo = Math.floor(pos);
+ const hi = Math.ceil(pos);
+ if (lo === hi) return sortedAsc[lo]!; // rank lands exactly on a sample
+ return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+/** Linear-interpolated value at time `t` from a time-sorted series; clamps to the first/last value outside its range, null when empty. */
+export function interpAt(data: TimeSeriesPoint[], t: number): number | null {
+ if (data.length === 0) return null;
+ if (t <= data[0]!.t) return data[0]!.value;
+ if (t >= data.at(-1)!.t) return data.at(-1)!.value;
+ // Binary search for the bracketing pair; invariant: data[lo].t <= t < data[hi].t.
+ let lo = 0;
+ let hi = data.length - 1;
+ while (hi - lo > 1) {
+ const mid = (lo + hi) >> 1;
+ if (data[mid]!.t <= t) lo = mid;
+ else hi = mid;
+ }
+ const a = data[lo]!;
+ const b = data[hi]!;
+ if (b.t === a.t) return a.value; // zero-width segment: avoid dividing by zero
+ const frac = (t - a.t) / (b.t - a.t);
+ return a.value + (b.value - a.value) * frac;
+}
+
+/**
+ * Build per-request `raw` samples plus a trailing `windowSize`-request `trend`
+ * percentile and a run-to-date `cumulative` percentile, keyed by end time (s).
+ *
+ * The percentile is computed in latency space. Interactivity then inverts
+ * the selected TPOT percentile, matching the aggregate chart convention:
+ * P90 interactivity = 1 / P90 TPOT (a conservative tail-latency view).
+ */
+export function rollingRequestMetric(
+ requests: readonly RequestRecord[],
+ metric: RequestMetric,
+ percentile: RequestPercentile,
+ windowSize = 50,
+): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } {
+ const q = percentile === 'p75' ? 0.75 : 0.9;
+ // Phase is the caller's concern — the agentic detail page passes a
+ // phase-scoped (warmup or profiling) timeline. Here we only drop cancelled
+ // requests and samples without a usable latency value.
+ const samples = requests
+ .filter((request) => !request.cancelled)
+ .flatMap((request) => {
+ const latencyMs =
+ metric === 'ttft'
+ ? request.ttftMs
+ : metric === 'e2e'
+ ? (request.end - request.start) / 1e6 // E2E: HTTP request start → final response byte
+ : request.tpotMs;
+ if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return [];
+ return [{ t: request.end / 1e9, latencyMs }];
+ })
+ .toSorted((a, b) => a.t - b.t);
+
+ const raw = samples.map(({ t, latencyMs }) => ({
+ t,
+ value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000, // 1 / TPOT per second, else seconds
+ }));
+ const trend = samples.map(({ t }, i) => {
+ const start = Math.max(0, i - Math.max(1, windowSize) + 1); // window = last `windowSize` samples ending at i
+ const sorted = samples
+ .slice(start, i + 1)
+ .map((sample) => sample.latencyMs)
+ .toSorted((a, b) => a - b);
+ const latencyMs = quantile(sorted, q);
+ return { t, value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000 };
+ });
+ const prefixLatencies: number[] = []; // every latency seen so far, kept ascending
+ const cumulative = samples.map(({ t, latencyMs }) => {
+ let lo = 0;
+ let hi = prefixLatencies.length;
+ while (lo < hi) {
+ const mid = (lo + hi) >> 1;
+ if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1;
+ else hi = mid;
+ }
+ prefixLatencies.splice(lo, 0, latencyMs); // sorted insert after any equal values
+ const cumulativeLatencyMs = quantile(prefixLatencies, q);
+ return {
+ t,
+ value: metric === 'interactivity' ? 1000 / cumulativeLatencyMs : cumulativeLatencyMs / 1000,
+ };
+ });
+
+ return { raw, trend, cumulative };
+}
+
+/**
+ * Time-weighted rolling average over a `windowS`-second trailing window.
+ * Treats the input as a step function (value held constant between
+ * samples) and integrates over the trailing window, dividing by the
+ * window length (shorter where the window start is clamped at t = 0).
+ * Suits irregularly-sampled event series (e.g. request start/end events) where
+ * the sample-count `rollingAverage` would over-weight bursts of close events.
+ */
+export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] {
+ if (data.length === 0 || windowS <= 0) return data; // nothing to smooth: input returned as-is
+ const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+ for (let i = 0; i < data.length; i++) {
+ const tEnd = data[i]!.t;
+ const tStart = Math.max(0, tEnd - windowS);
+ // Find the first sample j whose t is >= tStart; the step value at
+ // tStart is data[j-1].value if j > 0, else data[0].value.
+ let j = 0;
+ while (j < data.length && data[j]!.t < tStart) j++;
+ let prevT = tStart;
+ let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value;
+ let area = 0;
+ for (; j <= i; j++) {
+ const curT = data[j]!.t;
+ area += prevV * (curT - prevT); // rectangle under the value held since prevT
+ prevT = curT;
+ prevV = data[j]!.value;
+ }
+ const dur = tEnd - tStart;
+ out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value }; // empty window → raw value
+ }
+ return out;
+}
+
+/** Centered rolling mean over samples i ± floor(windowSize / 2), truncated at the array ends. */
+export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
+ if (data.length === 0 || windowSize <= 1) return data; // nothing to smooth: input returned as-is
+ const half = Math.floor(windowSize / 2); // an even windowSize therefore spans windowSize + 1 samples
+ const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+ for (let i = 0; i < data.length; i++) {
+ const start = Math.max(0, i - half);
+ const end = Math.min(data.length, i + half + 1); // exclusive
+ let sum = 0;
+ let n = 0;
+ for (let j = start; j < end; j++) {
+ sum += data[j]!.value;
+ n++;
+ }
+ out[i] = { t: data[i]!.t, value: n > 0 ? sum / n : 0 };
+ }
+ return out;
+}
+
+/**
+ * Expanding-window cumulative mean from index 0..i (unweighted by time).
+ *
+ * `burnInS` suppresses rendering during the unstable startup interval while
+ * retaining those samples in every later average. This avoids visually
+ * promoting a single bursty counter bucket without changing the run-to-date
+ * meaning of the line once it appears.
+ */
+export function cumulativeAverage(data: TimeSeriesPoint[], burnInS = 0): TimeSeriesPoint[] {
+ if (data.length === 0) return data;
+ const out: TimeSeriesPoint[] = [];
+ const firstT = data[0]!.t; // burn-in is measured from the first sample, not from t = 0
+ let sum = 0;
+ for (let i = 0; i < data.length; i++) {
+ sum += data[i]!.value; // burn-in samples still feed the running sum
+ if (data[i]!.t - firstT >= burnInS) {
+ out.push({ t: data[i]!.t, value: sum / (i + 1) });
+ }
+ }
+ return out;
+}
+
+/**
+ * Run-to-date time-weighted average of a step series. Points with a non-finite
+ * `t` or `value` are dropped before sorting so they cannot disturb the order.
+ * Duplicate timestamps are coalesced to their final value before integration;
+ * this is important for request handoffs where several start/end events occur
+ * at the same instant. Each value is held until the next timestamp.
+ */
+export function cumulativeTimeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  if (data.length === 0) return [];
+  const points: TimeSeriesPoint[] = [];
+  const finite = data.filter((p) => Number.isFinite(p.t) && Number.isFinite(p.value));
+  for (const point of finite.toSorted((a, b) => a.t - b.t)) {
+    const previous = points.at(-1);
+    if (previous?.t === point.t) previous.value = point.value; // same instant: last value wins
+    else points.push({ ...point }); // copy so the caller's points are never mutated
+  }
+  if (points.length === 0) return [];
+
+  const firstT = points[0]!.t;
+  let previousT = firstT;
+  let previousValue = points[0]!.value;
+  let area = 0;
+  return points.map((point, index) => {
+    if (index === 0) return { t: point.t, value: point.value };
+    area += previousValue * (point.t - previousT); // value held since the previous timestamp
+    const duration = point.t - firstT;
+    previousT = point.t;
+    previousValue = point.value;
+    return { t: point.t, value: duration > 0 ? area / duration : point.value };
+  });
+}
+
+/**
+ * Cumulative count of successfully completed (non-cancelled) requests by end
+ * time (seconds). Phase is the caller's concern — pass a phase-scoped timeline.
+ */
+export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] {
+ const completionTimes = requests
+ .filter((request) => !request.cancelled)
+ .map((request) => request.end / 1e9)
+ .filter(Number.isFinite)
+ .toSorted((a, b) => a - b);
+ if (completionTimes.length === 0) return [];
+ return [{ t: 0, value: 0 }, ...completionTimes.map((t, index) => ({ t, value: index + 1 }))]; // leading zero anchor at t = 0
+}
+
+/**
+ * Retrospective average `metric` length (ISL or OSL) over the requests in flight after each
+ * start/end instant; 0 when none are active. OSL is the request's final observed length.
+ */
+export function averageSequenceLengthInFlight(
+  requests: readonly RequestRecord[],
+  metric: 'isl' | 'osl',
+): TimeSeriesPoint[] {
+  const events = new Map<number, { tokenDelta: number; countDelta: number }>();
+  const addEvent = (t: number, tokenDelta: number, countDelta: number) => {
+    const current = events.get(t) ?? { tokenDelta: 0, countDelta: 0 };
+    current.tokenDelta += tokenDelta;
+    current.countDelta += countDelta;
+    events.set(t, current); // events sharing an instant merge into one net delta
+  };
+
+  // Phase is the caller's concern — pass a phase-scoped timeline.
+  for (const request of requests) {
+    const tokens = request[metric];
+    if (
+      request.cancelled ||
+      tokens === null ||
+      !Number.isFinite(tokens) ||
+      tokens < 0 ||
+      request.end < request.start
+    ) {
+      continue;
+    }
+    addEvent(request.start / 1e9, tokens, 1);
+    addEvent(request.end / 1e9, -tokens, -1);
+  }
+
+  let tokensInFlight = 0;
+  let requestsInFlight = 0;
+  return [...events.entries()]
+    .toSorted((a, b) => a[0] - b[0])
+    .map(([t, event]) => {
+      tokensInFlight += event.tokenDelta;
+      requestsInFlight += event.countDelta;
+      return { t, value: requestsInFlight > 0 ? tokensInFlight / requestsInFlight : 0 };
+    });
+}
+
+// A promptTokensBySource bucket label denotes tokens served from some cache
+// tier (local prefix cache, offloaded/host KV, remote KV transfer) rather than
+// freshly computed. Case-insensitive substring match covering vllm labels
+// (`local_cache_hit`, `external_kv_transfer`) and the sglang labels the
+// chart-series builder emits (`cache hit (HBM)`, `cache hit (CPU offload)`, `cache hit`).
+const CACHE_SOURCE_RE = /cache|hit|transfer|reuse/iu;
+
+/**
+ * Cumulative "unique" (freshly prefill-computed) input tokens from the
+ * promptTokensBySource breakdown: total prompt tokens minus everything served
+ * from a cache tier. The breakdown's buckets sum to the real prompt-token
+ * total per scrape, so this is internally consistent and naturally monotonic.
+ *
+ * Preferred over `cumulativeDifferenceMonotonic(prefillTps, prefixCacheHitsTps)`
+ * because `vllm:prefix_cache_hits` re-counts tokens across chunked-prefill /
+ * preemption scheduler passes — its cumulative routinely exceeds the prompt
+ * tokens ever received, which drove the difference deeply negative and froze
+ * the monotonic-clamped curve at whatever it reached in the first few seconds.
+ *
+ * Any bucket whose label isn't recognizably a cache tier counts as computed
+ * (the safe direction for "unique"): a new fresh-compute label over-reports
+ * unique slightly rather than silently freezing the line. Returns [] when the
+ * breakdown is missing or holds only cache-tier buckets, so the caller can fall back.
+ */
+export function cumulativeUniqueInputTokens(
+  promptTokensBySource: Record<string, TimeSeriesPoint[]> | undefined,
+): TimeSeriesPoint[] {
+  if (!promptTokensBySource) return [];
+  const computedByT = new Map<number, number>(); // timestamp → computed tokens summed across buckets
+  let sawComputed = false;
+  for (const [source, series] of Object.entries(promptTokensBySource)) {
+    if (CACHE_SOURCE_RE.test(source)) continue; // cache-tier bucket: not freshly computed
+    sawComputed = true;
+    for (const p of series) computedByT.set(p.t, (computedByT.get(p.t) ?? 0) + p.value);
+  }
+  if (!sawComputed) return [];
+  const out: TimeSeriesPoint[] = [];
+  let sum = 0;
+  for (const t of [...computedByT.keys()].toSorted((x, y) => x - y)) {
+    sum += computedByT.get(t)!;
+    out.push({ t, value: sum });
+  }
+  return out;
+}
+
+/**
+ * Per-event step series: at each request start/end, sum the ISLs of
+ * currently-active requests across distinct `cid`s. Within a single
+ * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N),
+ * so each cid contributes at most one in-flight ISL at a time. Across
+ * different cids we assume content is independent (parent ↔ subagent
+ * and conv ↔ conv share negligible prefix in practice — cross-conv
+ * dedup added ~0.25 pp to theoretical hit rate, so treating them as
+ * independent is a tight approximation of the true in-flight unique
+ * token count).
+ *
+ * Output is a step function: a leading `{ t: 0, value: 0 }` anchor, then
+ * one point per event, value held until the next event. `t` is the event's
+ * ns timestamp / 1e9 — seconds on the timeline's own origin, not rebased.
+ */
+export function inflightUniqueTokens(
+  requests: readonly { cid: string; start: number; end: number; isl: number | null }[],
+): TimeSeriesPoint[] {
+  if (requests.length === 0) return [];
+  // The request_timeline timestamps are ns-relative to its own origin.
+  // Convert events to seconds and emit a step series.
+  interface Event {
+    tNs: number;
+    kind: 'start' | 'end';
+    cid: string;
+    isl: number;
+  }
+  const events: Event[] = [];
+  for (const r of requests) {
+    const isl = r.isl ?? 0;
+    if (isl <= 0) continue; // requests without a positive ISL contribute nothing
+    events.push(
+      { tNs: r.start, kind: 'start', cid: r.cid, isl },
+      { tNs: r.end, kind: 'end', cid: r.cid, isl },
+    );
+  }
+  if (events.length === 0) return [];
+  // Sort by time; on ties 'end' precedes 'start' so a same-instant turn handoff within one
+  // cid doesn't double-count. Same-kind ties compare equal (consistent, stable comparator).
+  events.sort((a, b) => a.tNs - b.tNs || (a.kind === b.kind ? 0 : a.kind === 'end' ? -1 : 1));
+
+  // Active ISL per cid (max in case the same cid somehow has overlapping
+  // events; in practice it's always 0 or 1 request at a time per cid).
+  const activeByCid = new Map<string, number>();
+  let total = 0;
+  const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }];
+  for (const e of events) {
+    const tSec = e.tNs / 1e9;
+    if (e.kind === 'start') {
+      const prev = activeByCid.get(e.cid) ?? 0;
+      const next = Math.max(prev, e.isl);
+      activeByCid.set(e.cid, next);
+      total += next - prev;
+    } else {
+      const cur = activeByCid.get(e.cid) ?? 0;
+      if (cur > 0) {
+        total -= cur;
+        activeByCid.delete(e.cid); // first 'end' clears the cid, even if an overlapping request is still open
+      }
+    }
+    out.push({ t: tSec, value: Math.max(0, total) });
+  }
+  return out;
+}
+
+/**
+ * Monotonic-non-decreasing cumulative difference of two rate series:
+ * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce
+ * a running max so the curve never dips below its prior value.
+ *
+ * Use this to plot things like "cumulative cache-missed tokens" where the
+ * true value can only ever grow, but the underlying per-tick rates can
+ * temporarily look negative due to counter timing skew between scrapes
+ * (vllm's `prefix_cache_hits` and `prompt_tokens` counters can lag each
+ * other by ~5-10 s in our data even though their lifetime totals agree).
+ *
+ * `a` and `b` may have different (or overlapping) timestamp sets — both
+ * are unioned and walked in time order. Output has one point per unique
+ * timestamp present in either input.
+ */
+export function cumulativeDifferenceMonotonic(
+ a: TimeSeriesPoint[],
+ b: TimeSeriesPoint[],
+): TimeSeriesPoint[] {
+ const aByT = new Map(a.map((p) => [p.t, p.value])); // a repeated t within one input keeps only its last value
+ const bByT = new Map(b.map((p) => [p.t, p.value]));
+ const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y);
+ const out: TimeSeriesPoint[] = Array.from({ length: allT.length });
+ let cumA = 0;
+ let cumB = 0;
+ let runningMax = 0; // starts at 0, so the output is never negative
+ for (let i = 0; i < allT.length; i++) {
+ const t = allT[i]!;
+ cumA += aByT.get(t) ?? 0; // a timestamp missing from one input adds 0 for it
+ cumB += bByT.get(t) ?? 0;
+ const diff = cumA - cumB;
+ if (diff > runningMax) runningMax = diff;
+ out[i] = { t, value: runningMax };
+ }
+ return out;
+}
+
+/** Index-wise sum of two series, truncated to the shorter one; `t` is taken from `a` and never checked against `b`. */
+function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
+ const n = Math.min(a.length, b.length);
+ const out: TimeSeriesPoint[] = Array.from({ length: n });
+ for (let i = 0; i < n; i++) {
+ out[i] = { t: a[i]!.t, value: a[i]!.value + b[i]!.value };
+ }
+ return out;
+}
+
+/** Build throughput lines for the selected input/decode signals; the summed total is added only when both are selected. */
+export function buildThroughputChartSeries(
+  input: TimeSeriesPoint[],
+  decode: TimeSeriesPoint[],
+  selected: ReadonlySet<ThroughputSeriesKey>,
+): ChartSeries[] {
+  const series: ChartSeries[] = [];
+  if (selected.has('input')) {
+    series.push({
+      name: 'Input (avg n=50)',
+      data: rollingAverage(input, 50),
+      color: '#3b82f6',
+      strokeWidth: 1.6,
+    });
+  }
+  if (selected.has('decode')) {
+    series.push({
+      name: 'Decode (avg n=50)',
+      data: rollingAverage(decode, 50),
+      color: '#f97316',
+      strokeWidth: 1.6,
+    });
+  }
+  if (selected.size === 2) { // both signals visible → add the run-to-date total on top
+    series.push({
+      name: 'Total running avg (60s burn-in)',
+      data: cumulativeAverage(sumSeries(input, decode), 60), // summed by index, hidden for the first 60 s
+      color: '#ef4444',
+      strokeWidth: 3,
+    });
+  }
+  return series;
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-bars.tsx b/packages/app/src/components/inference/agentic-point/timeline-bars.tsx
new file mode 100644
index 00000000..a5444cb2
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-bars.tsx
@@ -0,0 +1,252 @@
+'use client';
+
+import { memo } from 'react';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import {
+ CHART_WIDTH,
+ HEADER_HEIGHT,
+ PADDING_RIGHT,
+ ROW_GAP,
+ ROW_HEIGHT,
+ timelineSvgHeight,
+} from './timeline-layout';
+import { formatTickLabel } from './timeline-format';
+import { conversationHref, type RequestTimelineRow } from './timeline-rows';
+
+/** Phase color overlay drawn as a thin strip at the bottom of each bar; `unknown` is the fallback for unlisted phases. */
+const PHASE_COLORS: Record<string, string> = {
+  profiling: '#22c55e',
+  warmup: '#94a3b8',
+  unknown: '#64748b',
+};
+
+// Time-axis tick spacing candidates in milliseconds, ascending (100 ms … 30 min). The
+// renderer targets ~8 ticks across the visible window and takes the first step >= that target.
+const NICE_TICK_MS = [
+ 100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000,
+];
+
+export interface TimelineBarsProps {
+ rows: RequestTimelineRow[];
+ expandedSubagents: ReadonlySet; // NOTE(review): element type is missing here — presumably ReadonlySet<string> of row keys; confirm
+ /** Absolute ns timestamp of the visible data's origin (min credit). */
+ dataStart: number;
+ /** Visible window (ns offsets from dataStart). */
+ vStart: number;
+ vEnd: number;
+ datasetSlug?: string | null; // falsy → bars render as plain groups instead of links
+ onBarHover: (e: React.MouseEvent, row: RequestTimelineRow, req: RequestRecord) => void;
+ onBarLeave: () => void;
+ /** Plain left-click SPA navigation; modified clicks fall through to the href. */
+ onBarClick: (e: React.MouseEvent, req: RequestRecord) => void;
+}
+
+/**
+ * The static SVG content of the timeline: time axis, row separators, and every
+ * request bar. Memoized so tooltip/cursor mousemove state changes in the parent
+ * (which fire on every pointer move) don't re-render thousands of bar rects —
+ * only zoom/pan, filter, and expansion changes reach this subtree.
+ */
+export const TimelineBars = memo(
+ ({
+ rows,
+ expandedSubagents,
+ dataStart,
+ vStart,
+ vEnd,
+ datasetSlug,
+ onBarHover,
+ onBarLeave,
+ onBarClick,
+ }: TimelineBarsProps) => {
+ const svgHeight = timelineSvgHeight(rows.length);
+ const visibleDur = Math.max(vEnd - vStart, 1);
+ const scale = (CHART_WIDTH - PADDING_RIGHT) / visibleDur;
+ // Absolute ns timestamp → x px: rebase onto dataStart, then onto the visible window start.
+ const xOf = (ns: number) => (ns - dataStart - vStart) * scale;
+
+ // Time-axis ticks (~8 across the visible window, snapped up to a NICE_TICK_MS step).
+ const targetMs = visibleDur / 1e6 / 8;
+ const tickMs = NICE_TICK_MS.find((n) => n >= targetMs) ?? targetMs; // past the largest candidate: use the raw target
+ const tickNs = tickMs * 1e6;
+ const ticks: number[] = [];
+ const tickStart = Math.floor(vStart / tickNs) * tickNs;
+ for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) {
+ if (t >= vStart && t <= vEnd) ticks.push(t);
+ }
+
+ return (
+ <>
+ {/* Header / time-axis baseline */}
+
+
+ {/* Time axis ticks */}
+ {ticks.map((t) => {
+ // Convert visible-window ns offset → x px (the tick array
+ // is already in dataStart-relative coords).
+ const x = (t - vStart) * scale;
+ return (
+
+
+
+ {formatTickLabel(t)}
+
+
+ );
+ })}
+
+ {/* Row separators */}
+ {rows.map((row, idx) => (
+
+ ))}
+
+ {/* Request bars */}
+ {rows.map((row, rowIdx) => {
+ const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+ const barH = ROW_HEIGHT - 4;
+ // For multi-stream subagent containers, suppress the union
+ // bars when expanded — the child stream rows draw them
+ // individually instead, so we'd double-draw otherwise.
+ if (
+ row.kind === 'subagent' &&
+ (row.streamCount ?? 1) > 1 &&
+ expandedSubagents.has(row.key)
+ ) {
+ return null;
+ }
+ return row.requests.map((req) => {
+ const xCredit = xOf(req.credit);
+ const xStart = xOf(req.start);
+ const xEnd = xOf(req.end);
+ // Cull bars entirely outside the visible window so big
+ // benchmarks don't render thousands of zero-width rects.
+ if (xEnd < -2 || xCredit > CHART_WIDTH + 2) return null;
+ const runW = Math.max(xEnd - xStart, 1);
+ const queueW = Math.max(xStart - xCredit, 0);
+ const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+ const barKey = `${req.cid}-${req.ti}-${req.start}`;
+ const barChildren = (
+ <>
+ {/* Queue lead-in (faint) — only drawn when noticeable. */}
+ {queueW >= 1 && (
+
+ )}
+ {/* Main bar — opacity stepped down with depth so
+ parent > subagent > stream reads visually. */}
+
+ {/* Phase strip at bottom */}
+
+ {/* Cancelled X overlay */}
+ {req.cancelled && runW > 6 && (
+
+ )}
+ >
+ );
+ // No source dataset → not linkable; plain group.
+ if (!datasetSlug) {
+ return (
+ onBarHover(e, row, req)}
+ onMouseLeave={onBarLeave}
+ >
+ {barChildren}
+
+ );
+ }
+ // Linkable: render a real SVG anchor with the conversation
+ // href so the browser's native "open in new tab" works
+ // (right-click menu, ⌘/Ctrl-click, middle-click). Plain
+ // left-click stays an in-app navigation; modified or
+ // non-primary clicks fall through to the browser. Suppress
+ // the native link drag so it doesn't fight the pan gesture.
+ return (
+ onBarHover(e, row, req)}
+ onMouseLeave={onBarLeave}
+ onClick={(e) => onBarClick(e, req)}
+ onDragStart={(e) => e.preventDefault()}
+ style={{ cursor: 'pointer' }}
+ >
+ {barChildren}
+
+ );
+ });
+ })}
+ >
+ );
+ },
+);
diff --git a/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts
new file mode 100644
index 00000000..47c0f034
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts
@@ -0,0 +1,69 @@
+import { describe, expect, it } from 'vitest';
+
+import { countLeq, countLt, cursorStatsAt } from './timeline-cursor-stats';
+
+describe('countLeq / countLt', () => {
+ const sorted = [1, 3, 3, 5, 9];
+
+ it('counts values <= / < target with binary search', () => {
+ expect(countLeq(sorted, 3)).toBe(3); // 1, 3, 3 — duplicates are all counted
+ expect(countLt(sorted, 3)).toBe(1); // only 1 is strictly below
+ expect(countLeq(sorted, 0)).toBe(0); // target below every value
+ expect(countLt(sorted, 0)).toBe(0);
+ expect(countLeq(sorted, 9)).toBe(5); // target equals the maximum
+ expect(countLt(sorted, 9)).toBe(4);
+ expect(countLeq(sorted, 100)).toBe(5); // target above every value
+ });
+
+ it('handles empty arrays', () => {
+ expect(countLeq([], 1)).toBe(0);
+ expect(countLt([], 1)).toBe(0);
+ });
+});
+
+describe('cursorStatsAt', () => {
+ // Three requests on a shared clock:
+ // A: credit 0, start 2, end 10
+ // B: credit 1, start 5, end 8
+ // C: credit 12, start 14, end 20
+ const times = {
+ credits: [0, 1, 12],
+ starts: [2, 5, 14],
+ ends: [8, 10, 20], // each column is sorted on its own: B's end (8) precedes A's (10)
+ };
+
+ it('counts running, waiting, and completed at an instant', () => {
+ // t=3: A running, B credited but not started, C not yet credited.
+ expect(cursorStatsAt(times, 3)).toEqual({
+ running: 1,
+ waiting: 1,
+ completed: 0,
+ inflight: 2,
+ });
+ // t=6: A and B running.
+ expect(cursorStatsAt(times, 6)).toEqual({
+ running: 2,
+ waiting: 0,
+ completed: 0,
+ inflight: 2,
+ });
+ // t=13: A and B done, C waiting in queue.
+ expect(cursorStatsAt(times, 13)).toEqual({
+ running: 0,
+ waiting: 1,
+ completed: 2,
+ inflight: 1,
+ });
+ });
+
+ it('counts a request as still running at its exact end instant', () => {
+ // end < t (strict) excludes the request from "ended", so at t === end it
+ // still counts as running — and `completed` (end <= t) counts it as well.
+ expect(cursorStatsAt(times, 8).running).toBe(2);
+ expect(cursorStatsAt(times, 8).completed).toBe(1);
+ });
+
+ it('never returns negative counts on inconsistent columns', () => {
+ expect(cursorStatsAt({ credits: [], starts: [0], ends: [] }, 5).waiting).toBe(0); // raw 0 - 1 = -1, clamped
+ });
+});
diff --git a/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts
new file mode 100644
index 00000000..801cec95
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts
@@ -0,0 +1,57 @@
+/**
+ * Pure math behind the cursor stats popover: count how many requests are
+ * running / waiting / completed at a given instant, in O(log n) per query via
+ * binary search over pre-sorted timestamp columns.
+ */
+
+/** Pre-sorted (ascending) timestamp columns for one filtered request set; each column is sorted independently. */
+export interface SortedRequestTimes {
+ credits: number[];
+ starts: number[];
+ ends: number[];
+}
+
+export interface CursorStats {
+ running: number; // started and not yet strictly past their end
+ waiting: number; // credited but not yet started
+ completed: number; // end <= t
+ inflight: number; // running + waiting
+}
+
+/** Number of values in a sorted ascending array that are <= target (binary search for the upper bound). */
+export function countLeq(sorted: number[], target: number): number {
+ let lo = 0;
+ let hi = sorted.length; // half-open search range [lo, hi)
+ while (lo < hi) {
+ const mid = (lo + hi) >>> 1;
+ if (sorted[mid]! <= target) lo = mid + 1;
+ else hi = mid;
+ }
+ return lo; // index of the first value > target, i.e. the count of values <= target
+}
+
+/** Number of values in a sorted ascending array that are < target (binary search for the lower bound). */
+export function countLt(sorted: number[], target: number): number {
+ let lo = 0;
+ let hi = sorted.length; // half-open search range [lo, hi)
+ while (lo < hi) {
+ const mid = (lo + hi) >>> 1;
+ if (sorted[mid]! < target) lo = mid + 1;
+ else hi = mid;
+ }
+ return lo; // index of the first value >= target, i.e. the count of values < target
+}
+
+/**
+ * Request counts at time t (ns offset on the same axis as the sorted columns):
+ * running = #(start <= t) - #(end < t), clamped at 0
+ * waiting = #(credit <= t) - #(start <= t), clamped at 0
+ * completed = #(end <= t); inflight = running + waiting
+ */
+export function cursorStatsAt(times: SortedRequestTimes, t: number): CursorStats {
+ const startsLeq = countLeq(times.starts, t);
+ const running = Math.max(0, startsLeq - countLt(times.ends, t)); // strict <: a request still runs at t === end
+ const waiting = Math.max(0, countLeq(times.credits, t) - startsLeq);
+ const completed = countLeq(times.ends, t); // at t === end a request is counted here and in `running`
+ return { running, waiting, completed, inflight: running + waiting };
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-format.ts b/packages/app/src/components/inference/agentic-point/timeline-format.ts
new file mode 100644
index 00000000..1c0020f3
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-format.ts
@@ -0,0 +1,15 @@
+/** Time formatting shared by the timeline axis, header stats, and tooltips. */
+
+/** Format ns offset → "+250ms" / "+2.5s" / "+12s" / "+1.2m" (one decimal for seconds only below 10 s). */
+export function formatTickLabel(ns: number): string {
+ const ms = ns / 1e6;
+ if (ms < 1000) return `+${ms.toFixed(0)}ms`;
+ if (ms < 60_000) return `+${(ms / 1000).toFixed(ms < 10_000 ? 1 : 0)}s`; // NOTE(review): rounding can print "+60s" just below a minute
+ return `+${(ms / 60_000).toFixed(1)}m`;
+}
+
+export function formatDuration(ms: number): string { // ms → "123ms" / "1.23s" / "1.23m", unit picked by magnitude
+ if (ms < 1000) return `${ms.toFixed(0)}ms`;
+ if (ms < 60_000) return `${(ms / 1000).toFixed(2)}s`;
+ return `${(ms / 60_000).toFixed(2)}m`;
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-layout.ts b/packages/app/src/components/inference/agentic-point/timeline-layout.ts
new file mode 100644
index 00000000..7043e487
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-layout.ts
@@ -0,0 +1,21 @@
+/** Layout constants shared by the timeline component and its SVG content. */
+
+// The timeline body is capped at this height and scrolls internally, so a run
+// with many conversations/workers doesn't make the card grow unbounded and push
+// the rest of the detail page down. Sized to show ~16 rows + the header.
+export const TIMELINE_BODY_MAX_HEIGHT = 480; // px; NOTE(review): (480 - 24) / 25 is ~18 rows by the constants below — confirm "~16"
+
+// Wide enough for a full 36-char conversation id at 10px font, plus the
+// indent + color stripe + count badge. Subagent rows inherit the same
+// width but truncate the longer "↳ subagent N · hash" tail with ellipsis.
+export const LABEL_WIDTH = 360;
+export const ROW_HEIGHT = 22; // px per row
+export const ROW_GAP = 3; // px between consecutive rows
+export const HEADER_HEIGHT = 24; // px reserved above the first row
+export const PADDING_RIGHT = 12; // px kept clear at the right edge of the chart
+export const CHART_WIDTH = 920; // px
+
+/** Chart height in px for a given row count: header + rowCount × (row + gap) + 6 px bottom padding. */
+export function timelineSvgHeight(rowCount: number): number {
+ return HEADER_HEIGHT + rowCount * (ROW_HEIGHT + ROW_GAP) + 6;
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-rows.ts b/packages/app/src/components/inference/agentic-point/timeline-rows.ts
new file mode 100644
index 00000000..14bda4ae
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-rows.ts
@@ -0,0 +1,476 @@
+/**
+ * Pure row-building logic for the request timeline: cid parsing, deep-link
+ * hrefs, stable ordering/coloring, and grouping requests into Gantt rows.
+ * No React — everything here is unit-testable data transformation.
+ */
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+export type RowMode = 'conversation' | 'worker';
+
+/**
+ * Dataset conversation id for a request cid: everything before the first `::`,
+ * i.e. the cid with any subagent/forked suffix (`::sa:…`, `::fa:…`) removed.
+ * This is exactly the `conv_id` stored in dataset_conversations, so it
+ * deep-links into /datasets/<slug>/conversations/<conv_id>.
+ */
+export function datasetConvId(cid: string): string {
+  // `split` with a limit of 1 keeps only the text before the first `::`;
+  // a cid without a suffix comes back whole.
+  const [base = cid] = cid.split('::', 1);
+  return base;
+}
+
+/**
+ * Subagent id encoded in a cid of the form
+ * `<parent>::sa:<agentId>[:s<N>|:aux:<id>]`, or null when the cid has no
+ * `::sa:` segment (a main-conversation request). The harness fans a single
+ * subagent into parallel streams with a `:s<N>` or `:aux:<id>` suffix; the
+ * dataset SubagentNode.agentId is the bare base (e.g. `subagent_001_b00fdc12`).
+ * Agent ids never contain a colon, so the base is everything up to the first
+ * colon after the marker.
+ */
+export function subagentIdOf(cid: string): string | null {
+  const marker = '::sa:';
+  const markerAt = cid.indexOf(marker);
+  if (markerAt < 0) {
+    return null;
+  }
+  const tail = cid.slice(markerAt + marker.length);
+  // Keep only the text before the first colon (the whole tail if there is none).
+  const [agentId = tail] = tail.split(':', 1);
+  return agentId;
+}
+
+/**
+ * Deep-link URL for the dataset conversation a request maps to. The query
+ * string always carries `turn`; it adds `raw` (and then `inner`) when the
+ * request has non-negative integer source indices, and `sa` for subagent
+ * requests that have no `inner` index, so the flamegraph can scroll to /
+ * highlight the exact node. Used both for SPA navigation on click and as the
+ * real `href` on the request bar so the browser's native "open in new tab"
+ * (right-click, ⌘/Ctrl-click, middle-click) works.
+ */
+export function conversationHref(datasetSlug: string, req: RequestRecord): string {
+  const isIndex = (value: unknown): value is number =>
+    typeof value === 'number' && Number.isInteger(value) && value >= 0;
+
+  const query = new URLSearchParams({ turn: String(req.ti) });
+  let hasInner = false;
+  if (isIndex(req.srcOuter)) {
+    query.set('raw', String(req.srcOuter));
+    if (isIndex(req.srcInner)) {
+      query.set('inner', String(req.srcInner));
+      hasInner = true;
+    }
+  }
+  if (!hasInner) {
+    // The subagent id is only sent when no inner index pins the node.
+    const subagent = subagentIdOf(req.cid);
+    if (subagent) query.set('sa', subagent);
+  }
+
+  const conversation = req.srcTrace ?? datasetConvId(req.cid);
+  const path = `/datasets/${datasetSlug}/conversations/${encodeURIComponent(conversation)}`;
+  return `${path}?${query.toString()}`;
+}
+
+/**
+ * Human-readable origin of a request: `raw <outer>` (plus ` / child <inner>`
+ * when an inner index is present) if the request carries a raw trace index,
+ * otherwise the 1-based replay turn.
+ */
+export function requestSourceLabel(req: RequestRecord): string {
+  const { srcOuter, srcInner, ti } = req;
+  if (typeof srcOuter !== 'number') {
+    return `replay turn ${ti + 1}`;
+  }
+  return typeof srcInner === 'number'
+    ? `raw ${srcOuter} / child ${srcInner}`
+    : `raw ${srcOuter}`;
+}
+
+export interface RequestIdleStats {
+  /** Total time between the first start and last end with no request running. */
+  idleNs: number;
+  /** Wall-clock span from the first request start to the final request end. */
+  spanNs: number;
+}
+
+/**
+ * Sweep the request intervals in start order, tracking how far the timeline is
+ * covered so far, and add up every gap where nothing is running. Requests with
+ * a non-finite bound or `end < start` are ignored. Queue time before a request
+ * starts is intentionally excluded: "in flight" means [start, end].
+ */
+export function requestIdleStats(requests: readonly RequestRecord[]): RequestIdleStats {
+  const spans: Array<{ start: number; end: number }> = [];
+  for (const { start, end } of requests) {
+    if (Number.isFinite(start) && Number.isFinite(end) && end >= start) {
+      spans.push({ start, end });
+    }
+  }
+  if (spans.length === 0) {
+    return { idleNs: 0, spanNs: 0 };
+  }
+  spans.sort((a, b) => a.start - b.start || a.end - b.end);
+
+  const origin = spans[0]!.start;
+  let coveredUntil = spans[0]!.end;
+  let idleNs = 0;
+  for (const { start, end } of spans.slice(1)) {
+    // A start beyond the covered range opens a gap with nothing in flight.
+    if (start > coveredUntil) idleNs += start - coveredUntil;
+    coveredUntil = Math.max(coveredUntil, end);
+  }
+  return { idleNs, spanNs: coveredUntil - origin };
+}
+
+/**
+ * Row color palette. A top-level group's color is picked by its stable row
+ * index modulo the palette length (not by hashing the row key), and nested
+ * rows reuse the color of their top-level group.
+ */
+const ROW_COLORS = [
+ '#3b82f6',
+ '#ef4444',
+ '#10b981',
+ '#f59e0b',
+ '#a855f7',
+ '#06b6d4',
+ '#f97316',
+ '#84cc16',
+ '#ec4899',
+ '#14b8a6',
+ '#8b5cf6',
+ '#eab308',
+];
+
+/**
+ * Row kinds:
+ *   parent   — top-level conversation (depth 0)
+ *   worker   — worker swimlane (depth 0, worker mode)
+ *   subagent — a subagent invocation (depth 1). Either a single
+ *              stream (renders its own bars), or a multi-stream
+ *              container whose bars are the union of its streams
+ *              when collapsed.
+ *   stream   — one :sN stream of a multi-stream subagent (depth 2).
+ *              Hidden by default; toggled in via the parent's chevron.
+ *   aux      — one :aux:N parallel lane. Always visible beneath its
+ *              owner: depth 2 under a subagent, depth 1 when it hangs
+ *              directly off the parent conversation.
+ */
+type RowKind = 'parent' | 'worker' | 'subagent' | 'stream' | 'aux';
+
+/** One row of the request timeline, as produced by buildRequestTimelineRows. */
+export interface RequestTimelineRow {
+ /** Row identifier: the group id, or a derived key for placeholder/nested rows. */
+ key: string;
+ /** Text shown for the row. */
+ label: string;
+ /** Palette color; nested rows share the color of their top-level group. */
+ color: string;
+ /** Requests drawn on this row, sorted by start. */
+ requests: RequestRecord[];
+ /** Nesting level: 0 for top-level rows, 1 or 2 for nested rows. */
+ depth: number;
+ kind: RowKind;
+ /** Number of streams under this subagent (>=1). Only set for subagent rows. */
+ streamCount?: number;
+ /** For stream and aux rows: the owning row's key (drives expand/collapse for streams). */
+ parentRowKey?: string;
+ /** Number of always-visible auxiliary lanes under this subagent. */
+ auxCount?: number;
+}
+
+/**
+ * Split a timeline cid into its parent conversation, subagent base, stream
+ * index and aux lane id.
+ *
+ * Conversation ids for subagent calls look like
+ *   <parent>::sa:<subagentBase>[:s<N>|:aux:<auxId>]
+ * The optional `:s<N>` suffix is set when the harness fans a single
+ * subagent into multiple parallel "streams" (interval-graph
+ * decomposition in weka_trace._pack_into_streams). We split it off so
+ * we can group every parallel lane under a single subagent header row.
+ *
+ * Aux lanes can also hang directly off the main conversation (no `::sa:`
+ * segment): `<parent>::aux:<id>` or `<parent>::aux:red:<id>`.
+ * These are parallel requests belonging to the main agent itself, so they
+ * nest under the parent conversation row rather than forming their own
+ * top-level group.
+ *
+ * @returns `parent` is the text before the first marker. `subagentBase` is null
+ *   when there is no `::sa:` segment. `stream` is the numeric `:s<N>` suffix or
+ *   null. `aux` is everything after the aux marker or null.
+ */
+export function splitTimelineCid(cid: string): {
+  parent: string;
+  subagentBase: string | null;
+  stream: number | null;
+  aux: string | null;
+} {
+  const saMarker = '::sa:';
+  const auxMarker = '::aux:';
+
+  const sep = cid.indexOf(saMarker);
+  if (sep === -1) {
+    // No subagent segment: either a main-agent aux lane or a plain conversation.
+    const auxSep = cid.indexOf(auxMarker);
+    if (auxSep !== -1) {
+      return {
+        parent: cid.slice(0, auxSep),
+        subagentBase: null,
+        stream: null,
+        aux: cid.slice(auxSep + auxMarker.length),
+      };
+    }
+    return { parent: cid, subagentBase: null, stream: null, aux: null };
+  }
+
+  const parent = cid.slice(0, sep);
+  const raw = cid.slice(sep + saMarker.length);
+
+  // `<base>:aux:<auxId>` — the base never contains a colon.
+  const auxMatch = /^(?<base>[^:]+):aux:(?<aux>.+)$/.exec(raw);
+  if (auxMatch) {
+    return {
+      parent,
+      subagentBase: auxMatch.groups!.base!,
+      stream: null,
+      aux: auxMatch.groups!.aux!,
+    };
+  }
+
+  // `<base>:s<N>` — a numbered primary stream.
+  const m = /^(?<base>.*):s(?<stream>\d+)$/.exec(raw);
+  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]), aux: null };
+  return { parent, subagentBase: raw, stream: null, aux: null };
+}
+
+/**
+ * Stable order/color index for the top-level row groups (conversations in
+ * conversation mode, workers in worker mode), keyed by group id and computed
+ * over the FULL (unfiltered) request set. Both the row ordering and the color
+ * palette are driven by this index, so a conversation/worker keeps the same
+ * position and color when the phase filter changes the visible subset — without
+ * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset
+ * is showing, making rows jump and swap colors.
+ *
+ * Groups that span BOTH phases sort first. The shared set is by definition
+ * present in either phase's view, so this leading block renders identically in
+ * both — a conversation that carries over from warmup into profiling stays on
+ * the exact same row when the toggle flips. Phase-exclusive groups follow, and
+ * only they reflow between views. Within each block the order key is the
+ * group's earliest request start across all phases; ties break on the group id
+ * for determinism.
+ *
+ * @param requests Every request of the run, regardless of phase.
+ * @param mode Grouping: by parent conversation id or by worker id.
+ * @returns Map from group id to its zero-based position.
+ */
+export function computeStableRowIndex(
+  requests: readonly RequestRecord[],
+  mode: RowMode,
+): Map<string, number> {
+  const firstStart = new Map<string, number>();
+  // Which phases each group appears in. Mirrors requestsForPhase's split:
+  // 'profiling' is exact, anything else counts as warmup.
+  const inProfiling = new Set<string>();
+  const inWarmup = new Set<string>();
+  for (const r of requests) {
+    const key = mode === 'conversation' ? splitTimelineCid(r.cid).parent : r.wid;
+    const cur = firstStart.get(key);
+    if (cur === undefined || r.start < cur) firstStart.set(key, r.start);
+    if (r.phase === 'profiling') inProfiling.add(key);
+    else inWarmup.add(key);
+  }
+  const spansBoth = (key: string) => inProfiling.has(key) && inWarmup.has(key);
+  const keys = [...firstStart.keys()].toSorted(
+    (a, b) =>
+      // Both-phase groups first, then earliest start, then group id.
+      Number(spansBoth(b)) - Number(spansBoth(a)) ||
+      firstStart.get(a)! - firstStart.get(b)! ||
+      (a < b ? -1 : a > b ? 1 : 0),
+  );
+  const index = new Map<string, number>();
+  keys.forEach((key, i) => index.set(key, i));
+  return index;
+}
+
+/**
+ * Group requests into rows. In conversation mode, output order is:
+ *   parent_conv
+ *     aux 011 · parallel     (main-agent aux lane, always visible)
+ *     subagent_001           (collapsed by default, container)
+ *       :s0                  (hidden unless expanded)
+ *       :s1
+ *       aux 011 · parallel   (always visible)
+ *     subagent_002
+ *     ...
+ * In any other mode the output is one flat row per worker id.
+ *
+ * `expandedSubagents` controls which subagent containers reveal their
+ * stream children. Bars on a collapsed subagent are the UNION of all its
+ * streams' requests — overlapping bars visually communicate the
+ * stream-level parallelism without expanding.
+ *
+ * `stableRowIndex` (optional) pins the top-level order + color per group so they
+ * survive phase-filter changes; when omitted it's derived from `requests` (the
+ * legacy self-contained behavior, used by unit tests).
+ *
+ * Aux lanes and subagents are ordered by their EARLIEST request start. Every
+ * lane is sorted before that order is computed, so the result does not depend
+ * on the order in which `requests` arrives.
+ *
+ * @param requests Requests to lay out (not mutated; rows hold fresh arrays).
+ * @param mode `'conversation'` for the nested tree, anything else for workers.
+ * @param expandedSubagents Keys (`<parent>::sa:<base>`) of expanded subagents.
+ * @param stableRowIndex Optional top-level order/color index keyed by group id.
+ * @returns Rows in display order, each with its requests sorted by start.
+ */
+export function buildRequestTimelineRows(
+  requests: RequestRecord[],
+  mode: RowMode,
+  expandedSubagents: ReadonlySet<string>,
+  stableRowIndex?: ReadonlyMap<string, number>,
+): RequestTimelineRow[] {
+  const index = stableRowIndex ?? computeStableRowIndex(requests, mode);
+  const colorFor = (key: string) =>
+    ROW_COLORS[
+      (((index.get(key) ?? 0) % ROW_COLORS.length) + ROW_COLORS.length) % ROW_COLORS.length
+    ]!;
+  const orderOf = (key: string) => index.get(key) ?? Number.POSITIVE_INFINITY;
+  const byStart = (a: RequestRecord, b: RequestRecord) => a.start - b.start;
+  // Earliest start of a lane. Only meaningful once the lane is sorted by start.
+  const laneStart = (reqs: readonly RequestRecord[]) =>
+    reqs[0]?.start ?? Number.POSITIVE_INFINITY;
+
+  if (mode !== 'conversation') {
+    // Worker mode: flat rows, sorted by first activity.
+    const groups = new Map<string, RequestRecord[]>();
+    for (const r of requests) {
+      let list = groups.get(r.wid);
+      if (!list) {
+        list = [];
+        groups.set(r.wid, list);
+      }
+      list.push(r);
+    }
+    const rows: RequestTimelineRow[] = [];
+    for (const [key, list] of groups) {
+      list.sort(byStart);
+      rows.push({
+        key,
+        label: shortenWid(key),
+        color: colorFor(key),
+        requests: list,
+        depth: 0,
+        kind: 'worker',
+      });
+    }
+    rows.sort(
+      (a, b) => orderOf(a.key) - orderOf(b.key) || laneStart(a.requests) - laneStart(b.requests),
+    );
+    return rows;
+  }
+
+  // Conversation mode — tree: parent → subagent → stream/aux lane.
+  interface SubagentLanes {
+    // Stream index (null for the unsuffixed default stream) → requests.
+    streams: Map<number | null, RequestRecord[]>;
+    // Aux lane id → requests.
+    aux: Map<string, RequestRecord[]>;
+  }
+  interface Tree {
+    parentCid: string;
+    parentReqs: RequestRecord[];
+    // Aux lanes hanging directly off the main agent (`<parent>::aux:…`).
+    parentAux: Map<string, RequestRecord[]>;
+    // subagentBase → primary streams + always-visible auxiliary lanes.
+    subagents: Map<string, SubagentLanes>;
+    firstStart: number;
+  }
+  const trees = new Map<string, Tree>();
+  for (const r of requests) {
+    const { parent, subagentBase, stream, aux } = splitTimelineCid(r.cid);
+    let tree = trees.get(parent);
+    if (!tree) {
+      tree = {
+        parentCid: parent,
+        parentReqs: [],
+        parentAux: new Map(),
+        subagents: new Map(),
+        firstStart: Number.POSITIVE_INFINITY,
+      };
+      trees.set(parent, tree);
+    }
+    if (subagentBase === null && aux !== null) {
+      const list = tree.parentAux.get(aux);
+      if (list) list.push(r);
+      else tree.parentAux.set(aux, [r]);
+    } else if (subagentBase === null) {
+      tree.parentReqs.push(r);
+    } else {
+      let lanes = tree.subagents.get(subagentBase);
+      if (!lanes) {
+        lanes = { streams: new Map(), aux: new Map() };
+        tree.subagents.set(subagentBase, lanes);
+      }
+      if (aux === null) {
+        const list = lanes.streams.get(stream);
+        if (list) list.push(r);
+        else lanes.streams.set(stream, [r]);
+      } else {
+        const list = lanes.aux.get(aux);
+        if (list) list.push(r);
+        else lanes.aux.set(aux, [r]);
+      }
+    }
+    if (r.start < tree.firstStart) tree.firstStart = r.start;
+  }
+
+  // Earliest start across every (already sorted) lane of a subagent.
+  const subagentStart = (lanes: SubagentLanes): number => {
+    let earliest = Number.POSITIVE_INFINITY;
+    for (const reqs of [...lanes.streams.values(), ...lanes.aux.values()]) {
+      earliest = Math.min(earliest, laneStart(reqs));
+    }
+    return earliest;
+  };
+
+  const sortedTrees = [...trees.values()].toSorted(
+    (a, b) => orderOf(a.parentCid) - orderOf(b.parentCid) || a.firstStart - b.firstStart,
+  );
+  const rows: RequestTimelineRow[] = [];
+  for (const tree of sortedTrees) {
+    const color = colorFor(tree.parentCid);
+
+    // Sort every lane up front so lane/subagent ordering below sees the true
+    // earliest start instead of whichever request happened to arrive first.
+    tree.parentReqs.sort(byStart);
+    for (const reqs of tree.parentAux.values()) reqs.sort(byStart);
+    for (const lanes of tree.subagents.values()) {
+      for (const reqs of lanes.streams.values()) reqs.sort(byStart);
+      for (const reqs of lanes.aux.values()) reqs.sort(byStart);
+    }
+
+    // Parent row (use a placeholder key if the parent itself wasn't replayed).
+    const parentRowKey = tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`;
+    rows.push({
+      key: parentRowKey,
+      label: tree.parentCid,
+      color,
+      requests: tree.parentReqs,
+      depth: 0,
+      kind: 'parent',
+    });
+
+    // Aux lanes belonging to the main agent itself (`<parent>::aux:…`), nested
+    // directly beneath the parent row. Always visible, like subagent aux lanes.
+    const parentAuxEntries = [...tree.parentAux.entries()].toSorted(
+      (a, b) => laneStart(a[1]) - laneStart(b[1]),
+    );
+    for (const [auxId, reqs] of parentAuxEntries) {
+      rows.push({
+        key: `${tree.parentCid}::aux:${auxId}`,
+        label: `aux ${auxId} · parallel`,
+        color,
+        requests: reqs,
+        depth: 1,
+        kind: 'aux',
+        parentRowKey,
+      });
+    }
+
+    // One subagent row per base (which may contain N streams).
+    const subagentEntries = [...tree.subagents.entries()].toSorted(
+      (a, b) => subagentStart(a[1]) - subagentStart(b[1]),
+    );
+    for (const [saBase, lanes] of subagentEntries) {
+      const subagentKey = `${tree.parentCid}::sa:${saBase}`;
+      // Union of primary stream requests for collapsed-view bars. Aux lanes
+      // stay separate so their overlap remains visible as parallel work.
+      const allReqs: RequestRecord[] = [];
+      for (const reqs of lanes.streams.values()) allReqs.push(...reqs);
+      allReqs.sort(byStart);
+      const streamCount = lanes.streams.size;
+      rows.push({
+        key: subagentKey,
+        label: `↳ ${formatSubagentLabel(saBase)}`,
+        color,
+        requests: allReqs,
+        depth: 1,
+        kind: 'subagent',
+        streamCount,
+        auxCount: lanes.aux.size,
+      });
+
+      // Stream children only when expanded AND there's more than one
+      // stream (a single-stream subagent has nothing extra to show).
+      if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
+        // Sort by stream index (null first as the "default" stream).
+        const streamEntries = [...lanes.streams.entries()].toSorted(
+          (a, b) => (a[0] ?? -1) - (b[0] ?? -1),
+        );
+        for (const [streamIdx, reqs] of streamEntries) {
+          rows.push({
+            key: `${subagentKey}:s${streamIdx ?? '∅'}`,
+            label: `stream ${streamIdx ?? '∅'}`,
+            color,
+            requests: reqs,
+            depth: 2,
+            kind: 'stream',
+            parentRowKey: subagentKey,
+          });
+        }
+      }
+
+      // Aux lanes encode concurrent requests within the subagent. Keep them
+      // visible even when primary streams are collapsed so parallelism is not
+      // hidden behind an interaction.
+      const auxEntries = [...lanes.aux.entries()].toSorted(
+        (a, b) => laneStart(a[1]) - laneStart(b[1]),
+      );
+      for (const [auxId, reqs] of auxEntries) {
+        rows.push({
+          key: `${subagentKey}:aux:${auxId}`,
+          label: `aux ${auxId} · parallel`,
+          color,
+          requests: reqs,
+          depth: 2,
+          kind: 'aux',
+          parentRowKey: subagentKey,
+        });
+      }
+    }
+  }
+  return rows;
+}
+
+/**
+ * `subagent_001_bf1c5c16` → `subagent 001 · bf1c` (compact, readable): the
+ * numeric index is kept whole and the hex hash is cut to its first four
+ * characters. Input that does not match `subagent_<digits>_<hex>` is returned
+ * unchanged.
+ */
+function formatSubagentLabel(raw: string): string {
+  const m = /^subagent_(?<index>\d+)_(?<hash>[0-9a-f]+)$/iu.exec(raw);
+  if (!m) return raw;
+  return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`;
+}
+
+/**
+ * Compact worker swimlane label: a leading `worker_` becomes `w_`, then the
+ * result is cut to at most 12 characters (`worker_4ae87bea` → `w_4ae87bea`).
+ */
+export function shortenWid(wid: string): string {
+  const longPrefix = 'worker_';
+  const renamed = wid.startsWith(longPrefix) ? `w_${wid.slice(longPrefix.length)}` : wid;
+  return renamed.slice(0, 12);
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx b/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx
new file mode 100644
index 00000000..7aa63efc
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx
@@ -0,0 +1,143 @@
+'use client';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import { formatDuration, formatTickLabel } from './timeline-format';
+import { cursorStatsAt, type SortedRequestTimes } from './timeline-cursor-stats';
+import { requestSourceLabel, shortenWid, type RequestTimelineRow } from './timeline-rows';
+
+export interface TooltipData {
+ x: number;
+ y: number;
+ row: RequestTimelineRow;
+ req: RequestRecord;
+}
+
+/** Per-request hover tooltip (fixed-position, follows the mouse). */
+export function TimelineTooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) {
+ const { row, req } = data;
+ const totalMs = (req.end - req.start) / 1e6;
+ const queueMs = (req.start - req.credit) / 1e6;
+ return (
+
+
+
+ {row.label}
+ · {requestSourceLabel(req)}
+ {req.cancelled && · cancelled }
+
+
+ Total
+ {formatDuration(totalMs)}
+ Queue wait
+
+ {queueMs > 0.5 ? formatDuration(queueMs) : '—'}
+
+ {req.ttftMs !== null && (
+ <>
+ TTFT
+
+ {formatDuration(req.ttftMs)}
+
+ >
+ )}
+ {req.isl !== null && (
+ <>
+ ISL
+
+ {req.isl.toLocaleString()}
+
+ >
+ )}
+ {req.osl !== null && (
+ <>
+ OSL
+
+ {req.osl.toLocaleString()}
+
+ >
+ )}
+ Phase
+ {req.phase}
+ {req.ad > 0 && (
+ <>
+ Agent depth
+ {req.ad}
+ >
+ )}
+ Worker
+ {shortenWid(req.wid)}
+
+
+ Started at {formatTickLabel(req.start)}
+
+ {linkable && (
+
+ Click to view this conversation in the dataset →
+
+ )}
+
+ );
+}
+
+export interface CursorState {
+ /** Cursor x in svg-local px (drives the crosshair line). */
+ xPx: number;
+ /** ns offset from dataStart the cursor points at. */
+ tNs: number;
+ clientX: number;
+ clientY: number;
+}
+
+/** Cursor stats popover: requests in flight / waiting / completed at time t. */
+export function CursorPopover({
+ cursor,
+ dataStart,
+ times,
+}: {
+ cursor: CursorState;
+ dataStart: number;
+ times: SortedRequestTimes;
+}) {
+ const t = cursor.tNs;
+ const { running, waiting, completed, inflight } = cursorStatsAt(times, t);
+ // Seconds elapsed since the timeline origin (dataStart): cursor.tNs is an offset, not an absolute wall-clock time.
+ const tSec = t / 1e9;
+ // Position the popover near the cursor without overflowing the viewport.
+ // 200 px wide; flip to the left of the cursor if it would clip the right.
+ const wantLeft = cursor.clientX + 14;
+ const left =
+ typeof window === 'undefined' || wantLeft + 220 < window.innerWidth
+ ? wantLeft
+ : cursor.clientX - 220;
+ return (
+
+
+ t =
+
+ {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`}
+
+
+
+ In flight
+ {inflight}
+ running
+ {running}
+ waiting
+ {waiting}
+ Completed
+ {completed}
+
+ {/* dataStart is informational — the displayed t is relative to it. */}
+
+ relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock)
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts b/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts
new file mode 100644
index 00000000..631bdd94
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts
@@ -0,0 +1,108 @@
+/**
+ * Persisted view-state snapshot for the request timeline (zoom window, row
+ * mode, phase filter, expansions, scroll offsets). Written to sessionStorage on
+ * click-through to a dataset conversation, consumed once on the next mount so
+ * the browser back button restores the user's exact position.
+ */
+
+import type { StagePhase } from './phase-slice';
+import type { RowMode } from './timeline-rows';
+
+// Two phases shown separately (no combined view) — matches the per-point detail
+// stage toggle. Reuses StagePhase so the filter predicate is shared.
+export type PhaseFilter = StagePhase;
+
+/**
+ * Persisted snapshot of the timeline's view state, used to restore the user's
+ * zoom / scroll / filter position when they return to the page (e.g. clicking a
+ * request to open the dataset flamegraph, then hitting the browser back button).
+ * Stored in sessionStorage keyed by point id; written on click-through and
+ * consumed once on the next mount.
+ */
+export interface TimelineViewSnapshot {
+ /** Zoom-pan window start (ns offset from dataStart). */
+ viewStart: number;
+ /** Zoom-pan window end, or null when not zoomed (full extent). */
+ viewEnd: number | null;
+ rowMode: RowMode;
+ phaseFilter: PhaseFilter;
+ /** Keys of expanded multi-stream subagent rows. */
+ expanded: string[];
+ /** Scroll container offsets (vertical row scroll + horizontal). */
+ scrollTop: number;
+ scrollLeft: number;
+}
+
+const TIMELINE_VIEW_SNAPSHOT_PREFIX = 'agentic-timeline-view:';
+const ROW_MODE_VALUES: readonly RowMode[] = ['conversation', 'worker'];
+const PHASE_FILTER_VALUES: readonly PhaseFilter[] = ['warmup', 'profiling'];
+
+/** Returns `value` when it is a finite number, otherwise `fallback`. */
+const finiteOr = (value: unknown, fallback: number): number => {
+  if (typeof value !== 'number') return fallback;
+  return Number.isFinite(value) ? value : fallback;
+};
+
+/**
+ * Parse a persisted snapshot, coercing/validating each field and falling back
+ * to defaults so a malformed or stale blob can never break restore.
+ *
+ * Field defaults: `viewStart`/`scrollTop`/`scrollLeft` → 0, `viewEnd` → null,
+ * `rowMode` → 'conversation', `phaseFilter` → 'profiling', `expanded` → [] (and
+ * non-string entries are dropped).
+ *
+ * @param raw The stored JSON string, or null when nothing was stored.
+ * @returns The sanitized snapshot, or null when the input is absent/empty, is
+ *   not parseable JSON, or parses to something that is not an object.
+ */
+export function parseTimelineViewSnapshot(raw: string | null): TimelineViewSnapshot | null {
+  if (!raw) return null;
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    return null;
+  }
+  if (!parsed || typeof parsed !== 'object') return null;
+  const record = parsed as Record<string, unknown>;
+  // Match against the allowed values instead of casting the unknown input.
+  const rowMode = ROW_MODE_VALUES.find((mode) => mode === record.rowMode) ?? 'conversation';
+  const phaseFilter =
+    PHASE_FILTER_VALUES.find((phase) => phase === record.phaseFilter) ?? 'profiling';
+  const viewEnd =
+    typeof record.viewEnd === 'number' && Number.isFinite(record.viewEnd) ? record.viewEnd : null;
+  const expanded = Array.isArray(record.expanded)
+    ? record.expanded.filter((entry): entry is string => typeof entry === 'string')
+    : [];
+  return {
+    viewStart: finiteOr(record.viewStart, 0),
+    viewEnd,
+    rowMode,
+    phaseFilter,
+    expanded,
+    scrollTop: finiteOr(record.scrollTop, 0),
+    scrollLeft: finiteOr(record.scrollLeft, 0),
+  };
+}
+
+/** sessionStorage key for a point's snapshot: the shared prefix plus the point id. */
+function timelineSnapshotKey(pointId: number): string {
+  return TIMELINE_VIEW_SNAPSHOT_PREFIX + String(pointId);
+}
+
+/**
+ * Store `snapshot` as JSON in sessionStorage under the key for `pointId`.
+ * Does nothing when `window` is undefined, and ignores any error raised
+ * while serializing or writing.
+ */
+export function saveTimelineViewSnapshot(pointId: number, snapshot: TimelineViewSnapshot): void {
+  const hasWindow = typeof window !== 'undefined';
+  if (!hasWindow) return;
+  try {
+    const store = window.sessionStorage;
+    const storageKey = timelineSnapshotKey(pointId);
+    const payload = JSON.stringify(snapshot);
+    store.setItem(storageKey, payload);
+  } catch {
+    // sessionStorage can throw (private mode / quota exceeded) — restore is
+    // best-effort, so a failed write just means no restore next time.
+  }
+}
+
+/**
+ * One-shot read: fetch the stored snapshot for `pointId`, delete it, and
+ * return the parsed result. We only want to restore once per click-through,
+ * so a later reload of the same point starts from defaults. Returns null when
+ * `window` is undefined, when storage access throws, or when nothing valid
+ * was stored.
+ */
+export function consumeTimelineViewSnapshot(pointId: number): TimelineViewSnapshot | null {
+  const hasWindow = typeof window !== 'undefined';
+  if (!hasWindow) return null;
+  try {
+    const store = window.sessionStorage;
+    const storageKey = timelineSnapshotKey(pointId);
+    const stored = store.getItem(storageKey);
+    // Remove before parsing so the entry is gone even if the blob is malformed.
+    store.removeItem(storageKey);
+    return parseTimelineViewSnapshot(stored);
+  } catch {
+    return null;
+  }
+}
diff --git a/packages/app/src/components/inference/hooks/useChartData.test.ts b/packages/app/src/components/inference/hooks/useChartData.test.ts
index 73582998..c4998add 100644
--- a/packages/app/src/components/inference/hooks/useChartData.test.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.test.ts
@@ -1,6 +1,83 @@
import { describe, it, expect } from 'vitest';
-import { buildComparisonDates, filterByGPU, flipRooflineDirection } from './useChartData';
+import {
+ buildComparisonDates,
+ dedupeRowsToLatestPerConfig,
+ filterByGPU,
+ flipRooflineDirection,
+} from './useChartData';
+
+interface DedupeInput {
+ id: number;
+ hardware: string;
+ framework: string;
+ spec_method: string;
+ disagg: boolean;
+ precision: string;
+ offload_mode?: string | null;
+ date: string;
+}
+
+/** Build a dedupe test row: a fixed baseline config with `over` applied on top. */
+const drow = (over: Partial<DedupeInput> = {}): DedupeInput => ({
+  id: 1,
+  hardware: 'b300',
+  framework: 'vllm',
+  spec_method: 'none',
+  disagg: false,
+  precision: 'fp4',
+  offload_mode: 'off',
+  date: '2026-06-01',
+  ...over,
+});
+
+describe('dedupeRowsToLatestPerConfig', () => {
+ it('keeps only the latest date within a single series', () => {
+ const rows = [
+ drow({ id: 1, date: '2026-06-01' }),
+ drow({ id: 2, date: '2026-06-03' }),
+ drow({ id: 3, date: '2026-06-02' }),
+ ];
+ expect(dedupeRowsToLatestPerConfig(rows).map((r) => r.id)).toEqual([2]);
+ });
+
+ it('keeps BOTH offload variants even when they were ingested on different dates', () => {
+ // The regression: offload=on sweep landed LATER than offload=off. Without
+ // offload in the key, the on-variant's newer date would win the shared group
+ // and silently drop the (older) off-variant series entirely.
+ const rows = [
+ drow({ id: 1, offload_mode: 'off', date: '2026-06-01' }),
+ drow({ id: 2, offload_mode: 'on', date: '2026-06-05' }),
+ ];
+ const kept = dedupeRowsToLatestPerConfig(rows)
+ .map((r) => r.offload_mode)
+ .toSorted();
+ expect(kept).toEqual(['off', 'on']);
+ });
+
+ it('still dedupes each offload variant to its own latest date', () => {
+ const rows = [
+ drow({ id: 1, offload_mode: 'off', date: '2026-06-01' }),
+ drow({ id: 2, offload_mode: 'off', date: '2026-06-04' }),
+ drow({ id: 3, offload_mode: 'on', date: '2026-06-02' }),
+ drow({ id: 4, offload_mode: 'on', date: '2026-06-05' }),
+ ];
+ expect(
+ dedupeRowsToLatestPerConfig(rows)
+ .map((r) => r.id)
+ .toSorted(),
+ ).toEqual([2, 4]);
+ });
+
+ it('normalizes a missing offload_mode to "off" (matches the SQL lineKey)', () => {
+ // A row with no offload_mode collides with an explicit offload=off row of the
+ // same config — both are the "off" series, so latest-date dedup applies.
+ const rows = [
+ drow({ id: 1, offload_mode: undefined, date: '2026-06-01' }),
+ drow({ id: 2, offload_mode: 'off', date: '2026-06-03' }),
+ ];
+ expect(dedupeRowsToLatestPerConfig(rows).map((r) => r.id)).toEqual([2]);
+ });
+});
describe('buildComparisonDates', () => {
it('returns empty when no GPUs selected (comparison disabled)', () => {
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 8e894d0e..f6596656 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -1,7 +1,7 @@
import { useMemo, useRef } from 'react';
import { useQueries } from '@tanstack/react-query';
-import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants';
+import { rowToSequence } from '@semianalysisai/inferencex-constants';
import chartDefinitions from '@/components/inference/inference-chart-config.json';
import type {
@@ -23,9 +23,15 @@ import {
getModelSortIndex,
hardwareKeyMatchesAnyBase,
} from '@/lib/constants';
-import { transformBenchmarkRows } from '@/lib/benchmark-transform';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import {
+ mergeRunScopedRows,
+ transformBenchmarkRows,
+ withPercentile,
+} from '@/lib/benchmark-transform';
+import { Sequence, type Model } from '@/lib/data-mappings';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
+import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils';
import {
applyQuickFilters,
computeAvailableQuickFilters,
@@ -33,6 +39,90 @@ import {
type QuickFilters,
} from '@/components/inference/utils/quickFilters';
+/**
+ * Chart x-axis variant selected by the mode buttons above the plot. This is
+ * the single definition — InferenceContext (URL/state) and ChartDisplay
+ * (buttons, derived-metric remapping) import it from here.
+ */
+export type XAxisMode =
+ | 'ttft'
+ | 'e2e'
+ | 'normalized-e2e'
+ | 'interactivity'
+ | 'session-time'
+ | 'prefill-tps';
+
+export const X_AXIS_MODES: readonly XAxisMode[] = [
+ 'ttft',
+ 'e2e',
+ 'normalized-e2e',
+ 'interactivity',
+ 'session-time',
+ 'prefill-tps',
+];
+
+/**
+ * True for the x-axis modes whose metric is derived from persisted
+ * per-request traces: 'normalized-e2e', 'session-time' and 'prefill-tps'.
+ * Those only exist for agentic scenarios (fixed-seq rows have no
+ * trace_replay blob to derive them from).
+ */
+export function isAgenticOnlyXAxisMode(mode: XAxisMode): boolean {
+  switch (mode) {
+    case 'normalized-e2e':
+    case 'session-time':
+    case 'prefill-tps':
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Compute the set of benchmark_results.id values that sit on the
+ * (e2e_latency, y) Pareto frontier within each (hwKey, precision, date)
+ * group. Used to restrict the non-e2e xmode charts (ttft, interactivity,
+ * session-time, prefill-tps) so they show *only* the points that win on
+ * end-to-end latency — preventing benchmark-hacking where a config tops
+ * one axis while tanking the other.
+ *
+ * Points whose e2e latency or y value is missing or non-finite are left out
+ * of the frontier computation, and only ids accepted by
+ * `isPersistedBenchmarkId` are returned.
+ *
+ * @param points Candidate chart points.
+ * @param selectedYAxisMetric Y metric key, e.g. `y_<metric>`; the `y_` is
+ *   stripped to find the per-point metric object.
+ * @param percentile Percentile used to pick the e2e-latency field.
+ * @returns The frontier ids, or null when there is no e2e chart definition or
+ *   the y-metric has no roofline direction declared on it (caller falls back
+ *   to no filtering in that case).
+ */
+function e2eParetoIds(
+  points: InferenceData[],
+  selectedYAxisMetric: string,
+  percentile: string,
+): Set<number> | null {
+  const e2eChartDef = (chartDefinitions as ChartDefinition[]).find((c) => c.chartType === 'e2e');
+  if (!e2eChartDef) return null;
+  const dir = e2eChartDef[`${selectedYAxisMetric}_roofline` as keyof ChartDefinition] as
+    | ParetoDirection
+    | undefined;
+  if (!dir) return null;
+  const frontierFn = paretoFrontForDirection(dir);
+  // Percentile-prefixed e2e-latency field name (e.g. 'p90_e2el').
+  const e2elField = withPercentile('median_e2el', percentile);
+  const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+
+  // Re-frame each candidate point in (e2el, y) space, then compute the
+  // pareto per (hwKey, precision, date) bucket — frontiers don't span dates
+  // (a May 17 point can't dominate a May 15 plot).
+  const byGroup = new Map<string, InferenceData[]>();
+  for (const p of points) {
+    const yValue = (p[metricKey] as { y?: number } | undefined)?.y;
+    const xValue = (p as unknown as Record<string, unknown>)[e2elField];
+    if (typeof xValue !== 'number' || !Number.isFinite(xValue)) continue;
+    if (typeof yValue !== 'number' || !Number.isFinite(yValue)) continue;
+    const key = `${p.hwKey}|${p.precision}|${p.date}`;
+    let bucket = byGroup.get(key);
+    if (!bucket) {
+      bucket = [];
+      byGroup.set(key, bucket);
+    }
+    bucket.push({ ...p, x: xValue, y: yValue });
+  }
+  const ids = new Set<number>();
+  for (const bucket of byGroup.values()) {
+    for (const f of frontierFn(bucket)) {
+      if (isPersistedBenchmarkId(f.id)) ids.add(f.id);
+    }
+  }
+  return ids;
+}
+
/** Build deduplicated comparison dates, excluding the main run date. */
export function buildComparisonDates(
selectedGPUs: string[],
@@ -77,6 +167,42 @@ export function flipRooflineDirection(dir: RooflineDirection): RooflineDirection
return FLIP_MAP[dir];
}
+/** Minimal row shape for dedup: the series-key fields plus the `date` used to pick the latest rows. */
+interface DedupeRow {
+ hardware: string;
+ framework: string;
+ spec_method: string;
+ disagg: boolean;
+ precision: string;
+ offload_mode?: string | null; // null/undefined is keyed as 'off' by dedupeSeriesKey
+ date: string; // not part of the key; compared with string `>` — assumes a sortable format (TODO confirm)
+}
+
+// Series identity key; `date` is deliberately not part of it. offload_mode falls back to 'off'
+// (to match the SQL layer's getBenchmarksForRun lineKey) so offload=on and offload=off stay distinct series.
+const dedupeSeriesKey = (r: DedupeRow): string =>
+ `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}|${r.offload_mode ?? 'off'}`;
+
+/**
+ * For each series — (hardware, framework, spec_method, disagg, precision,
+ * offload_mode) — keep only the rows from that series' most recent date. When
+ * parallelism settings change between runs, old config_ids create stale points
+ * under the same legend line; dropping all-but-latest removes them.
+ *
+ * offload_mode is part of the key so that an offload=on sweep dated LATER than
+ * the offload=off sweep cannot win a shared group and silently drop the
+ * earlier-dated offload=off rows. Returns a new array in input order.
+ */
+export function dedupeRowsToLatestPerConfig(rows: T[]): T[] {
+ const maxDatePerGroup = new Map();
+ for (const r of rows) {
+ const k = dedupeSeriesKey(r);
+ const cur = maxDatePerGroup.get(k);
+ if (!cur || r.date > cur) maxDatePerGroup.set(k, r.date); // string compare — assumes sortable dates (e.g. ISO), TODO confirm
+ }
+ return rows.filter((r) => r.date === maxDatePerGroup.get(dedupeSeriesKey(r)));
+}
+
export function useChartData(
selectedModel: Model,
selectedSequence: Sequence,
@@ -92,11 +218,26 @@ export function useChartData(
selectedRunDate?: string,
enabled = true,
latestAvailableDate?: string,
+ selectedPercentile = 'p90',
/** When set, only series for these two registry GPU keys are shown (compare pages). */
compareGpuPair?: readonly [string, string] | null,
/**
- * GitHub run id for the "as of run" view. Set only when an earlier-than-latest
- * run is selected; the chart then shows the data as it stood at that run.
+ * Exact GitHub run id used to pin contested configs while carrying forward
+ * configs that the selected run did not produce.
+ */
+ selectedRunId?: string,
+ /**
+ * Current x-axis mode. For agentic sequences, any mode other than 'e2e'
+ * flags each point with whether it sits on the (e2e-latency, y) Pareto
+ * frontier (`isOnE2eFrontier`); no points are dropped here. This keeps a
+ * config from topping the ttft / interactivity / session-time / prefill-tps
+ * chart while tanking end-to-end latency. In 'e2e' mode, and for
+ * fixed-seq sequences, the flag is left undefined.
+ */
+ selectedXAxisMode: XAxisMode = 'e2e',
+ /**
+ * GitHub run id for the "as of run" base view. Set only when an
+ * earlier-than-latest run is selected.
*/
asOfRunId?: string,
/**
@@ -118,11 +259,35 @@ export function useChartData(
? ''
: selectedRunDate;
+ // Two queries: the normal latest-per-config view (always), plus the
+ // run-scoped rows when a specific workflow run is selected. The merged
+ // result pins ONLY the configs the selected run produced to that run, and
+ // carries every other config forward from the base rows — selecting one of
+ // two same-day vLLM runs must not hide the day's SGLang curve just because
+ // it lives in a different workflow run. The base query is the default view
+ // query, so it's almost always already in the React Query cache.
const {
- data: allRows,
- isLoading: queryLoading,
- error: queryError,
+ data: baseRows,
+ isLoading: baseLoading,
+ error: baseError,
} = useBenchmarks(selectedModel, queryDate, enabled, asOfRunId);
+ const {
+ data: runRows,
+ isLoading: runLoading,
+ error: runError,
+ } = useBenchmarks(selectedModel, '', enabled && Boolean(selectedRunId), selectedRunId, true);
+
+ const allRows = useMemo(() => {
+ if (!selectedRunId) return baseRows;
+ // Wait for the run rows before rendering a scoped view — rendering base
+ // rows first would flash the un-scoped chart, then swap contested points.
+ if (!runRows) return undefined;
+ if (!baseRows) return runRows;
+ return mergeRunScopedRows(runRows, baseRows);
+ }, [selectedRunId, runRows, baseRows]);
+
+ const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading);
+ const queryError = baseError ?? (selectedRunId ? runError : null);
// GPU comparison: fetch data for each additional comparison date
const comparisonDates = useMemo(
@@ -155,26 +320,19 @@ export function useChartData(
// Merge main rows with comparison date rows.
// Stamp each row with the *requested* date (not the actual DB date) so that
// GPUGraph's activeDates filter (keyed by user-selected date) matches the points.
- const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]);
+ //
+ // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via
+ // benchmark_type), so one filter covers every scenario.
const rows = useMemo(() => {
- if (!allRows || !sequenceIslOsl) return [];
- const seqFilter = (r: { isl: number; osl: number }) =>
- r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl;
+ if (!allRows) return [];
+ const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) =>
+ rowToSequence(r) === selectedSequence;
const seqFiltered = allRows.filter(seqFilter);
- // For each (hw, framework, spec_method, disagg, precision) group, keep only
- // rows from the most recent date. When parallelism settings change between runs,
- // old config_ids create stale data points under the same legend line — drop them.
- const maxDatePerGroup = new Map();
- for (const r of seqFiltered) {
- const key = `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}`;
- const cur = maxDatePerGroup.get(key);
- if (!cur || r.date > cur) maxDatePerGroup.set(key, r.date);
- }
- const deduped = seqFiltered.filter((r) => {
- const key = `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}`;
- return r.date === maxDatePerGroup.get(key);
- });
+ // Keep only each series' latest-date rows (drops stale config_ids left behind
+ // when parallelism settings change between runs). Keyed per offload variant so
+ // an offload=on sweep can't hide a differently-dated offload=off series.
+ const deduped = dedupeRowsToLatestPerConfig(seqFiltered);
const mainRows = deduped.map((r) =>
selectedRunDate ? { ...r, date: selectedRunDate, actualDate: r.date } : r,
@@ -186,14 +344,14 @@ export function useChartData(
.map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })),
);
return [...mainRows, ...extraRows];
- }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]);
+ }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]);
// Transform filtered rows into chart data
const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => {
if (rows.length === 0)
return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig };
- return transformBenchmarkRows(rows);
- }, [rows]);
+ return transformBenchmarkRows(rows, selectedPercentile);
+ }, [rows, selectedPercentile]);
// Sort hardware config — stabilize reference when keys haven't changed.
// Different sequences for the same model often have the same GPU configs,
@@ -241,8 +399,11 @@ export function useChartData(
(chartDefinitions as ChartDefinition[]).map((chartDef) => {
const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
- // Determine dynamic x-axis
- let xAxisField: keyof AggDataEntry = chartDef.x;
+ // Default x-axis = chart's natural latency metric, percentile-adjusted
+ // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic
+ // scenarios `withPercentile` is a no-op when percentile === 'median'.
+ const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry;
+ let xAxisField: keyof AggDataEntry = naturalX;
let xAxisLabel = chartDef.x_label;
const metricTitle =
@@ -252,14 +413,25 @@ export function useChartData(
// Resolve the effective x-axis override per chart type
const effectiveXMetric =
chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
+ // The TTFT override is now any *_ttft metric (not just p90_ttft) — the
+ // x-axis-mode picker reconciles the percentile prefix based on sequence
+ // kind (fixed-seq → median, agentic → user-picked percentile).
const isTtftOverride =
- effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft';
- const ttftLabel =
- effectiveXMetric === 'p99_ttft'
- ? 'P99 Time To First Token (s)'
- : 'Median Time To First Token (s)';
-
- if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
+ typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft');
+ const ttftPctl = isTtftOverride
+ ? (effectiveXMetric as string).replace(/_ttft$/u, '')
+ : 'p90';
+ const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase();
+ const ttftLabel = `${ttftPctlWord} Time To First Token (s)`;
+
+ const isAgentic = selectedSequence === Sequence.AgenticTraces;
+
+ if (
+ effectiveXMetric &&
+ chartDef.chartType === 'interactivity' &&
+ isInputMetric &&
+ !isAgentic
+ ) {
xAxisField = effectiveXMetric as keyof AggDataEntry;
const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) {
@@ -268,6 +440,10 @@ export function useChartData(
xAxisLabel = isTtftOverride ? ttftLabel : chartDef.x_label;
}
} else if (chartDef.chartType === 'interactivity' && isInputMetric) {
+ // Agentic falls through here too — the manual X-axis dropdown is
+ // hidden in agentic mode (would double up with the percentile
+ // selector), so the config default + percentile post-processing
+ // below drives the x axis.
const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition;
const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
@@ -277,12 +453,35 @@ export function useChartData(
xAxisLabel = ttftLabel;
}
+ // Agentic: rewrite the resolved x metric to the chosen percentile,
+ // and relabel accordingly. Both have to be updated unconditionally —
+ // xAxisField may already be percentile-adjusted (via naturalX) while
+ // xAxisLabel still carries the raw chartDef.x_label prefix.
+ // The chart heading ("vs. ") is also rewritten to include
+ // the percentile so the title above the plot reflects what's drawn.
+ const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition;
+ let chartHeading = (chartDef[headingKey] as string) || chartDef.heading;
+ if (isAgentic) {
+ xAxisField = withPercentile(
+ xAxisField as string,
+ selectedPercentile,
+ ) as keyof AggDataEntry;
+ const pctlWord = selectedPercentile.toUpperCase();
+ xAxisLabel = xAxisLabel.replace(/^(?:Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord);
+ chartHeading = chartHeading.replace(
+ /^(?vs\.\s+)(?:(?:Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu,
+ `$1${pctlWord} `,
+ );
+ }
+
// The x-axis is "flipped" only when the good-direction reverses
// (e.g. interactivity → TTFT: "higher is better" → "lower is better").
// E2EL → TTFT keeps the same direction ("lower is better" for both),
// so no roofline flip is needed for the e2e chart.
+ // Compare against `naturalX` (percentile-adjusted) — switching the
+ // percentile of the same logical metric is NOT a flip.
const xAxisFlipped =
- xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride);
+ xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride);
const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition;
const dynamicYLabel = chartDef[yLabelKey];
@@ -303,6 +502,7 @@ export function useChartData(
chartDefinition: {
...chartDef,
...rooflineOverrides,
+ heading: chartHeading,
x_label: xAxisLabel,
y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel),
},
@@ -310,7 +510,13 @@ export function useChartData(
xAxisField,
};
}),
- [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric],
+ [
+ selectedYAxisMetric,
+ selectedXAxisMetric,
+ selectedE2eXAxisMetric,
+ selectedPercentile,
+ selectedSequence,
+ ],
);
// Build renderable graphs (data processing + stable chart definitions)
@@ -344,9 +550,30 @@ export function useChartData(
filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
+ // For AGENTIC workloads only: when the user is NOT viewing the
+ // e2e latency chart, mark each point with whether it sits on the
+ // (e2e_latency, y) Pareto frontier for its (hwKey, precision,
+ // date) group. The chart still renders every point as scatter —
+ // only e2e-Pareto winners feed the roofline (ScatterGraph honors
+ // the flag). Prevents benchmark-hacking the TTFT / interactivity
+ // line by tanking decode (or vice versa) without hiding the
+ // non-optimal configs from view.
+ //
+ // Fixed-seq workloads keep the existing per-axis Pareto since
+ // there's no separate "session-time" notion of total latency —
+ // their e2e IS the request latency, so a TTFT hack there reads
+ // honestly on e2e too. The anti-hack constraint is specifically
+ // about multi-turn agentic where TTFT measures a tiny fraction
+ // of the user-visible session time.
+ const isAgentic = selectedSequence === Sequence.AgenticTraces;
+ const e2eParetoSet =
+ isAgentic && selectedXAxisMode !== 'e2e'
+ ? e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile)
+ : null;
+
// Filter to points that have the selected metric, then remap x/y
const hasMetric = filteredData.some((d) => metricKey in d);
- const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft';
+ const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
const processedData = hasMetric
? filteredData
.filter((d) => metricKey in d)
@@ -359,18 +586,26 @@ export function useChartData(
// d.x would otherwise mask the regression).
const xCandidate = (d as Partial)[xAxisField];
const xValue = typeof xCandidate === 'number' ? xCandidate : d.x;
+ const isOnE2eFrontier =
+ e2eParetoSet === null
+ ? undefined
+ : isPersistedBenchmarkId(d.id) && e2eParetoSet.has(d.id);
return {
...d,
x: xValue,
y: yValue,
roof,
+ isOnE2eFrontier,
};
})
- // When TTFT is on the x-axis, apply the latency limit to filter overload outliers
- // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left)
+ // When TTFT is on the x-axis, apply the latency limit to filter
+ // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that
+ // compress all real data to the far left). Skip for agentic — long
+ // TTFTs there reflect real workloads (multi-turn, big prompts).
.filter(
(d) =>
!isTtftX ||
+ isAgentic ||
!chartDefinition.y_latency_limit ||
d.x <= chartDefinition.y_latency_limit,
)
@@ -395,6 +630,8 @@ export function useChartData(
userPowers,
stableChartDefinitions,
compareGpuPair,
+ selectedXAxisMode,
+ selectedPercentile,
quickFilters,
]);
diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json
index d9a29181..9617638f 100644
--- a/packages/app/src/components/inference/inference-chart-config.json
+++ b/packages/app/src/components/inference/inference-chart-config.json
@@ -13,9 +13,9 @@
"y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
"y_inputTputPerGpu_title": "Input Token Throughput per GPU",
"y_inputTputPerGpu_roofline": "upper_left",
- "y_inputTputPerGpu_x": "p99_ttft",
- "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
- "y_inputTputPerGpu_heading": "vs. P99 Time To First Token",
+ "y_inputTputPerGpu_x": "p90_ttft",
+ "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
+ "y_inputTputPerGpu_heading": "vs. P90 Time To First Token",
"y_outputTputPerGpu": "outputTputPerGpu.y",
"y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
"y_outputTputPerGpu_title": "Output Token Throughput per GPU",
@@ -126,8 +126,8 @@
"y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
"y_inputTputPerGpu_title": "Input Token Throughput per GPU",
"y_inputTputPerGpu_roofline": "upper_right",
- "y_inputTputPerGpu_x": "p99_ttft",
- "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
+ "y_inputTputPerGpu_x": "p90_ttft",
+ "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
"y_outputTputPerGpu": "outputTputPerGpu.y",
"y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
"y_outputTputPerGpu_title": "Output Token Throughput per GPU",
diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
index 91db3d40..91761604 100644
--- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts
+++ b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
@@ -107,8 +107,7 @@ function resolveXAxisField(
const metricTitle =
(chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || '';
const isInputMetric = metricTitle.toLowerCase().includes('input');
- const isTtftOverride =
- selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+ const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
return selectedXAxisMetric;
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index ecf2fe33..5d0981b8 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -80,6 +80,8 @@ export interface WorkerPower {
* @property {number} p99_e2el - 99th percentile of End-to-End Latency.
*/
export interface AggDataEntry {
+ /** Stable per-point id from benchmark_results — for trace_replay lookups. */
+ id?: number;
hw: string;
mtp?: string;
hwKey: string;
@@ -94,23 +96,43 @@ export interface AggDataEntry {
mean_ttft: number;
median_ttft: number;
std_ttft: number;
+ p75_ttft: number;
+ p90_ttft: number;
+ p95_ttft: number;
p99_ttft: number;
+ 'p99.9_ttft': number;
mean_tpot: number;
mean_intvty: number;
median_tpot: number;
median_intvty: number;
std_tpot: number;
std_intvty: number;
+ p75_tpot: number;
+ p75_intvty: number;
+ p90_tpot: number;
+ p90_intvty: number;
+ p95_tpot: number;
+ p95_intvty: number;
p99_tpot: number;
p99_intvty: number;
+ 'p99.9_tpot': number;
+ 'p99.9_intvty': number;
mean_itl: number;
median_itl: number;
std_itl: number;
+ p75_itl: number;
+ p90_itl: number;
+ p95_itl: number;
p99_itl: number;
+ 'p99.9_itl': number;
mean_e2el: number;
median_e2el: number;
std_e2el: number;
+ p75_e2el: number;
+ p90_e2el: number;
+ p95_e2el: number;
p99_e2el: number;
+ 'p99.9_e2el': number;
// Measured GPU telemetry (emitted by runner's aggregate_power.py).
// Optional because historical runs predate the fields.
avg_power_w?: number;
@@ -162,6 +184,29 @@ export interface AggDataEntry {
actualDate?: string;
/** URL to the GitHub Actions workflow run that produced this data point. */
run_url?: string;
+ /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */
+ benchmark_type?: string;
+ /** ISL in tokens — null for agentic_traces. */
+ isl?: number | null;
+ /** OSL in tokens — null for agentic_traces. */
+ osl?: number | null;
+ // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ──
+ /** "on" | "off" — whether KV cache offload to CPU was enabled. */
+ offload_mode?: string;
+ /** Actual server-observed GPU prefix-cache hit rate (0..1). */
+ server_gpu_cache_hit_rate?: number;
+ /** Actual server-observed CPU prefix-cache hit rate (0..1). */
+ server_cpu_cache_hit_rate?: number;
+ /** Infinite-cache theoretical hit rate (0..1) computed from trace. */
+ theoretical_cache_hit_rate?: number;
+ /** Total requests attempted during the window. */
+ num_requests_total?: number;
+ /** Requests that completed successfully. */
+ num_requests_successful?: number;
+ /** Total prompt tokens served. */
+ total_prompt_tokens?: number;
+ /** Total generated (output) tokens. */
+ total_generation_tokens?: number;
}
/**
@@ -187,6 +232,17 @@ export interface InferenceData extends Partial void;
+ /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */
+ selectedPercentile: string;
+ setSelectedPercentile: (p: string) => void;
selectedXAxisMetric: string | null;
setSelectedXAxisMetric: (metric: string | null) => void;
selectedE2eXAxisMetric: string | null;
setSelectedE2eXAxisMetric: (metric: string | null) => void;
+ /**
+ * Which chart variant the user wants to see — the inference card shows one chart
+ * at a time, picked by the big buttons above the chart.
+ * - 'ttft' → e2e chartType with x-axis forced to p90_ttft
+ * - 'e2e' → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
+ * - 'normalized-e2e'→ agentic-only; x = per-request E2E normalized to 400 output tokens
+ * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+ * - 'session-time' → agentic-only; x = mean-normalized session time (live-computed from trace blobs)
+ * - 'prefill-tps' → agentic-only; x = mean of P90 prefill TPS/user per session
+ */
+ selectedXAxisMode:
+ | 'ttft'
+ | 'e2e'
+ | 'normalized-e2e'
+ | 'interactivity'
+ | 'session-time'
+ | 'prefill-tps';
+ setSelectedXAxisMode: (
+ mode: 'ttft' | 'e2e' | 'normalized-e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
+ ) => void;
scaleType: 'auto' | 'linear' | 'log';
setScaleType: (type: 'auto' | 'linear' | 'log') => void;
/** Coarse vendor / framework / agg-disagg / mtp-stp filters applied to the chart point set. */
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 84db5e1f..9f333482 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -1,6 +1,6 @@
'use client';
-import { useMemo, useState } from 'react';
+import { useEffect, useMemo, useState } from 'react';
import { track } from '@/lib/analytics';
import { useFeatureGate } from '@/lib/use-feature-gate';
@@ -9,7 +9,8 @@ import { cn } from '@/lib/utils';
import { useInference } from '@/components/inference/InferenceContext';
import {
ModelSelector,
- SequenceSelector,
+ ScenarioSelector,
+ PercentileSelector,
PrecisionSelector,
} from '@/components/ui/chart-selectors';
import { DateRangePicker } from '@/components/ui/date-range-picker';
@@ -28,7 +29,7 @@ import { Button } from '@/components/ui/button';
import chartDefinitions from '@/components/inference/inference-chart-config.json';
import type { ChartDefinition, DisaggMode, SpecMode } from '@/components/inference/types';
import { FRAMEWORK_FAMILIES } from '@/components/inference/utils/quickFilters';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model, type Percentile } from '@/lib/data-mappings';
/**
* Y-axis metric options from static chart config JSON — available immediately, no API wait.
@@ -109,6 +110,13 @@ interface ChartControlsProps {
}
export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) {
+ // The percentile selector is rendered conditionally on `selectedSequence`,
+ // which on the client is hydrated from URL params. SSR doesn't see the URL,
+ // so deferring the conditional until after mount keeps the initial DOM
+ // identical between server and client (avoids hydration warnings).
+ const [mounted, setMounted] = useState(false);
+ useEffect(() => setMounted(true), []);
+
const [openDropdown, setOpenDropdown] = useState(null);
const handleDropdownOpenChange = (dropdownKey: string) => (open: boolean) => {
if (open) {
@@ -117,6 +125,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
}
setOpenDropdown((current) => (current === dropdownKey ? null : current));
};
+
const {
selectedModel,
setSelectedModel,
@@ -126,6 +135,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
setSelectedPrecisions,
selectedYAxisMetric,
setSelectedYAxisMetric,
+ selectedPercentile,
+ setSelectedPercentile,
graphs,
selectedGPUs,
setSelectedGPUs,
@@ -354,14 +365,21 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
availableModels={availableModels}
data-testid="model-selector"
/>
-
+ {mounted && selectedSequence === Sequence.AgenticTraces && (
+ setSelectedPercentile(p)}
+ data-testid="percentile-selector"
+ />
+ )}
{graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') &&
- isInputMetric && (
+ isInputMetric &&
+ selectedSequence !== Sequence.AgenticTraces && (
- P99 TTFT
- Median TTFT
+ P90 TTFT
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 882b6f93..7bc30ba9 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,9 +1,12 @@
'use client';
-import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants';
+import {
+ DISPLAY_MODEL_TO_DB,
+ NORMALIZED_E2E_OUTPUT_TOKENS,
+} from '@semianalysisai/inferencex-constants';
import { track } from '@/lib/analytics';
import dynamic from 'next/dynamic';
import { useEffect, useMemo, useRef, useState } from 'react';
-import { BarChart3, ChevronDown, Table2, X } from 'lucide-react';
+import { BarChart3, Table2, X } from 'lucide-react';
import chartDefinitions from '@/components/inference/inference-chart-config.json';
import { useInference } from '@/components/inference/InferenceContext';
@@ -14,7 +17,10 @@ import type {
OverlayData,
TrendDataPoint,
} from '@/components/inference/types';
-import { processOverlayChartData } from '@/components/inference/utils';
+import {
+ processOverlayChartData,
+ selectUnofficialOverlayForMode,
+} from '@/components/inference/utils';
import {
isRunComparisonEntry,
makeRunComparisonEntry,
@@ -25,6 +31,7 @@ import ScatterGraph from '@/components/inference/ui/ScatterGraph';
import { Card } from '@/components/ui/card';
import { ChartButtons } from '@/components/ui/chart-buttons';
import { type SegmentedToggleOption, SegmentedToggle } from '@/components/ui/segmented-toggle';
+import { Tabs, TabsList, TabsTrigger } from '@/components/ui/tabs';
import { ChartShareActions, MetricAssumptionNotes } from '@/components/ui/chart-display-helpers';
import { UnofficialDomainNotice } from '@/components/ui/unofficial-domain-notice';
import { exportToCsv } from '@/lib/csv-export';
@@ -38,7 +45,6 @@ import {
DialogHeader,
DialogTitle,
} from '@/components/ui/dialog';
-import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
import { Skeleton } from '@/components/ui/skeleton';
import { useUnofficialRun } from '@/components/unofficial-run-provider';
import {
@@ -48,8 +54,15 @@ import {
getModelLabel,
getPrecisionLabel,
getSequenceLabel,
+ sequenceKind,
} from '@/lib/data-mappings';
import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
+import {
+ useDerivedAgenticMetrics,
+ type DerivedAgenticMetric,
+} from '@/hooks/api/use-derived-agentic-metrics';
+import { isAgenticOnlyXAxisMode, type XAxisMode } from '@/components/inference/hooks/useChartData';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
import { useTrendData } from '@/components/inference/hooks/useTrendData';
import { getHardwareConfig, hardwareKeyMatchesAnyBase } from '@/lib/constants';
@@ -67,55 +80,58 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra
});
import WorkflowInfoDisplay from './WorkflowInfoDisplay';
-/** Controlled popover dropdown for the e2e chart x-axis toggle. */
-function E2eXAxisDropdown({
- xAxisLabel,
- xAxisOptions,
- selectedValue,
- onSelect,
-}: {
- xAxisLabel: string;
- xAxisOptions: { value: string | null; label: string }[];
- selectedValue: string | null;
- onSelect: (value: string | null) => void;
-}) {
- const [open, setOpen] = useState(false);
- return (
-
-
- e.stopPropagation()}
- >
- vs. {xAxisLabel}
-
-
-
-
- {xAxisOptions.map((opt) => (
- {
- onSelect(opt.value);
- setOpen(false);
- }}
- >
- {opt.label}
-
- ))}
-
-
- );
+type InferenceViewMode = 'chart' | 'table';
+
+const X_AXIS_MODE_BUTTONS: { value: XAxisMode; label: string }[] = [ // value = mode id, label = display text (presumably rendered in this order — confirm)
+ { value: 'ttft', label: 'TTFT' },
+ { value: 'e2e', label: 'E2E Latency' },
+ { value: 'normalized-e2e', label: 'Normalized E2E' },
+ { value: 'interactivity', label: 'Interactivity' },
+ { value: 'session-time', label: 'Session Time' },
+ { value: 'prefill-tps', label: 'Prefill TPS / user' },
+];
+
+/**
+ * Presentation + data plumbing for the trace-derived x-axis modes (the
+ * agentic-only modes). One spec per mode keeps the x-label, chart heading,
+ * roofline corner, and derived-metric accessor in sync instead of scattering
+ * `selectedXAxisMode === …` conditionals through the render.
+ */
+interface DerivedXModeSpec {
+ xLabel: (percentileLabel: string) => string; // x-axis label text; receives the percentile display label
+ /** Chart heading suffix ("vs. …") shown above the plot. */
+ heading: (percentileLabel: string) => string;
+ rooflineCorner: 'upper_right' | 'upper_left'; // roofline direction for this mode (presumably overrides the chart default — confirm)
+ /** Pull the raw metric for this mode off the derived-metrics payload. */
+ value: (m: DerivedAgenticMetric | undefined, percentile: string) => number | null | undefined;
+ /** Convert the raw metric to the plotted x value. */
+ toX: (raw: number) => number;
}
-type InferenceViewMode = 'chart' | 'table';
+const DERIVED_X_MODE_SPECS: Partial> = { // keyed by x-axis mode; modes without an entry have no derived spec
+ 'session-time': {
+ xLabel: () => 'Mean Normalized Session Time (min)',
+ heading: () => 'vs. Mean Normalized Session Time',
+ rooflineCorner: 'upper_right',
+ value: (m) => m?.normalized_session_time_s,
+ toX: (raw) => raw / 60, // seconds (per the `_s` field name) → minutes, matching the "(min)" label
+ },
+ 'normalized-e2e': {
+ xLabel: (pctl) => `${pctl} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens (s)`,
+ heading: (pctl) => `vs. ${pctl} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens`,
+ rooflineCorner: 'upper_right',
+ value: (m, percentile) =>
+ percentile === 'p75' ? m?.p75_normalized_e2e_400_s : m?.p90_normalized_e2e_400_s, // NOTE(review): every percentile other than 'p75' reads the p90 field, yet xLabel/heading print the label they are given — confirm callers only pass p75/p90
+ toX: (raw) => raw, // already in the plotted unit
+ },
+ 'prefill-tps': {
+ xLabel: () => 'P90 Prefill TPS per user (tok/s)',
+ heading: () => 'vs. P90 Prefill TPS / user',
+ rooflineCorner: 'upper_left',
+ value: (m) => m?.p90_prefill_tps_per_user, // always the P90 field; the selected percentile is ignored, consistent with the fixed "P90" label
+ toX: (raw) => raw,
+ },
+};
const VIEW_MODE_OPTIONS: SegmentedToggleOption[] = [
{
@@ -161,8 +177,10 @@ export default function ChartDisplay() {
logScale,
activeHwTypes,
activeDates,
- setSelectedE2eXAxisMetric,
+ selectedPercentile,
compareGpuPair,
+ selectedXAxisMode,
+ setSelectedXAxisMode,
} = useInference();
const {
@@ -171,6 +189,9 @@ export default function ChartDisplay() {
totalDatesQueried,
} = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates);
+ const [mounted, setMounted] = useState(false);
+ useEffect(() => setMounted(true), []);
+
const modelDbKeys = useMemo(
() => DISPLAY_MODEL_TO_DB[selectedModel] ?? [selectedModel],
[selectedModel],
@@ -278,6 +299,7 @@ export default function ChartDisplay() {
chartType,
selectedYAxisMetric,
effectiveXMetric,
+ { isAgentic: sequenceKind(selectedSequence) === 'agentic' },
);
let overlayPoints = processed;
@@ -395,238 +417,269 @@ export default function ChartDisplay() {
}));
}, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
- const displayGraphs = isFirstLoad
- ? Array.from({ length: 2 }).map((_, index) => (
-
-
-
-
-
- ))
- : effectiveGraphs.length === 0
- ? []
- : effectiveGraphs.map((graph, graphIndex) => {
- const isTimelineMode = Boolean(
- selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
- );
- const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
- return (
-
-
- handleViewModeChange(graphIndex, v)}
- ariaLabel="View mode"
- testId={`inference-view-toggle-${graphIndex}`}
- />
- }
- hideImageExport={getViewMode(graphIndex) === 'table'}
- setIsLegendExpanded={setIsLegendExpanded}
- exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
- onExportMp4={
- replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
- }
- onExportCsv={() => {
- const visibleData = graph.data.filter((d) =>
+ const visibleGraphs = useMemo(() => {
+ const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
+ const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
+ return filtered.length > 0 ? filtered : effectiveGraphs;
+ }, [effectiveGraphs, selectedXAxisMode]);
+
+ const isAgenticSequence = sequenceKind(selectedSequence) === 'agentic';
+ const useDerived = isAgenticSequence && isAgenticOnlyXAxisMode(selectedXAxisMode);
+ const derivedTargetIds = useMemo(() => {
+ if (!useDerived) return [] as number[];
+ const ids = new Set();
+ for (const graph of visibleGraphs) {
+ for (const point of graph.data) {
+ // Overlay-only agentic points carry no persisted id — skip them so we
+ // never request `?ids=0`/`?ids=NaN` (which 400s and errors the chart).
+ if (point.benchmark_type === 'agentic_traces' && isPersistedBenchmarkId(point.id)) {
+ ids.add(point.id);
+ }
+ }
+ }
+ return [...ids];
+ }, [useDerived, visibleGraphs]);
+ const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
+ const derivedMetrics = derivedQuery.data;
+ const isDerivedLoading =
+ useDerived &&
+ derivedTargetIds.length > 0 &&
+ (derivedQuery.isPending || derivedQuery.isFetching) &&
+ !derivedMetrics;
+
+ // Set only when the user is on a derived (agentic-only) x-axis mode; the
+ // specs are module constants so this is referentially stable per mode.
+ const derivedSpec = useDerived ? DERIVED_X_MODE_SPECS[selectedXAxisMode] : undefined;
+
+ const renderableGraphs = useMemo(() => {
+ if (!derivedSpec) return visibleGraphs;
+ if (!derivedMetrics) return visibleGraphs.map((graph) => ({ ...graph, data: [] }));
+ const xLabel = derivedSpec.xLabel(selectedPercentile.toUpperCase());
+ return visibleGraphs.map((graph) => {
+ const chartDefinition = {
+ ...graph.chartDefinition,
+ x_label: xLabel,
+ y_latency_limit: undefined,
+ [`${selectedYAxisMetric}_roofline` as keyof typeof graph.chartDefinition]:
+ derivedSpec.rooflineCorner,
+ };
+ const data = graph.data
+ .map((point) => {
+ if (!isPersistedBenchmarkId(point.id)) return null;
+ const raw = derivedSpec.value(derivedMetrics[point.id], selectedPercentile);
+ if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
+ return { ...point, x: derivedSpec.toX(raw) };
+ })
+ .filter((point): point is NonNullable => point !== null);
+ return { ...graph, chartDefinition, data };
+ });
+ }, [derivedSpec, visibleGraphs, derivedMetrics, selectedYAxisMetric, selectedPercentile]);
+
+ const displayGraphs =
+ isFirstLoad || isDerivedLoading
+ ? [
+
+
+
+
+ ,
+ ]
+ : renderableGraphs.length === 0
+ ? []
+ : renderableGraphs.map((graph, graphIndex) => {
+ const isTimelineMode = Boolean(
+ selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+ );
+ const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+ return (
+
+
+
- activeOverlayHwTypes.has(p.hwKey as string) &&
- selectedPrecisions.includes(p.precision),
- );
- const issueNotes = matchKnownConfigIssues(graph.model, [
- ...visibleData,
- ...visibleOverlayRows,
- ]).map((issue) =>
- knownIssueCsvNote(
- issue,
- getDisplayLabel(getHardwareConfig(issue.hwKey, graph.model)),
- ),
- );
- exportToCsv(
- `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
- headers,
- rows,
- issueNotes,
- );
- }}
- />
-
- {(() => {
- const chartCaption = (
- <>
-
- {
- graph.chartDefinition[
- `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
- ]
- }{' '}
- {(() => {
- // For Input metrics with dynamic x-axis, use dynamic heading
- const metricTitle =
- (graph.chartDefinition[
+ ? 'gpu_timeseries'
+ : graph.chartDefinition.chartType === 'e2e'
+ ? 'latency'
+ : 'interactivity'
+ }
+ leadingControls={
+ handleViewModeChange(graphIndex, v)}
+ ariaLabel="View mode"
+ testId={`inference-view-toggle-${graphIndex}`}
+ />
+ }
+ hideImageExport={getViewMode(graphIndex) === 'table'}
+ setIsLegendExpanded={setIsLegendExpanded}
+ exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+ onExportMp4={
+ replayAvailable
+ ? () => replayHandlesRef.current[graphIndex]?.open()
+ : undefined
+ }
+ onExportCsv={() => {
+ const visibleData = graph.data.filter((d) =>
+ isTimelineMode
+ ? activeDates.has(`${d.date}_${d.hwKey}`)
+ : activeHwTypes.has(d.hwKey as string) &&
+ selectedPrecisions.includes(d.precision),
+ );
+ const { headers, rows } = inferenceChartToCsv(
+ visibleData,
+ graph.model,
+ graph.sequence,
+ );
+ // Match warnings against the same series the chart annotates,
+ // including visible unofficial-run overlay series.
+ const overlay = selectUnofficialOverlayForMode(
+ selectedXAxisMode,
+ graph.chartDefinition.chartType,
+ overlayDataByChartType,
+ );
+ const visibleOverlayRows = isTimelineMode
+ ? []
+ : (overlay?.data ?? []).filter(
+ (p) =>
+ activeOverlayHwTypes.has(p.hwKey as string) &&
+ selectedPrecisions.includes(p.precision),
+ );
+ const issueNotes = matchKnownConfigIssues(graph.model, [
+ ...visibleData,
+ ...visibleOverlayRows,
+ ]).map((issue) =>
+ knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))),
+ );
+ exportToCsv(
+ `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+ headers,
+ rows,
+ issueNotes,
+ );
+ }}
+ />
+
+ {(() => {
+ const chartCaption = (
+ <>
+
+ {
+ graph.chartDefinition[
`${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
- ] as string) || '';
- const isInputMetric = metricTitle.toLowerCase().includes('input');
- if (
- graph.chartDefinition.chartType === 'interactivity' &&
- isInputMetric &&
- selectedXAxisMetric
- ) {
- if (selectedXAxisMetric === 'p99_ttft') {
- return 'vs. P99 Time To First Token';
- } else if (selectedXAxisMetric === 'median_ttft') {
- return 'vs. Median Time To First Token';
+ ]
+ }{' '}
+ {(() => {
+ // For Input metrics with dynamic x-axis, use dynamic heading
+ const metricTitle =
+ (graph.chartDefinition[
+ `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+ ] as string) || '';
+ const isInputMetric = metricTitle.toLowerCase().includes('input');
+ if (
+ graph.chartDefinition.chartType === 'interactivity' &&
+ isInputMetric &&
+ selectedXAxisMetric
+ ) {
+ if (selectedXAxisMetric === 'p99_ttft') {
+ return 'vs. P99 Time To First Token';
+ } else if (selectedXAxisMetric === 'median_ttft') {
+ return 'vs. Median Time To First Token';
+ }
}
- }
- // For e2e chart: render clickable inline dropdown for x-axis
- if (graph.chartDefinition.chartType === 'e2e') {
- const xAxisLabel =
- selectedE2eXAxisMetric === 'p99_ttft'
- ? 'P99 TTFT'
- : selectedE2eXAxisMetric === 'median_ttft'
- ? 'Median TTFT'
- : 'End-to-end Latency';
- const xAxisOptions = [
- { value: null, label: 'End-to-end Latency' },
- { value: 'p99_ttft', label: 'P99 TTFT' },
- { value: 'median_ttft', label: 'Median TTFT' },
- ];
- const zoomPrefix =
- selectedDateRange.startDate &&
- selectedDateRange.endDate &&
- selectedGPUs.length > 0
- ? 'gpu_timeseries'
- : 'latency';
+ // The e2e chart heading follows the branch-level x-axis mode
+ // selector, including agentic-only derived metrics.
+ if (graph.chartDefinition.chartType === 'e2e') {
+ const modeSpec = DERIVED_X_MODE_SPECS[selectedXAxisMode];
+ if (modeSpec) {
+ return modeSpec.heading(selectedPercentile.toUpperCase());
+ }
+ if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+ const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+ const word =
+ percentile === 'median' ? 'Median' : percentile.toUpperCase();
+ return `vs. ${word} Time To First Token`;
+ }
+ return isAgenticSequence
+ ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency`
+ : 'vs. End-to-end Latency';
+ }
+
+ // Fall back to configured heading
return (
- {
- setSelectedE2eXAxisMetric(value);
- track('latency_x_axis_metric_selected', {
- metric: value ?? 'median_e2el',
- });
- window.dispatchEvent(
- new CustomEvent(
- `${zoomPrefix}_zoom_reset_chart-${graphIndex}`,
- ),
- );
- }}
- />
+ graph.chartDefinition[
+ `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
+ ] || graph.chartDefinition.heading
);
- }
-
- // Fall back to configured heading
- return (
- graph.chartDefinition[
- `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
- ] || graph.chartDefinition.heading
- );
- })()}
-
-
- {getModelLabel(graph.model as Model)} •{' '}
- {selectedPrecisions
- .map((prec) => getPrecisionLabel(prec as Precision))
- .join(', ')}{' '}
- • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
- {isUnofficialRun
- ? 'Source: UNOFFICIAL'
- : 'Source: SemiAnalysis InferenceX™'}
- {selectedRunDate && (
- <>
- {' '}
- • Updated:{' '}
- {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
- 'en-US',
- {
- year: 'numeric',
- month: '2-digit',
- day: '2-digit',
- timeZone: 'UTC',
- },
- )}
- >
+ })()}
+
+
+ {getModelLabel(graph.model as Model)} •{' '}
+ {selectedPrecisions
+ .map((prec) => getPrecisionLabel(prec as Precision))
+ .join(', ')}{' '}
+ • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+ {isUnofficialRun
+ ? 'Source: UNOFFICIAL'
+ : 'Source: SemiAnalysis InferenceX™'}
+ {selectedRunDate && (
+ <>
+ {' '}
+ • Updated:{' '}
+ {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+ 'en-US',
+ {
+ year: 'numeric',
+ month: '2-digit',
+ day: '2-digit',
+ timeZone: 'UTC',
+ },
+ )}
+ >
+ )}
+
+
+ {isUnofficialRun && selectedXAxisMode === 'normalized-e2e' && (
+
+ Normalized E2E requires persisted per-request traces, so
+ unofficial-run overlays are unavailable for this experimental view.
+
)}
-
-
-
- >
- );
-
- if (getViewMode(graphIndex) === 'table') {
- const overlay =
- graph.chartDefinition.chartType === 'e2e'
- ? overlayDataByChartType.e2e
- : overlayDataByChartType.interactivity;
- const overlayRows = (overlay?.data ?? []).filter((p) =>
- selectedPrecisions.includes(p.precision),
- );
- return (
- <>
- {chartCaption}
- 0 ? [...graph.data, ...overlayRows] : graph.data
- }
- chartDefinition={graph.chartDefinition}
- selectedYAxisMetric={selectedYAxisMetric}
- />
+
>
);
- }
- return selectedGPUs.length > 0 &&
- ((selectedDateRange.startDate && selectedDateRange.endDate) ||
- selectedDates.length > 0) ? (
-
- ) : (
-
-
+ selectedPrecisions.includes(p.precision),
+ );
+ return (
+ <>
+ {chartCaption}
+ 0
+ ? [...graph.data, ...overlayRows]
+ : graph.data
+ }
+ chartDefinition={graph.chartDefinition}
+ selectedYAxisMetric={selectedYAxisMetric}
+ />
+ >
+ );
+ }
+
+ return selectedGPUs.length > 0 &&
+ ((selectedDateRange.startDate && selectedDateRange.endDate) ||
+ selectedDates.length > 0) ? (
+
- {selectedGPUs.length > 0 &&
- (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
- selectedDates.length === 0 && (
-
-
- Select a date range or add a run to view GPU comparison
-
-
- )}
-
- );
- })()}
- {replayAvailable && (
- {
- replayHandlesRef.current[graphIndex] = handle;
- }}
- parentChartId={`chart-${graphIndex}`}
- chartDefinition={graph.chartDefinition}
- yLabel={`${
- graph.chartDefinition[
- `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
- ]
- }`}
- xLabel={graph.chartDefinition.x_label}
- />
- )}
-
-
-
- );
- });
+ ) : (
+
+
+ {selectedGPUs.length > 0 &&
+ (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
+ selectedDates.length === 0 && (
+
+
+ Select a date range or add a run to view GPU comparison
+
+
+ )}
+
+ );
+ })()}
+ {replayAvailable && (
+ {
+ replayHandlesRef.current[graphIndex] = handle;
+ }}
+ parentChartId={`chart-${graphIndex}`}
+ chartDefinition={graph.chartDefinition}
+ yLabel={`${
+ graph.chartDefinition[
+ `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+ ]
+ }`}
+ xLabel={graph.chartDefinition.x_label}
+ />
+ )}
+
+
+
+ );
+ });
return (
@@ -733,6 +804,35 @@ export default function ChartDisplay() {
)}
+
{
+ setSelectedXAxisMode(value as XAxisMode);
+ track('latency_x_axis_mode_selected', { mode: value });
+ }}
+ >
+
+ {X_AXIS_MODE_BUTTONS.filter(({ value }) => {
+ if (!isAgenticOnlyXAxisMode(value)) return true;
+ // Before mount, render all buttons so SSR and first client render match.
+ if (!mounted) return true;
+ return isAgenticSequence;
+ }).map(({ value, label }) => (
+
+ {label}
+
+ ))}
+
+
{displayGraphs}
{/* Performance Over Time — Modal Drill-Down */}
diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index df22b8f5..a8cfed48 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -12,6 +12,7 @@ import { getChartWatermark } from '@/lib/data-mappings';
import { generateGpuDateColors } from '@/lib/dynamic-colors';
import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
import { useThemeColors } from '@/hooks/useThemeColors';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
import { D3Chart } from '@/lib/d3-chart/D3Chart';
import type {
CustomLayerConfig,
@@ -26,6 +27,7 @@ import {
formatLargeNumber,
getShapeKeyForPrecision,
logTickFormat,
+ POINT_SIZE,
} from '@/lib/chart-rendering';
import {
paretoFrontLowerLeft,
@@ -259,6 +261,20 @@ const GPUGraph = React.memo(
return pts;
}, [groupedData, activeDates, hideNonOptimal, optimalPointKeys]);
+ // GPU comparison currently renders official DB-backed points only. Unofficial
+ // overlays have no benchmark_results id or persisted trace, so they cannot
+ // open the dedicated per-point charts route.
+ const agenticIds = useMemo(
+ () =>
+ filteredData.flatMap((point) =>
+ point.benchmark_type === 'agentic_traces' && typeof point.id === 'number'
+ ? [point.id]
+ : [],
+ ),
+ [filteredData],
+ );
+ const { data: traceAvailability } = useTraceAvailability(agenticIds);
+
// Warning annotations for visible series with known upstream issues —
// same treatment the scatter view gets, applied to the date-comparison view.
// Lines here are colored per (gpu, date) pair, so take the first active
@@ -755,7 +771,11 @@ const GPUGraph = React.memo(
config: {
getColor,
hideLabels: !showPointLabels,
- getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+ // Match ScatterGraph: append the concurrency (C=) to the
+ // parallelism/tp label so compare-mode points are annotated the
+ // same way as the single-run scatter chart.
+ getLabelText: (d) =>
+ useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
foreground: 'var(--foreground)',
dataAttrs: {
series: (d) => `${d.date}_${d.hwKey}`,
@@ -794,6 +814,7 @@ const GPUGraph = React.memo(
selectedYAxisMetric,
hardwareConfig,
runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+ hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
}),
getRulerX: (d, xScale) => (xScale as d3.ScaleLinear
)(d.x),
getRulerY: (d, yScale) => (yScale as d3.ScaleLinear)(d.y),
@@ -807,6 +828,37 @@ const GPUGraph = React.memo(
sel.select('.visible-shape') as any,
getShapeKeyForPrecision(d.precision, selectedPrecisions),
),
+ onPointClick: (d: InferenceData) => {
+ track('gpu_timeseries_data_point_clicked', {
+ id: d.id,
+ hw: String(d.hwKey),
+ x: d.x,
+ y: d.y,
+ });
+ const tooltipEl = chartRef.current?.getTooltipElement();
+ if (!tooltipEl) return;
+ const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+ if (!viewBtn || typeof d.id !== 'number') return;
+ viewBtn.addEventListener('click', (event) => {
+ event.stopPropagation();
+ track('gpu_timeseries_view_charts_opened', {
+ id: d.id,
+ hwKey: String(d.hwKey),
+ conc: d.conc,
+ });
+ });
+ // Pinning updates D3Chart's React state. GPU comparison rebuilds
+ // several inline layer configs on that render, whose cleanup can
+ // briefly hide the otherwise-pinned portal tooltip. Restore its
+ // pinned visibility after that render settles.
+ requestAnimationFrame(() => {
+ const pinnedTooltip = chartRef.current?.getTooltipElement();
+ if (!pinnedTooltip || chartRef.current?.getPinnedPoint() !== d) return;
+ pinnedTooltip.style.opacity = '1';
+ pinnedTooltip.style.display = 'block';
+ pinnedTooltip.style.pointerEvents = 'auto';
+ });
+ },
attachToLayer: 1,
}}
onRender={(ctx: RenderContext) => {
@@ -819,6 +871,28 @@ const GPUGraph = React.memo(
}
// Set foreground color on scatter point labels
ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)');
+
+ // Offload halo: dashed ring on every point that used KV offload
+ // (mirrors ScatterGraph so compare mode shows the same CPU-offload
+ // indicator). The ring is a child of the dot-group, so it travels
+ // with the point on zoom/pan without a separate onZoom pass.
+ ctx.layout.zoomGroup
+ .selectAll('.dot-group')
+ .each(function (d) {
+ const showHalo = d.offload_mode === 'on';
+ d3.select(this)
+ .selectAll('.offload-halo')
+ .data(showHalo ? [true] : [])
+ .join('circle')
+ .attr('class', 'offload-halo')
+ .attr('r', POINT_SIZE + 4)
+ .attr('fill', 'none')
+ .attr('stroke', 'var(--foreground)')
+ .attr('stroke-width', 1.5)
+ .attr('stroke-dasharray', '3 2')
+ .attr('opacity', 0.9)
+ .attr('pointer-events', 'none');
+ });
}}
legendElement={
void;
+ /** Series label, e.g. "B300 (vLLM)". */
+ title: string;
+ /** Context line, e.g. "DeepSeek V4 Pro · Agentic Traces". */
+ subtitle: string;
+ /** Legend swatch color for this series (overlayRunColor for overlay runs). */
+ accentColor: string;
+ /** Rows from buildLegendPointsRows — already default-sorted by concurrency. */
+ rows: LegendPointsTableRow[];
+ /** Unofficial-run overlay series: metrics only, no detail links. */
+ isOverlay: boolean;
+ onRowClick?: (row: LegendPointsTableRow) => void;
+}
+
+interface Column {
+ key: LegendPointsSortKey;
+ label: string;
+ numeric: boolean;
+}
+
+/** Text for one table cell; a missing offload value is shown as an em dash. */
+const cellValue = (row: LegendPointsTableRow, { key }: Column): string => {
+  if (key === 'parallelism') return row.parallelism;
+  if (key === 'offload') return row.offload ?? '—';
+  return key === 'conc' ? String(row.conc) : formatRowValue(row[key]);
+};
+
+/**
+ * Per-series drill-down opened from the chart legend: every currently-visible
+ * point of one hardware/framework series, with the same detail links the
+ * scatter points offer on click.
+ */
+export default function LegendPointsDialog({
+ open,
+ onOpenChange,
+ title,
+ subtitle,
+ accentColor,
+ rows,
+ isOverlay,
+ onRowClick,
+}: LegendPointsDialogProps) {
+ const [sort, setSort] = useState<{ key: LegendPointsSortKey; dir: 'asc' | 'desc' } | null>(null);
+
+ const hasOffload = rows.some((r) => r.offload !== null);
+ const columns = useMemo(
+ (): Column[] => [
+ { key: 'conc', label: 'Conc', numeric: true },
+ { key: 'parallelism', label: 'Parallelism', numeric: false },
+ ...(hasOffload ? [{ key: 'offload', label: 'Offload', numeric: false } as Column] : []),
+ { key: 'tputPerGpu', label: 'Tput/GPU', numeric: true },
+ { key: 'p50Intvty', label: 'p50 Int', numeric: true },
+ { key: 'p90Intvty', label: 'p90 Int', numeric: true },
+ { key: 'p50Ttft', label: 'p50 TTFT', numeric: true },
+ { key: 'p90Ttft', label: 'p90 TTFT', numeric: true },
+ ],
+ [hasOffload],
+ );
+
+ const sortedRows = useMemo(
+ () => (sort ? sortLegendPointsRows(rows, sort.key, sort.dir) : rows),
+ [rows, sort],
+ );
+
+ const toggleSort = (key: LegendPointsSortKey) => {
+ setSort((prev) =>
+ prev?.key === key ? (prev.dir === 'asc' ? { key, dir: 'desc' } : null) : { key, dir: 'asc' },
+ );
+ };
+
+ // Trailing column reserves space for the detail-link icon.
+ const gridTemplateColumns = `${columns.map(() => 'auto').join(' ')} min-content`;
+
+ const renderCells = (row: LegendPointsTableRow) => (
+ <>
+ {columns.map((col) => (
+
+ {cellValue(row, col)}
+
+ ))}
+
+ {row.href &&
+ (row.isExternal ? (
+
+ ) : (
+ →
+ ))}
+
+ >
+ );
+
+ return (
+
+
+
+
+
+ {title}
+
+ {subtitle}
+
+
+ {sortedRows.length === 0 ? (
+
+ No visible points for this series under the current filters.
+
+ ) : (
+ // One grid owns the column tracks; every row is a subgrid so cells
+ // align across ALL rows (per-row grids would auto-size independently
+ // and produce ragged columns).
+
+ )}
+
+
+ {isOverlay
+ ? 'Unofficial overlay points have no stored benchmark records — metrics only, no detail links.'
+ : 'Click a row for the point detail — agentic points open the trace detail page, fixed-seq points open the GitHub Actions run.'}{' '}
+ Interactivity in tok/s/user · TTFT in s · throughput in tok/s/gpu.
+
+
+
+ );
+}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx b/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx
index 2fd42acb..fac038e3 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.decoration.test.tsx
@@ -33,6 +33,13 @@ vi.mock('@/components/unofficial-run-provider', () => ({
useUnofficialRun: () => overlayState.current,
}));
+// ScatterGraph calls useTraceAvailability (a useQuery) for the agentic "View
+// charts" tooltip button. Stub it so these decoration tests don't need a
+// QueryClientProvider — trace presence is irrelevant to the toggle path.
+vi.mock('@/hooks/api/use-trace-availability', () => ({
+ useTraceAvailability: () => ({ data: undefined }),
+}));
+
import ScatterGraph from './ScatterGraph';
// ── Environment stubs ────────────────────────────────────────────────────────
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 64a8b218..e12522ce 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -2,10 +2,11 @@
import { track } from '@/lib/analytics';
import * as d3 from 'd3';
-import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef } from 'react';
+import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react';
import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
import { useInference } from '@/components/inference/InferenceContext';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
import { pointNearestX } from '@/components/inference/ui/line-label-anchor';
import {
labelOpacityForActiveState,
@@ -15,7 +16,13 @@ import ChartLegend from '@/components/ui/chart-legend';
import { useUnofficialRun } from '@/components/unofficial-run-provider';
import { computeToggle } from '@/hooks/useTogglableSet';
import { getHardwareConfig, getModelSortIndex } from '@/lib/constants';
-import { getChartWatermark, getPrecisionLabel, type Precision } from '@/lib/data-mappings';
+import {
+ getChartWatermark,
+ getPrecisionLabel,
+ getSequenceLabel,
+ type Precision,
+ Sequence,
+} from '@/lib/data-mappings';
import { matchKnownConfigIssues, pointMatchesIssue } from '@/lib/known-issues';
import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
import { D3Chart } from '@/lib/d3-chart/D3Chart';
@@ -44,12 +51,7 @@ import {
getShapeKeyForPrecision,
} from '@/lib/chart-rendering';
import { useThemeColors } from '@/hooks/useThemeColors';
-import {
- paretoFrontLowerLeft,
- paretoFrontLowerRight,
- paretoFrontUpperLeft,
- paretoFrontUpperRight,
-} from '@/lib/chart-utils';
+import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils';
import { type RooflineDirection, getSpeedOverlayCorners } from '@/lib/speed-overlay';
import type {
ChartDefinition,
@@ -61,6 +63,8 @@ import {
generateTooltipContent,
getPointLabel,
} from '@/components/inference/utils/tooltipUtils';
+import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog';
+import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table';
import {
type ParetoPointLabel,
getParetoLabel,
@@ -76,6 +80,96 @@ import {
} from '@/components/inference/utils/knownIssueAnnotations';
import { matchesQuickFilters } from '@/components/inference/utils/quickFilters';
+/**
+ * Greedy label-collision avoidance for scatter point labels.
+ *
+ * Collects the labels of the non-hidden `.dot-group` nodes, then places them
+ * left to right, trying four slots per label: above, below, further above and
+ * further below the point. A slot is the y of the FIRST baseline relative to
+ * the point center, applied via the first tspan's `dy` — later tspans cascade
+ * down by 1.1em. A label is hidden (opacity 0) when all four slots collide.
+ *
+ * @param zoomGroup Selection holding the `.dot-group` nodes; labels are mutated
+ *   in place. NOTE(review): type arguments assumed — confirm against callers.
+ */
+function avoidLabelCollisions(
+  zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
+): void {
+  type Box = { left: number; right: number; top: number; bottom: number };
+  interface LabelInfo {
+    el: SVGTextElement;
+    firstTspan: SVGTSpanElement;
+    cx: number;
+    cy: number;
+    w: number;
+    nLines: number;
+    defaultFirstY: number;
+  }
+  const labels: LabelInfo[] = [];
+  const ASCENT = 9;
+  const DESCENT = 3;
+  const LINE_H = 11;
+
+  zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
+    const labelEl = this.querySelector<SVGTextElement>('.point-label');
+    if (!labelEl) return;
+    if (this.style.opacity === '0') return;
+    const tspans = labelEl.querySelectorAll('tspan');
+    if (tspans.length === 0) return;
+    const m = (this.getAttribute('transform') ?? '').match(/translate\(([^,]+),([^)]+)\)/u);
+    if (!m) return;
+    const cx = parseFloat(m[1]);
+    const cy = parseFloat(m[2]);
+    // NaN coordinates would poison later overlap checks — leave such labels alone.
+    if (!Number.isFinite(cx) || !Number.isFinite(cy)) return;
+    const nLines = tspans.length;
+    const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point
+    // Reset to default before measuring so prior positioning doesn't bias bbox
+    tspans[0].setAttribute('dy', `${defaultFirstY}px`);
+    labelEl.style.opacity = '1';
+    const w = labelEl.getBBox().width;
+    labels.push({ el: labelEl, firstTspan: tspans[0], cx, cy, w, nLines, defaultFirstY });
+  });
+
+  labels.sort((a, b) => a.cx - b.cx);
+  const placed: Box[] = [];
+  const pad = 2;
+
+  for (const lab of labels) {
+    const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT;
+    const aboveFirstY = lab.defaultFirstY;
+    const belowFirstY = 14; // first baseline 14px below point center
+    const candidates = [
+      aboveFirstY,
+      belowFirstY,
+      aboveFirstY - blockH - 2,
+      belowFirstY + blockH + 2,
+    ];
+    // Horizontal extent does not depend on the slot being tried.
+    const left = lab.cx - lab.w / 2 - pad;
+    const right = lab.cx + lab.w / 2 + pad;
+    let chosen: { firstY: number; box: Box } | null = null;
+    for (const firstY of candidates) {
+      const top = lab.cy + firstY - ASCENT - pad;
+      const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad;
+      const collides = placed.some(
+        (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
+      );
+      if (!collides) {
+        chosen = { firstY, box: { left, right, top, bottom } };
+        break;
+      }
+    }
+    if (chosen) {
+      lab.firstTspan.setAttribute('dy', `${chosen.firstY}px`);
+      lab.el.style.opacity = '1';
+      placed.push(chosen.box);
+    } else {
+      lab.el.style.opacity = '0';
+    }
+  }
+}
+
// X-shape path for overlay (unofficial) data points
const X_SIZE = 5;
const X_HOVER_SIZE = 7;
@@ -108,9 +202,40 @@ const formatChangelogDescription = (desc: string | string[]): React.JSX.Element
const CHART_MARGIN = { top: 24, right: 10, bottom: 60, left: 60 };
+/**
+ * Groups points into per-date buckets, keeping input order inside each bucket
+ * and first-seen order across dates. Comparison overlays put multiple dates
+ * under one legend key, and rooflines / gradient paths must never span dates —
+ * a May 15 point can't dominate a May 17 plot.
+ *
+ * @param points Points to partition; the array itself is not modified.
+ * @returns Map from a point's `date` to the points carrying that date.
+ */
+function groupPointsByDate(points: InferenceData[]): Map<InferenceData['date'], InferenceData[]> {
+  const byDate = new Map<InferenceData['date'], InferenceData[]>();
+  for (const point of points) {
+    const bucket = byDate.get(point.date);
+    if (bucket) bucket.push(point);
+    else byDate.set(point.date, [point]);
+  }
+  return byDate;
+}
+
+/** Lookup key for "is this point on a roofline" checks; the date is part of the key, so it is scoped per date. */
+const optimalPointKey = ({ hwKey, precision, date, x, y }: InferenceData): string =>
+  `${hwKey}_${precision}_${date}-${x}-${y}`;
+
+/** Two-line point label: TP (or the full parallelism label when `advanced`) above `C=<concurrency>`. */
+const pointLabelText = (d: InferenceData, advanced: boolean): string =>
+  `${advanced ? getPointLabel(d) : d.tp}\nC=${d.conc}`;
+
// Referentially stable "no overlay data" result (see processedOverlayData).
const EMPTY_OVERLAY_DATA: InferenceData[] = [];
+/** Which legend series' points table is open (per-series drill-down dialog). */
+type LegendPointsTarget =
+ | { kind: 'official'; hwKey: string }
+ | { kind: 'overlay'; runIndex: number; runId: number; branch: string };
+
// Scale configs are recomputed from the visible points on every render, but a
// legend / precision toggle usually leaves the actual domain untouched (x-min
// is pinned at 0; extremes are owned by a handful of points). Comparing by
@@ -214,6 +339,8 @@ const ScatterGraph = React.memo(
trackedConfigs,
addTrackedConfig,
removeTrackedConfig,
+ selectedXAxisMode,
+ selectedSequence,
quickFilters,
} = useInference();
@@ -289,10 +416,18 @@ const ScatterGraph = React.memo(
() => [...effectiveOfficialHwTypes],
[effectiveOfficialHwTypes],
);
+ // High-contrast palette is keyed off the FULL set of official hw types with
+ // data, not the active subset. Otherwise deselecting a line shrinks the key
+ // set, which re-sizes the iwanthue palette and shifts every remaining line's
+ // hue (most visible for single-vendor agentic runs that span the full wheel —
+ // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the
+ // stable full set fixes each hw's color so toggling only hides/shows lines.
+ const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]);
const { resolveColor, getCssColor } = useThemeColors({
highContrast,
identifiers: activeHwKeys,
activeKeys: activeOfficialKeys,
+ hcKeys: stableHcKeys,
});
// --- Changelog ---
@@ -328,34 +463,40 @@ const ScatterGraph = React.memo(
);
const rooflines = useMemo(() => {
+ // Frontier scope is (hw, precision, date) — points from different dates
+ // can never share a frontier (a May 15 point can't dominate a May 17 plot).
+ // The legend grouping is still by (hw, precision); we just split the
+ // pareto compute per date and re-merge into the legend bucket.
const result: Record = {};
const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
- const dir = chartDefinition[rooflineKey] as
- | 'upper_right'
- | 'upper_left'
- | 'lower_left'
- | 'lower_right'
- | undefined;
- for (const hw of Object.keys(groupedData)) {
- const front =
- dir === 'upper_right'
- ? paretoFrontUpperRight(groupedData[hw])
- : dir === 'upper_left'
- ? paretoFrontUpperLeft(groupedData[hw])
- : dir === 'lower_left'
- ? paretoFrontLowerLeft(groupedData[hw])
- : paretoFrontLowerRight(groupedData[hw]);
- front.sort((a, b) => a.x - b.x);
- result[hw] = front;
+ const dir = chartDefinition[rooflineKey] as ParetoDirection | undefined;
+ const frontierFn = paretoFrontForDirection(dir ?? 'lower_right');
+ for (const hwKey of Object.keys(groupedData)) {
+ const combined: InferenceData[] = [];
+ for (const datePoints of groupPointsByDate(groupedData[hwKey]).values()) {
+ // In non-e2e xmodes, useChartData stamps every point with an
+ // `isOnE2eFrontier` flag so the line is restricted to the
+ // e2e-Pareto winners — same set of points across every chart,
+ // just re-plotted at the chosen x metric. When the flag is
+ // present on ANY point in the bucket, narrow to the winners
+ // before paretoing (otherwise we'd recompute a fresh frontier
+ // on the swapped x axis and reintroduce the benchmark hack).
+ const flagged = datePoints.some((p) => p.isOnE2eFrontier !== undefined);
+ const seedPoints = flagged
+ ? datePoints.filter((p) => p.isOnE2eFrontier === true)
+ : datePoints;
+ if (seedPoints.length === 0) continue;
+ combined.push(...frontierFn(seedPoints));
+ }
+ combined.sort((a, b) => a.x - b.x);
+ result[hwKey] = combined;
}
return result;
}, [groupedData, selectedYAxisMetric, chartDefinition]);
const optimalPointKeys = useMemo(() => {
const keys = new Set();
- Object.values(rooflines).forEach((pts) =>
- pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)),
- );
+ Object.values(rooflines).forEach((pts) => pts.forEach((p) => keys.add(optimalPointKey(p))));
return keys;
}, [rooflines]);
@@ -381,6 +522,10 @@ const ScatterGraph = React.memo(
const buildPointConfigId = useCallback((point: InferenceData): string => {
let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`;
if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`;
+ // Agentic runs emit two rows per (config, conc) — one offload=on, one off.
+ // Without this suffix, d3's data join treats them as the same point and
+ // drops one variant (along with its halo).
+ if (point.offload_mode) key += `|offload-${point.offload_mode}`;
return key;
}, []);
@@ -454,22 +599,11 @@ const ScatterGraph = React.memo(
{} as Record,
);
const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
- const dir = chartDefinition[rooflineKey] as
- | 'upper_right'
- | 'upper_left'
- | 'lower_left'
- | 'lower_right'
- | undefined;
+ const dir = chartDefinition[rooflineKey] as ParetoDirection | undefined;
+ const frontierFn = paretoFrontForDirection(dir ?? 'lower_right');
const result: Record = {};
for (const [key, group] of Object.entries(grouped)) {
- const front =
- dir === 'upper_right'
- ? paretoFrontUpperRight(group.points)
- : dir === 'upper_left'
- ? paretoFrontUpperLeft(group.points)
- : dir === 'lower_left'
- ? paretoFrontLowerLeft(group.points)
- : paretoFrontLowerRight(group.points);
+ const front = frontierFn(group.points);
front.sort((a, b) => a.x - b.x);
result[key] = { hwKey: group.hwKey, runIndex: group.runIndex, points: front };
}
@@ -479,6 +613,71 @@ const ScatterGraph = React.memo(
// All official points for rendering (unfiltered — visibility via opacity)
const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
+ // Bulk presence lookup for agentic points: which ids have a stored
+ // trace_replay blob → controls the "View charts" button in the pinned
+ // tooltip. We deliberately don't fetch the histograms themselves here;
+ // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through
+ // Neon's HTTP API and trip its 64 MB per-response cap.
+ const agenticIds = useMemo(
+   () =>
+     // Keep only agentic-trace points that carry a numeric id, in input order.
+     pointsData.flatMap((pt) =>
+       pt.benchmark_type === 'agentic_traces' && typeof pt.id === 'number' ? [pt.id] : [],
+     ),
+   [pointsData],
+ );
+ const { data: traceAvailability } = useTraceAvailability(agenticIds);
+
+ // --- Legend points table (per-series drill-down opened from the legend) ---
+ const [pointsTableTarget, setPointsTableTarget] = useState(null);
+
+ const pointsTable = useMemo(() => {
+   // No legend entry is selected, so there is no table to build.
+   const target = pointsTableTarget;
+   if (!target) return null;
+
+   if (target.kind !== 'official') {
+     // Overlay series: rows come from this run's processed overlay points,
+     // limited to the overlay hardware types that are currently toggled on.
+     const { runIndex, runId, branch } = target;
+     const overlayPts = processedOverlayData.filter((p) => {
+       if (overlayRunIndex(p.run_url ?? null, runIndexByUrl) !== runIndex) return false;
+       return activeOverlayHwTypes.has(p.hwKey as string);
+     });
+     return {
+       hw: `overlay-run-${runId}`,
+       title: `✕ ${branch}`,
+       color: overlayRunColor(runIndex),
+       isOverlay: true,
+       rows: buildLegendPointsRows(overlayPts, true),
+     };
+   }
+
+   // Official series: apply the precision and "Optimal Only" filters the chart
+   // uses, restricted to the hardware key that was clicked in the legend.
+   const { hwKey } = target;
+   const hwConfig = hardwareConfig[hwKey];
+   const seriesPts = pointsData.filter((p) => {
+     if (p.hwKey !== hwKey) return false;
+     if (!selectedPrecisions.includes(p.precision)) return false;
+     return !hideNonOptimal || optimalPointKeys.has(optimalPointKey(p));
+   });
+   return {
+     hw: hwKey,
+     title: hwConfig ? getDisplayLabel(hwConfig) : hwKey,
+     color: resolveColor(hwKey),
+     isOverlay: false,
+     rows: buildLegendPointsRows(seriesPts, false),
+   };
+ }, [
+   pointsTableTarget,
+   hardwareConfig,
+   pointsData,
+   selectedPrecisions,
+   hideNonOptimal,
+   optimalPointKeys,
+   resolveColor,
+   processedOverlayData,
+   runIndexByUrl,
+   activeOverlayHwTypes,
+ ]);
+
// Gradient label data
const allPointLabelsByKey = useMemo(() => {
const globalLabelColorMap = new Map();
@@ -518,7 +717,7 @@ const ScatterGraph = React.memo(
const visiblePoints = useMemo(() => {
let pts = filteredData;
if (hideNonOptimal) {
- pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`));
+ pts = pts.filter((d) => optimalPointKeys.has(optimalPointKey(d)));
}
return processedOverlayData.length > 0 ? [...pts, ...processedOverlayData] : pts;
}, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]);
@@ -607,7 +806,7 @@ const ScatterGraph = React.memo(
(d: InferenceData) =>
effectiveActiveHwTypes.has(d.hwKey as string) &&
selectedPrecisions.includes(d.precision) &&
- (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)),
+ (!hideNonOptimal || optimalPointKeys.has(optimalPointKey(d))),
[effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys],
);
@@ -755,6 +954,7 @@ const ScatterGraph = React.memo(
d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any,
);
}
+ avoidLabelCollisions(ctx.layout.zoomGroup);
},
}),
[zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type],
@@ -774,6 +974,7 @@ const ScatterGraph = React.memo(
hardwareConfig,
isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+ hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
}),
getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -789,26 +990,39 @@ const ScatterGraph = React.memo(
),
onPointClick: (d: InferenceData) => {
track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y });
- // Attach track-over-time button handler in the tooltip
const tooltipEl = chartRef.current?.getTooltipElement();
- if (tooltipEl) {
- const btn = tooltipEl.querySelector('[data-action="track-over-time"]');
- if (btn) {
- btn.addEventListener('click', (btnEvent) => {
- btnEvent.stopPropagation();
- const configId = buildPointConfigId(d);
- if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
- else addTrackedConfig(d, chartDefinition.chartType);
- chartRef.current?.dismissTooltip();
- chartRef.current?.hideTooltip();
- track('latency_point_tracked_via_tooltip', {
- hwKey: String(d.hwKey),
- tp: d.tp,
- conc: d.conc,
- precision: d.precision,
- });
+ if (!tooltipEl) return;
+
+ // ── Summary-page actions ──────────────────────────────────────────
+ const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]');
+ if (trackBtn) {
+ trackBtn.addEventListener('click', (btnEvent) => {
+ btnEvent.stopPropagation();
+ const configId = buildPointConfigId(d);
+ if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
+ else addTrackedConfig(d, chartDefinition.chartType);
+ chartRef.current?.dismissTooltip();
+ chartRef.current?.hideTooltip();
+ track('latency_point_tracked_via_tooltip', {
+ hwKey: String(d.hwKey),
+ tp: d.tp,
+ conc: d.conc,
+ precision: d.precision,
});
- }
+ });
+ }
+
+ // ── "View charts" real link (supports browser open-in-new-tab) ───
+ const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+ if (viewBtn && typeof d.id === 'number') {
+ viewBtn.addEventListener('click', (btnEvent) => {
+ btnEvent.stopPropagation();
+ track('latency_view_charts_opened', {
+ id: d.id,
+ hwKey: String(d.hwKey),
+ conc: d.conc,
+ });
+ });
}
},
attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0)
@@ -822,6 +1036,11 @@ const ScatterGraph = React.memo(
addTrackedConfig,
removeTrackedConfig,
chartDefinition.chartType,
+ // selectedPrecisions is read via interactionRef.current in the hover
+ // handlers, so it isn't a dep. traceAvailability IS read directly in the
+ // tooltip content closure (the "View charts" button), so rebuild the
+ // config when the presence fetch resolves.
+ traceAvailability,
],
);
@@ -876,35 +1095,56 @@ const ScatterGraph = React.memo(
const precision = key.split('_').pop()!;
const visible =
ir.effectiveActiveHwTypes.has(hw) && ir.selectedPrecisions.includes(precision);
- let stroke = ir.getCssColor(ir.resolveColor(hw));
-
- if (showGradientLabels) {
- const pointLabels = allPointLabelsByKey[key];
- if (pointLabels) {
- const stops = computeGradientStops(pointLabels, xScale);
- if (stops) {
- const gid = `roofline-gradient-${chartId}-${key}`;
- activeGradientIds.add(gid);
- let gradient = defs.select(`#${CSS.escape(gid)}`);
- if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
- gradient
- .attr('gradientUnits', 'userSpaceOnUse')
- .attr('x1', xScale(pts[0].x))
- .attr('y1', 0)
- .attr('x2', xScale(pts.at(-1)!.x))
- .attr('y2', 0);
- gradient
- .selectAll('stop')
- .data(stops)
- .join('stop')
- .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
- .attr('stop-color', (s) => s.color);
- stroke = `url(#${gid})`;
+ const baseStroke = ir.getCssColor(ir.resolveColor(hw));
+
+ // Split into per-date sub-paths so the line never crosses dates.
+ // (When only one date is present the loop runs once with the full set.)
+ const byDate = groupPointsByDate(pts);
+ const singleDate = byDate.size === 1;
+
+ for (const [date, datePoints] of byDate) {
+ if (datePoints.length <= 1) continue;
+ const entryKey = singleDate ? key : `${key}__${date}`;
+ let stroke = baseStroke;
+
+ // Gradient labels only apply in the single-date case; mapping the
+ // (key-wide) ParetoPointLabel array onto per-date sub-segments is
+ // ambiguous and the comparison-date overlay is a rare combo.
+ if (singleDate && showGradientLabels) {
+ const pointLabels = allPointLabelsByKey[key];
+ if (pointLabels) {
+ const stops = computeGradientStops(pointLabels, xScale);
+ if (stops) {
+ const gid = `roofline-gradient-${chartId}-${entryKey}`;
+ activeGradientIds.add(gid);
+ let gradient = defs.select(`#${CSS.escape(gid)}`);
+ if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
+ gradient
+ .attr('gradientUnits', 'userSpaceOnUse')
+ .attr('x1', xScale(datePoints[0].x))
+ .attr('y1', 0)
+ .attr('x2', xScale(datePoints.at(-1)!.x))
+ .attr('y2', 0);
+ gradient
+ .selectAll('stop')
+ .data(stops)
+ .join('stop')
+ .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
+ .attr('stop-color', (s) => s.color);
+ stroke = `url(#${gid})`;
+ }
}
}
- }
- entries.push({ key, hw, precision, points: pts, stroke, visible });
+ entries.push({
+ key: entryKey,
+ hw,
+ precision,
+ points: datePoints,
+ stroke,
+ visible,
+ });
+ }
});
// Remove stale gradients
@@ -1346,11 +1586,18 @@ const ScatterGraph = React.memo(
.y((d) => newYScale(d.y))
.curve(d3.curveMonotoneX);
- // Update roofline paths
+ // Update roofline paths — must split per-date so the zoom redraw
+ // matches the per-date sub-paths created in the initial render.
Object.entries(rooflines).forEach(([key, pts]) => {
if (pts.length < 2) return;
- const sel = zoomGroup.select(`.roofline-${key}`);
- if (!sel.empty()) sel.attr('d', lineGen(pts) as string);
+ const byDate = groupPointsByDate(pts);
+ const singleDate = byDate.size === 1;
+ for (const [date, datePoints] of byDate) {
+ if (datePoints.length < 2) continue;
+ const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`;
+ const sel = zoomGroup.select(`.${CSS.escape(cls)}`);
+ if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string);
+ }
});
// Update gradient coordinates
@@ -1578,7 +1825,8 @@ const ScatterGraph = React.memo(
getOpacity: (d) => (interactionRef.current.isPointVisible(d) ? 1 : 0),
getPointerEvents: (d) => (interactionRef.current.isPointVisible(d) ? 'auto' : 'none'),
hideLabels: !showPointLabels || showGradientLabels,
- getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+ // Keep the concurrency (C=) annotation from the agentx scatter labels.
+ getLabelText: (d) => pointLabelText(d, useAdvancedLabels),
foreground: 'var(--foreground)',
dataAttrs: {
'hw-key': (d) => String(d.hwKey),
@@ -1679,17 +1927,26 @@ const ScatterGraph = React.memo(
// Labels
const showLabels = showPointLabels && !showGradientLabels;
overlayPoints.each(function (d) {
- d3.select(this)
+ const lines = showLabels ? pointLabelText(d, useAdvancedLabels).split('\n') : [];
+ const text = d3
+ .select(this)
.selectAll('.overlay-label')
.data(showLabels ? [true] : [])
.join('text')
.attr('class', 'overlay-label')
- .attr('dy', -10)
.attr('text-anchor', 'middle')
.style('fill', 'var(--foreground)')
.attr('font-size', '10px')
- .attr('pointer-events', 'none')
- .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp));
+ .attr('font-weight', '700')
+ .attr('pointer-events', 'none');
+ const firstDy = -(1 + (lines.length - 1) * 1.1);
+ text
+ .selectAll('tspan')
+ .data(lines)
+ .join('tspan')
+ .attr('x', 0)
+ .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
+ .text((l) => l);
});
// Overlay tooltip handlers
@@ -2007,6 +2264,23 @@ const ScatterGraph = React.memo(
.attr('pointer-events', 'none');
});
+ // Offload halo: dashed ring on every point that used KV offload (Pareto or not)
+ zoomGroup.selectAll('.dot-group').each(function (d) {
+ const showHalo = d.offload_mode === 'on';
+ d3.select(this)
+ .selectAll('.offload-halo')
+ .data(showHalo ? [true] : [])
+ .join('circle')
+ .attr('class', 'offload-halo')
+ .attr('r', POINT_SIZE + 4)
+ .attr('fill', 'none')
+ .attr('stroke', 'var(--foreground)')
+ .attr('stroke-width', 1.5)
+ .attr('stroke-dasharray', '3 2')
+ .attr('opacity', 0.9)
+ .attr('pointer-events', 'none');
+ });
+
// Double-click to track/untrack
zoomGroup
.selectAll('.dot-group')
@@ -2041,6 +2315,8 @@ const ScatterGraph = React.memo(
});
});
+ avoidLabelCollisions(zoomGroup);
+
// Log tick formatting on initial render
if (xScaleConfig._isLog) {
const xScale = ctx.xScale as d3.ScaleLogarithmic;
@@ -2063,6 +2339,9 @@ const ScatterGraph = React.memo(
chartDefinition.chartType,
xScaleConfig._isLog,
yScaleConfig.type,
+ optimalPointKeys,
+ getCssColor,
+ resolveColor,
],
);
@@ -2234,256 +2513,310 @@ const ScatterGraph = React.memo(
}
return (
-
- ref={chartRef}
- chartId={chartId}
- // Stable across toggles: the render effect keys on this for "data
- // changed" rebuilds; scale domains come from x/yScaleConfig (computed
- // from the visible points), and visibility is applied via opacity.
- data={pointsData}
- margin={CHART_MARGIN}
- watermark={getChartWatermark(isUnofficialRun)}
- testId="scatter-graph"
- grabCursor={true}
- caption={caption}
- xScale={xScaleConfig}
- yScale={yScaleConfig}
- xAxis={xAxisConfig}
- yAxis={yAxisConfig}
- layers={layers}
- zoom={zoomConfig}
- tooltip={tooltipConfig}
- transitionDuration={transitionDuration}
- onRender={onRender}
- noDataOverlay={
- filteredData.length === 0 && processedOverlayData.length === 0 ? (
-
-
-
No data available
-
- Please change the model, sequence, precision, date range or GPU selection.
-
+ <>
+
+ ref={chartRef}
+ chartId={chartId}
+ // Stable across toggles: the render effect keys on this for "data
+ // changed" rebuilds; scale domains come from x/yScaleConfig (computed
+ // from the visible points), and visibility is applied via opacity.
+ data={pointsData}
+ margin={CHART_MARGIN}
+ watermark={getChartWatermark(isUnofficialRun)}
+ testId="scatter-graph"
+ grabCursor={true}
+ caption={caption}
+ xScale={xScaleConfig}
+ yScale={yScaleConfig}
+ xAxis={xAxisConfig}
+ yAxis={yAxisConfig}
+ layers={layers}
+ zoom={zoomConfig}
+ tooltip={tooltipConfig}
+ transitionDuration={transitionDuration}
+ onRender={onRender}
+ noDataOverlay={
+ filteredData.length === 0 && processedOverlayData.length === 0 ? (
+
+
+
No data available
+
+ Please change the model, sequence, precision, date range or GPU selection.
+
+
-
- ) : undefined
- }
- legendElement={
-
0
- ? unofficialRunInfos
- .map((info, idx) => {
- const hasPoints = overlayData.data.some(
- (d) =>
- overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx &&
- selectedPrecisions.includes(d.precision),
- );
- if (!hasPoints) return null;
- const branch = info.branch || `run ${info.id}`;
- return {
- name: `✕ unofficial-run-${info.id}`,
- label: `✕ ${branch}`,
- color: overlayRunColor(idx),
- title: `UNOFFICIAL: ${branch}`,
- isHighlighted: true,
- hw: `overlay-run-${info.id}`,
- isActive: true,
- onClick: () => {},
- tooltip: (
-
- ),
- };
- })
- .filter((x): x is NonNullable => x !== null)
- : []),
- ...Object.entries(hardwareConfig)
- .filter(([key]) =>
- showAllHardwareTypes ? effectiveActiveHwTypes.has(key) : hwTypesWithData.has(key),
- )
- .toSorted(
- ([a], [b]) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b),
- )
- .map(([key, hwConfig]: [string, any]) => ({
- name: hwConfig.name,
- label: getDisplayLabel(hwConfig),
- color: resolveColor(key),
- title: hwConfig.gpu,
- isHighlighted: highlightConfigSuffixes.has(key.replaceAll('_', '-')),
- hw: key,
- isActive: showAllHardwareTypes ? true : effectiveOfficialHwTypes.has(key),
- onClick: showAllHardwareTypes
- ? () => {}
- : () => {
- handleToggleHwType(key);
- track('latency_hw_type_toggled', { hw: key });
- },
- tooltip: changelog
- ? formatChangelogDescription(changelog.entries[0].description)
- : null,
- })),
- ]}
- disableActiveSort={false}
- isLegendExpanded={isLegendExpanded}
- onExpandedChange={(expanded) => {
- setIsLegendExpanded(expanded);
- track('latency_legend_expanded', { expanded });
- }}
- switches={[
- ...(selectedYAxisMetric === 'y_inputTputPerGpu'
- ? []
- : [
- {
- id: 'scatter-log-scale',
- label: 'Log Scale',
- checked: logScale,
- onCheckedChange: (checked: boolean) => {
- setLogScale(checked);
- track('latency_log_scale_toggled', { enabled: checked });
- },
+ ) : undefined
+ }
+ legendElement={
+ 0
+ ? unofficialRunInfos
+ .map((info, idx) => {
+ const hasPoints = overlayData.data.some(
+ (d) =>
+ overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx &&
+ selectedPrecisions.includes(d.precision),
+ );
+ if (!hasPoints) return null;
+ const branch = info.branch || `run ${info.id}`;
+ return {
+ name: `✕ unofficial-run-${info.id}`,
+ label: `✕ ${branch}`,
+ color: overlayRunColor(idx),
+ title: `UNOFFICIAL: ${branch}`,
+ isHighlighted: true,
+ hw: `overlay-run-${info.id}`,
+ isActive: true,
+ onClick: () => {},
+ onShowPoints: () => {
+ setPointsTableTarget({
+ kind: 'overlay',
+ runIndex: idx,
+ runId: info.id,
+ branch,
+ });
+ track('inference_legend_points_table_opened', {
+ hw: `overlay-run-${info.id}`,
+ framework: 'overlay',
+ });
+ },
+ tooltip: (
+
+ ),
+ };
+ })
+ .filter((x): x is NonNullable => x !== null)
+ : []),
+ ...Object.entries(hardwareConfig)
+ .filter(([key]) =>
+ showAllHardwareTypes
+ ? effectiveActiveHwTypes.has(key)
+ : hwTypesWithData.has(key),
+ )
+ .toSorted(
+ ([a], [b]) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b),
+ )
+ .map(([key, hwConfig]: [string, any]) => ({
+ name: hwConfig.name,
+ label: getDisplayLabel(hwConfig),
+ color: resolveColor(key),
+ title: hwConfig.gpu,
+ isHighlighted: highlightConfigSuffixes.has(key.replaceAll('_', '-')),
+ hw: key,
+ isActive: showAllHardwareTypes ? true : effectiveOfficialHwTypes.has(key),
+ onClick: showAllHardwareTypes
+ ? () => {}
+ : () => {
+ handleToggleHwType(key);
+ track('latency_hw_type_toggled', { hw: key });
+ },
+ onShowPoints: () => {
+ setPointsTableTarget({ kind: 'official', hwKey: key });
+ track('inference_legend_points_table_opened', {
+ hw: key,
+ framework: hwConfig.framework ?? '',
+ });
},
- ]),
- {
- id: 'scatter-hide-non-optimal',
- label: 'Optimal Only',
- checked: hideNonOptimal,
- onCheckedChange: (checked: boolean) => {
- setHideNonOptimal(checked);
- track('latency_hide_non_optimal_toggled', { enabled: checked });
+ tooltip: changelog
+ ? formatChangelogDescription(changelog.entries[0].description)
+ : null,
+ })),
+ ]}
+ disableActiveSort={false}
+ isLegendExpanded={isLegendExpanded}
+ onExpandedChange={(expanded) => {
+ setIsLegendExpanded(expanded);
+ track('latency_legend_expanded', { expanded });
+ }}
+ switches={[
+ ...(selectedYAxisMetric === 'y_inputTputPerGpu'
+ ? []
+ : [
+ {
+ id: 'scatter-log-scale',
+ label: 'Log Scale',
+ checked: logScale,
+ onCheckedChange: (checked: boolean) => {
+ setLogScale(checked);
+ track('latency_log_scale_toggled', { enabled: checked });
+ },
+ },
+ ]),
+ {
+ id: 'scatter-hide-non-optimal',
+ label: 'Optimal Only',
+ checked: hideNonOptimal,
+ onCheckedChange: (checked: boolean) => {
+ setHideNonOptimal(checked);
+ track('latency_hide_non_optimal_toggled', { enabled: checked });
+ },
+ // On agentic + non-e2e chart, "optimal" means "on the
+ // e2e-latency Pareto frontier" (not a per-axis Pareto on the
+ // current x metric). Explain that so users don't wonder why
+ // a point sitting above the line is still considered
+ // dominated.
+ ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
+ ? {
+ infoTooltip:
+ "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
+ }
+ : {}),
},
- },
- {
- id: 'scatter-point-labels',
- label: 'Labels',
- checked: showPointLabels,
- onCheckedChange: (checked: boolean) => {
- setShowPointLabels(checked);
- track('latency_point_labels_toggled', { enabled: checked });
+ {
+ id: 'scatter-point-labels',
+ label: 'Labels',
+ checked: showPointLabels,
+ onCheckedChange: (checked: boolean) => {
+ setShowPointLabels(checked);
+ track('latency_point_labels_toggled', { enabled: checked });
+ },
},
- },
- {
- id: 'scatter-high-contrast',
- label: 'High Contrast',
- checked: highContrast,
- onCheckedChange: (checked: boolean) => {
- setHighContrast(checked);
- track('latency_high_contrast_toggled', { enabled: checked });
+ {
+ id: 'scatter-high-contrast',
+ label: 'High Contrast',
+ checked: highContrast,
+ onCheckedChange: (checked: boolean) => {
+ setHighContrast(checked);
+ track('latency_high_contrast_toggled', { enabled: checked });
+ },
},
- },
- {
- id: 'scatter-parallelism-labels',
- label: 'Parallelism Labels',
- checked: useAdvancedLabels,
- onCheckedChange: (checked: boolean) => {
- setUseAdvancedLabels(checked);
- track('latency_advanced_labels_toggled', { enabled: checked });
- // Parallelism labels are point labels; turning them on is
- // pointless if labels are hidden, so auto-enable Labels.
- if (checked && !showPointLabels) setShowPointLabels(true);
- if (checked && !showGradientLabels) {
- window.dispatchEvent(
- new CustomEvent(GRADIENT_NUDGE_EVENT, {
- detail: {
- enableGradient: () => {
- setShowGradientLabels(true);
- setUseAdvancedLabels(false);
- track('latency_gradient_labels_toggled', {
- enabled: true,
- source: 'nudge',
- });
+ {
+ id: 'scatter-parallelism-labels',
+ label: 'Parallelism Labels',
+ checked: useAdvancedLabels,
+ onCheckedChange: (checked: boolean) => {
+ setUseAdvancedLabels(checked);
+ track('latency_advanced_labels_toggled', { enabled: checked });
+ // Parallelism labels are point labels; turning them on is
+ // pointless if labels are hidden, so auto-enable Labels.
+ if (checked && !showPointLabels) setShowPointLabels(true);
+ if (checked && !showGradientLabels) {
+ window.dispatchEvent(
+ new CustomEvent(GRADIENT_NUDGE_EVENT, {
+ detail: {
+ enableGradient: () => {
+ setShowGradientLabels(true);
+ setUseAdvancedLabels(false);
+ track('latency_gradient_labels_toggled', {
+ enabled: true,
+ source: 'nudge',
+ });
+ },
},
- },
- }),
- );
- }
+ }),
+ );
+ }
+ },
},
- },
- {
- id: 'scatter-gradient-labels',
- label: 'Gradient Labels',
- checked: showGradientLabels,
- onCheckedChange: (checked: boolean) => {
- setShowGradientLabels(checked);
- track('latency_gradient_labels_toggled', { enabled: checked });
+ {
+ id: 'scatter-gradient-labels',
+ label: 'Gradient Labels',
+ checked: showGradientLabels,
+ onCheckedChange: (checked: boolean) => {
+ setShowGradientLabels(checked);
+ track('latency_gradient_labels_toggled', { enabled: checked });
+ },
},
- },
- {
- id: 'scatter-line-labels',
- label: 'Line Labels',
- checked: showLineLabels,
- onCheckedChange: (checked: boolean) => {
- setShowLineLabels(checked);
- track('latency_line_labels_toggled', { enabled: checked });
+ {
+ id: 'scatter-line-labels',
+ label: 'Line Labels',
+ checked: showLineLabels,
+ onCheckedChange: (checked: boolean) => {
+ setShowLineLabels(checked);
+ track('latency_line_labels_toggled', { enabled: checked });
+ },
},
- },
- {
- id: 'scatter-speed-overlay',
- label: 'Bus / Race Car',
- advanced: true,
- checked: showSpeedOverlay,
- onCheckedChange: (checked: boolean) => {
- setShowSpeedOverlay(checked);
- track('latency_speed_overlay_toggled', { enabled: checked });
+ {
+ id: 'scatter-speed-overlay',
+ label: 'Bus / Race Car',
+ advanced: true,
+ checked: showSpeedOverlay,
+ onCheckedChange: (checked: boolean) => {
+ setShowSpeedOverlay(checked);
+ track('latency_speed_overlay_toggled', { enabled: checked });
+ },
},
- },
- {
- id: 'scatter-minecraft-overlay',
- label: 'Donkey / Elytra',
- advanced: true,
- checked: showMinecraftOverlay,
- onCheckedChange: (checked: boolean) => {
- setShowMinecraftOverlay(checked);
- track('latency_minecraft_overlay_toggled', { enabled: checked });
+ {
+ id: 'scatter-minecraft-overlay',
+ label: 'Donkey / Elytra',
+ advanced: true,
+ checked: showMinecraftOverlay,
+ onCheckedChange: (checked: boolean) => {
+ setShowMinecraftOverlay(checked);
+ track('latency_minecraft_overlay_toggled', { enabled: checked });
+ },
},
- },
- ]}
- onAdvancedExpandedChange={(expanded) => {
- track('latency_advanced_controls_toggled', { expanded });
- }}
- actions={
- effectiveOfficialHwTypes.size < hwTypesWithData.size ||
- activeOverlayHwTypes.size < allOverlayHwTypes.size
- ? [
- {
- id: 'scatter-reset-filter',
- label: 'Reset filter',
- onClick: () => {
- selectAllHwTypes();
- setLocalOfficialOverride(null);
- resetOverlayHwTypes();
- track('latency_legend_filter_reset');
+ ]}
+ onAdvancedExpandedChange={(expanded) => {
+ track('latency_advanced_controls_toggled', { expanded });
+ }}
+ actions={
+ effectiveOfficialHwTypes.size < hwTypesWithData.size ||
+ activeOverlayHwTypes.size < allOverlayHwTypes.size
+ ? [
+ {
+ id: 'scatter-reset-filter',
+ label: 'Reset filter',
+ onClick: () => {
+ selectAllHwTypes();
+ setLocalOfficialOverride(null);
+ resetOverlayHwTypes();
+ track('latency_legend_filter_reset');
+ },
},
- },
- ]
- : []
+ ]
+ : []
+ }
+ precisionIndicators={selectedPrecisions}
+ enableTooltips={true}
+ />
+ }
+ />
+ {pointsTable && (
+ {
+ if (!open) setPointsTableTarget(null);
+ }}
+ title={pointsTable.title}
+ subtitle={`${modelLabel} · ${getSequenceLabel(selectedSequence)}`}
+ accentColor={pointsTable.color}
+ rows={pointsTable.rows}
+ isOverlay={pointsTable.isOverlay}
+ onRowClick={(row) =>
+ track('inference_legend_points_table_row_clicked', {
+ hw: pointsTable.hw,
+ conc: row.conc,
+ href: row.href ?? '',
+ })
}
- precisionIndicators={selectedPrecisions}
- enableTooltips={true}
/>
- }
- />
+ )}
+ >
);
},
);
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index 799854d7..f18903ea 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
`${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
]
}{' '}
- {graph.chartDefinition[
- `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
- ] || graph.chartDefinition.heading}
+ {graph.chartDefinition.heading}
{graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..7d5b1482 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -1,7 +1,26 @@
import { describe, it, expect } from 'vitest';
import type { ChartDefinition, InferenceData } from '@/components/inference/types';
-import { filterDataByCostLimit, processOverlayChartData } from '@/components/inference/utils';
+import {
+ filterDataByCostLimit,
+ processOverlayChartData,
+ selectUnofficialOverlayForMode,
+} from '@/components/inference/utils';
+
+describe('selectUnofficialOverlayForMode', () => {
+  // One distinct fixture object per chart type so identity checks are unambiguous.
+  const e2eOverlay = { id: 'e2e' };
+  const interactivityOverlay = { id: 'interactivity' };
+  const overlays = { e2e: e2eOverlay, interactivity: interactivityOverlay };
+
+  it('suppresses raw unofficial E2E data for normalized E2E mode', () => {
+    const selected = selectUnofficialOverlayForMode('normalized-e2e', 'e2e', overlays);
+    expect(selected).toBeNull();
+  });
+
+  it('preserves matching unofficial overlays for supported modes', () => {
+    // The exact fixture object must come back, not a copy.
+    expect(selectUnofficialOverlayForMode('e2e', 'e2e', overlays)).toBe(e2eOverlay);
+    expect(selectUnofficialOverlayForMode('interactivity', 'interactivity', overlays)).toBe(
+      interactivityOverlay,
+    );
+  });
+});
// ---------------------------------------------------------------------------
// fixture factories
@@ -157,12 +176,12 @@ describe('processOverlayChartData', () => {
});
it('remaps x to config override for input metrics on interactivity chart', () => {
- // inputTputPerGpu has x override to p99_ttft on interactivity chart
+ // inputTputPerGpu has x override to p90_ttft on interactivity chart
const data = [
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- p99_ttft: 0.25,
+ p90_ttft: 0.25,
median_intvty: 50,
} as any),
];
@@ -176,16 +195,11 @@ describe('processOverlayChartData', () => {
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- median_ttft: 0.1,
+ p90_ttft: 0.1,
median_intvty: 50,
} as any),
];
- const result = processOverlayChartData(
- data,
- 'interactivity',
- 'y_inputTputPerGpu',
- 'median_ttft',
- );
+ const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.1);
});
@@ -195,76 +209,62 @@ describe('processOverlayChartData', () => {
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- p99_ttft: 0.25,
+ p90_ttft: 0.25,
median_e2el: 2.5,
} as any),
];
const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
expect(result).toHaveLength(1);
- // e2e uses median_e2el as x (from chart config default), not p99_ttft
+ // e2e uses median_e2el as x (from chart config default), not p90_ttft
expect(result[0].x).toBe(2.5);
});
- it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
- const data = [
- pt({
- x: 100,
- tpPerGpu: { y: 42, roof: false },
- p99_ttft: 0.35,
- median_e2el: 2.5,
- } as any),
- ];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
- expect(result).toHaveLength(1);
- expect(result[0].x).toBe(0.35);
- });
-
- it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+ it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
const data = [
pt({
x: 100,
tpPerGpu: { y: 42, roof: false },
- median_ttft: 0.12,
+ p90_ttft: 0.12,
median_e2el: 2.5,
} as any),
];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+ const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.12);
});
it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
const data = [
- pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
- pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+ pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+ pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+ const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
// y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.5);
});
it('does not filter interactivity points by latency limit when x-axis is default', () => {
- // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+ // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
// chart's x-axis stays median_intvty for non-input metrics. The latency limit
// (60) must NOT apply to median_intvty values.
const data = [
pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
];
- const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+ const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
expect(result).toHaveLength(2);
});
it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
- // When an input metric IS selected and x-axis overrides to p99_ttft,
+ // When an input metric IS selected and x-axis overrides to p90_ttft,
// the latency limit should apply.
const data = [
- pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
- pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+ pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+ pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
];
- const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
- // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+ const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+ // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.5);
});
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..f6ebd0f8 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -8,6 +8,20 @@ import chartDefinitions from '@/components/inference/inference-chart-config.json
import type { ChartDefinition, InferenceData, YAxisMetricKey } from './types';
+/**
+ * Select the matching unofficial-run overlay for a chart mode. Normalized E2E
+ * is intentionally excluded: unofficial benchmark rows do not include the
+ * persisted per-request trace needed to normalize before taking percentiles.
+ */
+export function selectUnofficialOverlayForMode<T>(
+ xAxisMode: string,
+ chartType: 'e2e' | 'interactivity',
+ overlays: { e2e: T | null; interactivity: T | null },
+): T | null {
+ if (xAxisMode === 'normalized-e2e') return null;
+ return overlays[chartType];
+}
+
/**
* Filters data points based on cost limits defined in the chart definition.
* Only applies filtering for cost-related metrics, and only filters based on
@@ -75,11 +89,13 @@ export function processOverlayChartData(
chartType: 'e2e' | 'interactivity',
selectedYAxisMetric: string,
selectedXAxisMetric: string | null,
+ options?: { isAgentic?: boolean },
): InferenceData[] {
const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
if (!chartDef) return [];
const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+ const isAgentic = options?.isAgentic === true;
// Resolve x-axis field (must match useChartData logic)
const metricTitle =
@@ -87,9 +103,11 @@ export function processOverlayChartData(
const isInputMetric = metricTitle.toLowerCase().includes('input');
let xAxisField: string = chartDef.x;
// selectedXAxisMetric is already the effective metric for this chart type
- // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
+ // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+ // Match any *_ttft metric — the x-axis-mode picker can now select any
+ // percentile (median/p75/p90/p99) depending on sequence kind.
const isTtftOverride =
- selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+ typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
xAxisField = selectedXAxisMetric;
@@ -109,7 +127,12 @@ export function processOverlayChartData(
})
.filter(
(d) =>
- xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+ // Skip the latency limit for the natural x-axis or for agentic
+ // (long TTFTs are normal there, not overload outliers).
+ xAxisField === chartDef.x ||
+ isAgentic ||
+ !chartDef.y_latency_limit ||
+ d.x <= chartDef.y_latency_limit,
);
return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);
diff --git a/packages/app/src/components/inference/utils/legend-points-table.test.ts b/packages/app/src/components/inference/utils/legend-points-table.test.ts
new file mode 100644
index 00000000..86d6f8b3
--- /dev/null
+++ b/packages/app/src/components/inference/utils/legend-points-table.test.ts
@@ -0,0 +1,232 @@
+import { describe, expect, it } from 'vitest';
+
+import type { InferenceData } from '@/components/inference/types';
+import {
+ buildLegendPointsRows,
+ formatRowValue,
+ pointDetailHref,
+ sortLegendPointsRows,
+} from '@/components/inference/utils/legend-points-table';
+
+// ---------------------------------------------------------------------------
+// fixture factory (mirrors tooltip-utils.test.ts)
+// ---------------------------------------------------------------------------
+function pt(overrides: Partial<InferenceData> = {}): InferenceData {
+ return {
+ date: '2025-06-15',
+ x: 100,
+ y: 500,
+ tp: 8,
+ conc: 64,
+ hwKey: 'b300_vllm',
+ precision: 'fp4',
+ tput_per_gpu: 1234.5678,
+ median_intvty: 45.2,
+ p90_intvty: 38.1,
+ median_ttft: 0.42,
+ p90_ttft: 0.87,
+ tpPerGpu: { y: 1000, roof: false },
+ tpPerMw: { y: 50, roof: false },
+ costh: { y: 1, roof: false },
+ costn: { y: 1, roof: false },
+ costr: { y: 1, roof: false },
+ costhi: { y: 1, roof: false },
+ costni: { y: 1, roof: false },
+ costri: { y: 1, roof: false },
+ ...overrides,
+ } as InferenceData;
+}
+
+// ===========================================================================
+// pointDetailHref
+// ===========================================================================
+describe('pointDetailHref', () => {
+ it('agentic point with numeric id links to the in-app detail page', () => {
+ const d = pt({ benchmark_type: 'agentic_traces', id: 206863 });
+ expect(pointDetailHref(d, false)).toEqual({
+ href: '/inference/agentic/206863',
+ isExternal: false,
+ });
+ });
+
+ it('fixed-seq point links to its GitHub Actions run (repo URL rewritten)', () => {
+ const d = pt({
+ benchmark_type: 'single_turn',
+ run_url: 'https://github.com/InferenceMAX/InferenceMAX/actions/runs/123',
+ });
+ expect(pointDetailHref(d, false)).toEqual({
+ href: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/123',
+ isExternal: true,
+ });
+ });
+
+ it('agentic point without a numeric id falls back to the run URL', () => {
+ const d = pt({
+ benchmark_type: 'agentic_traces',
+ run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/9',
+ });
+ expect(pointDetailHref(d, false)).toEqual({
+ href: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/9',
+ isExternal: true,
+ });
+ });
+
+ it('returns no link when there is neither an id nor a run URL', () => {
+ expect(pointDetailHref(pt(), false)).toEqual({ href: null, isExternal: false });
+ });
+
+ it('does not build an /agentic/ link for a non-persisted id (0 / NaN)', () => {
+ // `typeof id === 'number'` accepted these; isPersistedBenchmarkId rejects
+ // them so we never link to /inference/agentic/0 or /inference/agentic/NaN.
+ for (const badId of [0, Number.NaN]) {
+ const d = pt({ benchmark_type: 'agentic_traces', id: badId });
+ expect(pointDetailHref(d, false)).toEqual({ href: null, isExternal: false });
+ }
+ });
+
+ it('overlay points never get a link (no DB benchmark id)', () => {
+ const d = pt({
+ benchmark_type: 'agentic_traces',
+ id: 42,
+ run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1',
+ });
+ expect(pointDetailHref(d, true)).toEqual({ href: null, isExternal: false });
+ });
+});
+
+// ===========================================================================
+// buildLegendPointsRows
+// ===========================================================================
+describe('buildLegendPointsRows', () => {
+ it('maps official point fields onto table rows', () => {
+ const rows = buildLegendPointsRows(
+ [pt({ benchmark_type: 'agentic_traces', id: 1, ep: 8, dp_attention: true })],
+ false,
+ );
+ expect(rows).toHaveLength(1);
+ expect(rows[0]).toMatchObject({
+ conc: 64,
+ parallelism: 'DEP8',
+ precision: 'fp4',
+ offload: null,
+ tputPerGpu: 1234.5678,
+ p50Intvty: 45.2,
+ p90Intvty: 38.1,
+ p50Ttft: 0.42,
+ p90Ttft: 0.87,
+ href: '/inference/agentic/1',
+ isExternal: false,
+ });
+ });
+
+ it('default-sorts by concurrency ascending', () => {
+ const rows = buildLegendPointsRows(
+ [pt({ conc: 32 }), pt({ conc: 4 }), pt({ conc: 16 })],
+ false,
+ );
+ expect(rows.map((r) => r.conc)).toEqual([4, 16, 32]);
+ });
+
+ it('keeps agentic offload on/off row pairs adjacent and deterministic', () => {
+ const rows = buildLegendPointsRows(
+ [
+ pt({ conc: 8, offload_mode: 'on' }),
+ pt({ conc: 4, offload_mode: 'off' }),
+ pt({ conc: 4, offload_mode: 'on' }),
+ ],
+ false,
+ );
+ expect(rows.map((r) => [r.conc, r.offload])).toEqual([
+ [4, 'OFF'],
+ [4, 'ON'],
+ [8, 'ON'],
+ ]);
+ });
+
+ it('nulls out metrics missing on old points instead of coercing to 0', () => {
+ const rows = buildLegendPointsRows(
+ [pt({ tput_per_gpu: undefined, p90_intvty: undefined, p90_ttft: Number.NaN })],
+ false,
+ );
+ expect(rows[0].tputPerGpu).toBeNull();
+ expect(rows[0].p90Intvty).toBeNull();
+ expect(rows[0].p90Ttft).toBeNull();
+ });
+
+ it('treats the transform\'s "?? 0" coercion of absent metrics as missing', () => {
+ // Agentic rows have no median_* keys in metrics JSONB; benchmark-transform
+ // fills them with 0. These metrics are strictly positive when measured.
+ const rows = buildLegendPointsRows([pt({ median_intvty: 0, median_ttft: 0 })], false);
+ expect(rows[0].p50Intvty).toBeNull();
+ expect(rows[0].p50Ttft).toBeNull();
+ });
+
+ it('overlay rows carry metrics but no links', () => {
+ const rows = buildLegendPointsRows(
+ [pt({ id: 7, benchmark_type: 'agentic_traces', run_url: 'https://github.com/x/y/runs/1' })],
+ true,
+ );
+ expect(rows[0].href).toBeNull();
+ expect(rows[0].tputPerGpu).toBe(1234.5678);
+ });
+});
+
+// ===========================================================================
+// sortLegendPointsRows
+// ===========================================================================
+describe('sortLegendPointsRows', () => {
+ const rows = buildLegendPointsRows(
+ [
+ pt({ conc: 4, tput_per_gpu: 300 }),
+ pt({ conc: 16, tput_per_gpu: undefined }),
+ pt({ conc: 8, tput_per_gpu: 900 }),
+ ],
+ false,
+ );
+
+ it('sorts numeric columns in both directions', () => {
+ expect(sortLegendPointsRows(rows, 'tputPerGpu', 'asc').map((r) => r.conc)).toEqual([4, 8, 16]);
+ expect(sortLegendPointsRows(rows, 'tputPerGpu', 'desc').map((r) => r.conc)).toEqual([8, 4, 16]);
+ });
+
+ it('always sorts null metrics last', () => {
+ for (const dir of ['asc', 'desc'] as const) {
+ expect(sortLegendPointsRows(rows, 'tputPerGpu', dir).at(-1)?.conc).toBe(16);
+ }
+ });
+
+ it('sorts string columns alphabetically', () => {
+ const mixed = buildLegendPointsRows(
+ [pt({ conc: 1, ep: 8 }), pt({ conc: 2, tp: 4, ep: undefined })],
+ false,
+ );
+ expect(sortLegendPointsRows(mixed, 'parallelism', 'asc').map((r) => r.parallelism)).toEqual([
+ '4',
+ 'TEP8',
+ ]);
+ });
+
+ it('does not mutate the input array', () => {
+ const before = rows.map((r) => r.conc);
+ sortLegendPointsRows(rows, 'tputPerGpu', 'desc');
+ expect(rows.map((r) => r.conc)).toEqual(before);
+ });
+});
+
+// ===========================================================================
+// formatRowValue
+// ===========================================================================
+describe('formatRowValue', () => {
+ it('renders em dash for missing values', () => {
+ expect(formatRowValue(null)).toBe('—');
+ });
+
+ it('caps at 3 decimals like the scatter tooltip', () => {
+ expect(formatRowValue(1234.5678)).toBe('1234.568');
+ expect(formatRowValue(0.42)).toBe('0.42');
+ });
+
+ it('comma-formats large values like the scatter tooltip', () => {
+ expect(formatRowValue(123456.7)).toBe('123,456.7');
+ });
+});
diff --git a/packages/app/src/components/inference/utils/legend-points-table.ts b/packages/app/src/components/inference/utils/legend-points-table.ts
new file mode 100644
index 00000000..87df2fcf
--- /dev/null
+++ b/packages/app/src/components/inference/utils/legend-points-table.ts
@@ -0,0 +1,124 @@
+import { updateRepoUrl } from '@/lib/utils';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
+
+import type { InferenceData } from '@/components/inference/types';
+import { fmt, getPointLabel } from '@/components/inference/utils/tooltipUtils';
+
+/**
+ * One row of the per-series points table opened from the chart legend.
+ * Metric fields are `null` when the point predates the field (old runs) so the
+ * table can render an em dash instead of a misleading 0.
+ */
+export interface LegendPointsTableRow {
+ /** Stable React key — mirrors the scatter chart's per-point identity fields. */
+ key: string;
+ conc: number;
+ /** Shared parallelism label (e.g. "TP8", "DPAEP8", "2xEP4+1xDPAEP32"). */
+ parallelism: string;
+ precision: string;
+ /** Agentic offload mode ("ON" / "OFF"), null for fixed-seq points. */
+ offload: string | null;
+ tputPerGpu: number | null;
+ p50Intvty: number | null;
+ p90Intvty: number | null;
+ p50Ttft: number | null;
+ p90Ttft: number | null;
+ /** Detail link — null for overlay points (no DB benchmark id). */
+ href: string | null;
+ /** True when href is an external GitHub Actions run (open in new tab). */
+ isExternal: boolean;
+}
+
+export type LegendPointsSortKey =
+ | 'conc'
+ | 'parallelism'
+ | 'offload'
+ | 'tputPerGpu'
+ | 'p50Intvty'
+ | 'p90Intvty'
+ | 'p50Ttft'
+ | 'p90Ttft';
+
+// benchmark-transform coerces absent metrics to 0 (`m.median_ttft ?? 0`), and
+// every column metric here (throughput, interactivity, TTFT) is strictly
+// positive in reality — so non-positive (or NaN/±Infinity) maps to null, i.e. "not recorded", shown as a dash.
+const num = (v: number | undefined | null): number | null =>
+ typeof v === 'number' && Number.isFinite(v) && v > 0 ? v : null;
+
+/**
+ * Detail-page destination for a point — the EXACT same navigation the scatter
+ * tooltip offers on point click: agentic points with a persisted id go to the
+ * in-app `/inference/agentic/{id}` detail page; any other point with a
+ * `run_url` opens that GitHub Actions run (`isExternal: true`). Overlay
+ * (unofficial run) points have no DB benchmark id, so they get no link.
+ */
+export function pointDetailHref(
+ d: InferenceData,
+ isOverlay: boolean,
+): { href: string | null; isExternal: boolean } {
+ if (isOverlay) return { href: null, isExternal: false };
+ if (d.benchmark_type === 'agentic_traces' && isPersistedBenchmarkId(d.id)) {
+ return { href: `/inference/agentic/${d.id}`, isExternal: false };
+ }
+ if (d.run_url) return { href: updateRepoUrl(d.run_url), isExternal: true };
+ return { href: null, isExternal: false };
+}
+
+/**
+ * Shape a series' visible points into table rows, default-sorted by
+ * concurrency ascending, then parallelism label, then offload mode — the
+ * tie-breaks keep agentic on/off row pairs adjacent and deterministic. The input array is not mutated.
+ */
+export function buildLegendPointsRows(
+ points: InferenceData[],
+ isOverlay: boolean,
+): LegendPointsTableRow[] {
+ return points
+ .map((d, i) => {
+ const { href, isExternal } = pointDetailHref(d, isOverlay);
+ return {
+ key: `${d.hwKey}|${d.precision}|${d.conc}|${getPointLabel(d)}|${d.offload_mode ?? ''}|${i}`,
+ conc: d.conc,
+ parallelism: getPointLabel(d),
+ precision: d.precision,
+ offload: d.offload_mode ? d.offload_mode.toUpperCase() : null,
+ tputPerGpu: num(d.tput_per_gpu),
+ p50Intvty: num(d.median_intvty),
+ p90Intvty: num(d.p90_intvty),
+ p50Ttft: num(d.median_ttft),
+ p90Ttft: num(d.p90_ttft),
+ href,
+ isExternal,
+ };
+ })
+ .toSorted(
+ (a, b) =>
+ a.conc - b.conc ||
+ a.parallelism.localeCompare(b.parallelism) ||
+ (a.offload ?? '').localeCompare(b.offload ?? ''),
+ );
+}
+
+/** Returns a sorted copy of `rows` by `key`: nulls last in both directions; ties (and null/null pairs) fall back to ascending concurrency. */
+export function sortLegendPointsRows(
+ rows: LegendPointsTableRow[],
+ key: LegendPointsSortKey,
+ dir: 'asc' | 'desc',
+): LegendPointsTableRow[] {
+ const mul = dir === 'asc' ? 1 : -1;
+ return rows.toSorted((a, b) => {
+ const av = a[key];
+ const bv = b[key];
+ if (av === null && bv === null) return a.conc - b.conc;
+ if (av === null) return 1;
+ if (bv === null) return -1;
+ const cmp =
+ typeof av === 'string' || typeof bv === 'string'
+ ? String(av).localeCompare(String(bv))
+ : (av as number) - (bv as number);
+ return mul * cmp || a.conc - b.conc;
+ });
+}
+
+/** Table cell formatting — same capping as the scatter tooltip values. */
+export const formatRowValue = (v: number | null): string => (v === null ? '—' : fmt(v));
diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts
new file mode 100644
index 00000000..aaf715d3
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+
+import { configSegmentLabel, parallelismLabel } from './parallelism-label';
+
+describe('configSegmentLabel', () => {
+ it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => {
+ expect(configSegmentLabel(8, 8, false)).toBe('TEP8');
+ expect(configSegmentLabel(8, 8, true)).toBe('DEP8');
+ });
+
+ it('uses EP / DPAEP when ep>1 and tp!==ep', () => {
+ expect(configSegmentLabel(4, 16, false)).toBe('EP16');
+ expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16');
+ });
+
+ it('uses TP / DPATP when ep<=1 or absent', () => {
+ expect(configSegmentLabel(8, 1, false)).toBe('TP8');
+ expect(configSegmentLabel(8, undefined, false)).toBe('TP8');
+ expect(configSegmentLabel(8, 1, true)).toBe('DPATP8');
+ });
+});
+
+describe('parallelismLabel', () => {
+ it('falls back to bare tp when no ep data', () => {
+ expect(parallelismLabel({ tp: 8 })).toBe('8');
+ });
+
+ it('labels a single-segment config', () => {
+ expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8');
+ expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8');
+ });
+
+ it('builds multinode-disagg per-role worker segments', () => {
+ expect(
+ parallelismLabel({
+ tp: 8,
+ ep: 4,
+ disagg: true,
+ isMultinode: true,
+ prefillTp: 4,
+ prefillEp: 4,
+ prefillDpAttention: false,
+ prefillNumWorkers: 2,
+ decodeTp: 8,
+ decodeEp: 8,
+ decodeDpAttention: true,
+ decodeNumWorkers: 1,
+ }),
+ ).toBe('2xTEP4+1xDEP8');
+ });
+
+ it('single-node disagg uses the single (decode) segment, not worker syntax', () => {
+ // is_multinode false → no "NxPrefill+MxDecode" expansion.
+ expect(
+ parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }),
+ ).toBe('TEP8');
+ });
+});
diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts
new file mode 100644
index 00000000..98207110
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.ts
@@ -0,0 +1,79 @@
+/**
+ * Shared parallelism-config labeling — the single source of truth for the
+ * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels.
+ *
+ * Used by the scatter/GPU chart point labels (via getPointLabel) and the
+ * agentic detail page's sibling navigator chips, so both surfaces describe a
+ * config identically.
+ */
+
+/**
+ * Generates a short config segment label from parallelism params.
+ * - tp == ep with ep > 1, dp-attn falsy: "TEP{N}"
+ * - tp == ep with ep > 1, dp-attn true: "DEP{N}"
+ * - ep > 1 (tp != ep): "EP{ep}", or "DPAEP{ep}" with dp-attn
+ * - ep <= 1 (or no EP): "TP{tp}", or "DPATP{tp}" with dp-attn
+ */
+export const configSegmentLabel = (
+ tp: number,
+ ep: number | undefined,
+ dpAttention: boolean | undefined,
+): string => {
+ if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
+ return dpAttention ? `DEP${tp}` : `TEP${tp}`;
+ }
+ const dpaPrefix = dpAttention ? 'DPA' : '';
+ if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
+ return `${dpaPrefix}EP${ep}`;
+};
+
+/** Parallelism params for one benchmark config, framework-agnostic. */
+export interface ParallelismFields {
+ tp: number;
+ ep?: number;
+ dpAttention?: boolean;
+ disagg?: boolean;
+ isMultinode?: boolean;
+ prefillTp?: number;
+ prefillEp?: number;
+ prefillDpAttention?: boolean;
+ prefillNumWorkers?: number;
+ decodeTp?: number;
+ decodeEp?: number;
+ decodeDpAttention?: boolean;
+ decodeNumWorkers?: number;
+}
+
+/**
+ * Returns the short parallelism label for a config.
+ * - Neither ep nor prefillEp set (old rows): falls back to the bare tp value (e.g. "8").
+ * - Multinode disagg: "{prefill workers}x{prefill segment}+{decode workers}x{decode segment}",
+ * e.g. "2xEP4+1xDPAEP32"; missing per-role fields fall back to tp/ep/dpAttention, worker counts to 1.
+ * - Otherwise: a single segment from (tp, ep, dpAttention).
+ */
+export const parallelismLabel = (f: ParallelismFields): string => {
+ if (
+ (f.ep === null || f.ep === undefined) &&
+ (f.prefillEp === null || f.prefillEp === undefined)
+ ) {
+ return String(f.tp);
+ }
+
+ if (f.isMultinode && f.disagg) {
+ const prefillLabel = configSegmentLabel(
+ f.prefillTp ?? f.tp,
+ f.prefillEp ?? f.ep,
+ f.prefillDpAttention ?? f.dpAttention,
+ );
+ const decodeLabel = configSegmentLabel(
+ f.decodeTp ?? f.tp,
+ f.decodeEp ?? f.ep,
+ f.decodeDpAttention ?? f.dpAttention,
+ );
+ const pw = f.prefillNumWorkers ?? 1;
+ const dw = f.decodeNumWorkers ?? 1;
+ return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
+ }
+
+ return configSegmentLabel(f.tp, f.ep, f.dpAttention);
+};
diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
index 5a5bd7e9..8755fbe7 100644
--- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts
+++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
@@ -150,6 +150,26 @@ describe('getPointLabel', () => {
// generateTooltipContent
// ===========================================================================
describe('generateTooltipContent', () => {
+ it('renders View charts as a same-tab anchor so browsers offer open-in-new-tab', () => {
+ const html = generateTooltipContent(
+ tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+ );
+ expect(html).toContain(' {
+ // Overlay agentic points arrive with id 0 / NaN — the button would otherwise
+ // link to /inference/agentic/0, a doomed lookup.
+ for (const badId of [0, Number.NaN]) {
+ const html = generateTooltipContent(
+ tooltipConfig({ data: pt({ id: badId }), isPinned: true, hasTrace: true }),
+ );
+ expect(html).not.toContain('data-action="view-charts"');
+ }
+ });
+
it('includes hardware display label from config', () => {
const html = generateTooltipContent(tooltipConfig());
expect(html).toContain('H100');
@@ -365,4 +385,27 @@ describe('generateGPUGraphTooltipContent', () => {
);
expect(html).toContain('vllm-v0.6.0 abc123');
});
+
+ it('shows View charts only for pinned points with stored trace data', () => {
+ expect(
+ generateGPUGraphTooltipContent(
+ tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+ ),
+ ).toContain('data-action="view-charts"');
+ expect(
+ generateGPUGraphTooltipContent(
+ tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+ ),
+ ).toContain('href="/inference/agentic/1"');
+ expect(
+ generateGPUGraphTooltipContent(
+ tooltipConfig({ data: pt({ id: 1 }), isPinned: false, hasTrace: true }),
+ ),
+ ).not.toContain('data-action="view-charts"');
+ expect(
+ generateGPUGraphTooltipContent(
+ tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: false }),
+ ),
+ ).not.toContain('data-action="view-charts"');
+ });
});
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 9143f40f..84398397 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -1,6 +1,8 @@
import { formatNumber, getDisplayLabel } from '@/lib/utils';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
export interface TooltipConfig {
/** The data point to display */
@@ -19,6 +21,14 @@ export interface TooltipConfig {
isTracked?: boolean;
/** URL to the GitHub Actions workflow run */
runUrl?: string;
+ /**
+ * Whether this agentic point has a stored trace_replay blob. Controls
+ * visibility of the "View charts" button — the actual distributions are
+ * rendered on the detail page, not inline, so all the tooltip needs is a
+ * presence boolean (sourced from the bulk `/api/v1/trace-availability`
+ * call so we don't ship megabytes of profile JSONL just for this check).
+ */
+ hasTrace?: boolean;
}
export interface OverlayTooltipConfig extends TooltipConfig {
@@ -26,57 +36,37 @@ export interface OverlayTooltipConfig extends TooltipConfig {
overlayData: OverlayData;
}
-/**
- * Generates a short config segment label from parallelism params.
- * - tp == ep and dp-attn false: "TEP{N}"
- * - tp == ep and dp-attn true: "DEP{N}"
- * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
- * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
- */
-const configSegmentLabel = (
- tp: number,
- ep: number | undefined,
- dpAttention: boolean | undefined,
-): string => {
- if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
- return dpAttention ? `DEP${tp}` : `TEP${tp}`;
- }
- const dpaPrefix = dpAttention ? 'DPA' : '';
- if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
- return `${dpaPrefix}EP${ep}`;
-};
+// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the
+// transform narrows "true"/"false" → boolean). Coerce for the shared labeler:
+// the string "true" → true, any other string → false; boolean/undefined pass through unchanged.
+const asBool = (v: boolean | string | undefined): boolean | undefined =>
+ typeof v === 'string' ? v === 'true' : v;
/**
* Returns the short label for a data point on the chart.
* - Non-multinode: e.g. "TP8", "EP8", "TEP8", "DEP8", "DPAEP8"
* - Multinode disagg: e.g. "2xEP4+1xDPAEP32"
* - Old data (no ep field): falls back to tp value
+ *
+ * Delegates to the shared {@link parallelismLabel} so the chart points and the
+ * agentic sibling navigator describe a config identically.
*/
-export const getPointLabel = (d: InferenceData): string => {
- if (
- (d.ep === null || d.ep === undefined) &&
- (d.prefill_ep === null || d.prefill_ep === undefined)
- )
- return String(d.tp);
-
- if (d.is_multinode && d.disagg) {
- const prefillLabel = configSegmentLabel(
- d.prefill_tp ?? d.tp,
- d.prefill_ep ?? d.ep,
- d.prefill_dp_attention ?? d.dp_attention,
- );
- const decodeLabel = configSegmentLabel(
- d.decode_tp ?? d.tp,
- d.decode_ep ?? d.ep,
- d.decode_dp_attention ?? d.dp_attention,
- );
- const pw = d.prefill_num_workers ?? 1;
- const dw = d.decode_num_workers ?? 1;
- return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
- }
-
- return configSegmentLabel(d.tp, d.ep, d.dp_attention);
-};
+export const getPointLabel = (d: InferenceData): string =>
+ parallelismLabel({
+ tp: d.tp,
+ ep: d.ep,
+ dpAttention: asBool(d.dp_attention),
+ disagg: d.disagg,
+ isMultinode: d.is_multinode,
+ prefillTp: d.prefill_tp,
+ prefillEp: d.prefill_ep,
+ prefillDpAttention: asBool(d.prefill_dp_attention),
+ prefillNumWorkers: d.prefill_num_workers,
+ decodeTp: d.decode_tp,
+ decodeEp: d.decode_ep,
+ decodeDpAttention: asBool(d.decode_dp_attention),
+ decodeNumWorkers: d.decode_num_workers,
+ });
const runLinkHTML = (runUrl?: string) =>
runUrl
@@ -88,6 +78,79 @@ const runLinkHTML = (runUrl?: string) =>
const tooltipLine = (label: string, value: string | number) =>
`${label}: ${value}
`;
+const formatPct = (v: number | undefined): string | null =>
+ v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
+
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped); once the rounded magnitude reaches 10000 they are en-US comma-grouped, and non-finite values are stringified as-is.
+ * Exported so the legend points table shows exactly the numbers the tooltip shows. */
+export const fmt = (v: number): string => {
+ if (!Number.isFinite(v)) return String(v);
+ const rounded = parseFloat(v.toFixed(3));
+ if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
+ return String(rounded);
+};
+
+/**
+ * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
+ * success, token totals. Returns an empty string for non-agentic rows.
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+ if (d.benchmark_type !== 'agentic_traces') return '';
+
+ const parts: string[] = [];
+ if (d.offload_mode) {
+ parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+ }
+
+ const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+ const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+ const theoHit = formatPct(d.theoretical_cache_hit_rate);
+ if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+ if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+ if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+
+ if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) {
+ const successPct =
+ d.num_requests_total > 0
+ ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)`
+ : '';
+ parts.push(
+ tooltipLine(
+ 'Requests',
+ `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`,
+ ),
+ );
+ }
+
+ if (d.total_prompt_tokens !== undefined) {
+ parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+ }
+ if (d.total_generation_tokens !== undefined) {
+ parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+ }
+
+ // Histograms + time-series live on the dedicated detail page now; the
+ // "View charts" button (rendered by the wrapper when pinned + has trace
+ // data) takes the user there.
+
+ return parts.join('');
+};
+
+/** "View charts" link — only visible when the tooltip is pinned, the point has stored trace
+ * data, and its id is a persisted benchmark id (otherwise returns ''). Wired up by the scatter/GPU graph click handlers. */
+const viewChartsButtonHTML = (
+ isPinned: boolean,
+ hasTraceData: boolean,
+ pointId: number | undefined,
+): string => {
+ if (!isPinned || !hasTraceData || !isPersistedBenchmarkId(pointId)) return '';
+ return ` View charts → `;
+};
+
const shortenSha = (image: string) =>
image.replaceAll(/(?sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$…');
@@ -139,7 +202,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
* @returns HTML string for the tooltip content
*/
export const generateTooltipContent = (config: TooltipConfig): string => {
- const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+ const {
+ data: d,
+ isPinned,
+ xLabel,
+ yLabel,
+ selectedYAxisMetric,
+ hardwareConfig,
+ runUrl,
+ hasTrace,
+ } = config;
return `
@@ -157,16 +229,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
: ''
}
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${
selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)}
+ Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
`
: ''
}
@@ -174,7 +246,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)}
+ Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
`
: ''
}
@@ -183,10 +255,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
${runLinkHTML(runUrl)}
+ ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
${
isPinned
? `
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${tooltipLine('Total GPUs', d.tp)}
${generateParallelismHTML(d)}
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`;
};
@@ -254,7 +329,16 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
* @returns HTML string for the tooltip content
*/
export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => {
- const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+ const {
+ data: d,
+ isPinned,
+ xLabel,
+ yLabel,
+ selectedYAxisMetric,
+ hardwareConfig,
+ runUrl,
+ hasTrace,
+ } = config;
return `
@@ -272,16 +356,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
: ''
}
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${
selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)}
+ Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
`
: ''
}
@@ -289,7 +373,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)}
+ Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
`
: ''
}
@@ -298,10 +382,12 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
${runLinkHTML(runUrl)}
+ ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
`;
};
diff --git a/packages/app/src/components/ui/chart-legend-item.tsx b/packages/app/src/components/ui/chart-legend-item.tsx
index fae83360..07344270 100644
--- a/packages/app/src/components/ui/chart-legend-item.tsx
+++ b/packages/app/src/components/ui/chart-legend-item.tsx
@@ -1,4 +1,4 @@
-import { X } from 'lucide-react';
+import { Table2, X } from 'lucide-react';
import React from 'react';
import { cn } from '@/lib/utils';
@@ -19,6 +19,12 @@ export interface CommonLegendItemProps {
isLegendExpanded?: boolean; // Whether the legend is expanded to show full text
sidebarMode?: boolean; // Use sidebar-style visual feedback (line-through + faded dot)
onRemove?: (name: string) => void;
+ /**
+ * When provided, renders a small table icon that opens a per-series points
+ * table (all data points for this hardware/framework series). Only the
+ * inference tab's legend passes this — other tabs get no icon.
+ */
+ onShowPoints?: (name: string) => void;
}
const ChartLegendItem: React.FC
= ({
@@ -36,6 +42,7 @@ const ChartLegendItem: React.FC = ({
isLegendExpanded = true,
sidebarMode = false,
onRemove,
+ onShowPoints,
}) => {
const id = `checkbox-${hw || name}`; // Unique ID for accessibility
const isLongText = (label ?? '').length > 8;
@@ -97,6 +104,20 @@ const ChartLegendItem: React.FC = ({
{label}
+ {onShowPoints && (
+ onShowPoints(hw || name)}
+ // Reduced opacity at rest (still visible/tappable on touch), full on
+ // row hover or keyboard focus. ml-auto pins the icon to the row's
+ // right edge so icons align in a column across variable-length labels.
+ className="ml-auto shrink-0 p-1 -my-1 rounded-sm text-muted-foreground hover:text-foreground opacity-35 group-hover/row:opacity-100 focus-visible:opacity-100 transition-opacity no-export"
+ >
+
+
+ )}
>
);
@@ -104,6 +125,7 @@ const ChartLegendItem: React.FC = ({
'transition-opacity duration-300',
isActive ? 'opacity-100' : sidebarMode ? 'no-export' : 'opacity-50 no-export',
isHighlighted && 'text-red-900 dark:text-red-400 font-bold',
+ onShowPoints && 'group/row flex w-full items-center',
);
if (asFragment) {
diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx
index 25238522..86fadfad 100644
--- a/packages/app/src/components/ui/chart-legend.tsx
+++ b/packages/app/src/components/ui/chart-legend.tsx
@@ -8,6 +8,7 @@ import {
ChevronRight,
Circle,
Diamond,
+ Info,
Square,
Triangle,
X,
@@ -38,6 +39,8 @@ export interface LegendSwitchConfig {
label: string;
checked: boolean;
onCheckedChange: (checked: boolean) => void;
+ /** Optional explainer rendered as an info-icon tooltip next to the label. */
+ infoTooltip?: React.ReactNode;
advanced?: boolean;
}
@@ -279,6 +282,29 @@ export default function ChartLegend({
>
{sw.label}
+ {sw.infoTooltip && (
+
+
+
+
+
+
+
+
+ {sw.infoTooltip}
+
+
+
+ )}
))}
@@ -401,6 +427,7 @@ export default function ChartLegend({
onHover={onItemHover}
onHoverEnd={onItemHoverEnd}
onRemove={effectiveRemove}
+ onShowPoints={item.onShowPoints}
asFragment
isLegendExpanded={effectiveExpanded}
sidebarMode={isSidebar}
@@ -412,7 +439,9 @@ export default function ChartLegend({
{enableTooltips ? (
- {legendItem}
+ {/* Full width when the row carries a points-table icon so the
+ ml-auto icon pins to a consistent right-edge column. */}
+ {legendItem}
{item.isHighlighted && item.tooltip && (
@@ -495,6 +524,7 @@ export default function ChartLegend({
onHover={onItemHover}
onHoverEnd={onItemHoverEnd}
onRemove={effectiveRemove}
+ onShowPoints={item.onShowPoints}
sidebarMode={isSidebar}
asFragment
/>
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index de18da09..6aee97dd 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -5,17 +5,30 @@ import { Info } from 'lucide-react';
import { LabelWithTooltip } from '@/components/ui/label-with-tooltip';
import { track } from '@/lib/analytics';
import { MultiSelect } from '@/components/ui/multi-select';
+import {
+ Select,
+ SelectContent,
+ SelectGroup,
+ SelectItem,
+ SelectLabel,
+ SelectTrigger,
+ SelectValue,
+} from '@/components/ui/select';
import { TooltipContent, TooltipRoot, TooltipTrigger } from '@/components/ui/tooltip';
import {
type Model,
type Precision,
type Sequence,
+ type Percentile,
+ PERCENTILE_OPTIONS,
getModelCategory,
getModelLabel,
+ getPercentileLabel,
getPrecisionLabel,
getSequenceCategory,
getSequenceLabel,
groupByCategory,
+ sequenceKind,
} from '@/lib/data-mappings';
function CategorySectionTitle({ label, reason }: { label: string; reason: string }) {
@@ -228,6 +241,143 @@ export function SequenceSelector({
);
}
+interface ScenarioSelectorProps {
+ id?: string;
+ value: string;
+ onChange: (value: Sequence) => void;
+ open?: boolean;
+ onOpenChange?: (open: boolean) => void;
+ availableSequences: string[];
+ 'data-testid'?: string;
+}
+
+/**
+ * Scenario selector — agentic-trace rows come first under "Agentic", then
+ * fixed-seq-len rows under "Fixed Sequence Length". Label is "Scenario" (the
+ * ISL/OSL framing only applies to the fixed-seq subset).
+ */
+export function ScenarioSelector({
+ id = 'scenario-select',
+ value,
+ onChange,
+ open,
+ onOpenChange,
+ availableSequences,
+ 'data-testid': testId,
+}: ScenarioSelectorProps) {
+ const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq');
+ const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic');
+ const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence));
+
+ return (
+
+
+ {
+ track('selector_scenario_changed', { scenario: v });
+ onChange(v as Sequence);
+ }}
+ open={open}
+ onOpenChange={onOpenChange}
+ >
+
+
+
+
+ {/* Agentic first — preferred default scenario when available. */}
+ {agentic.length > 0 && (
+
+ Agentic
+ {agentic.map((seq) => (
+
+ {getSequenceLabel(seq as Sequence)}
+
+ ))}
+
+ )}
+ {fixedSeq.length > 0 && (
+
+ Fixed Sequence Length
+ {fixedGroups.default.map((seq) => (
+
+ {getSequenceLabel(seq as Sequence)}
+
+ ))}
+ {fixedGroups.deprecated.length > 0 && (
+ <>
+
+
+
+ {fixedGroups.deprecated.map((seq) => (
+
+ {getSequenceLabel(seq as Sequence)}
+
+ ))}
+ >
+ )}
+
+ )}
+
+
+
+ );
+}
+
+interface PercentileSelectorProps {
+ id?: string;
+ value: string;
+ onChange: (value: Percentile) => void;
+ 'data-testid'?: string;
+}
+
+/**
+ * Latency percentile selector for agentic-trace charts. The selected value
+ * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so
+ * picking p99 plots p99 e2e latency / interactivity instead of the median.
+ */
+export function PercentileSelector({
+ id = 'percentile-select',
+ value,
+ onChange,
+ 'data-testid': testId,
+}: PercentileSelectorProps) {
+ return (
+
+
+ {
+ track('selector_percentile_changed', { percentile: v });
+ onChange(v as Percentile);
+ }}
+ >
+
+
+
+
+ {PERCENTILE_OPTIONS.map((p) => (
+
+ {getPercentileLabel(p)}
+
+ ))}
+
+
+
+ );
+}
+
interface PrecisionSelectorProps {
id?: string;
value: string[];
diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx
index 0392ac10..44013b1b 100644
--- a/packages/app/src/components/ui/d3-chart-wrapper.tsx
+++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx
@@ -1,6 +1,41 @@
'use client';
-import React from 'react';
+import React, { useEffect, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+/**
+ * Renders the d3 tooltip element via React Portal to document.body so it
+ * escapes any parent stacking context (e.g. the chart Card's backdrop-filter
+ * creates one, trapping z-index inside it). Position is set as viewport
+ * coordinates by the d3 layer.
+ */
+function PortalTooltip({
+ tooltipRef,
+ pinned,
+}: {
+ tooltipRef: React.RefObject;
+ pinned: boolean;
+}) {
+ const [mounted, setMounted] = useState(false);
+ useEffect(() => setMounted(true), []);
+ const node = (
+
+ );
+ if (!mounted || typeof document === 'undefined') return node;
+ return createPortal(node, document.body);
+}
export interface D3ChartWrapperProps {
chartId: string;
@@ -72,17 +107,11 @@ export function D3ChartWrapper({
}
}}
/>
-
+ {/* Tooltip is portalled to document.body with position:fixed so it can
+ rise above sibling chart cards' stacking contexts. The d3 layer
+ writes viewport-coords into style.left/top — see
+ computeTooltipPosition. */}
+
{noDataOverlay}
{instructions}
diff --git a/packages/app/src/components/ui/tabs.tsx b/packages/app/src/components/ui/tabs.tsx
index a54963a8..4669e9e1 100644
--- a/packages/app/src/components/ui/tabs.tsx
+++ b/packages/app/src/components/ui/tabs.tsx
@@ -17,17 +17,19 @@ function Tabs({ className, ...props }: React.ComponentProps) {
return (
-
+
);
}
+// Active/inactive recipe mirrors the top-of-page section nav
+// (data-testid="chart-section-tabs" in src/components/tab-nav.tsx: tabLinkClass +
+// currentTabClass) so the two tab rows read as the same flat underline-strip
+// component: accent text + accent border-b-2 underline when active, muted text
+// with no background fill when inactive, and a faint border highlight on hover.
function TabsTrigger({ className, ...props }: React.ComponentProps) {
return (
= {}): BenchmarkRow {
return {
+ id: 1,
hardware: 'h200',
framework: 'sglang',
model: 'dsr1',
@@ -29,6 +30,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow {
decode_num_workers: 0,
num_prefill_gpu: 8,
num_decode_gpu: 8,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
isl: 1024,
osl: 1024,
conc: 128,
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index 310a4d1a..54b470ff 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -12,7 +12,7 @@ import {
import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types';
import { UnofficialBanner } from '@/components/ui/unofficial-banner';
-import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants';
import { computeToggle } from '@/hooks/useTogglableSet';
import type { BenchmarkRow, EvalRow } from '@/lib/api';
import { normalizeEvalHardwareKey } from '@/lib/chart-utils';
@@ -110,7 +110,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData
const groups = new Map();
for (const row of benchmarks) {
const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? row.model;
- const sequence = islOslToSequence(row.isl, row.osl);
+ const sequence = rowToSequence(row);
if (!sequence) continue;
const key = `${displayModel}_${sequence}`;
if (!groups.has(key)) groups.set(key, []);
diff --git a/packages/app/src/hooks/api/benchmark-id-query.test.ts b/packages/app/src/hooks/api/benchmark-id-query.test.ts
new file mode 100644
index 00000000..c7d951f4
--- /dev/null
+++ b/packages/app/src/hooks/api/benchmark-id-query.test.ts
@@ -0,0 +1,37 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { bulkIdsFetcher } from './benchmark-id-query';
+
+afterEach(() => {
+ vi.unstubAllGlobals();
+});
+
+describe('bulkIdsFetcher', () => {
+ it('returns an empty map without fetching for an empty id set', async () => {
+ const fetchMock = vi.fn();
+ vi.stubGlobal('fetch', fetchMock);
+
+ const result = await bulkIdsFetcher('trace-availability')([]);
+ expect(result).toEqual({});
+ expect(fetchMock).not.toHaveBeenCalled();
+ });
+
+ it('fetches the endpoint with comma-joined ids and returns the parsed map', async () => {
+ const fetchMock = vi.fn().mockResolvedValue(Response.json({ 1: true, 3: true }));
+ vi.stubGlobal('fetch', fetchMock);
+
+ const result = await bulkIdsFetcher('trace-availability')([1, 3]);
+ expect(result).toEqual({ 1: true, 3: true });
+ expect(fetchMock).toHaveBeenCalledWith('/api/v1/trace-availability?ids=1,3', {
+ signal: undefined,
+ });
+ });
+
+ it('throws with the endpoint name and status on a non-ok response', async () => {
+ vi.stubGlobal('fetch', vi.fn().mockResolvedValue(new Response('nope', { status: 500 })));
+
+ await expect(bulkIdsFetcher('trace-histograms')([1])).rejects.toThrow(
+ 'trace-histograms 500',
+ );
+ });
+});
diff --git a/packages/app/src/hooks/api/benchmark-id-query.ts b/packages/app/src/hooks/api/benchmark-id-query.ts
new file mode 100644
index 00000000..0aa50687
--- /dev/null
+++ b/packages/app/src/hooks/api/benchmark-id-query.ts
@@ -0,0 +1,59 @@
+import { useQuery } from '@tanstack/react-query';
+
+/**
+ * Shared React Query plumbing for the agentic endpoints keyed by
+ * `benchmark_results.id` (`/api/v1/<endpoint>?ids=…` bulk maps and
+ * `/api/v1/<endpoint>?id=N` single lookups).
+ *
+ * Conventions kept identical across all of these hooks:
+ * - queryKey = [endpoint, sorted-deduped-ids-comma-joined] so any
+ * permutation of the same id set hits the same cache entry
+ * - staleTime = 5 minutes (the underlying blobs are immutable per run)
+ * - bulk queries disabled for empty id sets; single queries 404 → null
+ */
+
+const STALE_TIME_MS = 5 * 60 * 1000;
+
+/** Build the standard bulk fetcher: GET `/api/v1/<endpoint>?ids=…` (ids comma-joined) → parsed JSON map. */
+export function bulkIdsFetcher(
+ endpoint: string,
+): (ids: number[], signal?: AbortSignal) => Promise> {
+ return async (ids, signal) => {
+ if (ids.length === 0) return {}; // empty id set: resolve to an empty map without touching the network
+ const res = await fetch(`/api/v1/${endpoint}?ids=${ids.join(',')}`, { signal });
+ if (!res.ok) throw new Error(`${endpoint} ${res.status}`); // message is "<endpoint> <status>"
+ return (await res.json()) as Record; // cast only: the response body is not validated here
+ };
+}
+
+/** Bulk map query over a set of benchmark_results ids; ids are de-duplicated and sorted before keying and fetching. */
+export function useBulkIdsQuery(
+ endpoint: string,
+ ids: number[],
+ enabled: boolean,
+ fetchByIds: (ids: number[], signal?: AbortSignal) => Promise,
+) {
+ const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b); // de-dupe + ascending sort so any permutation of the same ids shares one cache entry
+ return useQuery({
+ queryKey: [endpoint, sortedKey.join(',')] as const,
+ queryFn: ({ signal }: { signal: AbortSignal }) => fetchByIds(sortedKey, signal), // the fetcher receives the normalized ids, not the caller's array
+ enabled: enabled && sortedKey.length > 0, // never fire for an empty id set, even when the caller enables the query
+ staleTime: STALE_TIME_MS,
+ });
+}
+
+/** Single-payload query for one benchmark_results id; a falsy id or a 404 response resolves to null. */
+export function useByIdQuery(endpoint: string, id: number | null, enabled: boolean) {
+ return useQuery({
+ queryKey: [endpoint, id] as const,
+ queryFn: async ({ signal }): Promise => {
+ if (!id) return null; // null (or 0) id: nothing to look up, so skip the request
+ const res = await fetch(`/api/v1/${endpoint}?id=${id}`, { signal });
+ if (res.status === 404) return null; // a missing payload is an expected outcome, not an error
+ if (!res.ok) throw new Error(`${endpoint} ${res.status}`); // message is "<endpoint> <status>"
+ return (await res.json()) as T;
+ },
+ enabled,
+ staleTime: STALE_TIME_MS,
+ });
+}
diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts
new file mode 100644
index 00000000..7ca029cf
--- /dev/null
+++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts
@@ -0,0 +1,31 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export interface MetricPercentiles {
+ mean: number;
+ p50: number;
+ p75: number;
+ p90: number;
+ p99: number;
+ n: number;
+}
+
+export interface AgenticAggregate {
+ id: number;
+ isl: MetricPercentiles | null;
+ osl: MetricPercentiles | null;
+ kvCacheUtil: MetricPercentiles | null;
+ prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record;
+
+const fetchAgenticAggregates = bulkIdsFetcher('agentic-aggregates');
+
+/**
+ * Fetch per-id aggregate stats (mean/p50/p75/p90/p99) for ISL, OSL, KV
+ * cache utilization, and prefix cache hit rate. Used by the "Aggregates
+ * across configs" view on the agentic detail page.
+ */
+export function useAgenticAggregates(ids: number[], enabled = true) {
+ return useBulkIdsQuery('agentic-aggregates', ids, enabled, fetchAgenticAggregates);
+}
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
new file mode 100644
index 00000000..58469c26
--- /dev/null
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -0,0 +1,44 @@
+import { useByIdQuery } from './benchmark-id-query';
+
+export interface BenchmarkSibling {
+ id: number;
+ conc: number;
+ offload_mode: string | null;
+ decode_tp: number;
+ decode_ep: number;
+ decode_dp_attention: boolean;
+ decode_num_workers: number;
+ prefill_tp: number;
+ prefill_ep: number;
+ prefill_dp_attention: boolean;
+ prefill_num_workers: number;
+ num_prefill_gpu: number;
+ num_decode_gpu: number;
+ disagg: boolean;
+ is_multinode: boolean;
+ tput_per_gpu: number | null;
+ total_requests: number | null;
+ is_current: boolean;
+ has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+ hardware: string;
+ framework: string;
+ model: string;
+ precision: string;
+ spec_method: string;
+ benchmark_type: string;
+ github_run_id: number;
+ date: string;
+ dataset_slug: string | null;
+}
+
+export interface BenchmarkSiblings {
+ sku: BenchmarkSku;
+ siblings: BenchmarkSibling[];
+}
+
+export function useBenchmarkSiblings(id: number | null) {
+ return useByIdQuery('benchmark-siblings', id, id !== null && id > 0); // query only runs for a positive id
+}
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index a8d634f1..095cf192 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -28,6 +28,14 @@ export function benchmarkQueryOptions(
};
}
-export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
- return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
+export function useBenchmarks(
+ model: string,
+ date?: string,
+ enabled = true,
+ runId?: string,
+ exactRun?: boolean,
+) {
+ return useQuery(
+ benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId, exactRun),
+ );
}
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
new file mode 100644
index 00000000..ea1b17cf
--- /dev/null
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -0,0 +1,199 @@
+import { useQuery, keepPreviousData } from '@tanstack/react-query';
+
+import type {
+ ConversationStructure,
+ StructureNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+export type { ConversationStructure, StructureNode };
+
+export interface DatasetSummary {
+ blockSize?: number;
+ hashIdScope?: string | null;
+ totalIn?: number;
+ totalOut?: number;
+ totalCached?: number;
+ cachedPct?: number;
+ mainTurns?: number;
+ subagentGroups?: number;
+ subagentTurns?: number;
+ meanRequestsPerConversation?: number;
+ medianRequestsPerConversation?: number;
+ meanSubagentsPerTrace?: number;
+ medianSubagentsPerTrace?: number;
+ modelMix?: Record;
+ [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+ id: string;
+ slug: string;
+ label: string;
+ variant: string;
+ description: string | null;
+ hf_url: string | null;
+ license: string | null;
+ conversation_count: number;
+ summary: DatasetSummary;
+ ingested_at: string;
+}
+
+export interface HistogramBin {
+ x0: number;
+ x1: number;
+ count: number;
+}
+
+export interface DistributionStats {
+ count: number;
+ min: number;
+ max: number;
+ mean: number;
+ median: number;
+ /** Added in chart_data v2. */
+ p75?: number;
+ p90: number;
+ /** Added in chart_data v2. */
+ p95?: number;
+}
+
+export interface Distribution {
+ bins: HistogramBin[];
+ stats: DistributionStats;
+}
+
+export interface DatasetChartData {
+ version?: number;
+ inputTokensPerTurn?: Distribution;
+ uncachedInputTokensPerTurn?: Distribution;
+ outputTokensPerTurn?: Distribution;
+ subagentInputTokensPerRequest?: Distribution;
+ subagentOutputTokensPerRequest?: Distribution;
+ turnsPerConversation?: Distribution;
+ subagentGroupsPerConversation?: Distribution;
+ cachedFractionPerTurn?: Distribution;
+ [k: string]: unknown;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+ chart_data: DatasetChartData;
+}
+
+export interface ConversationListItem {
+ conv_id: string;
+ models: string[];
+ num_turns: number;
+ num_subagent_groups: number;
+ total_in: number;
+ total_out: number;
+ total_cached: number;
+}
+
+export interface ConversationList {
+ total: number;
+ items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+ conv_id: string;
+ models: string[];
+ num_turns: number;
+ num_subagent_groups: number;
+ total_in: number;
+ total_out: number;
+ total_cached: number;
+ structure: ConversationStructure;
+}
+
+export type ConversationSort = 'tokens' | 'turns' | 'subagents' | 'id';
+
+// Dataset contents only change on (rare) re-ingest, so cache aggressively.
+const DAY = 24 * 60 * 60 * 1000;
+
+/** Shared fetch for the per-dataset endpoints: 404 → null, other non-ok statuses throw `<label> <status>`. */
+async function fetchJsonOr404(
+ url: string,
+ label: string, // prefix used in the thrown error message
+ signal: AbortSignal,
+): Promise {
+ const res = await fetch(url, { signal });
+ if (res.status === 404) return null; // checked before res.ok so a 404 never reaches the throw below
+ if (!res.ok) throw new Error(`${label} ${res.status}`);
+ return (await res.json()) as T; // cast only: the response body is not validated here
+}
+
+/** All ingested datasets (registry cards). */
+export function useDatasets() {
+ return useQuery({
+ queryKey: ['datasets'] as const,
+ queryFn: async ({ signal }) => {
+ const res = await fetch('/api/v1/datasets', { signal });
+ if (!res.ok) throw new Error(`datasets ${res.status}`);
+ return (await res.json()) as DatasetRecord[];
+ },
+ staleTime: DAY,
+ });
+}
+
+/** One dataset incl. chart_data (null on 404). The slug is URL-encoded into the request path. */
+export function useDataset(slug: string | null) {
+  return useQuery({
+    queryKey: ['dataset', slug] as const,
+    queryFn: ({ signal }) =>
+      fetchJsonOr404(`/api/v1/datasets/${encodeURIComponent(slug ?? '')}`, 'dataset', signal),
+    enabled: Boolean(slug), // stays disabled until a non-empty slug is supplied
+    staleTime: DAY,
+  });
+}
+
+export interface UseConversationsArgs {
+ slug: string | null;
+ search?: string;
+ limit?: number;
+ offset?: number;
+ sort?: ConversationSort;
+}
+
+/** Paginated conversation list for a dataset (counts only); null on 404. The slug is URL-encoded. */
+export function useDatasetConversations({
+  slug,
+  search = '',
+  limit = 50,
+  offset = 0,
+  sort = 'tokens',
+}: UseConversationsArgs) {
+  return useQuery({
+    queryKey: ['dataset-conversations', slug, search, limit, offset, sort] as const,
+    queryFn: ({ signal }) => {
+      const qs = new URLSearchParams({
+        limit: String(limit),
+        offset: String(offset),
+        sort,
+      });
+      if (search) qs.set('search', search); // omit the param entirely for an empty search
+      return fetchJsonOr404(
+        `/api/v1/datasets/${encodeURIComponent(slug ?? '')}/conversations?${qs.toString()}`,
+        'dataset-conversations',
+        signal,
+      );
+    },
+    enabled: Boolean(slug),
+    placeholderData: keepPreviousData, // keep the previous page on screen while the next one loads
+    staleTime: DAY,
+  });
+}
+
+/** One conversation's flamegraph structure (null on 404); slug and convId are both URL-encoded. */
+export function useDatasetConversation(slug: string | null, convId: string | null) {
+  return useQuery({
+    queryKey: ['dataset-conversation', slug, convId] as const,
+    queryFn: ({ signal }) =>
+      fetchJsonOr404(
+        `/api/v1/datasets/${encodeURIComponent(slug ?? '')}/conversations/${encodeURIComponent(convId ?? '')}`,
+        'dataset-conversation',
+        signal,
+      ),
+    enabled: Boolean(slug) && Boolean(convId), // needs both path segments before fetching
+    staleTime: DAY,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..2e54f418
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
@@ -0,0 +1,13 @@
+import { describe, expect, it } from 'vitest';
+
+import { chunkDerivedAgenticMetricIds } from './use-derived-agentic-metrics';
+
+describe('chunkDerivedAgenticMetricIds', () => {
+ it('keeps every id while respecting the API limit', () => {
+ const ids = Array.from({ length: 401 }, (_, index) => index + 1);
+ const chunks = chunkDerivedAgenticMetricIds(ids);
+
+ expect(chunks.map((chunk) => chunk.length)).toEqual([200, 200, 1]);
+ expect(chunks.flat()).toEqual(ids);
+ });
+});
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
new file mode 100644
index 00000000..388563d9
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -0,0 +1,55 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export interface DerivedAgenticMetric {
+ id: number;
+ /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
+ * by mean_load / session_load. Null when the JSONL had no usable records. */
+ normalized_session_time_s: number | null;
+ /** P90 of per-turn ISL/TTFT across every turn in every session.
+ * Null when no prefill rates could be computed. */
+ p90_prefill_tps_per_user: number | null;
+ /** P75 normalized per-request E2E at a fixed 400-token output length. */
+ p75_normalized_e2e_400_s: number | null;
+ /** P90 normalized per-request E2E at a fixed 400-token output length. */
+ p90_normalized_e2e_400_s: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record;
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/** Split `ids` into consecutive, order-preserving slices of at most MAX_IDS_PER_REQUEST ids. */
+export function chunkDerivedAgenticMetricIds(ids: number[]): number[][] {
+  const chunkCount = Math.ceil(ids.length / MAX_IDS_PER_REQUEST);
+  return Array.from({ length: chunkCount }, (_, n) =>
+    ids.slice(n * MAX_IDS_PER_REQUEST, (n + 1) * MAX_IDS_PER_REQUEST),
+  );
+}
+
+const fetchChunk = bulkIdsFetcher('derived-agentic-metrics');
+
+// Unlike the other bulk endpoints, dashboards can put >200 agentic points on
+// screen at once, so this fetcher splits the id set across parallel requests
+// of at most MAX_IDS_PER_REQUEST ids each (meant to match the route's limit).
+async function fetchDerivedAgenticMetrics(
+ ids: number[],
+ signal?: AbortSignal,
+): Promise {
+ if (ids.length === 0) return {};
+ const maps = await Promise.all( // chunks run concurrently; one rejected chunk rejects the whole fetch
+ chunkDerivedAgenticMetricIds(ids).map((chunk) => fetchChunk(chunk, signal)), // every chunk shares the same abort signal
+ );
+ return Object.assign({}, ...maps) as DerivedAgenticMetricMap; // merge the per-chunk maps into a single id-keyed map
+}
+
+/**
+ * Fetch per-id derived agentic metrics (session time, p90 prefill TPS/user,
+ * p75/p90 normalized E2E) computed live from the stored aiperf
+ * profile_export.jsonl. Drives the "Session Time" / "Prefill TPS/user" charts.
+ *
+ * Ids without a trace_replay blob (older or non-aiperf agentic runs) are
+ * silently omitted from the response.
+ */
+export function useDerivedAgenticMetrics(ids: number[], enabled = true) {
+ return useBulkIdsQuery('derived-agentic-metrics', ids, enabled, fetchDerivedAgenticMetrics);
+}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
new file mode 100644
index 00000000..6f43de25
--- /dev/null
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -0,0 +1,53 @@
+import { useByIdQuery } from './benchmark-id-query';
+
+export interface RequestRecord {
+ /** Conversation id (groups turns of one agent session). */
+ cid: string;
+ /** Zero-based turn index within the conversation. */
+ ti: number;
+ /** Source trace id from the original raw dataset, when provided by AIPerf. */
+ srcTrace?: string;
+ /** Original raw top-level request index within srcTrace. */
+ srcOuter?: number;
+ /** Original nested request index within srcOuter, for subagent children. */
+ srcInner?: number;
+ /** Loader-specific source kind, e.g. weka_main or weka_flat. */
+ srcKind?: string;
+ /** Worker id (concurrency slot that handled this request). */
+ wid: string;
+ /** Sub-agent depth (0 = top-level). */
+ ad: number;
+ /** `warmup` or `profiling`. */
+ phase: string;
+ /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+ credit: number;
+ /** ns offset from timeline.startNs. HTTP send started. */
+ start: number;
+ /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+ ack: number | null;
+ /** ns offset from timeline.startNs. Last byte received. */
+ end: number;
+ ttftMs: number | null;
+ /** Time per output token in ms. */
+ tpotMs: number | null;
+ isl: number | null;
+ osl: number | null;
+ cancelled: boolean;
+}
+
+export interface RequestTimeline {
+ version: number;
+ startNs: number;
+ endNs: number;
+ durationS: number;
+ requests: RequestRecord[];
+}
+
+/**
+ * Lazy-fetch the per-request Gantt timeline for one agentic point.
+ * Enabled only when the caller opts in (e.g. the timeline view becomes
+ * active), so the payload (~30 KB per point) isn't paid for every page load.
+ */
+export function useRequestTimeline(id: number | null, enabled = false) {
+ return useByIdQuery('request-timeline', id, enabled && Boolean(id));
+}
diff --git a/packages/app/src/hooks/api/use-trace-availability.ts b/packages/app/src/hooks/api/use-trace-availability.ts
new file mode 100644
index 00000000..24e4c067
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-availability.ts
@@ -0,0 +1,15 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export type TraceAvailabilityMap = Record;
+
+const fetchTraceAvailability = bulkIdsFetcher('trace-availability');
+
+/**
+ * Bulk presence lookup: which of the given `benchmark_results.id`s have a
+ * stored trace_replay blob. Used by the scatter chart to decide whether to
+ * surface the "View charts" button — cheap boolean per id instead of
+ * shipping multi-MB profile blobs just for the check.
+ */
+export function useTraceAvailability(ids: number[], enabled = true) {
+ return useBulkIdsQuery('trace-availability', ids, enabled, fetchTraceAvailability);
+}
diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts
new file mode 100644
index 00000000..8197147a
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-histograms.ts
@@ -0,0 +1,25 @@
+import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query';
+
+export interface TraceHistogramPoint {
+ id: number;
+ /** Input sequence length (tokens) per completed request. */
+ isl: number[];
+ /** Output sequence length (tokens) per completed request. */
+ osl: number[];
+}
+
+export type TraceHistogramMap = Record;
+
+const fetchTraceHistograms = bulkIdsFetcher('trace-histograms');
+
+/**
+ * Fetch per-request ISL/OSL arrays for a set of benchmark_results.id values.
+ * Ids without a stored trace_replay blob are silently omitted from the response.
+ *
+ * Caller passes the agentic id set currently on screen; React Query handles
+ * dedup + stale-while-revalidate. Cache key is sorted-ids-comma-joined so
+ * any permutation of the same set hits the same cache entry.
+ * NOTE(review): the cache-key construction lives in `useBulkIdsQuery`, which
+ * is not defined in this file — confirm there before relying on it.
+ *
+ * @param ids `benchmark_results.id` values to fetch histograms for.
+ * @param enabled Passed straight through to `useBulkIdsQuery`; defaults to
+ *   `true`.
+ * @returns The result of `useBulkIdsQuery` called with `'trace-histograms'`
+ *   and the module-level `fetchTraceHistograms` fetcher.
+ */
+export function useTraceHistograms(ids: number[], enabled = true) {
+ return useBulkIdsQuery('trace-histograms', ids, enabled, fetchTraceHistograms);
+}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
new file mode 100644
index 00000000..47cf66a6
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -0,0 +1,96 @@
+import { useByIdQuery } from './benchmark-id-query';
+
+export interface TimeSeriesPoint {
+ /** Seconds from benchmark start. */
+ t: number;
+ value: number;
+}
+export interface QueueDepthPoint {
+ t: number;
+ running: number;
+ waiting: number;
+ total: number;
+}
+export interface PointMeta {
+ id: number;
+ hardware: string;
+ framework: string;
+ model: string;
+ precision: string;
+ spec_method: string;
+ disagg: boolean;
+ conc: number;
+ offload_mode: string | null;
+ isl: number | null;
+ osl: number | null;
+ benchmark_type: string;
+ date: string;
+ run_url: string | null;
+ server_gpu_cache_hit_rate: number | null;
+ server_cpu_cache_hit_rate: number | null;
+}
+
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+
+export interface MetricSource {
+ id: string;
+ adapter: string;
+ role: MetricSourceRole;
+ endpointUrl: string | null;
+ nativeRole: string | null;
+ workerId: string | null;
+ dpRank: string | null;
+ engine: string | null;
+}
+
+export interface MetricSourceSeries {
+ source: MetricSource;
+ kvCacheUsage: TimeSeriesPoint[];
+ prefixCacheHitRate: TimeSeriesPoint[];
+ queueDepth: QueueDepthPoint[];
+ promptTokensBySource: Record;
+ promptTps: TimeSeriesPoint[];
+ generationTps: TimeSeriesPoint[];
+ prefixCacheHitsTps: TimeSeriesPoint[];
+ hostKvCacheUsage: TimeSeriesPoint[];
+ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+export interface TraceServerMetrics {
+ meta: PointMeta;
+ startNs: number;
+ endNs: number;
+ durationS: number;
+ timeslicesCount: number;
+ kvCacheUsage: TimeSeriesPoint[];
+ prefixCacheHitRate: TimeSeriesPoint[];
+ queueDepth: QueueDepthPoint[];
+ promptTokensBySource: Record;
+ prefillTps: TimeSeriesPoint[];
+ decodeTps: TimeSeriesPoint[];
+ /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+ prefixCacheHitsTps: TimeSeriesPoint[];
+ /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+ hostKvCacheUsage: TimeSeriesPoint[];
+ /**
+ * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+ * the cluster-average `kvCacheUsage` line covers that case alone.
+ */
+ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+ /**
+ * Total KV-cache pool size in tokens (num_gpu_blocks × block_size, summed
+ * across engines). vLLM only — null for SGLang/TRT or older rows.
+ */
+ kvCachePoolTokens: number | null;
+ /** Orchestrator-normalized metrics grouped by endpoint/worker. */
+ metricSources: MetricSourceSeries[];
+}
+
+/**
+ * Lazy-fetch parsed server-metric time-series for one agentic point.
+ * Enabled only when the caller passes `enabled=true` (the detail panel opens),
+ * so we don't pay the parse cost on every hover.
+ *
+ * @param id Id of the point to load, or `null` when nothing is selected. A
+ *   falsy id (`null` or `0`) keeps the query disabled even when `enabled`
+ *   is true, because the flag is combined with `Boolean(id)`.
+ * @param enabled Caller opt-in flag; defaults to `false`.
+ * @returns The result of `useByIdQuery` called with `'trace-server-metrics'`.
+ */
+export function useTraceServerMetrics(id: number | null, enabled = false) {
+ return useByIdQuery('trace-server-metrics', id, enabled && Boolean(id));
+}
diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts
index 49812c3e..be095430 100644
--- a/packages/app/src/hooks/useChartContext.ts
+++ b/packages/app/src/hooks/useChartContext.ts
@@ -37,6 +37,12 @@ export function reconcileActiveSet(
interface UseChartStateConfig {
/** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */
urlPrefix: string;
+ /**
+ * Initial high-contrast value when the URL has no `${urlPrefix}hc` param.
+ * Defaults to false; the inference chart opts in to true. A `${urlPrefix}hc=0`
+ * URL param (e.g. `i_hc=0` for the `i_` prefix) overrides it back off.
+ */
+ defaultHighContrast?: boolean;
}
/**
@@ -44,7 +50,7 @@ interface UseChartStateConfig {
* Includes mobile-specific legend collapse behavior.
*/
export function useChartUIState(config: UseChartStateConfig) {
- const { urlPrefix } = config;
+ const { urlPrefix, defaultHighContrast = false } = config;
const { getUrlParam } = useUrlState();
const hcParam = `${urlPrefix}hc` as any;
@@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) {
// Initialize with safe defaults that match SSR output to avoid hydration mismatches.
// URL-param values are applied in a mount effect so the state is only set client-side.
- const [highContrast, setHighContrast] = useState(false);
+ const [highContrast, setHighContrast] = useState(defaultHighContrast);
const [isLegendExpanded, setIsLegendExpanded] = useState(true);
const didInit = useRef(false);
@@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) {
if (didInit.current) return;
didInit.current = true;
const hcVal = getUrlParam(hcParam);
+ // Respect both overrides so the toggle round-trips regardless of the default.
if (hcVal === '1') setHighContrast(true);
+ else if (hcVal === '0') setHighContrast(false);
const legendVal = getUrlParam(legendParam);
if (legendVal === '0') setIsLegendExpanded(false);
}, [getUrlParam, hcParam, legendParam]);
diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts
index 7275e384..11050d19 100644
--- a/packages/app/src/hooks/useThemeColors.test.ts
+++ b/packages/app/src/hooks/useThemeColors.test.ts
@@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => {
}
unmountOn();
});
+
+ // Regression: deselecting a legend line must not recolor the remaining lines.
+ // The HC palette is sized/indexed by the key set it's generated over, so when
+ // it was generated over the *active* subset (no hcKeys), shrinking the
+ // selection re-sized the palette and shifted every remaining line's hue (most
+ // visible on single-vendor agentic runs spanning the full wheel). Passing a
+ // stable `hcKeys` (the full set with data) fixes each line's color.
+ it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => {
+ const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison
+
+ const all = renderHook(() =>
+ useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }),
+ );
+ const b200WithBoth = all.result.current.resolveColor('b200');
+ const b300Color = all.result.current.resolveColor('b300');
+ all.unmount();
+
+ // b300 deselected → only b200 active, but hcKeys is still the full set.
+ const subset = renderHook(() =>
+ useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }),
+ );
+ const b200Alone = subset.result.current.resolveColor('b200');
+ subset.unmount();
+
+ expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu);
+ expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues
+ expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200
+ });
});
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 0dac5883..a9d66715 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types';
import type { SubmissionsResponse } from './submissions-types';
export interface BenchmarkRow {
+ /** Stable per-point id from benchmark_results; used for agentic detail lookups. */
+ id: number;
hardware: string;
framework: string;
model: string;
@@ -25,9 +27,13 @@ export interface BenchmarkRow {
decode_num_workers: number;
num_prefill_gpu: number;
num_decode_gpu: number;
- isl: number;
- osl: number;
+ benchmark_type: string;
+ // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+ isl: number | null;
+ osl: number | null;
conc: number;
+ /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows. */
+ offload_mode: string;
image: string | null;
metrics: Record;
/**
@@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
export interface AvailabilityRow {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
spec_method: string;
disagg: boolean;
+ benchmark_type: string;
date: string;
}
diff --git a/packages/app/src/lib/benchmark-id.test.ts b/packages/app/src/lib/benchmark-id.test.ts
new file mode 100644
index 00000000..0f9fb83b
--- /dev/null
+++ b/packages/app/src/lib/benchmark-id.test.ts
@@ -0,0 +1,33 @@
+import { describe, expect, it } from 'vitest';
+
+import { isPersistedBenchmarkId } from './benchmark-id';
+
+describe('isPersistedBenchmarkId', () => {
+ it('accepts a positive integer (a real bigserial row id)', () => {
+ expect(isPersistedBenchmarkId(1)).toBe(true);
+ expect(isPersistedBenchmarkId(206863)).toBe(true);
+ });
+
+ it('rejects 0 — bigserial starts at 1, so 0 is never a real row', () => {
+ expect(isPersistedBenchmarkId(0)).toBe(false);
+ });
+
+ it('rejects negatives', () => {
+ expect(isPersistedBenchmarkId(-1)).toBe(false);
+ });
+
+ it('rejects NaN (what Number(undefined) yields for overlay rows)', () => {
+ expect(isPersistedBenchmarkId(Number(undefined))).toBe(false);
+ expect(isPersistedBenchmarkId(NaN)).toBe(false);
+ });
+
+ it('rejects non-integers', () => {
+ expect(isPersistedBenchmarkId(1.5)).toBe(false);
+ expect(isPersistedBenchmarkId(Infinity)).toBe(false);
+ });
+
+ it('rejects null / undefined', () => {
+ expect(isPersistedBenchmarkId(null)).toBe(false);
+ expect(isPersistedBenchmarkId(undefined)).toBe(false);
+ });
+});
diff --git a/packages/app/src/lib/benchmark-id.ts b/packages/app/src/lib/benchmark-id.ts
new file mode 100644
index 00000000..b1ccb8bc
--- /dev/null
+++ b/packages/app/src/lib/benchmark-id.ts
@@ -0,0 +1,20 @@
+/**
+ * Shared guard for `benchmark_results.id` values.
+ *
+ * `benchmark_results.id` is a Postgres bigserial that starts at 1, so a real
+ * persisted row always has a positive integer id. Overlay / `?unofficialrun=`
+ * points are transformed live from raw artifacts and never carry a DB id — the
+ * transform yields `undefined` (older code produced `NaN` via `Number(undefined)`).
+ *
+ * A bare `typeof id === 'number'` check is NOT enough: `NaN` and `0` are both
+ * `number` yet neither is a real row. Passing them to the id-keyed endpoints
+ * (`/api/v1/derived-agentic-metrics?ids=…`, `…?id=…`) yields a 400 (the routes
+ * filter to `Number.isFinite(n) && n > 0`), and building an
+ * `/inference/agentic/{id}` link out of one points at a non-existent row.
+ *
+ * Use this predicate at every site that collects ids for a fetch or builds a
+ * per-point detail link so overlay-only views skip cleanly instead of erroring.
+ *
+ * @param id Candidate id; `null` and `undefined` are accepted as input and
+ *   always rejected.
+ * @returns `true` (narrowing `id` to `number`) only for a positive integer.
+ *   `NaN`, `Infinity`, fractional values, `0` and negatives all yield `false`.
+ */
+export function isPersistedBenchmarkId(id: number | null | undefined): id is number {
+ return typeof id === 'number' && Number.isInteger(id) && id > 0;
+}
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 8f27cc8f..c08137e6 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -2,10 +2,15 @@ import { describe, it, expect, vi } from 'vitest';
import type { BenchmarkRow } from '@/lib/api';
-import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform';
+import {
+ mergeRunScopedRows,
+ rowToAggDataEntry,
+ transformBenchmarkRows,
+} from './benchmark-transform';
function makeRow(overrides: Partial = {}): BenchmarkRow {
return {
+ id: 1,
hardware: 'h200',
framework: 'trt',
model: 'dsr1',
@@ -23,6 +28,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow {
decode_num_workers: 0,
num_prefill_gpu: 8,
num_decode_gpu: 8,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
isl: 1024,
osl: 1024,
conc: 64,
@@ -793,3 +800,166 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => {
expect(point.decode_dp_attention).toBe(true);
});
});
+
+describe('mergeRunScopedRows', () => {
+ const vllmRun = (over: Partial = {}) =>
+ makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over });
+ const sglangBase = (over: Partial = {}) =>
+ makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over });
+
+ it('pins configs the run covers to the run rows, replacing base rows', () => {
+ const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })];
+ const baseRows = [vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })];
+ const merged = mergeRunScopedRows(runRows, baseRows);
+ // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a
+ // partial-sweep run must fully own its config or the DISTINCT-ON mixing
+ // the scoping exists to prevent comes right back.
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]);
+ });
+
+ it('carries forward configs the run does not cover (the same-day other-framework curve)', () => {
+ const runRows = [vllmRun({ id: 10 })];
+ const baseRows = [
+ vllmRun({ id: 90 }),
+ sglangBase({ id: 91 }),
+ sglangBase({ id: 92, conc: 128 }),
+ ];
+ const merged = mergeRunScopedRows(runRows, baseRows);
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]);
+ });
+
+ it('keeps base rows of other hardware / precision / model untouched', () => {
+ const runRows = [vllmRun({ id: 10 })];
+ const baseRows = [
+ vllmRun({ id: 90, hardware: 'b200' }),
+ vllmRun({ id: 91, precision: 'fp8' }),
+ vllmRun({ id: 92, model: 'kimik2.5' }),
+ ];
+ const merged = mergeRunScopedRows(runRows, baseRows);
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]);
+ });
+
+ it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => {
+ const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })];
+ const baseRows = [
+ vllmRun({ id: 90, benchmark_type: 'agentic_traces' }),
+ vllmRun({ id: 91, benchmark_type: 'single_turn' }),
+ ];
+ const merged = mergeRunScopedRows(runRows, baseRows);
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]);
+ });
+
+ it('returns base rows unchanged when the run produced nothing', () => {
+ const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })];
+ expect(mergeRunScopedRows([], baseRows)).toBe(baseRows);
+ });
+});
+
+describe('rowToAggDataEntry — agentic interactivity invariant', () => {
+ // Agentic artifacts have shipped *_intvty under two definitions across harness
+ // versions (slow-tail 1/p(ITL) vs fast-tail p(1/ITL)). The chart's
+ // interactivity selector is slow-tail, so we always derive intvty = 1/itl and
+ // discard the artifact value. Mirrors the ingest mapper + backfill.
+ const agentic = (metrics: Record) =>
+ rowToAggDataEntry(makeRow({ benchmark_type: 'agentic_traces', isl: null, osl: null, metrics }));
+
+ it('overrides an artifact-supplied (fast-tail) *_intvty with 1/*_itl', () => {
+ const entry = agentic({
+ p90_itl: 0.0893, // slow-tail 1/itl ≈ 11.198
+ p90_intvty: 23.91, // fast-tail contamination — must be discarded
+ p75_itl: 0.0692,
+ p75_intvty: 19, // must be discarded
+ });
+ expect(entry.p90_intvty).toBeCloseTo(1 / 0.0893, 6);
+ expect(entry.p75_intvty).toBeCloseTo(1 / 0.0692, 6);
+ expect(entry.p90_intvty).not.toBeCloseTo(23.91, 1);
+ });
+
+ it('derives intvty from itl when the artifact omits intvty entirely', () => {
+ const entry = agentic({ p90_itl: 0.1, p95_itl: 0.2 });
+ expect(entry.p90_intvty).toBeCloseTo(10, 6);
+ expect(entry.p95_intvty).toBeCloseTo(5, 6);
+ });
+
+ it('does not invert interactivity for single_turn rows', () => {
+ const entry = rowToAggDataEntry(makeRow({ metrics: { p90_itl: 0.05, p90_intvty: 999 } }));
+ expect(entry.p90_intvty).toBe(999);
+ });
+
+ it('DROPS a stale artifact *_intvty when the matching *_itl is absent (overlay mirror of the ETL fix)', () => {
+ // Artifact carries intvty (possibly the drifted p(1/ITL) definition) but no
+ // itl for that percentile — the value can't be reconciled to 1/p(ITL), so it
+ // must be discarded, not passed through. rowToAggDataEntry then coerces the
+ // now-missing key to 0.
+ const entry = agentic({ p90_intvty: 42, p95_itl: 0.2 });
+ expect(entry.p90_intvty).toBe(0); // dropped → default 0
+ expect(entry.p95_intvty).toBeCloseTo(5, 6); // derived from itl
+ });
+
+ it('DROPS a stale artifact *_intvty when the matching *_itl is zero/invalid', () => {
+ const entry = agentic({ p90_itl: 0, p90_intvty: 42 });
+ expect(entry.p90_intvty).toBe(0);
+ });
+});
+
+// ---------------------------------------------------------------------------
+// rowToAggDataEntry — persisted-id guard (overlay rows carry no DB id)
+// ---------------------------------------------------------------------------
+describe('rowToAggDataEntry — id coercion', () => {
+ it('coerces a stringified bigint id to a number', () => {
+ const entry = rowToAggDataEntry(makeRow({ id: '206863' as unknown as number }));
+ expect(entry.id).toBe(206863);
+ });
+
+ it('yields undefined (not NaN) for a missing id — overlay rows have no persisted id', () => {
+ const entry = rowToAggDataEntry(makeRow({ id: undefined as unknown as number }));
+ expect(entry.id).toBeUndefined();
+ });
+
+ it('yields undefined for a non-positive or non-numeric id', () => {
+ expect(rowToAggDataEntry(makeRow({ id: 0 })).id).toBeUndefined();
+ expect(rowToAggDataEntry(makeRow({ id: 'abc' as unknown as number })).id).toBeUndefined();
+ });
+});
+
+// ---------------------------------------------------------------------------
+// mergeRunScopedRows — offload-aware scoping (data-loss guard)
+// ---------------------------------------------------------------------------
+describe('mergeRunScopedRows — offload variants are distinct series', () => {
+ const agenticRow = (over: Partial = {}) =>
+ makeRow({
+ model: 'dsr1',
+ hardware: 'b300',
+ framework: 'vllm',
+ precision: 'fp4',
+ benchmark_type: 'agentic_traces',
+ isl: null,
+ osl: null,
+ ...over,
+ });
+
+ it('a run row for offload=on does NOT claim/suppress the base offload=off rows', () => {
+ // The selected run produced only the offload=on variant. The offload=off base
+ // rows are a separate series and must carry forward, not vanish.
+ const runRows = [agenticRow({ id: 10, offload_mode: 'on' })];
+ const baseRows = [
+ agenticRow({ id: 90, offload_mode: 'on' }), // same series as the run → replaced
+ agenticRow({ id: 91, offload_mode: 'off' }), // distinct series → kept
+ ];
+ const merged = mergeRunScopedRows(runRows, baseRows);
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]);
+ });
+
+ it('a run covering both offload variants pins both', () => {
+ const runRows = [
+ agenticRow({ id: 10, offload_mode: 'on' }),
+ agenticRow({ id: 11, offload_mode: 'off' }),
+ ];
+ const baseRows = [
+ agenticRow({ id: 90, offload_mode: 'on' }),
+ agenticRow({ id: 91, offload_mode: 'off' }),
+ ];
+ const merged = mergeRunScopedRows(runRows, baseRows);
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]);
+ });
+});
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index ac806b79..943da81b 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -13,12 +13,57 @@ import type {
} from '@/components/inference/types';
import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
import { getHardwareConfig } from '@/lib/constants';
+import { isPersistedBenchmarkId } from '@/lib/benchmark-id';
import type { BenchmarkRow } from '@/lib/api';
+/**
+ * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl
+ * but do not reliably emit the intvty/e2el/tpot keys the chart pipeline expects
+ * (some are missing, and `*_intvty` may be present under a different definition).
+ * Bridge them here:
+ * e2el ≡ ttlt (time-to-last-token == end-to-end latency)
+ * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output)
+ * intvty ≡ 1/itl (tok/s from the user's perspective)
+ *
+ * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS 1/itl:
+ * derived where itl is valid, overriding any artifact-supplied value, AND any
+ * artifact `*_intvty` is DROPPED where itl is absent/zero/invalid rather than
+ * passed through. The harness definition of `*_intvty` has drifted (some versions
+ * emit `p(1/ITL)`, which inverts percentile order), so for a slow-tail selector
+ * interactivity must be `1/p(ITL)`. This matches the ingest mapper for official
+ * rows; doing it here keeps overlay / `?unofficialrun=` rows (transformed live
+ * from raw artifacts, never through the DB) on the same single definition.
+ *
+ * Only the `mean`, `median`, `p75`, `p90`, `p95`, `p99` and `p99.9` key
+ * families are touched; every other key (e.g. `std_*`) is copied unchanged.
+ *
+ * @param raw Metrics map as received for the row. It is never mutated — all
+ *   writes and deletes go to a shallow copy.
+ * @returns A new metrics map with the aliases applied.
+ */
+function applyAgenticMetricAliases(raw: Record): Record {
+ const m: Record = { ...raw };
+ for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { // `suffix` is used as the key PREFIX (`${suffix}_itl`)
+ const itl = raw[`${suffix}_itl`];
+ const ttlt = raw[`${suffix}_ttlt`];
+ if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) m[`${suffix}_e2el`] = ttlt;
+ if (m[`${suffix}_tpot`] === undefined && itl !== undefined) m[`${suffix}_tpot`] = itl;
+ if (typeof itl === 'number' && itl > 0) m[`${suffix}_intvty`] = 1 / itl; // always overrides an artifact value
+ else delete m[`${suffix}_intvty`]; // no usable itl → drop rather than pass through
+ }
+ return m;
+}
+
/** Convert a DB benchmark row to an AggDataEntry. */
export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
- const m = row.metrics;
+ const isAgentic = row.benchmark_type === 'agentic_traces';
+ const m = isAgentic ? applyAgenticMetricAliases(row.metrics) : row.metrics;
+ // Prefer the dedicated column (added in migration 004); fall back to the
+ // legacy stash inside `metrics` for any rows ingested before that column
+ // existed.
+ const rawMetrics = row.metrics as Record;
+ const offloadMode =
+ row.offload_mode ??
+ (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
+ // Postgres bigint comes through the SQL client as a string; coerce it. Overlay
+ // rows (transformed live from raw artifacts) carry no id, so `Number(undefined)`
+ // is NaN — collapse any non-persisted value to undefined so downstream link /
+ // fetch sites (guarded by isPersistedBenchmarkId) skip it cleanly rather than
+ // emitting `?ids=NaN` or an `/inference/agentic/NaN` link.
+ const numericId = typeof row.id === 'number' ? row.id : Number(row.id);
return {
+ id: isPersistedBenchmarkId(numericId) ? numericId : undefined,
hw: row.hardware,
framework: row.framework,
model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
@@ -32,23 +77,43 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
mean_ttft: m.mean_ttft ?? 0,
median_ttft: m.median_ttft ?? 0,
std_ttft: m.std_ttft ?? 0,
+ p75_ttft: m.p75_ttft ?? 0,
+ p90_ttft: m.p90_ttft ?? 0,
+ p95_ttft: m.p95_ttft ?? 0,
p99_ttft: m.p99_ttft ?? 0,
+ 'p99.9_ttft': m['p99.9_ttft'] ?? 0,
mean_tpot: m.mean_tpot ?? 0,
median_tpot: m.median_tpot ?? 0,
std_tpot: m.std_tpot ?? 0,
+ p75_tpot: m.p75_tpot ?? 0,
+ p90_tpot: m.p90_tpot ?? 0,
+ p95_tpot: m.p95_tpot ?? 0,
p99_tpot: m.p99_tpot ?? 0,
+ 'p99.9_tpot': m['p99.9_tpot'] ?? 0,
mean_intvty: m.mean_intvty ?? 0,
median_intvty: m.median_intvty ?? 0,
std_intvty: m.std_intvty ?? 0,
+ p75_intvty: m.p75_intvty ?? 0,
+ p90_intvty: m.p90_intvty ?? 0,
+ p95_intvty: m.p95_intvty ?? 0,
p99_intvty: m.p99_intvty ?? 0,
+ 'p99.9_intvty': m['p99.9_intvty'] ?? 0,
mean_itl: m.mean_itl ?? 0,
median_itl: m.median_itl ?? 0,
std_itl: m.std_itl ?? 0,
+ p75_itl: m.p75_itl ?? 0,
+ p90_itl: m.p90_itl ?? 0,
+ p95_itl: m.p95_itl ?? 0,
p99_itl: m.p99_itl ?? 0,
+ 'p99.9_itl': m['p99.9_itl'] ?? 0,
mean_e2el: m.mean_e2el ?? 0,
median_e2el: m.median_e2el ?? 0,
std_e2el: m.std_e2el ?? 0,
+ p75_e2el: m.p75_e2el ?? 0,
+ p90_e2el: m.p90_e2el ?? 0,
+ p95_e2el: m.p95_e2el ?? 0,
p99_e2el: m.p99_e2el ?? 0,
+ 'p99.9_e2el': m['p99.9_e2el'] ?? 0,
// Measured GPU telemetry (runner's aggregate_power.py). Left undefined for
// rows predating the field so downstream chart code can distinguish
// "no measurement" from "0 W" via createChartDataPoint's typeof guard.
@@ -91,6 +156,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
date: row.date,
actualDate: (row as any).actualDate ?? row.date,
run_url: row.run_url ?? undefined,
+ benchmark_type: row.benchmark_type,
+ isl: row.isl,
+ osl: row.osl,
+ offload_mode: offloadMode,
+ server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate,
+ server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate,
+ theoretical_cache_hit_rate: m.theoretical_cache_hit_rate,
+ num_requests_total: m.num_requests_total,
+ num_requests_successful: m.num_requests_successful,
+ total_prompt_tokens: m.total_prompt_tokens,
+ total_generation_tokens: m.total_generation_tokens,
};
}
@@ -100,13 +176,62 @@ interface PreparedEntry {
date: string;
}
+/**
+ * Rewrite a chart x-axis key to use a different latency percentile prefix
+ * (`median_` → `p99_` etc). Only touches keys that start with a known
+ * percentile prefix; leaves everything else alone.
+ *
+ * Recognized prefixes are `mean`, `median`, `p75`, `p90`, `p95`, `p99` and
+ * `p99.9`, each followed by `_`. `p99` is listed before `p99.9` in the
+ * pattern, but the mandatory `_` makes the regex backtrack, so `p99.9_*`
+ * keys are still matched in full.
+ *
+ * @param key Metric key, e.g. `median_intvty`.
+ * @param percentile New prefix without the trailing underscore, e.g. `p99`.
+ *   It is used as a `String.prototype.replace` replacement string, so `$`
+ *   sequences in it would be interpreted as replacement patterns.
+ * @returns The key with its prefix swapped, or `key` unchanged when it does
+ *   not start with a recognized prefix.
+ */
+export function withPercentile(key: string, percentile: string): string {
+ return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
+}
+
+// Replacement granularity for single-run scoping: the changelog config_key
+// tuple (model-precision-hardware-framework) plus benchmark_type AND offload_mode.
+// benchmark_type keeps an agentic-only run from hiding the same config's
+// fixed-seq carry-forward; offload_mode keeps a run that produced only one
+// offload variant (e.g. offload=on) from claiming — and thereby suppressing —
+// the other variant's (offload=off) base rows, which are a distinct series.
+const runScopeKey = (r: BenchmarkRow): string =>
+ `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}|${r.offload_mode ?? 'off'}`;
+
+/**
+ * Merge run-scoped benchmark rows with the normal latest-per-config rows.
+ *
+ * When the user picks a specific workflow run (to disambiguate two same-day
+ * sweeps of the same config), only the configs that run actually produced
+ * should be pinned to it — every other config must keep its normal
+ * carry-forward rows. Scoping the whole chart to the run (the old behavior)
+ * silently hid complementary configs that happened to land on the same date,
+ * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve
+ * vanish because it lived in a different workflow run.
+ *
+ * Run rows win for every (model, precision, hardware, framework,
+ * benchmark_type, offload_mode) group they cover — the key built by
+ * `runScopeKey` — and base rows fill in the rest. A covered group is replaced
+ * wholesale: base rows in it are dropped even at concurrencies the run did
+ * not produce.
+ *
+ * @param runRows Rows produced by the selected workflow run.
+ * @param baseRows Normal latest-per-config rows.
+ * @returns `baseRows` itself (same array reference) when `runRows` is empty;
+ *   otherwise a new array with all run rows first, followed by the base rows
+ *   whose group the run did not claim.
+ */
+export function mergeRunScopedRows(
+ runRows: BenchmarkRow[],
+ baseRows: BenchmarkRow[],
+): BenchmarkRow[] {
+ if (runRows.length === 0) return baseRows;
+ const claimed = new Set(runRows.map(runScopeKey));
+ return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))];
+}
+
/**
* Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
* Returns one InferenceData[] per chart definition (e2e, interactivity).
*
* Converts rows to AggDataEntry once, then reuses for each chart definition.
+ *
+ * @param percentile Optional latency percentile for the chart x-axis
+ * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart
+ * definition for the chosen percentile — only agentic rows carry the
+ * full set (median/p90/p99/p99.9) so this mainly affects that scenario.
*/
-export function transformBenchmarkRows(rows: BenchmarkRow[]): {
+export function transformBenchmarkRows(
+ rows: BenchmarkRow[],
+ percentile = 'median',
+): {
chartData: InferenceData[][];
hardwareConfig: HardwareConfig;
} {
@@ -132,13 +257,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): {
// Phase 2: Build chart data per chart definition (reusing prepared entries)
const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => {
+ const xKey = withPercentile(chartDef.x, percentile);
const groupedByHw: Record = {};
for (const { entry, hwKey, date } of prepared) {
const dataPoint = createChartDataPoint(
date,
entry,
- chartDef.x as keyof AggDataEntry,
+ xKey as keyof AggDataEntry,
chartDef.y as keyof AggDataEntry,
hwKey,
);
diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts
index db569118..052d498f 100644
--- a/packages/app/src/lib/chart-utils.test.ts
+++ b/packages/app/src/lib/chart-utils.test.ts
@@ -353,30 +353,29 @@ describe('generateHighContrastColors', () => {
expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(','));
});
- // ---------- Tier 1: few items → brand zone ----------
-
- it('3 NVIDIA GPUs are not red', () => {
+ // ---------- Single vendor: full wheel for maximum contrast ----------
+ // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the
+ // vendors stay visually separable). With a single vendor there's no rival to
+ // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed
+ // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case
+ // where every series otherwise collapsed into the green brand band).
+
+ it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => {
const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark');
- for (const color of Object.values(result)) {
- expect(isNotReddish(parseRgb(color))).toBe(true);
- }
+ expect(Object.keys(result)).toHaveLength(3);
assertMinDist(result, 30);
});
- it('2 AMD GPUs are not green', () => {
+ it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => {
const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark');
- for (const color of Object.values(result)) {
- expect(isNotGreenish(parseRgb(color))).toBe(true);
- }
+ expect(Object.keys(result)).toHaveLength(2);
assertMinDist(result, 30);
});
- it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => {
+ it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => {
const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm'];
const result = generateHighContrastColors(keys, 'dark');
- for (const color of Object.values(result)) {
- expect(isNotReddish(parseRgb(color))).toBe(true);
- }
+ expect(Object.keys(result)).toHaveLength(4);
assertMinDist(result, 25);
});
@@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => {
assertMinDist(result, 25);
});
- // ---------- Tier 2: moderate items → full wheel minus rival color ----------
+ // ---------- Single vendor, many items → full wheel, best spacing ----------
- it('10 NVIDIA GPUs: no red hues, still distinguishable', () => {
+ it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => {
const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200'];
const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]);
const result = generateHighContrastColors(keys, 'dark');
- // Should not be reddish (banned)
- for (const color of Object.values(result)) {
- const rgb = parseRgb(color);
- // Not red-dominant with low green — i.e. not in the red/pink zone
- const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150;
- expect(isRedPink).toBe(false);
- }
+ expect(Object.keys(result)).toHaveLength(10);
assertMinDist(result, 20);
});
diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts
index 33a5b4e3..ce903fe0 100644
--- a/packages/app/src/lib/chart-utils.ts
+++ b/packages/app/src/lib/chart-utils.ts
@@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map();
/**
* Generates high-contrast colors using iwanthue (k-means in CIELab space).
*
- * Tiered strategy per vendor:
+ * Tiered strategy per vendor (only when >1 vendor is present):
* ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red)
* ≤ BAN_MAX → full wheel minus rival's brand color
* > BAN_MAX → full wheel, no restrictions, best spacing wins
+ *
+ * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 ×
+ * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a
+ * glance, but with one vendor there's no rival — clamping every series into the
+ * same narrow hue band just collapses the contrast HC is supposed to maximize.
+ * So skip both restrictions and use the full wheel, giving the series the widest
+ * possible separation.
*/
export const generateHighContrastColors = (
keys: string[],
@@ -91,6 +98,12 @@ export const generateHighContrastColors = (
list.push(key);
}
+ // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a
+ // single vendor present there's nothing to separate from, so those
+ // restrictions only shrink the usable hue range and kill contrast — open the
+ // full wheel instead (the common all-NVIDIA agentic comparison case).
+ const multiVendor = groups.size > 1;
+
for (const [vendor, vendorKeys] of groups) {
const count = vendorKeys.length;
const isBanned = BANNED_HUE_TEST[vendor] ?? null;
@@ -99,8 +112,8 @@ export const generateHighContrastColors = (
// Tier 1: few items → brand zone only
// Tier 2: moderate → full wheel minus rival color
// Tier 3: many → full wheel, no restrictions
- const usePreferred = preferred && count <= PREFERRED_MAX;
- const useBan = !usePreferred && isBanned && count <= BAN_MAX;
+ const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX;
+ const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX;
// Everything iwanthue's output depends on (the ban filter and preferred
// zone are functions of vendor; the seed is vendor+theme).
@@ -579,6 +592,20 @@ export const paretoFrontLowerRight = (points: InferenceData[]): InferenceData[]
return front;
};
+const PARETO_BY_DIRECTION = {
+ upper_right: paretoFrontUpperRight,
+ upper_left: paretoFrontUpperLeft,
+ lower_left: paretoFrontLowerLeft,
+ lower_right: paretoFrontLowerRight,
+} as const;
+
+export type ParetoDirection = keyof typeof PARETO_BY_DIRECTION;
+
+/** Look up the Pareto frontier function for a roofline direction. */
+export const paretoFrontForDirection = (
+ dir: ParetoDirection,
+): ((points: InferenceData[]) => InferenceData[]) => PARETO_BY_DIRECTION[dir];
+
/**
* Calculates the roofline for a given set of points.
*/
diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts
index f0f1ef5b..da81ca0e 100644
--- a/packages/app/src/lib/compare-pair-defaults.test.ts
+++ b/packages/app/src/lib/compare-pair-defaults.test.ts
@@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults';
function makeRow(overrides: Partial): BenchmarkRow {
return {
+ id: 1,
hardware: 'h100',
framework: 'sglang',
model: 'dsr1',
@@ -30,6 +31,8 @@ function makeRow(overrides: Partial): BenchmarkRow {
metrics: { tput_per_gpu: 100 },
date: '2026-01-01',
run_url: null,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
...overrides,
};
}
diff --git a/packages/app/src/lib/compare-pair-defaults.ts b/packages/app/src/lib/compare-pair-defaults.ts
index be6450ad..f5a37e1f 100644
--- a/packages/app/src/lib/compare-pair-defaults.ts
+++ b/packages/app/src/lib/compare-pair-defaults.ts
@@ -14,6 +14,7 @@ export function pickPairDefaults(
const seenB = new Map>();
for (const row of rows) {
if (row.hardware !== a && row.hardware !== b) continue;
+ if (row.isl === null || row.osl === null) continue;
const seq = islOslToSequence(row.isl, row.osl);
if (!seq) continue;
const key = `${seq}|${row.precision}`;
diff --git a/packages/app/src/lib/compare-ssr.test.ts b/packages/app/src/lib/compare-ssr.test.ts
index 5f2828ea..4bf99f89 100644
--- a/packages/app/src/lib/compare-ssr.test.ts
+++ b/packages/app/src/lib/compare-ssr.test.ts
@@ -4,8 +4,13 @@ import type { BenchmarkRow } from '@/lib/api';
import { computeCompareImageRows } from './compare-ssr';
+// BenchmarkRow.id is required (stable per-point id from benchmark_results);
+// hand out a fresh one per stub so id-keyed logic can't collide across rows.
+let nextStubId = 1;
+
function stubRow(overrides: Partial = {}): BenchmarkRow {
return {
+ id: nextStubId++,
hardware: 'h200',
framework: 'sglang',
model: 'dsr1',
@@ -23,6 +28,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow {
decode_num_workers: 0,
num_prefill_gpu: 8,
num_decode_gpu: 8,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
isl: 1024,
osl: 1024,
conc: 128,
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
index debbb788..8b691ee4 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
@@ -4,7 +4,7 @@ import { describe, expect, it } from 'vitest';
import type { ShapeKey } from '@/lib/chart-rendering';
-import { renderScatterPoints, syncPointShape } from './scatter-points';
+import { computeTooltipPosition, renderScatterPoints, syncPointShape } from './scatter-points';
interface TestPoint {
hwKey: string;
@@ -163,3 +163,51 @@ describe('syncPointShape', () => {
expect(g.selectAll('.visible-shape').size()).toBe(1);
});
});
+
+describe('computeTooltipPosition', () => {
+ it('keeps a tall pinned tooltip inside the visible viewport', () => {
+ const tooltipNode = document.createElement('div');
+ document.body.append(tooltipNode);
+ Object.defineProperty(tooltipNode, 'getBoundingClientRect', {
+ value: () => ({
+ width: 300,
+ height: 400,
+ left: 0,
+ top: 0,
+ right: 300,
+ bottom: 400,
+ x: 0,
+ y: 0,
+ toJSON: () => ({}),
+ }),
+ });
+
+ const container = document.createElement('div');
+ Object.defineProperties(container, {
+ clientWidth: { value: 800 },
+ clientHeight: { value: 600 },
+ getBoundingClientRect: {
+ value: () => ({
+ width: 800,
+ height: 600,
+ left: 100,
+ top: 600,
+ right: 900,
+ bottom: 1200,
+ x: 100,
+ y: 600,
+ toJSON: () => ({}),
+ }),
+ },
+ });
+ Object.defineProperties(document.documentElement, {
+ clientWidth: { configurable: true, value: 1280 },
+ clientHeight: { configurable: true, value: 720 },
+ });
+
+ expect(computeTooltipPosition(450, 100, d3.select(tooltipNode), container)).toEqual({
+ left: 560,
+ top: 316,
+ });
+ });
+});
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 0c316366..433ed6d1 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -107,17 +107,33 @@ export function renderScatterPoints {
+ text
+ .append('tspan')
+ .attr('x', 0)
+ .attr('dy', i === 0 ? `${firstDy}em` : '1.1em')
+ .text(line);
+ });
+ });
}
// Exit: remove stale points
@@ -150,20 +166,32 @@ export function renderScatterPoints('.point-label')
+ const lines = labelGetter(d).split('\n');
+ const text = d3
+ .select(this)
+ .selectAll('.point-label')
.data([true])
.join('text')
.attr('class', 'point-label')
- .attr('dy', -8)
.attr('text-anchor', 'middle')
.attr('fill', config.foreground!)
.attr('font-size', '10px')
- .attr('pointer-events', 'none')
- .text(config.getLabelText!(d));
+ .attr('font-weight', '700')
+ .attr('pointer-events', 'none');
+ const firstDy = -(0.8 + (lines.length - 1) * 1.1);
+ text
+ .selectAll('tspan')
+ .data(lines)
+ .join('tspan')
+ .attr('x', 0)
+ .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
+ .text((l) => l);
});
} else {
points.selectAll('.point-label').remove();
@@ -283,7 +311,22 @@ export function attachScatterTooltipHandlers<
});
}
-/** Compute tooltip left/top, flipping when it would overflow the chart container. */
+/**
+ * Compute tooltip left/top **in viewport coordinates** so the tooltip can be
+ * rendered via portal with `position: fixed`. Callers still pass cursor coords
+ * relative to `container` (matching `d3.pointer(event, container)`).
+ *
+ * Why viewport coords: the chart cards use `backdrop-filter`, which creates
+ * a stacking context. A tooltip painted inside the upper card's stacking
+ * context cannot rise above the lower card's stacking context regardless of
+ * its z-index. Portalling to document.body + `position: fixed` sidesteps the
+ * whole problem; we just need the coordinates in viewport space.
+ *
+ * Strategy: pick preferred side (right/below cursor), flip if it overflows the
+ * container, then clamp the final fixed coordinates to the viewport. The
+ * viewport clamp matters when a chart continues below the fold: container-
+ * local coordinates can otherwise place a pinned tooltip's actions offscreen.
+ */
export function computeTooltipPosition(
mx: number,
my: number,
@@ -302,11 +345,27 @@ export function computeTooltipPosition(
// Force reflow so we get real dimensions
const tw = node.getBoundingClientRect().width || node.offsetWidth;
const th = node.getBoundingClientRect().height || node.offsetHeight;
+ const rect = container.getBoundingClientRect();
const cw = container.clientWidth;
const ch = container.clientHeight;
+ const EDGE_PAD = 4;
+
+ // Prefer right of cursor; flip to left if no room.
+ let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw;
+ left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left));
+
+ // Prefer below cursor; flip above if no room.
+ let top = my + offset + th <= ch ? my + offset : my - offset - th;
+ top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
- const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset;
- const top = my + offset + th > ch ? my - offset - th : my + offset;
+ // Convert container-local coords → viewport coords for `position: fixed`,
+ // then keep the complete tooltip visible when its dimensions permit it.
+ const viewportWidth = document.documentElement.clientWidth || window.innerWidth;
+ const viewportHeight = document.documentElement.clientHeight || window.innerHeight;
+ left += rect.left;
+ top += rect.top;
+ left = Math.max(EDGE_PAD, Math.min(viewportWidth - tw - EDGE_PAD, left));
+ top = Math.max(EDGE_PAD, Math.min(viewportHeight - th - EDGE_PAD, top));
return { left, top };
}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 62208aa7..e217afbd 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -180,17 +180,73 @@ export enum Sequence {
OneK_OneK = '1k/1k',
OneK_EightK = '1k/8k',
EightK_OneK = '8k/1k',
+ AgenticTraces = 'agentic-traces',
}
-const SEQUENCE_CONFIG: Record =
- {
- [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' },
- [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' },
- [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' },
- };
+/**
+ * Top-level scenario kind. Fixed-seq sequences cluster under a single group
+ * in the selector; agentic traces sit alongside as their own kind.
+ */
+export type ScenarioKind = 'fixed-seq' | 'agentic';
+
+export function sequenceKind(seq: Sequence): ScenarioKind {
+ return seq === Sequence.AgenticTraces ? 'agentic' : 'fixed-seq';
+}
+
+const SEQUENCE_CONFIG: Record<
+ Sequence,
+ { label: string; compact: string; category: CategoryTag; kind: ScenarioKind }
+> = {
+ [Sequence.OneK_OneK]: {
+ label: '1K / 1K',
+ compact: '1k1k',
+ category: 'default',
+ kind: 'fixed-seq',
+ },
+ [Sequence.OneK_EightK]: {
+ label: '1K / 8K',
+ compact: '1k8k',
+ category: 'deprecated',
+ kind: 'fixed-seq',
+ },
+ [Sequence.EightK_OneK]: {
+ label: '8K / 1K',
+ compact: '8k1k',
+ category: 'default',
+ kind: 'fixed-seq',
+ },
+ [Sequence.AgenticTraces]: {
+ label: 'Agentic Traces',
+ compact: 'agentic',
+ category: 'default',
+ kind: 'agentic',
+ },
+};
export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
+/**
+ * Percentile of the latency distribution used for the chart x-axis when
+ * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9
+ * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl);
+ * p75 and p90 are surfaced in the UI.
+ */
+export enum Percentile {
+ P75 = 'p75',
+ P90 = 'p90',
+}
+
+const PERCENTILE_CONFIG: Record = {
+ [Percentile.P75]: { label: 'p75' },
+ [Percentile.P90]: { label: 'p90' },
+};
+
+export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
+
+export function getPercentileLabel(p: Percentile): string {
+ return PERCENTILE_CONFIG[p]?.label ?? p;
+}
+
export const DEPRECATED_SEQUENCES: ReadonlySet = new Set(
(Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][])
.filter(([, c]) => c.category === 'deprecated')
diff --git a/packages/app/src/lib/default-sequence.test.ts b/packages/app/src/lib/default-sequence.test.ts
new file mode 100644
index 00000000..4fd8a6b9
--- /dev/null
+++ b/packages/app/src/lib/default-sequence.test.ts
@@ -0,0 +1,119 @@
+import { describe, expect, it } from 'vitest';
+
+import { Sequence } from './data-mappings';
+import { resolveEffectiveSequence } from './default-sequence';
+
+describe('resolveEffectiveSequence', () => {
+ describe('availability gate (rule 1)', () => {
+ it('returns null while availability has not loaded, even if the selection looks valid', () => {
+ // Pre-availability, availableSequences is the static fallback (which
+ // contains AgenticTraces). Resolving here would fetch + label an agentic
+ // scenario for a fixed-seq-only model, so we hold off.
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.AgenticTraces,
+ availableSequences: [
+ Sequence.OneK_OneK,
+ Sequence.OneK_EightK,
+ Sequence.EightK_OneK,
+ Sequence.AgenticTraces,
+ ],
+ availabilityLoaded: false,
+ }),
+ ).toBeNull();
+ });
+
+ it('returns null pre-availability regardless of the selected sequence', () => {
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.EightK_OneK,
+ availableSequences: [Sequence.EightK_OneK],
+ availabilityLoaded: false,
+ }),
+ ).toBeNull();
+ });
+ });
+
+ describe('honors a valid selection (rule 2a)', () => {
+ it('keeps AgenticTraces when the model actually has agentic data (dsr1 case)', () => {
+ // DeepSeek-R1 in the seeded DB has both agentic and 8k/1k — the agentic
+ // default must survive so the PR intent (agentic-preferred) holds.
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.AgenticTraces,
+ availableSequences: [Sequence.EightK_OneK, Sequence.AgenticTraces],
+ availabilityLoaded: true,
+ }),
+ ).toBe(Sequence.AgenticTraces);
+ });
+
+ it('keeps a fixed-seq selection when available', () => {
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.OneK_OneK,
+ availableSequences: [Sequence.OneK_OneK, Sequence.EightK_OneK],
+ availabilityLoaded: true,
+ }),
+ ).toBe(Sequence.OneK_OneK);
+ });
+ });
+
+ describe('fallback ordering when the selection is unavailable (rule 2b/2c)', () => {
+ it('for a fixed-seq-only model, agentic default falls back to 8k/1k, not the raw first entry (llama70b case)', () => {
+ // Llama-3.3-70B has only 8k/1k in the seeded DB. The agentic default is
+ // unavailable, so it must resolve to a fixed-seq scenario — here the sole
+ // available one.
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.AgenticTraces,
+ availableSequences: [Sequence.EightK_OneK],
+ availabilityLoaded: true,
+ }),
+ ).toBe(Sequence.EightK_OneK);
+ });
+
+ it('prefers 8k/1k over availableSequences[0] when both 1k/1k and 8k/1k exist', () => {
+ // DB row order can surface 1k/1k first. Master defaulted non-agentic
+ // models to 8k/1k, so prefer it rather than snapping to 1k/1k.
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.AgenticTraces,
+ availableSequences: [Sequence.OneK_OneK, Sequence.EightK_OneK],
+ availabilityLoaded: true,
+ }),
+ ).toBe(Sequence.EightK_OneK);
+ });
+
+ it('falls back to availableSequences[0] when 8k/1k is not available', () => {
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.AgenticTraces,
+ availableSequences: [Sequence.OneK_OneK, Sequence.OneK_EightK],
+ availabilityLoaded: true,
+ }),
+ ).toBe(Sequence.OneK_OneK);
+ });
+
+ it('never resolves to AgenticTraces via fallback when the model lacks it', () => {
+ const result = resolveEffectiveSequence({
+ selectedSequence: Sequence.AgenticTraces,
+ availableSequences: [Sequence.OneK_OneK, Sequence.OneK_EightK, Sequence.EightK_OneK],
+ availabilityLoaded: true,
+ });
+ expect(result).not.toBe(Sequence.AgenticTraces);
+ expect(result).toBe(Sequence.EightK_OneK);
+ });
+
+ it('returns the selection itself when the model has no sequences at all', () => {
+ // Degenerate case: keeps a non-null value so the type contract holds; the
+ // chart shows empty. (availabilityLoaded true but zero sequences.)
+ expect(
+ resolveEffectiveSequence({
+ selectedSequence: Sequence.OneK_OneK,
+ availableSequences: [],
+ availabilityLoaded: true,
+ }),
+ ).toBe(Sequence.OneK_OneK);
+ });
+ });
+});
diff --git a/packages/app/src/lib/default-sequence.ts b/packages/app/src/lib/default-sequence.ts
new file mode 100644
index 00000000..d06a5307
--- /dev/null
+++ b/packages/app/src/lib/default-sequence.ts
@@ -0,0 +1,52 @@
+import { Sequence } from './data-mappings';
+
+/**
+ * Effective-sequence resolution.
+ *
+ * `selectedSequence` defaults to {@link Sequence.AgenticTraces} (a deliberate
+ * product choice — agentic-preferred), but not every model has agentic data.
+ * This helper turns the raw user/default selection into the sequence the chart
+ * should actually render, given what the selected model offers.
+ *
+ * Two rules, in order:
+ *
+ * 1. **Availability gate.** Until availability rows have loaded we do NOT know
+ * which sequences the model has. Resolving eagerly here would pick the static
+ * fallback list (which contains AgenticTraces) and make the page fetch + label
+ * an agentic scenario for fixed-seq-only models (e.g. Llama-3.3-70B), then
+ * snap to a fixed-seq scenario once availability arrives — a visible flash of
+ * "Agentic Traces" plus a wasted request. When `availabilityLoaded` is false
+ * we return `null`; callers gate data fetching and selector display on a
+ * non-null result (a loading skeleton covers this window, which is short).
+ *
+ * 2. **Fallback ordering.** Once availability is known: keep the user's
+ * `selectedSequence` if the model has it. Otherwise fall back to a sensible
+ * fixed-seq scenario. `availableSequences[0]` follows DB row order, which can
+ * surface `1k/1k` even when `8k/1k` exists — but `8k/1k` was the pre-agentic
+ * default for non-agentic models, so prefer it when present to match that
+ * long-standing behavior. Only if neither the selection nor `8k/1k` is
+ * available do we fall to `availableSequences[0]`.
+ */
+export function resolveEffectiveSequence({
+ selectedSequence,
+ availableSequences,
+ availabilityLoaded,
+}: {
+ selectedSequence: Sequence;
+ availableSequences: Sequence[];
+ availabilityLoaded: boolean;
+}): Sequence | null {
+ // Rule 1: do not commit to a sequence before we know what the model has.
+ if (!availabilityLoaded) return null;
+
+ // Rule 2a: honor the user's / default selection when the model supports it.
+ if (availableSequences.includes(selectedSequence)) return selectedSequence;
+
+ // Rule 2b: prefer 8k/1k (the pre-agentic default for non-agentic models) over
+ // whatever availableSequences[0] happens to be (DB row order can yield 1k/1k).
+ if (availableSequences.includes(Sequence.EightK_OneK)) return Sequence.EightK_OneK;
+
+ // Rule 2c: last resort — first available, or the selection itself if the model
+ // has no sequences at all (keeps the type non-null; downstream shows empty).
+ return availableSequences[0] ?? selectedSequence;
+}
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 28cc1e36..2f5844c1 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,23 +57,43 @@ function makeEntry(overrides: Partial = {}): AggDataEntry {
mean_ttft: 0.5,
median_ttft: 0.4,
std_ttft: 0.1,
+ p75_ttft: 0.65,
+ p90_ttft: 0.7,
+ p95_ttft: 0.75,
p99_ttft: 0.8,
+ 'p99.9_ttft': 0.9,
mean_tpot: 0.02,
mean_intvty: 45,
median_tpot: 0.02,
median_intvty: 44,
std_tpot: 0.005,
std_intvty: 5,
+ p75_tpot: 0.022,
+ p75_intvty: 50,
+ p90_tpot: 0.025,
+ p90_intvty: 55,
+ p95_tpot: 0.028,
+ p95_intvty: 58,
p99_tpot: 0.03,
p99_intvty: 60,
+ 'p99.9_tpot': 0.035,
+ 'p99.9_intvty': 65,
mean_itl: 0.01,
median_itl: 0.01,
std_itl: 0.002,
+ p75_itl: 0.012,
+ p90_itl: 0.013,
+ p95_itl: 0.014,
p99_itl: 0.015,
+ 'p99.9_itl': 0.018,
mean_e2el: 5,
median_e2el: 4.8,
std_e2el: 0.5,
+ p75_e2el: 5.2,
+ p90_e2el: 5.5,
+ p95_e2el: 5.8,
p99_e2el: 6,
+ 'p99.9_e2el': 6.5,
disagg: false,
num_prefill_gpu: 0,
num_decode_gpu: 0,
diff --git a/packages/app/src/lib/url-state.test.ts b/packages/app/src/lib/url-state.test.ts
index e34b32b4..fe26072f 100644
--- a/packages/app/src/lib/url-state.test.ts
+++ b/packages/app/src/lib/url-state.test.ts
@@ -30,9 +30,13 @@ describe('PARAM_DEFAULTS', () => {
expect(PARAM_DEFAULTS.g_model).toBe('DeepSeek-V4-Pro');
});
- it('has expected default for i_seq', async () => {
+ it('has an EMPTY default for i_seq so the selected scenario is always written', async () => {
+ // The UI default scenario (gate-unlocked) is AgenticTraces, not 8k/1k. An
+ // '8k/1k' default would strip an explicit 8K/1K selection from the URL, which
+ // then resolves back to the agentic default on reload/share. Empty means no
+ // scenario value ever matches the default, so it's always persisted.
const { PARAM_DEFAULTS } = await import('@/lib/url-state');
- expect(PARAM_DEFAULTS.i_seq).toBe('8k/1k');
+ expect(PARAM_DEFAULTS.i_seq).toBe('');
});
it('has expected default for r_range', async () => {
@@ -182,6 +186,28 @@ describe('writeUrlParams + buildShareUrl', () => {
expect(url).not.toContain('g_model');
});
+ it('keeps an explicit i_seq=8k/1k in the share URL (no longer stripped as a default)', async () => {
+ setupWindow('', '/inference');
+ const { writeUrlParams, buildShareUrl } = await import('@/lib/url-state');
+
+ // Picking the fixed-seq scenario must survive into the share URL; before the
+ // fix this matched the '8k/1k' default and was dropped, reverting to agentic.
+ writeUrlParams({ i_seq: '8k/1k' });
+ await vi.advanceTimersByTimeAsync(200);
+
+ expect(buildShareUrl()).toContain('i_seq=8k%2F1k');
+ });
+
+ it('still strips i_seq when it is empty (the no-selection case)', async () => {
+ setupWindow('', '/inference');
+ const { writeUrlParams, buildShareUrl } = await import('@/lib/url-state');
+
+ writeUrlParams({ i_seq: '' });
+ await vi.advanceTimersByTimeAsync(200);
+
+ expect(buildShareUrl()).not.toContain('i_seq');
+ });
+
it('batches multiple params in a single debounce window', async () => {
setupWindow('', '/inference');
const { writeUrlParams, buildShareUrl } = await import('@/lib/url-state');
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index c78bf588..3671b6b8 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -22,8 +22,10 @@ const URL_STATE_KEYS = [
'i_seq',
'i_prec',
'i_metric',
+ 'i_pctl',
'i_xmetric',
'i_e2e_xmetric',
+ 'i_xmode',
'i_scale',
'i_gpus',
'i_dates',
@@ -72,14 +74,22 @@ export const PARAM_DEFAULTS: Record = {
g_model: 'DeepSeek-V4-Pro',
g_rundate: '',
g_runid: '',
- i_seq: '8k/1k',
+ // No strippable default: the UI default scenario (gate-unlocked) is
+ // AgenticTraces, not 8k/1k, so an '8k/1k' default here would strip an explicit
+ // 8K/1K selection from the URL — on reload the empty i_seq resolves back to the
+ // agentic default. Empty means the resolved scenario is ALWAYS written
+ // explicitly (effectiveSequence is never ''), so a shared/reloaded link keeps
+ // whatever the user picked. The no-param case still resolves via availability.
+ i_seq: '',
// No strippable default: precision is only written to the URL once chosen
// explicitly, so an explicit FP4 selection must survive (not be stripped as a
// "default") or it would silently revert to the per-model auto default on reload.
i_prec: '',
i_metric: 'y_tpPerGpu',
- i_xmetric: 'p99_ttft',
- i_e2e_xmetric: '',
+ i_pctl: 'p90',
+ i_xmetric: 'p90_ttft',
+ i_e2e_xmetric: 'p90_ttft',
+ i_xmode: '',
i_scale: 'auto',
i_gpus: '',
i_dates: '',
diff --git a/packages/app/tsconfig.json b/packages/app/tsconfig.json
index 8b658cad..3044b60c 100644
--- a/packages/app/tsconfig.json
+++ b/packages/app/tsconfig.json
@@ -29,7 +29,9 @@
"**/*.tsx",
".next/types/**/*.ts",
"json-custom-types.d.ts",
- ".next/dev/types/**/*.ts"
+ ".next/dev/types/**/*.ts",
+ ".next-e2e/types/**/*.ts",
+ ".next-e2e/dev/types/**/*.ts"
],
"exclude": ["node_modules"]
}
diff --git a/packages/constants/src/agentic.ts b/packages/constants/src/agentic.ts
new file mode 100644
index 00000000..42eab306
--- /dev/null
+++ b/packages/constants/src/agentic.ts
@@ -0,0 +1,2 @@
+/** Fixed output length used by the experimental normalized-E2E chart metric. */
+export const NORMALIZED_E2E_OUTPUT_TOKENS = 400;
diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts
index 6c775be4..74cbce3f 100644
--- a/packages/constants/src/framework-aliases.ts
+++ b/packages/constants/src/framework-aliases.ts
@@ -46,6 +46,7 @@ export const FRAMEWORK_LABELS: Record = {
]),
),
mtp: 'MTP',
+ aiperf: 'AIPerf',
};
/**
diff --git a/packages/constants/src/index.ts b/packages/constants/src/index.ts
index e767e500..7d3d6783 100644
--- a/packages/constants/src/index.ts
+++ b/packages/constants/src/index.ts
@@ -1,3 +1,4 @@
+export * from './agentic';
export * from './framework-aliases';
export * from './github';
export * from './gpu-keys';
diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index 7fa88c97..914eed4b 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -1,48 +1,117 @@
/**
* Canonical set of metric keys stored in the benchmark_results.metrics JSONB column.
*
- * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU.
+ * Latency values (ttft/tpot/itl/e2el) are in seconds; intvty (interactivity) is a rate, not a
+ * duration. Throughput is tokens/sec — `_per_gpu` is per-GPU, `_tps` is the deployment total.
+ *
+ * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency,
+ * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs
+ * carry median/mean/p99/std for latency only.
*/
export const METRIC_KEYS = new Set([
// throughput (tokens/sec/GPU)
'tput_per_gpu',
'output_tput_per_gpu',
'input_tput_per_gpu',
+ // throughput (tokens/sec, deployment total) — agentic aiperf reports both
+ 'total_tput_tps',
+ 'output_tput_tps',
+ 'input_tput_tps',
// TTFT — time to first token
'median_ttft',
'mean_ttft',
+ 'p75_ttft',
'p90_ttft',
+ 'p95_ttft',
'p99_ttft',
'p99.9_ttft',
'std_ttft',
// TPOT — time per output token
'median_tpot',
'mean_tpot',
+ 'p75_tpot',
'p90_tpot',
+ 'p95_tpot',
'p99_tpot',
'p99.9_tpot',
'std_tpot',
// ITL — inter-token latency
'median_itl',
'mean_itl',
+ 'p75_itl',
'p90_itl',
+ 'p95_itl',
'p99_itl',
'p99.9_itl',
'std_itl',
// E2EL — end-to-end latency
'median_e2el',
'mean_e2el',
+ 'p75_e2el',
'p90_e2el',
+ 'p95_e2el',
'p99_e2el',
'p99.9_e2el',
'std_e2el',
// interactivity
'median_intvty',
'mean_intvty',
+ 'p75_intvty',
'p90_intvty',
+ 'p95_intvty',
'p99_intvty',
'p99.9_intvty',
'std_intvty',
+ // QPS — queries per second (agentic aiperf)
+ 'median_qps',
+ 'mean_qps',
+ 'p75_qps',
+ 'p90_qps',
+ 'p95_qps',
+ 'p99_qps',
+ 'p99.9_qps',
+ 'std_qps',
+ // per-request input token count distribution
+ 'median_input_tokens',
+ 'mean_input_tokens',
+ 'p75_input_tokens',
+ 'p90_input_tokens',
+ 'p95_input_tokens',
+ 'p99_input_tokens',
+ 'p99.9_input_tokens',
+ 'std_input_tokens',
+ // per-request output token count distribution — actual served
+ 'median_output_tokens_actual',
+ 'mean_output_tokens_actual',
+ 'p75_output_tokens_actual',
+ 'p90_output_tokens_actual',
+ 'p95_output_tokens_actual',
+ 'p99_output_tokens_actual',
+ 'p99.9_output_tokens_actual',
+ 'std_output_tokens_actual',
+ // per-request output token count distribution — expected from trace
+ 'median_output_tokens_expected',
+ 'mean_output_tokens_expected',
+ 'p75_output_tokens_expected',
+ 'p90_output_tokens_expected',
+ 'p95_output_tokens_expected',
+ 'p99_output_tokens_expected',
+ 'p99.9_output_tokens_expected',
+ 'std_output_tokens_expected',
+ // run totals (agentic aiperf)
+ 'duration_seconds',
+ 'total_requests_completed',
+ 'total_prompt_tokens',
+ 'total_generation_tokens',
+ // server prefix-cache observability (agentic aiperf)
+ 'server_gpu_cache_hit_rate',
+ 'server_cpu_cache_hit_rate',
+ 'server_external_cache_hit_rate',
+ 'theoretical_cache_hit_rate',
+ // server KV-cache occupancy — mean GPU KV-cache usage fraction (0-1) over the
+ // profiling window (agentic aiperf; flat in v2 artifacts, mapped from
+ // server_metrics.kv_cache.gpu_usage_pct in v3)
+ 'gpu_kv_cache_usage_pct',
// measured power / energy (emitted by runner's aggregate_power.py)
// avg_power_w: mean per-GPU draw (W) during the load window
// joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE on
diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts
index 06dfa09b..9622fe8c 100644
--- a/packages/constants/src/models.ts
+++ b/packages/constants/src/models.ts
@@ -56,3 +56,20 @@ export function islOslToSequence(isl: number, osl: number): string | null {
};
return map[`${isl}_${osl}`] ?? null;
}
+
+/**
+ * Map a benchmark/availability row to its sequence (scenario) string.
+ * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl.
+ * - Other rows (today: `single_turn`) fall back to `islOslToSequence`.
+ * Returns `null` for rows that can't be classified (e.g. `single_turn` with
+ * unmapped isl/osl values).
+ */
+export function rowToSequence(row: {
+ isl: number | null;
+ osl: number | null;
+ benchmark_type: string;
+}): string | null {
+ if (row.benchmark_type === 'agentic_traces') return 'agentic-traces';
+ if (row.isl === null || row.osl === null) return null;
+ return islOslToSequence(row.isl, row.osl);
+}
diff --git a/packages/constants/src/tables.ts b/packages/constants/src/tables.ts
index 60e85182..f482fd5e 100644
--- a/packages/constants/src/tables.ts
+++ b/packages/constants/src/tables.ts
@@ -2,6 +2,7 @@
export const TABLE_NAMES = {
configs: 'configs',
workflowRuns: 'workflow_runs',
+ agenticTraceReplay: 'agentic_trace_replay',
benchmarkResults: 'benchmark_results',
serverLogs: 'server_logs',
runStats: 'run_stats',
@@ -9,21 +10,38 @@ export const TABLE_NAMES = {
evalSamples: 'eval_samples',
changelogEntries: 'changelog_entries',
availability: 'availability',
+ datasets: 'datasets',
+ datasetConversations: 'dataset_conversations',
+ runDatasets: 'run_datasets',
schemaMigrations: 'schema_migrations',
} as const;
/**
* Data tables in FK-safe insertion order.
* Parents before children — safe for dump, load, and (reversed) reset.
+ *
+ * FK edges enforced by this ordering (verified against migration 008_agentic.sql
+ * and the live schema's pg_constraint):
+ * - benchmark_results.trace_replay_id → agentic_trace_replay(id)
+ * ⇒ agentic_trace_replay before benchmark_results
+ * - dataset_conversations.dataset_id → datasets(id)
+ * ⇒ datasets before dataset_conversations
+ * - run_datasets.workflow_run_id → workflow_runs(id)
+ * ⇒ workflow_runs before run_datasets (run_datasets.dataset_slug is a
+ * plain slug, NOT an FK to datasets, so it needs no ordering vs datasets)
*/
export const TABLE_INSERT_ORDER = [
TABLE_NAMES.configs,
TABLE_NAMES.serverLogs,
TABLE_NAMES.workflowRuns,
+ TABLE_NAMES.agenticTraceReplay,
TABLE_NAMES.benchmarkResults,
TABLE_NAMES.evalResults,
TABLE_NAMES.evalSamples,
TABLE_NAMES.runStats,
TABLE_NAMES.changelogEntries,
TABLE_NAMES.availability,
+ TABLE_NAMES.datasets,
+ TABLE_NAMES.datasetConversations,
+ TABLE_NAMES.runDatasets,
] as const;
diff --git a/packages/db/migrations/008_agentic.sql b/packages/db/migrations/008_agentic.sql
new file mode 100644
index 00000000..eceea82e
--- /dev/null
+++ b/packages/db/migrations/008_agentic.sql
@@ -0,0 +1,326 @@
+-- 008_agentic.sql
+--
+-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx
+-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts
+-- after master's highest migration (006_benchmark_results_workers), so the
+-- branch's numbering no longer collides with master's 002-006. None of the
+-- collapsed migrations had been applied to any deployed database.
+--
+-- Statement order is preserved exactly. The latest_benchmarks recreate uses
+-- 'select br.*', so it retains every benchmark_results column added earlier
+-- (including master's 'workers' from 006) and re-keys the view on offload_mode.
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 002_agentic_scenario.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+ alter column isl drop not null,
+ alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+ drop constraint benchmark_results_isl_positive,
+ drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+ add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+ add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
+-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+ drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+ add constraint benchmark_results_unique unique nulls not distinct
+ (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 003_agentic_availability.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The section above (formerly 002) relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
+-- equivalent except it allows isl/osl to be NULL for agentic rows.
+
+alter table availability
+ drop constraint availability_pkey;
+
+alter table availability
+ alter column isl drop not null,
+ alter column osl drop not null,
+ add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+ add constraint availability_natural_key unique nulls not distinct
+ (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 004_offload_mode.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+ add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value already living in metrics
+-- JSONB (set during the earlier agentic ingest backfill); absent/JSON-null values keep 'off'.
+update benchmark_results
+  set offload_mode = metrics->>'offload_mode'
+  where benchmark_type = 'agentic_traces'
+    and metrics->>'offload_mode' is not null;
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+ drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+ add constraint benchmark_results_unique unique nulls not distinct
+ (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+ br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+ on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 006_agentic_trace_replay.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_` artifact:
+-- - profile_export.jsonl (~2 MB raw, per-request data)
+-- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+ id bigserial primary key,
+ -- gzip(profile_export.jsonl); null when only the server metrics file existed
+ profile_export_jsonl_gz bytea,
+ profile_export_uncompressed_size bigint,
+ -- raw csv bytes; null when only the profile file existed
+ server_metrics_csv bytea,
+ server_metrics_csv_size bigint,
+ created_at timestamptz not null default now()
+);
+
+alter table benchmark_results
+ add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+ on benchmark_results (trace_replay_id)
+ where trace_replay_id is not null;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 007_agentic_trace_server_metrics_json.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+ add column server_metrics_json_gz bytea,
+ add column server_metrics_json_uncompressed_size bigint;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 008_agentic_aggregate_stats.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+ add column aggregate_stats jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 009_agentic_chart_series.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (added above; formerly migration 008): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+ add column chart_series jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 010_agentic_request_timeline.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (formerly 008) and `chart_series` (formerly 009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+ add column request_timeline jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 011_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
+-- the agentic benchmarks replay) + their per-conversation trace structure.
+--
+-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
+-- not the source traces. These two tables back the new /datasets area: a
+-- registry of ingested dataset versions with precomputed summary + chart data,
+-- and one row per conversation holding a flamegraph-ready `structure` (turns +
+-- subagent groups with input split into cached-prefix vs uncached-suffix). The
+-- raw hash_ids are NOT stored — they're only needed at ingest to derive the
+-- cached/uncached split, so the runtime read is a single small JSONB.
+--
+-- Additive only. To revert this migration:
+-- drop table if exists dataset_conversations;
+-- drop table if exists datasets;
+-- (and see the run_datasets revert below; this is all one migration now:
+-- delete from schema_migrations where filename = '008_agentic.sql';)
+
+create table datasets (
+ -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
+ id text primary key,
+ -- URL key, e.g. 'cc-traces-weka-062126'.
+ slug text not null unique,
+ label text not null,
+ -- 'full' | '256k' | 'no-subagents' (the published variants).
+ variant text not null default 'full',
+ description text,
+ hf_url text,
+ license text,
+ conversation_count integer not null default 0,
+ -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
+ summary jsonb not null default '{}'::jsonb,
+ -- Precomputed distributions for the dataset-detail cards (input/output length,
+ -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
+ chart_data jsonb not null default '{}'::jsonb,
+ dataset_version integer not null default 1,
+ ingested_at timestamptz not null default now()
+);
+
+create table dataset_conversations (
+ id bigserial primary key,
+ dataset_id text not null references datasets(id) on delete cascade,
+ -- The conversation id from the dataset record (trace id).
+ conv_id text not null,
+ models text[] not null default '{}',
+ num_turns integer not null default 0,
+ num_subagent_groups integer not null default 0,
+ total_in bigint not null default 0,
+ total_out bigint not null default 0,
+ total_cached bigint not null default 0,
+ -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
+ -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
+ structure jsonb not null,
+ unique (dataset_id, conv_id)
+);
+
+create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 012_run_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Maps a benchmark workflow_run to the source dataset it replayed, so the
+-- agentic detail page can deep-link each request in the timeline to the exact
+-- conversation in the /datasets viewer (the request's conversation_id, with any
+-- ::sa:/::fa: suffix stripped, is the dataset conv_id).
+--
+-- One row per workflow_run (every benchmark in a run replays the same dataset).
+-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/
+-- URL) rather than an FK, so the mapping can be recorded before/independent of
+-- the dataset being ingested; the UI degrades gracefully if the slug is absent.
+--
+-- Additive only. To revert this whole squashed migration:
+-- drop table if exists run_datasets;
+-- drop table if exists dataset_conversations;
+-- drop table if exists datasets;
+-- drop table if exists agentic_trace_replay cascade;
+-- (plus the benchmark_results/availability column + constraint changes above)
+-- delete from schema_migrations where filename = '008_agentic.sql';
+
+create table run_datasets (
+ workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
+ dataset_slug text not null,
+ created_at timestamptz not null default now()
+);
diff --git a/packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql
new file mode 100644
index 00000000..039dfe09
--- /dev/null
+++ b/packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql
@@ -0,0 +1,49 @@
+-- ============================================================
+-- LATEST_BENCHMARKS — one run per line (no cross-run stitching)
+-- ============================================================
+--
+-- Previously the view did `distinct on (config_id, conc, isl, osl, offload_mode)` ordered by
+-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run
+-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it
+-- skipped fell back to an older run that did measure them, and a single chart line
+-- ended up stitched from points produced by different runs on different dates.
+--
+-- A line is one config + sequence + offload mode
+-- (config_id, benchmark_type, isl, osl, offload_mode) plotted
+-- across concurrencies, and it must come from a SINGLE workflow run. We pick the
+-- newest run per line (newest date, then latest sweep by run_started_at, then
+-- highest workflow_run_id so exactly one run wins even on a same-day / null tie),
+-- then keep EVERY concurrency that one run measured. A partial re-sweep therefore
+-- truncates the line to its own concurrencies rather than borrowing an older run's.
+
+drop materialized view if exists latest_benchmarks;
+
+create materialized view latest_benchmarks as
+with winners as (
+ select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+ br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+ br.workflow_run_id as winning_run_id
+ from benchmark_results br
+ join latest_workflow_runs wr on wr.id = br.workflow_run_id
+ where br.error is null
+ order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+ br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc
+)
+select br.*
+from benchmark_results br
+join winners w
+ on w.config_id = br.config_id
+ and w.benchmark_type = br.benchmark_type
+ and w.isl is not distinct from br.isl
+ and w.osl is not distinct from br.osl
+ and w.offload_mode = br.offload_mode
+ and w.winning_run_id = br.workflow_run_id
+where br.error is null;
+
+-- Unique key now includes benchmark_type (part of the line key). One run per line
+-- guarantees one row per concurrency, so this stays unique and keeps
+-- REFRESH MATERIALIZED VIEW CONCURRENTLY working.
+create unique index latest_benchmarks_pk
+ on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode)
+ nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/migrations/010_dataset_request_stats.sql b/packages/db/migrations/010_dataset_request_stats.sql
new file mode 100644
index 00000000..0b7c11bb
--- /dev/null
+++ b/packages/db/migrations/010_dataset_request_stats.sql
@@ -0,0 +1,55 @@
+-- Backfill dataset-level requests/conversation statistics.
+-- A request is one actual model call: each top-level turn plus each child turn
+-- inside a subagent group. The group container itself is not a request.
+
+with per_conversation as (
+ select
+ dc.dataset_id,
+ dc.num_subagent_groups,
+ (
+ dc.num_turns + coalesce((
+ select sum(jsonb_array_length(node.value->'children'))
+ from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) as node(value)
+ where node.value->>'kind' = 'subagent'
+ ), 0)
+ )::double precision as request_count
+ from dataset_conversations dc
+), request_stats as (
+ select
+ dataset_id,
+ avg(request_count) as mean_requests,
+ percentile_cont(0.5) within group (order by request_count) as median_requests,
+ avg(num_subagent_groups::double precision) as mean_subagents,
+ percentile_cont(0.5) within group (order by num_subagent_groups) as median_subagents
+ from per_conversation
+ group by dataset_id
+)
+update datasets d
+set summary = jsonb_set(
+ jsonb_set(
+ jsonb_set(
+ jsonb_set(
+ jsonb_set(
+ d.summary,
+ '{meanRequestsPerConversation}',
+ to_jsonb(request_stats.mean_requests),
+ true
+ ),
+ '{medianRequestsPerConversation}',
+ to_jsonb(request_stats.median_requests),
+ true
+ ),
+ '{meanSubagentsPerTrace}',
+ to_jsonb(request_stats.mean_subagents),
+ true
+ ),
+ '{medianSubagentsPerTrace}',
+ to_jsonb(request_stats.median_subagents),
+ true
+ ),
+ '{version}',
+ '3'::jsonb,
+ true
+)
+from request_stats
+where d.id = request_stats.dataset_id;
diff --git a/packages/db/package.json b/packages/db/package.json
index 8789f48b..2c8dc067 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,6 +19,10 @@
"db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
"db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
"db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+ "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
+ "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+ "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts",
+ "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
"db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
"db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
"db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
@@ -30,11 +34,14 @@
"@neondatabase/serverless": "^1.1.0",
"@noble/ciphers": "^2.2.0",
"@semianalysisai/inferencex-constants": "workspace:*",
- "postgres": "^3.4.9"
+ "postgres": "^3.4.9",
+ "stream-chain": "^3.4.0",
+ "stream-json": "^2.1.0"
},
"devDependencies": {
"@types/adm-zip": "^0.5.8",
"@types/node": "^26.0.1",
+ "@types/stream-json": "^1.7.8",
"@vitest/coverage-v8": "^4.1.9",
"adm-zip": "^0.5.18",
"dotenv-cli": "^11.0.0",
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
new file mode 100644
index 00000000..5896529b
--- /dev/null
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -0,0 +1,125 @@
+/**
+ * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it
+ * or were computed by an older `STATS_VERSION`.
+ *
+ * The ingest path now computes stats inline, but existing rows (and rows
+ * whose computation logic has since changed) still need this pass. Run after the agentic schema migration and any time `STATS_VERSION` bumps.
+ *
+ * Strategy:
+ * - Stream rows one at a time (server_metrics_json_gz can be hundreds of
+ * MB decompressed for TP+EP / high-conc points — keeping one in memory
+ * at a time avoids OOM).
+ * - Skip rows whose stored `aggregate_stats.version` already matches.
+ * - Recompute via the same `computeAggregateStats()` helper the ingest
+ * path uses, so behavior cannot drift.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats
+ * [--limit N] only process the first N candidate rows (useful for
+ * smoke-tests on a fresh deploy)
+ * [--force] recompute every row, even if version already matches
+ * [--yes] skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import {
+ computeAggregateStats,
+ mergeProfileStatsUpgrade,
+ STATS_VERSION,
+ type AggregateStats,
+} from './etl/compute-aggregate-stats.js';
+import { createAdminSql } from './etl/db-utils.js';
+import {
+ confirmProceed,
+ jsonbParam,
+ parseLimitForceFlags,
+ runBackfillMain,
+ runPerIdBackfill,
+} from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+ noSsl: hasNoSslFlag(),
+ max: 1,
+ onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-aggregate-stats ===');
+  console.log(` STATS_VERSION = ${STATS_VERSION}`);
+  console.log(` force = ${flags.force}`);
+  console.log(` limit = ${flags.limit ?? 'none'}`);
+
+  // Find candidates: rows missing stats, or whose stored version is stale.
+  // A missing/NULL version makes the bare `<>` comparison yield NULL (not
+  // true), which would drop the row; coalesce to -1 so it counts as stale.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where aggregate_stats is null
+          or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION}
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return;
+
+  await runPerIdBackfill(
+    candidates.map((c) => c.id),
+    async (id) => {
+      // Load only the profile blob + stored stats first; server_metrics_json_gz is fetched below only when needed.
+      const [row] = await sql<
+        { profile_export_jsonl_gz: Buffer | null; aggregate_stats: AggregateStats | null }[]
+      >`
+        select profile_export_jsonl_gz, aggregate_stats
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(` id=${id}: row vanished, skipping`);
+        return 'skipped';
+      }
+      // Stored v3 stats: recompute from the profile blob only and merge; otherwise recompute from both blobs.
+      let stats: AggregateStats;
+      if (row.aggregate_stats?.version === 3) {
+        const profileStats = await computeAggregateStats({
+          profileBlob: row.profile_export_jsonl_gz,
+          serverBlob: null,
+        });
+        stats = mergeProfileStatsUpgrade(row.aggregate_stats, profileStats);
+      } else {
+        const [serverRow] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+          select server_metrics_json_gz
+          from agentic_trace_replay
+          where id = ${id}
+        `;
+        stats = await computeAggregateStats({
+          profileBlob: row.profile_export_jsonl_gz,
+          serverBlob: serverRow?.server_metrics_json_gz ?? null,
+        });
+      }
+
+      await sql`
+        update agentic_trace_replay
+        set aggregate_stats = ${jsonbParam(sql, stats)}
+        where id = ${id}
+      `;
+      return 'ok';
+    },
+  );
+}
+
+runBackfillMain('backfill-aggregate-stats', sql, main);
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
new file mode 100644
index 00000000..94e1700d
--- /dev/null
+++ b/packages/db/src/backfill-chart-series.ts
@@ -0,0 +1,124 @@
+/**
+ * Backfill `agentic_trace_replay.chart_series` for rows that are missing it
+ * or were computed by an older `CHART_SERIES_VERSION`.
+ *
+ * The ingest path now computes the time-series inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after the agentic schema migration and any time `CHART_SERIES_VERSION`
+ * bumps.
+ *
+ * Strategy:
+ * - Stream rows one at a time (server_metrics_json_gz can decompress
+ * past 500 MB on high-conc TP+EP points — one in memory at a time
+ * avoids OOM).
+ * - Skip rows whose stored version already matches.
+ * - Recompute via the same `computeChartSeries()` helper the ingest
+ * path uses, so behavior cannot drift.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series
+ * [--limit N] only process the first N candidate rows
+ * [--force] recompute every row, even if version already matches
+ * [--yes] skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js';
+import { createAdminSql } from './etl/db-utils.js';
+import {
+ confirmProceed,
+ jsonbParam,
+ parseLimitForceFlags,
+ runBackfillMain,
+ runPerIdBackfill,
+} from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+ noSsl: hasNoSslFlag(),
+ max: 1,
+ onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-chart-series ===');
+  console.log(` CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`);
+  console.log(` force = ${flags.force}`);
+  console.log(` limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows that actually have a server_metrics blob can produce a
+  // chart_series. Rows without the blob legitimately keep `chart_series`
+  // null and the API serves them via the slow path (which also returns
+  // null because there's no blob to parse — so the page falls into the
+  // "no stored trace_replay blob" branch).
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+          and (
+            chart_series is null
+            or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return;
+
+  await runPerIdBackfill(
+    candidates.map((c) => c.id),
+    async (id) => {
+      const [row] = await sql<
+        {
+          server_metrics_json_gz: Buffer | null;
+          framework: string | null;
+          disagg: boolean | null;
+        }[]
+      >`
+        select atr.server_metrics_json_gz, source.framework, source.disagg
+        from agentic_trace_replay atr
+        left join lateral (
+          select c.framework, c.disagg
+          from benchmark_results br
+          join configs c on c.id = br.config_id
+          where br.trace_replay_id = atr.id
+          order by br.id
+          limit 1
+        ) source on true
+        where atr.id = ${id}
+      `;
+      if (!row) {
+        console.warn(` id=${id}: row vanished, skipping`);
+        return 'skipped';
+      }
+      // framework/disagg come from the lowest-id benchmark_results row linked to this replay (null when none).
+      const series = await computeChartSeries(row.server_metrics_json_gz, {
+        framework: row.framework,
+        disagg: row.disagg ?? false,
+      });
+
+      await sql`
+        update agentic_trace_replay
+        set chart_series = ${series === null ? null : jsonbParam(sql, series)}
+        where id = ${id}
+      `;
+      return 'ok';
+    },
+  );
+}
+
+runBackfillMain('backfill-chart-series', sql, main);
diff --git a/packages/db/src/backfill-dataset-stats.ts b/packages/db/src/backfill-dataset-stats.ts
new file mode 100644
index 00000000..e9c6916d
--- /dev/null
+++ b/packages/db/src/backfill-dataset-stats.ts
@@ -0,0 +1,111 @@
+/**
+ * Backfill dataset summary stats and subagent-only ISL/OSL distributions from
+ * the compact structures already stored in `dataset_conversations`.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-dataset-stats --yes
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils';
+import { createAdminSql } from './etl/db-utils';
+import { logHistogram, summarizeValues } from './etl/weka-structure';
+import { jsonbParam, runBackfillMain } from './lib/backfill-runner';
+
+interface DatasetRow { // one `datasets` row, shaped like the select in main()
+  id: string;
+  slug: string;
+  summary: Record<string, unknown>; // existing JSON object; main() spreads it into the new summary
+  chart_data: Record<string, unknown>; // existing JSON object; main() spreads it into the new chart data
+}
+
+interface ConversationRow {
+ num_subagent_groups: number | string; // coerced with Number() in main()
+ request_count: number | string; // SQL alias: num_turns + total subagent children; coerced with Number() in main()
+}
+
+interface SubagentRequestRow {
+ input_tokens: number | string; // SQL: (child.value->>'in')::double precision
+ output_tokens: number | string; // SQL: (child.value->>'out')::double precision
+}
+
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+async function main(): Promise<void> {
+  const datasets = await sql<DatasetRow[]>`
+    select id, slug, summary, chart_data
+    from datasets
+    order by slug
+  `;
+  if (datasets.length === 0) {
+    console.log('No datasets found.');
+    return;
+  }
+
+  console.log(`Backfill subagent dataset stats for ${datasets.length} dataset(s).`);
+  if (!hasYesFlag() && !(await confirm('Continue? (y/N) '))) return;
+  // One pass per dataset: two read queries, then a single update of summary + chart_data.
+  for (const dataset of datasets) {
+    const conversations = await sql<ConversationRow[]>`
+      select
+        num_subagent_groups,
+        (
+          num_turns + coalesce((
+            select sum(jsonb_array_length(node.value->'children'))
+            from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value)
+            where node.value->>'kind' = 'subagent'
+          ), 0)
+        ) as request_count
+      from dataset_conversations dc
+      where dataset_id = ${dataset.id}
+    `;
+    const requests = await sql<SubagentRequestRow[]>`
+      select
+        (child.value->>'in')::double precision as input_tokens,
+        (child.value->>'out')::double precision as output_tokens
+      from dataset_conversations dc
+      cross join lateral jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value)
+      cross join lateral jsonb_array_elements(coalesce(node.value->'children', '[]'::jsonb)) child(value)
+      where dc.dataset_id = ${dataset.id}
+        and node.value->>'kind' = 'subagent'
+    `;
+    // Columns are typed number | string, so coerce every value with Number() before aggregating.
+    const subagentsPerTrace = conversations.map((row) => Number(row.num_subagent_groups));
+    const requestsPerConversation = conversations.map((row) => Number(row.request_count));
+    const inputTokens = requests.map((row) => Number(row.input_tokens)); // NOTE(review): Number(null) is 0 — confirm every child carries 'in'
+    const outputTokens = requests.map((row) => Number(row.output_tokens)); // NOTE(review): same for 'out'
+    const subagentStats = summarizeValues(subagentsPerTrace);
+    const requestStats = summarizeValues(requestsPerConversation);
+    const summary = {
+      ...dataset.summary, // keep existing keys; the fields below override or extend them
+      version: 3,
+      meanSubagentsPerTrace: subagentStats.mean,
+      medianSubagentsPerTrace: subagentStats.median,
+      meanRequestsPerConversation: requestStats.mean,
+      medianRequestsPerConversation: requestStats.median,
+    };
+    const chartData = {
+      ...dataset.chart_data, // keep existing charts; only the subagent distributions are replaced
+      version: 3,
+      subagentInputTokensPerRequest: {
+        bins: logHistogram(inputTokens),
+        stats: summarizeValues(inputTokens),
+      },
+      subagentOutputTokensPerRequest: {
+        bins: logHistogram(outputTokens),
+        stats: summarizeValues(outputTokens),
+      },
+    };
+
+    await sql`
+      update datasets
+      set summary = ${sql.json(summary)},
+        chart_data = ${jsonbParam(sql, chartData)}
+      where id = ${dataset.id}
+    `;
+    console.log(
+      ` ${dataset.slug}: ${requests.length.toLocaleString()} inner requests, median ${subagentStats.median}, mean ${subagentStats.mean.toFixed(1)} subagents/trace`,
+    );
+  }
+}
+
+runBackfillMain('backfill-dataset-stats', sql, main);
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
new file mode 100644
index 00000000..67291b6c
--- /dev/null
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -0,0 +1,97 @@
+/**
+ * Backfill `agentic_trace_replay.request_timeline` for rows that are
+ * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`.
+ *
+ * The ingest path now computes the timeline inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after the agentic schema migration and any time the version bumps.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline
+ * [--limit N] only process the first N candidate rows
+ * [--force] recompute every row, even if version already matches
+ * [--yes] skip the confirmation prompt
+ */
+
+import { hasNoSslFlag } from './cli-utils.js';
+import {
+ REQUEST_TIMELINE_VERSION,
+ computeRequestTimeline,
+} from './etl/compute-request-timeline.js';
+import { createAdminSql } from './etl/db-utils.js';
+import {
+ confirmProceed,
+ jsonbParam,
+ parseLimitForceFlags,
+ runBackfillMain,
+ runPerIdBackfill,
+} from './lib/backfill-runner.js';
+
+const flags = parseLimitForceFlags();
+
+const sql = createAdminSql({
+ noSsl: hasNoSslFlag(),
+ max: 1,
+ onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-request-timeline ===');
+  console.log(` REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`);
+  console.log(` force = ${flags.force}`);
+  console.log(` limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows with a profile_export blob can produce a timeline. Rows
+  // without the blob keep `request_timeline` null and the API serves them
+  // as "no timeline data".
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+          and (
+            request_timeline is null
+            or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n Nothing to do — all rows up to date.');
+    return;
+  }
+
+  if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return;
+
+  await runPerIdBackfill(
+    candidates.map((c) => c.id),
+    async (id) => {
+      const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>`
+        select profile_export_jsonl_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(` id=${id}: row vanished, skipping`);
+        return 'skipped';
+      }
+      const timeline = computeRequestTimeline(row.profile_export_jsonl_gz);
+      await sql`
+        update agentic_trace_replay
+        set request_timeline = ${timeline === null ? null : jsonbParam(sql, timeline)}
+        where id = ${id}
+      `;
+      return 'ok';
+    },
+  );
+}
+
+runBackfillMain('backfill-request-timeline', sql, main);
diff --git a/packages/db/src/dump-db.ts b/packages/db/src/dump-db.ts
index 3810fe7a..d0e315d1 100644
--- a/packages/db/src/dump-db.ts
+++ b/packages/db/src/dump-db.ts
@@ -18,7 +18,25 @@ const sql = createAdminSql({ noSsl: hasNoSslFlag(), readonly: true, max: 1 });
const CURSOR_BATCH = 100;
-/** Stream a table to a JSON file using a cursor, writing row-by-row. */
+/**
+ * Stream a table to a JSON file using a cursor, writing row-by-row.
+ *
+ * BYTEA round-trip: postgres.js decodes a `bytea` column to a Node `Buffer`.
+ * `JSON.stringify(buffer)` invokes Buffer.prototype.toJSON(), which emits
+ * `{"type":"Buffer","data":[<byte>, …]}`. That's a lossless byte-array encoding
+ * (verified: JSON.parse → Buffer.from(obj.data) reproduces the exact bytes), so
+ * `agentic_trace_replay`'s blob columns (profile_export_jsonl_gz,
+ * server_metrics_csv, server_metrics_json_gz) survive the dump verbatim.
+ * load-dump.ts reconstructs the Buffer and casts it back to `::bytea`.
+ *
+ * Dump-size note: the byte-array encoding is ~4-6× the raw bytea size (each
+ * byte becomes 1-3 ASCII digits + a comma). For the big compressed blobs
+ * (server_metrics_json_gz can be ~17 MB compressed on high-conc TP+EP rows)
+ * the resulting agentic_trace_replay.json is the largest file in the dump — the
+ * same trade-off server_logs.json already makes. We keep all columns (no
+ * dropping) so dump mode has full parity with the DB, and json-provider
+ * lazy-loads this file only when a blob-backed route actually needs a fallback.
+ */
async function streamTable(table: string, outPath: string): Promise {
const out = createWriteStream(outPath);
out.write('[\n');
diff --git a/packages/db/src/etl/agentic-v3-flatten.ts b/packages/db/src/etl/agentic-v3-flatten.ts
new file mode 100644
index 00000000..a3c223af
--- /dev/null
+++ b/packages/db/src/etl/agentic-v3-flatten.ts
@@ -0,0 +1,131 @@
+/**
+ * v3 agentic agg schema (2026-07-02+): nested containers → canonical flat keys.
+ *
+ * v3 artifacts nest their metrics under `request_metrics` / `server_metrics`
+ * containers; v1/v2 emitted the same information as flat top-level fields.
+ * `flattenAgenticAggRow` maps the nested shape onto the flat schema the DB /
+ * API / frontend consume, so the rest of the mapper stays version-agnostic.
+ */
+
+import { parseNum } from './normalizers';
+
+/**
+ * Distribution stat names accepted from v3 nested stat blocks, with the rename
+ * applied when flattening. `p50` is stored as `median_*` to match the
+ * established METRIC_KEYS naming (fixed-seq runs and the frontend both use
+ * `median_*`; no `p50_*` key exists anywhere downstream).
+ */
+const V3_STAT_KEYS: Record<string, string> = {
+  mean: 'mean',
+  p50: 'median',
+  median: 'median',
+  p75: 'p75',
+  p90: 'p90',
+  p95: 'p95',
+  p99: 'p99',
+  'p99.9': 'p99.9',
+  std: 'std',
+};
+
+/** v3 `request_metrics.latency` sub-blocks → flat metric suffix (same name). */
+const V3_LATENCY_METRICS = ['ttft', 'e2el', 'itl', 'tpot', 'intvty'] as const;
+
+/** v3 `request_metrics.tokens` sub-blocks → flat metric suffix. */
+const V3_TOKEN_METRICS: Record<string, string> = {
+  input: 'input_tokens',
+  output_actual: 'output_tokens_actual',
+  output_expected: 'output_tokens_expected',
+};
+
+/**
+ * Scalar paths in the v3 nested containers → canonical flat metric key. Keys
+ * reuse the flat v2-agentic names wherever one existed so already-ingested runs
+ * and the frontend see one consistent schema; genuinely new information gets a
+ * new key (registered in METRIC_KEYS).
+ */
+const V3_SCALAR_PATHS: [string[], string][] = [
+  // client-side throughput
+  [['request_metrics', 'throughput', 'input', 'tokens_per_second'], 'input_tput_tps'],
+  [['request_metrics', 'throughput', 'output', 'tokens_per_second'], 'output_tput_tps'],
+  [['request_metrics', 'throughput', 'total', 'tokens_per_second'], 'total_tput_tps'],
+  [['request_metrics', 'throughput', 'duration_seconds'], 'duration_seconds'],
+  [['request_metrics', 'throughput', 'per_gpu', 'total_tput_tps'], 'tput_per_gpu'],
+  [['request_metrics', 'throughput', 'per_gpu', 'output_tput_tps'], 'output_tput_per_gpu'],
+  [['request_metrics', 'throughput', 'per_gpu', 'input_tput_tps'], 'input_tput_per_gpu'],
+  [['request_metrics', 'cache', 'theoretical_cache_hit_rate'], 'theoretical_cache_hit_rate'],
+  // server-side prefix-cache observability (same fields v2 emitted flat)
+  [['server_metrics', 'cache', 'gpu_cache_hit_rate'], 'server_gpu_cache_hit_rate'],
+  [['server_metrics', 'cache', 'cpu_cache_hit_rate'], 'server_cpu_cache_hit_rate'],
+  [['server_metrics', 'cache', 'external_cache_hit_rate'], 'server_external_cache_hit_rate'],
+  // KV-cache occupancy (gpu key predates v3 as a flat auto-captured field)
+  [['server_metrics', 'kv_cache', 'gpu_usage_pct'], 'gpu_kv_cache_usage_pct'],
+  // server token totals
+  [['server_metrics', 'tokens', 'prompt_total'], 'total_prompt_tokens'],
+  [['server_metrics', 'tokens', 'generation_total'], 'total_generation_tokens'],
+  [['server_metrics', 'tokens', 'requests_completed'], 'total_requests_completed'],
+  // Deliberately NOT mapped (yet): cache.overall/prefix_cache_hits/queries,
+  // kv_cache.cpu_*, tokens.prompt_by_source, sources[] — new v3 detail we don't
+  // consume anywhere; add here + METRIC_KEYS when a view needs them.
+];
+
+/** Walk `path` through nested plain objects; undefined when a hop is falsy, a non-object or an array, or the key is absent. */
+function atPath(obj: Record<string, unknown>, path: string[]): unknown {
+  let cur: unknown = obj;
+  for (const seg of path) {
+    if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined;
+    cur = (cur as Record<string, unknown>)[seg];
+  }
+  return cur;
+}
+
+/** Flatten one v3 stat block ({mean, p50, …}) into `out` as `{stat}_{suffix}`; non-object blocks and values parseNum rejects are ignored. */
+function flattenStatBlock(block: unknown, suffix: string, out: Record<string, number>): void {
+  if (!block || typeof block !== 'object' || Array.isArray(block)) return;
+  for (const [stat, canonical] of Object.entries(V3_STAT_KEYS)) {
+    const n = parseNum((block as Record<string, unknown>)[stat]);
+    if (n !== undefined) out[`${canonical}_${suffix}`] = n;
+  }
+}
+
+/**
+ * Flatten a v3 agentic agg row (nested `request_metrics` / `server_metrics`
+ * containers, 2026-07-02+) into the canonical flat metric schema that v1/v2
+ * artifacts emitted directly and that the DB / API / frontend consume.
+ *
+ * Returns the row unchanged when `request_metrics` is absent (v1/v2 rows pass
+ * through untouched). Otherwise returns a copy with the flattened metrics
+ * merged in; the nested containers stay on the row (they're in NON_METRIC_KEYS
+ * so the auto-capture loop ignores them).
+ *
+ * Notes on the v3 source data:
+ * - `p50` percentiles are new (v2 had no median for agentic); stored as
+ * `median_*` to match the frontend's naming.
+ * - `latency.intvty` arrives already slow-tail inverted (pXX_intvty =
+ * 1/pXX_itl). It's flattened here for completeness, but mapBenchmarkRow's
+ * derive-from-itl invariant still overwrites it, keeping one definition
+ * across all harness versions.
+ */
+export function flattenAgenticAggRow(row: Record<string, unknown>): Record<string, unknown> {
+  const rm = row.request_metrics;
+  if (!rm || typeof rm !== 'object' || Array.isArray(rm)) return row; // v1/v2 row: no nested container
+
+  const flat: Record<string, number> = {};
+
+  // latency distributions
+  for (const metric of V3_LATENCY_METRICS) {
+    flattenStatBlock(atPath(row, ['request_metrics', 'latency', metric]), metric, flat);
+  }
+  // qps distribution (window_seconds / samples are intentionally not stats)
+  flattenStatBlock(atPath(row, ['request_metrics', 'qps']), 'qps', flat);
+  // per-request token-count distributions
+  for (const [src, suffix] of Object.entries(V3_TOKEN_METRICS)) {
+    flattenStatBlock(atPath(row, ['request_metrics', 'tokens', src]), suffix, flat);
+  }
+  // scalars
+  for (const [path, key] of V3_SCALAR_PATHS) {
+    const n = parseNum(atPath(row, path));
+    if (n !== undefined) flat[key] = n;
+  }
+
+  return { ...row, ...flat }; // flattened keys win over same-named top-level fields
+}
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index a5493629..2a2382c8 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -4,6 +4,7 @@
import type postgres from 'postgres';
import type { BenchmarkParams } from './benchmark-mapper';
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
type Sql = ReturnType;
@@ -29,12 +30,19 @@ export async function bulkIngestBenchmarkRows(
// Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears
// more than once in a single batch. Deduplicate within the batch, keeping
- // the last occurrence (last metrics for each unique config/isl/osl/conc).
+ // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode).
const seen = new Map();
- for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r);
+ for (const r of rows) {
+ seen.set(
+ `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? ''}-${r.conc}-${r.offloadMode}`,
+ r,
+ );
+ }
const deduped = [...seen.values()];
const configIds = deduped.map((r) => r.configId);
+ const benchmarkTypes = deduped.map((r) => r.benchmarkType);
+ const offloadModes = deduped.map((r) => r.offloadMode);
const isls = deduped.map((r) => r.isl);
const osls = deduped.map((r) => r.osl);
const concs = deduped.map((r) => r.conc);
@@ -49,13 +57,14 @@ export async function bulkIngestBenchmarkRows(
const result = await sql<{ inserted: boolean; id: number }[]>`
insert into benchmark_results (
- workflow_run_id, config_id, benchmark_type, date,
+ workflow_run_id, config_id, benchmark_type, offload_mode, date,
isl, osl, conc, image, metrics, workers
)
select
${workflowRunId},
unnest(${sql.array(configIds)}::int[]),
- 'single_turn',
+ unnest(${sql.array(benchmarkTypes)}::text[]),
+ unnest(${sql.array(offloadModes)}::text[]),
${date}::date,
unnest(${sql.array(isls)}::int[]),
unnest(${sql.array(osls)}::int[]),
@@ -63,9 +72,15 @@ export async function bulkIngestBenchmarkRows(
unnest(${sql.array(images)}),
unnest(${sql.array(metricsJsons)}::jsonb[]),
unnest(${sql.array(workersJsons)}::jsonb[])
- on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc)
+ on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
do update set
- metrics = excluded.metrics,
+ -- Replace metrics with the fresh artifact values, but carry over
+ -- kv_cache_pool_tokens: it is derived from the server log at
+ -- insertServerLog time (not present in any artifact JSON), so a later
+ -- upsert from the aggregated results_bmk artifact would silently wipe it.
+ metrics = excluded.metrics || jsonb_strip_nulls(
+ jsonb_build_object('kv_cache_pool_tokens', benchmark_results.metrics->'kv_cache_pool_tokens')
+ ),
image = excluded.image,
workers = excluded.workers
returning (xmax = 0) as inserted, id
@@ -98,9 +113,18 @@ export async function insertServerLog(
insert into server_logs (server_log) values (${serverLog})
returning id
`;
+ // Derive the KV-cache pool size (tokens) from the log's authoritative
+ // "GPU KV cache size: N tokens" line(s) and stash it on the result's metrics
+ // JSON, mirroring how trace-replay-ingest derives cache-hit rates. The
+ // scraped vllm:cache_config_info metric can't reconstruct this for MLA models.
+ const kvCachePoolTokens = kvCachePoolTokensFromServerLog(serverLog);
await sql`
update benchmark_results
- set server_log_id = ${logId}
+ set server_log_id = ${logId}${
+ kvCachePoolTokens === null
+ ? sql``
+ : sql`, metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${kvCachePoolTokens}::bigint))`
+ }
where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
`;
}
@@ -155,13 +179,14 @@ export async function bulkUpsertAvailability(
sql: Sql,
rows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[],
date: string,
): Promise {
@@ -170,7 +195,7 @@ export async function bulkUpsertAvailability(
const seen = new Set();
const unique: typeof rows = [];
for (const r of rows) {
- const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`;
+ const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? ''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`;
if (!seen.has(key)) {
seen.add(key);
unique.push(r);
@@ -178,7 +203,7 @@ export async function bulkUpsertAvailability(
}
await sql`
- insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date)
+ insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date)
select
unnest(${sql.array(unique.map((r) => r.model))}::text[]),
unnest(${sql.array(unique.map((r) => r.isl))}::int[]),
@@ -188,6 +213,7 @@ export async function bulkUpsertAvailability(
unnest(${sql.array(unique.map((r) => r.framework))}::text[]),
unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]),
unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]),
+ unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]),
${date}::date
on conflict do nothing
`;
diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts
index 65fb3e39..69598039 100644
--- a/packages/db/src/etl/benchmark-mapper.test.ts
+++ b/packages/db/src/etl/benchmark-mapper.test.ts
@@ -22,6 +22,20 @@ function makeV1Row(overrides: Record = {}): Record {
};
}
+/** Minimal valid agentic row: scenario_type triggers the agentic path; `users` → conc. */
+function makeAgenticRow(overrides: Record<string, unknown> = {}): Record<string, unknown> {
+  return {
+    infmax_model_prefix: 'dsv4',
+    hw: 'b200-nv',
+    framework: 'vllm',
+    precision: 'fp4',
+    scenario_type: 'agentic-coding',
+    users: 72,
+    tput_per_gpu: 20000,
+    ...overrides, // spread last so callers can replace any default
+  };
+}
+
/** Minimal valid v2 benchmark row (disaggregated prefill/decode parallelism). */
function makeV2Row(overrides: Record = {}): Record {
return {
@@ -570,3 +584,306 @@ describe('extractWorkers', () => {
expect(extractWorkers([null, 'bad', 0, undefined])).toBeUndefined();
});
});
+
+describe('mapBenchmarkRow — agentic interactivity normalization', () => {
+ it('derives *_intvty from 1/*_itl, discarding the artifact value', () => {
+ const tracker = createSkipTracker();
+ const result = mapBenchmarkRow(
+ makeAgenticRow({
+ p90_itl: 0.0893,
+ p90_intvty: 23.91, // fast-tail contamination — must be overwritten
+ p75_itl: 0.0692,
+ p75_intvty: 19,
+ }),
+ tracker,
+ );
+ expect(result!.benchmarkType).toBe('agentic_traces'); // makeAgenticRow sets scenario_type, which selects the agentic path
+ expect(result!.metrics.p90_intvty).toBeCloseTo(1 / 0.0893, 6);
+ expect(result!.metrics.p75_intvty).toBeCloseTo(1 / 0.0692, 6);
+ });
+
+ it('derives *_intvty even when the artifact omits it', () => {
+ const tracker = createSkipTracker();
+ const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0.1 }), tracker);
+ expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6);
+ });
+
+ it('does not touch *_intvty for single_turn rows', () => {
+ const tracker = createSkipTracker();
+ const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker);
+ expect(result!.metrics.p90_intvty).toBe(999);
+ });
+
+ it('DELETES a stale artifact *_intvty when the matching *_itl is absent', () => {
+ // Artifact ships intvty (possibly the drifted p(1/ITL) definition) but no itl
+ // for that percentile. Passing it through would mix harness semantics into a
+ // column meant to be 1/p(ITL) everywhere — so the key must be removed, not kept.
+ const tracker = createSkipTracker();
+ const result = mapBenchmarkRow(makeAgenticRow({ p90_intvty: 42, p95_itl: 0.2 }), tracker);
+ expect(result!.metrics).not.toHaveProperty('p90_intvty'); // stale → deleted
+ expect(result!.metrics.p95_intvty).toBeCloseTo(5, 6); // derived from itl
+ });
+
+ it('DELETES a stale artifact *_intvty when the matching *_itl is zero/invalid', () => {
+ const tracker = createSkipTracker();
+ const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0, p90_intvty: 42 }), tracker);
+ expect(result!.metrics).not.toHaveProperty('p90_intvty'); // 1/0 is Infinity in JS; the key must be dropped instead
+ });
+});
+
+/**
+ * Minimal v3 agentic row (2026-07-02+): nested request_metrics/server_metrics,
+ * p50 percentiles, pre-inverted intvty, kv_offloading descriptors. Mirrors the
+ * real artifact from GH run 28553943579 (trimmed).
+ */
+function makeV3AgenticRow(overrides: Record<string, unknown> = {}): Record<string, unknown> {
+  return {
+    infmax_model_prefix: 'dsv4',
+    hw: 'cluster:b300-nv',
+    framework: 'vllm',
+    precision: 'fp4',
+    spec_decoding: 'none',
+    disagg: false,
+    scenario_type: 'agentic-coding',
+    is_multinode: false,
+    tp: 4,
+    ep: 1,
+    dp_attention: 'false',
+    conc: 16,
+    image: 'vllm/vllm-openai:v0.23.0',
+    kv_offloading: 'none',
+    kv_offload_backend: '',
+    num_requests_total: 1648,
+    num_requests_successful: 1648,
+    dataset: {
+      source_type: 'public_dataset',
+      hf_dataset_name: 'semianalysisai/cc-traces-weka-062126',
+    },
+    request_metrics: {
+      qps: {
+        window_seconds: 1,
+        samples: 7209,
+        mean: 0.22846,
+        p50: 0,
+        p75: 0,
+        p90: 1,
+        p95: 1,
+        std: 0.60707,
+      },
+      latency: {
+        ttft: {
+          mean: 12.90033,
+          p50: 1.49712,
+          p75: 12.09501,
+          p90: 56.22194,
+          p95: 68.03156,
+          std: 22.68353,
+        },
+        e2el: {
+          mean: 81.05644,
+          p50: 26.18817,
+          p75: 84.93601,
+          p90: 199.85996,
+          p95: 360.31579,
+          std: 149.59205,
+        },
+        itl: {
+          mean: 0.07548,
+          p50: 0.03677,
+          p75: 0.10253,
+          p90: 0.16652,
+          p95: 0.22255,
+          std: 0.08327,
+        },
+        tpot: {
+          mean: 0.07548,
+          p50: 0.03677,
+          p75: 0.10253,
+          p90: 0.16652,
+          p95: 0.22255,
+          std: 0.08327,
+        },
+        // already slow-tail inverted upstream (pXX_intvty = 1/pXX_itl)
+        intvty: {
+          mean: 13.2482,
+          p50: 27.19411,
+          p75: 9.75304,
+          p90: 6.00526,
+          p95: 4.49335,
+          std: 24.77636,
+        },
+      },
+      tokens: {
+        input: {
+          mean: 157676.054,
+          p50: 96047,
+          p75: 197684.25,
+          p90: 404935.9,
+          p95: 547502.85,
+          std: 152480.17653,
+        },
+        output_actual: {
+          mean: 849.06735,
+          p50: 290.5,
+          p75: 783.5,
+          p90: 2231.8,
+          p95: 3915.45,
+          std: 1568.90823,
+        },
+        output_expected: {
+          mean: 1432.32728,
+          p50: 571.5,
+          p75: 1820,
+          p90: 3927,
+          p95: 5312.9,
+          std: 2067.19215,
+        },
+      },
+      throughput: {
+        input: { tokens_per_second: 35980.14001 },
+        output: { tokens_per_second: 193.7489 },
+        total: { tokens_per_second: 36173.88892 },
+        duration_seconds: 7222.04352,
+        per_gpu: {
+          total_tput_tps: 9043.47223,
+          output_tput_tps: 48.43723,
+          input_tput_tps: 8995.035,
+        },
+      },
+      cache: { theoretical_cache_hit_rate: 0.97509 },
+    },
+    server_metrics: {
+      present: true,
+      adapter: 'vllm',
+      metric_count: 49,
+      cache: {
+        gpu_cache_hit_rate: 0.78539,
+        cpu_cache_hit_rate: 0,
+        external_cache_hit_rate: 0,
+        overall_cache_hit_rate: 0.78539,
+        prefix_cache_hits: 205576960,
+        prefix_cache_queries: 261750519,
+        frontend_cache_hit_rate: null,
+      },
+      kv_cache: { gpu_usage_pct: 0.82134, cpu_usage_pct: null, cpu_used_tokens: null },
+      tokens: {
+        prompt_total: 261750519,
+        generation_total: 1422696,
+        requests_completed: 1648,
+        prompt_by_source: {
+          gpu_cache_hit: 205576960,
+          cpu_or_external_cache_hit: 0,
+          computed: 56173559,
+        },
+      },
+      sources: [{ id: 'combined|http://localhost:8888/metrics|engine=0', role: 'combined' }],
+    },
+    ...overrides,
+  };
+}
+
+describe('mapBenchmarkRow — v3 agentic nested agg schema', () => {
+  it('maps identity/routing and flattens the nested containers', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV3AgenticRow(), tracker);
+
+    expect(result).not.toBeNull();
+    expect(result!.benchmarkType).toBe('agentic_traces');
+    expect(result!.config.hardware).toBe('b300');
+    expect(result!.conc).toBe(16);
+    expect(result!.isl).toBeNull();
+    expect(result!.osl).toBeNull();
+
+    const m = result!.metrics;
+    // latency distributions, p50 stored under the canonical median_* name
+    expect(m.median_ttft).toBeCloseTo(1.49712, 6);
+    expect(m.p90_ttft).toBeCloseTo(56.22194, 6);
+    expect(m.std_e2el).toBeCloseTo(149.59205, 6);
+    expect(m.p95_itl).toBeCloseTo(0.22255, 6);
+    expect(m.mean_tpot).toBeCloseTo(0.07548, 6);
+    // qps + token distributions
+    expect(m.median_qps).toBe(0);
+    expect(m.p90_input_tokens).toBeCloseTo(404935.9, 3);
+    expect(m.median_output_tokens_actual).toBeCloseTo(290.5, 3);
+    expect(m.p95_output_tokens_expected).toBeCloseTo(5312.9, 3);
+    // throughput scalars under the v2 flat names
+    expect(m.tput_per_gpu).toBeCloseTo(9043.47223, 3);
+    expect(m.output_tput_per_gpu).toBeCloseTo(48.43723, 3);
+    expect(m.input_tput_per_gpu).toBeCloseTo(8995.035, 3);
+    expect(m.total_tput_tps).toBeCloseTo(36173.88892, 3);
+    expect(m.duration_seconds).toBeCloseTo(7222.04352, 3);
+    // cache / kv / totals
+    expect(m.theoretical_cache_hit_rate).toBeCloseTo(0.97509, 6);
+    expect(m.server_gpu_cache_hit_rate).toBeCloseTo(0.78539, 6);
+    expect(m.server_external_cache_hit_rate).toBe(0);
+    expect(m.gpu_kv_cache_usage_pct).toBeCloseTo(0.82134, 6);
+    expect(m.total_prompt_tokens).toBe(261750519);
+    expect(m.total_generation_tokens).toBe(1422696);
+    expect(m.total_requests_completed).toBe(1648);
+    // nested containers must not leak into metrics
+    expect(m).not.toHaveProperty('request_metrics');
+    expect(m).not.toHaveProperty('server_metrics');
+  });
+
+  it('re-derives *_intvty from *_itl (matching the pre-inverted artifact values)', () => {
+    const tracker = createSkipTracker();
+    const m = mapBenchmarkRow(makeV3AgenticRow(), tracker)!.metrics;
+    // The artifact already ships slow-tail intvty; the derive invariant keeps
+    // one definition and must agree with it (up to the artifact's rounding).
+    expect(m.median_intvty).toBeCloseTo(1 / 0.03677, 6);
+    expect(m.p90_intvty).toBeCloseTo(1 / 0.16652, 6);
+    expect(m.median_intvty).toBeCloseTo(27.19411, 2);
+    expect(m.p90_intvty).toBeCloseTo(6.00526, 2);
+    // std is never inverted — passes through from the artifact
+    expect(m.std_intvty).toBeCloseTo(24.77636, 6);
+  });
+
+  it("maps kv_offloading 'none' to offload off and skips the empty backend", () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV3AgenticRow(), tracker);
+    expect(result!.offloadMode).toBe('off');
+    expect(result!.metrics).not.toHaveProperty('kv_offload_backend');
+  });
+
+  it("maps kv_offloading 'dram' + backend to offload on with the backend preserved", () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ kv_offloading: 'dram', kv_offload_backend: 'mooncake', conc: 32 }),
+      tracker,
+    );
+    expect(result!.offloadMode).toBe('on');
+    expect((result!.metrics as Record<string, unknown>).kv_offloading).toBe('dram');
+    expect((result!.metrics as Record<string, unknown>).kv_offload_backend).toBe('mooncake');
+  });
+
+  it('still applies the failed-run guard to v3 rows', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 100 }),
+      tracker,
+    );
+    expect(result).toBeNull();
+    expect(tracker.skips.failedRun).toBe(1);
+  });
+
+  it('skips rows where the server never came up (zero total requests)', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 0 }),
+      tracker,
+    );
+    expect(result).toBeNull();
+    expect(tracker.skips.failedRun).toBe(1);
+  });
+
+  it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeAgenticRow({ p90_itl: 0.1, mean_ttft: 1.5, offload_mode: 'on' }),
+      tracker,
+    );
+    expect(result!.metrics.mean_ttft).toBe(1.5);
+    expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6);
+    expect(result!.offloadMode).toBe('on');
+  });
+});
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index b25baf60..5b00618a 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -1,11 +1,13 @@
/**
* Benchmark row mapper: raw JSON dict → typed `BenchmarkParams`.
- * Handles both v1 (single tp/ep) and v2 (separate prefill/decode fields).
+ * Handles v1 (single tp/ep), v2 (separate prefill/decode fields), and v3
+ * (nested agentic containers, flattened via {@link flattenAgenticAggRow}).
*/
import type { ConfigParams } from './config-cache';
import type { SkipTracker } from './skip-tracker';
import { METRIC_KEYS, PRECISION_KEYS } from '@semianalysisai/inferencex-constants';
+import { flattenAgenticAggRow } from './agentic-v3-flatten';
import {
resolveModelKey,
hwToGpuKey,
@@ -17,11 +19,7 @@ import {
parseInt2,
} from './normalizers';
-/**
- * Raw artifact field names that are renamed when stored as metrics.
- * All other numeric fields not in `NON_METRIC_KEYS` are stored under their raw name.
- */
-const METRIC_RENAMES: Record = {};
+export { flattenAgenticAggRow };
/**
* Raw artifact fields that are config/routing dimensions, not metrics.
@@ -57,12 +55,41 @@ const NON_METRIC_KEYS = new Set([
'decode_num_workers',
'num_prefill_gpu',
'num_decode_gpu',
+ // agentic scenario
+ 'scenario_type',
+ 'users',
+ 'offload_mode',
+ 'num_requests_total',
+ 'num_requests_successful',
+ // v3 agentic KV-offload descriptors ('none'|'dram'|… + backend name). Mapped
+ // to offloadMode / stringified metrics explicitly in mapBenchmarkRow.
+ 'kv_offloading',
+ 'kv_offload_backend',
+ // v3 agentic nested containers — flattened by flattenAgenticAggRow before
+ // the auto-capture loop runs; the raw objects themselves are not metrics.
+ 'request_metrics',
+ 'server_metrics',
+ // Public-dataset provenance emitted by aiperf. The ingest runner uses this
+ // object to populate run_datasets; it is not a benchmark metric.
+ 'dataset',
// per-worker measured-power array (not a numeric scalar). Surfaced as a
// sibling of the metrics JSONB by mapBenchmarkRow so the metrics column
// stays Record for the index signature on BenchmarkRow.
'workers',
]);
+/**
+ * `benchmark_type` values understood by the ingest.
+ * - `single_turn` — fixed sequence-length runs (isl/osl set).
+ * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc).
+ */
+export type BenchmarkType = 'single_turn' | 'agentic_traces';
+
+/** Offload descriptor → 'on'/'off' ('none' → 'off'); null unless `v` is a non-empty string. */
+function descriptorToOnOff(v: unknown): string | null {
+ return typeof v === 'string' && v.length > 0 ? (v === 'none' ? 'off' : 'on') : null;
+}
+
/**
* METRIC_KEYS from constants is the canonical set of known metric keys.
* Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -91,9 +118,13 @@ export interface WorkerPower {
export interface BenchmarkParams {
config: ConfigParams;
- isl: number;
- osl: number;
+ benchmarkType: BenchmarkType;
+ // Null for agentic_traces; present for single_turn.
+ isl: number | null;
+ osl: number | null;
conc: number;
+ /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */
+ offloadMode: string;
image: string | null;
metrics: Record;
/**
@@ -110,9 +141,11 @@ export interface BenchmarkParams {
/**
* Map a raw benchmark result dict to typed `BenchmarkParams`.
*
- * Supports two artifact schemas:
+ * Supports three artifact schemas:
* - **v1** (pre-2025-12-19): single `tp`/`ep` for both prefill and decode.
* - **v2** (2025-12-19+): separate `prefill_tp`/`decode_tp` etc. for disaggregated configs.
+ * - **v3** (2026-07-02+, agentic only): nested `request_metrics`/`server_metrics`
+ * containers, flattened to the v2 flat schema up front by `flattenAgenticAggRow`.
*
* When mapping fails (unknown model, unknown hardware, or missing ISL/OSL/conc),
* the appropriate skip counter on `tracker` is incremented and `null` is returned.
@@ -128,6 +161,11 @@ export function mapBenchmarkRow(
tracker: SkipTracker,
islOslFallback?: { isl: number; osl: number } | null,
): BenchmarkParams | null {
+ // v3 agentic rows nest their metrics; flatten to the canonical flat schema
+ // first so the rest of the mapper (auto-capture, intvty invariant, guards)
+ // is version-agnostic. No-op for v1/v2 rows.
+ row = flattenAgenticAggRow(row);
+
const modelKey = resolveModelKey(row);
if (!modelKey) {
tracker.skips.unmappedModel++;
@@ -144,14 +182,45 @@ export function mapBenchmarkRow(
return null;
}
- const isl = parseInt2(row.isl) ?? islOslFallback?.isl;
- const osl = parseInt2(row.osl) ?? islOslFallback?.osl;
- const conc = parseInt2(row.conc);
- if (!isl || !osl || !conc) {
+ // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants),
+ // no isl/osl, and `users` instead of `conc`. Everything else stays as-is.
+ const isAgentic = String(row.scenario_type ?? '').startsWith('agentic');
+ const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn';
+
+ const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
+ const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
+ // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones.
+ const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc);
+ if (!conc || (!isAgentic && (!isl || !osl))) {
tracker.skips.noIslOsl++;
return null;
}
+ // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
+ // every runner, including failed ones with 0 successful requests and null
+ // metrics — both the "issued requests but none succeeded" case (total > 0)
+ // and the "server never came up" case (total === 0). Without this skip the
+ // empty row lands as a dataless point, or overwrites a good row via
+ // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
+ if (
+ typeof row.num_requests_successful === 'number' &&
+ row.num_requests_successful === 0 &&
+ typeof row.num_requests_total === 'number'
+ ) {
+ tracker.skips.failedRun++;
+ return null;
+ }
+
+ // Agentic offload signal: prefer `offload_mode` ('on'|'off'), then the v3
+ // `kv_offloading` descriptor ('none'|'dram'|…), then legacy `offloading`.
+ // Descriptors reduce to the binary on/off used for row identity ('none' →
+ // 'off', anything else → 'on') so v3 offload points keep colliding-key parity
+ // with their v2 predecessors instead of forking a third offload_mode value.
+ const offloadModeRaw =
+ typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+ ? row.offload_mode
+ : (descriptorToOnOff(row.kv_offloading) ?? descriptorToOnOff(row.offloading) ?? 'off');
+
const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
const isMultinode = parseBool(row.is_multinode);
const precision = normalizePrecision(String(row.precision ?? ''));
@@ -160,55 +229,46 @@ export function mapBenchmarkRow(
}
const specMethod = normalizeSpecMethod(row.spec_decoding);
- let prefillTp: number, prefillEp: number, prefillDpAttn: boolean, prefillNumWorkers: number;
- let decodeTp: number, decodeEp: number, decodeDpAttn: boolean, decodeNumWorkers: number;
- let numPrefillGpu: number, numDecodeGpu: number;
+ const parallelism = resolveParallelism(row);
+ const metrics = captureNumericMetrics(row);
- if ('prefill_tp' in row) {
- // v2 schema: full disagg parallelism fields
- prefillTp = parseInt2(row.prefill_tp) ?? 1;
- prefillEp = parseInt2(row.prefill_ep) ?? 1;
- prefillDpAttn = parseBool(row.prefill_dp_attention);
- prefillNumWorkers = parseInt2(row.prefill_num_workers) ?? 0;
- decodeTp = parseInt2(row.decode_tp) ?? 1;
- decodeEp = parseInt2(row.decode_ep) ?? 1;
- decodeDpAttn = parseBool(row.decode_dp_attention);
- decodeNumWorkers = parseInt2(row.decode_num_workers) ?? 0;
- numPrefillGpu = parseInt2(row.num_prefill_gpu) ?? prefillTp * prefillEp;
- numDecodeGpu = parseInt2(row.num_decode_gpu) ?? decodeTp * decodeEp;
- } else {
- // v1 schema: single tp/ep, prefill = decode
- const tp = parseInt2(row.tp) ?? 1;
- const ep = parseInt2(row.ep) ?? 1;
- const dpAttn = parseBool(row.dp_attention);
- prefillTp = tp;
- decodeTp = tp;
- prefillEp = ep;
- decodeEp = ep;
- prefillDpAttn = dpAttn;
- decodeDpAttn = dpAttn;
- prefillNumWorkers = 0;
- decodeNumWorkers = 0;
- numPrefillGpu = tp * ep;
- numDecodeGpu = tp * ep;
+ // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
+ // — preserve as a stringified metric so the frontend can expose it in tooltips.
+ // v3 rows additionally carry the offload tier + backend ('dram'/'mooncake');
+ // keep them so the UI can say *what kind* of offload, not just on/off.
+ if (isAgentic) {
+ (metrics as Record).offload_mode = offloadModeRaw;
+ if (typeof row.kv_offloading === 'string' && row.kv_offloading.length > 0) {
+ (metrics as Record).kv_offloading = row.kv_offloading;
+ }
+ if (typeof row.kv_offload_backend === 'string' && row.kv_offload_backend.length > 0) {
+ (metrics as Record).kv_offload_backend = row.kv_offload_backend;
+ }
}
- // Auto-capture all numeric fields not reserved for config/routing dimensions.
- // Fields in METRIC_RENAMES are stored under their canonical name; all others
- // use the raw key. Any key outside METRIC_KEYS triggers a one-time
- // warning so new schema additions don't go silently unnoticed.
- const metrics: Record = {};
- for (const [rawKey, val] of Object.entries(row)) {
- if (NON_METRIC_KEYS.has(rawKey)) continue;
- const n = parseNum(val);
- if (n === undefined) continue;
- const storedKey = METRIC_RENAMES[rawKey] ?? rawKey;
- metrics[storedKey] = n;
- if (!METRIC_KEYS.has(rawKey) && !_warnedMetricKeys.has(rawKey)) {
- _warnedMetricKeys.add(rawKey);
- console.warn(
- ` [WARN] auto-captured unexpected metric '${rawKey}' — add to METRIC_KEYS in constants/src/metric-keys.ts or NON_METRIC_KEYS in benchmark-mapper.ts`,
- );
+ // Slow-tail interactivity invariant. Agentic artifacts ship `*_intvty`, but the
+ // definition has drifted across harness versions: some emit `1/p(ITL)`
+ // (slow-tail), others `p(1/ITL)` — which inverts percentile order, so p90 comes
+ // out as ~1/p10(ITL) instead. The inference chart's interactivity selector and
+ // the detail time-series both treat interactivity as the reciprocal of the ITL
+ // percentile, so we derive it from `*_itl` here rather than trust the artifact,
+ // keeping every agentic row on one definition. `std` is excluded — the
+ // reciprocal of a standard deviation is meaningless. Mirrored in the frontend
+ // overlay path (agenticAliases).
+ //
+ // When `*_itl` is absent/zero/invalid we must DELETE any artifact-supplied
+ // `*_intvty` rather than let it survive: keeping it would mix the harness's
+ // (possibly `p(1/ITL)`) definition into a column that's meant to be `1/p(ITL)`
+ // everywhere else. Downstream reads a missing key as "not recorded"
+ // (rowToAggDataEntry coerces `?? 0`; the legend table renders a dash).
+ if (isAgentic) {
+ for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
+ const itl = metrics[`${k}_itl`];
+ if (typeof itl === 'number' && itl > 0) {
+ metrics[`${k}_intvty`] = 1 / itl;
+ } else {
+ delete metrics[`${k}_intvty`];
+ }
}
}
@@ -231,26 +291,99 @@ export function mapBenchmarkRow(
specMethod,
disagg,
isMultinode,
- prefillTp,
- prefillEp,
- prefillDpAttn,
- prefillNumWorkers,
- decodeTp,
- decodeEp,
- decodeDpAttn,
- decodeNumWorkers,
- numPrefillGpu,
- numDecodeGpu,
+ ...parallelism,
},
+ benchmarkType,
isl,
osl,
conc,
+ offloadMode: offloadModeRaw,
image,
metrics,
workers,
};
}
+/** The parallelism slice of `ConfigParams`, resolved from either artifact schema. */
+type ParallelismParams = Pick<
+ ConfigParams,
+ | 'prefillTp'
+ | 'prefillEp'
+ | 'prefillDpAttn'
+ | 'prefillNumWorkers'
+ | 'decodeTp'
+ | 'decodeEp'
+ | 'decodeDpAttn'
+ | 'decodeNumWorkers'
+ | 'numPrefillGpu'
+ | 'numDecodeGpu'
+>;
+
+/**
+ * Resolve prefill/decode parallelism from a raw row. A row containing `prefill_tp`
+ * is read as v2 (per-phase fields; tp/ep default to 1, worker counts to 0, GPU counts
+ * to tp × ep); any other row is v1, where one `tp`/`ep`/`dp_attention` covers both phases.
+ */
+function resolveParallelism(row: Record<string, unknown>): ParallelismParams {
+  if ('prefill_tp' in row) {
+    // v2 schema: full disagg parallelism fields
+    const prefillTp = parseInt2(row.prefill_tp) ?? 1;
+    const prefillEp = parseInt2(row.prefill_ep) ?? 1;
+    const decodeTp = parseInt2(row.decode_tp) ?? 1;
+    const decodeEp = parseInt2(row.decode_ep) ?? 1;
+    return {
+      prefillTp,
+      prefillEp,
+      prefillDpAttn: parseBool(row.prefill_dp_attention),
+      prefillNumWorkers: parseInt2(row.prefill_num_workers) ?? 0,
+      decodeTp,
+      decodeEp,
+      decodeDpAttn: parseBool(row.decode_dp_attention),
+      decodeNumWorkers: parseInt2(row.decode_num_workers) ?? 0,
+      numPrefillGpu: parseInt2(row.num_prefill_gpu) ?? prefillTp * prefillEp,
+      numDecodeGpu: parseInt2(row.num_decode_gpu) ?? decodeTp * decodeEp,
+    };
+  }
+  // v1 schema: single tp/ep, prefill = decode; worker counts are not recorded (0)
+  const tp = parseInt2(row.tp) ?? 1;
+  const ep = parseInt2(row.ep) ?? 1;
+  const dpAttn = parseBool(row.dp_attention);
+  return {
+    prefillTp: tp,
+    prefillEp: ep,
+    prefillDpAttn: dpAttn,
+    prefillNumWorkers: 0,
+    decodeTp: tp,
+    decodeEp: ep,
+    decodeDpAttn: dpAttn,
+    decodeNumWorkers: 0,
+    numPrefillGpu: tp * ep,
+    numDecodeGpu: tp * ep,
+  };
+}
+
+/**
+ * Auto-capture every field that `parseNum` accepts and that is not listed in
+ * NON_METRIC_KEYS, stored under its raw key. A key outside METRIC_KEYS triggers a
+ * one-time warning (tracked in `_warnedMetricKeys`) so schema additions get noticed.
+ */
+function captureNumericMetrics(row: Record<string, unknown>): Record<string, number> {
+  const metrics: Record<string, number> = {};
+  for (const [rawKey, val] of Object.entries(row)) {
+    if (NON_METRIC_KEYS.has(rawKey)) continue;
+    const n = parseNum(val);
+    if (n === undefined) continue;
+    metrics[rawKey] = n;
+    if (!METRIC_KEYS.has(rawKey) && !_warnedMetricKeys.has(rawKey)) {
+      _warnedMetricKeys.add(rawKey);
+      console.warn(
+        ` [WARN] auto-captured unexpected metric '${rawKey}' — add to METRIC_KEYS in constants/src/metric-keys.ts or NON_METRIC_KEYS in benchmark-mapper.ts`,
+      );
+    }
+  }
+  return metrics;
+}
+
/**
* Narrow a raw `workers` value from the artifact JSON to `WorkerPower[]` or
* undefined. Each entry must have a string `role`, a numeric `worker_idx`,
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
new file mode 100644
index 00000000..7b745c09
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -0,0 +1,152 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import {
+ STATS_VERSION,
+ computeAggregateStats,
+ mergeProfileStatsUpgrade,
+} from './compute-aggregate-stats.js';
+
+/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
+function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
+ const lines = requests.map((r, i) =>
+ JSON.stringify({
+ metadata: {
+ benchmark_phase: 'profiling',
+ conversation_id: `conv-${i}`,
+ turn_index: 0,
+ },
+ metrics: {
+ input_sequence_length: { value: r.isl, unit: 'tokens' },
+ output_sequence_length: { value: r.osl, unit: 'tokens' },
+ request_latency: { value: r.rl ?? 1000, unit: 'ms' },
+ time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' },
+ },
+ }),
+ );
+ return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */
+function makeServerBlob() {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, end_ns: 1, avg: 0.2 },
+ { start_ns: 1, end_ns: 2, avg: 0.5 },
+ { start_ns: 2, end_ns: 3, avg: 0.8 },
+ ],
+ },
+ ],
+ },
+ 'vllm:prefix_cache_hits': {
+ series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }],
+ },
+ 'vllm:prefix_cache_queries': {
+ series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }],
+ },
+ },
+ });
+ return gzipSync(Buffer.from(json));
+}
+
+describe('computeAggregateStats', () => {
+ it('returns the current STATS_VERSION in the bundle', async () => {
+ const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+ expect(stats.version).toBe(STATS_VERSION);
+ });
+
+ it('leaves every metric null when both blobs are null', async () => {
+ const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+ expect(stats.isl).toBeNull();
+ expect(stats.osl).toBeNull();
+ expect(stats.kvCacheUtil).toBeNull();
+ expect(stats.prefixCacheHitRate).toBeNull();
+ expect(stats.normalizedSessionTimeS).toBeNull();
+ expect(stats.p90PrefillTpsPerUser).toBeNull();
+ expect(stats.normalizedE2e400).toBeNull();
+ });
+
+ it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
+ const profileBlob = makeProfileBlob([
+ { isl: 100, osl: 50, rl: 1000, ttft: 100 },
+ { isl: 200, osl: 75, rl: 2000, ttft: 200 },
+ { isl: 300, osl: 100, rl: 3000, ttft: 300 },
+ ]);
+ const stats = await computeAggregateStats({ profileBlob, serverBlob: null });
+
+ expect(stats.isl?.n).toBe(3);
+ expect(stats.isl?.mean).toBeCloseTo(200, 6);
+ expect(stats.osl?.n).toBe(3);
+ expect(stats.osl?.mean).toBeCloseTo(75, 6);
+
+ // Server-side metrics still null when there's no server blob.
+ expect(stats.kvCacheUtil).toBeNull();
+ expect(stats.prefixCacheHitRate).toBeNull();
+
+ // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000.
+ expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6);
+ // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+ // loads = [150, 275, 400], mean_load = 275
+ // scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
+ // mean ≈ 1.9653
+ expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+ expect(stats.normalizedE2e400?.n).toBe(3);
+ expect(stats.normalizedE2e400?.p90).toBeGreaterThan(0);
+ });
+
+ it('computes KV util + prefix hit rate from the server blob alone', async () => {
+ const stats = await computeAggregateStats({
+ profileBlob: null,
+ serverBlob: makeServerBlob(),
+ });
+ expect(stats.kvCacheUtil?.n).toBe(3);
+ expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6);
+ expect(stats.prefixCacheHitRate?.n).toBe(1);
+ expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6);
+
+ // Profile-derived metrics absent.
+ expect(stats.isl).toBeNull();
+ expect(stats.osl).toBeNull();
+ expect(stats.normalizedSessionTimeS).toBeNull();
+ expect(stats.p90PrefillTpsPerUser).toBeNull();
+ expect(stats.normalizedE2e400).toBeNull();
+ });
+
+ it('tolerates a malformed profile blob by leaving its metrics null', async () => {
+ // A random non-gzip buffer triggers a gunzip error — code path swallows it.
+ const garbage = Buffer.from('not-gzip-data');
+ const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null });
+ expect(stats.isl).toBeNull();
+ expect(stats.osl).toBeNull();
+ expect(stats.normalizedSessionTimeS).toBeNull();
+ expect(stats.p90PrefillTpsPerUser).toBeNull();
+ expect(stats.normalizedE2e400).toBeNull();
+ // Version still set so the row is considered "computed".
+ expect(stats.version).toBe(STATS_VERSION);
+ });
+});
+
+describe('mergeProfileStatsUpgrade', () => {
+ it('updates profile metrics while preserving existing server distributions', async () => {
+ const existing = await computeAggregateStats({
+ profileBlob: null,
+ serverBlob: makeServerBlob(),
+ });
+ const profile = await computeAggregateStats({
+ profileBlob: makeProfileBlob([{ isl: 100, osl: 100, rl: 2080, ttft: 100 }]),
+ serverBlob: null,
+ });
+
+ const merged = mergeProfileStatsUpgrade(existing, profile);
+ expect(merged.version).toBe(STATS_VERSION);
+ expect(merged.isl?.mean).toBe(100);
+ expect(merged.normalizedE2e400?.p90).toBeGreaterThan(0);
+ expect(merged.kvCacheUtil).toEqual(existing.kvCacheUtil);
+ expect(merged.prefixCacheHitRate).toEqual(existing.prefixCacheHitRate);
+ });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
new file mode 100644
index 00000000..cea9361c
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -0,0 +1,149 @@
+/**
+ * Pre-compute the per-row aggregate stats for an `agentic_trace_replay`
+ * blob pair. The output lands in the `aggregate_stats` JSONB column so the
+ * detail page can serve the "Aggregates across configs" view and the
+ * derived chart x-axis modes from a single SQL row read, instead of
+ * parsing the raw blobs on demand.
+ *
+ * Shape is intentionally versioned — bump `STATS_VERSION` whenever the
+ * computation changes so the backfill script knows which rows to recompute.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream';
+import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics';
+import {
+ STATS_VERSION,
+ extractIslOsl,
+ extractServerMetricSamples,
+ percentilesOf,
+ type MetricPercentiles,
+} from '../queries/agentic-aggregates';
+
+export { STATS_VERSION };
+
+export interface AggregateStats {
+ version: number;
+ isl: MetricPercentiles | null;
+ osl: MetricPercentiles | null;
+ kvCacheUtil: MetricPercentiles | null;
+ prefixCacheHitRate: MetricPercentiles | null;
+ /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */
+ normalizedSessionTimeS: number | null;
+ /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
+ p90PrefillTpsPerUser: number | null;
+ /** Per-request normalized E2E distribution at a fixed 400-token OSL. */
+ normalizedE2e400: MetricPercentiles | null;
+}
+
+/**
+ * Upgrade an existing stats bundle when only profile-derived fields changed, without
+ * re-reading the much larger server-metrics blob. `version` and `normalizedE2e400` come
+ * from `profile`; other profile fields fall back to `existing` when null; KV/prefix stats stay.
+ */
+export function mergeProfileStatsUpgrade(
+  existing: Omit<AggregateStats, 'normalizedE2e400'> & {
+    normalizedE2e400?: MetricPercentiles | null;
+  },
+  profile: AggregateStats,
+): AggregateStats {
+  return {
+    ...profile,
+    isl: profile.isl ?? existing.isl,
+    osl: profile.osl ?? existing.osl,
+    normalizedSessionTimeS: profile.normalizedSessionTimeS ?? existing.normalizedSessionTimeS,
+    p90PrefillTpsPerUser: profile.p90PrefillTpsPerUser ?? existing.p90PrefillTpsPerUser,
+    kvCacheUtil: existing.kvCacheUtil,
+    prefixCacheHitRate: existing.prefixCacheHitRate,
+  };
+}
+
+/** Metric subtrees we extract via stream-parse on oversized server blobs. */
+const TARGET_METRIC_KEYS = new Set([
+ 'vllm:kv_cache_usage_perc',
+ 'vllm:gpu_cache_usage_perc',
+ 'vllm:prefix_cache_hits',
+ 'vllm:prefix_cache_queries',
+ 'vllm:gpu_prefix_cache_hits',
+ 'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect just the metric
+ * subtrees we care about. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows.
+ */
+async function streamExtractServer(
+ buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+ const collected = await streamCollectKeys(buffer, 'metrics', TARGET_METRIC_KEYS);
+ return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+/**
+ * Compute the full versioned stats bundle from a gzipped (profile, server-metrics)
+ * blob pair. A blob that is null (e.g. only the server file existed) or fails to
+ * decode leaves its stats null; `version` is always the current STATS_VERSION.
+ */
+export async function computeAggregateStats(args: {
+  profileBlob: Buffer | null;
+  serverBlob: Buffer | null;
+}): Promise<AggregateStats> {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let normalized: number | null = null;
+  let prefillP90: number | null = null;
+  let normalizedE2e400: MetricPercentiles | null = null;
+
+  if (args.profileBlob) {
+    try {
+      const jsonl = gunzipSync(args.profileBlob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+      const derived = computeDerivedFromBlob(jsonl);
+      normalized = derived.normalized_session_time_s;
+      prefillP90 = derived.p90_prefill_tps_per_user;
+      normalizedE2e400 = derived.normalized_e2e_400;
+    } catch {
+      // ignore malformed blob — leave nulls
+    }
+  }
+
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+  if (args.serverBlob) {
+    let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+    try {
+      const json = gunzipSync(args.serverBlob).toString('utf8');
+      server = extractServerMetricSamples(json);
+    } catch (error) {
+      // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to
+      // pull just the metric subtrees we need without materializing the
+      // full 500+ MB JSON string.
+      if (isStringTooLongError(error)) {
+        try {
+          server = await streamExtractServer(args.serverBlob);
+        } catch {
+          // stream fallback failed too — leave nulls
+        }
+      }
+    }
+    if (server) {
+      kvPct = percentilesOf(server.kvCacheUtil);
+      prefixPct = percentilesOf(server.prefixCacheHitRate);
+    }
+  }
+
+  return {
+    version: STATS_VERSION,
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+    normalizedSessionTimeS: normalized,
+    p90PrefillTpsPerUser: prefillP90,
+    normalizedE2e400,
+  };
+}
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
new file mode 100644
index 00000000..3f088cd6
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -0,0 +1,341 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js';
+
+/**
+ * Build a minimal server_metrics_json blob covering the metrics the chart
+ * consumes. Each timeslice is one second long starting at t=0.
+ */
+function makeBlob(opts?: {
+ prefixHits?: number;
+ prefixQueries?: number;
+ promptTokensRate?: number;
+}) {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, end_ns: 1e9, avg: 0.1 },
+ { start_ns: 1e9, end_ns: 2e9, avg: 0.4 },
+ { start_ns: 2e9, end_ns: 3e9, avg: 0.7 },
+ ],
+ },
+ ],
+ },
+ 'vllm:prefix_cache_hits': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }],
+ },
+ 'vllm:prefix_cache_queries': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }],
+ },
+ 'vllm:num_requests_running': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }],
+ },
+ 'vllm:num_requests_waiting': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }],
+ },
+ 'vllm:prompt_tokens': {
+ series: [
+ { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] },
+ ],
+ },
+ 'vllm:generation_tokens': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }],
+ },
+ 'vllm:prompt_tokens_by_source': {
+ series: [
+ {
+ labels: { source: 'local_cache_hit' },
+ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }],
+ },
+ {
+ labels: { source: 'miss' },
+ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }],
+ },
+ ],
+ },
+ },
+ });
+ return gzipSync(Buffer.from(json));
+}
+
+/** Build a synthetic per-engine vLLM metric series for the multi-engine test. */
+function buildEngineSeries(engineId: number, baseRunning: number) {
+ const labels = { engine: String(engineId) };
+ return {
+ runningSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, avg: baseRunning },
+ { start_ns: 1e9, avg: baseRunning + 1 },
+ ],
+ },
+ waitingSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, avg: 0 },
+ { start_ns: 1e9, avg: 0 },
+ ],
+ },
+ kvSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, avg: 0.25 },
+ { start_ns: 1e9, avg: 0.5 },
+ ],
+ },
+ promptSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, rate: 100 },
+ { start_ns: 1e9, rate: 200 },
+ ],
+ },
+ genSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, rate: 50 },
+ { start_ns: 1e9, rate: 75 },
+ ],
+ },
+ };
+}
+
+function buildDynamoSeries(
+ endpoint_url: string,
+ dynamo_component: 'prefill' | 'backend',
+ worker_id: string,
+ value: number,
+ field: 'rate' | 'avg' = 'rate',
+) {
+ return {
+ endpoint_url,
+ labels: { dynamo_component, worker_id, dp_rank: '0', engine: '0' },
+ timeslices: [{ start_ns: 0, end_ns: 1e9, [field]: value }],
+ };
+}
+
+describe('computeChartSeries', () => {
+ it('returns null when the blob is null', async () => {
+ expect(await computeChartSeries(null)).toBeNull();
+ });
+
+ it('returns the current CHART_SERIES_VERSION in the bundle', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.version).toBe(CHART_SERIES_VERSION);
+ });
+
+ it('extracts kvCacheUsage points with t=seconds-from-start', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.kvCacheUsage).toEqual([
+ { t: 0, value: 0.1 },
+ { t: 1, value: 0.4 },
+ { t: 2, value: 0.7 },
+ ]);
+ });
+
+ it('merges warmup_metrics before profiling into one continuous series (v11)', async () => {
+ // warmup scrapes at t=0,1s; profiling scrapes at t=10,11s (own start_ns).
+ const blob = gzipSync(
+ Buffer.from(
+ JSON.stringify({
+ warmup_metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, end_ns: 1e9, avg: 0.2 },
+ { start_ns: 1e9, end_ns: 2e9, avg: 0.3 },
+ ],
+ },
+ ],
+ },
+ },
+ metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 10e9, end_ns: 11e9, avg: 0.8 },
+ { start_ns: 11e9, end_ns: 12e9, avg: 0.9 },
+ ],
+ },
+ ],
+ },
+ },
+ }),
+ ),
+ );
+ const series = await computeChartSeries(blob);
+ // Origin is the earliest (warmup) start_ns, so warmup sits at low t and
+ // profiling follows on the same axis — the frontend slices at the boundary.
+ expect(series?.kvCacheUsage).toEqual([
+ { t: 0, value: 0.2 },
+ { t: 1, value: 0.3 },
+ { t: 10, value: 0.8 },
+ { t: 11, value: 0.9 },
+ ]);
+ });
+
+ it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
+ const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
+ expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
+ });
+
+ it('drops prefixCacheHitRate windows where queries.rate is 0', async () => {
+ const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 }));
+ expect(series?.prefixCacheHitRate).toEqual([]);
+ });
+
+ it('pairs running + waiting into queueDepth points', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]);
+ });
+
+ it('extracts prefillTps + decodeTps from counter rates', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]);
+ expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]);
+ });
+
+ it('splits promptTokensBySource by label and skips empty series', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([
+ 'local_cache_hit',
+ 'miss',
+ ]);
+ expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]);
+ expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]);
+ });
+
+ it('computes timing metadata from the widest metric window', async () => {
+ const series = await computeChartSeries(makeBlob());
+ // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9.
+ expect(series?.startNs).toBe(0);
+ expect(series?.endNs).toBe(3e9);
+ expect(series?.durationS).toBeCloseTo(3, 6);
+ expect(series?.timeslicesCount).toBe(3);
+ });
+
+ it('returns null on a malformed (non-gzip) blob', async () => {
+ const result = await computeChartSeries(Buffer.from('not-gzip-data'));
+ expect(result).toBeNull();
+ });
+
+ it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => {
+ // Simulate a 4-engine deployment: each engine reports its own series for
+ // every metric. Cluster-wide value should be SUM for running/waiting and
+ // counter rates, AVG for kv_cache_usage_perc (per-engine fraction).
+ const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) },
+ 'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) },
+ 'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) },
+ 'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) },
+ 'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) },
+ },
+ });
+ const blob = gzipSync(Buffer.from(json));
+ const cs = await computeChartSeries(blob);
+ expect(cs).not.toBeNull();
+ // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1
+ expect(cs!.queueDepth).toEqual([
+ { t: 0, running: 12, waiting: 0, total: 12 },
+ { t: 1, running: 16, waiting: 0, total: 16 },
+ ]);
+ // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value)
+ expect(cs!.kvCacheUsage).toEqual([
+ { t: 0, value: 0.25 },
+ { t: 1, value: 0.5 },
+ ]);
+ // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800
+ expect(cs!.prefillTps).toEqual([
+ { t: 0, value: 400 },
+ { t: 1, value: 800 },
+ ]);
+ expect(cs!.decodeTps).toEqual([
+ { t: 0, value: 200 },
+ { t: 1, value: 300 },
+ ]);
+ });
+
+ it('uses the Dynamo adapter to preserve workers and canonical prefill/decode roles', async () => {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:prompt_tokens': {
+ series: [
+ buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 100),
+ buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 200),
+ buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 300),
+ ],
+ },
+ 'vllm:generation_tokens': {
+ series: [
+ buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 1),
+ buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 2),
+ buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 400),
+ ],
+ },
+ 'vllm:num_requests_running': {
+ series: [
+ buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 3, 'avg'),
+ buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 4, 'avg'),
+ buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 5, 'avg'),
+ ],
+ },
+ },
+ });
+
+ const blob = gzipSync(Buffer.from(json));
+ const result = await computeChartSeries(blob, {
+ framework: 'dynamo-vllm',
+ disagg: true,
+ });
+
+ expect(result?.metricSources).toHaveLength(3);
+ expect(result?.metricSources.map(({ source: s }) => [s.role, s.workerId, s.engine])).toEqual([
+ ['prefill', 'prefill-b', '0'],
+ ['prefill', 'prefill-a', '0'],
+ ['decode', 'decode-a', '0'],
+ ]);
+ const prefillA = result?.metricSources.find(({ source: s }) => s.workerId === 'prefill-a');
+ const decode = result?.metricSources.find(({ source: s }) => s.role === 'decode');
+ expect(prefillA?.promptTps).toEqual([{ t: 0, value: 100 }]);
+ expect(prefillA?.queueDepth).toEqual([{ t: 0, running: 3, waiting: 0, total: 3 }]);
+ expect(decode?.generationTps).toEqual([{ t: 0, value: 400 }]);
+
+ const nonDisagg = await computeChartSeries(blob, {
+ framework: 'dynamo-vllm',
+ disagg: false,
+ });
+ expect(nonDisagg?.metricSources).toEqual([]);
+ });
+
+ it('does not interpret Dynamo-native labels without selecting the Dynamo adapter', async () => {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:prompt_tokens': {
+ series: [
+ {
+ endpoint_url: '10.30.1.56:7500',
+ labels: { dynamo_component: 'prefill', worker_id: 'prefill-a', engine: '0' },
+ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 100 }],
+ },
+ ],
+ },
+ },
+ });
+
+ const result = await computeChartSeries(gzipSync(Buffer.from(json)), {
+ framework: 'vllm',
+ disagg: true,
+ });
+
+ expect(result?.metricSources).toEqual([]);
+ });
+});
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
new file mode 100644
index 00000000..d140306f
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -0,0 +1,576 @@
+/**
+ * Pre-compute the time-series for the agentic detail page chart, so the
+ * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every
+ * request. The output lands in `agentic_trace_replay.chart_series` and is
+ * read directly by `getTraceServerMetrics`.
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `CHART_SERIES_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream';
+import {
+ selectServerMetricsAdapter,
+ type MetricSource,
+ type ServerMetricsContext,
+} from './server-metrics-adapters';
+
+/**
+ * Bump when the extraction algorithm changes — backfill recomputes anything
+ * older.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP
+ * deployments — most visible as a request-queue-depth chart that maxed out
+ * at ~3 when the timeline clearly showed 20+ in-flight).
+ *
+ * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
+ * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
+ *
+ * v4: extract sglang:* metrics too (fallback chain in each picker), so
+ * SGLang runs populate the chart_series the same way vllm runs do.
+ *
+ * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
+ * into promptTokensBySource so the cumulative prompt-token-source-breakdown
+ * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
+ *
+ * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source
+ * breakdown from sglang:cached_tokens — current runs always have one
+ * cache_source ("device" / HBM) but hicache (CPU offload) runs would
+ * split into "device" + "host" automatically once ingested.
+ *
+ * v7: extract sglang:hicache_host_{used,total}_tokens into a new
+ * hostKvCacheUsage series so the KV cache utilization chart can plot
+ * the CPU offload pool's usage alongside the on-GPU HBM line.
+ *
+ * v8: keep the per-engine dimension on kv_cache_usage_perc as
+ * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average
+ * line hides load skew on DEP configs; the detail page overlays the
+ * per-rank lines so a hot rank is visible at a glance.
+ *
+ * v9: retain orchestrator-normalized per-source series. Dynamo labels are
+ * mapped to canonical router/prefill/decode roles, allowing the frontend to
+ * inspect individual workers without interpreting Dynamo-native labels.
+ *
+ * v10: only emit per-source series for disaggregated configs with a recognized
+ * orchestrator adapter. Non-disaggregated and unsupported configs retain the
+ * existing aggregate-only behavior.
+ *
+ * v12: also consume the `warmup_metrics` block from the server-metrics blob and
+ * merge its scrapes into the same series as the profiling `metrics` block.
+ * Warmup and profiling timeslices carry their own absolute `start_ns` and never
+ * overlap in time, so the merged series is continuous (warmup at lower t,
+ * profiling after). This lets the agentic detail page slice `chart_series` into
+ * warmup vs profiling at the request-derived boundary; older blobs without a
+ * warmup block are unaffected. (v11 was a short-lived, since-reverted attempt to
+ * carry kvCachePoolTokens in chart_series; that value now lives in
+ * benchmark_results.metrics, derived from the server log — unrelated to this.)
+ */
+export const CHART_SERIES_VERSION = 12; // stamped into `version` of every ChartSeries built here
+
+export interface TimeSeriesPoint {
+ /** Seconds from benchmark start. */
+ t: number;
+ value: number; // sample at `t`; meaning depends on the series (fraction, tokens/sec, ...)
+}
+
+export interface QueueDepthPoint {
+ t: number; // seconds on the same axis as TimeSeriesPoint.t
+ running: number; // running-requests gauge, summed across series for this timeslice
+ waiting: number; // waiting/queued-requests gauge, summed across series for this timeslice
+ total: number; // running + waiting
+}
+
+export interface ChartSeries {
+  version: number; // CHART_SERIES_VERSION at the time the row was computed
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[]; // KV cache usage gauge, averaged across series
+  prefixCacheHitRate: TimeSeriesPoint[]; // hits rate / queries rate per timeslice
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>; // keyed by source label
+  prefillTps: TimeSeriesPoint[]; // prompt-token counter rate, summed across series
+  decodeTps: TimeSeriesPoint[]; // generation-token counter rate, summed across series
+  /**
+   * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across
+   * engines. Detail page derives "cumulative unique input tokens" as
+   * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually
+   * saved vs the raw queries that came in.
+   */
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  /**
+   * Host (CPU offload) KV cache utilization, 0..1. Only populated for
+   * SGLang hicache runs (derived as hicache_host_used / hicache_host_total).
+   * Frontend overlays this on the KV cache util chart as a second line.
+   */
+  hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization (0..1 each). One entry per engine
+   * series found in the raw metric, ordered by the `engine` label when
+   * present and by series-array index otherwise. Empty for single-engine
+   * deployments — the average `kvCacheUsage` line covers that case alone.
+   * The detail page overlays these on the same chart so DEP load skew is
+   * visible without changing the headline number.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * The same metrics grouped by normalized server source. Existing aggregate
+   * fields above remain the default and preserve compatibility with old rows.
+   */
+  metricSources: MetricSourceSeries[];
+}
+
+export interface MetricSourceSeries {
+  source: MetricSource; // normalized identity of the server source these series belong to
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>; // keyed by source label
+  /** Raw prompt-token counter rate for this source. */
+  promptTps: TimeSeriesPoint[];
+  /** Raw generation-token counter rate for this source. */
+  generationTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+// ── Raw blob shapes (subset we read) ────────────────────────────────────
+
+interface RawSlice {
+ start_ns?: number; // window start in ns; the key series are joined on
+ end_ns?: number; // window end in ns
+ avg?: number; // field read for gauge metrics
+ rate?: number; // field read for counter metrics (treated as already per-second)
+}
+
+interface RawSeries {
+  endpoint_url?: string; // scrape endpoint the series came from, when reported
+  labels?: Record<string, string>; // metric labels, e.g. engine / mode / cache_source
+  timeslices?: RawSlice[]; // per-window samples of this series
+}
+
+interface RawMetric {
+ series?: RawSeries[]; // all series reported under one metric name
+}
+
+type MetricsMap = Record<string, RawMetric>; // metric name (e.g. 'vllm:prompt_tokens') → raw metric
+
+/**
+ * Names of the metric subtrees the chart reads. Both the vllm:* and the
+ * sglang:* spellings are listed so the stream-parse fallback collects whichever
+ * framework produced the blob; `buildSeriesFromMetrics` chooses per metric.
+ */
+const CHART_METRIC_KEYS = new Set<string>([
+  // vLLM metric names
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:num_requests_running',
+  'vllm:num_requests_waiting',
+  'vllm:prompt_tokens',
+  'vllm:generation_tokens',
+  'vllm:prompt_tokens_by_source',
+  // SGLang metric names
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
+  'sglang:generation_tokens',
+  'sglang:num_running_reqs',
+  'sglang:num_queue_reqs',
+  'sglang:realtime_tokens',
+  'sglang:hicache_host_used_tokens',
+  'sglang:hicache_host_total_tokens',
+]);
+
+/**
+ * Combine the warmup-phase metric map with the profiling-phase one. For each
+ * metric name on either side the `series` arrays are concatenated, profiling
+ * first. If one side has no metrics at all, the other map is returned as-is,
+ * which is what happens for older blobs that carry no warmup block.
+ */
+function mergePhaseMetrics(profiling: MetricsMap, warmup: MetricsMap): MetricsMap {
+  const profilingNames = Object.keys(profiling);
+  const warmupNames = Object.keys(warmup);
+  if (warmupNames.length === 0) return profiling;
+  if (profilingNames.length === 0) return warmup;
+  const merged: MetricsMap = {};
+  for (const metricName of new Set(profilingNames.concat(warmupNames))) {
+    const combined = (profiling[metricName]?.series ?? []).concat(warmup[metricName]?.series ?? []);
+    merged[metricName] = { series: combined };
+  }
+  return merged;
+}
+
+/**
+ * Stream-parse fallback: collect the chart's metric subtrees from both phase
+ * blocks and merge (see v12). Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows.
+ */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  const [profiling, warmup] = await Promise.all([
+    streamCollectKeys(buffer, 'metrics', CHART_METRIC_KEYS),
+    streamCollectKeys(buffer, 'warmup_metrics', CHART_METRIC_KEYS),
+  ]);
+  return mergePhaseMetrics(profiling, warmup);
+}
+
+/**
+ * Parse the gzipped server_metrics blob into the metric map. Tries the
+ * synchronous fast path first; falls back to stream-parse on
+ * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. Merges the warmup block
+ * into the profiling one (v12) so the series span both phases.
+ */
+async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
+  try {
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as {
+      metrics?: MetricsMap;
+      warmup_metrics?: MetricsMap;
+    };
+    return mergePhaseMetrics(obj.metrics ?? {}, obj.warmup_metrics ?? {});
+  } catch (error) {
+    if (isStringTooLongError(error)) return await streamCollectMetrics(buffer);
+    throw error;
+  }
+}
+
+/**
+ * Build chart-ready time-series arrays from a gzipped server_metrics blob;
+ * resolves to null for a null blob or one that fails to parse. The math mirrors
+ * `getTraceServerMetrics` so ingest, backfill, and the API path agree.
+ */
+export async function computeChartSeries(
+  blob: Buffer | null,
+  context: ServerMetricsContext = {},
+): Promise<ChartSeries | null> {
+  if (!blob) return null;
+  let metrics: MetricsMap;
+  try {
+    metrics = await parseMetrics(blob);
+  } catch {
+    // Malformed blob → no series (caller treats null as "no data").
+    return null;
+  }
+  return buildSeriesFromMetrics(metrics, context);
+}
+
+/**
+ * Aggregate one timeslice field across all series of a metric, keyed by
+ * `start_ns`. Multi-engine deployments report one series per engine: the
+ * cluster value is the sum (running/waiting/throughput) or the mean (per-engine
+ * fractions such as kv_cache_usage_perc). Non-numeric samples are skipped.
+ */
+function aggregateByStart(
+  series: readonly RawSeries[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of series ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
+/** Stable order: emit one point per unique start_ns, chronologically. */
+function sortedEntries(m: Map<number, number>): [number, number][] {
+  return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
+}
+
+/**
+ * Turn a metric map into chart-ready series. `t` is seconds from
+ * `originStartNs` when given, else from the earliest `start_ns` found here.
+ * When `includeMetricSources` is set, disaggregated runs with a non-generic
+ * adapter also get a per-source breakdown, built by one recursive call each.
+ */
+function buildSeriesFromMetrics(
+  metrics: MetricsMap,
+  context: ServerMetricsContext,
+  includeMetricSources = true,
+  originStartNs?: number,
+): ChartSeries {
+  // Timing reference: smallest start_ns and largest end_ns over every series
+  // (first/last slice of each). timeslicesCount is the longest single series,
+  // which is safe because engines are scraped on the same cadence.
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - (originStartNs ?? startNs)) / 1e9;
+
+  // Pick the first metric name whose series array has any data; fallback
+  // chain lets the same code path serve both vllm:* and sglang:* blobs.
+  const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => {
+    for (const name of names) {
+      const s = metrics[name]?.series;
+      if (s && s.length > 0) return s;
+    }
+    return undefined;
+  };
+
+  // KV cache usage (gauge, 0..1) — average across engines so the value
+  // stays a fraction (each engine has its own KV pool).
+  const kvSeries = pickSeries(
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
+  const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
+    aggregateByStart(kvSeries, 'avg', 'avg'),
+  ).map(([t, v]) => ({ t: tOf(t), value: v }));
+  // Per-engine breakdown of the same metric, emitted only when there is more
+  // than one series. NOTE(review): a merged warmup block contributes its own
+  // series per engine, so labels may repeat here — confirm this is intended.
+  const kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[] = [];
+  if (kvSeries && kvSeries.length > 1) {
+    // Order by numeric engine label when present, else by series-array index.
+    const decorated = kvSeries.map((s, idx) => {
+      const raw =
+        s.labels?.['engine'] ?? s.labels?.['engine_idx'] ?? s.labels?.['dp_rank'] ?? String(idx);
+      const numeric = Number(raw);
+      return { series: s, idx, label: raw, sortKey: Number.isFinite(numeric) ? numeric : idx };
+    });
+    decorated.sort((a, b) => a.sortKey - b.sortKey);
+    for (const { series, label } of decorated) {
+      const pts: TimeSeriesPoint[] = [];
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.start_ns !== 'number' || typeof ts.avg !== 'number') continue;
+        if (!Number.isFinite(ts.avg)) continue;
+        pts.push({ t: tOf(ts.start_ns), value: ts.avg });
+      }
+      if (pts.length > 0) kvCacheUsageByEngine.push({ engineLabel: label, points: pts });
+    }
+  }
+
+  // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
+  // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
+  const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+  const qsSeries = pickSeries(
+    'vllm:prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
+  const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum');
+  const qsByT = aggregateByStart(qsSeries, 'rate', 'sum');
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  for (const [t, h] of sortedEntries(hitsByT)) {
+    const q = qsByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q });
+  }
+
+  // Queue depth: sum running + waiting across engines per timeslice.
+  const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs');
+  const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs');
+  const runByT = aggregateByStart(runSeries, 'avg', 'sum');
+  const waitByT = aggregateByStart(waitSeries, 'avg', 'sum');
+  const queueDepth: QueueDepthPoint[] = [];
+  // Union of timestamps: a tick missing from one gauge still yields a point.
+  const allTimes = new Set([...runByT.keys(), ...waitByT.keys()]);
+  for (const t of [...allTimes].toSorted((a, b) => a - b)) {
+    const running = runByT.get(t) ?? 0;
+    const waiting = waitByT.get(t) ?? 0;
+    queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting });
+  }
+
+  // Throughput: sum the per-second counter `rate` across engines (vllm:* or sglang:*).
+  const counterRate = (...names: string[]): TimeSeriesPoint[] => {
+    const s = pickSeries(...names);
+    return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({
+      t: tOf(t),
+      value: v,
+    }));
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens');
+  // Tokens served from prefix cache per scrape. Lets the frontend derive
+  // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+
+  // SGLang hicache: host-pool KV utilization as used/total per timeslice. Both
+  // are gauges in absolute tokens; total is the pool size, used fluctuates.
+  const hostUsedByT = aggregateByStart(
+    metrics['sglang:hicache_host_used_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostTotalByT = aggregateByStart(
+    metrics['sglang:hicache_host_total_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostKvCacheUsage: TimeSeriesPoint[] = [];
+  for (const [t, used] of sortedEntries(hostUsedByT)) {
+    const total = hostTotalByT.get(t);
+    if (total !== undefined && total > 0) {
+      hostKvCacheUsage.push({ t: tOf(t), value: used / total });
+    }
+  }
+
+  // Per-source prompt tokens — sum across engines per source label.
+  // vllm: vllm:prompt_tokens_by_source has one series per source label
+  // (local_cache_hit, external_cache_hit, miss, ...). Use the
+  // `source`/`reason`/`kind` label as the breakdown key.
+  // sglang: sglang:realtime_tokens uses a `mode` label with values
+  // {prefill_cache, prefill_compute, decode}. Filter to prefill_*
+  // since decode isn't prompt-token volume.
+  const promptBySrcByT = new Map<string, Map<number, number>>();
+  // Sum a series' per-scrape rates into the bucket for `label`. The bucket is
+  // created even when the series has no valid timeslices — the SGLang fallback
+  // below is gated on `promptBySrcByT.size === 0`, so an empty vllm breakdown
+  // must still suppress it.
+  const addSeriesRates = (label: string, series: RawSeries): void => {
+    let byT = promptBySrcByT.get(label);
+    if (!byT) {
+      byT = new Map<number, number>();
+      promptBySrcByT.set(label, byT);
+    }
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+      }
+    }
+  };
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    addSeriesRates(source, series);
+  }
+  // SGLang fallback: only consider when the vllm metric wasn't found.
+  // - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]`
+  // - Cache hits, split by tier: per-series `sglang:cached_tokens`, each series
+  //   carrying a `cache_source` label ("device" = HBM, "host" = CPU offload via
+  //   hicache); extra tiers show up as extra lines once such runs are ingested.
+  if (promptBySrcByT.size === 0) {
+    for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const mode = labels['mode'] ?? 'unknown';
+      // Only carry the cache-miss line over — cache hits come from
+      // sglang:cached_tokens broken out by cache_source below, so we'd
+      // double-count if we kept `prefill_cache` here too.
+      if (mode !== 'prefill_compute') continue;
+      addSeriesRates('compute (miss)', series);
+    }
+    // Cache hits broken out per cache_source. Strip the noisy "total" label
+    // (older sglang versions emit a single un-broken-out series labelled
+    // total — show that as just "cache hit").
+    for (const series of metrics['sglang:cached_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const src = labels['cache_source'] ?? 'cache hit';
+      const label =
+        src === 'device'
+          ? 'cache hit (HBM)'
+          : src === 'host'
+            ? 'cache hit (CPU offload)'
+            : src === 'total'
+              ? 'cache hit'
+              : `cache hit (${src})`;
+      addSeriesRates(label, series);
+    }
+  }
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const [source, byT] of promptBySrcByT) {
+    const arr: TimeSeriesPoint[] = [];
+    for (const [t, v] of sortedEntries(byT)) {
+      if (v > 0) arr.push({ t: tOf(t), value: v });
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  const metricSources: MetricSourceSeries[] = [];
+  const adapter = selectServerMetricsAdapter(context);
+  if (includeMetricSources && context.disagg && adapter.id !== 'generic') {
+    const grouped = new Map<string, { source: MetricSource; metrics: MetricsMap }>();
+    for (const [metricName, metric] of Object.entries(metrics)) {
+      for (const series of metric.series ?? []) {
+        const source = adapter.identifySource(series);
+        let group = grouped.get(source.id);
+        if (!group) {
+          group = { source, metrics: {} };
+          grouped.set(source.id, group);
+        }
+        const groupedMetric = (group.metrics[metricName] ??= { series: [] });
+        groupedMetric.series!.push(series);
+      }
+    }
+    for (const { source, metrics: sourceMetrics } of grouped.values()) {
+      const sourceSeries = buildSeriesFromMetrics(
+        sourceMetrics,
+        context,
+        false,
+        originStartNs ?? startNs,
+      );
+      metricSources.push({
+        source,
+        kvCacheUsage: sourceSeries.kvCacheUsage,
+        prefixCacheHitRate: sourceSeries.prefixCacheHitRate,
+        queueDepth: sourceSeries.queueDepth,
+        promptTokensBySource: sourceSeries.promptTokensBySource,
+        promptTps: sourceSeries.prefillTps,
+        generationTps: sourceSeries.decodeTps,
+        prefixCacheHitsTps: sourceSeries.prefixCacheHitsTps,
+        hostKvCacheUsage: sourceSeries.hostKvCacheUsage,
+        kvCacheUsageByEngine: sourceSeries.kvCacheUsageByEngine,
+      });
+    }
+    const roleOrder: Record<MetricSource['role'], number> = {
+      router: 0,
+      prefill: 1,
+      decode: 2,
+      combined: 3,
+      unknown: 4,
+    };
+    metricSources.sort(
+      (a, b) =>
+        roleOrder[a.source.role] - roleOrder[b.source.role] ||
+        (a.source.endpointUrl ?? '').localeCompare(b.source.endpointUrl ?? '') ||
+        a.source.id.localeCompare(b.source.id),
+    );
+  }
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+    prefixCacheHitsTps,
+    hostKvCacheUsage,
+    kvCacheUsageByEngine,
+    metricSources,
+  };
+}
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
new file mode 100644
index 00000000..1ad9e63b
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -0,0 +1,210 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js';
+
+interface SyntheticRequest {
+ cid: string; // -> metadata.conversation_id
+ ti: number; // -> metadata.turn_index
+ srcTrace?: string; // -> metadata.source_trace_id (omitted when undefined)
+ srcOuter?: number; // -> metadata.source_outer_idx (omitted when undefined)
+ srcInner?: number; // -> metadata.source_inner_idx (omitted when undefined)
+ srcKind?: string; // -> metadata.source_kind (omitted when undefined)
+ wid?: string; // -> metadata.worker_id, default 'worker_default'
+ ad?: number; // -> metadata.agent_depth, default 0
+ phase?: string; // -> metadata.benchmark_phase, default 'profiling'
+ credit: number; // -> metadata.credit_issued_ns
+ start: number; // -> metadata.request_start_ns
+ end: number; // -> metadata.request_end_ns
+ ack?: number | null; // -> metadata.request_ack_ns (omitted when undefined, null kept)
+ ttftMs?: number | null; // -> metrics.time_to_first_token; null emits null, undefined emits 50
+ tpotMs?: number | null; // -> metrics[tpotKey]; null emits null, undefined emits 10
+ tpotKey?: 'inter_token_latency' | 'time_per_output_token'; // default 'inter_token_latency'
+ isl?: number | null; // -> metrics.input_sequence_length, default 100
+ osl?: number | null; // -> metrics.output_sequence_length, default 10
+ cancelled?: boolean; // -> metadata.was_cancelled, default false
+}
+
+/** Gzip a JSONL blob holding one replay record per synthetic request. */
+function makeBlob(requests: SyntheticRequest[]) {
+  const latency = (ms: number | null | undefined, fallback: number) =>
+    ms === null ? null : { value: ms ?? fallback, unit: 'ms' };
+  const toRecord = (req: SyntheticRequest) => ({
+    metadata: {
+      conversation_id: req.cid,
+      turn_index: req.ti,
+      ...(req.srcTrace !== undefined ? { source_trace_id: req.srcTrace } : {}),
+      ...(req.srcOuter !== undefined ? { source_outer_idx: req.srcOuter } : {}),
+      ...(req.srcInner !== undefined ? { source_inner_idx: req.srcInner } : {}),
+      ...(req.srcKind !== undefined ? { source_kind: req.srcKind } : {}),
+      worker_id: req.wid ?? 'worker_default',
+      agent_depth: req.ad ?? 0,
+      benchmark_phase: req.phase ?? 'profiling',
+      credit_issued_ns: req.credit,
+      request_start_ns: req.start,
+      ...(req.ack !== undefined ? { request_ack_ns: req.ack } : {}),
+      request_end_ns: req.end,
+      was_cancelled: req.cancelled ?? false,
+    },
+    metrics: {
+      time_to_first_token: latency(req.ttftMs, 50),
+      [req.tpotKey ?? 'inter_token_latency']: latency(req.tpotMs, 10),
+      input_sequence_length: { value: req.isl ?? 100, unit: 'tokens' },
+      output_sequence_length: { value: req.osl ?? 10, unit: 'tokens' },
+    },
+  });
+  return gzipSync(Buffer.from(requests.map((req) => JSON.stringify(toRecord(req))).join('\n')));
+}
+
+describe('computeRequestTimeline', () => {
+ it('returns null when the blob is null', () => {
+ expect(computeRequestTimeline(null)).toBeNull();
+ });
+
+ it('returns null on a malformed (non-gzip) blob', () => {
+ expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull();
+ });
+
+ it('returns null when the blob has no parseable records', () => {
+ expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull();
+ });
+
+ it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]),
+ );
+ expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION);
+ });
+
+ it('shifts ns timestamps to be relative to the earliest credit_issued', () => {
+ // Two requests with absolute ns timestamps; the earliest credit is 1_000_000_000.
+ const tl = computeRequestTimeline(
+ makeBlob([
+ { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 },
+ { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 },
+ ]),
+ );
+ expect(tl?.startNs).toBe(1_000_000_000);
+ expect(tl?.endNs).toBe(1_030_000_000);
+ expect(tl?.durationS).toBeCloseTo(0.03, 6); // (1_030_000_000 - 1_000_000_000) ns
+ expect(tl?.requests[0]?.credit).toBe(0); // per-request fields are offsets from startNs
+ expect(tl?.requests[0]?.end).toBe(10_000_000);
+ expect(tl?.requests[1]?.start).toBe(21_000_000);
+ });
+
+ it('sorts requests by start time, regardless of input order', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 },
+ { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 },
+ { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 },
+ ]),
+ );
+ expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]); // input order was 50, 10, 90
+ });
+
+ it('preserves conversation/worker grouping fields', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ {
+ cid: 'conv-A',
+ ti: 5,
+ wid: 'worker_abcd1234',
+ ad: 2,
+ phase: 'profiling',
+ credit: 0,
+ start: 10,
+ end: 100,
+ },
+ ]),
+ );
+ const r = tl?.requests[0]!;
+ expect(r.cid).toBe('conv-A');
+ expect(r.ti).toBe(5);
+ expect(r.wid).toBe('worker_abcd1234');
+ expect(r.ad).toBe(2);
+ expect(r.phase).toBe('profiling');
+ });
+
+ it('preserves raw source provenance fields when present', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ {
+ cid: 'trace::fa:003',
+ ti: 3,
+ srcTrace: 'trace',
+ srcOuter: 204,
+ srcInner: 16,
+ srcKind: 'weka_flat',
+ credit: 0,
+ start: 10,
+ end: 100,
+ },
+ ]),
+ );
+ expect(tl?.requests[0]).toMatchObject({
+ cid: 'trace::fa:003',
+ ti: 3,
+ srcTrace: 'trace',
+ srcOuter: 204,
+ srcInner: 16,
+ srcKind: 'weka_flat',
+ });
+ });
+
+ it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ {
+ cid: 'a',
+ ti: 0,
+ credit: 0,
+ start: 10,
+ end: 100,
+ ttftMs: 25.5,
+ tpotMs: 12.5,
+ isl: 1024,
+ osl: 256,
+ cancelled: true,
+ },
+ ]),
+ );
+ const r = tl?.requests[0]!;
+ expect(r.cancelled).toBe(true);
+ expect(r.ttftMs).toBeCloseTo(25.5, 6);
+ expect(r.tpotMs).toBeCloseTo(12.5, 6);
+ expect(r.isl).toBe(1024);
+ expect(r.osl).toBe(256);
+ });
+
+ it('accepts time_per_output_token as a TPOT alias', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ {
+ cid: 'a',
+ ti: 0,
+ credit: 0,
+ start: 10,
+ end: 100,
+ tpotMs: 8.25,
+ tpotKey: 'time_per_output_token',
+ },
+ ]),
+ );
+ expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6);
+ });
+
+ it('skips records missing both credit_issued_ns and request_start_ns', () => {
+ // Hand-built record carrying only request_end_ns; the only record is skipped, so the result is null.
+ const broken = gzipSync(
+ Buffer.from(
+ JSON.stringify({
+ metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 },
+ metrics: {},
+ }),
+ ),
+ );
+ expect(computeRequestTimeline(broken)).toBeNull();
+ });
+});
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
new file mode 100644
index 00000000..2cbe5174
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -0,0 +1,208 @@
+/**
+ * Pre-compute the per-request timeline for the agentic detail page's
+ * Gantt view. Output lands in `agentic_trace_replay.request_timeline`
+ * and is read directly by the timeline API route.
+ *
+ * Shape is a thin array — ~150 bytes per request × ~200 requests per
+ * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw
+ * gzipped JSONL blob (~1-3 MB).
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const REQUEST_TIMELINE_VERSION = 5;
+
+/**
+ * One entry of `RequestTimeline.requests`, built by `computeRequestTimeline`
+ * from a single raw JSONL record. All time offsets are nanoseconds relative
+ * to `RequestTimeline.startNs`.
+ */
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session); `'unknown'` when the raw record has none. */
+  cid: string;
+  /** Zero-based turn index within the conversation; 0 when the raw value is not a number. */
+  ti: number;
+  /** Source trace id from the original raw dataset, when distinct from replay cid. */
+  srcTrace?: string;
+  /** Original raw top-level request index within srcTrace. */
+  srcOuter?: number;
+  /** Original nested request index within srcOuter, for subagent children. */
+  srcInner?: number;
+  /** Loader-specific source kind, e.g. weka_main or weka_flat. */
+  srcKind?: string;
+  /** Worker id (concurrency slot that handled this request); `'unknown'` when absent. */
+  wid: string;
+  /** Sub-agent depth (0 = top-level, also used when the raw value is not a number). */
+  ad: number;
+  /** `warmup` or `profiling`; `'unknown'` when the raw record has no phase. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  /** Time-to-first-token in ms. */
+  ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
+  /** Input sequence length (tokens). */
+  isl: number | null;
+  /** Output sequence length (tokens). */
+  osl: number | null;
+  /** True only when the raw record's `was_cancelled` flag is exactly `true`. */
+  cancelled: boolean;
+}
+
+/** Result of `computeRequestTimeline`: the time origin plus every kept request. */
+export interface RequestTimeline {
+  /** Value of `REQUEST_TIMELINE_VERSION` at the time this timeline was computed. */
+  version: number;
+  /** Wall-clock ns of the earliest event (used as the relative-time origin). */
+  startNs: number;
+  /** Wall-clock ns of the latest `request_end_ns`. */
+  endNs: number;
+  /** Total span in seconds (0 when `endNs` is not after `startNs`). */
+  durationS: number;
+  /** Kept requests, sorted by their `start` offset. */
+  requests: RequestRecord[];
+}
+
+/** Keys read from the `metadata` object of one raw JSONL record. All optional. */
+interface RawMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  source_trace_id?: string;
+  source_outer_idx?: number;
+  source_inner_idx?: number;
+  source_kind?: string;
+  worker_id?: string;
+  agent_depth?: number;
+  benchmark_phase?: string;
+  credit_issued_ns?: number;
+  request_start_ns?: number;
+  request_ack_ns?: number;
+  request_end_ns?: number;
+  was_cancelled?: boolean;
+}
+
+/** Envelope form of a metric; the number lives under `value`. */
+interface RawMetricValue {
+  value?: number;
+}
+
+/** One parsed JSONL line. Each metric may be an envelope or a bare number. */
+interface RawRecord {
+  metadata?: RawMetadata;
+  metrics?: {
+    time_to_first_token?: RawMetricValue | number;
+    time_per_output_token?: RawMetricValue | number;
+    // Used as the TPOT fallback when `time_per_output_token` is absent.
+    inter_token_latency?: RawMetricValue | number;
+    input_sequence_length?: RawMetricValue | number;
+    output_sequence_length?: RawMetricValue | number;
+  };
+}
+
+/**
+ * Extract a finite number from a metric that is either a bare number or an
+ * object carrying it under `value`. Anything else yields `undefined`.
+ */
+function readNum(v: unknown): number | undefined {
+  // Unwrap the envelope when there is one; otherwise inspect the value itself.
+  const candidate: unknown =
+    typeof v === 'object' && v !== null && 'value' in v ? (v as { value?: unknown }).value : v;
+  return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : undefined;
+}
+
+/**
+ * Parse the gzipped `profile_export.jsonl` blob into a chart-ready
+ * timeline.
+ *
+ * Each non-empty line is parsed as one JSON record. A line is skipped when it
+ * is not valid JSON, does not parse to an object (for example a literal
+ * `null`), or lacks a numeric start (`credit_issued_ns`, falling back to
+ * `request_start_ns`) or a numeric `request_end_ns`.
+ *
+ * @param blob - Gzipped JSONL bytes, or null when no blob is available.
+ * @returns The timeline with requests sorted by `start`, or null when the
+ *   blob is missing, cannot be gunzipped, or yields no usable record.
+ */
+export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null {
+  if (!blob) return null;
+  let text: string;
+  try {
+    text = gunzipSync(blob).toString('utf8');
+  } catch {
+    return null;
+  }
+
+  // First pass: parse + collect raw turns; find timeline origin.
+  const raw: {
+    meta: RawMetadata;
+    ttftMs: number | null;
+    tpotMs: number | null;
+    isl: number | null;
+    osl: number | null;
+  }[] = [];
+  let originNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+
+  for (const line of text.split('\n')) {
+    if (!line) continue;
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    // JSON.parse also succeeds on scalars and `null`. Only objects can carry
+    // metadata, and dereferencing `null` would throw and abort the whole
+    // timeline instead of skipping the one bad line.
+    if (typeof parsed !== 'object' || parsed === null) continue;
+    const rec = parsed as RawRecord;
+    const meta = rec.metadata ?? {};
+    // Use credit_issued_ns when available (the true start of the request's
+    // lifecycle), falling back to request_start_ns. Skip rows missing both.
+    const cStart = meta.credit_issued_ns ?? meta.request_start_ns;
+    const cEnd = meta.request_end_ns;
+    if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue;
+
+    if (cStart < originNs) originNs = cStart;
+    if (cEnd > endNs) endNs = cEnd;
+
+    raw.push({
+      meta,
+      ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null,
+      // TPOT prefers time_per_output_token and falls back to inter_token_latency.
+      tpotMs:
+        readNum(rec.metrics?.time_per_output_token) ??
+        readNum(rec.metrics?.inter_token_latency) ??
+        null,
+      isl: readNum(rec.metrics?.input_sequence_length) ?? null,
+      osl: readNum(rec.metrics?.output_sequence_length) ?? null,
+    });
+  }
+
+  if (raw.length === 0) return null;
+  // Only reachable when every kept start was non-finite; keeps offsets finite.
+  if (!Number.isFinite(originNs)) originNs = 0;
+
+  // Second pass: shift timestamps to be relative to originNs (smaller
+  // numbers fit in JSON nicely and the frontend doesn't need bigint math).
+  // NOTE(review): if these are epoch nanoseconds they exceed
+  // Number.MAX_SAFE_INTEGER, so JSON.parse has already rounded them — confirm
+  // the resulting sub-microsecond error is acceptable.
+  const requests: RequestRecord[] = [];
+  for (const r of raw) {
+    const m = r.meta;
+    const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs;
+    const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs;
+    const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null;
+    const end = (m.request_end_ns ?? originNs) - originNs;
+    requests.push({
+      cid: m.conversation_id ?? 'unknown',
+      ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined,
+      srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined,
+      srcInner: typeof m.source_inner_idx === 'number' ? m.source_inner_idx : undefined,
+      srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined,
+      wid: m.worker_id ?? 'unknown',
+      ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
+      phase: m.benchmark_phase ?? 'unknown',
+      credit,
+      start,
+      ack,
+      end,
+      ttftMs: r.ttftMs,
+      tpotMs: r.tpotMs,
+      isl: r.isl,
+      osl: r.osl,
+      cancelled: m.was_cancelled === true,
+    });
+  }
+
+  // Stable order so backfill output is deterministic.
+  requests.sort((a, b) => a.start - b.start);
+
+  return {
+    version: REQUEST_TIMELINE_VERSION,
+    startNs: originNs,
+    endNs,
+    durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0,
+    requests,
+  };
+}
diff --git a/packages/db/src/etl/dataset-provenance.test.ts b/packages/db/src/etl/dataset-provenance.test.ts
new file mode 100644
index 00000000..4022546e
--- /dev/null
+++ b/packages/db/src/etl/dataset-provenance.test.ts
@@ -0,0 +1,40 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetSlugFromBenchmarkRow } from './dataset-provenance';
+
+// Covers `datasetSlugFromBenchmarkRow`: the slug is the last path segment of
+// `dataset.hf_dataset_name`, and only rows tagged `public_dataset` resolve.
+describe('datasetSlugFromBenchmarkRow', () => {
+  it('maps aiperf public-dataset provenance to the dashboard dataset slug', () => {
+    expect(
+      datasetSlugFromBenchmarkRow({
+        dataset: {
+          source_type: 'public_dataset',
+          loader: 'semianalysis_cc_traces_weka_with_subagents',
+          hf_dataset_name: 'semianalysisai/cc-traces-weka-062126',
+          hf_split: 'train',
+          num_dataset_entries: 393,
+        },
+      }),
+    ).toBe('cc-traces-weka-062126');
+  });
+
+  it('supports an unnamespaced Hugging Face dataset id', () => {
+    expect(
+      datasetSlugFromBenchmarkRow({
+        dataset: {
+          source_type: 'public_dataset',
+          hf_dataset_name: 'cc-traces-weka-062126',
+        },
+      }),
+    ).toBe('cc-traces-weka-062126');
+  });
+
+  // Each row below lacks one precondition: no dataset, wrong source type, or
+  // an empty / missing dataset name.
+  it.each([
+    {},
+    { dataset: null },
+    { dataset: { source_type: 'synthetic', hf_dataset_name: 'owner/data' } },
+    { dataset: { source_type: 'public_dataset', hf_dataset_name: '' } },
+    { dataset: { source_type: 'public_dataset' } },
+  ])('ignores rows without usable public-dataset provenance: %j', (row) => {
+    expect(datasetSlugFromBenchmarkRow(row)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/dataset-provenance.ts b/packages/db/src/etl/dataset-provenance.ts
new file mode 100644
index 00000000..f0d7cd0d
--- /dev/null
+++ b/packages/db/src/etl/dataset-provenance.ts
@@ -0,0 +1,32 @@
+const TRAILING_SLASHES = /\/+$/u;
+
+/**
+ * Dataset provenance emitted by aiperf and preserved in agentic benchmark rows.
+ *
+ * Every field is `unknown` because the object arrives untyped; readers must
+ * narrow before use.
+ */
+export interface DatasetProvenance {
+  /** Only the value `'public_dataset'` resolves to a slug. */
+  source_type?: unknown;
+  loader?: unknown;
+  /** Hugging Face dataset id, optionally namespaced as `owner/name`. */
+  hf_dataset_name?: unknown;
+  hf_split?: unknown;
+  hf_subset?: unknown;
+  num_dataset_entries?: unknown;
+}
+
+/**
+ * Resolve the dashboard dataset slug from a benchmark row's provenance.
+ *
+ * Dataset ingest uses the final path component of the Hugging Face dataset id
+ * as `datasets.slug`, so `semianalysisai/cc-traces-weka-062126` maps to
+ * `cc-traces-weka-062126` here as well.
+ *
+ * @param row - Untyped benchmark row; only its `dataset` property is read.
+ * @returns The slug, or null when `dataset` is not a plain object, its
+ *   `source_type` is not `'public_dataset'`, or `hf_dataset_name` is not a
+ *   non-empty string after trimming whitespace and trailing slashes.
+ */
+export function datasetSlugFromBenchmarkRow(row: Record<string, unknown>): string | null {
+  const dataset = row.dataset;
+  if (!dataset || typeof dataset !== 'object' || Array.isArray(dataset)) return null;
+
+  const provenance = dataset as DatasetProvenance;
+  if (provenance.source_type !== 'public_dataset') return null;
+  if (typeof provenance.hf_dataset_name !== 'string') return null;
+
+  const datasetId = provenance.hf_dataset_name.trim().replace(TRAILING_SLASHES, '');
+  if (!datasetId) return null;
+  // lastIndexOf returns -1 for an unnamespaced id, so slice(0) keeps it whole.
+  const slug = datasetId.slice(datasetId.lastIndexOf('/') + 1);
+  return slug || null;
+}
diff --git a/packages/db/src/etl/distribution-stats.ts b/packages/db/src/etl/distribution-stats.ts
new file mode 100644
index 00000000..da3603ab
--- /dev/null
+++ b/packages/db/src/etl/distribution-stats.ts
@@ -0,0 +1,98 @@
+/**
+ * Generic distribution math shared by the dataset ETL: percentile summaries
+ * and histogram binning for the dataset-detail cards. Pure functions, no DB
+ * access. (The per-benchmark-row percentile bundle uses `percentilesOf` in
+ * `queries/agentic-aggregates` — a different shape with its own version key.)
+ */
+
+/** One histogram bucket. */
+export interface HistogramBin {
+  /** Lower edge of the bucket. */
+  x0: number;
+  /** Upper edge of the bucket. */
+  x1: number;
+  /** Number of input values assigned to this bucket. */
+  count: number;
+}
+
+/** Summary statistics returned by `summarizeValues`; all zero for empty input. */
+export interface NumberSummary {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+}
+
+/**
+ * Summarize a list of numbers: count, extremes, mean and the 50/75/90/95th
+ * percentiles. Percentiles interpolate linearly between neighbouring sorted
+ * values. Empty input yields an all-zero summary.
+ */
+export function summarizeValues(values: readonly number[]): NumberSummary {
+  const n = values.length;
+  if (n === 0) {
+    return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 };
+  }
+
+  // Sort a copy so the caller's array is left untouched.
+  const ascending = [...values].sort((a, b) => a - b);
+  const lastIndex = n - 1;
+
+  const percentile = (q: number): number => {
+    const rank = lastIndex * q;
+    const below = Math.floor(rank);
+    const above = Math.ceil(rank);
+    if (below === above) return ascending[below]!;
+    return ascending[below]! + (ascending[above]! - ascending[below]!) * (rank - below);
+  };
+
+  let total = 0;
+  for (const value of ascending) total += value;
+
+  return {
+    count: n,
+    min: ascending[0]!,
+    max: ascending[lastIndex]!,
+    mean: total / n,
+    median: percentile(0.5),
+    p75: percentile(0.75),
+    p90: percentile(0.9),
+    p95: percentile(0.95),
+  };
+}
+
+/**
+ * Linear-width histogram over [0, max]. Empty input → [].
+ *
+ * Values are assigned to bucket `floor(v / width)`, clamped into range, so
+ * the maximum lands in the last bucket and negative values in the first.
+ * When the maximum is ≤ 0 a single `[0, 1]` bucket holding every value is
+ * returned.
+ *
+ * @param values - Numbers to bin; the array is not modified.
+ * @param bins - Requested bucket count; floored and raised to at least 1.
+ * @returns `bins` equal-width buckets in ascending order.
+ */
+export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  if (values.length === 0) return [];
+  // Scan for the maximum instead of `Math.max(...values)`: spreading a large
+  // array passes one argument per element and throws a RangeError.
+  let max = Number.NEGATIVE_INFINITY;
+  for (const v of values) {
+    if (v > max) max = v;
+  }
+  if (max <= 0) return [{ x0: 0, x1: 1, count: values.length }];
+  // A bucket count below 1 (or fractional) would index a missing bucket.
+  const binCount = Math.max(1, Math.floor(bins));
+  const width = max / binCount;
+  const out: HistogramBin[] = Array.from({ length: binCount }, (_, i) => ({
+    x0: i * width,
+    x1: (i + 1) * width,
+    count: 0,
+  }));
+  for (const v of values) {
+    const idx = Math.min(binCount - 1, Math.max(0, Math.floor(v / width)));
+    out[idx]!.count += 1;
+  }
+  return out;
+}
+
+/**
+ * Log-width histogram over positive values (values ≤ 0 are dropped).
+ *
+ * Bucket edges are evenly spaced in log10 between the smallest and largest
+ * positive value. When all positive values are equal a single bucket
+ * `[min, min * 10]` is returned.
+ *
+ * @param values - Numbers to bin; the array is not modified.
+ * @param bins - Requested bucket count; floored and raised to at least 1.
+ * @returns Buckets in ascending order, or [] when no value is positive.
+ */
+export function logHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  const pos = values.filter((v) => v > 0);
+  if (pos.length === 0) return [];
+  // Scan for the extremes instead of `Math.min(...pos)` / `Math.max(...pos)`:
+  // spreading a large array throws a RangeError.
+  let min = Number.POSITIVE_INFINITY;
+  let max = Number.NEGATIVE_INFINITY;
+  for (const v of pos) {
+    if (v < min) min = v;
+    if (v > max) max = v;
+  }
+  const lo = Math.log10(min);
+  const hi = Math.log10(max);
+  if (hi <= lo) return [{ x0: min, x1: max <= min ? min * 10 : max, count: pos.length }];
+  // A bucket count below 1 (or fractional) would index a missing bucket.
+  const binCount = Math.max(1, Math.floor(bins));
+  const width = (hi - lo) / binCount;
+  const out: HistogramBin[] = Array.from({ length: binCount }, (_, i) => ({
+    x0: 10 ** (lo + i * width),
+    x1: 10 ** (lo + (i + 1) * width),
+    count: 0,
+  }));
+  for (const v of pos) {
+    const idx = Math.min(binCount - 1, Math.max(0, Math.floor((Math.log10(v) - lo) / width)));
+    out[idx]!.count += 1;
+  }
+  return out;
+}
+
+/**
+ * Log-width histogram that keeps zeros in a dedicated leading bucket.
+ *
+ * With no zeros this is plain `logHistogram`. With zeros, one bucket is
+ * reserved for them and the positive values share the remaining
+ * `bins - 1` (at least one). Negative values are counted nowhere.
+ */
+export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] {
+  let zeros = 0;
+  const positives: number[] = [];
+  for (const value of values) {
+    if (value === 0) zeros += 1;
+    else if (value > 0) positives.push(value);
+  }
+
+  if (zeros === 0) return logHistogram(positives, bins);
+  if (positives.length === 0) return [{ x0: 0, x1: 1, count: zeros }];
+
+  const rest = logHistogram(positives, Math.max(1, bins - 1));
+  // The zero bucket ends where the first positive bucket begins.
+  const zeroBin: HistogramBin = { x0: 0, x1: rest[0]?.x0 ?? 1, count: zeros };
+  return [zeroBin, ...rest];
+}
diff --git a/packages/db/src/etl/gzip-json-stream.test.ts b/packages/db/src/etl/gzip-json-stream.test.ts
new file mode 100644
index 00000000..9051ee82
--- /dev/null
+++ b/packages/db/src/etl/gzip-json-stream.test.ts
@@ -0,0 +1,66 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream.js';
+
+// `isStringTooLongError` must recognise both the error code and the
+// message-only form, and must not match look-alike values.
+describe('isStringTooLongError', () => {
+  it('matches the ERR_STRING_TOO_LONG code', () => {
+    const err = new Error('Cannot create a string longer than ...') as NodeJS.ErrnoException;
+    err.code = 'ERR_STRING_TOO_LONG';
+    expect(isStringTooLongError(err)).toBe(true);
+  });
+
+  it('matches the message-only variant', () => {
+    expect(isStringTooLongError(new Error('Cannot create a string longer than 0x1fffffe8'))).toBe(
+      true,
+    );
+  });
+
+  it('rejects unrelated errors and non-errors', () => {
+    expect(isStringTooLongError(new Error('unexpected token'))).toBe(false);
+    expect(isStringTooLongError(null)).toBe(false);
+    // A plain string that merely resembles the code must not match.
+    expect(isStringTooLongError('ERR_STRING_TOO_LONG-ish string')).toBe(false);
+  });
+});
+
+// `streamCollectKeys` reads one top-level block of a gzipped JSON document
+// and keeps only the requested child keys.
+describe('streamCollectKeys', () => {
+  // Fixture: two top-level blocks sharing a key, plus one key nobody asks for.
+  const blob = gzipSync(
+    JSON.stringify({
+      metrics: {
+        'vllm:prompt_tokens': { series: [{ timeslices: [{ start_ns: 1, rate: 2 }] }] },
+        'vllm:ignored_metric': { series: [] },
+      },
+      warmup_metrics: {
+        'vllm:prompt_tokens': { series: [] },
+      },
+    }),
+  );
+
+  it('collects only wanted keys under the filtered top-level block', async () => {
+    const out = await streamCollectKeys<{ series: unknown[] }>(
+      blob,
+      'metrics',
+      new Set(['vllm:prompt_tokens']),
+    );
+    expect(Object.keys(out)).toEqual(['vllm:prompt_tokens']);
+    expect(out['vllm:prompt_tokens']).toEqual({
+      series: [{ timeslices: [{ start_ns: 1, rate: 2 }] }],
+    });
+  });
+
+  it('reads a different top-level phase block via filter', async () => {
+    const out = await streamCollectKeys<{ series: unknown[] }>(
+      blob,
+      'warmup_metrics',
+      new Set(['vllm:prompt_tokens']),
+    );
+    expect(out).toEqual({ 'vllm:prompt_tokens': { series: [] } });
+  });
+
+  it('rejects on a non-gzip buffer', async () => {
+    await expect(
+      streamCollectKeys(Buffer.from('not gzip'), 'metrics', new Set(['x'])),
+    ).rejects.toThrow();
+  });
+});
diff --git a/packages/db/src/etl/gzip-json-stream.ts b/packages/db/src/etl/gzip-json-stream.ts
new file mode 100644
index 00000000..cb299a8d
--- /dev/null
+++ b/packages/db/src/etl/gzip-json-stream.ts
@@ -0,0 +1,58 @@
+/**
+ * Shared stream-parse helpers for gzipped server-metrics blobs.
+ *
+ * `gunzipSync(buffer).toString('utf8')` trips Node's 512 MB max-string-length
+ * cap on high-conc TP+EP rows, so the compute-* ETL helpers fall back to a
+ * stream-json pipeline that collects only the top-level subtrees they need.
+ * Both the fast-path error detection and the pipeline itself live here so
+ * chart-series and aggregate-stats stay byte-identical in how they parse.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/**
+ * Detect Node's max-string-length failure, the signal to switch from
+ * `gunzipSync().toString()` to the streaming parser.
+ *
+ * Matches either the `ERR_STRING_TOO_LONG` error code or, for the older
+ * message-only variant, a message containing `longer than 0x1fffffe8`.
+ * Non-Error values are stringified before the message check.
+ */
+export function isStringTooLongError(error: unknown): boolean {
+  const code = error ? (error as { code?: unknown }).code : error;
+  const message = error instanceof Error ? error.message : String(error);
+  if (code === 'ERR_STRING_TOO_LONG') return true;
+  return message.includes('longer than 0x1fffffe8');
+}
+
+/**
+ * Gunzip + stream-parse `buffer`, descending into the top-level `filter` key
+ * (e.g. `metrics` / `warmup_metrics`) and collecting only the child entries
+ * whose key is in `wanted`. Never materializes the full JSON string.
+ *
+ * @typeParam T - Shape the caller expects for each collected value. The
+ *   values are not validated against it.
+ * @param buffer - Gzipped JSON document.
+ * @param filter - Name of the top-level object to read.
+ * @param wanted - Child keys of that object to keep.
+ * @returns The kept entries keyed by name. Requested keys that are absent
+ *   from the document are simply missing from the result.
+ * @throws Rejects with the pipeline's error, e.g. when `buffer` is not gzip.
+ */
+export async function streamCollectKeys<T>(
+  buffer: Buffer,
+  filter: string,
+  wanted: ReadonlySet<string>,
+): Promise<Record<string, T>> {
+  const collected: Record<string, T> = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    // Each chunk is one `{ key, value }` entry of the picked object.
+    pipeline.on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: T };
+      if (wanted.has(key)) collected[key] = value;
+    });
+    pipeline.on('end', resolve);
+    pipeline.on('error', reject);
+  });
+  return collected;
+}
diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts
index e569143a..82aaf67c 100644
--- a/packages/db/src/etl/normalizers.test.ts
+++ b/packages/db/src/etl/normalizers.test.ts
@@ -25,6 +25,11 @@ describe('hwToGpuKey', () => {
expect(hwToGpuKey('mi300x-amd')).toBe('mi300x');
});
+ it('strips a v3 scope prefix (cluster:…)', () => {
+ expect(hwToGpuKey('cluster:b300-nv')).toBe('b300');
+ expect(hwToGpuKey('cluster:h200')).toBe('h200');
+ });
+
it('strips -amds suffix', () => {
expect(hwToGpuKey('mi355x-amds')).toBe('mi355x');
});
diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index 1d6a95c1..07793dee 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -22,7 +22,11 @@ export { GPU_KEYS };
* stripped base is not in `GPU_KEYS`.
*/
export function hwToGpuKey(hw: string): string | null {
- const base = hw.toLowerCase().split('-')[0];
+ // v3 agentic artifacts scope the hw id (`cluster:b300-nv`) — drop everything
+ // up to the last `:` first. Then take the first segment before `-` as the
+ // canonical key; that subsumes all the prior explicit suffix strips
+ // (-nv, -amds, -dgxc-slurm, -p1, -cw, …).
+ const base = hw.toLowerCase().split(':').pop()!.split('-')[0];
return GPU_KEYS.has(base) ? base : null;
}
@@ -138,7 +142,7 @@ export function resolveModelKey(row: Record): string | null {
*/
export function normalizeFramework(
fw: string,
- disaggField: any,
+ disaggField: unknown,
): { framework: string; disagg: boolean } {
const lower = fw.toLowerCase();
const alias = FRAMEWORK_ALIASES[lower];
@@ -171,7 +175,7 @@ export function normalizePrecision(raw: string): string {
* @param spec - Raw `spec_decoding` value from the artifact.
* @returns Lowercase method name, or `'none'` if absent/empty.
*/
-export function normalizeSpecMethod(spec: any): string {
+export function normalizeSpecMethod(spec: unknown): string {
if (!spec || spec === '') return 'none';
return String(spec).toLowerCase();
}
@@ -183,7 +187,7 @@ export function normalizeSpecMethod(spec: any): string {
* @param v - Value to coerce (any type).
* @returns `true` if the value is one of the recognized truthy forms, `false` otherwise.
*/
-export function parseBool(v: any): boolean {
+export function parseBool(v: unknown): boolean {
return v === true || v === 'true' || v === 'True';
}
@@ -194,7 +198,7 @@ export function parseBool(v: any): boolean {
* @param v - Value to parse (number, string, null, or undefined).
* @returns The parsed number, or `undefined` if the input is null/undefined/NaN.
*/
-export function parseNum(v: any): number | undefined {
+export function parseNum(v: unknown): number | undefined {
if (v === null || v === undefined) return undefined;
const n = typeof v === 'string' ? parseFloat(v) : Number(v);
return isNaN(n) ? undefined : n;
@@ -207,12 +211,14 @@ export function parseNum(v: any): number | undefined {
* @param v - Value to parse (number, string, null, or undefined).
* @returns The parsed integer, or `undefined` if the input is null/undefined/NaN.
*/
-export function parseInt2(v: any): number | undefined {
+export function parseInt2(v: unknown): number | undefined {
if (v === null || v === undefined) return undefined;
const n = typeof v === 'string' ? parseInt(v, 10) : Math.round(Number(v));
return isNaN(n) ? undefined : n;
}
+const ISL_OSL_PATTERN = /[_-](?\d+)k(?\d+)k[_\-.]/iu;
+
/**
* Extract ISL (input sequence length) and OSL (output sequence length) in tokens
* from a file/directory name that encodes them as `{n}k{m}k`.
@@ -225,7 +231,7 @@ export function parseInt2(v: any): number | undefined {
* @returns An object with `isl` and `osl` in tokens, or `null` if no match is found.
*/
export function parseIslOsl(name: string): { isl: number; osl: number } | null {
- const m = name.match(/[_-](?\d+)k(?\d+)k[_\-.]/iu);
+ const m = name.match(ISL_OSL_PATTERN);
if (!m) return null;
return { isl: parseInt(m[1], 10) * 1024, osl: parseInt(m[2], 10) * 1024 };
}
diff --git a/packages/db/src/etl/server-log-metrics.test.ts b/packages/db/src/etl/server-log-metrics.test.ts
new file mode 100644
index 00000000..9e0fa852
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from 'vitest';
+
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
+
+// Covers `kvCachePoolTokensFromServerLog`: missing logs, single-engine logs,
+// summing across data-parallel engine cores, dedup of reprinted lines, and
+// the untagged fallback.
+describe('kvCachePoolTokensFromServerLog', () => {
+  it('returns null for empty / missing logs', () => {
+    expect(kvCachePoolTokensFromServerLog(null)).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('')).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('no kv cache line here')).toBeNull();
+  });
+
+  it('reads a single-engine (ep1) pool size', () => {
+    // The second line has no "GPU KV cache size" text and must be ignored.
+    const log = `
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1744] GPU KV cache size: 11,294,463 tokens
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1745] Maximum concurrency for 1,048,576 tokens per request: 10.77x
+`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(11_294_463);
+  });
+
+  it('sums across data-parallel engine cores (ep8)', () => {
+    // Eight distinct tags (EngineCore_DP0..7), each reporting the same size.
+    const lines = Array.from(
+      { length: 8 },
+      (_, i) =>
+        `(EngineCore_DP${i} pid=${2337827 + i}) INFO [kv_cache_utils.py:1744] GPU KV cache size: 11,577,333 tokens`,
+    ).join('\n');
+    expect(kvCachePoolTokensFromServerLog(lines)).toBe(11_577_333 * 8);
+  });
+
+  it('dedups reprinted lines for the same engine core', () => {
+    const log = `
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP1 pid=2) GPU KV cache size: 5,000,000 tokens
+`;
+    // DP0 counted once + DP1 once = 10M, not 15M.
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(10_000_000);
+  });
+
+  it('falls back to bare lines when no engine-core prefix is present', () => {
+    const log = `INFO GPU KV cache size: 1,234,567 tokens`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(1_234_567);
+  });
+});
diff --git a/packages/db/src/etl/server-log-metrics.ts b/packages/db/src/etl/server-log-metrics.ts
new file mode 100644
index 00000000..b8b26dd1
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.ts
@@ -0,0 +1,65 @@
+/**
+ * Derive server-side scalars from the captured vLLM server log
+ * (`server_logs.server_log`). These come from startup log lines rather than the
+ * scraped Prometheus `/metrics`, because for MLA / sparse-attention models the
+ * `vllm:cache_config_info` labels (num_gpu_blocks × block_size) do NOT
+ * reconstruct the real KV-cache token capacity — they undercount by a
+ * non-constant factor. vLLM's own `GPU KV cache size: N tokens` line is the
+ * authoritative number.
+ */
+
+/**
+ * Total KV-cache pool size in tokens.
+ *
+ * vLLM prints one `GPU KV cache size: N tokens` line per engine core (one per
+ * data-parallel rank; tensor-parallel is already aggregated into that single
+ * per-engine number). We sum across distinct engine cores so the result is the
+ * deployment-wide total:
+ *
+ *   (EngineCore pid=…)     GPU KV cache size: 11,294,463 tokens   → ep1 total
+ *   (EngineCore_DP0 pid=…) GPU KV cache size: 11,577,333 tokens ┐
+ *   (EngineCore_DP1 pid=…) GPU KV cache size: 11,577,333 tokens ┘ → ×8 = total
+ *
+ * Returns null when the log has no such line (non-vLLM frameworks, or a log
+ * that didn't capture engine startup).
+ *
+ * @param serverLog - Captured server log text, or null when none was stored.
+ * @returns Sum of the per-engine sizes. When no line carries an engine tag,
+ *   the sum of every untagged size line. Null when nothing matched.
+ */
+export function kvCachePoolTokensFromServerLog(serverLog: string | null): number | null {
+  if (!serverLog) return null;
+
+  // Scan line-by-line. We deliberately avoid a global regex over the whole blob
+  // with a lazy `[^\n]*?` bridge between the engine tag and the size: some logs
+  // contain multi-megabyte single lines (progress bars, tracebacks) that make
+  // such a regex recurse and blow the stack. A per-line substring pre-filter
+  // means the (cheap) regexes only ever run on the short KV-size lines.
+  //
+  // Each engine core prints one line; the tag (e.g. `EngineCore_DP3`) is stable
+  // across a run while the pid is not, so key on the tag to dedup reprints and
+  // sum across data-parallel ranks.
+  const tagRe = /\((?<tag>EngineCore(?:_DP\d+)?)\s+pid=\d+\)/u;
+  const sizeRe = /GPU KV cache size:\s*(?<tokens>[\d,]+)\s*tokens/u;
+  const perEngine = new Map<string, number>();
+  let bareTotal = 0;
+  let bareFound = false;
+  for (const line of serverLog.split('\n')) {
+    if (!line.includes('GPU KV cache size')) continue;
+    const sizeMatch = sizeRe.exec(line);
+    if (!sizeMatch) continue;
+    // Sizes are printed with thousands separators; strip them before parsing.
+    const tokens = Number(sizeMatch.groups!.tokens!.replace(/,/gu, ''));
+    if (!Number.isFinite(tokens) || tokens <= 0) continue;
+    const tagMatch = tagRe.exec(line);
+    if (tagMatch) {
+      // A reprint for the same tag overwrites rather than adds.
+      perEngine.set(tagMatch.groups!.tag!, tokens);
+    } else {
+      // Fallback for logs without the engine-core prefix: count each occurrence
+      // (one per engine when there are no reprints). Best-effort only.
+      bareTotal += tokens;
+      bareFound = true;
+    }
+  }
+  // Tagged lines win: untagged sizes are only used when no tag was seen.
+  if (perEngine.size > 0) {
+    let total = 0;
+    for (const v of perEngine.values()) total += v;
+    return total;
+  }
+  return bareFound ? bareTotal : null;
+}
diff --git a/packages/db/src/etl/server-metrics-adapters.ts b/packages/db/src/etl/server-metrics-adapters.ts
new file mode 100644
index 00000000..f123d9f8
--- /dev/null
+++ b/packages/db/src/etl/server-metrics-adapters.ts
@@ -0,0 +1,100 @@
+/**
+ * Normalize orchestrator-specific server-metric labels into a stable source
+ * identity consumed by the API and frontend. AIPerf owns the export envelope;
+ * the serving orchestrator owns the meaning of labels inside each series.
+ */
+
+/**
+ * Normalized role of a metric source. `combined` is assigned by the generic
+ * adapter to a series that carries no identifying endpoint or label;
+ * `unknown` marks a source whose role could not be determined.
+ */
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+
+/** The parts of a raw exported series that identify where it came from. */
+export interface RawMetricSourceSeries {
+  endpoint_url?: string;
+  /** Label name → label value, as attached to the series. */
+  labels?: Record<string, string>;
+}
+
+/** Config-level facts used to pick an adapter. */
+export interface ServerMetricsContext {
+  /** Canonical framework stored in configs, for example `dynamo-vllm`. */
+  framework?: string | null;
+  /** Per-worker role series are only meaningful for disaggregated configs. */
+  disagg?: boolean;
+}
+
+/** Normalized identity of the process that produced a metric series. */
+export interface MetricSource {
+  /** Stable key used to join this source across different metric names. */
+  id: string;
+  /** Id of the adapter that produced this identity (`dynamo` or `generic`). */
+  adapter: string;
+  role: MetricSourceRole;
+  endpointUrl: string | null;
+  /** Orchestrator's own role label before normalization; null when not provided. */
+  nativeRole: string | null;
+  /** Value of the `worker_id` label, if present. */
+  workerId: string | null;
+  /** Value of the `dp_rank` label, if present. */
+  dpRank: string | null;
+  /** Value of the `engine` label, falling back to `engine_idx`. */
+  engine: string | null;
+}
+
+/** Strategy for one orchestrator family. */
+interface ServerMetricsAdapter {
+  id: string;
+  /** Whether this adapter applies to the given config context. */
+  matches: (context: ServerMetricsContext) => boolean;
+  /** Derive the normalized source identity from one raw series. */
+  identifySource: (series: RawMetricSourceSeries) => MetricSource;
+}
+
+/**
+ * Build a source key by joining the adapter id and each part with `|`.
+ * Null and undefined parts contribute an empty segment so positions stay fixed.
+ */
+function stableId(adapter: string, parts: (string | null | undefined)[]): string {
+  let id = adapter;
+  for (const part of parts) {
+    id += `|${part ?? ''}`;
+  }
+  return id;
+}
+
+/**
+ * Adapter for frameworks whose name starts with `dynamo-`. The role comes
+ * from the `dynamo_component` label: `prefill` → prefill, `backend` → decode,
+ * `frontend` / `router` → router, anything else → unknown.
+ */
+const dynamoAdapter: ServerMetricsAdapter = {
+  id: 'dynamo',
+  matches: (context) =>
+    context.framework != null ? context.framework.startsWith('dynamo-') : false,
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const nativeRole = labels['dynamo_component'] ?? null;
+
+    let role: MetricSourceRole;
+    switch (nativeRole) {
+      case 'prefill':
+        role = 'prefill';
+        break;
+      case 'backend':
+        role = 'decode';
+        break;
+      case 'frontend':
+      case 'router':
+        role = 'router';
+        break;
+      default:
+        role = 'unknown';
+    }
+
+    // The series' own endpoint wins over the `dynamo_endpoint` label.
+    const endpointUrl = series.endpoint_url ?? labels['dynamo_endpoint'] ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+
+    const id = stableId('dynamo', [role, endpointUrl, workerId, dpRank, engine]);
+    return { id, adapter: 'dynamo', role, endpointUrl, nativeRole, workerId, dpRank, engine };
+  },
+};
+
+/**
+ * Catch-all adapter that matches every context. A series with no endpoint
+ * and no worker / dp-rank / engine label is treated as the `combined`
+ * source; any series with at least one of them gets role `unknown`.
+ */
+const genericAdapter: ServerMetricsAdapter = {
+  id: 'generic',
+  matches: () => true,
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const endpointUrl = series.endpoint_url ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+
+    let role: MetricSourceRole = 'combined';
+    if (endpointUrl || workerId || dpRank || engine) role = 'unknown';
+
+    const id = stableId('generic', [endpointUrl, workerId, dpRank, engine]);
+    return { id, adapter: 'generic', role, endpointUrl, nativeRole: null, workerId, dpRank, engine };
+  },
+};
+
+// Checked in order; the generic adapter matches everything, so it goes last.
+const ADAPTERS: readonly ServerMetricsAdapter[] = [dynamoAdapter, genericAdapter];
+
+/**
+ * Pick the first adapter whose `matches` accepts `context`, falling back to
+ * the generic adapter.
+ */
+export function selectServerMetricsAdapter(context: ServerMetricsContext): ServerMetricsAdapter {
+  for (const candidate of ADAPTERS) {
+    if (candidate.matches(context)) return candidate;
+  }
+  return genericAdapter;
+}
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
expect(tracker.skips.unmappedHw).toBe(0);
expect(tracker.skips.noIslOsl).toBe(0);
expect(tracker.skips.dbError).toBe(0);
+ expect(tracker.skips.traceReplayMissing).toBe(0);
});
it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 134b5299..5d485bf2 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,7 +8,10 @@ export interface Skips {
unmappedModel: number;
unmappedHw: number;
noIslOsl: number;
+ failedRun: number;
dbError: number;
+ /** Agentic point whose sibling `agentic_` artifact had no trace_replay files. */
+ traceReplayMissing: number;
}
export interface SkipSnapshot {
@@ -66,7 +69,15 @@ const MAX_DB_ERRORS = 10;
* @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
*/
export function createSkipTracker(): SkipTracker {
- const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 };
+ const skips: Skips = {
+ badZip: 0,
+ unmappedModel: 0,
+ unmappedHw: 0,
+ noIslOsl: 0,
+ failedRun: 0,
+ dbError: 0,
+ traceReplayMissing: 0,
+ };
const unmappedModels = new Set();
const unmappedHws = new Set();
const unmappedPrecisions = new Set();
diff --git a/packages/db/src/etl/trace-artifact-discovery.test.ts b/packages/db/src/etl/trace-artifact-discovery.test.ts
new file mode 100644
index 00000000..2bb1d51b
--- /dev/null
+++ b/packages/db/src/etl/trace-artifact-discovery.test.ts
@@ -0,0 +1,66 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { afterEach, describe, expect, it } from 'vitest';
+
+import { discoverTraceReplayArtifacts } from './trace-artifact-discovery';
+
+// Every directory handed out by tempDir(); drained and deleted by the afterEach hook.
+const tempDirs: string[] = [];
+
+/** Create a fresh scratch directory under the OS temp dir and register it for cleanup. */
+function tempDir(): string {
+  const prefix = path.join(os.tmpdir(), 'trace-artifacts-test-');
+  const created = fs.mkdtempSync(prefix);
+  tempDirs.push(created);
+  return created;
+}
+
+/** Populate `<dir>/aiperf_artifacts` with the three trace export files (placeholder contents). */
+function writeTraceFiles(dir: string): void {
+  const traceDir = path.join(dir, 'aiperf_artifacts');
+  fs.mkdirSync(traceDir, { recursive: true });
+
+  const fixtures: [fileName: string, contents: string][] = [
+    ['profile_export.jsonl', '{}\n'],
+    ['server_metrics_export.csv', 'x,y\n'],
+    ['server_metrics_export.json', '{}'],
+  ];
+  for (const [fileName, contents] of fixtures) {
+    fs.writeFileSync(path.join(traceDir, fileName), contents);
+  }
+}
+
+// Delete every scratch directory registered so far and empty the registry.
+afterEach(() => {
+  const pending = tempDirs.splice(0);
+  for (const dir of pending) {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+describe('discoverTraceReplayArtifacts', () => {
+ // Single-node layout: the files live in `agentic_config-a/aiperf_artifacts`, and
+ // the expected map key is the directory name without its `agentic_` prefix.
+ it('discovers the existing single-node sibling layout', () => {
+ const root = tempDir();
+ writeTraceFiles(path.join(root, 'agentic_config-a'));
+
+ const found = discoverTraceReplayArtifacts(root);
+
+ expect(found.get('config-a')).toMatchObject({
+ profileJsonl: expect.stringContaining('profile_export.jsonl'),
+ serverMetricsCsv: expect.stringContaining('server_metrics_export.csv'),
+ serverMetricsJson: expect.stringContaining('server_metrics_export.json'),
+ });
+ });
+
+ // Multinode layout: a real tar.gz is built with the system `tar`, then the
+ // source tree is deleted so any discovered path must come from the copy the
+ // code under test extracted.
+ it('extracts and indexes multinode traces by concurrency', () => {
+ const root = tempDir();
+ const artifactDir = path.join(root, 'multinode_server_logs_config-b');
+ const archiveSource = path.join(root, 'archive-source');
+ writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_96'));
+ writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_128'));
+ fs.mkdirSync(artifactDir, { recursive: true });
+ execFileSync('tar', [
+ '-czf',
+ path.join(artifactDir, 'multinode_server_logs.tar.gz'),
+ '-C',
+ archiveSource,
+ '.',
+ ]);
+ fs.rmSync(archiveSource, { recursive: true, force: true });
+
+ const found = discoverTraceReplayArtifacts(root);
+
+ // Default string sort: '...|128' orders before '...|96'.
+ expect([...found.keys()].toSorted()).toEqual(['config-b|128', 'config-b|96']);
+ expect(found.get('config-b|96')?.profileJsonl).toContain(
+ 'multinode_server_logs/agentic/conc_96/aiperf_artifacts/profile_export.jsonl',
+ );
+ });
+});
diff --git a/packages/db/src/etl/trace-artifact-discovery.ts b/packages/db/src/etl/trace-artifact-discovery.ts
new file mode 100644
index 00000000..71ee74df
--- /dev/null
+++ b/packages/db/src/etl/trace-artifact-discovery.ts
@@ -0,0 +1,93 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+
+/**
+ * Paths of the trace export files located for one benchmark point.
+ * A field is null when the corresponding file was not found.
+ */
+export interface TraceReplayArtifactPaths {
+ /** Path to `profile_export.jsonl`, or null. */
+ profileJsonl: string | null;
+ /** Path to `server_metrics_export.csv`, or null. */
+ serverMetricsCsv: string | null;
+ /** Path to `server_metrics_export.json`, or null. */
+ serverMetricsJson: string | null;
+}
+
+// Sub-directories that may hold the trace export files, checked in this order.
+const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+
+// Artifact directory prefixes; stripping one leaves the suffix used in the map key.
+const AGENTIC_PREFIX = /^agentic_/u;
+const MULTINODE_PREFIX = /^multinode_server_logs_/u;
+// Matches a `conc_<N>` directory name. The named group `conc` captures the
+// digits, which discoverTraceReplayArtifacts reads through `match.groups.conc`.
+const CONC_DIR_PATTERN = /^conc_(?<conc>\d+)$/u;
+
+/**
+ * Look for the three trace export files under `dir`, searching each directory
+ * in TRACE_SUBDIRS in order. For each file the first location that has it
+ * wins. Returns null when none of the files exists anywhere.
+ */
+function traceFilesIn(dir: string): TraceReplayArtifactPaths | null {
+  const found: TraceReplayArtifactPaths = {
+    profileJsonl: null,
+    serverMetricsCsv: null,
+    serverMetricsJson: null,
+  };
+  const wanted: [field: keyof TraceReplayArtifactPaths, fileName: string][] = [
+    ['profileJsonl', 'profile_export.jsonl'],
+    ['serverMetricsCsv', 'server_metrics_export.csv'],
+    ['serverMetricsJson', 'server_metrics_export.json'],
+  ];
+
+  for (const subdir of TRACE_SUBDIRS) {
+    const traceDir = path.join(dir, subdir);
+    const isDirectory = fs.existsSync(traceDir) && fs.statSync(traceDir).isDirectory();
+    if (!isDirectory) continue;
+
+    for (const [field, fileName] of wanted) {
+      // Keep the hit from an earlier sub-directory; do not probe again.
+      if (found[field]) continue;
+      const candidate = path.join(traceDir, fileName);
+      if (fs.existsSync(candidate)) found[field] = candidate;
+    }
+  }
+
+  if (!found.profileJsonl && !found.serverMetricsCsv && !found.serverMetricsJson) return null;
+  return found;
+}
+
+/**
+ * Ensure `<artifactDir>/multinode_server_logs` exists, extracting
+ * `multinode_server_logs.tar.gz` into it with the system `tar` when needed.
+ *
+ * An existing extracted directory is reused without re-extracting.
+ *
+ * @param artifactDir Directory that holds the archive.
+ * @returns The extracted directory, or null when there is neither an
+ *          extracted directory nor an archive.
+ * @throws Whatever `execFileSync` throws when `tar` fails. The partially
+ *         written directory is removed first, so a later call retries the
+ *         extraction instead of treating the leftovers as complete.
+ */
+function extractMultinodeArchive(artifactDir: string): string | null {
+  const archivePath = path.join(artifactDir, 'multinode_server_logs.tar.gz');
+  const extractedDir = path.join(artifactDir, 'multinode_server_logs');
+
+  if (fs.existsSync(extractedDir)) return extractedDir;
+  if (!fs.existsSync(archivePath)) return null;
+
+  fs.mkdirSync(extractedDir, { recursive: true });
+  try {
+    execFileSync('tar', ['-xzf', archivePath, '-C', extractedDir], { stdio: 'ignore' });
+  } catch (error) {
+    // The directory's existence is the "already extracted" marker, so it must
+    // not survive a failed extraction.
+    fs.rmSync(extractedDir, { recursive: true, force: true });
+    throw error;
+  }
+
+  return extractedDir;
+}
+
+/**
+ * Discover trace-replay siblings in both artifact layouts:
+ *
+ * - Single-node: `agentic_<suffix>/aiperf_artifacts/*`
+ * - Multinode: `multinode_server_logs_<suffix>/multinode_server_logs.tar.gz`,
+ *   containing `agentic/conc_<N>/aiperf_artifacts/*`
+ *
+ * Multinode keys include concurrency (`<suffix>|<conc>`) because one artifact
+ * contains several points, each with a distinct trace payload.
+ *
+ * @param artifactsDir Directory whose immediate sub-directories are scanned.
+ * @returns Map from key (`<suffix>` for single-node, `<suffix>|<conc>` for
+ *          multinode) to the trace file paths found. Empty when
+ *          `artifactsDir` does not exist or nothing matches.
+ */
+export function discoverTraceReplayArtifacts(
+  artifactsDir: string,
+): Map<string, TraceReplayArtifactPaths> {
+  const discovered = new Map<string, TraceReplayArtifactPaths>();
+  if (!fs.existsSync(artifactsDir)) return discovered;
+
+  for (const entry of fs.readdirSync(artifactsDir)) {
+    const artifactDir = path.join(artifactsDir, entry);
+    if (!fs.statSync(artifactDir).isDirectory()) continue;
+
+    // Single-node layout: trace files sit directly under the artifact directory.
+    if (entry.startsWith('agentic_')) {
+      const trace = traceFilesIn(artifactDir);
+      if (trace) discovered.set(entry.replace(AGENTIC_PREFIX, ''), trace);
+      continue;
+    }
+
+    // Multinode layout: the traces are inside an archive that is extracted on demand.
+    if (!entry.startsWith('multinode_server_logs_')) continue;
+    const extractedDir = extractMultinodeArchive(artifactDir);
+    if (!extractedDir) continue;
+
+    const agenticDir = path.join(extractedDir, 'agentic');
+    if (!fs.existsSync(agenticDir) || !fs.statSync(agenticDir).isDirectory()) continue;
+
+    const suffix = entry.replace(MULTINODE_PREFIX, '');
+    for (const concEntry of fs.readdirSync(agenticDir)) {
+      // Only `conc_<N>` directories describe a benchmark point.
+      const match = concEntry.match(CONC_DIR_PATTERN);
+      if (!match?.groups?.conc) continue;
+      const trace = traceFilesIn(path.join(agenticDir, concEntry));
+      if (trace) discovered.set(`${suffix}|${match.groups.conc}`, trace);
+    }
+  }
+
+  return discovered;
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..1c739b7d
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,204 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl` +
+ * `server_metrics_export.csv`) into `agentic_trace_replay` and link the new row
+ * to each provided benchmark_results row via `trace_replay_id`.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */
+
+import { gzipSync } from 'node:zlib';
+
+import type postgres from 'postgres';
+
+import { computeAggregateStats } from './compute-aggregate-stats.js';
+import { computeChartSeries } from './compute-chart-series.js';
+import { computeRequestTimeline } from './compute-request-timeline.js';
+import type { ServerMetricsContext } from './server-metrics-adapters';
+
+/** Connection type produced by calling the `postgres` factory. */
+type Sql = ReturnType<typeof postgres>;
+
+/** Optional settings for `insertTraceReplay`. */
+export interface TraceReplayIngestOptions {
+ /** Passed through to `computeChartSeries`; an empty object is used when omitted. */
+ metricsContext?: ServerMetricsContext;
+ /** When set, progress lines are written to the console with this label; when unset nothing is logged. */
+ progressLabel?: string;
+}
+
+/**
+ * Render a byte count for log output: `none` for null/undefined, whole bytes
+ * below 1024, otherwise one decimal in KiB, MiB or GiB (GiB is the largest unit).
+ */
+function formatBytes(bytes: number | null | undefined): string {
+  if (bytes == null) return 'none';
+  if (bytes < 1024) return `${bytes} B`;
+
+  let scaled = bytes / 1024;
+  for (const unit of ['KiB', 'MiB']) {
+    if (scaled < 1024) return `${scaled.toFixed(1)} ${unit}`;
+    scaled /= 1024;
+  }
+  return `${scaled.toFixed(1)} GiB`;
+}
+
+/** Seconds elapsed since `startMs` (a `Date.now()` timestamp), formatted with one decimal and an `s` suffix. */
+function elapsed(startMs: number): string {
+  const seconds = (Date.now() - startMs) / 1000;
+  return `${seconds.toFixed(1)}s`;
+}
+
+/**
+ * Persist the per-point trace files and link them to `benchmarkResultIds`.
+ *
+ * Returns without writing anything when `benchmarkResultIds` is empty, when
+ * all three blobs are null, or when every listed row already has a
+ * `trace_replay_id`.
+ *
+ * NOTE(review): the blob insert and the two `benchmark_results` updates are
+ * separate statements. Unless the caller passes a transaction-scoped `sql`, a
+ * failure after the insert leaves an unlinked `agentic_trace_replay` row —
+ * confirm the caller's transaction handling.
+ *
+ * @param sql Active `postgres` connection.
+ * @param benchmarkResultIds DB ids of the benchmark_results rows produced by
+ *                           the same `bmk_agentic_<suffix>` artifact whose
+ *                           sibling `agentic_<suffix>` directory holds these
+ *                           trace files.
+ * @param profileExportJsonl Raw bytes of `profile_export.jsonl`, or null.
+ *                           Gzipped before storage.
+ * @param serverMetricsCsv Raw bytes of `server_metrics_export.csv`, or null.
+ *                           Stored as-is.
+ * @param serverMetricsJson Raw bytes of `server_metrics_export.json` —
+ *                           per-scrape time-series of every Prometheus metric.
+ *                           Optional, gzipped before storage (~42x ratio).
+ * @param options Canonical framework/disagg context plus optional
+ *                           progress label for CI logs.
+ * @returns Resolves once the blob row is inserted, the rows are linked and
+ *          (when chart data allows) the cache-hit metrics are updated.
+ */
+export async function insertTraceReplay(
+  sql: Sql,
+  benchmarkResultIds: number[],
+  profileExportJsonl: Buffer | null,
+  serverMetricsCsv: Buffer | null,
+  serverMetricsJson: Buffer | null = null,
+  options: TraceReplayIngestOptions = {},
+): Promise<void> {
+  // Parameter type accepted by `sql.json`, used for the derived-JSON columns.
+  type JsonParam = Parameters<typeof sql.json>[0];
+
+  const { metricsContext = {}, progressLabel } = options;
+  const log = (message: string): void => {
+    if (progressLabel) console.log(` trace_replay ${progressLabel}: ${message}`);
+  };
+
+  if (benchmarkResultIds.length === 0) return;
+  if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
+
+  // Only link rows that don't already point at a trace_replay row — keeps
+  // re-ingest from inserting duplicate sibling blobs.
+  const linkStart = Date.now();
+  log(`checking ${benchmarkResultIds.length} benchmark row(s) for existing links`);
+  const unlinked = await sql<{ id: number }[]>`
+    select id from benchmark_results
+    where id = any(${sql.array(benchmarkResultIds)}::bigint[])
+    and trace_replay_id is null
+  `;
+  log(`found ${unlinked.length} unlinked row(s) (${elapsed(linkStart)})`);
+  if (unlinked.length === 0) {
+    log('skipping blob insert; all benchmark rows already linked');
+    return;
+  }
+  // Both updates below target exactly these rows.
+  const unlinkedIds = unlinked.map((r) => r.id);
+
+  const gzipStart = Date.now();
+  log(
+    `compressing profile=${formatBytes(profileExportJsonl?.length)}, ` +
+      `server_csv=${formatBytes(serverMetricsCsv?.length)}, ` +
+      `server_json=${formatBytes(serverMetricsJson?.length)}`,
+  );
+  const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
+  const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
+  const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
+  const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
+  const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+  log(
+    `compressed profile=${formatBytes(profileGz?.length)}, ` +
+      `server_json=${formatBytes(metricsJsonGz?.length)} (${elapsed(gzipStart)})`,
+  );
+
+  // Pre-compute aggregate stats + chart-ready time-series + per-request
+  // timeline so the detail page doesn't have to re-parse these blobs on
+  // every request. Each helper tolerates a null blob and falls back to
+  // a streaming parser for oversized server_metrics blobs.
+  const computeStart = Date.now();
+  log('computing aggregate stats, chart series, and request timeline');
+  const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
+    computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
+    computeChartSeries(metricsJsonGz, metricsContext),
+    Promise.resolve(computeRequestTimeline(profileGz)),
+  ]);
+  log(
+    `computed derived JSON: chart_windows=${chartSeries?.timeslicesCount ?? 0}, ` +
+      `timeline_requests=${requestTimeline?.requests.length ?? 0} (${elapsed(computeStart)})`,
+  );
+
+  const insertStart = Date.now();
+  log('inserting trace_replay blob row');
+  const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
+    insert into agentic_trace_replay (
+      profile_export_jsonl_gz,
+      profile_export_uncompressed_size,
+      server_metrics_csv,
+      server_metrics_csv_size,
+      server_metrics_json_gz,
+      server_metrics_json_uncompressed_size,
+      aggregate_stats,
+      chart_series,
+      request_timeline
+    )
+    values (
+      ${profileGz},
+      ${profileSize},
+      ${serverMetricsCsv},
+      ${csvSize},
+      ${metricsJsonGz},
+      ${metricsJsonSize},
+      ${sql.json(structuredClone(aggregateStats) as unknown as JsonParam)},
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as JsonParam)},
+      ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as JsonParam)}
+    )
+    returning id
+  `;
+  log(`inserted trace_replay_id=${traceReplayId} (${elapsed(insertStart)})`);
+
+  const updateStart = Date.now();
+  log(`linking trace_replay_id=${traceReplayId} to ${unlinked.length} benchmark row(s)`);
+  await sql`
+    update benchmark_results
+    set trace_replay_id = ${traceReplayId}
+    where id = any(${sql.array(unlinkedIds)}::bigint[])
+  `;
+  log(`linked benchmark rows (${elapsed(updateStart)})`);
+
+  // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang
+  // runs don't populate these in the harness JSON; vLLM runs do but only
+  // for GPU. We always recompute to keep the derivation consistent with
+  // what the detail-page charts plot — overwriting any pre-existing value.
+  //
+  // Source label naming differs by framework / cache topology:
+  // SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)'
+  // SGLang older: 'cache hit' (no tier breakdown)
+  // vLLM LMCache: 'local_cache_hit' + 'external_kv_transfer' (+ 'local_compute' for miss)
+  // vLLM single: falls back to prefixCacheHitsTps total (= local cache only)
+  if (chartSeries && chartSeries.prefillTps.length > 0) {
+    const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
+    if (sumPrompts > 0) {
+      const sumOf = (name: string): number =>
+        (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
+      // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer.
+      const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer');
+      // GPU/HBM hits from source breakdown, summed across known aliases.
+      const hbmFromBreakdown =
+        sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit');
+      // If the source breakdown has any GPU entry, use it. Otherwise fall back
+      // to total prefixCacheHitsTps sum (single-source vLLM path with no
+      // by_source metric — equals the lone cache counter's lifetime).
+      const gpuHits =
+        hbmFromBreakdown > 0
+          ? hbmFromBreakdown
+          : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+      const gpuRate = gpuHits / sumPrompts;
+      // null leaves any existing server_cpu_cache_hit_rate untouched (see the CASE below).
+      const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null;
+      await sql`
+        update benchmark_results
+        set metrics = jsonb_set(
+          case when ${cpuRate}::numeric is not null
+            then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric))
+            else metrics
+          end,
+          '{server_gpu_cache_hit_rate}',
+          to_jsonb(${gpuRate}::numeric)
+        )
+        where id = any(${sql.array(unlinkedIds)}::bigint[])
+      `;
+      log('updated cache-hit metrics from chart series');
+    }
+  }
+}
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
new file mode 100644
index 00000000..444236ab
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -0,0 +1,259 @@
+import { describe, it, expect } from 'vitest';
+import {
+ countSeenPrefixBlocks,
+ buildConversationStructure,
+ countConversationRequests,
+ linearHistogram,
+ logHistogram,
+ logHistogramWithZero,
+ subagentRequestTurns,
+ summarizeValues,
+ type RawWekaConversation,
+ type SubagentNode,
+ type TurnNode,
+} from './weka-structure';
+
+// countSeenPrefixBlocks counts only the leading run of hashes present in the
+// set; a seen hash after the first miss does not count.
+describe('countSeenPrefixBlocks', () => {
+ it('counts only the contiguous leading run already seen', () => {
+ const seen = new Set([1, 2, 3, 9]);
+ // 1,2,3 seen contiguously; 4 breaks the run even though 9 is seen later.
+ expect(countSeenPrefixBlocks([1, 2, 3, 4, 9], seen)).toBe(3);
+ });
+
+ it('returns 0 when the first block is unseen', () => {
+ expect(countSeenPrefixBlocks([7, 1, 2], new Set([1, 2]))).toBe(0);
+ });
+
+ it('returns the full length when every block is seen', () => {
+ expect(countSeenPrefixBlocks([1, 2], new Set([1, 2, 3]))).toBe(2);
+ });
+
+ it('handles empty hash list', () => {
+ expect(countSeenPrefixBlocks([], new Set([1]))).toBe(0);
+ });
+});
+
+// Exercises the cached/uncached split, raw index stamping, subagent nesting
+// and timing, and the request-count helpers built on the structure.
+describe('buildConversationStructure', () => {
+ it('splits input into cached-prefix vs uncached as the prefix cache warms', () => {
+ const conv: RawWekaConversation = {
+ id: 'c1',
+ block_size: 64,
+ requests: [
+ // Turn 0: nothing seen yet → all uncached.
+ { type: 'n', model: 'm', in: 128, out: 10, hash_ids: [1, 2] },
+ // Turn 1: blocks 1,2 already seen, 3 is new → 2 blocks cached.
+ { type: 'n', model: 'm', in: 192, out: 20, hash_ids: [1, 2, 3] },
+ ],
+ };
+ const s = buildConversationStructure(conv);
+ const t0 = s.nodes[0] as TurnNode;
+ const t1 = s.nodes[1] as TurnNode;
+ expect(t0).toMatchObject({ kind: 'turn', in: 128, cached: 0, uncached: 128, out: 10 });
+ expect(t1.cached).toBe(128); // 2 blocks × 64
+ expect(t1.uncached).toBe(64); // 192 - 128
+ expect(s.totals).toMatchObject({
+ in: 320,
+ out: 30,
+ cached: 128,
+ uncached: 192,
+ numTurns: 2,
+ numSubagentGroups: 0,
+ });
+ });
+
+ // rawIndex is the position in the raw `requests` array, so the turn after a
+ // subagent entry keeps index 2 rather than being renumbered.
+ it('stamps top-level turns with their raw Weka request index', () => {
+ const structure = buildConversationStructure({
+ id: 'raw-index',
+ requests: [
+ { type: 'n', in: 1, out: 1 },
+ { type: 'subagent', requests: [{ type: 'n', in: 1, out: 1 }] },
+ { type: 'n', in: 1, out: 1 },
+ ],
+ });
+
+ expect((structure.nodes[0] as TurnNode).rawIndex).toBe(0);
+ expect((structure.nodes[2] as TurnNode).rawIndex).toBe(2);
+ });
+
+ it('clamps cached to the effective input on a partial last block', () => {
+ const conv: RawWekaConversation = {
+ id: 'c2',
+ block_size: 64,
+ requests: [
+ { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // 2 blocks but in=100 (partial)
+ { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // both seen → cached clamped to 100
+ ],
+ };
+ const s = buildConversationStructure(conv);
+ const t1 = s.nodes[1] as TurnNode;
+ expect(t1.cached).toBe(100);
+ expect(t1.uncached).toBe(0);
+ });
+
+ it('treats turns with no hash_ids as fully uncached', () => {
+ const conv: RawWekaConversation = {
+ id: 'c3',
+ requests: [{ type: 'n', in: 50, out: 5 }],
+ };
+ const t0 = buildConversationStructure(conv).nodes[0] as TurnNode;
+ expect(t0).toMatchObject({ cached: 0, uncached: 50 });
+ });
+
+ it('nests subagent groups with aggregated children and runs them against a spawn-time snapshot', () => {
+ const conv: RawWekaConversation = {
+ id: 'c4',
+ block_size: 64,
+ requests: [
+ { type: 'n', model: 'main', t: 0, api_time: 1, in: 64, out: 10, hash_ids: [1] },
+ {
+ type: 'subagent',
+ agent_id: 'a1',
+ subagent_type: 'Explore',
+ t: 12.5,
+ duration_ms: 1234,
+ requests: [
+ // sees parent block 1 (snapshot at spawn) → 1 block cached
+ { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] },
+ // now block 5 is also seen within the subagent → 2 cached
+ { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] },
+ ],
+ },
+ // Parent turn after subagent: block 5 must NOT be cached (subagent
+ // context not folded back); only block 1 is in the parent seen set.
+ { type: 'n', model: 'main', in: 128, out: 1, hash_ids: [1, 5] },
+ ],
+ };
+ const s = buildConversationStructure(conv);
+ expect(s.totals.numTurns).toBe(2); // two top-level normal turns
+ expect(s.totals.numSubagentGroups).toBe(1);
+
+ const sub = s.nodes[1] as SubagentNode;
+ expect(sub.kind).toBe('subagent');
+ expect(sub.label).toBe('Explore');
+ expect(sub.agentId).toBe('a1');
+ expect(sub.rawIndex).toBe(1);
+ expect(sub.durationMs).toBe(1234);
+ expect(sub.startS).toBe(12.5);
+ // Group end = start (12.5 s) + duration_ms (1234 ms).
+ expect(sub.endS).toBeCloseTo(13.734, 6);
+ expect(sub.children).toHaveLength(2);
+ expect(countConversationRequests(s)).toBe(4);
+ expect(subagentRequestTurns(s).map((turn) => turn.model)).toEqual(['sub', 'sub']);
+ // Children have no api_time, so each interval ends where it starts.
+ expect(sub.children.map((child) => [child.startS, child.endS])).toEqual([
+ [12.5, 12.5],
+ [13.1, 13.1],
+ ]);
+ // Children share the subagent's rawIndex and are numbered by innerIndex.
+ expect(sub.children.map((child) => [child.rawIndex, child.innerIndex])).toEqual([
+ [1, 0],
+ [1, 1],
+ ]);
+ expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
+ expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
+ expect(sub.in).toBe(256);
+ expect(sub.out).toBe(10);
+
+ const afterSub = s.nodes[2] as TurnNode;
+ expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back
+ expect((s.nodes[0] as TurnNode).endS).toBe(1);
+ });
+
+ it('counts top-level and subagent child turns as requests, but not subagent groups', () => {
+ const structure = buildConversationStructure({
+ id: 'request-count',
+ requests: [
+ { type: 'n', in: 1, out: 1 },
+ {
+ type: 'subagent',
+ requests: [
+ { type: 'n', in: 1, out: 1 },
+ { type: 'n', in: 1, out: 1 },
+ ],
+ },
+ ],
+ });
+
+ expect(countConversationRequests(structure)).toBe(3);
+ expect(subagentRequestTurns(structure)).toHaveLength(2);
+ });
+
+ it('falls back to the default block size and a generic subagent label', () => {
+ const conv: RawWekaConversation = {
+ id: 'c5',
+ requests: [{ type: 'subagent', requests: [{ type: 'n', in: 10, out: 1, hash_ids: [1] }] }],
+ };
+ const s = buildConversationStructure(conv);
+ expect(s.blockSize).toBe(64);
+ expect((s.nodes[0] as SubagentNode).label).toBe('Subagent');
+ });
+
+ // Without group `t`/`duration_ms` the range spans the earliest child start
+ // (5) to the latest child end (9 + 3 = 12).
+ it('derives a subagent time range from child timings when group timing is absent', () => {
+ const conv: RawWekaConversation = {
+ id: 'c6',
+ requests: [
+ {
+ type: 'subagent',
+ requests: [
+ { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 },
+ { type: 'n', t: 9, api_time: 3, in: 10, out: 1 },
+ ],
+ },
+ ],
+ };
+ const sub = buildConversationStructure(conv).nodes[0] as SubagentNode;
+ expect(sub.startS).toBe(5);
+ expect(sub.endS).toBe(12);
+ });
+
+ // A child `t` (2) earlier than the group `t` (100) is treated as an offset
+ // from the group start: 100 + 2 = 102, ending at 102 + 3 = 105.
+ it('normalizes legacy subagent-relative request intervals', () => {
+ const structure = buildConversationStructure({
+ id: 'legacy-relative',
+ requests: [
+ {
+ type: 'subagent',
+ t: 100,
+ requests: [{ type: 'n', t: 2, api_time: 3, in: 10, out: 1 }],
+ },
+ ],
+ });
+ const child = (structure.nodes[0] as SubagentNode).children[0]!;
+ expect(child).toMatchObject({ startS: 102, endS: 105 });
+ });
+});
+
+// Checks the histogram helpers through the weka-structure import path: bin
+// counts, preserved totals, and the empty/zero edge cases.
+describe('histograms', () => {
+ it('linearHistogram buckets across [0, max] and totals the count', () => {
+ const bins = linearHistogram([0, 1, 2, 3, 4], 4);
+ expect(bins).toHaveLength(4);
+ expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(5);
+ expect(bins[0].x0).toBe(0);
+ });
+
+ it('linearHistogram handles all-zero input', () => {
+ expect(linearHistogram([0, 0])).toEqual([{ x0: 0, x1: 1, count: 2 }]);
+ });
+
+ it('logHistogram drops non-positive values and preserves the positive total', () => {
+ // Six inputs, but only the four positive ones are expected in the bins.
+ const bins = logHistogram([1, 10, 100, 1000, 0, -5], 3);
+ expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(4);
+ });
+
+ it('both return [] for empty input', () => {
+ expect(linearHistogram([])).toEqual([]);
+ expect(logHistogram([])).toEqual([]);
+ });
+
+ it('preserves zero-valued samples in a dedicated log histogram bin', () => {
+ const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4);
+ expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 });
+ expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5);
+ });
+});
+
+// Input is the integers 1..100; the expected values are the linearly
+// interpolated percentiles of that sequence.
+describe('summarizeValues', () => {
+ it('computes the same linearly-interpolated percentile set as request distributions', () => {
+ const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1));
+ expect(summary.median).toBeCloseTo(50.5, 6);
+ expect(summary.p75).toBeCloseTo(75.25, 6);
+ expect(summary.p90).toBeCloseTo(90.1, 6);
+ expect(summary.p95).toBeCloseTo(95.05, 6);
+ });
+});
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
new file mode 100644
index 00000000..ccfb6ec7
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.ts
@@ -0,0 +1,327 @@
+/**
+ * Pure transforms for the HuggingFace cc-traces-weka datasets.
+ *
+ * Turns a raw conversation record (`{ id, block_size, requests[] }`, where each
+ * request is a normal turn or a subagent group) into a compact, flamegraph-ready
+ * `structure`: ordered nodes with input split into cached-prefix vs
+ * uncached-suffix. The cached split ports `_count_seen_prefix_blocks` from the
+ * aiperf weka loader (contiguous leading hash_ids already seen under an infinite
+ * KV cache). No DB access — safe to import anywhere and unit-test directly.
+ */
+
+/**
+ * Tokens per hash block, used by buildConversationStructure when neither an
+ * explicit override nor the conversation's `block_size` is provided.
+ */
+export const DEFAULT_BLOCK_SIZE = 64;
+
+// ── Raw record shapes (subset we read) ──────────────────────────────────────
+
+/** One model request in a raw Weka conversation; every field is optional in the raw data. */
+export interface RawWekaRequest {
+ /** Request start time; used as `startS` only when finite and non-negative. */
+ t?: number;
+ type?: string; // 'n' | 's'
+ model?: string;
+ /** Input token count; rounded and clamped to >= 0 when read. */
+ in?: number;
+ /** Output token count; rounded and clamped to >= 0 when read. */
+ out?: number;
+ /** Block hashes for the request's input; the leading run already seen counts as cached. */
+ hash_ids?: number[];
+ /** Request duration, added to the start time to produce `endS`. */
+ api_time?: number;
+}
+
+/** A subagent group entry: a marker (`type: 'subagent'`) carrying its own nested request list. */
+export interface RawWekaSubagent {
+ /** Group start time; also the base for legacy child timestamps that are relative to the group. */
+ t?: number;
+ type: 'subagent';
+ agent_id?: string;
+ /** Used as the node label; a blank or missing value falls back to 'Subagent'. */
+ subagent_type?: string;
+ /** Group duration in milliseconds; together with a known start it determines `endS`. */
+ duration_ms?: number;
+ /** Requests issued inside the subagent, in order. */
+ requests?: RawWekaRequest[];
+ models?: string[];
+}
+
+/** A top-level entry of a conversation: either a plain request or a subagent group. */
+export type RawWekaEntry = RawWekaRequest | RawWekaSubagent;
+
+/** Raw conversation record as read from the dataset (subset of fields). */
+export interface RawWekaConversation {
+ id: string;
+ models?: string[];
+ /** Tokens per hash block; DEFAULT_BLOCK_SIZE applies when absent and no override is passed. */
+ block_size?: number;
+ hash_id_scope?: string;
+ /** Top-level entries in order; a missing list is treated as empty. */
+ requests?: RawWekaEntry[];
+}
+
+// ── Output structure (stored in dataset_conversations.structure) ─────────────
+
+/** One model request in the output structure, top-level or nested under a subagent. */
+export interface TurnNode {
+ kind: 'turn';
+ /** Running counter assigned in traversal order; shared by top-level turns and subagent child turns. */
+ turnIndex: number;
+ /** Zero-based index in the raw Weka requests array, when this row maps to one. */
+ rawIndex?: number;
+ /** Zero-based index within a raw nested request array, when this row maps to one. */
+ innerIndex?: number;
+ /** Seconds from the start of the conversation. */
+ startS?: number;
+ /** End of the original request interval (`startS + api_time`). */
+ endS?: number;
+ model?: string;
+ in: number;
+ out: number;
+ /** Input tokens served from the prefix cache (≤ in). */
+ cached: number;
+ /** Input tokens that must be (re)computed (in - cached). */
+ uncached: number;
+}
+
+/** A subagent group in the output structure; token fields are sums over `children`. */
+export interface SubagentNode {
+ kind: 'subagent';
+ /** Trimmed `subagent_type`, or 'Subagent' when that is blank or missing. */
+ label: string;
+ agentId?: string;
+ /** Zero-based index of the raw top-level subagent marker. */
+ rawIndex?: number;
+ /** Seconds from the start of the conversation. */
+ startS?: number;
+ /** Seconds from the start of the conversation. */
+ endS?: number;
+ /** Raw `duration_ms` from the subagent entry, copied as-is. */
+ durationMs?: number;
+ in: number;
+ out: number;
+ cached: number;
+ uncached: number;
+ /** The subagent's own requests, in raw order. */
+ children: TurnNode[];
+}
+
+/** A top-level node of the structure: a main turn or a subagent group. */
+export type StructureNode = TurnNode | SubagentNode;
+
+/** Result of buildConversationStructure for one conversation. */
+export interface ConversationStructure {
+ /** Block size that was actually used for the cached/uncached split. */
+ blockSize: number;
+ /** Top-level nodes in raw request order. */
+ nodes: StructureNode[];
+ totals: {
+ // Token totals include subagent children as well as top-level turns.
+ in: number;
+ out: number;
+ cached: number;
+ uncached: number;
+ // Counts top-level turns only; subagent child turns are not included.
+ numTurns: number;
+ numSubagentGroups: number;
+ };
+}
+
+/**
+ * Number of actual model requests in a conversation: top-level turns plus
+ * every subagent child turn. Subagent groups themselves are not counted.
+ */
+export function countConversationRequests(structure: ConversationStructure): number {
+  const subagentTurnCount = subagentRequestTurns(structure).length;
+  return structure.totals.numTurns + subagentTurnCount;
+}
+
+/** Collect the child turns of every subagent group, in order; parent-agent turns are excluded. */
+export function subagentRequestTurns(structure: ConversationStructure): TurnNode[] {
+  const turns: TurnNode[] = [];
+  for (const node of structure.nodes) {
+    if (node.kind !== 'subagent') continue;
+    turns.push(...node.children);
+  }
+  return turns;
+}
+
+/** Type guard: an entry is a subagent group exactly when its `type` is 'subagent'. */
+const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent => {
+  const { type } = e;
+  return type === 'subagent';
+};
+
+/**
+ * Count contiguous leading hash_ids already present in `seen`
+ * (port of aiperf `_count_seen_prefix_blocks`).
+ *
+ * @param hashIds Block hashes of one request, in order.
+ * @param seen Hashes observed so far; not modified.
+ * @returns Length of the leading run of `hashIds` found in `seen`. Counting
+ *          stops at the first miss, so later hits do not count.
+ */
+export function countSeenPrefixBlocks(
+  hashIds: readonly number[],
+  seen: ReadonlySet<number>,
+): number {
+  let hits = 0;
+  for (const h of hashIds) {
+    if (!seen.has(h)) break;
+    hits += 1;
+  }
+  return hits;
+}
+
+/**
+ * Compute the {cached, uncached} input-token split for one request and fold its
+ * blocks into `seen`. `cached` is derived from blocks but clamped to the
+ * request's effective `in` so cached+uncached === in even when the last block is
+ * partial (in = hash_token_count, not always a multiple of blockSize).
+ *
+ * @param req Request whose `in` and `hash_ids` are read.
+ * @param seen Hashes observed so far; this request's hashes are added to it.
+ * @param blockSize Tokens per hash block.
+ * @returns The rounded, non-negative input count and its cached/uncached split.
+ *          A request without hash_ids is fully uncached and leaves `seen` untouched.
+ */
+function splitInput(
+  req: RawWekaRequest,
+  seen: Set<number>,
+  blockSize: number,
+): { in: number; cached: number; uncached: number } {
+  const input = Math.max(0, Math.round(req.in ?? 0));
+  const hashIds = req.hash_ids ?? [];
+  if (hashIds.length === 0) {
+    return { in: input, cached: 0, uncached: input };
+  }
+  // Count against the cache state *before* this request's blocks are added.
+  const cachedBlocks = countSeenPrefixBlocks(hashIds, seen);
+  for (const h of hashIds) seen.add(h);
+  // Lower clamp keeps `cached` non-negative if a non-positive block size is passed.
+  const cached = Math.max(0, Math.min(input, cachedBlocks * blockSize));
+  return { in: input, cached, uncached: input - cached };
+}
+
+/** Display label for a subagent group: its trimmed `subagent_type`, or 'Subagent' when blank or missing. */
+function subagentLabel(s: RawWekaSubagent): string {
+  const trimmed = s.subagent_type?.trim() ?? '';
+  return trimmed === '' ? 'Subagent' : trimmed;
+}
+
+/** Pass `value` through only when it is a finite, non-negative number; otherwise undefined. */
+function finiteTime(value: number | undefined): number | undefined {
+  if (typeof value !== 'number') return undefined;
+  if (!Number.isFinite(value) || value < 0) return undefined;
+  return value;
+}
+
+/**
+ * End of a request interval: `startS` plus `apiTime`, where an invalid or
+ * missing `apiTime` counts as zero. Undefined when the start is unknown.
+ */
+function requestEndS(startS: number | undefined, apiTime: number | undefined): number | undefined {
+  return startS === undefined ? undefined : startS + (finiteTime(apiTime) ?? 0);
+}
+
+/**
+ * Mirror aiperf's legacy-relative/current-absolute subagent timestamp handling.
+ *
+ * A child timestamp that falls before the group's start (beyond a 1e-6
+ * tolerance) is treated as an offset from the group start; otherwise it is
+ * used as-is. Returns undefined when the child has no valid timestamp.
+ */
+function subagentRequestStartS(
+  entry: RawWekaSubagent,
+  request: RawWekaRequest,
+): number | undefined {
+  const groupStart = finiteTime(entry.t);
+  const requestStart = finiteTime(request.t);
+  if (requestStart === undefined) return undefined;
+  if (groupStart === undefined || requestStart + 1e-6 >= groupStart) {
+    return requestStart;
+  }
+  return groupStart + requestStart;
+}
+
+/**
+ * Time range of a subagent group, in seconds.
+ *
+ * Start: the group's own `t` when valid, else the earliest child start.
+ * End: start + `duration_ms` when both are known; otherwise the latest child
+ * end, falling back to the start itself when no child has a usable timestamp.
+ *
+ * Children are scanned once, with running min/max rather than spreading an
+ * array into `Math.min`/`Math.max`, so very long request lists cannot hit the
+ * argument-count limit.
+ */
+function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } {
+  let earliestChildStart: number | undefined;
+  let latestChildEnd: number | undefined;
+  for (const child of entry.requests ?? []) {
+    const childStart = subagentRequestStartS(entry, child);
+    if (childStart === undefined) continue;
+    const childEnd = childStart + (finiteTime(child.api_time) ?? 0);
+    earliestChildStart =
+      earliestChildStart === undefined ? childStart : Math.min(earliestChildStart, childStart);
+    latestChildEnd = latestChildEnd === undefined ? childEnd : Math.max(latestChildEnd, childEnd);
+  }
+
+  const startS = finiteTime(entry.t) ?? earliestChildStart;
+  const durationMs = finiteTime(entry.duration_ms);
+  if (startS !== undefined && durationMs !== undefined) {
+    return { startS, endS: startS + durationMs / 1000 };
+  }
+
+  return { startS, endS: latestChildEnd ?? startS };
+}
+
+/**
+ * Build the flamegraph structure for one conversation. Main turns share a single
+ * accumulating prefix-cache `seen` set; each subagent group runs against a
+ * *copy* of the parent `seen` at spawn (its context is separate and is not
+ * folded back into the parent), mirroring the weka loader's parent/child split.
+ *
+ * @param conv Raw conversation record.
+ * @param blockSizeOverride Block size to use instead of `conv.block_size`; when
+ *                          both are absent DEFAULT_BLOCK_SIZE applies.
+ * @returns Ordered nodes plus totals. Token totals include subagent children;
+ *          `numTurns` counts top-level turns only.
+ */
+export function buildConversationStructure(
+  conv: RawWekaConversation,
+  blockSizeOverride?: number,
+): ConversationStructure {
+  const blockSize = blockSizeOverride ?? conv.block_size ?? DEFAULT_BLOCK_SIZE;
+  const seen = new Set<number>();
+  const nodes: StructureNode[] = [];
+  let totalIn = 0;
+  let totalOut = 0;
+  let totalCached = 0;
+  let totalUncached = 0;
+  let numTurns = 0;
+  let numSubagentGroups = 0;
+  // Shared by top-level turns and subagent children, in traversal order.
+  let turnIndex = 0;
+
+  for (const [idx, entry] of (conv.requests ?? []).entries()) {
+    if (isSubagent(entry)) {
+      const { startS, endS } = subagentTimeRange(entry);
+      const childSeen = new Set(seen); // snapshot at spawn; not merged back
+      const children: TurnNode[] = [];
+      let gin = 0;
+      let gout = 0;
+      let gcached = 0;
+      let guncached = 0;
+      for (const [innerIdx, inner] of (entry.requests ?? []).entries()) {
+        const split = splitInput(inner, childSeen, blockSize);
+        const out = Math.max(0, Math.round(inner.out ?? 0));
+        const childStartS = subagentRequestStartS(entry, inner);
+        children.push({
+          kind: 'turn',
+          turnIndex: turnIndex++,
+          rawIndex: idx,
+          innerIndex: innerIdx,
+          startS: childStartS,
+          endS: requestEndS(childStartS, inner.api_time),
+          model: inner.model,
+          in: split.in,
+          out,
+          cached: split.cached,
+          uncached: split.uncached,
+        });
+        gin += split.in;
+        gout += out;
+        gcached += split.cached;
+        guncached += split.uncached;
+      }
+      nodes.push({
+        kind: 'subagent',
+        label: subagentLabel(entry),
+        agentId: entry.agent_id,
+        rawIndex: idx,
+        startS,
+        endS,
+        durationMs: entry.duration_ms,
+        in: gin,
+        out: gout,
+        cached: gcached,
+        uncached: guncached,
+        children,
+      });
+      numSubagentGroups += 1;
+      totalIn += gin;
+      totalOut += gout;
+      totalCached += gcached;
+      totalUncached += guncached;
+    } else {
+      // Main turn: reads and extends the shared parent cache state.
+      const split = splitInput(entry, seen, blockSize);
+      const out = Math.max(0, Math.round(entry.out ?? 0));
+      const startS = finiteTime(entry.t);
+      nodes.push({
+        kind: 'turn',
+        turnIndex: turnIndex++,
+        rawIndex: idx,
+        startS,
+        endS: requestEndS(startS, entry.api_time),
+        model: entry.model,
+        in: split.in,
+        out,
+        cached: split.cached,
+        uncached: split.uncached,
+      });
+      numTurns += 1;
+      totalIn += split.in;
+      totalOut += out;
+      totalCached += split.cached;
+      totalUncached += split.uncached;
+    }
+  }
+
+  return {
+    blockSize,
+    nodes,
+    totals: {
+      in: totalIn,
+      out: totalOut,
+      cached: totalCached,
+      uncached: totalUncached,
+      numTurns,
+      numSubagentGroups,
+    },
+  };
+}
+
+// ── Distribution binning (for the dataset-detail cards) ──────────────────────
+// The implementations moved to distribution-stats.ts (generic, dataset-agnostic
+// math); re-exported here because this module is the established import path
+// for the dataset ingest/backfill scripts and the frontend.
+
+export {
+ linearHistogram,
+ logHistogram,
+ logHistogramWithZero,
+ summarizeValues,
+ type HistogramBin,
+ type NumberSummary,
+} from './distribution-stats';
diff --git a/packages/db/src/etl/workflow-run.ts b/packages/db/src/etl/workflow-run.ts
index 4097a3c5..28d27c87 100644
--- a/packages/db/src/etl/workflow-run.ts
+++ b/packages/db/src/etl/workflow-run.ts
@@ -26,6 +26,7 @@ export interface GithubRunInfo {
runStartedAt: string | null;
headSha: string | null;
headBranch: string | null;
+ headCommitMessage: string | null;
runAttempt: number | null;
pullRequests: GithubPullRequestRef[];
}
@@ -101,6 +102,7 @@ export function createWorkflowRunServices(sql: Sql, githubToken?: string) {
runStartedAt: d.run_started_at ? String(d.run_started_at) : null,
headSha: d.head_sha ? String(d.head_sha) : null,
headBranch: d.head_branch ? String(d.head_branch) : null,
+ headCommitMessage: d.head_commit?.message ? String(d.head_commit.message) : null,
runAttempt: typeof d.run_attempt === 'number' ? d.run_attempt : null,
pullRequests,
};
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index cb222a86..cada82d6 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -21,7 +21,6 @@
* original source sweep run, so public links point at the real benchmark run.
*/
-import { execSync } from 'child_process';
import fs from 'fs';
import os from 'os';
import path from 'path';
@@ -29,6 +28,12 @@ import path from 'path';
import { GPU_KEYS } from '@semianalysisai/inferencex-constants';
import { hasNoSslFlag } from './cli-utils';
+import {
+ dedupeArtifactsByLogicalName,
+ downloadArtifact,
+ fetchRunAttempt,
+ listRunArtifacts,
+} from './lib/github-artifacts';
import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils';
import { isRunAttemptPurged } from './etl/run-overrides';
import { createSkipTracker } from './etl/skip-tracker';
@@ -45,11 +50,15 @@ import {
bulkUpsertAvailability,
insertServerLog,
} from './etl/benchmark-ingest';
+import { insertTraceReplay } from './etl/trace-replay-ingest';
+import { discoverTraceReplayArtifacts } from './etl/trace-artifact-discovery';
+import { datasetSlugFromBenchmarkRow } from './etl/dataset-provenance';
import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
import { ingestEvalRow } from './etl/eval-ingest';
import { mapEvalSamples } from './etl/eval-samples-mapper';
import { bulkIngestEvalSamples } from './etl/eval-samples-ingest';
import {
+ type ChangelogEntry,
parseChangelogEntries,
ingestChangelogEntries,
hasEvalsOnlyFlag,
@@ -66,6 +75,29 @@ let runAttemptNum: number;
let REPO: string;
let tempDir: string | null = null;
+/**
+ * Render a byte count for progress logs using binary units (B, KiB, MiB, GiB).
+ *
+ * @param bytes - Size in bytes; `null`/`undefined` means the size is unknown.
+ * @returns `'none'` for a missing size, the exact count with a ` B` suffix
+ *   below 1024, and otherwise one decimal place in the largest unit up to GiB.
+ */
+function formatBytes(bytes: number | null | undefined): string {
+  if (bytes === null || bytes === undefined) return 'none';
+  if (bytes < 1024) return `${bytes} B`;
+  const units = ['KiB', 'MiB', 'GiB'];
+  let value = bytes / 1024;
+  let unitIndex = 0;
+  // Promote on the *rounded* value: 1048575 bytes is 1023.999 KiB, which
+  // would otherwise print as "1024.0 KiB" instead of "1.0 MiB".
+  while (unitIndex < units.length - 1 && Number(value.toFixed(1)) >= 1024) {
+    value /= 1024;
+    unitIndex += 1;
+  }
+  return `${value.toFixed(1)} ${units[unitIndex]}`;
+}
+
+/** Time passed since `startMs` (a `Date.now()` reading), as seconds with one decimal and an `s` suffix. */
+function elapsed(startMs: number): string {
+  const deltaSeconds = (Date.now() - startMs) / 1000;
+  return deltaSeconds.toFixed(1) + 's';
+}
+
+/**
+ * Size in bytes of the file at `pathname`, or `null` when no path was given
+ * or the file cannot be stat'ed (missing, unreadable, ...).
+ */
+function fileSize(pathname: string | null | undefined): number | null {
+  let size: number | null = null;
+  if (pathname) {
+    try {
+      size = fs.statSync(pathname).size;
+    } catch {
+      // Best-effort: callers only use the size for log output.
+      size = null;
+    }
+  }
+  return size;
+}
+
if (isDownloadMode) {
// --download [repo]
// Filter out '--' injected by pnpm arg passthrough
@@ -95,48 +127,20 @@ if (isDownloadMode) {
console.log(` Repo: ${REPO}`);
console.log(`\n--- Downloading artifacts to ${artifactsDir} ---`);
- const artifactListJson = execSync(
- `gh api "repos/${REPO}/actions/runs/${runIdStr}/artifacts" --paginate --jq '.artifacts[]'`,
- { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 },
- );
-
- const allArtifacts: { name: string; archive_download_url: string; created_at: string }[] = [];
- for (const line of artifactListJson.trim().split('\n')) {
- if (!line) continue;
- try {
- const parsed = JSON.parse(line);
- allArtifacts.push(parsed);
- } catch {}
- }
-
- const byName = new Map();
- for (const a of allArtifacts) {
- const existing = byName.get(a.name);
- if (!existing || a.created_at > existing.created_at) {
- byName.set(a.name, a);
- }
- }
+ // Retried configs produce artifacts on multiple runners — keep only the
+ // most recent per logical name (see RUNNER_SUFFIX_RE in github-artifacts)
+ // so a failed attempt's empty metrics can't overwrite the good one via
+ // ON CONFLICT DO UPDATE.
+ const byLogical = dedupeArtifactsByLogicalName(listRunArtifacts(REPO, runIdStr));
- for (const [name, artifact] of byName) {
- console.log(` ${name}`);
- const zipPath = path.join(artifactsDir, 'artifact.zip');
- execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
- stdio: ['pipe', 'pipe', 'inherit'],
- });
- const destDir = path.join(artifactsDir, name);
- fs.mkdirSync(destDir, { recursive: true });
- execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' });
- fs.unlinkSync(zipPath);
+ for (const artifact of byLogical.values()) {
+ console.log(` ${artifact.name}`);
+ downloadArtifact(artifact, artifactsDir);
}
- console.log(`\n Downloaded ${byName.size} artifact(s)`);
+ console.log(`\n Downloaded ${byLogical.size} artifact(s)`);
- // Fetch run attempt from API
- const attemptStr = execSync(
- `gh api "repos/${REPO}/actions/runs/${runIdStr}" --jq '.run_attempt'`,
- { encoding: 'utf8' },
- ).trim();
- runAttemptNum = parseInt(attemptStr || '1', 10);
+ runAttemptNum = fetchRunAttempt(REPO, runIdStr);
} else {
// CI mode — read from env vars
for (const key of [
@@ -194,6 +198,14 @@ const ARTIFACT_NAMES = {
changelog: 'changelog-metadata',
} as const;
+/**
+ * Remove a leading `bmk_` and then a leading `agentic_` from an artifact
+ * directory name. The remaining bare suffix is the key shared by a
+ * `bmk_agentic_<suffix>` artifact and its sibling `agentic_<suffix>` artifact.
+ */
+const stripBmkAndAgenticPrefix = (s: string): string =>
+  s.replace(/^(?:bmk_)?(?:agentic_)?/u, '');
+
function readJson(filePath: string): unknown {
try {
return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -294,13 +306,14 @@ async function main(): Promise {
const availRows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[] = [];
let totalNewBmk = 0,
@@ -311,6 +324,11 @@ async function main(): Promise {
let totalSamples = 0;
let totalSampleFiles = 0;
let totalChangelogs = 0;
+ let totalTraceReplayLinked = 0;
+ const datasetSlugs = new Set();
+ // Dataset slugs referenced by this run's agentic rows but absent from the
+ // `datasets` table — timeline→dataset deep links 404 until they're ingested.
+ const missingDatasets = new Set();
// ── Check for evals-only flag in changelog ────────────────────────────
const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -318,7 +336,7 @@ async function main(): Promise {
const parsedChangelogs: {
baseRef: string;
headRef: string;
- entries: ReturnType;
+ entries: ChangelogEntry[];
}[] = [];
for (const file of changelogFiles) {
const data = readJson(file) as Record | null;
@@ -329,6 +347,32 @@ async function main(): Promise {
const entries = parseChangelogEntries(data.entries);
if (entries.length > 0) parsedChangelogs.push({ baseRef, headRef, entries });
}
+ if (parsedChangelogs.length === 0) {
+ const headRef = workflowGhInfo?.headBranch ?? workflowGhInfo?.headSha ?? `run-${runIdStr}`;
+ // Prefer the workflow's display name ("e2e Test - B300 DSv4 AgentX vLLM 1h
+ // + 10m warmup") — it describes the sweep; the head commit message usually
+ // describes an unrelated code change.
+ const fallbackDescription =
+ workflowGhInfo?.name?.trim() ||
+ workflowGhInfo?.headCommitMessage?.trim().split('\n')[0]?.trim() ||
+ `GitHub Actions run ${runIdStr}`;
+
+ parsedChangelogs.push({
+ baseRef: 'unknown',
+ headRef,
+ entries: [
+ {
+ configKeys: [],
+ description: fallbackDescription,
+ prLink: null,
+ evalsOnly: false,
+ },
+ ],
+ });
+ console.log(
+ ` No changelog metadata artifact found; using fallback changelog: ${fallbackDescription}`,
+ );
+ }
const evalsOnly = hasEvalsOnlyFlag(parsedChangelogs);
if (evalsOnly) {
console.log('\n ⚠ evals-only run detected — skipping benchmark and stats ingest');
@@ -355,8 +399,13 @@ async function main(): Promise {
if (fs.existsSync(artifactsDir)) {
for (const d of fs.readdirSync(artifactsDir)) {
if (!d.startsWith('server_logs_')) continue;
- const logPath = path.join(artifactsDir, d, 'server.log');
- if (!fs.existsSync(logPath)) continue;
+ // feat-agentx-v1.0 harness nests the log under `results/server.log`;
+ // older runs keep it at the artifact root. Check both.
+ const logPath = [
+ path.join(artifactsDir, d, 'server.log'),
+ path.join(artifactsDir, d, 'results', 'server.log'),
+ ].find((p) => fs.existsSync(p));
+ if (!logPath) continue;
const configKey = d.replace(/^server_logs_/u, '');
serverLogPaths.set(configKey, logPath);
}
@@ -365,23 +414,53 @@ async function main(): Promise {
console.log(` Found ${serverLogPaths.size} server log artifact(s)`);
}
+ // Sibling aiperf artifacts: each `bmk_agentic_<suffix>` is paired with an
+ // `agentic_<suffix>` dir holding `profile_export.jsonl` and
+ // `server_metrics_export.csv`. The harness emits these under either a
+ // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current).
+ // Older non-aiperf agentic runs don't ship this sibling. Key on the bare
+ // suffix so both names map to the same Map entry.
+ const traceReplayPaths = discoverTraceReplayArtifacts(artifactsDir);
+ if (traceReplayPaths.size > 0) {
+ console.log(` Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
+ }
+
const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
console.log(` Found ${allBmkFiles.length} benchmark JSON file(s)`);
- for (const file of allBmkFiles) {
+ for (const [fileIndex, file] of allBmkFiles.entries()) {
+ const fileStart = Date.now();
+ const relativeFile = path.relative(artifactsDir, file);
+ console.log(
+ ` [${fileIndex + 1}/${allBmkFiles.length}] ${relativeFile} (${formatBytes(fileSize(file))})`,
+ );
const data = readJson(file);
- if (!data) continue;
+ if (!data) {
+ console.log(` skipped unreadable JSON (${elapsed(fileStart)})`);
+ continue;
+ }
const rawRows: Record[] = Array.isArray(data)
? data
: [data as Record];
+ console.log(` raw rows: ${rawRows.length}`);
+
+ for (const rawRow of rawRows) {
+ if (!rawRow || typeof rawRow !== 'object') continue;
+ const datasetSlug = datasetSlugFromBenchmarkRow(rawRow);
+ if (datasetSlug) datasetSlugs.add(datasetSlug);
+ }
const rows = rawRows
.filter((r) => typeof r === 'object' && r !== null)
.map((r) => mapBenchmarkRow(r, tracker))
.filter((r): r is NonNullable => r !== null);
- if (rows.length === 0) continue;
+ console.log(` mapped rows: ${rows.length}`);
+ if (rows.length === 0) {
+ console.log(` skipped; no mappable rows (${elapsed(fileStart)})`);
+ continue;
+ }
const toInsert = [];
for (const row of rows) {
@@ -392,15 +471,21 @@ async function main(): Promise {
tracker.recordDbError(`config for ${path.basename(file)}`, error);
}
}
+ console.log(` rows with resolved configs: ${toInsert.length}`);
if (toInsert.length > 0) {
try {
+ const insertStart = Date.now();
const { newCount, dupCount, insertedIds } = await bulkIngestBenchmarkRows(
sql,
toInsert,
workflowRunId,
date,
);
+ console.log(
+ ` benchmark rows: +${newCount} new, ${dupCount} dup, ` +
+ `${insertedIds.length} id(s) (${elapsed(insertStart)})`,
+ );
totalNewBmk += newCount;
totalDupBmk += dupCount;
@@ -415,28 +500,91 @@ async function main(): Promise {
framework: r.config.framework,
specMethod: r.config.specMethod,
disagg: r.config.disagg,
+ benchmarkType: r.benchmarkType,
});
}
const parentDir = path.basename(path.dirname(file));
if (parentDir.startsWith('bmk_') && insertedIds.length > 0) {
+ // Single-turn artifacts are `bmk_<config>` paired with
+ // `server_logs_<config>`. Agentic artifacts are `bmk_agentic_<config>`
+ // but the server log is still `server_logs_<config>` (no `agentic_`
+ // prefix), so fall back to the fully-stripped suffix — otherwise
+ // agentic rows never get their server log (and KV-pool size) linked.
const configKey = parentDir.replace(/^bmk_/u, '');
- const logPath = serverLogPaths.get(configKey);
+ const logPath =
+ serverLogPaths.get(configKey) ??
+ serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir));
if (logPath) {
try {
+ const serverLogStart = Date.now();
+ console.log(
+ ` server_log ${path.basename(logPath)} (${formatBytes(fileSize(logPath))})`,
+ );
const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', '');
await insertServerLog(sql, insertedIds, serverLog);
+ console.log(` server_log linked (${elapsed(serverLogStart)})`);
} catch (error: any) {
tracker.recordDbError(`server_log for ${configKey}`, error);
}
}
}
+
+ // Trace-replay sibling lookup for agentic points only. The aiperf
+ // harness emits `agentic_<suffix>/trace_replay/...` next to the
+ // `bmk_agentic_<suffix>` artifact we just ingested.
+ if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
+ const suffix = stripBmkAndAgenticPrefix(parentDir);
+ const concMatch = path.basename(file).match(/_conc(?\d+)\.json$/u);
+ const trace =
+ (concMatch?.groups?.conc
+ ? traceReplayPaths.get(`${suffix}|${concMatch.groups.conc}`)
+ : undefined) ?? traceReplayPaths.get(suffix);
+ if (trace) {
+ try {
+ const traceStart = Date.now();
+ console.log(
+ ` trace_replay ${suffix}: ` +
+ `profile=${formatBytes(fileSize(trace.profileJsonl))}, ` +
+ `server_csv=${formatBytes(fileSize(trace.serverMetricsCsv))}, ` +
+ `server_json=${formatBytes(fileSize(trace.serverMetricsJson))}`,
+ );
+ const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
+ const metrics = trace.serverMetricsCsv
+ ? fs.readFileSync(trace.serverMetricsCsv)
+ : null;
+ const metricsJson = trace.serverMetricsJson
+ ? fs.readFileSync(trace.serverMetricsJson)
+ : null;
+ await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, {
+ metricsContext: {
+ framework: toInsert[0]?.config.framework,
+ disagg: toInsert[0]?.config.disagg,
+ },
+ progressLabel: suffix,
+ });
+ totalTraceReplayLinked += insertedIds.length;
+ console.log(` trace_replay ${suffix}: done (${elapsed(traceStart)})`);
+ } catch (error: any) {
+ tracker.recordDbError(`trace_replay for ${suffix}`, error);
+ }
+ } else {
+ console.log(` trace_replay ${suffix}: missing sibling artifact`);
+ tracker.skips.traceReplayMissing++;
+ }
+ }
} catch (error: any) {
tracker.recordDbError(path.basename(file), error);
}
}
+ console.log(` finished ${relativeFile} (${elapsed(fileStart)})`);
}
console.log(` Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
+ if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {
+ console.log(
+ ` Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`,
+ );
+ }
if (availRows.length > 0) {
try {
@@ -446,6 +594,30 @@ async function main(): Promise {
tracker.recordDbError('availability', error);
}
}
+
+ if (datasetSlugs.size > 1) {
+ throw new Error(
+ `Conflicting dataset provenance in workflow run ${runId}: ${[...datasetSlugs].toSorted().join(', ')}`,
+ );
+ }
+ const [datasetSlug] = datasetSlugs;
+ if (datasetSlug) {
+ await sql`
+ insert into run_datasets (workflow_run_id, dataset_slug)
+ values (${workflowRunId}, ${datasetSlug})
+ on conflict (workflow_run_id) do update
+ set dataset_slug = excluded.dataset_slug
+ `;
+ console.log(` Dataset: linked workflow run to ${datasetSlug}`);
+ const [known] = await sql`select 1 as ok from datasets where slug = ${datasetSlug}`;
+ if (!known) {
+ missingDatasets.add(datasetSlug);
+ console.warn(
+ ` ⚠ Dataset ${datasetSlug} is not in the datasets table — request-timeline deep links ` +
+ `will 404 until it is ingested (packages/db/src/ingest-weka-dataset.ts)`,
+ );
+ }
+ }
}
// ── Ingest run stats ──────────────────────────────────────────────────
@@ -654,11 +826,17 @@ async function main(): Promise {
const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker;
const totalSkips =
- skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError;
+ skips.badZip +
+ skips.unmappedModel +
+ skips.unmappedHw +
+ skips.noIslOsl +
+ skips.failedRun +
+ skips.dbError;
if (totalSkips > 0) {
console.log(`\n Skipped: ${totalSkips} rows`);
const skipLines: [string, number][] = [
['no isl/osl (old format)', skips.noIslOsl],
+ ['failed run (0 successful)', skips.failedRun],
['unmapped model', skips.unmappedModel],
['unmapped hw', skips.unmappedHw],
['bad/empty zip', skips.badZip],
@@ -690,7 +868,10 @@ async function main(): Promise {
const unmappedOutPath = process.env.UNMAPPED_ENTITIES_OUTPUT;
if (
unmappedOutPath &&
- (unmappedModels.size > 0 || unmappedHws.size > 0 || unmappedPrecisions.size > 0)
+ (unmappedModels.size > 0 ||
+ unmappedHws.size > 0 ||
+ unmappedPrecisions.size > 0 ||
+ missingDatasets.size > 0)
) {
fs.writeFileSync(
unmappedOutPath,
@@ -698,6 +879,7 @@ async function main(): Promise {
models: [...unmappedModels],
hardware: [...unmappedHws],
precisions: [...unmappedPrecisions],
+ datasets: [...missingDatasets],
}),
);
}
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index b9f2b3b5..faa093e3 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -457,6 +457,9 @@ async function mapWorkflowDir(
unmappedModel: local.skips.unmappedModel,
unmappedHw: local.skips.unmappedHw,
noIslOsl: local.skips.noIslOsl,
+ failedRun: local.skips.failedRun,
+ // GCS backup doesn't ingest aiperf trace files; counter stays 0.
+ traceReplayMissing: local.skips.traceReplayMissing,
},
localUnmappedModels: new Set(local.unmappedModels),
localUnmappedHws: new Set(local.unmappedHws),
@@ -621,13 +624,14 @@ async function main(): Promise {
// Upsert availability rows only for successfully resolved configs
const availRows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[] = [];
for (const r of allInserted) {
availRows.push({
@@ -639,6 +643,7 @@ async function main(): Promise {
framework: r.config.framework,
specMethod: r.config.specMethod,
disagg: r.config.disagg,
+ benchmarkType: r.benchmarkType,
});
}
if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts
index a3b62fe0..f868767e 100644
--- a/packages/db/src/ingest-supplemental.ts
+++ b/packages/db/src/ingest-supplemental.ts
@@ -219,8 +219,10 @@ async function ingestSupplementalBmk(
const rows: {
configId: number;
- isl: number;
- osl: number;
+ benchmarkType: 'single_turn' | 'agentic_traces';
+ offloadMode: string;
+ isl: number | null;
+ osl: number | null;
conc: number;
image: string | null;
metrics: Record;
@@ -271,6 +273,8 @@ async function ingestSupplementalBmk(
rows.push({
configId,
+ benchmarkType: 'single_turn',
+ offloadMode: 'off',
isl: entry.isl,
osl: entry.osl,
conc: entry.conc,
@@ -294,13 +298,14 @@ async function ingestSupplementalBmk(
// to `rows` are exactly the valid ones.
const availRows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[] = [];
for (const entry of entries) {
const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined });
@@ -317,6 +322,7 @@ async function ingestSupplementalBmk(
framework,
specMethod,
disagg,
+ benchmarkType: 'single_turn',
});
}
if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
new file mode 100644
index 00000000..ed6774c0
--- /dev/null
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -0,0 +1,416 @@
+/**
+ * Ingest a HuggingFace cc-traces-weka dataset into the `datasets` +
+ * `dataset_conversations` tables that back the /datasets area.
+ *
+ * Public dataset, no token needed — fetched via the HF datasets-server rows API
+ * (rows are large, ~3.5 MB each, so we page in small chunks with adaptive
+ * backoff). Per conversation we build a flamegraph-ready `structure` (turns +
+ * subagent groups, input split into cached-prefix vs uncached) and accumulate
+ * dataset-level distributions for the detail cards. Raw hash_ids are discarded
+ * after the cached/uncached split is computed.
+ *
+ * Usage (DATABASE_WRITE_URL must be provided — never hardcoded):
+ * DATABASE_WRITE_URL='postgres://…' pnpm exec tsx src/ingest-weka-dataset.ts \
+ * semianalysisai/cc-traces-weka-062126 [--label "…"] [--variant full|256k] \
+ * [--description "…"] [--limit N]
+ *
+ * Upsert: re-running replaces the dataset's rows (delete + re-insert).
+ * Remember to purge the API cache afterwards (POST /api/v1/invalidate).
+ */
+
+import { createAdminSql } from './etl/db-utils';
+import { hasNoSslFlag } from './cli-utils';
+import {
+ buildConversationStructure,
+ countConversationRequests,
+ linearHistogram,
+ logHistogram,
+ logHistogramWithZero,
+ subagentRequestTurns,
+ summarizeValues,
+ type ConversationStructure,
+ type RawWekaConversation,
+ type TurnNode,
+} from './etl/weka-structure';
+
+const ROWS_API = 'https://datasets-server.huggingface.co/rows';
+const INFO_API = 'https://datasets-server.huggingface.co/info';
+
+/** Command-line options accepted by this script, as produced by `parseArgs`. */
+interface CliArgs {
+  /** Dataset identifier: the first positional argument. */
+  dataset: string;
+  /** Value of `--label`, when given. */
+  label?: string;
+  /** Value of `--variant`, when given (usage lists `full|256k`; not validated here). */
+  variant?: string;
+  /** Value of `--description`, when given. */
+  description?: string;
+  /** Value of `--limit` as a non-negative integer, when given. */
+  limit?: number;
+}
+
+/**
+ * Parse `process.argv` into {@link CliArgs}.
+ *
+ * The dataset is the first positional argument. A token that follows one of
+ * the value-taking flags (`--label`, `--variant`, `--description`, `--limit`)
+ * is that flag's value and is never mistaken for the dataset, so flags may
+ * appear before or after it. Any other `--…` token (including the bare `--`
+ * that pnpm injects) is ignored here.
+ *
+ * Prints the usage line and exits with status 1 when the dataset is missing
+ * or `--limit` is not a non-negative integer.
+ */
+function parseArgs(): CliArgs {
+  const usage =
+    'Usage: tsx src/ingest-weka-dataset.ts <dataset> [--label …] [--variant full|256k] [--description …] [--limit N]';
+  const valueFlags = new Set(['--label', '--variant', '--description', '--limit']);
+  const argv = process.argv.slice(2);
+
+  const flagValues = new Map<string, string>();
+  const positional: string[] = [];
+  for (let i = 0; i < argv.length; i++) {
+    const arg = argv[i];
+    if (arg === undefined) continue;
+    if (!arg.startsWith('--')) {
+      positional.push(arg);
+      continue;
+    }
+    if (!valueFlags.has(arg)) continue;
+    const value = argv[i + 1];
+    // A following `--flag` (or end of argv) means this flag has no value.
+    if (value === undefined || value.startsWith('--')) continue;
+    // First occurrence wins when a flag is repeated.
+    if (!flagValues.has(arg)) flagValues.set(arg, value);
+    i += 1; // consume the value so it is not treated as a positional
+  }
+
+  const dataset = positional[0];
+  if (!dataset) {
+    console.error(usage);
+    process.exit(1);
+  }
+
+  let limit: number | undefined;
+  const limitRaw = flagValues.get('--limit');
+  if (limitRaw !== undefined && limitRaw !== '') {
+    limit = Number(limitRaw);
+    if (!Number.isInteger(limit) || limit < 0) {
+      console.error(`Invalid --limit "${limitRaw}": expected a non-negative integer`);
+      console.error(usage);
+      process.exit(1);
+    }
+  }
+
+  return {
+    dataset,
+    label: flagValues.get('--label'),
+    variant: flagValues.get('--variant'),
+    description: flagValues.get('--description'),
+    limit,
+  };
+}
+
+/** Promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
+const sleep = (ms: number): Promise<void> =>
+  new Promise<void>((done) => {
+    setTimeout(done, ms);
+  });
+
+/**
+ * Fetch `url` and return its parsed JSON body.
+ *
+ * Rate-limited (429) and 5xx responses are retried with exponential backoff
+ * (2s, 4s, 8s, ...), up to 6 retries. A `Retry-After` header overrides the
+ * backoff when it carries a positive number of seconds or an HTTP-date in the
+ * future. Any other non-2xx response throws immediately.
+ *
+ * @param url - Absolute URL to GET.
+ * @param attempt - Number of retries already spent (callers normally omit it).
+ * @returns The decoded JSON payload; callers must narrow/cast it.
+ * @throws Error when retries are exhausted or the response is a non-retryable
+ *   failure. Network-level `fetch` rejections propagate unchanged.
+ */
+async function fetchJson(url: string, attempt = 0): Promise<unknown> {
+  const maxRetries = 6;
+  for (let tries = attempt; ; tries++) {
+    const res = await fetch(url);
+    if (res.ok) return res.json();
+
+    // The error body is never read; cancel it so the underlying connection
+    // is released instead of lingering across the backoff sleep.
+    await res.body?.cancel().catch(() => undefined);
+
+    const retryable = res.status === 429 || res.status >= 500;
+    if (!retryable) {
+      throw new Error(`${res.status} ${res.statusText} for ${url}`);
+    }
+    if (tries >= maxRetries) {
+      throw new Error(`${res.status} ${res.statusText} after ${tries} retries for ${url}`);
+    }
+
+    let waitMs = 2000 * 2 ** tries;
+    const retryAfter = res.headers.get('retry-after');
+    const retrySeconds = Number(retryAfter);
+    if (Number.isFinite(retrySeconds) && retrySeconds > 0) {
+      waitMs = retrySeconds * 1000;
+    } else if (retryAfter !== null && Number.isNaN(retrySeconds)) {
+      // Retry-After may also be an HTTP-date rather than delay-seconds.
+      const untilMs = Date.parse(retryAfter) - Date.now();
+      if (Number.isFinite(untilMs) && untilMs > 0) waitMs = untilMs;
+    }
+    console.warn(
+      `  ${res.status} ${res.statusText}; waiting ${Math.round(waitMs / 1000)}s (attempt ${tries + 1})`,
+    );
+    await sleep(waitMs);
+  }
+}
+
+/**
+ * Look up how many rows the dataset's `default` config / `train` split holds,
+ * via the datasets-server info endpoint (`INFO_API`).
+ *
+ * @param dataset - Dataset identifier, URL-encoded into the query string.
+ * @returns `num_examples` for `default` → `train`, or 0 when that field is
+ *   absent or not a number.
+ */
+async function getRowCount(dataset: string): Promise<number> {
+  // Only the fields read below are modelled; the payload is not otherwise validated.
+  const info = (await fetchJson(`${INFO_API}?dataset=${encodeURIComponent(dataset)}`)) as {
+    dataset_info?: Record<string, { splits?: Record<string, { num_examples?: number }> }>;
+  };
+  const cfg = info.dataset_info?.['default'];
+  const num = cfg?.splits?.['train']?.num_examples;
+  return typeof num === 'number' ? num : 0;
+}
+
+/** Page through rows with adaptive length (halve on "too big"/error). */
+async function* iterRows(
+ dataset: string,
+ total: number,
+ limit?: number,
+): AsyncGenerator