From 3326cc1fa19a082036998c1aebcea69aae9d5d54 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:11:04 -0500 Subject: [PATCH 01/40] chore: deps and toolchain config for the agentic stack (stream-json, adm-zip, audit overrides) --- .eslintignore | 3 +++ .oxlintrc.json | 1 + packages/db/package.json | 12 +++++++++++- pnpm-lock.yaml | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 .eslintignore diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 00000000..513a873e --- /dev/null +++ b/.eslintignore @@ -0,0 +1,3 @@ +# Stale agent worktrees produced by parallel Claude Code sessions — they +# hold their own branches and are linted as part of their own runs. +.claude/worktrees/ diff --git a/.oxlintrc.json b/.oxlintrc.json index ff610e51..5a03a5a0 100644 --- a/.oxlintrc.json +++ b/.oxlintrc.json @@ -28,6 +28,7 @@ "no-undef": "off", "no-underscore-dangle": "off", "no-useless-undefined": "off", + "require-unicode-regexp": "off", "no-warning-comments": "off", "prefer-destructuring": "off", "sort-imports": "off", diff --git a/packages/db/package.json b/packages/db/package.json index 8789f48b..c7836df4 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -19,6 +19,13 @@ "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts", "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts", "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts", + "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts", + "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts", + "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts", + "db:backfill-agentic-server-logs": "dotenv -e ../../.env -- tsx src/backfill-agentic-server-logs.ts", + "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts", + 
"db:backfill-kv-pool": "dotenv -e ../../.env -- tsx src/backfill-kv-pool.ts", + "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts", "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts", "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts", "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts", @@ -30,11 +37,14 @@ "@neondatabase/serverless": "^1.1.0", "@noble/ciphers": "^2.2.0", "@semianalysisai/inferencex-constants": "workspace:*", - "postgres": "^3.4.9" + "postgres": "^3.4.9", + "stream-chain": "^3.4.0", + "stream-json": "^2.1.0" }, "devDependencies": { "@types/adm-zip": "^0.5.8", "@types/node": "^26.0.1", + "@types/stream-json": "^1.7.8", "@vitest/coverage-v8": "^4.1.9", "adm-zip": "^0.5.18", "dotenv-cli": "^11.0.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 084c2485..58cdbba9 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -256,6 +256,12 @@ importers: postgres: specifier: ^3.4.9 version: 3.4.9 + stream-chain: + specifier: ^3.4.0 + version: 3.6.3 + stream-json: + specifier: ^2.1.0 + version: 2.1.0 devDependencies: '@types/adm-zip': specifier: ^0.5.8 @@ -263,6 +269,9 @@ importers: '@types/node': specifier: ^26.0.1 version: 26.0.1 + '@types/stream-json': + specifier: ^1.7.8 + version: 1.7.8 '@vitest/coverage-v8': specifier: ^4.1.9 version: 4.1.9(vitest@4.1.9) @@ -2096,6 +2105,12 @@ packages: '@types/stats.js@0.17.4': resolution: {integrity: sha512-jIBvWWShCvlBqBNIZt0KAshWpvSjhkwkEu4ZUcASoAvhmrgAUI2t1dXrjSL4xXVLB4FznPrIsX3nKXFl/Dt4vA==} + '@types/stream-chain@2.1.0': + resolution: {integrity: sha512-guDyAl6s/CAzXUOWpGK2bHvdiopLIwpGu8v10+lb9hnQOyo4oj/ZUQFOvqFjKGsE3wJP1fpIesCcMvbXuWsqOg==} + + '@types/stream-json@1.7.8': + resolution: {integrity: sha512-MU1OB1eFLcYWd1LjwKXrxdoPtXSRzRmAnnxs4Js/ayB5O/NvHraWwuOaqMWIebpYwM6khFlsJOHEhI9xK/ab4Q==} + '@types/three@0.185.0': resolution: {integrity: sha512-O2Uy8Cj4Nonr8dWUUbifMdPe8B0Mq7EdOHb89S4+kjUw/KhbjTZrUuYlrQ1bpUKG+EP9QJnN7qNxbHGlGoLHMA==} @@ 
-4812,9 +4827,15 @@ packages: resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==} engines: {node: '>= 0.4'} + stream-chain@3.6.3: + resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==} + stream-combiner@0.2.2: resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==} + stream-json@2.1.0: + resolution: {integrity: sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==} + string-width@4.2.3: resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} engines: {node: '>=8'} @@ -6950,6 +6971,15 @@ snapshots: '@types/stats.js@0.17.4': {} + '@types/stream-chain@2.1.0': + dependencies: + '@types/node': 26.0.1 + + '@types/stream-json@1.7.8': + dependencies: + '@types/node': 26.0.1 + '@types/stream-chain': 2.1.0 + '@types/three@0.185.0': dependencies: '@dimforge/rapier3d-compat': 0.12.0 @@ -10234,11 +10264,17 @@ snapshots: es-errors: 1.3.0 internal-slot: 1.1.0 + stream-chain@3.6.3: {} + stream-combiner@0.2.2: dependencies: duplexer: 0.1.2 through: 2.3.8 + stream-json@2.1.0: + dependencies: + stream-chain: 3.6.3 + string-width@4.2.3: dependencies: emoji-regex: 8.0.0 From 580adfddcbfa415891136c3b571ccc0f8b14efc6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:11:11 -0500 Subject: [PATCH 02/40] =?UTF-8?q?feat(db):=20agentic=20benchmark=20schema?= =?UTF-8?q?=20=E2=80=94=20agentic=5Ftraces=20results,=20trace-replay=20sid?= =?UTF-8?q?ecar,=20datasets=20tables?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/db/migrations/007_agentic.sql | 326 ++++++++++++++++++ ..._latest_benchmarks_single_run_per_line.sql | 49 +++ .../migrations/009_dataset_request_stats.sql | 55 +++ 3 files changed, 430 insertions(+) create 
mode 100644 packages/db/migrations/007_agentic.sql create mode 100644 packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql create mode 100644 packages/db/migrations/009_dataset_request_stats.sql diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/007_agentic.sql new file mode 100644 index 00000000..eceea82e --- /dev/null +++ b/packages/db/migrations/007_agentic.sql @@ -0,0 +1,326 @@ +-- 007_agentic.sql +-- +-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx +-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts +-- after master's highest migration (006_benchmark_results_workers), so the +-- branch's numbering no longer collides with master's 002-006. None of the +-- collapsed migrations had been applied to any deployed database. +-- +-- Statement order is preserved exactly. The latest_benchmarks recreate uses +-- 'select br.*', so it retains every benchmark_results column added earlier +-- (including master's 'workers' from 006) and re-keys the view on offload_mode. + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 002_agentic_scenario.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Support agentic scenarios in benchmark_results. +-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. 
+ +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 003_agentic_availability.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Extend the availability table to cover agentic scenarios. +-- +-- The 002_agentic_scenario section above relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. 
+ +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 004_offload_mode.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. 
+drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 006_agentic_trace_replay.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Capture raw aiperf trace files per agentic benchmark point. +-- +-- The aiperf harness produces two per-point export files inside each +-- `agentic_*` artifact: +-- - profile_export.jsonl (~2 MB raw, per-request data) +-- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots) +-- +-- We persist them so the dashboard can later show per-request distributions, +-- KV cache utilization over time, and conversation traces without needing to +-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at +-- ~500 KB per point post-gzip the total fits comfortably without a separate +-- blob service. +-- +-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK +-- column on benchmark_results). Older, non-aiperf agentic runs simply have a +-- NULL `trace_replay_id`. 
+ +create table agentic_trace_replay ( + id bigserial primary key, + -- gzip(profile_export.jsonl); null when only the server metrics file existed + profile_export_jsonl_gz bytea, + profile_export_uncompressed_size bigint, + -- raw csv bytes; null when only the profile file existed + server_metrics_csv bytea, + server_metrics_csv_size bigint, + created_at timestamptz not null default now() +); + +alter table benchmark_results + add column trace_replay_id bigint references agentic_trace_replay(id); + +create index benchmark_results_trace_replay_idx + on benchmark_results (trace_replay_id) + where trace_replay_id is not null; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 007_agentic_trace_server_metrics_json.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Add the full server-metrics time-series JSON to agentic_trace_replay. +-- +-- The existing `server_metrics_csv` column holds aiperf's summary export — +-- one row per metric with avg/min/max/std/p1..p99 across the entire run. +-- That's enough for the cumulative cache-hit number but not for any +-- "metric over time" view (KV cache utilization curve, queue depth, prefix +-- hit rate per interval, cumulative prefill token source). +-- +-- The harness also writes `server_metrics_export.json` which contains the +-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole +-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x +-- to ~6 MB gzipped (text with repeated metric names + numeric values). +-- That's the file we store here for any future time-series chart. 
+ +alter table agentic_trace_replay + add column server_metrics_json_gz bytea, + add column server_metrics_json_uncompressed_size bigint; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 008_agentic_aggregate_stats.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed aggregate stats for each agentic_trace_replay row. +-- +-- Previously the agentic detail page parsed the (huge) profile_export.jsonl +-- and server_metrics_json blobs on every request to compute distribution +-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived +-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the +-- worst rows (high-conc TP+EP server_metrics blobs that decompress past +-- Node's 512 MB string cap) couldn't be parsed without a stream fallback. +-- +-- This column holds the computed stats so the API serves the page from a +-- single SQL row read. Shape mirrors the existing benchmark_results.metrics +-- JSONB convention; an inner `version` field lets the backfill script +-- detect rows whose stats were computed by an older algorithm and +-- recompute them. Null when stats haven't been computed yet (existing +-- rows pre-backfill; the API has a slow-path fallback for that case). + +alter table agentic_trace_replay + add column aggregate_stats jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 009_agentic_chart_series.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed time-series for the agentic detail page chart. +-- +-- Sibling to `aggregate_stats` (the former 008_agentic_aggregate_stats section above): that column stores +-- per-row percentile/derived *summaries*, this one stores the full +-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate, +-- queueDepth, prefillTps, decodeTps, promptTokensBySource). 
+-- +-- Without this, the detail page parsed the entire `server_metrics_json_gz` +-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc +-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length). +-- With pre-computed series the page is a single SQL row read. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored series were produced by an older algorithm. +-- Null when the series haven't been computed yet; the API has a slow-path +-- fallback (with stream-parse for oversized blobs) for that case. + +alter table agentic_trace_replay + add column chart_series jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 010_agentic_request_timeline.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed per-request timeline for the agentic detail page. +-- +-- Sibling to `aggregate_stats` (was 008_agentic_aggregate_stats) and `chart_series` (was 009_agentic_chart_series). This one +-- holds a thin per-request array extracted from `profile_export_jsonl_gz` +-- so the detail page can render a Gantt-style swimlane of every request +-- (one bar per conversation turn) without re-parsing the JSONL on every +-- page load. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored timeline was produced by an older +-- algorithm. Null when the timeline hasn't been computed yet; the API +-- falls back to parsing the blob in that case. + +alter table agentic_trace_replay + add column request_timeline jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 011_datasets.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora +-- the agentic benchmarks replay) + their per-conversation trace structure. 
+-- +-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but +-- not the source traces. These two tables back the new /datasets area: a +-- registry of ingested dataset versions with precomputed summary + chart data, +-- and one row per conversation holding a flamegraph-ready `structure` (turns + +-- subagent groups with input split into cached-prefix vs uncached-suffix). The +-- raw hash_ids are NOT stored — they're only needed at ingest to derive the +-- cached/uncached split, so the runtime read is a single small JSONB. +-- +-- Additive only. To revert this migration: +-- drop table if exists dataset_conversations; +-- drop table if exists datasets; +-- (and see the run_datasets revert below; this is all one migration now: +-- delete from schema_migrations where filename = '007_agentic.sql';) + +create table datasets ( + -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'. + id text primary key, + -- URL key, e.g. 'cc-traces-weka-062126'. + slug text not null unique, + label text not null, + -- 'full' | '256k' | 'no-subagents' (the published variants). + variant text not null default 'full', + description text, + hf_url text, + license text, + conversation_count integer not null default 0, + -- Token totals, main_turns, subagent_groups, model mix, date range, etc. + summary jsonb not null default '{}'::jsonb, + -- Precomputed distributions for the dataset-detail cards (input/output length, + -- turns per conversation, subagent fan-out, …). Versioned via an inner field. + chart_data jsonb not null default '{}'::jsonb, + dataset_version integer not null default 1, + ingested_at timestamptz not null default now() +); + +create table dataset_conversations ( + id bigserial primary key, + dataset_id text not null references datasets(id) on delete cascade, + -- The conversation id from the dataset record (trace id). 
+ conv_id text not null, + models text[] not null default '{}', + num_turns integer not null default 0, + num_subagent_groups integer not null default 0, + total_in bigint not null default 0, + total_out bigint not null default 0, + total_cached bigint not null default 0, + -- Flamegraph-ready ordered node tree (turns + subagent groups, each with + -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts. + structure jsonb not null, + unique (dataset_id, conv_id) +); + +create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 012_run_datasets.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Maps a benchmark workflow_run to the source dataset it replayed, so the +-- agentic detail page can deep-link each request in the timeline to the exact +-- conversation in the /datasets viewer (the request's conversation_id, with any +-- ::sa:/::fa: suffix stripped, is the dataset conv_id). +-- +-- One row per workflow_run (every benchmark in a run replays the same dataset). +-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug> +-- URL) rather than an FK, so the mapping can be recorded before/independent of +-- the dataset being ingested; the UI degrades gracefully if the slug is absent. +-- +-- Additive only. 
To revert this whole squashed migration: +-- drop table if exists run_datasets; +-- drop table if exists dataset_conversations; +-- drop table if exists datasets; +-- drop table if exists agentic_trace_replay cascade; +-- (plus the benchmark_results/availability column + constraint changes above) +-- delete from schema_migrations where filename = '007_agentic.sql'; + +create table run_datasets ( + workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, + dataset_slug text not null, + created_at timestamptz not null default now() +); diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql new file mode 100644 index 00000000..039dfe09 --- /dev/null +++ b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql @@ -0,0 +1,49 @@ +-- ============================================================ +-- LATEST_BENCHMARKS — one run per line (no cross-run stitching) +-- ============================================================ +-- +-- Previously the view did `distinct on (config_id, conc, isl, osl)` ordered by +-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run +-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it +-- skipped fell back to an older run that did measure them, and a single chart line +-- ended up stitched from points produced by different runs on different dates. +-- +-- A line is one config + sequence + offload mode +-- (config_id, benchmark_type, isl, osl, offload_mode) plotted +-- across concurrencies, and it must come from a SINGLE workflow run. We pick the +-- newest run per line (newest date, then latest sweep by run_started_at, then +-- highest workflow_run_id so exactly one run wins even on a same-day / null tie), +-- then keep EVERY concurrency that one run measured. 
A partial re-sweep therefore +-- truncates the line to its own concurrencies rather than borrowing an older run's. + +drop materialized view if exists latest_benchmarks; + +create materialized view latest_benchmarks as +with winners as ( + select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode) + br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.workflow_run_id as winning_run_id + from benchmark_results br + join latest_workflow_runs wr on wr.id = br.workflow_run_id + where br.error is null + order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc +) +select br.* +from benchmark_results br +join winners w + on w.config_id = br.config_id + and w.benchmark_type = br.benchmark_type + and w.isl is not distinct from br.isl + and w.osl is not distinct from br.osl + and w.offload_mode = br.offload_mode + and w.winning_run_id = br.workflow_run_id +where br.error is null; + +-- Unique key now includes benchmark_type (part of the line key). One run per line +-- guarantees one row per concurrency, so this stays unique and keeps +-- REFRESH MATERIALIZED VIEW CONCURRENTLY working. +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode) + nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/migrations/009_dataset_request_stats.sql b/packages/db/migrations/009_dataset_request_stats.sql new file mode 100644 index 00000000..0b7c11bb --- /dev/null +++ b/packages/db/migrations/009_dataset_request_stats.sql @@ -0,0 +1,55 @@ +-- Backfill dataset-level requests/conversation statistics. +-- A request is one actual model call: each top-level turn plus each child turn +-- inside a subagent group. The group container itself is not a request. 
+ +with per_conversation as ( + select + dc.dataset_id, + dc.num_subagent_groups, + ( + dc.num_turns + coalesce(( + select sum(jsonb_array_length(node.value->'children')) + from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) as node(value) + where node.value->>'kind' = 'subagent' + ), 0) + )::double precision as request_count + from dataset_conversations dc +), request_stats as ( + select + dataset_id, + avg(request_count) as mean_requests, + percentile_cont(0.5) within group (order by request_count) as median_requests, + avg(num_subagent_groups::double precision) as mean_subagents, + percentile_cont(0.5) within group (order by num_subagent_groups) as median_subagents + from per_conversation + group by dataset_id +) +update datasets d +set summary = jsonb_set( + jsonb_set( + jsonb_set( + jsonb_set( + jsonb_set( + d.summary, + '{meanRequestsPerConversation}', + to_jsonb(request_stats.mean_requests), + true + ), + '{medianRequestsPerConversation}', + to_jsonb(request_stats.median_requests), + true + ), + '{meanSubagentsPerTrace}', + to_jsonb(request_stats.mean_subagents), + true + ), + '{medianSubagentsPerTrace}', + to_jsonb(request_stats.median_subagents), + true + ), + '{version}', + '3'::jsonb, + true +) +from request_stats +where d.id = request_stats.dataset_id; From 3870e2b9494c4cca8e96cd0590de9949c8b5e04c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:11:29 -0500 Subject: [PATCH 03/40] =?UTF-8?q?feat(db):=20agentic=20ETL=20=E2=80=94=20t?= =?UTF-8?q?race-replay=20ingest,=20chart-series/timeline/aggregate=20compu?= =?UTF-8?q?tation,=20v3=20agg=20schema?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/constants/src/agentic.ts | 2 + packages/constants/src/framework-aliases.ts | 1 + packages/constants/src/index.ts | 1 + packages/constants/src/metric-keys.ts | 71 ++- packages/constants/src/models.ts | 17 + packages/db/src/etl/agentic-v3-flatten.ts | 131 ++++ 
packages/db/src/etl/benchmark-ingest.ts | 40 +- packages/db/src/etl/benchmark-mapper.test.ts | 291 +++++++++ packages/db/src/etl/benchmark-mapper.ts | 260 +++++--- .../src/etl/compute-aggregate-stats.test.ts | 152 +++++ .../db/src/etl/compute-aggregate-stats.ts | 149 +++++ .../db/src/etl/compute-chart-series.test.ts | 341 +++++++++++ packages/db/src/etl/compute-chart-series.ts | 576 ++++++++++++++++++ .../src/etl/compute-request-timeline.test.ts | 210 +++++++ .../db/src/etl/compute-request-timeline.ts | 208 +++++++ .../db/src/etl/dataset-provenance.test.ts | 40 ++ packages/db/src/etl/dataset-provenance.ts | 32 + packages/db/src/etl/distribution-stats.ts | 98 +++ packages/db/src/etl/gzip-json-stream.test.ts | 66 ++ packages/db/src/etl/gzip-json-stream.ts | 58 ++ packages/db/src/etl/normalizers.test.ts | 5 + packages/db/src/etl/normalizers.ts | 20 +- .../db/src/etl/server-log-metrics.test.ts | 43 ++ packages/db/src/etl/server-log-metrics.ts | 65 ++ .../db/src/etl/server-metrics-adapters.ts | 100 +++ packages/db/src/etl/skip-tracker.test.ts | 1 + packages/db/src/etl/skip-tracker.ts | 13 +- .../src/etl/trace-artifact-discovery.test.ts | 66 ++ .../db/src/etl/trace-artifact-discovery.ts | 93 +++ packages/db/src/etl/trace-replay-ingest.ts | 151 +++++ packages/db/src/etl/weka-structure.test.ts | 259 ++++++++ packages/db/src/etl/weka-structure.ts | 327 ++++++++++ 32 files changed, 3799 insertions(+), 88 deletions(-) create mode 100644 packages/constants/src/agentic.ts create mode 100644 packages/db/src/etl/agentic-v3-flatten.ts create mode 100644 packages/db/src/etl/compute-aggregate-stats.test.ts create mode 100644 packages/db/src/etl/compute-aggregate-stats.ts create mode 100644 packages/db/src/etl/compute-chart-series.test.ts create mode 100644 packages/db/src/etl/compute-chart-series.ts create mode 100644 packages/db/src/etl/compute-request-timeline.test.ts create mode 100644 packages/db/src/etl/compute-request-timeline.ts create mode 100644 
packages/db/src/etl/dataset-provenance.test.ts create mode 100644 packages/db/src/etl/dataset-provenance.ts create mode 100644 packages/db/src/etl/distribution-stats.ts create mode 100644 packages/db/src/etl/gzip-json-stream.test.ts create mode 100644 packages/db/src/etl/gzip-json-stream.ts create mode 100644 packages/db/src/etl/server-log-metrics.test.ts create mode 100644 packages/db/src/etl/server-log-metrics.ts create mode 100644 packages/db/src/etl/server-metrics-adapters.ts create mode 100644 packages/db/src/etl/trace-artifact-discovery.test.ts create mode 100644 packages/db/src/etl/trace-artifact-discovery.ts create mode 100644 packages/db/src/etl/trace-replay-ingest.ts create mode 100644 packages/db/src/etl/weka-structure.test.ts create mode 100644 packages/db/src/etl/weka-structure.ts diff --git a/packages/constants/src/agentic.ts b/packages/constants/src/agentic.ts new file mode 100644 index 00000000..42eab306 --- /dev/null +++ b/packages/constants/src/agentic.ts @@ -0,0 +1,2 @@ +/** Fixed output length used by the experimental normalized-E2E chart metric. 
*/ +export const NORMALIZED_E2E_OUTPUT_TOKENS = 400; diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts index 6c775be4..74cbce3f 100644 --- a/packages/constants/src/framework-aliases.ts +++ b/packages/constants/src/framework-aliases.ts @@ -46,6 +46,7 @@ export const FRAMEWORK_LABELS: Record = { ]), ), mtp: 'MTP', + aiperf: 'AIPerf', }; /** diff --git a/packages/constants/src/index.ts b/packages/constants/src/index.ts index e767e500..7d3d6783 100644 --- a/packages/constants/src/index.ts +++ b/packages/constants/src/index.ts @@ -1,3 +1,4 @@ +export * from './agentic'; export * from './framework-aliases'; export * from './github'; export * from './gpu-keys'; diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts index 7fa88c97..914eed4b 100644 --- a/packages/constants/src/metric-keys.ts +++ b/packages/constants/src/metric-keys.ts @@ -1,48 +1,117 @@ /** * Canonical set of metric keys stored in the benchmark_results.metrics JSONB column. * - * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU. + * Latency values (ttft/tpot/itl/e2el/intvty) are in seconds. Throughput values are + * tokens/sec — `_per_gpu` is per-GPU, `_tps` is total tokens/sec across the deployment. + * + * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency, + * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs + * carry median/mean/p99/std for latency only. 
*/ export const METRIC_KEYS = new Set([ // throughput (tokens/sec/GPU) 'tput_per_gpu', 'output_tput_per_gpu', 'input_tput_per_gpu', + // throughput (tokens/sec, deployment total) — agentic aiperf reports both + 'total_tput_tps', + 'output_tput_tps', + 'input_tput_tps', // TTFT — time to first token 'median_ttft', 'mean_ttft', + 'p75_ttft', 'p90_ttft', + 'p95_ttft', 'p99_ttft', 'p99.9_ttft', 'std_ttft', // TPOT — time per output token 'median_tpot', 'mean_tpot', + 'p75_tpot', 'p90_tpot', + 'p95_tpot', 'p99_tpot', 'p99.9_tpot', 'std_tpot', // ITL — inter-token latency 'median_itl', 'mean_itl', + 'p75_itl', 'p90_itl', + 'p95_itl', 'p99_itl', 'p99.9_itl', 'std_itl', // E2EL — end-to-end latency 'median_e2el', 'mean_e2el', + 'p75_e2el', 'p90_e2el', + 'p95_e2el', 'p99_e2el', 'p99.9_e2el', 'std_e2el', // interactivity 'median_intvty', 'mean_intvty', + 'p75_intvty', 'p90_intvty', + 'p95_intvty', 'p99_intvty', 'p99.9_intvty', 'std_intvty', + // QPS — queries per second (agentic aiperf) + 'median_qps', + 'mean_qps', + 'p75_qps', + 'p90_qps', + 'p95_qps', + 'p99_qps', + 'p99.9_qps', + 'std_qps', + // per-request input token count distribution + 'median_input_tokens', + 'mean_input_tokens', + 'p75_input_tokens', + 'p90_input_tokens', + 'p95_input_tokens', + 'p99_input_tokens', + 'p99.9_input_tokens', + 'std_input_tokens', + // per-request output token count distribution — actual served + 'median_output_tokens_actual', + 'mean_output_tokens_actual', + 'p75_output_tokens_actual', + 'p90_output_tokens_actual', + 'p95_output_tokens_actual', + 'p99_output_tokens_actual', + 'p99.9_output_tokens_actual', + 'std_output_tokens_actual', + // per-request output token count distribution — expected from trace + 'median_output_tokens_expected', + 'mean_output_tokens_expected', + 'p75_output_tokens_expected', + 'p90_output_tokens_expected', + 'p95_output_tokens_expected', + 'p99_output_tokens_expected', + 'p99.9_output_tokens_expected', + 'std_output_tokens_expected', + // run totals 
(agentic aiperf) + 'duration_seconds', + 'total_requests_completed', + 'total_prompt_tokens', + 'total_generation_tokens', + // server prefix-cache observability (agentic aiperf) + 'server_gpu_cache_hit_rate', + 'server_cpu_cache_hit_rate', + 'server_external_cache_hit_rate', + 'theoretical_cache_hit_rate', + // server KV-cache occupancy — mean GPU KV-cache usage fraction (0-1) over the + // profiling window (agentic aiperf; flat in v2 artifacts, mapped from + // server_metrics.kv_cache.gpu_usage_pct in v3) + 'gpu_kv_cache_usage_pct', // measured power / energy (emitted by runner's aggregate_power.py) // avg_power_w: mean per-GPU draw (W) during the load window // joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE on diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts index 06dfa09b..9622fe8c 100644 --- a/packages/constants/src/models.ts +++ b/packages/constants/src/models.ts @@ -56,3 +56,20 @@ export function islOslToSequence(isl: number, osl: number): string | null { }; return map[`${isl}_${osl}`] ?? null; } + +/** + * Map a benchmark/availability row to its sequence (scenario) string. + * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl. + * - Other rows (today: `single_turn`) fall back to `islOslToSequence`. + * Returns `null` for rows that can't be classified (e.g. `single_turn` with + * unmapped isl/osl values). 
+ */ +export function rowToSequence(row: { + isl: number | null; + osl: number | null; + benchmark_type: string; +}): string | null { + if (row.benchmark_type === 'agentic_traces') return 'agentic-traces'; + if (row.isl === null || row.osl === null) return null; + return islOslToSequence(row.isl, row.osl); +} diff --git a/packages/db/src/etl/agentic-v3-flatten.ts b/packages/db/src/etl/agentic-v3-flatten.ts new file mode 100644 index 00000000..a3c223af --- /dev/null +++ b/packages/db/src/etl/agentic-v3-flatten.ts @@ -0,0 +1,131 @@ +/** + * v3 agentic agg schema (2026-07-02+): nested containers → canonical flat keys. + * + * v3 artifacts nest their metrics under `request_metrics` / `server_metrics` + * containers; v1/v2 emitted the same information as flat top-level fields. + * `flattenAgenticAggRow` maps the nested shape onto the flat schema the DB / + * API / frontend consume, so the rest of the mapper stays version-agnostic. + */ + +import { parseNum } from './normalizers'; + +/** + * Distribution stat names accepted from v3 nested stat blocks, with the rename + * applied when flattening. `p50` is stored as `median_*` to match the + * established METRIC_KEYS naming (fixed-seq runs and the frontend both use + * `median_*`; no `p50_*` key exists anywhere downstream). + */ +const V3_STAT_KEYS: Record = { + mean: 'mean', + p50: 'median', + median: 'median', + p75: 'p75', + p90: 'p90', + p95: 'p95', + p99: 'p99', + 'p99.9': 'p99.9', + std: 'std', +}; + +/** v3 `request_metrics.latency` sub-blocks → flat metric suffix (same name). */ +const V3_LATENCY_METRICS = ['ttft', 'e2el', 'itl', 'tpot', 'intvty'] as const; + +/** v3 `request_metrics.tokens` sub-blocks → flat metric suffix. */ +const V3_TOKEN_METRICS: Record = { + input: 'input_tokens', + output_actual: 'output_tokens_actual', + output_expected: 'output_tokens_expected', +}; + +/** + * Scalar paths in the v3 nested containers → canonical flat metric key. 
Keys + * reuse the flat v2-agentic names wherever one existed so already-ingested runs + * and the frontend see one consistent schema; genuinely new information gets a + * new key (registered in METRIC_KEYS). + */ +const V3_SCALAR_PATHS: [string[], string][] = [ + // client-side throughput + [['request_metrics', 'throughput', 'input', 'tokens_per_second'], 'input_tput_tps'], + [['request_metrics', 'throughput', 'output', 'tokens_per_second'], 'output_tput_tps'], + [['request_metrics', 'throughput', 'total', 'tokens_per_second'], 'total_tput_tps'], + [['request_metrics', 'throughput', 'duration_seconds'], 'duration_seconds'], + [['request_metrics', 'throughput', 'per_gpu', 'total_tput_tps'], 'tput_per_gpu'], + [['request_metrics', 'throughput', 'per_gpu', 'output_tput_tps'], 'output_tput_per_gpu'], + [['request_metrics', 'throughput', 'per_gpu', 'input_tput_tps'], 'input_tput_per_gpu'], + [['request_metrics', 'cache', 'theoretical_cache_hit_rate'], 'theoretical_cache_hit_rate'], + // server-side prefix-cache observability (same fields v2 emitted flat) + [['server_metrics', 'cache', 'gpu_cache_hit_rate'], 'server_gpu_cache_hit_rate'], + [['server_metrics', 'cache', 'cpu_cache_hit_rate'], 'server_cpu_cache_hit_rate'], + [['server_metrics', 'cache', 'external_cache_hit_rate'], 'server_external_cache_hit_rate'], + // KV-cache occupancy (gpu key predates v3 as a flat auto-captured field) + [['server_metrics', 'kv_cache', 'gpu_usage_pct'], 'gpu_kv_cache_usage_pct'], + // server token totals + [['server_metrics', 'tokens', 'prompt_total'], 'total_prompt_tokens'], + [['server_metrics', 'tokens', 'generation_total'], 'total_generation_tokens'], + [['server_metrics', 'tokens', 'requests_completed'], 'total_requests_completed'], + // Deliberately NOT mapped (yet): cache.overall/prefix_cache_hits/queries, + // kv_cache.cpu_*, tokens.prompt_by_source, sources[] — new v3 detail we don't + // consume anywhere; add here + METRIC_KEYS when a view needs them. 
+]; + +/** Walk a nested object path; returns undefined on any non-object hop. */ +function atPath(obj: Record, path: string[]): unknown { + let cur: unknown = obj; + for (const seg of path) { + if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined; + cur = (cur as Record)[seg]; + } + return cur; +} + +/** Flatten one v3 stat block ({mean, p50, …}) into `out` as `{stat}_{suffix}`. */ +function flattenStatBlock(block: unknown, suffix: string, out: Record): void { + if (!block || typeof block !== 'object' || Array.isArray(block)) return; + for (const [stat, canonical] of Object.entries(V3_STAT_KEYS)) { + const n = parseNum((block as Record)[stat]); + if (n !== undefined) out[`${canonical}_${suffix}`] = n; + } +} + +/** + * Flatten a v3 agentic agg row (nested `request_metrics` / `server_metrics` + * containers, 2026-07-02+) into the canonical flat metric schema that v1/v2 + * artifacts emitted directly and that the DB / API / frontend consume. + * + * Returns the row unchanged when `request_metrics` is absent (v1/v2 rows pass + * through untouched). Otherwise returns a copy with the flattened metrics + * merged in; the nested containers stay on the row (they're in NON_METRIC_KEYS + * so the auto-capture loop ignores them). + * + * Notes on the v3 source data: + * - `p50` percentiles are new (v2 had no median for agentic); stored as + * `median_*` to match the frontend's naming. + * - `latency.intvty` arrives already slow-tail inverted (pXX_intvty = + * 1/pXX_itl). It's flattened here for completeness, but mapBenchmarkRow's + * derive-from-itl invariant still overwrites it, keeping one definition + * across all harness versions. 
+ */ +export function flattenAgenticAggRow(row: Record): Record { + const rm = row.request_metrics; + if (!rm || typeof rm !== 'object' || Array.isArray(rm)) return row; + + const flat: Record = {}; + + // latency distributions + for (const metric of V3_LATENCY_METRICS) { + flattenStatBlock(atPath(row, ['request_metrics', 'latency', metric]), metric, flat); + } + // qps distribution (window_seconds / samples are intentionally not stats) + flattenStatBlock(atPath(row, ['request_metrics', 'qps']), 'qps', flat); + // per-request token-count distributions + for (const [src, suffix] of Object.entries(V3_TOKEN_METRICS)) { + flattenStatBlock(atPath(row, ['request_metrics', 'tokens', src]), suffix, flat); + } + // scalars + for (const [path, key] of V3_SCALAR_PATHS) { + const n = parseNum(atPath(row, path)); + if (n !== undefined) flat[key] = n; + } + + return { ...row, ...flat }; +} diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index a5493629..a405789d 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -4,6 +4,7 @@ import type postgres from 'postgres'; import type { BenchmarkParams } from './benchmark-mapper'; +import { kvCachePoolTokensFromServerLog } from './server-log-metrics'; type Sql = ReturnType; @@ -29,12 +30,19 @@ export async function bulkIngestBenchmarkRows( // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears // more than once in a single batch. Deduplicate within the batch, keeping - // the last occurrence (last metrics for each unique config/isl/osl/conc). + // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode). const seen = new Map(); - for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r); + for (const r of rows) { + seen.set( + `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? 
''}-${r.conc}-${r.offloadMode}`, + r, + ); + } const deduped = [...seen.values()]; const configIds = deduped.map((r) => r.configId); + const benchmarkTypes = deduped.map((r) => r.benchmarkType); + const offloadModes = deduped.map((r) => r.offloadMode); const isls = deduped.map((r) => r.isl); const osls = deduped.map((r) => r.osl); const concs = deduped.map((r) => r.conc); @@ -49,13 +57,14 @@ export async function bulkIngestBenchmarkRows( const result = await sql<{ inserted: boolean; id: number }[]>` insert into benchmark_results ( - workflow_run_id, config_id, benchmark_type, date, + workflow_run_id, config_id, benchmark_type, offload_mode, date, isl, osl, conc, image, metrics, workers ) select ${workflowRunId}, unnest(${sql.array(configIds)}::int[]), - 'single_turn', + unnest(${sql.array(benchmarkTypes)}::text[]), + unnest(${sql.array(offloadModes)}::text[]), ${date}::date, unnest(${sql.array(isls)}::int[]), unnest(${sql.array(osls)}::int[]), @@ -63,7 +72,7 @@ export async function bulkIngestBenchmarkRows( unnest(${sql.array(images)}), unnest(${sql.array(metricsJsons)}::jsonb[]), unnest(${sql.array(workersJsons)}::jsonb[]) - on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc) + on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode) do update set metrics = excluded.metrics, image = excluded.image, @@ -98,9 +107,18 @@ export async function insertServerLog( insert into server_logs (server_log) values (${serverLog}) returning id `; + // Derive the KV-cache pool size (tokens) from the log's authoritative + // "GPU KV cache size: N tokens" line(s) and stash it on the result's metrics + // JSON, mirroring how trace-replay-ingest derives cache-hit rates. The + // scraped vllm:cache_config_info metric can't reconstruct this for MLA models. 
+ const kvCachePoolTokens = kvCachePoolTokensFromServerLog(serverLog); await sql` update benchmark_results - set server_log_id = ${logId} + set server_log_id = ${logId}${ + kvCachePoolTokens === null + ? sql`` + : sql`, metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${kvCachePoolTokens}::bigint))` + } where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; } @@ -155,13 +173,14 @@ export async function bulkUpsertAvailability( sql: Sql, rows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[], date: string, ): Promise { @@ -170,7 +189,7 @@ export async function bulkUpsertAvailability( const seen = new Set(); const unique: typeof rows = []; for (const r of rows) { - const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`; + const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? 
''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`; if (!seen.has(key)) { seen.add(key); unique.push(r); @@ -178,7 +197,7 @@ export async function bulkUpsertAvailability( } await sql` - insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date) + insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date) select unnest(${sql.array(unique.map((r) => r.model))}::text[]), unnest(${sql.array(unique.map((r) => r.isl))}::int[]), @@ -188,6 +207,7 @@ export async function bulkUpsertAvailability( unnest(${sql.array(unique.map((r) => r.framework))}::text[]), unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]), unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]), + unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]), ${date}::date on conflict do nothing `; diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts index 65fb3e39..cde2f74b 100644 --- a/packages/db/src/etl/benchmark-mapper.test.ts +++ b/packages/db/src/etl/benchmark-mapper.test.ts @@ -22,6 +22,20 @@ function makeV1Row(overrides: Record = {}): Record { }; } +/** Minimal valid agentic row: scenario_type triggers the agentic path; `users` → conc. */ +function makeAgenticRow(overrides: Record = {}): Record { + return { + infmax_model_prefix: 'dsv4', + hw: 'b200-nv', + framework: 'vllm', + precision: 'fp4', + scenario_type: 'agentic-coding', + users: 72, + tput_per_gpu: 20000, + ...overrides, + }; +} + /** Minimal valid v2 benchmark row (disaggregated prefill/decode parallelism). 
*/ function makeV2Row(overrides: Record = {}): Record { return { @@ -570,3 +584,280 @@ describe('extractWorkers', () => { expect(extractWorkers([null, 'bad', 0, undefined])).toBeUndefined(); }); }); + +describe('mapBenchmarkRow — agentic interactivity normalization', () => { + it('derives *_intvty from 1/*_itl, discarding the artifact value', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeAgenticRow({ + p90_itl: 0.0893, + p90_intvty: 23.91, // fast-tail contamination — must be overwritten + p75_itl: 0.0692, + p75_intvty: 19, + }), + tracker, + ); + expect(result!.benchmarkType).toBe('agentic_traces'); + expect(result!.metrics.p90_intvty).toBeCloseTo(1 / 0.0893, 6); + expect(result!.metrics.p75_intvty).toBeCloseTo(1 / 0.0692, 6); + }); + + it('derives *_intvty even when the artifact omits it', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0.1 }), tracker); + expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6); + }); + + it('does not touch *_intvty for single_turn rows', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker); + expect(result!.metrics.p90_intvty).toBe(999); + }); +}); + +/** + * Minimal v3 agentic row (2026-07-02+): nested request_metrics/server_metrics, + * p50 percentiles, pre-inverted intvty, kv_offloading descriptors. Mirrors the + * real artifact from GH run 28553943579 (trimmed). 
+ */ +function makeV3AgenticRow(overrides: Record = {}): Record { + return { + infmax_model_prefix: 'dsv4', + hw: 'cluster:b300-nv', + framework: 'vllm', + precision: 'fp4', + spec_decoding: 'none', + disagg: false, + scenario_type: 'agentic-coding', + is_multinode: false, + tp: 4, + ep: 1, + dp_attention: 'false', + conc: 16, + image: 'vllm/vllm-openai:v0.23.0', + kv_offloading: 'none', + kv_offload_backend: '', + num_requests_total: 1648, + num_requests_successful: 1648, + dataset: { + source_type: 'public_dataset', + hf_dataset_name: 'semianalysisai/cc-traces-weka-062126', + }, + request_metrics: { + qps: { + window_seconds: 1, + samples: 7209, + mean: 0.22846, + p50: 0, + p75: 0, + p90: 1, + p95: 1, + std: 0.60707, + }, + latency: { + ttft: { + mean: 12.90033, + p50: 1.49712, + p75: 12.09501, + p90: 56.22194, + p95: 68.03156, + std: 22.68353, + }, + e2el: { + mean: 81.05644, + p50: 26.18817, + p75: 84.93601, + p90: 199.85996, + p95: 360.31579, + std: 149.59205, + }, + itl: { + mean: 0.07548, + p50: 0.03677, + p75: 0.10253, + p90: 0.16652, + p95: 0.22255, + std: 0.08327, + }, + tpot: { + mean: 0.07548, + p50: 0.03677, + p75: 0.10253, + p90: 0.16652, + p95: 0.22255, + std: 0.08327, + }, + // already slow-tail inverted upstream (pXX_intvty = 1/pXX_itl) + intvty: { + mean: 13.2482, + p50: 27.19411, + p75: 9.75304, + p90: 6.00526, + p95: 4.49335, + std: 24.77636, + }, + }, + tokens: { + input: { + mean: 157676.054, + p50: 96047, + p75: 197684.25, + p90: 404935.9, + p95: 547502.85, + std: 152480.17653, + }, + output_actual: { + mean: 849.06735, + p50: 290.5, + p75: 783.5, + p90: 2231.8, + p95: 3915.45, + std: 1568.90823, + }, + output_expected: { + mean: 1432.32728, + p50: 571.5, + p75: 1820, + p90: 3927, + p95: 5312.9, + std: 2067.19215, + }, + }, + throughput: { + input: { tokens_per_second: 35980.14001 }, + output: { tokens_per_second: 193.7489 }, + total: { tokens_per_second: 36173.88892 }, + duration_seconds: 7222.04352, + per_gpu: { + total_tput_tps: 
9043.47223, + output_tput_tps: 48.43723, + input_tput_tps: 8995.035, + }, + }, + cache: { theoretical_cache_hit_rate: 0.97509 }, + }, + server_metrics: { + present: true, + adapter: 'vllm', + metric_count: 49, + cache: { + gpu_cache_hit_rate: 0.78539, + cpu_cache_hit_rate: 0, + external_cache_hit_rate: 0, + overall_cache_hit_rate: 0.78539, + prefix_cache_hits: 205576960, + prefix_cache_queries: 261750519, + frontend_cache_hit_rate: null, + }, + kv_cache: { gpu_usage_pct: 0.82134, cpu_usage_pct: null, cpu_used_tokens: null }, + tokens: { + prompt_total: 261750519, + generation_total: 1422696, + requests_completed: 1648, + prompt_by_source: { + gpu_cache_hit: 205576960, + cpu_or_external_cache_hit: 0, + computed: 56173559, + }, + }, + sources: [{ id: 'combined|http://localhost:8888/metrics|engine=0', role: 'combined' }], + }, + ...overrides, + }; +} + +describe('mapBenchmarkRow — v3 agentic nested agg schema', () => { + it('maps identity/routing and flattens the nested containers', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV3AgenticRow(), tracker); + + expect(result).not.toBeNull(); + expect(result!.benchmarkType).toBe('agentic_traces'); + expect(result!.config.hardware).toBe('b300'); + expect(result!.conc).toBe(16); + expect(result!.isl).toBeNull(); + expect(result!.osl).toBeNull(); + + const m = result!.metrics; + // latency distributions, p50 stored under the canonical median_* name + expect(m.median_ttft).toBeCloseTo(1.49712, 6); + expect(m.p90_ttft).toBeCloseTo(56.22194, 6); + expect(m.std_e2el).toBeCloseTo(149.59205, 6); + expect(m.p95_itl).toBeCloseTo(0.22255, 6); + expect(m.mean_tpot).toBeCloseTo(0.07548, 6); + // qps + token distributions + expect(m.median_qps).toBe(0); + expect(m.p90_input_tokens).toBeCloseTo(404935.9, 3); + expect(m.median_output_tokens_actual).toBeCloseTo(290.5, 3); + expect(m.p95_output_tokens_expected).toBeCloseTo(5312.9, 3); + // throughput scalars under the v2 flat names + 
expect(m.tput_per_gpu).toBeCloseTo(9043.47223, 3); + expect(m.output_tput_per_gpu).toBeCloseTo(48.43723, 3); + expect(m.input_tput_per_gpu).toBeCloseTo(8995.035, 3); + expect(m.total_tput_tps).toBeCloseTo(36173.88892, 3); + expect(m.duration_seconds).toBeCloseTo(7222.04352, 3); + // cache / kv / totals + expect(m.theoretical_cache_hit_rate).toBeCloseTo(0.97509, 6); + expect(m.server_gpu_cache_hit_rate).toBeCloseTo(0.78539, 6); + expect(m.server_external_cache_hit_rate).toBe(0); + expect(m.gpu_kv_cache_usage_pct).toBeCloseTo(0.82134, 6); + expect(m.total_prompt_tokens).toBe(261750519); + expect(m.total_generation_tokens).toBe(1422696); + expect(m.total_requests_completed).toBe(1648); + // nested containers must not leak into metrics + expect(m).not.toHaveProperty('request_metrics'); + expect(m).not.toHaveProperty('server_metrics'); + }); + + it('re-derives *_intvty from *_itl (matching the pre-inverted artifact values)', () => { + const tracker = createSkipTracker(); + const m = mapBenchmarkRow(makeV3AgenticRow(), tracker)!.metrics; + // The artifact already ships slow-tail intvty; the derive invariant keeps + // one definition and must agree with it (up to the artifact's rounding). 
+ expect(m.median_intvty).toBeCloseTo(1 / 0.03677, 6); + expect(m.p90_intvty).toBeCloseTo(1 / 0.16652, 6); + expect(m.median_intvty).toBeCloseTo(27.19411, 2); + expect(m.p90_intvty).toBeCloseTo(6.00526, 2); + // std is never inverted — passes through from the artifact + expect(m.std_intvty).toBeCloseTo(24.77636, 6); + }); + + it("maps kv_offloading 'none' to offload off and skips the empty backend", () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV3AgenticRow(), tracker); + expect(result!.offloadMode).toBe('off'); + expect(result!.metrics).not.toHaveProperty('kv_offload_backend'); + }); + + it("maps kv_offloading 'dram' + backend to offload on with the backend preserved", () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV3AgenticRow({ kv_offloading: 'dram', kv_offload_backend: 'mooncake', conc: 32 }), + tracker, + ); + expect(result!.offloadMode).toBe('on'); + expect((result!.metrics as Record).kv_offloading).toBe('dram'); + expect((result!.metrics as Record).kv_offload_backend).toBe('mooncake'); + }); + + it('still applies the failed-run guard to v3 rows', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 100 }), + tracker, + ); + expect(result).toBeNull(); + expect(tracker.skips.failedRun).toBe(1); + }); + + it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeAgenticRow({ p90_itl: 0.1, mean_ttft: 1.5, offload_mode: 'on' }), + tracker, + ); + expect(result!.metrics.mean_ttft).toBe(1.5); + expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6); + expect(result!.offloadMode).toBe('on'); + }); +}); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index b25baf60..90c23ef0 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ 
b/packages/db/src/etl/benchmark-mapper.ts @@ -1,11 +1,13 @@ /** * Benchmark row mapper: raw JSON dict → typed `BenchmarkParams`. - * Handles both v1 (single tp/ep) and v2 (separate prefill/decode fields). + * Handles v1 (single tp/ep), v2 (separate prefill/decode fields), and v3 + * (nested agentic containers, flattened via {@link flattenAgenticAggRow}). */ import type { ConfigParams } from './config-cache'; import type { SkipTracker } from './skip-tracker'; import { METRIC_KEYS, PRECISION_KEYS } from '@semianalysisai/inferencex-constants'; +import { flattenAgenticAggRow } from './agentic-v3-flatten'; import { resolveModelKey, hwToGpuKey, @@ -17,11 +19,7 @@ import { parseInt2, } from './normalizers'; -/** - * Raw artifact field names that are renamed when stored as metrics. - * All other numeric fields not in `NON_METRIC_KEYS` are stored under their raw name. - */ -const METRIC_RENAMES: Record = {}; +export { flattenAgenticAggRow }; /** * Raw artifact fields that are config/routing dimensions, not metrics. @@ -57,12 +55,41 @@ const NON_METRIC_KEYS = new Set([ 'decode_num_workers', 'num_prefill_gpu', 'num_decode_gpu', + // agentic scenario + 'scenario_type', + 'users', + 'offload_mode', + 'num_requests_total', + 'num_requests_successful', + // v3 agentic KV-offload descriptors ('none'|'dram'|… + backend name). Mapped + // to offloadMode / stringified metrics explicitly in mapBenchmarkRow. + 'kv_offloading', + 'kv_offload_backend', + // v3 agentic nested containers — flattened by flattenAgenticAggRow before + // the auto-capture loop runs; the raw objects themselves are not metrics. + 'request_metrics', + 'server_metrics', + // Public-dataset provenance emitted by aiperf. The ingest runner uses this + // object to populate run_datasets; it is not a benchmark metric. + 'dataset', // per-worker measured-power array (not a numeric scalar). 
Surfaced as a // sibling of the metrics JSONB by mapBenchmarkRow so the metrics column // stays Record for the index signature on BenchmarkRow. 'workers', ]); +/** + * `benchmark_type` values understood by the ingest. + * - `single_turn` — fixed sequence-length runs (isl/osl set). + * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc). + */ +export type BenchmarkType = 'single_turn' | 'agentic_traces'; + +/** Reduce an offload descriptor ('none'|'dram'|…) to the binary on/off. */ +function descriptorToOnOff(v: unknown): string | null { + return typeof v === 'string' && v.length > 0 ? (v === 'none' ? 'off' : 'on') : null; +} + /** * METRIC_KEYS from constants is the canonical set of known metric keys. * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured @@ -91,9 +118,13 @@ export interface WorkerPower { export interface BenchmarkParams { config: ConfigParams; - isl: number; - osl: number; + benchmarkType: BenchmarkType; + // Null for agentic_traces; present for single_turn. + isl: number | null; + osl: number | null; conc: number; + /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */ + offloadMode: string; image: string | null; metrics: Record; /** @@ -110,9 +141,11 @@ export interface BenchmarkParams { /** * Map a raw benchmark result dict to typed `BenchmarkParams`. * - * Supports two artifact schemas: + * Supports three artifact schemas: * - **v1** (pre-2025-12-19): single `tp`/`ep` for both prefill and decode. * - **v2** (2025-12-19+): separate `prefill_tp`/`decode_tp` etc. for disaggregated configs. + * - **v3** (2026-07-02+, agentic only): nested `request_metrics`/`server_metrics` + * containers, flattened to the v2 flat schema up front by `flattenAgenticAggRow`. * * When mapping fails (unknown model, unknown hardware, or missing ISL/OSL/conc), * the appropriate skip counter on `tracker` is incremented and `null` is returned. 
@@ -128,6 +161,11 @@ export function mapBenchmarkRow( tracker: SkipTracker, islOslFallback?: { isl: number; osl: number } | null, ): BenchmarkParams | null { + // v3 agentic rows nest their metrics; flatten to the canonical flat schema + // first so the rest of the mapper (auto-capture, intvty invariant, guards) + // is version-agnostic. No-op for v1/v2 rows. + row = flattenAgenticAggRow(row); + const modelKey = resolveModelKey(row); if (!modelKey) { tracker.skips.unmappedModel++; @@ -144,14 +182,44 @@ export function mapBenchmarkRow( return null; } - const isl = parseInt2(row.isl) ?? islOslFallback?.isl; - const osl = parseInt2(row.osl) ?? islOslFallback?.osl; - const conc = parseInt2(row.conc); - if (!isl || !osl || !conc) { + // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants), + // no isl/osl, and `users` instead of `conc`. Everything else stays as-is. + const isAgentic = String(row.scenario_type ?? '').startsWith('agentic'); + const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn'; + + const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); + const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); + // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones. + const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc); + if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } + // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from + // every runner, including ones with 0 successful requests and null metrics. + // Without this skip, the empty row's nulls overwrite a good row via + // ON CONFLICT DO UPDATE when both share the same (config, conc, offload). 
+ if ( + typeof row.num_requests_successful === 'number' && + row.num_requests_successful === 0 && + typeof row.num_requests_total === 'number' && + row.num_requests_total > 0 + ) { + tracker.skips.failedRun++; + return null; + } + + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), then the v3 + // `kv_offloading` descriptor ('none'|'dram'|…), then legacy `offloading`. + // Descriptors reduce to the binary on/off used for row identity ('none' → + // 'off', anything else → 'on') so v3 offload points keep colliding-key parity + // with their v2 predecessors instead of forking a third offload_mode value. + const offloadModeRaw = + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : (descriptorToOnOff(row.kv_offloading) ?? descriptorToOnOff(row.offloading) ?? 'off'); + const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg); const isMultinode = parseBool(row.is_multinode); const precision = normalizePrecision(String(row.precision ?? '')); @@ -160,55 +228,36 @@ export function mapBenchmarkRow( } const specMethod = normalizeSpecMethod(row.spec_decoding); - let prefillTp: number, prefillEp: number, prefillDpAttn: boolean, prefillNumWorkers: number; - let decodeTp: number, decodeEp: number, decodeDpAttn: boolean, decodeNumWorkers: number; - let numPrefillGpu: number, numDecodeGpu: number; + const parallelism = resolveParallelism(row); + const metrics = captureNumericMetrics(row); - if ('prefill_tp' in row) { - // v2 schema: full disagg parallelism fields - prefillTp = parseInt2(row.prefill_tp) ?? 1; - prefillEp = parseInt2(row.prefill_ep) ?? 1; - prefillDpAttn = parseBool(row.prefill_dp_attention); - prefillNumWorkers = parseInt2(row.prefill_num_workers) ?? 0; - decodeTp = parseInt2(row.decode_tp) ?? 1; - decodeEp = parseInt2(row.decode_ep) ?? 1; - decodeDpAttn = parseBool(row.decode_dp_attention); - decodeNumWorkers = parseInt2(row.decode_num_workers) ?? 
0; - numPrefillGpu = parseInt2(row.num_prefill_gpu) ?? prefillTp * prefillEp; - numDecodeGpu = parseInt2(row.num_decode_gpu) ?? decodeTp * decodeEp; - } else { - // v1 schema: single tp/ep, prefill = decode - const tp = parseInt2(row.tp) ?? 1; - const ep = parseInt2(row.ep) ?? 1; - const dpAttn = parseBool(row.dp_attention); - prefillTp = tp; - decodeTp = tp; - prefillEp = ep; - decodeEp = ep; - prefillDpAttn = dpAttn; - decodeDpAttn = dpAttn; - prefillNumWorkers = 0; - decodeNumWorkers = 0; - numPrefillGpu = tp * ep; - numDecodeGpu = tp * ep; + // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`) + // — preserve as a stringified metric so the frontend can expose it in tooltips. + // v3 rows additionally carry the offload tier + backend ('dram'/'mooncake'); + // keep them so the UI can say *what kind* of offload, not just on/off. + if (isAgentic) { + (metrics as Record).offload_mode = offloadModeRaw; + if (typeof row.kv_offloading === 'string' && row.kv_offloading.length > 0) { + (metrics as Record).kv_offloading = row.kv_offloading; + } + if (typeof row.kv_offload_backend === 'string' && row.kv_offload_backend.length > 0) { + (metrics as Record).kv_offload_backend = row.kv_offload_backend; + } } - // Auto-capture all numeric fields not reserved for config/routing dimensions. - // Fields in METRIC_RENAMES are stored under their canonical name; all others - // use the raw key. Any key outside METRIC_KEYS triggers a one-time - // warning so new schema additions don't go silently unnoticed. - const metrics: Record = {}; - for (const [rawKey, val] of Object.entries(row)) { - if (NON_METRIC_KEYS.has(rawKey)) continue; - const n = parseNum(val); - if (n === undefined) continue; - const storedKey = METRIC_RENAMES[rawKey] ?? 
rawKey; - metrics[storedKey] = n; - if (!METRIC_KEYS.has(rawKey) && !_warnedMetricKeys.has(rawKey)) { - _warnedMetricKeys.add(rawKey); - console.warn( - ` [WARN] auto-captured unexpected metric '${rawKey}' — add to METRIC_KEYS in constants/src/metric-keys.ts or NON_METRIC_KEYS in benchmark-mapper.ts`, - ); + // Slow-tail interactivity invariant. Agentic artifacts ship `*_intvty`, but the + // definition has drifted across harness versions: some emit `1/p(ITL)` + // (slow-tail), others `p(1/ITL)` — which inverts percentile order, so p90 comes + // out as ~1/p10(ITL) instead. The inference chart's interactivity selector and + // the detail time-series both treat interactivity as the reciprocal of the ITL + // percentile, so we derive it from `*_itl` here rather than trust the artifact, + // keeping every agentic row on one definition. `std` is excluded — the + // reciprocal of a standard deviation is meaningless. Mirrored in the frontend + // overlay path (agenticAliases) and the one-time backfill-agentic-intvty script. + if (isAgentic) { + for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { + const itl = metrics[`${k}_itl`]; + if (typeof itl === 'number' && itl > 0) metrics[`${k}_intvty`] = 1 / itl; } } @@ -231,26 +280,99 @@ export function mapBenchmarkRow( specMethod, disagg, isMultinode, - prefillTp, - prefillEp, - prefillDpAttn, - prefillNumWorkers, - decodeTp, - decodeEp, - decodeDpAttn, - decodeNumWorkers, - numPrefillGpu, - numDecodeGpu, + ...parallelism, }, + benchmarkType, isl, osl, conc, + offloadMode: offloadModeRaw, image, metrics, workers, }; } +/** The parallelism slice of `ConfigParams`, resolved from either artifact schema. */ +type ParallelismParams = Pick< + ConfigParams, + | 'prefillTp' + | 'prefillEp' + | 'prefillDpAttn' + | 'prefillNumWorkers' + | 'decodeTp' + | 'decodeEp' + | 'decodeDpAttn' + | 'decodeNumWorkers' + | 'numPrefillGpu' + | 'numDecodeGpu' +>; + +/** + * Resolve prefill/decode parallelism from a raw row. 
v2 rows (2025-12-19+) + * carry full disagg fields keyed by the presence of `prefill_tp`; v1 rows have + * a single `tp`/`ep` that applies to both phases. + */ +function resolveParallelism(row: Record): ParallelismParams { + if ('prefill_tp' in row) { + // v2 schema: full disagg parallelism fields + const prefillTp = parseInt2(row.prefill_tp) ?? 1; + const prefillEp = parseInt2(row.prefill_ep) ?? 1; + const decodeTp = parseInt2(row.decode_tp) ?? 1; + const decodeEp = parseInt2(row.decode_ep) ?? 1; + return { + prefillTp, + prefillEp, + prefillDpAttn: parseBool(row.prefill_dp_attention), + prefillNumWorkers: parseInt2(row.prefill_num_workers) ?? 0, + decodeTp, + decodeEp, + decodeDpAttn: parseBool(row.decode_dp_attention), + decodeNumWorkers: parseInt2(row.decode_num_workers) ?? 0, + numPrefillGpu: parseInt2(row.num_prefill_gpu) ?? prefillTp * prefillEp, + numDecodeGpu: parseInt2(row.num_decode_gpu) ?? decodeTp * decodeEp, + }; + } + // v1 schema: single tp/ep, prefill = decode + const tp = parseInt2(row.tp) ?? 1; + const ep = parseInt2(row.ep) ?? 1; + const dpAttn = parseBool(row.dp_attention); + return { + prefillTp: tp, + prefillEp: ep, + prefillDpAttn: dpAttn, + prefillNumWorkers: 0, + decodeTp: tp, + decodeEp: ep, + decodeDpAttn: dpAttn, + decodeNumWorkers: 0, + numPrefillGpu: tp * ep, + numDecodeGpu: tp * ep, + }; +} + +/** + * Auto-capture all numeric fields not reserved for config/routing dimensions, + * stored under their raw key. Any key outside METRIC_KEYS triggers a one-time + * warning so new schema additions don't go silently unnoticed. 
+ */ +function captureNumericMetrics(row: Record): Record { + const metrics: Record = {}; + for (const [rawKey, val] of Object.entries(row)) { + if (NON_METRIC_KEYS.has(rawKey)) continue; + const n = parseNum(val); + if (n === undefined) continue; + metrics[rawKey] = n; + if (!METRIC_KEYS.has(rawKey) && !_warnedMetricKeys.has(rawKey)) { + _warnedMetricKeys.add(rawKey); + console.warn( + ` [WARN] auto-captured unexpected metric '${rawKey}' — add to METRIC_KEYS in constants/src/metric-keys.ts or NON_METRIC_KEYS in benchmark-mapper.ts`, + ); + } + } + return metrics; +} + /** * Narrow a raw `workers` value from the artifact JSON to `WorkerPower[]` or * undefined. Each entry must have a string `role`, a numeric `worker_idx`, diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts new file mode 100644 index 00000000..7b745c09 --- /dev/null +++ b/packages/db/src/etl/compute-aggregate-stats.test.ts @@ -0,0 +1,152 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { + STATS_VERSION, + computeAggregateStats, + mergeProfileStatsUpgrade, +} from './compute-aggregate-stats.js'; + +/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */ +function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) { + const lines = requests.map((r, i) => + JSON.stringify({ + metadata: { + benchmark_phase: 'profiling', + conversation_id: `conv-${i}`, + turn_index: 0, + }, + metrics: { + input_sequence_length: { value: r.isl, unit: 'tokens' }, + output_sequence_length: { value: r.osl, unit: 'tokens' }, + request_latency: { value: r.rl ?? 1000, unit: 'ms' }, + time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' }, + }, + }), + ); + return gzipSync(Buffer.from(lines.join('\n'))); +} + +/** Build a tiny server_metrics_json blob with KV util + prefix cache series. 
*/ +function makeServerBlob() { + const json = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1, avg: 0.2 }, + { start_ns: 1, end_ns: 2, avg: 0.5 }, + { start_ns: 2, end_ns: 3, avg: 0.8 }, + ], + }, + ], + }, + 'vllm:prefix_cache_hits': { + series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }], + }, + 'vllm:prefix_cache_queries': { + series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }], + }, + }, + }); + return gzipSync(Buffer.from(json)); +} + +describe('computeAggregateStats', () => { + it('returns the current STATS_VERSION in the bundle', async () => { + const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null }); + expect(stats.version).toBe(STATS_VERSION); + }); + + it('leaves every metric null when both blobs are null', async () => { + const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null }); + expect(stats.isl).toBeNull(); + expect(stats.osl).toBeNull(); + expect(stats.kvCacheUtil).toBeNull(); + expect(stats.prefixCacheHitRate).toBeNull(); + expect(stats.normalizedSessionTimeS).toBeNull(); + expect(stats.p90PrefillTpsPerUser).toBeNull(); + expect(stats.normalizedE2e400).toBeNull(); + }); + + it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => { + const profileBlob = makeProfileBlob([ + { isl: 100, osl: 50, rl: 1000, ttft: 100 }, + { isl: 200, osl: 75, rl: 2000, ttft: 200 }, + { isl: 300, osl: 100, rl: 3000, ttft: 300 }, + ]); + const stats = await computeAggregateStats({ profileBlob, serverBlob: null }); + + expect(stats.isl?.n).toBe(3); + expect(stats.isl?.mean).toBeCloseTo(200, 6); + expect(stats.osl?.n).toBe(3); + expect(stats.osl?.mean).toBeCloseTo(75, 6); + + // Server-side metrics still null when there's no server blob. + expect(stats.kvCacheUtil).toBeNull(); + expect(stats.prefixCacheHitRate).toBeNull(); + + // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000. 
+ expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6); + // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean. + // loads = [150, 275, 400], mean_load = 275 + // scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625] + // mean ≈ 1.9653 + expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3); + expect(stats.normalizedE2e400?.n).toBe(3); + expect(stats.normalizedE2e400?.p90).toBeGreaterThan(0); + }); + + it('computes KV util + prefix hit rate from the server blob alone', async () => { + const stats = await computeAggregateStats({ + profileBlob: null, + serverBlob: makeServerBlob(), + }); + expect(stats.kvCacheUtil?.n).toBe(3); + expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6); + expect(stats.prefixCacheHitRate?.n).toBe(1); + expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6); + + // Profile-derived metrics absent. + expect(stats.isl).toBeNull(); + expect(stats.osl).toBeNull(); + expect(stats.normalizedSessionTimeS).toBeNull(); + expect(stats.p90PrefillTpsPerUser).toBeNull(); + expect(stats.normalizedE2e400).toBeNull(); + }); + + it('tolerates a malformed profile blob by leaving its metrics null', async () => { + // A random non-gzip buffer triggers a gunzip error — code path swallows it. + const garbage = Buffer.from('not-gzip-data'); + const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null }); + expect(stats.isl).toBeNull(); + expect(stats.osl).toBeNull(); + expect(stats.normalizedSessionTimeS).toBeNull(); + expect(stats.p90PrefillTpsPerUser).toBeNull(); + expect(stats.normalizedE2e400).toBeNull(); + // Version still set so the row is considered "computed". 
+ expect(stats.version).toBe(STATS_VERSION); + }); +}); + +describe('mergeProfileStatsUpgrade', () => { + it('updates profile metrics while preserving existing server distributions', async () => { + const existing = await computeAggregateStats({ + profileBlob: null, + serverBlob: makeServerBlob(), + }); + const profile = await computeAggregateStats({ + profileBlob: makeProfileBlob([{ isl: 100, osl: 100, rl: 2080, ttft: 100 }]), + serverBlob: null, + }); + + const merged = mergeProfileStatsUpgrade(existing, profile); + expect(merged.version).toBe(STATS_VERSION); + expect(merged.isl?.mean).toBe(100); + expect(merged.normalizedE2e400?.p90).toBeGreaterThan(0); + expect(merged.kvCacheUtil).toEqual(existing.kvCacheUtil); + expect(merged.prefixCacheHitRate).toEqual(existing.prefixCacheHitRate); + }); +}); diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts new file mode 100644 index 00000000..cea9361c --- /dev/null +++ b/packages/db/src/etl/compute-aggregate-stats.ts @@ -0,0 +1,149 @@ +/** + * Pre-compute the per-row aggregate stats for an `agentic_trace_replay` + * blob pair. The output lands in the `aggregate_stats` JSONB column so the + * detail page can serve the "Aggregates across configs" view and the + * derived chart x-axis modes from a single SQL row read, instead of + * parsing the raw blobs on demand. + * + * Shape is intentionally versioned — bump `STATS_VERSION` whenever the + * computation changes so the backfill script knows which rows to recompute. 
+ */ + +import { gunzipSync } from 'node:zlib'; + +import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream'; +import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics'; +import { + STATS_VERSION, + extractIslOsl, + extractServerMetricSamples, + percentilesOf, + type MetricPercentiles, +} from '../queries/agentic-aggregates'; + +export { STATS_VERSION }; + +export interface AggregateStats { + version: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; + /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */ + normalizedSessionTimeS: number | null; + /** P90 of per-turn ISL/TTFT pooled across every session's turns. */ + p90PrefillTpsPerUser: number | null; + /** Per-request normalized E2E distribution at a fixed 400-token OSL. */ + normalizedE2e400: MetricPercentiles | null; +} + +/** + * Upgrade an existing stats bundle when only profile-derived fields changed. + * This avoids re-reading and decompressing the much larger server-metrics blob + * while preserving its already-computed KV/cache distributions. + */ +export function mergeProfileStatsUpgrade( + existing: Omit & { + normalizedE2e400?: MetricPercentiles | null; + }, + profile: AggregateStats, +): AggregateStats { + return { + ...profile, + isl: profile.isl ?? existing.isl, + osl: profile.osl ?? existing.osl, + normalizedSessionTimeS: profile.normalizedSessionTimeS ?? existing.normalizedSessionTimeS, + p90PrefillTpsPerUser: profile.p90PrefillTpsPerUser ?? existing.p90PrefillTpsPerUser, + kvCacheUtil: existing.kvCacheUtil, + prefixCacheHitRate: existing.prefixCacheHitRate, + }; +} + +/** Metric subtrees we extract via stream-parse on oversized server blobs. 
*/ +const TARGET_METRIC_KEYS = new Set([ + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'vllm:prefix_cache_hits', + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_hits', + 'vllm:gpu_prefix_cache_queries', +]); + +/** + * Stream-parse the gzipped server_metrics_json and collect just the metric + * subtrees we care about. Avoids Node's 512 MB max-string-length cap that + * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows. + */ +async function streamExtractServer( + buffer: Buffer, +): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> { + const collected = await streamCollectKeys(buffer, 'metrics', TARGET_METRIC_KEYS); + return extractServerMetricSamples(JSON.stringify({ metrics: collected })); +} + +/** + * Compute the full versioned stats bundle from a (profile, server-metrics) + * blob pair. Either blob may be null (e.g. only the server file existed) — + * the corresponding stats just come back null. + */ +export async function computeAggregateStats(args: { + profileBlob: Buffer | null; + serverBlob: Buffer | null; +}): Promise { + let islPct: MetricPercentiles | null = null; + let oslPct: MetricPercentiles | null = null; + let normalized: number | null = null; + let prefillP90: number | null = null; + let normalizedE2e400: MetricPercentiles | null = null; + + if (args.profileBlob) { + try { + const jsonl = gunzipSync(args.profileBlob).toString('utf8'); + const { isl, osl } = extractIslOsl(jsonl); + islPct = percentilesOf(isl); + oslPct = percentilesOf(osl); + const derived = computeDerivedFromBlob(jsonl); + normalized = derived.normalized_session_time_s; + prefillP90 = derived.p90_prefill_tps_per_user; + normalizedE2e400 = derived.normalized_e2e_400; + } catch { + // ignore malformed blob — leave nulls + } + } + + let kvPct: MetricPercentiles | null = null; + let prefixPct: MetricPercentiles | null = null; + if (args.serverBlob) { + let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null; 
+ try { + const json = gunzipSync(args.serverBlob).toString('utf8'); + server = extractServerMetricSamples(json); + } catch (error) { + // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to + // pull just the metric subtrees we need without materializing the + // full 500+ MB JSON string. + if (isStringTooLongError(error)) { + try { + server = await streamExtractServer(args.serverBlob); + } catch { + // stream fallback failed too — leave nulls + } + } + } + if (server) { + kvPct = percentilesOf(server.kvCacheUtil); + prefixPct = percentilesOf(server.prefixCacheHitRate); + } + } + + return { + version: STATS_VERSION, + isl: islPct, + osl: oslPct, + kvCacheUtil: kvPct, + prefixCacheHitRate: prefixPct, + normalizedSessionTimeS: normalized, + p90PrefillTpsPerUser: prefillP90, + normalizedE2e400, + }; +} diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts new file mode 100644 index 00000000..3f088cd6 --- /dev/null +++ b/packages/db/src/etl/compute-chart-series.test.ts @@ -0,0 +1,341 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js'; + +/** + * Build a minimal server_metrics_json blob covering the metrics the chart + * consumes. Each timeslice is one second long starting at t=0. + */ +function makeBlob(opts?: { + prefixHits?: number; + prefixQueries?: number; + promptTokensRate?: number; +}) { + const json = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1e9, avg: 0.1 }, + { start_ns: 1e9, end_ns: 2e9, avg: 0.4 }, + { start_ns: 2e9, end_ns: 3e9, avg: 0.7 }, + ], + }, + ], + }, + 'vllm:prefix_cache_hits': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 
75 }] }], + }, + 'vllm:prefix_cache_queries': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }], + }, + 'vllm:num_requests_running': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }], + }, + 'vllm:num_requests_waiting': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }], + }, + 'vllm:prompt_tokens': { + series: [ + { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] }, + ], + }, + 'vllm:generation_tokens': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }], + }, + 'vllm:prompt_tokens_by_source': { + series: [ + { + labels: { source: 'local_cache_hit' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }], + }, + { + labels: { source: 'miss' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }], + }, + ], + }, + }, + }); + return gzipSync(Buffer.from(json)); +} + +/** Build a synthetic per-engine vLLM metric series for the multi-engine test. */ +function buildEngineSeries(engineId: number, baseRunning: number) { + const labels = { engine: String(engineId) }; + return { + runningSlice: { + labels, + timeslices: [ + { start_ns: 0, avg: baseRunning }, + { start_ns: 1e9, avg: baseRunning + 1 }, + ], + }, + waitingSlice: { + labels, + timeslices: [ + { start_ns: 0, avg: 0 }, + { start_ns: 1e9, avg: 0 }, + ], + }, + kvSlice: { + labels, + timeslices: [ + { start_ns: 0, avg: 0.25 }, + { start_ns: 1e9, avg: 0.5 }, + ], + }, + promptSlice: { + labels, + timeslices: [ + { start_ns: 0, rate: 100 }, + { start_ns: 1e9, rate: 200 }, + ], + }, + genSlice: { + labels, + timeslices: [ + { start_ns: 0, rate: 50 }, + { start_ns: 1e9, rate: 75 }, + ], + }, + }; +} + +function buildDynamoSeries( + endpoint_url: string, + dynamo_component: 'prefill' | 'backend', + worker_id: string, + value: number, + field: 'rate' | 'avg' = 'rate', +) { + return { + endpoint_url, + labels: { dynamo_component, worker_id, dp_rank: '0', engine: '0' }, + timeslices: 
[{ start_ns: 0, end_ns: 1e9, [field]: value }], + }; +} + +describe('computeChartSeries', () => { + it('returns null when the blob is null', async () => { + expect(await computeChartSeries(null)).toBeNull(); + }); + + it('returns the current CHART_SERIES_VERSION in the bundle', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.version).toBe(CHART_SERIES_VERSION); + }); + + it('extracts kvCacheUsage points with t=seconds-from-start', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.kvCacheUsage).toEqual([ + { t: 0, value: 0.1 }, + { t: 1, value: 0.4 }, + { t: 2, value: 0.7 }, + ]); + }); + + it('merges warmup_metrics before profiling into one continuous series (v11)', async () => { + // warmup scrapes at t=0,1s; profiling scrapes at t=10,11s (own start_ns). + const blob = gzipSync( + Buffer.from( + JSON.stringify({ + warmup_metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1e9, avg: 0.2 }, + { start_ns: 1e9, end_ns: 2e9, avg: 0.3 }, + ], + }, + ], + }, + }, + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 10e9, end_ns: 11e9, avg: 0.8 }, + { start_ns: 11e9, end_ns: 12e9, avg: 0.9 }, + ], + }, + ], + }, + }, + }), + ), + ); + const series = await computeChartSeries(blob); + // Origin is the earliest (warmup) start_ns, so warmup sits at low t and + // profiling follows on the same axis — the frontend slices at the boundary. 
+ expect(series?.kvCacheUsage).toEqual([ + { t: 0, value: 0.2 }, + { t: 1, value: 0.3 }, + { t: 10, value: 0.8 }, + { t: 11, value: 0.9 }, + ]); + }); + + it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => { + const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 })); + expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]); + }); + + it('drops prefixCacheHitRate windows where queries.rate is 0', async () => { + const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 })); + expect(series?.prefixCacheHitRate).toEqual([]); + }); + + it('pairs running + waiting into queueDepth points', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]); + }); + + it('extracts prefillTps + decodeTps from counter rates', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]); + expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]); + }); + + it('splits promptTokensBySource by label and skips empty series', async () => { + const series = await computeChartSeries(makeBlob()); + expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([ + 'local_cache_hit', + 'miss', + ]); + expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]); + expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]); + }); + + it('computes timing metadata from the widest metric window', async () => { + const series = await computeChartSeries(makeBlob()); + // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9. 
+ expect(series?.startNs).toBe(0); + expect(series?.endNs).toBe(3e9); + expect(series?.durationS).toBeCloseTo(3, 6); + expect(series?.timeslicesCount).toBe(3); + }); + + it('returns null on a malformed (non-gzip) blob', async () => { + const result = await computeChartSeries(Buffer.from('not-gzip-data')); + expect(result).toBeNull(); + }); + + it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => { + // Simulate a 4-engine deployment: each engine reports its own series for + // every metric. Cluster-wide value should be SUM for running/waiting and + // counter rates, AVG for kv_cache_usage_perc (per-engine fraction). + const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine + const json = JSON.stringify({ + metrics: { + 'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) }, + 'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) }, + 'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) }, + 'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) }, + 'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) }, + }, + }); + const blob = gzipSync(Buffer.from(json)); + const cs = await computeChartSeries(blob); + expect(cs).not.toBeNull(); + // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1 + expect(cs!.queueDepth).toEqual([ + { t: 0, running: 12, waiting: 0, total: 12 }, + { t: 1, running: 16, waiting: 0, total: 16 }, + ]); + // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value) + expect(cs!.kvCacheUsage).toEqual([ + { t: 0, value: 0.25 }, + { t: 1, value: 0.5 }, + ]); + // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800 + expect(cs!.prefillTps).toEqual([ + { t: 0, value: 400 }, + { t: 1, value: 800 }, + ]); + expect(cs!.decodeTps).toEqual([ + { t: 0, value: 200 }, + { t: 1, value: 300 }, + ]); + }); + + it('uses the Dynamo adapter to preserve workers and 
canonical prefill/decode roles', async () => { + const json = JSON.stringify({ + metrics: { + 'vllm:prompt_tokens': { + series: [ + buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 100), + buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 200), + buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 300), + ], + }, + 'vllm:generation_tokens': { + series: [ + buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 1), + buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 2), + buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 400), + ], + }, + 'vllm:num_requests_running': { + series: [ + buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 3, 'avg'), + buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 4, 'avg'), + buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 5, 'avg'), + ], + }, + }, + }); + + const blob = gzipSync(Buffer.from(json)); + const result = await computeChartSeries(blob, { + framework: 'dynamo-vllm', + disagg: true, + }); + + expect(result?.metricSources).toHaveLength(3); + expect(result?.metricSources.map(({ source: s }) => [s.role, s.workerId, s.engine])).toEqual([ + ['prefill', 'prefill-b', '0'], + ['prefill', 'prefill-a', '0'], + ['decode', 'decode-a', '0'], + ]); + const prefillA = result?.metricSources.find(({ source: s }) => s.workerId === 'prefill-a'); + const decode = result?.metricSources.find(({ source: s }) => s.role === 'decode'); + expect(prefillA?.promptTps).toEqual([{ t: 0, value: 100 }]); + expect(prefillA?.queueDepth).toEqual([{ t: 0, running: 3, waiting: 0, total: 3 }]); + expect(decode?.generationTps).toEqual([{ t: 0, value: 400 }]); + + const nonDisagg = await computeChartSeries(blob, { + framework: 'dynamo-vllm', + disagg: false, + }); + expect(nonDisagg?.metricSources).toEqual([]); + }); + + it('does not interpret Dynamo-native labels without selecting the Dynamo adapter', async () => { + const json = JSON.stringify({ + metrics: { + 
'vllm:prompt_tokens': { + series: [ + { + endpoint_url: '10.30.1.56:7500', + labels: { dynamo_component: 'prefill', worker_id: 'prefill-a', engine: '0' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 100 }], + }, + ], + }, + }, + }); + + const result = await computeChartSeries(gzipSync(Buffer.from(json)), { + framework: 'vllm', + disagg: true, + }); + + expect(result?.metricSources).toEqual([]); + }); +}); diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts new file mode 100644 index 00000000..d140306f --- /dev/null +++ b/packages/db/src/etl/compute-chart-series.ts @@ -0,0 +1,576 @@ +/** + * Pre-compute the time-series for the agentic detail page chart, so the + * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every + * request. The output lands in `agentic_trace_replay.chart_series` and is + * read directly by `getTraceServerMetrics`. + * + * Versioned so the backfill script knows which rows are stale — bump + * `CHART_SERIES_VERSION` whenever the extraction algorithm changes. + */ + +import { gunzipSync } from 'node:zlib'; + +import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream'; +import { + selectServerMetricsAdapter, + type MetricSource, + type ServerMetricsContext, +} from './server-metrics-adapters'; + +/** + * Bump when the extraction algorithm changes — backfill recomputes anything + * older. + * + * v2: aggregate vllm gauges/counters across all engine series (was reading + * only series[0], which under-counted by Nx on multi-engine DP/PP + * deployments — most visible as a request-queue-depth chart that maxed out + * at ~3 when the timeline clearly showed 20+ in-flight). + * + * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative + * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). 
+ * + * v4: extract sglang:* metrics too (fallback chain in each picker), so + * SGLang runs populate the chart_series the same way vllm runs do. + * + * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) + * into promptTokensBySource so the cumulative prompt-token-source-breakdown + * chart shows useful splits for SGLang runs (filtered to prefill_* modes). + * + * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source + * breakdown from sglang:cached_tokens — current runs always have one + * cache_source ("device" / HBM) but hicache (CPU offload) runs would + * split into "device" + "host" automatically once ingested. + * + * v7: extract sglang:hicache_host_{used,total}_tokens into a new + * hostKvCacheUsage series so the KV cache utilization chart can plot + * the CPU offload pool's usage alongside the on-GPU HBM line. + * + * v8: keep the per-engine dimension on kv_cache_usage_perc as + * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average + * line hides load skew on DEP configs; the detail page overlays the + * per-rank lines so a hot rank is visible at a glance. + * + * v9: retain orchestrator-normalized per-source series. Dynamo labels are + * mapped to canonical router/prefill/decode roles, allowing the frontend to + * inspect individual workers without interpreting Dynamo-native labels. + * + * v10: only emit per-source series for disaggregated configs with a recognized + * orchestrator adapter. Non-disaggregated and unsupported configs retain the + * existing aggregate-only behavior. + * + * v12: also consume the `warmup_metrics` block from the server-metrics blob and + * merge its scrapes into the same series as the profiling `metrics` block. + * Warmup and profiling timeslices carry their own absolute `start_ns` and never + * overlap in time, so the merged series is continuous (warmup at lower t, + * profiling after). 
This lets the agentic detail page slice `chart_series` into + * warmup vs profiling at the request-derived boundary; older blobs without a + * warmup block are unaffected. (v11 was a short-lived, since-reverted attempt to + * carry kvCachePoolTokens in chart_series; that value now lives in + * benchmark_results.metrics, derived from the server log — unrelated to this.) + */ +export const CHART_SERIES_VERSION = 12; + +export interface TimeSeriesPoint { + /** Seconds from benchmark start. */ + t: number; + value: number; +} + +export interface QueueDepthPoint { + t: number; + running: number; + waiting: number; + total: number; +} + +export interface ChartSeries { + version: number; + /** ns wall-clock of the first window's start; for debugging only. */ + startNs: number; + /** ns wall-clock of the last window's end. */ + endNs: number; + /** Total benchmark window in seconds. */ + durationS: number; + /** Number of 1Hz windows captured. */ + timeslicesCount: number; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + prefillTps: TimeSeriesPoint[]; + decodeTps: TimeSeriesPoint[]; + /** + * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across + * engines. Detail page derives "cumulative unique input tokens" as + * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually + * saved vs the raw queries that came in. + */ + prefixCacheHitsTps: TimeSeriesPoint[]; + /** + * Host (CPU offload) KV cache utilization, 0..1. Only populated for + * SGLang hicache runs (derived as hicache_host_used / hicache_host_total). + * Frontend overlays this on the KV cache util chart as a second line. + */ + hostKvCacheUsage: TimeSeriesPoint[]; + /** + * Per-DP-rank KV cache utilization (0..1 each). One entry per engine + * series found in the raw metric, ordered by the `engine` label when + * present and by series-array index otherwise. 
Empty for single-engine + * deployments — the average `kvCacheUsage` line covers that case alone. + * The detail page overlays these on the same chart so DEP load skew is + * visible without changing the headline number. + */ + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** + * The same metrics grouped by normalized server source. Existing aggregate + * fields above remain the default and preserve compatibility with old rows. + */ + metricSources: MetricSourceSeries[]; +} + +export interface MetricSourceSeries { + source: MetricSource; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + /** Raw prompt-token counter rate for this source. */ + promptTps: TimeSeriesPoint[]; + /** Raw generation-token counter rate for this source. */ + generationTps: TimeSeriesPoint[]; + prefixCacheHitsTps: TimeSeriesPoint[]; + hostKvCacheUsage: TimeSeriesPoint[]; + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; +} + +// ── Raw blob shapes (subset we read) ──────────────────────────────────── + +interface RawSlice { + start_ns?: number; + end_ns?: number; + avg?: number; + rate?: number; +} + +interface RawSeries { + endpoint_url?: string; + labels?: Record; + timeslices?: RawSlice[]; +} + +interface RawMetric { + series?: RawSeries[]; +} + +type MetricsMap = Record; + +/** + * The set of metric subtrees the chart consumes. Includes both vllm:* and + * sglang:* names so the stream-parse fallback collects whichever framework + * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric. 
+ */ +const CHART_METRIC_KEYS = new Set([ + // vLLM + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'vllm:prefix_cache_hits', + 'vllm:prefix_cache_queries', + 'vllm:num_requests_running', + 'vllm:num_requests_waiting', + 'vllm:prompt_tokens', + 'vllm:generation_tokens', + 'vllm:prompt_tokens_by_source', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', + 'sglang:generation_tokens', + 'sglang:num_running_reqs', + 'sglang:num_queue_reqs', + 'sglang:realtime_tokens', + 'sglang:hicache_host_used_tokens', + 'sglang:hicache_host_total_tokens', +]); + +/** + * Merge a warmup phase metric map into the profiling one by concatenating each + * metric's `series`. The two phases' timeslices carry their own absolute + * `start_ns` and never overlap in time, so `buildSeriesFromMetrics` (which keys + * by `start_ns`) yields one continuous series — warmup scrapes at lower t, + * profiling after. No-ops when either side is empty (older blobs have no warmup). + */ +function mergePhaseMetrics(profiling: MetricsMap, warmup: MetricsMap): MetricsMap { + if (Object.keys(warmup).length === 0) return profiling; + if (Object.keys(profiling).length === 0) return warmup; + const out: MetricsMap = {}; + for (const name of new Set([...Object.keys(profiling), ...Object.keys(warmup)])) { + out[name] = { + series: [...(profiling[name]?.series ?? []), ...(warmup[name]?.series ?? [])], + }; + } + return out; +} + +/** + * Stream-parse fallback: collect the chart's metric subtrees from both phase + * blocks and merge (see v12). Avoids Node's 512 MB max-string-length cap that + * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows. 
+ */ +async function streamCollectMetrics(buffer: Buffer): Promise { + const [profiling, warmup] = await Promise.all([ + streamCollectKeys(buffer, 'metrics', CHART_METRIC_KEYS), + streamCollectKeys(buffer, 'warmup_metrics', CHART_METRIC_KEYS), + ]); + return mergePhaseMetrics(profiling, warmup); +} + +/** + * Parse the gzipped server_metrics blob into the metric map. Tries the + * synchronous fast path first; falls back to stream-parse on + * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. Merges the warmup block + * into the profiling one (v12) so the series span both phases. + */ +async function parseMetrics(buffer: Buffer): Promise { + try { + const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { + metrics?: MetricsMap; + warmup_metrics?: MetricsMap; + }; + return mergePhaseMetrics(obj.metrics ?? {}, obj.warmup_metrics ?? {}); + } catch (error) { + if (isStringTooLongError(error)) return await streamCollectMetrics(buffer); + throw error; + } +} + +/** + * Build chart-ready time-series arrays from a gzipped server_metrics blob. + * The math mirrors `getTraceServerMetrics` — this helper exists so ingest, + * backfill, and the API path produce byte-identical results. + */ +export async function computeChartSeries( + blob: Buffer | null, + context: ServerMetricsContext = {}, +): Promise { + if (!blob) return null; + let metrics: MetricsMap; + try { + metrics = await parseMetrics(blob); + } catch { + // Malformed blob → no series (caller treats null as "no data"). + return null; + } + return buildSeriesFromMetrics(metrics, context); +} + +/** + * Aggregate one timeslice field across all series of a metric, indexed by + * `start_ns`. Multi-engine vllm deployments report one series per engine — + * the cluster value is the sum (for running/waiting/throughput counters) + * or the average (for kv_cache_usage_perc, a per-engine fraction). 
+ */ +function aggregateByStart( + series: readonly RawSeries[] | undefined, + field: 'avg' | 'rate', + combine: 'sum' | 'avg', +): Map { + const sums = new Map(); + const counts = new Map(); + for (const s of series ?? []) { + for (const ts of s.timeslices ?? []) { + if (typeof ts.start_ns !== 'number') continue; + const v = ts[field]; + if (typeof v !== 'number' || !Number.isFinite(v)) continue; + sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v); + counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1); + } + } + if (combine === 'sum') return sums; + const out = new Map(); + for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1)); + return out; +} + +/** Stable order: emit one point per unique start_ns, chronologically. */ +function sortedEntries(m: Map): [number, number][] { + return [...m.entries()].toSorted((a, b) => a[0] - b[0]); +} + +function buildSeriesFromMetrics( + metrics: MetricsMap, + context: ServerMetricsContext, + includeMetricSources = true, + originStartNs?: number, +): ChartSeries { + // Timing reference: smallest start_ns and largest end_ns across every + // timeslice we extracted. timeslicesCount is the length of any single + // series (engines are scraped on the same cadence), so picking the max + // length across all series of all metrics is safe. + let startNs = Number.POSITIVE_INFINITY; + let endNs = 0; + let timeslicesCount = 0; + for (const metricMeta of Object.values(metrics)) { + for (const s of metricMeta?.series ?? []) { + const ts = s.timeslices ?? []; + if (ts.length === 0) continue; + timeslicesCount = Math.max(timeslicesCount, ts.length); + const first = ts[0]!; + const last = ts.at(-1)!; + if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns; + if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns; + } + } + if (!Number.isFinite(startNs)) startNs = 0; + const tOf = (ns: number) => (ns - (originStartNs ?? 
startNs)) / 1e9; + + // Pick the first metric name whose series array has any data; fallback + // chain lets the same code path serve both vllm:* and sglang:* blobs. + const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; + }; + + // KV cache usage (gauge, 0..1) — average across engines so the value + // stays a fraction (each engine has its own KV pool). + const kvSeries = pickSeries( + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); + const kvCacheUsage: TimeSeriesPoint[] = sortedEntries( + aggregateByStart(kvSeries, 'avg', 'avg'), + ).map(([t, v]) => ({ t: tOf(t), value: v })); + // Per-engine breakdown of the same metric. We only emit it when there's + // more than one series — single-engine deployments would just duplicate + // the cluster-average line. + const kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[] = []; + if (kvSeries && kvSeries.length > 1) { + // Sort by numeric engine label when present so rank 0..N renders in + // order; fall back to series-array index otherwise. + const decorated = kvSeries.map((s, idx) => { + const raw = + s.labels?.['engine'] ?? s.labels?.['engine_idx'] ?? s.labels?.['dp_rank'] ?? String(idx); + const numeric = Number(raw); + return { series: s, idx, label: raw, sortKey: Number.isFinite(numeric) ? numeric : idx }; + }); + decorated.sort((a, b) => a.sortKey - b.sortKey); + for (const { series, label } of decorated) { + const pts: TimeSeriesPoint[] = []; + for (const ts of series.timeslices ?? 
[]) { + if (typeof ts.start_ns !== 'number' || typeof ts.avg !== 'number') continue; + if (!Number.isFinite(ts.avg)) continue; + pts.push({ t: tOf(ts.start_ns), value: ts.avg }); + } + if (pts.length > 0) kvCacheUsageByEngine.push({ engineLabel: label, points: pts }); + } + } + + // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across + // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens. + const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + const qsSeries = pickSeries( + 'vllm:prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); + const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum'); + const qsByT = aggregateByStart(qsSeries, 'rate', 'sum'); + const prefixCacheHitRate: TimeSeriesPoint[] = []; + for (const [t, h] of sortedEntries(hitsByT)) { + const q = qsByT.get(t); + if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q }); + } + + // Queue depth: sum running + waiting across engines per timeslice. + const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs'); + const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs'); + const runByT = aggregateByStart(runSeries, 'avg', 'sum'); + const waitByT = aggregateByStart(waitSeries, 'avg', 'sum'); + const queueDepth: QueueDepthPoint[] = []; + // Union of timestamps so we surface activity even if one of the gauges + // didn't report a sample on a given tick. + const allTimes = new Set([...runByT.keys(), ...waitByT.keys()]); + for (const t of [...allTimes].toSorted((a, b) => a - b)) { + const running = runByT.get(t) ?? 0; + const waiting = waitByT.get(t) ?? 0; + queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting }); + } + + // Throughput: sum the counter `rate` (already per-second) across engines. + // Takes a fallback chain so vllm:* and sglang:* both work. 
+ const counterRate = (...names: string[]): TimeSeriesPoint[] => { + const s = pickSeries(...names); + return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({ + t: tOf(t), + value: v, + })); + }; + const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens'); + const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens'); + // Tokens served from prefix cache per scrape. Lets the frontend derive + // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + + // SGLang hicache: host-pool KV cache utilization as used/total per + // timeslice. Both metrics are gauges in absolute tokens. Total stays + // constant (it's the pool size), used fluctuates. + const hostUsedByT = aggregateByStart( + metrics['sglang:hicache_host_used_tokens']?.series, + 'avg', + 'sum', + ); + const hostTotalByT = aggregateByStart( + metrics['sglang:hicache_host_total_tokens']?.series, + 'avg', + 'sum', + ); + const hostKvCacheUsage: TimeSeriesPoint[] = []; + for (const [t, used] of sortedEntries(hostUsedByT)) { + const total = hostTotalByT.get(t); + if (total !== undefined && total > 0) { + hostKvCacheUsage.push({ t: tOf(t), value: used / total }); + } + } + + // Per-source prompt tokens — sum across engines per source label. + // vllm: vllm:prompt_tokens_by_source has one series per source label + // (local_cache_hit, external_cache_hit, miss, ...). Use the + // `source`/`reason`/`kind` label as the breakdown key. + // sglang: sglang:realtime_tokens uses a `mode` label with values + // {prefill_cache, prefill_compute, decode}. Filter to prefill_* + // since decode isn't prompt-token volume. + const promptBySrcByT = new Map>(); + // Sum a series' per-scrape rates into the bucket for `label`. 
The bucket is + // created even when the series has no valid timeslices — the SGLang fallback + // below is gated on `promptBySrcByT.size === 0`, so an empty vllm breakdown + // must still suppress it. + const addSeriesRates = (label: string, series: RawSeries): void => { + let byT = promptBySrcByT.get(label); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(label, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + }; + for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) { + const labels = series.labels ?? {}; + const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels); + addSeriesRates(source, series); + } + // SGLang fallback: only consider when the vllm metric wasn't found. + // - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]` + // - Cache hits, split by tier: per-series `sglang:cached_tokens` where each + // series carries a `cache_source` label ("device" = HBM, "host" = CPU + // offload via hicache). Current runs have only `device`; when hicache + // runs land, additional series will appear and the chart will split. + if (promptBySrcByT.size === 0) { + for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const mode = labels['mode'] ?? 'unknown'; + // Only carry the cache-miss line over — cache hits come from + // sglang:cached_tokens broken out by cache_source below, so we'd + // double-count if we kept `prefill_cache` here too. + if (mode !== 'prefill_compute') continue; + addSeriesRates('compute (miss)', series); + } + // Cache hits broken out per cache_source. Strip the noisy "total" label + // (older sglang versions emit a single un-broken-out series labelled + // total — show that as just "cache hit"). 
+ for (const series of metrics['sglang:cached_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const src = labels['cache_source'] ?? 'cache hit'; + const label = + src === 'device' + ? 'cache hit (HBM)' + : src === 'host' + ? 'cache hit (CPU offload)' + : src === 'total' + ? 'cache hit' + : `cache hit (${src})`; + addSeriesRates(label, series); + } + } + const promptTokensBySource: Record = {}; + for (const [source, byT] of promptBySrcByT) { + const arr: TimeSeriesPoint[] = []; + for (const [t, v] of sortedEntries(byT)) { + if (v > 0) arr.push({ t: tOf(t), value: v }); + } + if (arr.length > 0) promptTokensBySource[source] = arr; + } + + const metricSources: MetricSourceSeries[] = []; + const adapter = selectServerMetricsAdapter(context); + if (includeMetricSources && context.disagg && adapter.id !== 'generic') { + const grouped = new Map(); + for (const [metricName, metric] of Object.entries(metrics)) { + for (const series of metric.series ?? []) { + const source = adapter.identifySource(series); + let group = grouped.get(source.id); + if (!group) { + group = { source, metrics: {} }; + grouped.set(source.id, group); + } + const groupedMetric = (group.metrics[metricName] ??= { series: [] }); + groupedMetric.series!.push(series); + } + } + for (const { source, metrics: sourceMetrics } of grouped.values()) { + const sourceSeries = buildSeriesFromMetrics( + sourceMetrics, + context, + false, + originStartNs ?? 
startNs, + ); + metricSources.push({ + source, + kvCacheUsage: sourceSeries.kvCacheUsage, + prefixCacheHitRate: sourceSeries.prefixCacheHitRate, + queueDepth: sourceSeries.queueDepth, + promptTokensBySource: sourceSeries.promptTokensBySource, + promptTps: sourceSeries.prefillTps, + generationTps: sourceSeries.decodeTps, + prefixCacheHitsTps: sourceSeries.prefixCacheHitsTps, + hostKvCacheUsage: sourceSeries.hostKvCacheUsage, + kvCacheUsageByEngine: sourceSeries.kvCacheUsageByEngine, + }); + } + const roleOrder: Record = { + router: 0, + prefill: 1, + decode: 2, + combined: 3, + unknown: 4, + }; + metricSources.sort( + (a, b) => + roleOrder[a.source.role] - roleOrder[b.source.role] || + (a.source.endpointUrl ?? '').localeCompare(b.source.endpointUrl ?? '') || + a.source.id.localeCompare(b.source.id), + ); + } + return { + version: CHART_SERIES_VERSION, + startNs, + endNs, + durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0, + timeslicesCount, + kvCacheUsage, + prefixCacheHitRate, + queueDepth, + promptTokensBySource, + prefillTps, + decodeTps, + prefixCacheHitsTps, + hostKvCacheUsage, + kvCacheUsageByEngine, + metricSources, + }; +} diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts new file mode 100644 index 00000000..1ad9e63b --- /dev/null +++ b/packages/db/src/etl/compute-request-timeline.test.ts @@ -0,0 +1,210 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js'; + +interface SyntheticRequest { + cid: string; + ti: number; + srcTrace?: string; + srcOuter?: number; + srcInner?: number; + srcKind?: string; + wid?: string; + ad?: number; + phase?: string; + credit: number; + start: number; + end: number; + ack?: number | null; + ttftMs?: number | null; + tpotMs?: number | null; + tpotKey?: 'inter_token_latency' | 'time_per_output_token'; + isl?: number | 
null; + osl?: number | null; + cancelled?: boolean; +} + +function makeBlob(requests: SyntheticRequest[]) { + const lines = requests.map((r) => + JSON.stringify({ + metadata: { + conversation_id: r.cid, + turn_index: r.ti, + ...(r.srcTrace === undefined ? {} : { source_trace_id: r.srcTrace }), + ...(r.srcOuter === undefined ? {} : { source_outer_idx: r.srcOuter }), + ...(r.srcInner === undefined ? {} : { source_inner_idx: r.srcInner }), + ...(r.srcKind === undefined ? {} : { source_kind: r.srcKind }), + worker_id: r.wid ?? 'worker_default', + agent_depth: r.ad ?? 0, + benchmark_phase: r.phase ?? 'profiling', + credit_issued_ns: r.credit, + request_start_ns: r.start, + ...(r.ack === undefined ? {} : { request_ack_ns: r.ack }), + request_end_ns: r.end, + was_cancelled: r.cancelled ?? false, + }, + metrics: { + time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' }, + [r.tpotKey ?? 'inter_token_latency']: + r.tpotMs === null ? null : { value: r.tpotMs ?? 10, unit: 'ms' }, + input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' }, + output_sequence_length: { value: r.osl ?? 
10, unit: 'tokens' }, + }, + }), + ); + return gzipSync(Buffer.from(lines.join('\n'))); +} + +describe('computeRequestTimeline', () => { + it('returns null when the blob is null', () => { + expect(computeRequestTimeline(null)).toBeNull(); + }); + + it('returns null on a malformed (non-gzip) blob', () => { + expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull(); + }); + + it('returns null when the blob has no parseable records', () => { + expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull(); + }); + + it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => { + const tl = computeRequestTimeline( + makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]), + ); + expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION); + }); + + it('shifts ns timestamps to be relative to the earliest credit_issued', () => { + // Two requests with absolute ns starting at 1_000_000_000. + const tl = computeRequestTimeline( + makeBlob([ + { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 }, + { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 }, + ]), + ); + expect(tl?.startNs).toBe(1_000_000_000); + expect(tl?.endNs).toBe(1_030_000_000); + expect(tl?.durationS).toBeCloseTo(0.03, 6); + expect(tl?.requests[0]?.credit).toBe(0); + expect(tl?.requests[0]?.end).toBe(10_000_000); + expect(tl?.requests[1]?.start).toBe(21_000_000); + }); + + it('sorts requests by start time, regardless of input order', () => { + const tl = computeRequestTimeline( + makeBlob([ + { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 }, + { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 }, + { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 }, + ]), + ); + expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]); + }); + + it('preserves conversation/worker grouping fields', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'conv-A', + ti: 5, + wid: 'worker_abcd1234', + ad: 2, 
+ phase: 'profiling', + credit: 0, + start: 10, + end: 100, + }, + ]), + ); + const r = tl?.requests[0]!; + expect(r.cid).toBe('conv-A'); + expect(r.ti).toBe(5); + expect(r.wid).toBe('worker_abcd1234'); + expect(r.ad).toBe(2); + expect(r.phase).toBe('profiling'); + }); + + it('preserves raw source provenance fields when present', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'trace::fa:003', + ti: 3, + srcTrace: 'trace', + srcOuter: 204, + srcInner: 16, + srcKind: 'weka_flat', + credit: 0, + start: 10, + end: 100, + }, + ]), + ); + expect(tl?.requests[0]).toMatchObject({ + cid: 'trace::fa:003', + ti: 3, + srcTrace: 'trace', + srcOuter: 204, + srcInner: 16, + srcKind: 'weka_flat', + }); + }); + + it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'a', + ti: 0, + credit: 0, + start: 10, + end: 100, + ttftMs: 25.5, + tpotMs: 12.5, + isl: 1024, + osl: 256, + cancelled: true, + }, + ]), + ); + const r = tl?.requests[0]!; + expect(r.cancelled).toBe(true); + expect(r.ttftMs).toBeCloseTo(25.5, 6); + expect(r.tpotMs).toBeCloseTo(12.5, 6); + expect(r.isl).toBe(1024); + expect(r.osl).toBe(256); + }); + + it('accepts time_per_output_token as a TPOT alias', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'a', + ti: 0, + credit: 0, + start: 10, + end: 100, + tpotMs: 8.25, + tpotKey: 'time_per_output_token', + }, + ]), + ); + expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6); + }); + + it('skips records missing both credit_issued_ns and request_start_ns', () => { + // Build a record with only request_end_ns — the helper rejects it. 
+ const broken = gzipSync( + Buffer.from( + JSON.stringify({ + metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 }, + metrics: {}, + }), + ), + ); + expect(computeRequestTimeline(broken)).toBeNull(); + }); +}); diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts new file mode 100644 index 00000000..2cbe5174 --- /dev/null +++ b/packages/db/src/etl/compute-request-timeline.ts @@ -0,0 +1,208 @@ +/** + * Pre-compute the per-request timeline for the agentic detail page's + * Gantt view. Output lands in `agentic_trace_replay.request_timeline` + * and is read directly by the timeline API route. + * + * Shape is a thin array — ~150 bytes per request × ~200 requests per + * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw + * gzipped JSONL blob (~1-3 MB). + * + * Versioned so the backfill script knows which rows are stale — bump + * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes. + */ + +import { gunzipSync } from 'node:zlib'; + +/** Bump when the extraction algorithm changes — backfill recomputes anything older. */ +export const REQUEST_TIMELINE_VERSION = 5; + +export interface RequestRecord { + /** Conversation id (groups turns of one agent session). */ + cid: string; + /** Zero-based turn index within the conversation. */ + ti: number; + /** Source trace id from the original raw dataset, when distinct from replay cid. */ + srcTrace?: string; + /** Original raw top-level request index within srcTrace. */ + srcOuter?: number; + /** Original nested request index within srcOuter, for subagent children. */ + srcInner?: number; + /** Loader-specific source kind, e.g. weka_main or weka_flat. */ + srcKind?: string; + /** Worker id (concurrency slot that handled this request). */ + wid: string; + /** Sub-agent depth (0 = top-level). */ + ad: number; + /** `warmup` or `profiling`. */ + phase: string; + /** ns offset from timeline.startNs. 
Load gen decided to dispatch. */ + credit: number; + /** ns offset from timeline.startNs. HTTP send started. */ + start: number; + /** ns offset from timeline.startNs. First server acknowledgement (or null). */ + ack: number | null; + /** ns offset from timeline.startNs. Last byte received. */ + end: number; + /** Time-to-first-token in ms. */ + ttftMs: number | null; + /** Time per output token in ms. */ + tpotMs: number | null; + /** Input sequence length (tokens). */ + isl: number | null; + /** Output sequence length (tokens). */ + osl: number | null; + cancelled: boolean; +} + +export interface RequestTimeline { + version: number; + /** Wall-clock ns of the earliest event (used as the relative-time origin). */ + startNs: number; + /** Wall-clock ns of the latest `request_end_ns`. */ + endNs: number; + /** Total span in seconds. */ + durationS: number; + requests: RequestRecord[]; +} + +interface RawMetadata { + conversation_id?: string; + turn_index?: number; + source_trace_id?: string; + source_outer_idx?: number; + source_inner_idx?: number; + source_kind?: string; + worker_id?: string; + agent_depth?: number; + benchmark_phase?: string; + credit_issued_ns?: number; + request_start_ns?: number; + request_ack_ns?: number; + request_end_ns?: number; + was_cancelled?: boolean; +} + +interface RawMetricValue { + value?: number; +} + +interface RawRecord { + metadata?: RawMetadata; + metrics?: { + time_to_first_token?: RawMetricValue | number; + time_per_output_token?: RawMetricValue | number; + inter_token_latency?: RawMetricValue | number; + input_sequence_length?: RawMetricValue | number; + output_sequence_length?: RawMetricValue | number; + }; +} + +/** Pull a numeric metric out of the `{value, unit}` envelope (or a bare number). */ +function readNum(v: unknown): number | undefined { + if (typeof v === 'number') return Number.isFinite(v) ? 
v : undefined; + if (v && typeof v === 'object' && 'value' in v) { + const inner = (v as { value?: unknown }).value; + if (typeof inner === 'number' && Number.isFinite(inner)) return inner; + } + return undefined; +} + +/** + * Parse the gzipped `profile_export.jsonl` blob into a chart-ready + * timeline. Returns null on a missing or malformed blob. + */ +export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null { + if (!blob) return null; + let text: string; + try { + text = gunzipSync(blob).toString('utf8'); + } catch { + return null; + } + + // First pass: parse + collect raw turns; find timeline origin. + const raw: { + meta: RawMetadata; + ttftMs: number | null; + tpotMs: number | null; + isl: number | null; + osl: number | null; + }[] = []; + let originNs = Number.POSITIVE_INFINITY; + let endNs = 0; + + for (const line of text.split('\n')) { + if (!line) continue; + let rec: RawRecord; + try { + rec = JSON.parse(line) as RawRecord; + } catch { + continue; + } + const meta = rec.metadata ?? {}; + // Use credit_issued_ns when available (the true start of the request's + // lifecycle), falling back to request_start_ns. Skip rows missing both. + const cStart = meta.credit_issued_ns ?? meta.request_start_ns; + const cEnd = meta.request_end_ns; + if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue; + + if (cStart < originNs) originNs = cStart; + if (cEnd > endNs) endNs = cEnd; + + raw.push({ + meta, + ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null, + tpotMs: + readNum(rec.metrics?.time_per_output_token) ?? + readNum(rec.metrics?.inter_token_latency) ?? + null, + isl: readNum(rec.metrics?.input_sequence_length) ?? null, + osl: readNum(rec.metrics?.output_sequence_length) ?? 
null, + }); + } + + if (raw.length === 0) return null; + if (!Number.isFinite(originNs)) originNs = 0; + + // Second pass: shift timestamps to be relative to originNs (smaller + // numbers fit in JSON nicely and the frontend doesn't need bigint math). + const requests: RequestRecord[] = []; + for (const r of raw) { + const m = r.meta; + const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs; + const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs; + const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null; + const end = (m.request_end_ns ?? originNs) - originNs; + requests.push({ + cid: m.conversation_id ?? 'unknown', + ti: typeof m.turn_index === 'number' ? m.turn_index : 0, + srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined, + srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined, + srcInner: typeof m.source_inner_idx === 'number' ? m.source_inner_idx : undefined, + srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined, + wid: m.worker_id ?? 'unknown', + ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0, + phase: m.benchmark_phase ?? 'unknown', + credit, + start, + ack, + end, + ttftMs: r.ttftMs, + tpotMs: r.tpotMs, + isl: r.isl, + osl: r.osl, + cancelled: m.was_cancelled === true, + }); + } + + // Stable order so backfill output is deterministic. + requests.sort((a, b) => a.start - b.start); + + return { + version: REQUEST_TIMELINE_VERSION, + startNs: originNs, + endNs, + durationS: endNs > originNs ? 
(endNs - originNs) / 1e9 : 0, + requests, + }; +} diff --git a/packages/db/src/etl/dataset-provenance.test.ts b/packages/db/src/etl/dataset-provenance.test.ts new file mode 100644 index 00000000..4022546e --- /dev/null +++ b/packages/db/src/etl/dataset-provenance.test.ts @@ -0,0 +1,40 @@ +import { describe, expect, it } from 'vitest'; + +import { datasetSlugFromBenchmarkRow } from './dataset-provenance'; + +describe('datasetSlugFromBenchmarkRow', () => { + it('maps aiperf public-dataset provenance to the dashboard dataset slug', () => { + expect( + datasetSlugFromBenchmarkRow({ + dataset: { + source_type: 'public_dataset', + loader: 'semianalysis_cc_traces_weka_with_subagents', + hf_dataset_name: 'semianalysisai/cc-traces-weka-062126', + hf_split: 'train', + num_dataset_entries: 393, + }, + }), + ).toBe('cc-traces-weka-062126'); + }); + + it('supports an unnamespaced Hugging Face dataset id', () => { + expect( + datasetSlugFromBenchmarkRow({ + dataset: { + source_type: 'public_dataset', + hf_dataset_name: 'cc-traces-weka-062126', + }, + }), + ).toBe('cc-traces-weka-062126'); + }); + + it.each([ + {}, + { dataset: null }, + { dataset: { source_type: 'synthetic', hf_dataset_name: 'owner/data' } }, + { dataset: { source_type: 'public_dataset', hf_dataset_name: '' } }, + { dataset: { source_type: 'public_dataset' } }, + ])('ignores rows without usable public-dataset provenance: %j', (row) => { + expect(datasetSlugFromBenchmarkRow(row)).toBeNull(); + }); +}); diff --git a/packages/db/src/etl/dataset-provenance.ts b/packages/db/src/etl/dataset-provenance.ts new file mode 100644 index 00000000..f0d7cd0d --- /dev/null +++ b/packages/db/src/etl/dataset-provenance.ts @@ -0,0 +1,32 @@ +const TRAILING_SLASHES = /\/+$/u; + +/** Dataset provenance emitted by aiperf and preserved in agentic benchmark rows. 
*/ +export interface DatasetProvenance { + source_type?: unknown; + loader?: unknown; + hf_dataset_name?: unknown; + hf_split?: unknown; + hf_subset?: unknown; + num_dataset_entries?: unknown; +} + +/** + * Resolve the dashboard dataset slug from a benchmark row's provenance. + * + * Dataset ingest uses the final path component of the Hugging Face dataset id + * as `datasets.slug`, so `semianalysisai/cc-traces-weka-062126` maps to + * `cc-traces-weka-062126` here as well. + */ +export function datasetSlugFromBenchmarkRow(row: Record): string | null { + const dataset = row.dataset; + if (!dataset || typeof dataset !== 'object' || Array.isArray(dataset)) return null; + + const provenance = dataset as DatasetProvenance; + if (provenance.source_type !== 'public_dataset') return null; + if (typeof provenance.hf_dataset_name !== 'string') return null; + + const datasetId = provenance.hf_dataset_name.trim().replace(TRAILING_SLASHES, ''); + if (!datasetId) return null; + const slug = datasetId.slice(datasetId.lastIndexOf('/') + 1); + return slug || null; +} diff --git a/packages/db/src/etl/distribution-stats.ts b/packages/db/src/etl/distribution-stats.ts new file mode 100644 index 00000000..da3603ab --- /dev/null +++ b/packages/db/src/etl/distribution-stats.ts @@ -0,0 +1,98 @@ +/** + * Generic distribution math shared by the dataset ETL: percentile summaries + * and histogram binning for the dataset-detail cards. Pure functions, no DB + * access. (The per-benchmark-row percentile bundle uses `percentilesOf` in + * `queries/agentic-aggregates` — a different shape with its own version key.) + */ + +export interface HistogramBin { + x0: number; + x1: number; + count: number; +} + +export interface NumberSummary { + count: number; + min: number; + max: number; + mean: number; + median: number; + p75: number; + p90: number; + p95: number; +} + +/** Distribution summary with linear-interpolated percentiles. 
*/ +export function summarizeValues(values: readonly number[]): NumberSummary { + if (values.length === 0) { + return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 }; + } + const sorted = [...values].toSorted((a, b) => a - b); + const quantile = (q: number): number => { + const pos = (sorted.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sorted[lo]!; + return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo); + }; + return { + count: sorted.length, + min: sorted[0]!, + max: sorted.at(-1)!, + mean: sorted.reduce((sum, value) => sum + value, 0) / sorted.length, + median: quantile(0.5), + p75: quantile(0.75), + p90: quantile(0.9), + p95: quantile(0.95), + }; +} + +/** Linear-width histogram over [0, max]. Empty input → []. */ +export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] { + if (values.length === 0) return []; + const max = Math.max(...values); + if (max <= 0) return [{ x0: 0, x1: 1, count: values.length }]; + const width = max / bins; + const out: HistogramBin[] = Array.from({ length: bins }, (_, i) => ({ + x0: i * width, + x1: (i + 1) * width, + count: 0, + })); + for (const v of values) { + const idx = Math.min(bins - 1, Math.max(0, Math.floor(v / width))); + out[idx].count += 1; + } + return out; +} + +/** Log-width histogram over positive values (values ≤ 0 are dropped). */ +export function logHistogram(values: readonly number[], bins = 40): HistogramBin[] { + const pos = values.filter((v) => v > 0); + if (pos.length === 0) return []; + const min = Math.min(...pos); + const max = Math.max(...pos); + const lo = Math.log10(min); + const hi = Math.log10(max); + if (hi <= lo) return [{ x0: min, x1: max <= min ? 
min * 10 : max, count: pos.length }]; + const width = (hi - lo) / bins; + const out: HistogramBin[] = Array.from({ length: bins }, (_, i) => ({ + x0: 10 ** (lo + i * width), + x1: 10 ** (lo + (i + 1) * width), + count: 0, + })); + for (const v of pos) { + const idx = Math.min(bins - 1, Math.max(0, Math.floor((Math.log10(v) - lo) / width))); + out[idx].count += 1; + } + return out; +} + +/** Log-width histogram that preserves zero as a dedicated first bin. */ +export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] { + const zeroCount = values.filter((value) => value === 0).length; + const positive = values.filter((value) => value > 0); + if (zeroCount === 0) return logHistogram(positive, bins); + if (positive.length === 0) return [{ x0: 0, x1: 1, count: zeroCount }]; + const positiveBins = logHistogram(positive, Math.max(1, bins - 1)); + return [{ x0: 0, x1: positiveBins[0]?.x0 ?? 1, count: zeroCount }, ...positiveBins]; +} diff --git a/packages/db/src/etl/gzip-json-stream.test.ts b/packages/db/src/etl/gzip-json-stream.test.ts new file mode 100644 index 00000000..9051ee82 --- /dev/null +++ b/packages/db/src/etl/gzip-json-stream.test.ts @@ -0,0 +1,66 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { isStringTooLongError, streamCollectKeys } from './gzip-json-stream.js'; + +describe('isStringTooLongError', () => { + it('matches the ERR_STRING_TOO_LONG code', () => { + const err = new Error('Cannot create a string longer than ...') as NodeJS.ErrnoException; + err.code = 'ERR_STRING_TOO_LONG'; + expect(isStringTooLongError(err)).toBe(true); + }); + + it('matches the message-only variant', () => { + expect(isStringTooLongError(new Error('Cannot create a string longer than 0x1fffffe8'))).toBe( + true, + ); + }); + + it('rejects unrelated errors and non-errors', () => { + expect(isStringTooLongError(new Error('unexpected token'))).toBe(false); + 
expect(isStringTooLongError(null)).toBe(false); + expect(isStringTooLongError('ERR_STRING_TOO_LONG-ish string')).toBe(false); + }); +}); + +describe('streamCollectKeys', () => { + const blob = gzipSync( + JSON.stringify({ + metrics: { + 'vllm:prompt_tokens': { series: [{ timeslices: [{ start_ns: 1, rate: 2 }] }] }, + 'vllm:ignored_metric': { series: [] }, + }, + warmup_metrics: { + 'vllm:prompt_tokens': { series: [] }, + }, + }), + ); + + it('collects only wanted keys under the filtered top-level block', async () => { + const out = await streamCollectKeys<{ series: unknown[] }>( + blob, + 'metrics', + new Set(['vllm:prompt_tokens']), + ); + expect(Object.keys(out)).toEqual(['vllm:prompt_tokens']); + expect(out['vllm:prompt_tokens']).toEqual({ + series: [{ timeslices: [{ start_ns: 1, rate: 2 }] }], + }); + }); + + it('reads a different top-level phase block via filter', async () => { + const out = await streamCollectKeys<{ series: unknown[] }>( + blob, + 'warmup_metrics', + new Set(['vllm:prompt_tokens']), + ); + expect(out).toEqual({ 'vllm:prompt_tokens': { series: [] } }); + }); + + it('rejects on a non-gzip buffer', async () => { + await expect( + streamCollectKeys(Buffer.from('not gzip'), 'metrics', new Set(['x'])), + ).rejects.toThrow(); + }); +}); diff --git a/packages/db/src/etl/gzip-json-stream.ts b/packages/db/src/etl/gzip-json-stream.ts new file mode 100644 index 00000000..cb299a8d --- /dev/null +++ b/packages/db/src/etl/gzip-json-stream.ts @@ -0,0 +1,58 @@ +/** + * Shared stream-parse helpers for gzipped server-metrics blobs. + * + * `gunzipSync(buffer).toString('utf8')` trips Node's 512 MB max-string-length + * cap on high-conc TP+EP rows, so the compute-* ETL helpers fall back to a + * stream-json pipeline that collects only the top-level subtrees they need. + * Both the fast-path error detection and the pipeline itself live here so + * chart-series and aggregate-stats stay byte-identical in how they parse. 
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/**
+ * True when `error` is Node's max-string-length failure (`ERR_STRING_TOO_LONG`
+ * or a message-only variant, whatever limit it quotes) — the signal to switch
+ * from `gunzipSync().toString()` to the streaming parser.
+ */
+export function isStringTooLongError(error: unknown): boolean {
+  const code = error && (error as NodeJS.ErrnoException).code;
+  const msg = error instanceof Error ? error.message : String(error);
+  return code === 'ERR_STRING_TOO_LONG' || msg.includes('Cannot create a string longer than');
+}
+
+/**
+ * Gunzip + stream-parse `buffer`, descending into the top-level `filter` key
+ * (e.g. `metrics` / `warmup_metrics`) and collecting only the child entries
+ * whose key is in `wanted`. Never materializes the full JSON string.
+ */
+export async function streamCollectKeys<T>(
+  buffer: Buffer,
+  filter: string,
+  wanted: ReadonlySet<string>,
+): Promise<Record<string, T>> {
+  const collected: Record<string, T> = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    pipeline.on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: T };
+      if (wanted.has(key)) collected[key] = value;
+    });
+    pipeline.on('end', resolve);
+    pipeline.on('error', reject);
+  });
+  return collected;
+}
diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts
index e569143a..82aaf67c 100644
--- a/packages/db/src/etl/normalizers.test.ts
+++ b/packages/db/src/etl/normalizers.test.ts
@@ -25,6 +25,11 @@ describe('hwToGpuKey', () => {
     expect(hwToGpuKey('mi300x-amd')).toBe('mi300x');
   });
 
+  it('strips a v3 scope prefix (cluster:…)', () => {
+    expect(hwToGpuKey('cluster:b300-nv')).toBe('b300');
+    expect(hwToGpuKey('cluster:h200')).toBe('h200');
+  });
+
   it('strips -amds suffix', () => {
     expect(hwToGpuKey('mi355x-amds')).toBe('mi355x');
   });
diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index 1d6a95c1..07793dee 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -22,7 +22,11 @@ export { GPU_KEYS };
  * stripped base is not in `GPU_KEYS`.
  */
 export function hwToGpuKey(hw: string): string | null {
-  const base = hw.toLowerCase().split('-')[0];
+  // v3 agentic artifacts scope the hw id (`cluster:b300-nv`) — drop everything
+  // up to the last `:` first. Then take the first segment before `-` as the
+  // canonical key; that subsumes all the prior explicit suffix strips
+  // (-nv, -amds, -dgxc-slurm, -p1, -cw, …).
+  const base = hw.toLowerCase().split(':').pop()!.split('-')[0];
   return GPU_KEYS.has(base) ? base : null;
 }
 
@@ -138,7 +142,7 @@ export function resolveModelKey(row: Record): string | null {
  */
 export function normalizeFramework(
   fw: string,
-  disaggField: any,
+  disaggField: unknown,
 ): { framework: string; disagg: boolean } {
   const lower = fw.toLowerCase();
   const alias = FRAMEWORK_ALIASES[lower];
@@ -171,7 +175,7 @@ export function normalizePrecision(raw: string): string {
  * @param spec - Raw `spec_decoding` value from the artifact.
  * @returns Lowercase method name, or `'none'` if absent/empty.
  */
-export function normalizeSpecMethod(spec: any): string {
+export function normalizeSpecMethod(spec: unknown): string {
   if (!spec || spec === '') return 'none';
   return String(spec).toLowerCase();
 }
@@ -183,7 +187,7 @@ export function normalizeSpecMethod(spec: any): string {
  * @param v - Value to coerce (any type).
  * @returns `true` if the value is one of the recognized truthy forms, `false` otherwise.
  */
-export function parseBool(v: any): boolean {
+export function parseBool(v: unknown): boolean {
   return v === true || v === 'true' || v === 'True';
 }
 
@@ -194,7 +198,7 @@ export function parseBool(v: any): boolean {
  * @param v - Value to parse (number, string, null, or undefined).
  * @returns The parsed number, or `undefined` if the input is null/undefined/NaN.
  */
-export function parseNum(v: any): number | undefined {
+export function parseNum(v: unknown): number | undefined {
   if (v === null || v === undefined) return undefined;
   const n = typeof v === 'string' ? parseFloat(v) : Number(v);
   return isNaN(n) ? undefined : n;
@@ -207,12 +211,14 @@ export function parseNum(v: any): number | undefined {
  * @param v - Value to parse (number, string, null, or undefined).
  * @returns The parsed integer, or `undefined` if the input is null/undefined/NaN.
  */
-export function parseInt2(v: any): number | undefined {
+export function parseInt2(v: unknown): number | undefined {
   if (v === null || v === undefined) return undefined;
   const n = typeof v === 'string' ? parseInt(v, 10) : Math.round(Number(v));
   return isNaN(n) ? undefined : n;
 }
 
+const ISL_OSL_PATTERN = /[_-](?<isl>\d+)k(?<osl>\d+)k[_\-.]/iu;
+
 /**
  * Extract ISL (input sequence length) and OSL (output sequence length) in tokens
  * from a file/directory name that encodes them as `{n}k{m}k`.
@@ -225,7 +231,7 @@ export function parseInt2(v: any): number | undefined {
  * @returns An object with `isl` and `osl` in tokens, or `null` if no match is found.
  */
 export function parseIslOsl(name: string): { isl: number; osl: number } | null {
-  const m = name.match(/[_-](?<isl>\d+)k(?<osl>\d+)k[_\-.]/iu);
+  const m = name.match(ISL_OSL_PATTERN);
   if (!m) return null;
   return { isl: parseInt(m[1], 10) * 1024, osl: parseInt(m[2], 10) * 1024 };
 }
diff --git a/packages/db/src/etl/server-log-metrics.test.ts b/packages/db/src/etl/server-log-metrics.test.ts
new file mode 100644
index 00000000..9e0fa852
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from 'vitest';
+
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
+
+describe('kvCachePoolTokensFromServerLog', () => {
+  it('returns null for empty / missing logs', () => {
+    expect(kvCachePoolTokensFromServerLog(null)).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('')).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('no kv cache line here')).toBeNull();
+  });
+
+  it('reads a single-engine (ep1) pool size', () => {
+    const log = `
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1744] GPU KV cache size: 11,294,463 tokens
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1745] Maximum concurrency for 1,048,576 tokens per request: 10.77x
+`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(11_294_463);
+  });
+
+  it('sums across data-parallel engine cores (ep8)', () => {
+    const lines = Array.from(
+      { length: 8 },
+      (_, i) =>
+        `(EngineCore_DP${i} pid=${2337827 + i}) INFO [kv_cache_utils.py:1744] GPU KV cache size: 11,577,333 tokens`,
+    ).join('\n');
+    expect(kvCachePoolTokensFromServerLog(lines)).toBe(11_577_333 * 8);
+  });
+
+  it('dedups reprinted lines for the same engine core', () => {
+    const log = `
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP1 pid=2) GPU KV cache size: 5,000,000 tokens
+`;
+    // DP0 counted once + DP1 once = 10M, not 15M.
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(10_000_000);
+  });
+
+  it('falls back to bare lines when no engine-core prefix is present', () => {
+    const log = `INFO GPU KV cache size: 1,234,567 tokens`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(1_234_567);
+  });
+});
diff --git a/packages/db/src/etl/server-log-metrics.ts b/packages/db/src/etl/server-log-metrics.ts
new file mode 100644
index 00000000..b8b26dd1
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.ts
@@ -0,0 +1,65 @@
+/**
+ * Derive server-side scalars from the captured vLLM server log
+ * (`server_logs.server_log`). These come from startup log lines rather than the
+ * scraped Prometheus `/metrics`, because for MLA / sparse-attention models the
+ * `vllm:cache_config_info` labels (num_gpu_blocks × block_size) do NOT
+ * reconstruct the real KV-cache token capacity — they undercount by a
+ * non-constant factor. vLLM's own `GPU KV cache size: N tokens` line is the
+ * authoritative number.
+ */
+
+/**
+ * Total KV-cache pool size in tokens.
+ *
+ * vLLM prints one `GPU KV cache size: N tokens` line per engine core (one per
+ * data-parallel rank; tensor-parallel is already aggregated into that single
+ * per-engine number).
We sum across distinct engine cores so the result is the
+ * deployment-wide total:
+ *
+ *   (EngineCore pid=…) GPU KV cache size: 11,294,463 tokens → ep1 total
+ *   (EngineCore_DP0 pid=…) GPU KV cache size: 11,577,333 tokens ┐
+ *   (EngineCore_DP1 pid=…) GPU KV cache size: 11,577,333 tokens ┘ → ×8 = total
+ *
+ * Returns null when the log has no such line (non-vLLM frameworks, or a log
+ * that didn't capture engine startup).
+ */
+export function kvCachePoolTokensFromServerLog(serverLog: string | null): number | null {
+  if (!serverLog) return null;
+
+  // Scan line-by-line. We deliberately avoid a global regex over the whole blob
+  // with a lazy `[^\n]*?` bridge between the engine tag and the size: some logs
+  // contain multi-megabyte single lines (progress bars, tracebacks) that make
+  // such a regex recurse and blow the stack. A per-line substring pre-filter
+  // means the (cheap) regexes only ever run on the short KV-size lines.
+  //
+  // Each engine core prints one line; the tag (e.g. `EngineCore_DP3`) is stable
+  // across a run while the pid is not, so key on the tag to dedup reprints and
+  // sum across data-parallel ranks.
+  const tagRe = /\((?<tag>EngineCore(?:_DP\d+)?)\s+pid=\d+\)/u;
+  const sizeRe = /GPU KV cache size:\s*(?<tokens>[\d,]+)\s*tokens/u;
+  const perEngine = new Map<string, number>();
+  let bareTotal = 0;
+  let bareFound = false;
+  for (const line of serverLog.split('\n')) {
+    if (!line.includes('GPU KV cache size')) continue;
+    const sizeMatch = sizeRe.exec(line);
+    if (!sizeMatch) continue;
+    const tokens = Number(sizeMatch.groups!.tokens!.replaceAll(',', ''));
+    if (!Number.isFinite(tokens) || tokens <= 0) continue;
+    const tagMatch = tagRe.exec(line);
+    if (tagMatch) {
+      perEngine.set(tagMatch.groups!.tag!, tokens);
+    } else {
+      // Fallback for logs without the engine-core prefix: count each occurrence
+      // (one per engine when there are no reprints). Best-effort only.
+      bareTotal += tokens;
+      bareFound = true;
+    }
+  }
+  if (perEngine.size > 0) {
+    let total = 0;
+    for (const v of perEngine.values()) total += v;
+    return total;
+  }
+  return bareFound ? bareTotal : null;
+}
diff --git a/packages/db/src/etl/server-metrics-adapters.ts b/packages/db/src/etl/server-metrics-adapters.ts
new file mode 100644
index 00000000..f123d9f8
--- /dev/null
+++ b/packages/db/src/etl/server-metrics-adapters.ts
@@ -0,0 +1,100 @@
+/**
+ * Normalize orchestrator-specific server-metric labels into a stable source
+ * identity consumed by the API and frontend. AIPerf owns the export envelope;
+ * the serving orchestrator owns the meaning of labels inside each series.
+ */
+
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+
+export interface RawMetricSourceSeries {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+}
+
+export interface ServerMetricsContext {
+  /** Canonical framework stored in configs, for example `dynamo-vllm`. */
+  framework?: string | null;
+  /** Per-worker role series are only meaningful for disaggregated configs. */
+  disagg?: boolean;
+}
+
+export interface MetricSource {
+  /** Stable key used to join this source across different metric names. */
+  id: string;
+  adapter: string;
+  role: MetricSourceRole;
+  endpointUrl: string | null;
+  nativeRole: string | null;
+  workerId: string | null;
+  dpRank: string | null;
+  engine: string | null;
+}
+
+interface ServerMetricsAdapter {
+  id: string;
+  matches: (context: ServerMetricsContext) => boolean;
+  identifySource: (series: RawMetricSourceSeries) => MetricSource;
+}
+
+function stableId(adapter: string, parts: (string | null | undefined)[]): string {
+  return [adapter, ...parts.map((part) => part ?? '')].join('|');
+}
+
+const dynamoAdapter: ServerMetricsAdapter = {
+  id: 'dynamo',
+  matches: ({ framework }) => framework?.startsWith('dynamo-') ?? false,
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const nativeRole = labels['dynamo_component'] ?? null;
+    const role: MetricSourceRole =
+      nativeRole === 'prefill'
+        ? 'prefill'
+        : nativeRole === 'backend'
+          ? 'decode'
+          : nativeRole === 'frontend' || nativeRole === 'router'
+            ? 'router'
+            : 'unknown';
+    const endpointUrl = series.endpoint_url ?? labels['dynamo_endpoint'] ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+    return {
+      id: stableId('dynamo', [role, endpointUrl, workerId, dpRank, engine]),
+      adapter: 'dynamo',
+      role,
+      endpointUrl,
+      nativeRole,
+      workerId,
+      dpRank,
+      engine,
+    };
+  },
+};
+
+const genericAdapter: ServerMetricsAdapter = {
+  id: 'generic',
+  matches: () => true,
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const endpointUrl = series.endpoint_url ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+    return {
+      id: stableId('generic', [endpointUrl, workerId, dpRank, engine]),
+      adapter: 'generic',
+      role: endpointUrl || workerId || dpRank || engine ? 'unknown' : 'combined',
+      endpointUrl,
+      nativeRole: null,
+      workerId,
+      dpRank,
+      engine,
+    };
+  },
+};
+
+const ADAPTERS: readonly ServerMetricsAdapter[] = [dynamoAdapter, genericAdapter];
+
+export function selectServerMetricsAdapter(context: ServerMetricsContext): ServerMetricsAdapter {
+  return ADAPTERS.find((adapter) => adapter.matches(context)) ?? genericAdapter;
+}
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
     expect(tracker.skips.unmappedHw).toBe(0);
     expect(tracker.skips.noIslOsl).toBe(0);
     expect(tracker.skips.dbError).toBe(0);
+    expect(tracker.skips.traceReplayMissing).toBe(0);
   });
 
   it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 134b5299..5d485bf2 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,7 +8,10 @@ export interface Skips {
   unmappedModel: number;
   unmappedHw: number;
   noIslOsl: number;
+  failedRun: number;
   dbError: number;
+  /** Agentic point whose sibling `agentic_<suffix>` artifact had no trace_replay files. */
+  traceReplayMissing: number;
 }
 
 export interface SkipSnapshot {
@@ -66,7 +69,15 @@ const MAX_DB_ERRORS = 10;
  * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
*/ export function createSkipTracker(): SkipTracker { - const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 }; + const skips: Skips = { + badZip: 0, + unmappedModel: 0, + unmappedHw: 0, + noIslOsl: 0, + failedRun: 0, + dbError: 0, + traceReplayMissing: 0, + }; const unmappedModels = new Set(); const unmappedHws = new Set(); const unmappedPrecisions = new Set(); diff --git a/packages/db/src/etl/trace-artifact-discovery.test.ts b/packages/db/src/etl/trace-artifact-discovery.test.ts new file mode 100644 index 00000000..2bb1d51b --- /dev/null +++ b/packages/db/src/etl/trace-artifact-discovery.test.ts @@ -0,0 +1,66 @@ +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { discoverTraceReplayArtifacts } from './trace-artifact-discovery'; + +const tempDirs: string[] = []; + +function tempDir(): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'trace-artifacts-test-')); + tempDirs.push(dir); + return dir; +} + +function writeTraceFiles(dir: string): void { + fs.mkdirSync(path.join(dir, 'aiperf_artifacts'), { recursive: true }); + fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'profile_export.jsonl'), '{}\n'); + fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.csv'), 'x,y\n'); + fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.json'), '{}'); +} + +afterEach(() => { + for (const dir of tempDirs.splice(0)) fs.rmSync(dir, { recursive: true, force: true }); +}); + +describe('discoverTraceReplayArtifacts', () => { + it('discovers the existing single-node sibling layout', () => { + const root = tempDir(); + writeTraceFiles(path.join(root, 'agentic_config-a')); + + const found = discoverTraceReplayArtifacts(root); + + expect(found.get('config-a')).toMatchObject({ + profileJsonl: 
expect.stringContaining('profile_export.jsonl'), + serverMetricsCsv: expect.stringContaining('server_metrics_export.csv'), + serverMetricsJson: expect.stringContaining('server_metrics_export.json'), + }); + }); + + it('extracts and indexes multinode traces by concurrency', () => { + const root = tempDir(); + const artifactDir = path.join(root, 'multinode_server_logs_config-b'); + const archiveSource = path.join(root, 'archive-source'); + writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_96')); + writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_128')); + fs.mkdirSync(artifactDir, { recursive: true }); + execFileSync('tar', [ + '-czf', + path.join(artifactDir, 'multinode_server_logs.tar.gz'), + '-C', + archiveSource, + '.', + ]); + fs.rmSync(archiveSource, { recursive: true, force: true }); + + const found = discoverTraceReplayArtifacts(root); + + expect([...found.keys()].toSorted()).toEqual(['config-b|128', 'config-b|96']); + expect(found.get('config-b|96')?.profileJsonl).toContain( + 'multinode_server_logs/agentic/conc_96/aiperf_artifacts/profile_export.jsonl', + ); + }); +}); diff --git a/packages/db/src/etl/trace-artifact-discovery.ts b/packages/db/src/etl/trace-artifact-discovery.ts new file mode 100644 index 00000000..71ee74df --- /dev/null +++ b/packages/db/src/etl/trace-artifact-discovery.ts @@ -0,0 +1,93 @@ +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; + +export interface TraceReplayArtifactPaths { + profileJsonl: string | null; + serverMetricsCsv: string | null; + serverMetricsJson: string | null; +} + +const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay']; + +const AGENTIC_PREFIX = /^agentic_/u; +const MULTINODE_PREFIX = /^multinode_server_logs_/u; +const CONC_DIR_PATTERN = /^conc_(?\d+)$/u; + +function traceFilesIn(dir: string): TraceReplayArtifactPaths | null { + let profileJsonl: string | null = null; + let serverMetricsCsv: string | null = null; + let 
serverMetricsJson: string | null = null; + + for (const subdir of TRACE_SUBDIRS) { + const traceDir = path.join(dir, subdir); + if (!fs.existsSync(traceDir) || !fs.statSync(traceDir).isDirectory()) continue; + + const profilePath = path.join(traceDir, 'profile_export.jsonl'); + const csvPath = path.join(traceDir, 'server_metrics_export.csv'); + const jsonPath = path.join(traceDir, 'server_metrics_export.json'); + if (!profileJsonl && fs.existsSync(profilePath)) profileJsonl = profilePath; + if (!serverMetricsCsv && fs.existsSync(csvPath)) serverMetricsCsv = csvPath; + if (!serverMetricsJson && fs.existsSync(jsonPath)) serverMetricsJson = jsonPath; + } + + if (!profileJsonl && !serverMetricsCsv && !serverMetricsJson) return null; + return { profileJsonl, serverMetricsCsv, serverMetricsJson }; +} + +function extractMultinodeArchive(artifactDir: string): string | null { + const archivePath = path.join(artifactDir, 'multinode_server_logs.tar.gz'); + const extractedDir = path.join(artifactDir, 'multinode_server_logs'); + + if (!fs.existsSync(extractedDir) && fs.existsSync(archivePath)) { + fs.mkdirSync(extractedDir, { recursive: true }); + execFileSync('tar', ['-xzf', archivePath, '-C', extractedDir], { stdio: 'ignore' }); + } + + return fs.existsSync(extractedDir) ? extractedDir : null; +} + +/** + * Discover trace-replay siblings in both artifact layouts: + * + * - Single-node: `agentic_/aiperf_artifacts/*` + * - Multinode: `multinode_server_logs_/multinode_server_logs.tar.gz`, + * containing `agentic/conc_/aiperf_artifacts/*` + * + * Multinode keys include concurrency (`|`) because one artifact + * contains several points, each with a distinct trace payload. 
+ */
+export function discoverTraceReplayArtifacts(
+  artifactsDir: string,
+): Map<string, TraceReplayArtifactPaths> {
+  const discovered = new Map<string, TraceReplayArtifactPaths>();
+  if (!fs.existsSync(artifactsDir)) return discovered;
+
+  for (const entry of fs.readdirSync(artifactsDir)) {
+    const artifactDir = path.join(artifactsDir, entry);
+    if (!fs.statSync(artifactDir, { throwIfNoEntry: false })?.isDirectory()) continue;
+
+    if (entry.startsWith('agentic_')) {
+      const trace = traceFilesIn(artifactDir);
+      if (trace) discovered.set(entry.replace(AGENTIC_PREFIX, ''), trace);
+      continue;
+    }
+
+    if (!entry.startsWith('multinode_server_logs_')) continue;
+    const extractedDir = extractMultinodeArchive(artifactDir);
+    if (!extractedDir) continue;
+
+    const agenticDir = path.join(extractedDir, 'agentic');
+    if (!fs.existsSync(agenticDir) || !fs.statSync(agenticDir).isDirectory()) continue;
+
+    const suffix = entry.replace(MULTINODE_PREFIX, '');
+    for (const concEntry of fs.readdirSync(agenticDir)) {
+      const match = concEntry.match(CONC_DIR_PATTERN);
+      if (!match?.groups?.conc) continue;
+      const trace = traceFilesIn(path.join(agenticDir, concEntry));
+      if (trace) discovered.set(`${suffix}|${match.groups.conc}`, trace);
+    }
+  }
+
+  return discovered;
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..b50168db
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,151 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl` +
+ * `server_metrics_export.csv`) into `agentic_trace_replay` and link the new row
+ * to each provided benchmark_results row via `trace_replay_id`.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */ + +import { gzipSync } from 'node:zlib'; + +import type postgres from 'postgres'; + +import { computeAggregateStats } from './compute-aggregate-stats.js'; +import { computeChartSeries } from './compute-chart-series.js'; +import { computeRequestTimeline } from './compute-request-timeline.js'; +import type { ServerMetricsContext } from './server-metrics-adapters'; + +type Sql = ReturnType; + +/** + * Persist the per-point trace files and link them to `benchmarkResultIds`. + * + * @param sql Active `postgres` connection. + * @param benchmarkResultIds DB ids of the benchmark_results rows produced by + * the same `bmk_agentic_` artifact whose + * sibling `agentic_` directory holds these + * trace files. + * @param profileExportJsonl Raw bytes of `profile_export.jsonl`, or null. + * Gzipped before storage. + * @param serverMetricsCsv Raw bytes of `server_metrics_export.csv`, or null. + * Stored as-is. + * @param serverMetricsJson Raw bytes of `server_metrics_export.json` — + * per-scrape time-series of every Prometheus metric. + * Optional, gzipped before storage (~42x ratio). + * @param metricsContext Canonical framework used to select the + * orchestrator-specific metric-label adapter. + */ +export async function insertTraceReplay( + sql: Sql, + benchmarkResultIds: number[], + profileExportJsonl: Buffer | null, + serverMetricsCsv: Buffer | null, + serverMetricsJson: Buffer | null = null, + metricsContext: ServerMetricsContext = {}, +): Promise { + if (benchmarkResultIds.length === 0) return; + if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return; + + // Only link rows that don't already point at a trace_replay row — keeps + // re-ingest from inserting duplicate sibling blobs. + const unlinked = await sql<{ id: number }[]>` + select id from benchmark_results + where id = any(${sql.array(benchmarkResultIds)}::bigint[]) + and trace_replay_id is null + `; + if (unlinked.length === 0) return; + + const profileGz = profileExportJsonl ? 
gzipSync(profileExportJsonl) : null; + const profileSize = profileExportJsonl ? profileExportJsonl.length : null; + const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null; + const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null; + const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null; + + // Pre-compute aggregate stats + chart-ready time-series + per-request + // timeline so the detail page doesn't have to re-parse these blobs on + // every request. Each helper tolerates a null blob and falls back to + // a streaming parser for oversized server_metrics blobs. + const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([ + computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }), + computeChartSeries(metricsJsonGz, metricsContext), + Promise.resolve(computeRequestTimeline(profileGz)), + ]); + + const [{ id: traceReplayId }] = await sql<{ id: number }[]>` + insert into agentic_trace_replay ( + profile_export_jsonl_gz, + profile_export_uncompressed_size, + server_metrics_csv, + server_metrics_csv_size, + server_metrics_json_gz, + server_metrics_json_uncompressed_size, + aggregate_stats, + chart_series, + request_timeline + ) + values ( + ${profileGz}, + ${profileSize}, + ${serverMetricsCsv}, + ${csvSize}, + ${metricsJsonGz}, + ${metricsJsonSize}, + ${sql.json(structuredClone(aggregateStats) as unknown as Parameters[0])}, + ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters[0])}, + ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters[0])} + ) + returning id + `; + + await sql` + update benchmark_results + set trace_replay_id = ${traceReplayId} + where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) + `; + + // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang + // runs don't populate these in the harness JSON; vLLM runs do but only + // for GPU. 
We always recompute to keep the derivation consistent with + // what the detail-page charts plot — overwriting any pre-existing value. + // + // Source label naming differs by framework / cache topology: + // SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)' + // SGLang older: 'cache hit' (no tier breakdown) + // vLLM LMCache: 'local_cache_hit' + 'external_kv_transfer' (+ 'local_compute' for miss) + // vLLM single: falls back to prefixCacheHitsTps total (= local cache only) + if (chartSeries && chartSeries.prefillTps.length > 0) { + const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); + if (sumPrompts > 0) { + const sumOf = (name: string): number => + (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0); + // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer. + const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer'); + // GPU/HBM hits from source breakdown, summed across known aliases. + const hbmFromBreakdown = + sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit'); + // If the source breakdown has any GPU entry, use it. Otherwise fall back + // to total prefixCacheHitsTps sum (single-source vLLM path with no + // by_source metric — equals the lone cache counter's lifetime). + const gpuHits = + hbmFromBreakdown > 0 + ? hbmFromBreakdown + : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + const gpuRate = gpuHits / sumPrompts; + const cpuRate = cpuHits > 0 ? 
cpuHits / sumPrompts : null; + await sql` + update benchmark_results + set metrics = jsonb_set( + case when ${cpuRate}::numeric is not null + then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric)) + else metrics + end, + '{server_gpu_cache_hit_rate}', + to_jsonb(${gpuRate}::numeric) + ) + where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) + `; + } + } +} diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts new file mode 100644 index 00000000..444236ab --- /dev/null +++ b/packages/db/src/etl/weka-structure.test.ts @@ -0,0 +1,259 @@ +import { describe, it, expect } from 'vitest'; +import { + countSeenPrefixBlocks, + buildConversationStructure, + countConversationRequests, + linearHistogram, + logHistogram, + logHistogramWithZero, + subagentRequestTurns, + summarizeValues, + type RawWekaConversation, + type SubagentNode, + type TurnNode, +} from './weka-structure'; + +describe('countSeenPrefixBlocks', () => { + it('counts only the contiguous leading run already seen', () => { + const seen = new Set([1, 2, 3, 9]); + // 1,2,3 seen contiguously; 4 breaks the run even though 9 is seen later. + expect(countSeenPrefixBlocks([1, 2, 3, 4, 9], seen)).toBe(3); + }); + + it('returns 0 when the first block is unseen', () => { + expect(countSeenPrefixBlocks([7, 1, 2], new Set([1, 2]))).toBe(0); + }); + + it('returns the full length when every block is seen', () => { + expect(countSeenPrefixBlocks([1, 2], new Set([1, 2, 3]))).toBe(2); + }); + + it('handles empty hash list', () => { + expect(countSeenPrefixBlocks([], new Set([1]))).toBe(0); + }); +}); + +describe('buildConversationStructure', () => { + it('splits input into cached-prefix vs uncached as the prefix cache warms', () => { + const conv: RawWekaConversation = { + id: 'c1', + block_size: 64, + requests: [ + // Turn 0: nothing seen yet → all uncached. 
+ { type: 'n', model: 'm', in: 128, out: 10, hash_ids: [1, 2] }, + // Turn 1: blocks 1,2 already seen, 3 is new → 2 blocks cached. + { type: 'n', model: 'm', in: 192, out: 20, hash_ids: [1, 2, 3] }, + ], + }; + const s = buildConversationStructure(conv); + const t0 = s.nodes[0] as TurnNode; + const t1 = s.nodes[1] as TurnNode; + expect(t0).toMatchObject({ kind: 'turn', in: 128, cached: 0, uncached: 128, out: 10 }); + expect(t1.cached).toBe(128); // 2 blocks × 64 + expect(t1.uncached).toBe(64); // 192 - 128 + expect(s.totals).toMatchObject({ + in: 320, + out: 30, + cached: 128, + uncached: 192, + numTurns: 2, + numSubagentGroups: 0, + }); + }); + + it('stamps top-level turns with their raw Weka request index', () => { + const structure = buildConversationStructure({ + id: 'raw-index', + requests: [ + { type: 'n', in: 1, out: 1 }, + { type: 'subagent', requests: [{ type: 'n', in: 1, out: 1 }] }, + { type: 'n', in: 1, out: 1 }, + ], + }); + + expect((structure.nodes[0] as TurnNode).rawIndex).toBe(0); + expect((structure.nodes[2] as TurnNode).rawIndex).toBe(2); + }); + + it('clamps cached to the effective input on a partial last block', () => { + const conv: RawWekaConversation = { + id: 'c2', + block_size: 64, + requests: [ + { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // 2 blocks but in=100 (partial) + { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // both seen → cached clamped to 100 + ], + }; + const s = buildConversationStructure(conv); + const t1 = s.nodes[1] as TurnNode; + expect(t1.cached).toBe(100); + expect(t1.uncached).toBe(0); + }); + + it('treats turns with no hash_ids as fully uncached', () => { + const conv: RawWekaConversation = { + id: 'c3', + requests: [{ type: 'n', in: 50, out: 5 }], + }; + const t0 = buildConversationStructure(conv).nodes[0] as TurnNode; + expect(t0).toMatchObject({ cached: 0, uncached: 50 }); + }); + + it('nests subagent groups with aggregated children and runs them against a spawn-time snapshot', () => { + const conv: 
RawWekaConversation = { + id: 'c4', + block_size: 64, + requests: [ + { type: 'n', model: 'main', t: 0, api_time: 1, in: 64, out: 10, hash_ids: [1] }, + { + type: 'subagent', + agent_id: 'a1', + subagent_type: 'Explore', + t: 12.5, + duration_ms: 1234, + requests: [ + // sees parent block 1 (snapshot at spawn) → 1 block cached + { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] }, + // now block 5 is also seen within the subagent → 2 cached + { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] }, + ], + }, + // Parent turn after subagent: block 5 must NOT be cached (subagent + // context not folded back); only block 1 is in the parent seen set. + { type: 'n', model: 'main', in: 128, out: 1, hash_ids: [1, 5] }, + ], + }; + const s = buildConversationStructure(conv); + expect(s.totals.numTurns).toBe(2); // two top-level normal turns + expect(s.totals.numSubagentGroups).toBe(1); + + const sub = s.nodes[1] as SubagentNode; + expect(sub.kind).toBe('subagent'); + expect(sub.label).toBe('Explore'); + expect(sub.agentId).toBe('a1'); + expect(sub.rawIndex).toBe(1); + expect(sub.durationMs).toBe(1234); + expect(sub.startS).toBe(12.5); + expect(sub.endS).toBeCloseTo(13.734, 6); + expect(sub.children).toHaveLength(2); + expect(countConversationRequests(s)).toBe(4); + expect(subagentRequestTurns(s).map((turn) => turn.model)).toEqual(['sub', 'sub']); + expect(sub.children.map((child) => [child.startS, child.endS])).toEqual([ + [12.5, 12.5], + [13.1, 13.1], + ]); + expect(sub.children.map((child) => [child.rawIndex, child.innerIndex])).toEqual([ + [1, 0], + [1, 1], + ]); + expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot + expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child + expect(sub.in).toBe(256); + expect(sub.out).toBe(10); + + const afterSub = s.nodes[2] as TurnNode; + expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back + expect((s.nodes[0] as 
TurnNode).endS).toBe(1); + }); + + it('counts top-level and subagent child turns as requests, but not subagent groups', () => { + const structure = buildConversationStructure({ + id: 'request-count', + requests: [ + { type: 'n', in: 1, out: 1 }, + { + type: 'subagent', + requests: [ + { type: 'n', in: 1, out: 1 }, + { type: 'n', in: 1, out: 1 }, + ], + }, + ], + }); + + expect(countConversationRequests(structure)).toBe(3); + expect(subagentRequestTurns(structure)).toHaveLength(2); + }); + + it('falls back to the default block size and a generic subagent label', () => { + const conv: RawWekaConversation = { + id: 'c5', + requests: [{ type: 'subagent', requests: [{ type: 'n', in: 10, out: 1, hash_ids: [1] }] }], + }; + const s = buildConversationStructure(conv); + expect(s.blockSize).toBe(64); + expect((s.nodes[0] as SubagentNode).label).toBe('Subagent'); + }); + + it('derives a subagent time range from child timings when group timing is absent', () => { + const conv: RawWekaConversation = { + id: 'c6', + requests: [ + { + type: 'subagent', + requests: [ + { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 }, + { type: 'n', t: 9, api_time: 3, in: 10, out: 1 }, + ], + }, + ], + }; + const sub = buildConversationStructure(conv).nodes[0] as SubagentNode; + expect(sub.startS).toBe(5); + expect(sub.endS).toBe(12); + }); + + it('normalizes legacy subagent-relative request intervals', () => { + const structure = buildConversationStructure({ + id: 'legacy-relative', + requests: [ + { + type: 'subagent', + t: 100, + requests: [{ type: 'n', t: 2, api_time: 3, in: 10, out: 1 }], + }, + ], + }); + const child = (structure.nodes[0] as SubagentNode).children[0]!; + expect(child).toMatchObject({ startS: 102, endS: 105 }); + }); +}); + +describe('histograms', () => { + it('linearHistogram buckets across [0, max] and totals the count', () => { + const bins = linearHistogram([0, 1, 2, 3, 4], 4); + expect(bins).toHaveLength(4); + expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(5); 
+ expect(bins[0].x0).toBe(0); + }); + + it('linearHistogram handles all-zero input', () => { + expect(linearHistogram([0, 0])).toEqual([{ x0: 0, x1: 1, count: 2 }]); + }); + + it('logHistogram drops non-positive values and preserves the positive total', () => { + const bins = logHistogram([1, 10, 100, 1000, 0, -5], 3); + expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(4); + }); + + it('both return [] for empty input', () => { + expect(linearHistogram([])).toEqual([]); + expect(logHistogram([])).toEqual([]); + }); + + it('preserves zero-valued samples in a dedicated log histogram bin', () => { + const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4); + expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 }); + expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5); + }); +}); + +describe('summarizeValues', () => { + it('computes the same linearly-interpolated percentile set as request distributions', () => { + const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1)); + expect(summary.median).toBeCloseTo(50.5, 6); + expect(summary.p75).toBeCloseTo(75.25, 6); + expect(summary.p90).toBeCloseTo(90.1, 6); + expect(summary.p95).toBeCloseTo(95.05, 6); + }); +}); diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts new file mode 100644 index 00000000..ccfb6ec7 --- /dev/null +++ b/packages/db/src/etl/weka-structure.ts @@ -0,0 +1,327 @@ +/** + * Pure transforms for the HuggingFace cc-traces-weka datasets. + * + * Turns a raw conversation record (`{ id, block_size, requests[] }`, where each + * request is a normal turn or a subagent group) into a compact, flamegraph-ready + * `structure`: ordered nodes with input split into cached-prefix vs + * uncached-suffix. The cached split ports `_count_seen_prefix_blocks` from the + * aiperf weka loader (contiguous leading hash_ids already seen under an infinite + * KV cache). No DB access — safe to import anywhere and unit-test directly. 
+ */ + +export const DEFAULT_BLOCK_SIZE = 64; + +// ── Raw record shapes (subset we read) ────────────────────────────────────── + +export interface RawWekaRequest { + t?: number; + type?: string; // 'n' | 's' + model?: string; + in?: number; + out?: number; + hash_ids?: number[]; + api_time?: number; +} + +export interface RawWekaSubagent { + t?: number; + type: 'subagent'; + agent_id?: string; + subagent_type?: string; + duration_ms?: number; + requests?: RawWekaRequest[]; + models?: string[]; +} + +export type RawWekaEntry = RawWekaRequest | RawWekaSubagent; + +export interface RawWekaConversation { + id: string; + models?: string[]; + block_size?: number; + hash_id_scope?: string; + requests?: RawWekaEntry[]; +} + +// ── Output structure (stored in dataset_conversations.structure) ───────────── + +export interface TurnNode { + kind: 'turn'; + turnIndex: number; + /** Zero-based index in the raw Weka requests array, when this row maps to one. */ + rawIndex?: number; + /** Zero-based index within a raw nested request array, when this row maps to one. */ + innerIndex?: number; + /** Seconds from the start of the conversation. */ + startS?: number; + /** End of the original request interval (`startS + api_time`). */ + endS?: number; + model?: string; + in: number; + out: number; + /** Input tokens served from the prefix cache (≤ in). */ + cached: number; + /** Input tokens that must be (re)computed (in - cached). */ + uncached: number; +} + +export interface SubagentNode { + kind: 'subagent'; + label: string; + agentId?: string; + /** Zero-based index of the raw top-level subagent marker. */ + rawIndex?: number; + /** Seconds from the start of the conversation. */ + startS?: number; + /** Seconds from the start of the conversation. 
*/ + endS?: number; + durationMs?: number; + in: number; + out: number; + cached: number; + uncached: number; + children: TurnNode[]; +} + +export type StructureNode = TurnNode | SubagentNode; + +export interface ConversationStructure { + blockSize: number; + nodes: StructureNode[]; + totals: { + in: number; + out: number; + cached: number; + uncached: number; + numTurns: number; + numSubagentGroups: number; + }; +} + +/** Actual model requests in a conversation: main turns plus subagent child turns. */ +export function countConversationRequests(structure: ConversationStructure): number { + return structure.totals.numTurns + subagentRequestTurns(structure).length; +} + +/** Model requests issued by inner subagents, excluding all parent-agent turns. */ +export function subagentRequestTurns(structure: ConversationStructure): TurnNode[] { + return structure.nodes.flatMap((node) => (node.kind === 'subagent' ? node.children : [])); +} + +const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent => + (e as RawWekaSubagent).type === 'subagent'; + +/** + * Count contiguous leading hash_ids already present in `seen` + * (port of aiperf `_count_seen_prefix_blocks`). + */ +export function countSeenPrefixBlocks( + hashIds: readonly number[], + seen: ReadonlySet, +): number { + let hits = 0; + for (const h of hashIds) { + if (!seen.has(h)) break; + hits += 1; + } + return hits; +} + +/** + * Compute the {cached, uncached} input-token split for one request and fold its + * blocks into `seen`. `cached` is derived from blocks but clamped to the + * request's effective `in` so cached+uncached === in even when the last block is + * partial (in = hash_token_count, not always a multiple of blockSize). + */ +function splitInput( + req: RawWekaRequest, + seen: Set, + blockSize: number, +): { in: number; cached: number; uncached: number } { + const input = Math.max(0, Math.round(req.in ?? 0)); + const hashIds = req.hash_ids ?? 
[]; + if (hashIds.length === 0) { + return { in: input, cached: 0, uncached: input }; + } + const cachedBlocks = countSeenPrefixBlocks(hashIds, seen); + for (const h of hashIds) seen.add(h); + const cached = Math.min(input, cachedBlocks * blockSize); + return { in: input, cached, uncached: input - cached }; +} + +function subagentLabel(s: RawWekaSubagent): string { + const base = s.subagent_type?.trim(); + return base && base.length > 0 ? base : 'Subagent'; +} + +function finiteTime(value: number | undefined): number | undefined { + return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined; +} + +function requestEndS(startS: number | undefined, apiTime: number | undefined): number | undefined { + if (startS === undefined) return undefined; + const duration = finiteTime(apiTime) ?? 0; + return startS + duration; +} + +/** Mirror aiperf's legacy-relative/current-absolute subagent timestamp handling. */ +function subagentRequestStartS( + entry: RawWekaSubagent, + request: RawWekaRequest, +): number | undefined { + const requestStart = finiteTime(request.t); + if (requestStart === undefined) return undefined; + const groupStart = finiteTime(entry.t); + if (groupStart !== undefined && requestStart + 1e-6 < groupStart) { + return groupStart + requestStart; + } + return requestStart; +} + +function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } { + const children = entry.requests ?? []; + const childStarts = children + .map((child) => subagentRequestStartS(entry, child)) + .filter((value): value is number => value !== undefined); + const startS = + finiteTime(entry.t) ?? (childStarts.length > 0 ? 
Math.min(...childStarts) : undefined); + const durationMs = finiteTime(entry.duration_ms); + if (startS !== undefined && durationMs !== undefined) { + return { startS, endS: startS + durationMs / 1000 }; + } + + const childEnds = children + .map((child) => { + const childStart = subagentRequestStartS(entry, child); + if (childStart === undefined) return undefined; + return childStart + (finiteTime(child.api_time) ?? 0); + }) + .filter((value): value is number => value !== undefined); + return { + startS, + endS: childEnds.length > 0 ? Math.max(...childEnds) : startS, + }; +} + +/** + * Build the flamegraph structure for one conversation. Main turns share a single + * accumulating prefix-cache `seen` set; each subagent group runs against a + * *copy* of the parent `seen` at spawn (its context is separate and is not + * folded back into the parent), mirroring the weka loader's parent/child split. + */ +export function buildConversationStructure( + conv: RawWekaConversation, + blockSizeOverride?: number, +): ConversationStructure { + const blockSize = blockSizeOverride ?? conv.block_size ?? DEFAULT_BLOCK_SIZE; + const seen = new Set(); + const nodes: StructureNode[] = []; + let totalIn = 0; + let totalOut = 0; + let totalCached = 0; + let totalUncached = 0; + let numTurns = 0; + let numSubagentGroups = 0; + let turnIndex = 0; + + for (const [idx, entry] of (conv.requests ?? []).entries()) { + if (isSubagent(entry)) { + const { startS, endS } = subagentTimeRange(entry); + const childSeen = new Set(seen); // snapshot at spawn; not merged back + const children: TurnNode[] = []; + let gin = 0; + let gout = 0; + let gcached = 0; + let guncached = 0; + for (const [innerIdx, inner] of (entry.requests ?? []).entries()) { + const split = splitInput(inner, childSeen, blockSize); + const out = Math.max(0, Math.round(inner.out ?? 
0)); + const childStartS = subagentRequestStartS(entry, inner); + children.push({ + kind: 'turn', + turnIndex: turnIndex++, + rawIndex: idx, + innerIndex: innerIdx, + startS: childStartS, + endS: requestEndS(childStartS, inner.api_time), + model: inner.model, + in: split.in, + out, + cached: split.cached, + uncached: split.uncached, + }); + gin += split.in; + gout += out; + gcached += split.cached; + guncached += split.uncached; + } + nodes.push({ + kind: 'subagent', + label: subagentLabel(entry), + agentId: entry.agent_id, + rawIndex: idx, + startS, + endS, + durationMs: entry.duration_ms, + in: gin, + out: gout, + cached: gcached, + uncached: guncached, + children, + }); + numSubagentGroups += 1; + totalIn += gin; + totalOut += gout; + totalCached += gcached; + totalUncached += guncached; + } else { + const split = splitInput(entry, seen, blockSize); + const out = Math.max(0, Math.round(entry.out ?? 0)); + const startS = finiteTime(entry.t); + nodes.push({ + kind: 'turn', + turnIndex: turnIndex++, + rawIndex: idx, + startS, + endS: requestEndS(startS, entry.api_time), + model: entry.model, + in: split.in, + out, + cached: split.cached, + uncached: split.uncached, + }); + numTurns += 1; + totalIn += split.in; + totalOut += out; + totalCached += split.cached; + totalUncached += split.uncached; + } + } + + return { + blockSize, + nodes, + totals: { + in: totalIn, + out: totalOut, + cached: totalCached, + uncached: totalUncached, + numTurns, + numSubagentGroups, + }, + }; +} + +// ── Distribution binning (for the dataset-detail cards) ────────────────────── +// The implementations moved to distribution-stats.ts (generic, dataset-agnostic +// math); re-exported here because this module is the established import path +// for the dataset ingest/backfill scripts and the frontend. 
+ +export { + linearHistogram, + logHistogram, + logHistogramWithZero, + summarizeValues, + type HistogramBin, + type NumberSummary, +} from './distribution-stats'; From 28b78cdb22ed64ca385401003271b5ed310857ab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:11:35 -0500 Subject: [PATCH 04/40] feat(db): agentic query layer, backfill CLIs, weka dataset ingest --- packages/db/src/backfill-agentic-intvty.ts | 107 +++++ .../db/src/backfill-agentic-server-logs.ts | 215 +++++++++ packages/db/src/backfill-aggregate-stats.ts | 126 ++++++ packages/db/src/backfill-chart-series.ts | 124 ++++++ packages/db/src/backfill-dataset-stats.ts | 111 +++++ packages/db/src/backfill-kv-pool.ts | 103 +++++ packages/db/src/backfill-request-timeline.ts | 97 ++++ packages/db/src/ingest-ci-run.ts | 187 ++++++-- packages/db/src/ingest-gcs-backup.ts | 9 +- packages/db/src/ingest-supplemental.ts | 14 +- packages/db/src/ingest-weka-dataset.ts | 416 ++++++++++++++++++ .../src/json-provider.line-single-run.test.ts | 26 +- packages/db/src/json-provider.ts | 17 +- packages/db/src/lib/backfill-runner.test.ts | 55 +++ packages/db/src/lib/backfill-runner.ts | 98 +++++ packages/db/src/lib/github-artifacts.test.ts | 42 ++ packages/db/src/lib/github-artifacts.ts | 86 ++++ .../db/src/queries/agentic-aggregates.test.ts | 113 +++++ packages/db/src/queries/agentic-aggregates.ts | 406 +++++++++++++++++ packages/db/src/queries/agentic-shared.ts | 81 ++++ packages/db/src/queries/benchmark-siblings.ts | 169 +++++++ packages/db/src/queries/benchmarks.ts | 33 +- packages/db/src/queries/datasets.test.ts | 102 +++++ packages/db/src/queries/datasets.ts | 213 +++++++++ .../queries/derived-agentic-metrics.test.ts | 111 +++++ .../db/src/queries/derived-agentic-metrics.ts | 268 +++++++++++ .../db/src/queries/request-timeline.test.ts | 45 ++ packages/db/src/queries/request-timeline.ts | 64 +++ packages/db/src/queries/trace-availability.ts | 34 ++ .../db/src/queries/trace-histograms.test.ts | 78 ++++ 
packages/db/src/queries/trace-histograms.ts | 134 ++++++ .../src/queries/trace-server-metrics.test.ts | 105 +++++ .../db/src/queries/trace-server-metrics.ts | 211 +++++++++ packages/db/src/queries/workflow-info.ts | 15 +- 34 files changed, 3944 insertions(+), 71 deletions(-) create mode 100644 packages/db/src/backfill-agentic-intvty.ts create mode 100644 packages/db/src/backfill-agentic-server-logs.ts create mode 100644 packages/db/src/backfill-aggregate-stats.ts create mode 100644 packages/db/src/backfill-chart-series.ts create mode 100644 packages/db/src/backfill-dataset-stats.ts create mode 100644 packages/db/src/backfill-kv-pool.ts create mode 100644 packages/db/src/backfill-request-timeline.ts create mode 100644 packages/db/src/ingest-weka-dataset.ts create mode 100644 packages/db/src/lib/backfill-runner.test.ts create mode 100644 packages/db/src/lib/backfill-runner.ts create mode 100644 packages/db/src/lib/github-artifacts.test.ts create mode 100644 packages/db/src/lib/github-artifacts.ts create mode 100644 packages/db/src/queries/agentic-aggregates.test.ts create mode 100644 packages/db/src/queries/agentic-aggregates.ts create mode 100644 packages/db/src/queries/agentic-shared.ts create mode 100644 packages/db/src/queries/benchmark-siblings.ts create mode 100644 packages/db/src/queries/datasets.test.ts create mode 100644 packages/db/src/queries/datasets.ts create mode 100644 packages/db/src/queries/derived-agentic-metrics.test.ts create mode 100644 packages/db/src/queries/derived-agentic-metrics.ts create mode 100644 packages/db/src/queries/request-timeline.test.ts create mode 100644 packages/db/src/queries/request-timeline.ts create mode 100644 packages/db/src/queries/trace-availability.ts create mode 100644 packages/db/src/queries/trace-histograms.test.ts create mode 100644 packages/db/src/queries/trace-histograms.ts create mode 100644 packages/db/src/queries/trace-server-metrics.test.ts create mode 100644 packages/db/src/queries/trace-server-metrics.ts 
diff --git a/packages/db/src/backfill-agentic-intvty.ts b/packages/db/src/backfill-agentic-intvty.ts new file mode 100644 index 00000000..a8eebdba --- /dev/null +++ b/packages/db/src/backfill-agentic-intvty.ts @@ -0,0 +1,107 @@
/**
 * Backfill: enforce the slow-tail interactivity invariant on agentic rows.
 *
 * Agentic trace-replay artifacts emit both `*_itl` and `*_intvty`. Historically
 * the harness wrote `*_intvty = 1/p(ITL)` (slow-tail — "interactivity at the
 * p-th latency"), which is what the inference chart's interactivity selector
 * and the detail time-series both assume. A later "timing fix" harness started
 * emitting `*_intvty = p(1/ITL)` instead (fast-tail — equivalent to
 * `1/p(100-x)(ITL)`), because taking the reciprocal reverses percentile order.
 * Ingest stores every metric verbatim, so those runs landed in the DB with the
 * opposite definition — e.g. p90 reading 23.9 instead of 11.2 for the same
 * point — contaminating cross-run Pareto comparisons.
 *
 * This rewrites `mean/p75/p90/p95 _intvty = 1/_itl` for every agentic row so the
 * stored value always matches the slow-tail definition the charts use. It is
 * idempotent: rows already on the correct definition are left untouched (guarded
 * by a relative-deviation check). `std_intvty` is intentionally NOT touched —
 * the reciprocal of a standard deviation is meaningless, and the API strips it.
 * The prior fast-tail value is discarded on purpose (p10_itl isn't stored, so it
 * isn't recoverable anyway, and per project policy fast-tail must not back a
 * slow-tail selector).
 *
 * Usage:
 *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes
 */

import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js';

// Percentile-style keys whose interactivity is the reciprocal of the matching
// ITL percentile. `std` is excluded by design (not a reciprocal); `median`/`p99`
// are absent from agentic artifacts so they never appear here.
const KEYS = ['mean', 'p75', 'p90', 'p95'] as const;

// Relative tolerance: skip rows already within 1e-6 of 1/itl so correct rows
// keep their original full-precision value and the change counts are accurate.
const REL_TOL = 1e-6;

const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });

/**
 * SQL predicate selecting agentic rows whose `<k>_intvty` deviates from
 * `1 / <k>_itl` by more than the relative tolerance. Shared by the count and
 * the UPDATE so the two can never disagree about which rows are contaminated.
 * `k` comes from the fixed trusted KEYS const — safe to interpolate.
 */
function contaminatedWhere(k: (typeof KEYS)[number]): string {
  return `
    benchmark_type = 'agentic_traces'
      AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
      AND metrics ? '${k}_intvty'
      AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
          > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
  `;
}

/** Number of contaminated agentic rows per key in KEYS. */
async function contaminationCounts(): Promise<Record<string, number>> {
  const out: Record<string, number> = {};
  for (const k of KEYS) {
    const rows = await sql.unsafe(`
      SELECT count(*)::int AS n
      FROM benchmark_results
      WHERE ${contaminatedWhere(k)}
    `);
    out[k] = (rows[0] as unknown as { n: number }).n;
  }
  return out;
}

/**
 * Report contamination, confirm (unless `--yes`), rewrite the deviating
 * values, verify nothing still deviates, then refresh the materialized view.
 * The connection is closed on every exit path, including failures.
 */
async function main(): Promise<void> {
  try {
    const total = await sql<{ n: number }[]>`
      SELECT count(*)::int AS n FROM benchmark_results WHERE benchmark_type = 'agentic_traces'
    `;
    console.log(`Agentic rows: ${total[0]!.n}`);

    const before = await contaminationCounts();
    console.log('Contaminated (intvty != 1/itl) before:', JSON.stringify(before));
    if (KEYS.every((k) => before[k] === 0)) {
      console.log('Nothing to backfill — all agentic rows already satisfy intvty = 1/itl.');
      return;
    }

    if (!hasYesFlag() && !(await confirm('Rewrite *_intvty = 1/*_itl for these rows? (y/N) '))) {
      return;
    }

    let totalUpdated = 0;
    for (const k of KEYS) {
      const res = await sql.unsafe(`
        UPDATE benchmark_results
        SET metrics = jsonb_set(metrics, '{${k}_intvty}', to_jsonb(1.0 / (metrics->>'${k}_itl')::numeric))
        WHERE ${contaminatedWhere(k)}
      `);
      console.log(`  ${k}_intvty: updated ${res.count} row(s)`);
      totalUpdated += res.count;
    }

    const after = await contaminationCounts();
    console.log('Contaminated after:', JSON.stringify(after));
    if (!KEYS.every((k) => after[k] === 0)) {
      throw new Error('Backfill incomplete — some rows still deviate. Aborting before MV refresh.');
    }

    await refreshLatestBenchmarks(sql);
    console.log(`Done. Rewrote ${totalUpdated} metric value(s) across agentic rows.`);
  } finally {
    await sql.end();
  }
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});
diff --git a/packages/db/src/backfill-agentic-server-logs.ts b/packages/db/src/backfill-agentic-server-logs.ts new file mode 100644 index 00000000..37157861 --- /dev/null +++ b/packages/db/src/backfill-agentic-server-logs.ts @@ -0,0 +1,215 @@ +/** + * Backfill server logs (and the derived KV-cache pool size) for AGENTIC + * benchmark points. + * + * Agentic runs upload their vLLM server log as a `server_logs_` artifact, + * but the ingest path historically failed to link it to agentic rows (the + * `bmk_agentic_` → `server_logs_` key mismatch, now fixed in + * ingest-ci-run). As a result the agentic server log text was never stored, so + * `kv_cache_pool_tokens` cannot be derived from the DB — we must re-fetch the + * artifacts from GitHub. + * + * For each agentic workflow run this: + * 1. lists the run's artifacts and keeps only `server_logs_*` + `bmk_agentic_*` + * (dedup by logical name, mirroring ingest's runner-suffix collapse), + * 2. downloads + unzips just those (small — skips the multi-MB trace dirs), + * 3. maps each `bmk_agentic_` JSON → config → benchmark_results rows via + * the same mapBenchmarkRow/config-cache logic ingest uses, + * 4.
calls insertServerLog(), which stores+links the log AND derives + * `kv_cache_pool_tokens` into benchmark_results.metrics. + * + * Idempotent: insertServerLog only links rows whose server_log_id is null. + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-server-logs + * [--limit N] only process the first N workflow runs + * [--yes] skip the confirmation prompt + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { hasNoSslFlag } from './cli-utils'; +import { insertServerLog } from './etl/benchmark-ingest'; +import { mapBenchmarkRow } from './etl/benchmark-mapper'; +import { createConfigCache } from './etl/config-cache'; +import { createAdminSql } from './etl/db-utils'; +import { createSkipTracker } from './etl/skip-tracker'; +import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner'; +import { + RUNNER_SUFFIX_RE, + dedupeArtifactsByLogicalName, + downloadArtifact, + listRunArtifacts, + type ArtifactMeta, +} from './lib/github-artifacts'; + +const REPO = 'SemiAnalysisAI/InferenceX'; + +const flags = parseLimitForceFlags(); +const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); + +/** + * List the run's `server_logs_*` / `bmk_agentic_*` artifacts, deduped by + * runner-suffix-stripped logical name (matches ingest's collapse). + */ +function listArtifacts(githubRunId: string): Map { + return dedupeArtifactsByLogicalName( + listRunArtifacts(REPO, githubRunId).filter( + (a) => a.name.startsWith('server_logs_') || a.name.startsWith('bmk_agentic_'), + ), + ); +} + +/** Logical key shared by a server_logs_/bmk_agentic_ artifact pair. */ +function logicalKey(name: string): string { + return name + .replace(/^server_logs_/u, '') + .replace(/^bmk_agentic_/u, '') + .replace(RUNNER_SUFFIX_RE, ''); +} + +/** + * Read up to `maxBytes` of a (possibly huge) server log as UTF-8, stripping NUL + * bytes. 
vLLM's "GPU KV cache size" startup lines are near the top, so a head + * read is enough to derive the KV pool — and it caps storage for the rare + * multi-hundred-MB logs that exceed V8's ~512 MB string limit. + */ +const stripNul = (s: string): string => s.replaceAll(String.fromCodePoint(0), ''); + +function readServerLogCapped(p: string, maxBytes = 64 * 1024 * 1024): string { + if (fs.statSync(p).size <= maxBytes) return stripNul(fs.readFileSync(p, 'utf8')); + const fd = fs.openSync(p, 'r'); + try { + const buf = Buffer.allocUnsafe(maxBytes); + const n = fs.readSync(fd, buf, 0, maxBytes, 0); + return stripNul(buf.subarray(0, n).toString('utf8')); + } finally { + fs.closeSync(fd); + } +} + +function findJsonFiles(dir: string): string[] { + const out: string[] = []; + const walk = (d: string) => { + for (const e of fs.readdirSync(d, { withFileTypes: true })) { + const p = path.join(d, e.name); + if (e.isDirectory()) walk(p); + else if (e.name.endsWith('.json')) out.push(p); + } + }; + walk(dir); + return out; +} + +async function main(): Promise { + console.log('=== backfill-agentic-server-logs ==='); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Agentic workflow runs that still have unlinked server logs. + const runs = await sql<{ github_run_id: string; workflow_run_id: number }[]>` + select distinct wr.github_run_id::text as github_run_id, wr.id as workflow_run_id + from benchmark_results br + join workflow_runs wr on wr.id = br.workflow_run_id + where br.benchmark_type = 'agentic_traces' + and br.server_log_id is null + order by wr.id + ${flags.limit ? 
sql`limit ${flags.limit}` : sql``} + `; + + if (runs.length === 0) { + console.log('\n Nothing to do — all agentic rows already have a server log.'); + return; + } + if (!(await confirmProceed(`${runs.length} agentic workflow run(s) to process.`))) return; + + const cache = createConfigCache(sql); + await cache.preloadConfigs(); + const tracker = createSkipTracker(); + + let linkedRows = 0; + let runsOk = 0; + let runsFailed = 0; + const t0 = Date.now(); + + for (const { github_run_id: githubRunId, workflow_run_id: wrId } of runs) { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `kvpool-${githubRunId}-`)); + try { + const artifacts = listArtifacts(githubRunId); + // server log path by logical key + const serverLogByKey = new Map(); + const bmkDirs: string[] = []; + for (const art of artifacts.values()) { + const dir = downloadArtifact(art, tmp); + if (art.name.startsWith('server_logs_')) { + const logPath = path.join(dir, 'server.log'); + if (fs.existsSync(logPath)) serverLogByKey.set(logicalKey(art.name), logPath); + } else { + bmkDirs.push(dir); + } + } + + let runLinked = 0; + for (const bmkDir of bmkDirs) { + const key = logicalKey(path.basename(bmkDir)); + const logPath = serverLogByKey.get(key); + if (!logPath) continue; + for (const file of findJsonFiles(bmkDir)) { + let raw: unknown; + try { + raw = JSON.parse(fs.readFileSync(file, 'utf8')); + } catch { + continue; + } + const rows = Array.isArray(raw) ? 
raw : [raw]; + for (const row of rows) { + if (!row || typeof row !== 'object') continue; + const mapped = mapBenchmarkRow(row as Record, tracker); + if (!mapped || mapped.benchmarkType !== 'agentic_traces') continue; + const configId = await cache.getOrCreateConfig(mapped.config); + const ids = await sql<{ id: number }[]>` + select id from benchmark_results + where workflow_run_id = ${wrId} + and config_id = ${configId} + and conc = ${mapped.conc} + and benchmark_type = 'agentic_traces' + and server_log_id is null + `; + if (ids.length === 0) continue; + const serverLog = readServerLogCapped(logPath); + await insertServerLog( + sql, + ids.map((r) => r.id), + serverLog, + ); + runLinked += ids.length; + } + } + } + linkedRows += runLinked; + runsOk++; + const elapsed = Math.round((Date.now() - t0) / 1000); + console.log( + ` ✓ run ${githubRunId}: ${serverLogByKey.size} log(s), linked ${runLinked} row(s) ` + + `(${runsOk}/${runs.length}, ${elapsed}s total)`, + ); + } catch (error) { + runsFailed++; + console.error( + ` ✗ run ${githubRunId}: ${error instanceof Error ? (error.stack ?? error.message) : String(error)}`, + ); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log( + `\n=== complete: ${linkedRows} row(s) linked across ${runsOk} run(s) ` + + `(${runsFailed} failed) in ${totalSec}s ===`, + ); + if (runsFailed > 0) process.exitCode = 1; +} + +runBackfillMain('backfill-agentic-server-logs', sql, main); diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts new file mode 100644 index 00000000..2e3a4038 --- /dev/null +++ b/packages/db/src/backfill-aggregate-stats.ts @@ -0,0 +1,126 @@ +/** + * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it + * or were computed by an older `STATS_VERSION`. 
+ * + * The ingest path now computes stats inline, but existing rows (and rows + * whose computation logic has since changed) still need this pass. Run after + * applying migration 008 and any time `STATS_VERSION` bumps. + * + * Strategy: + * - Stream rows one at a time (server_metrics_json_gz can be hundreds of + * MB decompressed for TP+EP / high-conc points — keeping one in memory + * at a time avoids OOM). + * - Skip rows whose stored `aggregate_stats.version` already matches. + * - Recompute via the same `computeAggregateStats()` helper the ingest + * path uses, so behavior cannot drift. + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats + * [--limit N] only process the first N candidate rows (useful for + * smoke-tests on a fresh deploy) + * [--force] recompute every row, even if version already matches + * [--yes] skip the confirmation prompt + */ + +import { hasNoSslFlag } from './cli-utils.js'; +import { + computeAggregateStats, + mergeProfileStatsUpgrade, + STATS_VERSION, + type AggregateStats, +} from './etl/compute-aggregate-stats.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { + confirmProceed, + jsonbParam, + parseLimitForceFlags, + runBackfillMain, + runPerIdBackfill, +} from './lib/backfill-runner.js'; + +const flags = parseLimitForceFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-aggregate-stats ==='); + console.log(` STATS_VERSION = ${STATS_VERSION}`); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Find candidates: rows missing stats, or whose stored version is stale. + // A bare (aggregate_stats->>'version')::int <> N is NULL (not an error) when + // the version is missing, so the row would be skipped; coalesce to -1 instead. + const candidates = flags.force + ? 
await sql<{ id: number }[]>` + select id + from agentic_trace_replay + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where aggregate_stats is null + or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION} + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return; + + await runPerIdBackfill( + candidates.map((c) => c.id), + async (id) => { + // Fetch one row at a time — the json_gz blob is the heavy field. + const [row] = await sql< + { profile_export_jsonl_gz: Buffer | null; aggregate_stats: AggregateStats | null }[] + >` + select profile_export_jsonl_gz, aggregate_stats + from agentic_trace_replay + where id = ${id} + `; + if (!row) { + console.warn(` id=${id}: row vanished, skipping`); + return 'skipped'; + } + + let stats: AggregateStats; + if (row.aggregate_stats?.version === 3) { + const profileStats = await computeAggregateStats({ + profileBlob: row.profile_export_jsonl_gz, + serverBlob: null, + }); + stats = mergeProfileStatsUpgrade(row.aggregate_stats, profileStats); + } else { + const [serverRow] = await sql<{ server_metrics_json_gz: Buffer | null }[]>` + select server_metrics_json_gz + from agentic_trace_replay + where id = ${id} + `; + stats = await computeAggregateStats({ + profileBlob: row.profile_export_jsonl_gz, + serverBlob: serverRow?.server_metrics_json_gz ?? 
null, + }); + } + + await sql` + update agentic_trace_replay + set aggregate_stats = ${jsonbParam(sql, stats)} + where id = ${id} + `; + return 'ok'; + }, + ); +} + +runBackfillMain('backfill-aggregate-stats', sql, main); diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts new file mode 100644 index 00000000..94e009cf --- /dev/null +++ b/packages/db/src/backfill-chart-series.ts @@ -0,0 +1,124 @@ +/** + * Backfill `agentic_trace_replay.chart_series` for rows that are missing it + * or were computed by an older `CHART_SERIES_VERSION`. + * + * The ingest path now computes the time-series inline, but existing rows + * (and rows whose computation logic has since changed) still need this + * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION` + * bumps. + * + * Strategy: + * - Stream rows one at a time (server_metrics_json_gz can decompress + * past 500 MB on high-conc TP+EP points — one in memory at a time + * avoids OOM). + * - Skip rows whose stored version already matches. + * - Recompute via the same `computeChartSeries()` helper the ingest + * path uses, so behavior cannot drift. 
+ * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series + * [--limit N] only process the first N candidate rows + * [--force] recompute every row, even if version already matches + * [--yes] skip the confirmation prompt + */ + +import { hasNoSslFlag } from './cli-utils.js'; +import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { + confirmProceed, + jsonbParam, + parseLimitForceFlags, + runBackfillMain, + runPerIdBackfill, +} from './lib/backfill-runner.js'; + +const flags = parseLimitForceFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-chart-series ==='); + console.log(` CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Only rows that actually have a server_metrics blob can produce a + // chart_series. Rows without the blob legitimately keep `chart_series` + // null and the API serves them via the slow path (which also returns + // null because there's no blob to parse — so the page falls into the + // "no stored trace_replay blob" branch). + const candidates = flags.force + ? await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where server_metrics_json_gz is not null + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where server_metrics_json_gz is not null + and ( + chart_series is null + or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION} + ) + order by id + ${flags.limit ? 
sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return; + + await runPerIdBackfill( + candidates.map((c) => c.id), + async (id) => { + const [row] = await sql< + { + server_metrics_json_gz: Buffer | null; + framework: string | null; + disagg: boolean | null; + }[] + >` + select atr.server_metrics_json_gz, source.framework, source.disagg + from agentic_trace_replay atr + left join lateral ( + select c.framework, c.disagg + from benchmark_results br + join configs c on c.id = br.config_id + where br.trace_replay_id = atr.id + order by br.id + limit 1 + ) source on true + where atr.id = ${id} + `; + if (!row) { + console.warn(` id=${id}: row vanished, skipping`); + return 'skipped'; + } + + const series = await computeChartSeries(row.server_metrics_json_gz, { + framework: row.framework, + disagg: row.disagg ?? false, + }); + + await sql` + update agentic_trace_replay + set chart_series = ${series === null ? null : jsonbParam(sql, series)} + where id = ${id} + `; + return 'ok'; + }, + ); +} + +runBackfillMain('backfill-chart-series', sql, main); diff --git a/packages/db/src/backfill-dataset-stats.ts b/packages/db/src/backfill-dataset-stats.ts new file mode 100644 index 00000000..e9c6916d --- /dev/null +++ b/packages/db/src/backfill-dataset-stats.ts @@ -0,0 +1,111 @@ +/** + * Backfill dataset summary stats and subagent-only ISL/OSL distributions from + * the compact structures already stored in `dataset_conversations`. 
+ * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-dataset-stats --yes + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils'; +import { createAdminSql } from './etl/db-utils'; +import { logHistogram, summarizeValues } from './etl/weka-structure'; +import { jsonbParam, runBackfillMain } from './lib/backfill-runner'; + +interface DatasetRow { + id: string; + slug: string; + summary: Record; + chart_data: Record; +} + +interface ConversationRow { + num_subagent_groups: number | string; + request_count: number | string; +} + +interface SubagentRequestRow { + input_tokens: number | string; + output_tokens: number | string; +} + +const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); + +async function main(): Promise { + const datasets = await sql` + select id, slug, summary, chart_data + from datasets + order by slug + `; + if (datasets.length === 0) { + console.log('No datasets found.'); + return; + } + + console.log(`Backfill subagent dataset stats for ${datasets.length} dataset(s).`); + if (!hasYesFlag() && !(await confirm('Continue? 
(y/N) '))) return; + + for (const dataset of datasets) { + const conversations = await sql` + select + num_subagent_groups, + ( + num_turns + coalesce(( + select sum(jsonb_array_length(node.value->'children')) + from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value) + where node.value->>'kind' = 'subagent' + ), 0) + ) as request_count + from dataset_conversations dc + where dataset_id = ${dataset.id} + `; + const requests = await sql` + select + (child.value->>'in')::double precision as input_tokens, + (child.value->>'out')::double precision as output_tokens + from dataset_conversations dc + cross join lateral jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value) + cross join lateral jsonb_array_elements(coalesce(node.value->'children', '[]'::jsonb)) child(value) + where dc.dataset_id = ${dataset.id} + and node.value->>'kind' = 'subagent' + `; + + const subagentsPerTrace = conversations.map((row) => Number(row.num_subagent_groups)); + const requestsPerConversation = conversations.map((row) => Number(row.request_count)); + const inputTokens = requests.map((row) => Number(row.input_tokens)); + const outputTokens = requests.map((row) => Number(row.output_tokens)); + const subagentStats = summarizeValues(subagentsPerTrace); + const requestStats = summarizeValues(requestsPerConversation); + const summary = { + ...dataset.summary, + version: 3, + meanSubagentsPerTrace: subagentStats.mean, + medianSubagentsPerTrace: subagentStats.median, + meanRequestsPerConversation: requestStats.mean, + medianRequestsPerConversation: requestStats.median, + }; + const chartData = { + ...dataset.chart_data, + version: 3, + subagentInputTokensPerRequest: { + bins: logHistogram(inputTokens), + stats: summarizeValues(inputTokens), + }, + subagentOutputTokensPerRequest: { + bins: logHistogram(outputTokens), + stats: summarizeValues(outputTokens), + }, + }; + + await sql` + update datasets + set summary = ${sql.json(summary)}, + chart_data 
= ${jsonbParam(sql, chartData)} + where id = ${dataset.id} + `; + console.log( + ` ${dataset.slug}: ${requests.length.toLocaleString()} inner requests, median ${subagentStats.median}, mean ${subagentStats.mean.toFixed(1)} subagents/trace`, + ); + } +} + +runBackfillMain('backfill-dataset-stats', sql, main); diff --git a/packages/db/src/backfill-kv-pool.ts b/packages/db/src/backfill-kv-pool.ts new file mode 100644 index 00000000..efa04c81 --- /dev/null +++ b/packages/db/src/backfill-kv-pool.ts @@ -0,0 +1,103 @@ +/** + * Backfill `benchmark_results.metrics->kv_cache_pool_tokens` from the captured + * server logs. The value is parsed from vLLM's authoritative + * "GPU KV cache size: N tokens" startup line(s), summed across data-parallel + * engine cores (see {@link kvCachePoolTokensFromServerLog}). + * + * The ingest path now derives this inline in `insertServerLog`, but existing + * rows need this one-time pass. Idempotent: re-running only touches rows that + * still lack the value (unless --force). + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-kv-pool + * [--limit N] only process the first N candidate server logs + * [--force] recompute even when the value is already set + * [--yes] skip the confirmation prompt + */ + +import { hasNoSslFlag } from './cli-utils.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { kvCachePoolTokensFromServerLog } from './etl/server-log-metrics.js'; +import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner.js'; + +const flags = parseLimitForceFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-kv-pool ==='); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // One server log can be linked to several benchmark_results (multiple + // concurrency points share a server). 
Group by log id so we parse each log + // once and fan the value out to all its rows. + const candidates = flags.force + ? await sql<{ server_log_id: number }[]>` + select distinct server_log_id + from benchmark_results + where server_log_id is not null + order by server_log_id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ server_log_id: number }[]>` + select distinct server_log_id + from benchmark_results + where server_log_id is not null + and metrics->>'kv_cache_pool_tokens' is null + order by server_log_id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + if (!(await confirmProceed(`${candidates.length} candidate server log(s).`))) return; + + let updated = 0; + let logsWithValue = 0; + let logsNoValue = 0; + let failed = 0; + const t0 = Date.now(); + for (const { server_log_id: logId } of candidates) { + try { + const [row] = await sql<{ server_log: string | null }[]>` + select server_log from server_logs where id = ${logId} + `; + const tokens = kvCachePoolTokensFromServerLog(row?.server_log ?? null); + if (tokens === null) { + logsNoValue++; + continue; // non-vLLM or no startup line — leave unset + } + logsWithValue++; + const targets = flags.force + ? sql`server_log_id = ${logId}` + : sql`server_log_id = ${logId} and metrics->>'kv_cache_pool_tokens' is null`; + const result = await sql` + update benchmark_results + set metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${tokens}::bigint)) + where ${targets} + `; + updated += result.count; + console.log(` ✓ log=${logId}: ${tokens.toLocaleString()} tok → ${result.count} row(s)`); + } catch (error) { + failed++; + console.error(` ✗ log=${logId}: ${error instanceof Error ? 
error.message : String(error)}`); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log( + `\n=== backfill complete: ${updated} row(s) updated from ${logsWithValue} log(s) ` + + `(${logsNoValue} log(s) had no KV-pool line, ${failed} failed) in ${totalSec}s ===`, + ); + if (failed > 0) process.exitCode = 1; +} + +runBackfillMain('backfill-kv-pool', sql, main); diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts new file mode 100644 index 00000000..09126654 --- /dev/null +++ b/packages/db/src/backfill-request-timeline.ts @@ -0,0 +1,97 @@ +/** + * Backfill `agentic_trace_replay.request_timeline` for rows that are + * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`. + * + * The ingest path now computes the timeline inline, but existing rows + * (and rows whose computation logic has since changed) still need this + * pass. Run after applying migration 010 and any time the version bumps. + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline + * [--limit N] only process the first N candidate rows + * [--force] recompute every row, even if version already matches + * [--yes] skip the confirmation prompt + */ + +import { hasNoSslFlag } from './cli-utils.js'; +import { + REQUEST_TIMELINE_VERSION, + computeRequestTimeline, +} from './etl/compute-request-timeline.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { + confirmProceed, + jsonbParam, + parseLimitForceFlags, + runBackfillMain, + runPerIdBackfill, +} from './lib/backfill-runner.js'; + +const flags = parseLimitForceFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-request-timeline ==='); + console.log(` REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 
'none'}`); + + // Only rows with a profile_export blob can produce a timeline. Rows + // without the blob keep `request_timeline` null and the API serves them + // as "no timeline data". + const candidates = flags.force + ? await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where profile_export_jsonl_gz is not null + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where profile_export_jsonl_gz is not null + and ( + request_timeline is null + or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION} + ) + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + if (!(await confirmProceed(`${candidates.length} candidate row(s).`))) return; + + await runPerIdBackfill( + candidates.map((c) => c.id), + async (id) => { + const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>` + select profile_export_jsonl_gz + from agentic_trace_replay + where id = ${id} + `; + if (!row) { + console.warn(` id=${id}: row vanished, skipping`); + return 'skipped'; + } + const timeline = computeRequestTimeline(row.profile_export_jsonl_gz); + await sql` + update agentic_trace_replay + set request_timeline = ${timeline === null ? null : jsonbParam(sql, timeline)} + where id = ${id} + `; + return 'ok'; + }, + ); +} + +runBackfillMain('backfill-request-timeline', sql, main); diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index cb222a86..d23a8f63 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -21,7 +21,6 @@ * original source sweep run, so public links point at the real benchmark run. 
*/ -import { execSync } from 'child_process'; import fs from 'fs'; import os from 'os'; import path from 'path'; @@ -29,6 +28,12 @@ import path from 'path'; import { GPU_KEYS } from '@semianalysisai/inferencex-constants'; import { hasNoSslFlag } from './cli-utils'; +import { + dedupeArtifactsByLogicalName, + downloadArtifact, + fetchRunAttempt, + listRunArtifacts, +} from './lib/github-artifacts'; import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils'; import { isRunAttemptPurged } from './etl/run-overrides'; import { createSkipTracker } from './etl/skip-tracker'; @@ -45,6 +50,9 @@ import { bulkUpsertAvailability, insertServerLog, } from './etl/benchmark-ingest'; +import { insertTraceReplay } from './etl/trace-replay-ingest'; +import { discoverTraceReplayArtifacts } from './etl/trace-artifact-discovery'; +import { datasetSlugFromBenchmarkRow } from './etl/dataset-provenance'; import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper'; import { ingestEvalRow } from './etl/eval-ingest'; import { mapEvalSamples } from './etl/eval-samples-mapper'; @@ -95,48 +103,20 @@ if (isDownloadMode) { console.log(` Repo: ${REPO}`); console.log(`\n--- Downloading artifacts to ${artifactsDir} ---`); - const artifactListJson = execSync( - `gh api "repos/${REPO}/actions/runs/${runIdStr}/artifacts" --paginate --jq '.artifacts[]'`, - { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 }, - ); - - const allArtifacts: { name: string; archive_download_url: string; created_at: string }[] = []; - for (const line of artifactListJson.trim().split('\n')) { - if (!line) continue; - try { - const parsed = JSON.parse(line); - allArtifacts.push(parsed); - } catch {} - } - - const byName = new Map(); - for (const a of allArtifacts) { - const existing = byName.get(a.name); - if (!existing || a.created_at > existing.created_at) { - byName.set(a.name, a); - } - } + // Retried configs produce artifacts on multiple runners — keep only the + // most recent per logical name (see 
RUNNER_SUFFIX_RE in github-artifacts) + // so a failed attempt's empty metrics can't overwrite the good one via + // ON CONFLICT DO UPDATE. + const byLogical = dedupeArtifactsByLogicalName(listRunArtifacts(REPO, runIdStr)); - for (const [name, artifact] of byName) { - console.log(` ${name}`); - const zipPath = path.join(artifactsDir, 'artifact.zip'); - execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { - stdio: ['pipe', 'pipe', 'inherit'], - }); - const destDir = path.join(artifactsDir, name); - fs.mkdirSync(destDir, { recursive: true }); - execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' }); - fs.unlinkSync(zipPath); + for (const artifact of byLogical.values()) { + console.log(` ${artifact.name}`); + downloadArtifact(artifact, artifactsDir); } - console.log(`\n Downloaded ${byName.size} artifact(s)`); + console.log(`\n Downloaded ${byLogical.size} artifact(s)`); - // Fetch run attempt from API - const attemptStr = execSync( - `gh api "repos/${REPO}/actions/runs/${runIdStr}" --jq '.run_attempt'`, - { encoding: 'utf8' }, - ).trim(); - runAttemptNum = parseInt(attemptStr || '1', 10); + runAttemptNum = fetchRunAttempt(REPO, runIdStr); } else { // CI mode — read from env vars for (const key of [ @@ -194,6 +174,14 @@ const ARTIFACT_NAMES = { changelog: 'changelog-metadata', } as const; +/** + * Strip the `bmk_` and/or `agentic_` prefixes from an artifact directory name + * so the bare suffix becomes a shared key between `bmk_agentic_` and + * its sibling `agentic_` artifact. 
+ */ +const stripBmkAndAgenticPrefix = (s: string): string => + s.replace(/^bmk_/u, '').replace(/^agentic_/u, ''); + function readJson(filePath: string): unknown { try { return JSON.parse(fs.readFileSync(filePath, 'utf8')); @@ -294,13 +282,14 @@ async function main(): Promise { const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; let totalNewBmk = 0, @@ -311,6 +300,11 @@ async function main(): Promise { let totalSamples = 0; let totalSampleFiles = 0; let totalChangelogs = 0; + let totalTraceReplayLinked = 0; + const datasetSlugs = new Set(); + // Dataset slugs referenced by this run's agentic rows but absent from the + // `datasets` table — timeline→dataset deep links 404 until they're ingested. + const missingDatasets = new Set(); // ── Check for evals-only flag in changelog ──────────────────────────── const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog); @@ -355,8 +349,13 @@ async function main(): Promise { if (fs.existsSync(artifactsDir)) { for (const d of fs.readdirSync(artifactsDir)) { if (!d.startsWith('server_logs_')) continue; - const logPath = path.join(artifactsDir, d, 'server.log'); - if (!fs.existsSync(logPath)) continue; + // feat-agentx-v1.0 harness nests the log under `results/server.log`; + // older runs keep it at the artifact root. Check both. 
+ const logPath = [ + path.join(artifactsDir, d, 'server.log'), + path.join(artifactsDir, d, 'results', 'server.log'), + ].find((p) => fs.existsSync(p)); + if (!logPath) continue; const configKey = d.replace(/^server_logs_/u, ''); serverLogPaths.set(configKey, logPath); } @@ -365,6 +364,17 @@ async function main(): Promise { console.log(` Found ${serverLogPaths.size} server log artifact(s)`); } + // Sibling aiperf artifacts: each `bmk_agentic_` is paired with an + // `agentic_` dir holding `profile_export.jsonl` and + // `server_metrics_export.csv`. The harness emits these under either a + // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current). + // Older non-aiperf agentic runs don't ship this sibling. Key on the bare + // suffix so both names map to the same Map entry. + const traceReplayPaths = discoverTraceReplayArtifacts(artifactsDir); + if (traceReplayPaths.size > 0) { + console.log(` Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`); + } + const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))]; console.log(` Found ${allBmkFiles.length} benchmark JSON file(s)`); @@ -376,6 +386,12 @@ async function main(): Promise { ? data : [data as Record]; + for (const rawRow of rawRows) { + if (!rawRow || typeof rawRow !== 'object') continue; + const datasetSlug = datasetSlugFromBenchmarkRow(rawRow); + if (datasetSlug) datasetSlugs.add(datasetSlug); + } + const rows = rawRows .filter((r) => typeof r === 'object' && r !== null) .map((r) => mapBenchmarkRow(r, tracker)) @@ -415,13 +431,21 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } const parentDir = path.basename(path.dirname(file)); if (parentDir.startsWith('bmk_') && insertedIds.length > 0) { + // Single-turn artifacts are `bmk_` paired with + // `server_logs_`. 
Agentic artifacts are `bmk_agentic_` + // but the server log is still `server_logs_` (no `agentic_` + // prefix), so fall back to the fully-stripped suffix — otherwise + // agentic rows never get their server log (and KV-pool size) linked. const configKey = parentDir.replace(/^bmk_/u, ''); - const logPath = serverLogPaths.get(configKey); + const logPath = + serverLogPaths.get(configKey) ?? + serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir)); if (logPath) { try { const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', ''); @@ -431,12 +455,49 @@ async function main(): Promise { } } } + + // Trace-replay sibling lookup for agentic points only. The aiperf + // harness emits `agentic_/trace_replay/...` next to the + // `bmk_agentic_` artifact we just ingested. + if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) { + const suffix = stripBmkAndAgenticPrefix(parentDir); + const concMatch = path.basename(file).match(/_conc(?\d+)\.json$/u); + const trace = + (concMatch?.groups?.conc + ? traceReplayPaths.get(`${suffix}|${concMatch.groups.conc}`) + : undefined) ?? traceReplayPaths.get(suffix); + if (trace) { + try { + const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null; + const metrics = trace.serverMetricsCsv + ? fs.readFileSync(trace.serverMetricsCsv) + : null; + const metricsJson = trace.serverMetricsJson + ? 
fs.readFileSync(trace.serverMetricsJson) + : null; + await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, { + framework: toInsert[0]?.config.framework, + disagg: toInsert[0]?.config.disagg, + }); + totalTraceReplayLinked += insertedIds.length; + } catch (error: any) { + tracker.recordDbError(`trace_replay for ${suffix}`, error); + } + } else { + tracker.skips.traceReplayMissing++; + } + } } catch (error: any) { tracker.recordDbError(path.basename(file), error); } } } console.log(` Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`); + if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) { + console.log( + ` Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`, + ); + } if (availRows.length > 0) { try { @@ -446,6 +507,30 @@ async function main(): Promise { tracker.recordDbError('availability', error); } } + + if (datasetSlugs.size > 1) { + throw new Error( + `Conflicting dataset provenance in workflow run ${runId}: ${[...datasetSlugs].toSorted().join(', ')}`, + ); + } + const [datasetSlug] = datasetSlugs; + if (datasetSlug) { + await sql` + insert into run_datasets (workflow_run_id, dataset_slug) + values (${workflowRunId}, ${datasetSlug}) + on conflict (workflow_run_id) do update + set dataset_slug = excluded.dataset_slug + `; + console.log(` Dataset: linked workflow run to ${datasetSlug}`); + const [known] = await sql`select 1 as ok from datasets where slug = ${datasetSlug}`; + if (!known) { + missingDatasets.add(datasetSlug); + console.warn( + ` ⚠ Dataset ${datasetSlug} is not in the datasets table — request-timeline deep links ` + + `will 404 until it is ingested (packages/db/src/ingest-weka-dataset.ts)`, + ); + } + } } // ── Ingest run stats ────────────────────────────────────────────────── @@ -654,11 +739,17 @@ async function main(): Promise { const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker; const totalSkips = - 
skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError; + skips.badZip + + skips.unmappedModel + + skips.unmappedHw + + skips.noIslOsl + + skips.failedRun + + skips.dbError; if (totalSkips > 0) { console.log(`\n Skipped: ${totalSkips} rows`); const skipLines: [string, number][] = [ ['no isl/osl (old format)', skips.noIslOsl], + ['failed run (0 successful)', skips.failedRun], ['unmapped model', skips.unmappedModel], ['unmapped hw', skips.unmappedHw], ['bad/empty zip', skips.badZip], @@ -690,7 +781,10 @@ async function main(): Promise { const unmappedOutPath = process.env.UNMAPPED_ENTITIES_OUTPUT; if ( unmappedOutPath && - (unmappedModels.size > 0 || unmappedHws.size > 0 || unmappedPrecisions.size > 0) + (unmappedModels.size > 0 || + unmappedHws.size > 0 || + unmappedPrecisions.size > 0 || + missingDatasets.size > 0) ) { fs.writeFileSync( unmappedOutPath, @@ -698,6 +792,7 @@ async function main(): Promise { models: [...unmappedModels], hardware: [...unmappedHws], precisions: [...unmappedPrecisions], + datasets: [...missingDatasets], }), ); } diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index b9f2b3b5..faa093e3 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -457,6 +457,9 @@ async function mapWorkflowDir( unmappedModel: local.skips.unmappedModel, unmappedHw: local.skips.unmappedHw, noIslOsl: local.skips.noIslOsl, + failedRun: local.skips.failedRun, + // GCS backup doesn't ingest aiperf trace files; counter stays 0. 
+ traceReplayMissing: local.skips.traceReplayMissing, }, localUnmappedModels: new Set(local.unmappedModels), localUnmappedHws: new Set(local.unmappedHws), @@ -621,13 +624,14 @@ async function main(): Promise { // Upsert availability rows only for successfully resolved configs const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const r of allInserted) { availRows.push({ @@ -639,6 +643,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } if (availRows.length > 0) { diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts index a3b62fe0..f868767e 100644 --- a/packages/db/src/ingest-supplemental.ts +++ b/packages/db/src/ingest-supplemental.ts @@ -219,8 +219,10 @@ async function ingestSupplementalBmk( const rows: { configId: number; - isl: number; - osl: number; + benchmarkType: 'single_turn' | 'agentic_traces'; + offloadMode: string; + isl: number | null; + osl: number | null; conc: number; image: string | null; metrics: Record; @@ -271,6 +273,8 @@ async function ingestSupplementalBmk( rows.push({ configId, + benchmarkType: 'single_turn', + offloadMode: 'off', isl: entry.isl, osl: entry.osl, conc: entry.conc, @@ -294,13 +298,14 @@ async function ingestSupplementalBmk( // to `rows` are exactly the valid ones. 
const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const entry of entries) { const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined }); @@ -317,6 +322,7 @@ async function ingestSupplementalBmk( framework, specMethod, disagg, + benchmarkType: 'single_turn', }); } if (availRows.length > 0) { diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts new file mode 100644 index 00000000..ed6774c0 --- /dev/null +++ b/packages/db/src/ingest-weka-dataset.ts @@ -0,0 +1,416 @@ +/** + * Ingest a HuggingFace cc-traces-weka dataset into the `datasets` + + * `dataset_conversations` tables that back the /datasets area. + * + * Public dataset, no token needed — fetched via the HF datasets-server rows API + * (rows are large, ~3.5 MB each, so we page in small chunks with adaptive + * backoff). Per conversation we build a flamegraph-ready `structure` (turns + + * subagent groups, input split into cached-prefix vs uncached) and accumulate + * dataset-level distributions for the detail cards. Raw hash_ids are discarded + * after the cached/uncached split is computed. + * + * Usage (DATABASE_WRITE_URL must be provided — never hardcoded): + * DATABASE_WRITE_URL='postgres://…' pnpm exec tsx src/ingest-weka-dataset.ts \ + * semianalysisai/cc-traces-weka-062126 [--label "…"] [--variant full|256k] \ + * [--description "…"] [--limit N] + * + * Upsert: re-running replaces the dataset's rows (delete + re-insert). + * Remember to purge the API cache afterwards (POST /api/v1/invalidate). 
+ */ + +import { createAdminSql } from './etl/db-utils'; +import { hasNoSslFlag } from './cli-utils'; +import { + buildConversationStructure, + countConversationRequests, + linearHistogram, + logHistogram, + logHistogramWithZero, + subagentRequestTurns, + summarizeValues, + type ConversationStructure, + type RawWekaConversation, + type TurnNode, +} from './etl/weka-structure'; + +const ROWS_API = 'https://datasets-server.huggingface.co/rows'; +const INFO_API = 'https://datasets-server.huggingface.co/info'; + +interface CliArgs { + dataset: string; + label?: string; + variant?: string; + description?: string; + limit?: number; +} + +function parseArgs(): CliArgs { + const argv = process.argv.slice(2); + const positional = argv.filter((a) => !a.startsWith('--')); + const dataset = positional[0]; + if (!dataset) { + console.error( + 'Usage: tsx src/ingest-weka-dataset.ts [--label …] [--variant full|256k] [--description …] [--limit N]', + ); + process.exit(1); + } + const getFlag = (name: string): string | undefined => { + const i = argv.indexOf(`--${name}`); + return i !== -1 && i + 1 < argv.length ? argv[i + 1] : undefined; + }; + const limitRaw = getFlag('limit'); + return { + dataset, + label: getFlag('label'), + variant: getFlag('variant'), + description: getFlag('description'), + limit: limitRaw ? Number(limitRaw) : undefined, + }; +} + +const sleep = (ms: number) => + new Promise((resolve) => { + setTimeout(resolve, ms); + }); + +/** + * Fetch JSON, transparently retrying on HF rate-limiting (429) and transient + * 5xx with exponential backoff. Honors a Retry-After header when present. 
+ */ +async function fetchJson(url: string, attempt = 0): Promise { + const res = await fetch(url); + if (res.status === 429 || res.status >= 500) { + if (attempt >= 6) { + throw new Error(`${res.status} ${res.statusText} after ${attempt} retries for ${url}`); + } + const retryAfter = Number(res.headers.get('retry-after')); + const waitMs = + Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1000 : 2000 * 2 ** attempt; + console.warn( + ` ${res.status} ${res.statusText}; waiting ${Math.round(waitMs / 1000)}s (attempt ${attempt + 1})`, + ); + await sleep(waitMs); + return fetchJson(url, attempt + 1); + } + if (!res.ok) { + throw new Error(`${res.status} ${res.statusText} for ${url}`); + } + return res.json(); +} + +async function getRowCount(dataset: string): Promise { + const info = (await fetchJson(`${INFO_API}?dataset=${encodeURIComponent(dataset)}`)) as { + dataset_info?: Record }>; + }; + const cfg = info.dataset_info?.['default']; + const num = cfg?.splits?.['train']?.num_examples; + return typeof num === 'number' ? num : 0; +} + +/** Page through rows with adaptive length (halve on "too big"/error). */ +async function* iterRows( + dataset: string, + total: number, + limit?: number, +): AsyncGenerator { + const cap = limit ? Math.min(limit, total) : total; + let offset = 0; + let length = 5; // ~18 MB/page at ~3.5 MB/row; backs off on failure + while (offset < cap) { + const want = Math.min(length, cap - offset); + const url = `${ROWS_API}?dataset=${encodeURIComponent(dataset)}&config=default&split=train&offset=${offset}&length=${want}`; + let payload: { rows?: { row: RawWekaConversation }[] }; + try { + payload = (await fetchJson(url)) as { rows?: { row: RawWekaConversation }[] }; + } catch (error) { + if (want > 1) { + length = Math.max(1, Math.floor(want / 2)); + console.warn( + ` page @${offset} (len ${want}) failed (${String(error)}); retrying with len ${length}`, + ); + continue; + } + throw error; + } + const rows = payload.rows ?? 
[]; + if (rows.length === 0) break; + for (const r of rows) yield r.row; + offset += rows.length; + process.stdout.write(`\r fetched ${Math.min(offset, cap)}/${cap} conversations`); + if (offset < cap) await sleep(400); // be polite to the HF datasets-server + } + process.stdout.write('\n'); +} + +interface Accumulator { + inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children) + uncachedInputPerTurn: number[]; + outputPerTurn: number[]; + cachedFractionPerTurn: number[]; // cached/in, for turns with in>0 + turnsPerConv: number[]; // main (top-level) turns + requestsPerConv: number[]; // main turns + subagent child turns + subagentInputPerRequest: number[]; + subagentOutputPerRequest: number[]; + subagentGroupsPerConv: number[]; + subagentTurnsPerGroup: number[]; + totalIn: number; + totalOut: number; + totalCached: number; + mainTurns: number; + subagentGroups: number; + subagentTurns: number; + modelCounts: Record; +} + +function newAccumulator(): Accumulator { + return { + inputPerTurn: [], + uncachedInputPerTurn: [], + outputPerTurn: [], + cachedFractionPerTurn: [], + turnsPerConv: [], + requestsPerConv: [], + subagentInputPerRequest: [], + subagentOutputPerRequest: [], + subagentGroupsPerConv: [], + subagentTurnsPerGroup: [], + totalIn: 0, + totalOut: 0, + totalCached: 0, + mainTurns: 0, + subagentGroups: 0, + subagentTurns: 0, + modelCounts: {}, + }; +} + +function recordTurn(acc: Accumulator, t: TurnNode): void { + acc.inputPerTurn.push(t.in); + acc.uncachedInputPerTurn.push(t.uncached); + acc.outputPerTurn.push(t.out); + if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in); + if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 
0) + 1; +} + +function accumulate(acc: Accumulator, s: ConversationStructure): void { + acc.totalIn += s.totals.in; + acc.totalOut += s.totals.out; + acc.totalCached += s.totals.cached; + acc.mainTurns += s.totals.numTurns; + acc.subagentGroups += s.totals.numSubagentGroups; + acc.turnsPerConv.push(s.totals.numTurns); + acc.requestsPerConv.push(countConversationRequests(s)); + for (const turn of subagentRequestTurns(s)) { + acc.subagentInputPerRequest.push(turn.in); + acc.subagentOutputPerRequest.push(turn.out); + } + acc.subagentGroupsPerConv.push(s.totals.numSubagentGroups); + for (const node of s.nodes) { + if (node.kind === 'turn') { + recordTurn(acc, node); + } else { + acc.subagentTurnsPerGroup.push(node.children.length); + acc.subagentTurns += node.children.length; + for (const child of node.children) recordTurn(acc, child); + } + } +} + +function buildChartData(acc: Accumulator) { + return { + version: 3, + inputTokensPerTurn: { + bins: logHistogram(acc.inputPerTurn), + stats: summarizeValues(acc.inputPerTurn), + }, + uncachedInputTokensPerTurn: { + bins: logHistogramWithZero(acc.uncachedInputPerTurn), + stats: summarizeValues(acc.uncachedInputPerTurn), + }, + outputTokensPerTurn: { + bins: logHistogram(acc.outputPerTurn), + stats: summarizeValues(acc.outputPerTurn), + }, + subagentInputTokensPerRequest: { + bins: logHistogram(acc.subagentInputPerRequest), + stats: summarizeValues(acc.subagentInputPerRequest), + }, + subagentOutputTokensPerRequest: { + bins: logHistogram(acc.subagentOutputPerRequest), + stats: summarizeValues(acc.subagentOutputPerRequest), + }, + turnsPerConversation: { + bins: linearHistogram(acc.turnsPerConv), + stats: summarizeValues(acc.turnsPerConv), + }, + subagentGroupsPerConversation: { + bins: linearHistogram(acc.subagentGroupsPerConv), + stats: summarizeValues(acc.subagentGroupsPerConv), + }, + cachedFractionPerTurn: { + bins: linearHistogram(acc.cachedFractionPerTurn, 20), + stats: summarizeValues(acc.cachedFractionPerTurn), + }, 
+ }; +} + +function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | null) { + const cachedPct = acc.totalIn > 0 ? acc.totalCached / acc.totalIn : 0; + const requestsPerConversation = summarizeValues(acc.requestsPerConv); + const subagentsPerTrace = summarizeValues(acc.subagentGroupsPerConv); + return { + version: 3, + blockSize, + hashIdScope, + totalIn: acc.totalIn, + totalOut: acc.totalOut, + totalCached: acc.totalCached, + cachedPct, + mainTurns: acc.mainTurns, + subagentGroups: acc.subagentGroups, + subagentTurns: acc.subagentTurns, + meanRequestsPerConversation: requestsPerConversation.mean, + medianRequestsPerConversation: requestsPerConversation.median, + meanSubagentsPerTrace: subagentsPerTrace.mean, + medianSubagentsPerTrace: subagentsPerTrace.median, + modelMix: acc.modelCounts, + }; +} + +function slugFromDataset(dataset: string): string { + return dataset.includes('/') ? dataset.slice(dataset.indexOf('/') + 1) : dataset; +} + +function inferVariant(slug: string): string { + if (slug.endsWith('-256k')) return '256k'; + if (slug.includes('no-subagent')) return 'no-subagents'; + return 'full'; +} + +function defaultLabel(slug: string): string { + // cc-traces-weka-062126 → "CC Traces Weka 062126" + return slug + .split('-') + .map((p) => (/^\d+$/u.test(p) ? p : p.toUpperCase())) + .join(' ') + .replace(/^CC TRACES WEKA/u, 'CC Traces Weka'); +} + +async function main(): Promise { + const args = parseArgs(); + const slug = slugFromDataset(args.dataset); + const variant = args.variant ?? inferVariant(slug); + const label = args.label ?? 
defaultLabel(slug); + const hfUrl = `https://huggingface.co/datasets/${args.dataset}`; + + console.log(`=== ingest-weka-dataset: ${args.dataset} ===`); + console.log(` slug=${slug} variant=${variant} label="${label}"`); + + const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1 }); + + const total = await getRowCount(args.dataset); + console.log(` ${total} conversations on HF`); + + const acc = newAccumulator(); + let blockSize = 64; + let hashIdScope: string | null = null; + + // Buffer the per-conversation rows; flush in batches to keep memory bounded. + interface ConvRow { + dataset_id: string; + conv_id: string; + models: string[]; + num_turns: number; + num_subagent_groups: number; + total_in: number; + total_out: number; + total_cached: number; + structure: ConversationStructure; + } + const pending: ConvRow[] = []; + + try { + // Upsert the dataset shell first (FK target). Counts/summary filled at the end. + await sql` + insert into datasets (id, slug, label, variant, description, hf_url, license) + values (${args.dataset}, ${slug}, ${label}, ${variant}, ${args.description ?? null}, ${hfUrl}, 'apache-2.0') + on conflict (id) do update set + slug = excluded.slug, label = excluded.label, variant = excluded.variant, + description = coalesce(excluded.description, datasets.description), + hf_url = excluded.hf_url, license = excluded.license, ingested_at = now() + `; + // Clear prior conversations for a clean re-ingest. + await sql`delete from dataset_conversations where dataset_id = ${args.dataset}`; + + const flush = async () => { + if (pending.length === 0) return; + // postgres.js row-helper insert: serializes `structure` to jsonb and + // `models` to text[] per row (unnest can't carry a text[] column — a 2D + // array would flatten into scalar rows). 
+ const rows = pending.map((p) => ({ + dataset_id: args.dataset, + conv_id: p.conv_id, + models: p.models, + num_turns: p.num_turns, + num_subagent_groups: p.num_subagent_groups, + total_in: p.total_in, + total_out: p.total_out, + total_cached: p.total_cached, + structure: sql.json(p.structure as unknown as Parameters[0]), + })); + await sql`insert into dataset_conversations ${sql(rows)}`; + pending.length = 0; + }; + + let count = 0; + for await (const conv of iterRows(args.dataset, total, args.limit)) { + blockSize = conv.block_size ?? blockSize; + hashIdScope = conv.hash_id_scope ?? hashIdScope; + const structure = buildConversationStructure(conv); + accumulate(acc, structure); + pending.push({ + dataset_id: args.dataset, + conv_id: conv.id, + models: Array.isArray(conv.models) ? conv.models : [], + num_turns: structure.totals.numTurns, + num_subagent_groups: structure.totals.numSubagentGroups, + total_in: structure.totals.in, + total_out: structure.totals.out, + total_cached: structure.totals.cached, + structure, + }); + count += 1; + if (pending.length >= 25) await flush(); + } + await flush(); + + const summary = buildSummary(acc, blockSize, hashIdScope); + const chartData = buildChartData(acc); + await sql` + update datasets set + conversation_count = ${count}, + summary = ${sql.json(summary as unknown as Parameters[0])}, + chart_data = ${sql.json(chartData as unknown as Parameters[0])}, + ingested_at = now() + where id = ${args.dataset} + `; + + console.log(`\n ingested ${count} conversations`); + console.log( + ` main turns=${acc.mainTurns} subagent groups=${acc.subagentGroups} subagent turns=${acc.subagentTurns}`, + ); + console.log( + ` totals: in=${acc.totalIn.toLocaleString()} out=${acc.totalOut.toLocaleString()} ` + + `cached=${acc.totalCached.toLocaleString()} (${(summary.cachedPct * 100).toFixed(1)}% of input)`, + ); + console.log('\n=== done ==='); + console.log(' Purge the API cache: POST /api/v1/invalidate'); + } finally { + await sql.end({ 
timeout: 5 }); + } +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/packages/db/src/json-provider.line-single-run.test.ts b/packages/db/src/json-provider.line-single-run.test.ts index 643b8896..b75fa26a 100644 --- a/packages/db/src/json-provider.line-single-run.test.ts +++ b/packages/db/src/json-provider.line-single-run.test.ts @@ -7,8 +7,9 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest'; import type { getLatestBenchmarks as GetLatestBenchmarks } from './json-provider.js'; /** - * A chart line is one config + sequence (config_id, benchmark_type, isl, osl) plotted across - * concurrencies, and it must come from a SINGLE workflow run. getLatestBenchmarks picks the + * A chart line is one config + sequence + offload mode + * (config_id, benchmark_type, isl, osl, offload_mode) plotted across concurrencies, and it must + * come from a SINGLE workflow run. getLatestBenchmarks picks the * newest run per line (date, then run_started_at, then workflow_run_id) and returns EVERY * concurrency that one run measured — never stitching skipped concurrencies from an older run. * @@ -62,6 +63,7 @@ const result = ( tpot: number, isl = 1024, osl = 1024, + offloadMode = 'off', ) => ({ id: nextResultId++, workflow_run_id: runDbId, @@ -71,6 +73,7 @@ const result = ( isl, osl, conc, + offload_mode: offloadMode, image: null, metrics: { median_tpot: tpot }, error: null, @@ -105,6 +108,10 @@ beforeAll(async () => { // config 1, seq (8192,1024): only run A measured it (run B skipped this sequence). result(10, 1, OLD, 1, 0.2, 8192, 1024), result(10, 1, OLD, 8, 0.3, 8192, 1024), + // Offload mode is an independent line dimension. A newer off-mode run must not hide + // the older on-mode line for the same config and sequence. + result(10, 1, OLD, 4, 0.25, 4096, 4096, 'on'), + result(11, 1, NEW, 4, 0.2, 4096, 4096, 'off'), // config 2, seq (1024,1024): two same-day runs with identical run_started_at. 
result(20, 2, NEW, 1, 0.5), result(20, 2, NEW, 8, 0.6), @@ -157,6 +164,21 @@ describe('getLatestBenchmarks — one run per line', () => { ]); }); + it('selects winning runs independently for each offload mode', () => { + const rows = getLatestBenchmarks('testm', NEW, false).filter( + (r) => r.isl === 4096 && r.osl === 4096, + ); + + expect( + rows + .map((r) => ({ offloadMode: r.offload_mode, runUrl: r.run_url })) + .toSorted((a, b) => a.offloadMode.localeCompare(b.offloadMode)), + ).toEqual([ + { offloadMode: 'off', runUrl: 'https://github.com/x/runs/101/attempts/1' }, + { offloadMode: 'on', runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); + it('breaks a same-day, same-timestamp tie by workflow_run_id (higher id wins the whole line)', () => { const rows = getLatestBenchmarks('testm', NEW, false); // config 2: run E (200, id 20) and run F (201, id 21) share run_started_at; F wins by id. diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index dfb03e98..b502b243 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -72,6 +72,8 @@ interface RawBenchmarkResult { isl: number; osl: number; conc: number; + /** Added by the AgentX schema; older dumps omit it and are treated as off. */ + offload_mode?: string; image: string | null; metrics: Record; /** Added in migration 006; older dumps omit this field — surfaced as undefined. */ @@ -281,6 +283,7 @@ function toBenchmarkRow( metrics?: Record, ): BenchmarkRow { return { + id: br.id, hardware: c.hardware, framework: c.framework, model: c.model, @@ -298,6 +301,8 @@ function toBenchmarkRow( decode_num_workers: c.decode_num_workers, num_prefill_gpu: c.num_prefill_gpu, num_decode_gpu: c.num_decode_gpu, + benchmark_type: br.benchmark_type ?? 'single_turn', + offload_mode: (br as { offload_mode?: string }).offload_mode ?? 
'off', isl: br.isl, osl: br.osl, conc: br.conc, @@ -351,9 +356,9 @@ export function compareBenchmarkRecency( return bStarted.localeCompare(aStarted); } -/** Chart-line identity: one config + sequence. All concurrencies of a line come from one run. */ +/** Chart-line identity: one config + sequence + offload mode. All concurrencies of a line come from one run. */ const lineKey = (br: RawBenchmarkResult): string => - `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}`; + `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`; export function getLatestBenchmarks( modelKey: string | string[], @@ -390,7 +395,7 @@ export function getLatestBenchmarks( return true; }); - // Single run per LINE (config_id, benchmark_type, isl, osl): pick the newest run that + // Single run per LINE (config_id, benchmark_type, isl, osl, offload_mode): pick the newest run that // produced data for the line, then keep EVERY concurrency that one run measured. Sort by // recency (date, then run_started_at) with a final workflow_run_id DESC tiebreak so exactly // one run wins even when run_started_at is equal/null — matching the SQL ORDER BY. @@ -499,7 +504,11 @@ export function getAvailabilityData(): AvailabilityRow[] { for (const a of s.availability) { const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`; if (validKeys.has(key)) { - rows.push({ ...a, date: toDateString(a.date) }); + rows.push({ + ...a, + benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 
'single_turn', + date: toDateString(a.date), + }); } } diff --git a/packages/db/src/lib/backfill-runner.test.ts b/packages/db/src/lib/backfill-runner.test.ts new file mode 100644 index 00000000..6da9071f --- /dev/null +++ b/packages/db/src/lib/backfill-runner.test.ts @@ -0,0 +1,55 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import { parseLimitForceFlags, runPerIdBackfill } from './backfill-runner.js'; + +describe('parseLimitForceFlags', () => { + const originalArgv = process.argv; + afterEach(() => { + process.argv = originalArgv; + }); + + it('defaults to no limit and force off', () => { + process.argv = ['node', 'script.ts']; + expect(parseLimitForceFlags()).toEqual({ limit: null, force: false }); + }); + + it('parses --limit N and --force', () => { + process.argv = ['node', 'script.ts', '--limit', '25', '--force', '--yes']; + expect(parseLimitForceFlags()).toEqual({ limit: 25, force: true }); + }); +}); + +describe('runPerIdBackfill', () => { + beforeEach(() => { + vi.spyOn(console, 'log').mockImplementation(() => {}); + vi.spyOn(console, 'error').mockImplementation(() => {}); + }); + afterEach(() => { + vi.restoreAllMocks(); + process.exitCode = undefined; + }); + + it('processes ids serially and leaves exitCode unset on success', async () => { + const seen: number[] = []; + await runPerIdBackfill([1, 2, 3], (id) => { + seen.push(id); + return Promise.resolve(id === 2 ? 'skipped' : 'ok'); + }); + expect(seen).toEqual([1, 2, 3]); + expect(process.exitCode).toBeUndefined(); + // Two ✓ lines (skipped rows do not log) plus the summary line. + const logged = vi.mocked(console.log).mock.calls.map((c) => String(c[0])); + expect(logged.filter((l) => l.includes('✓')).length).toBe(2); + expect(logged.at(-1)).toContain('=== backfill complete: 2 ok, 0 failed'); + }); + + it('counts throws as failures and sets exitCode = 1', async () => { + await runPerIdBackfill([1, 2], (id) => + id === 1 ? 
Promise.reject(new Error('boom')) : Promise.resolve('ok'), + ); + expect(process.exitCode).toBe(1); + const logged = vi.mocked(console.log).mock.calls.map((c) => String(c[0])); + expect(logged.at(-1)).toContain('=== backfill complete: 1 ok, 1 failed'); + expect(vi.mocked(console.error).mock.calls[0]?.[0]).toContain('✗ id=1: boom'); + }); +}); diff --git a/packages/db/src/lib/backfill-runner.ts b/packages/db/src/lib/backfill-runner.ts new file mode 100644 index 00000000..de00bee1 --- /dev/null +++ b/packages/db/src/lib/backfill-runner.ts @@ -0,0 +1,98 @@ +/** + * Shared scaffolding for the one-shot `backfill-*.ts` CLI scripts (invoked + * via the `db:backfill-*` package scripts). Each script keeps only its + * candidate query and per-row recompute; flag parsing, the `--yes` + * confirmation gate, per-row progress logging, and the exit-code summary + * live here so every backfill behaves identically on the command line. + */ + +import { confirm, hasYesFlag } from '../cli-utils.js'; +import type { Sql } from '../etl/db-utils.js'; + +export interface LimitForceFlags { + limit: number | null; + force: boolean; +} + +/** Parse the standard `--limit N` / `--force` backfill flags from argv. */ +export function parseLimitForceFlags(): LimitForceFlags { + let limit: number | null = null; + let force = false; + for (let i = 2; i < process.argv.length; i++) { + const arg = process.argv[i]!; + if (arg === '--force') force = true; + else if (arg === '--limit') { + const next = process.argv[++i]; + if (!next || Number.isNaN(Number(next))) { + console.error('--limit requires a numeric argument'); + process.exit(1); + } + limit = Number(next); + } + } + return { limit, force }; +} + +/** + * Print the candidate-count line, then gate on `--yes` or an interactive + * y/N prompt. Returns false (after logging "Aborted.") when declined. 
+ */ +export async function confirmProceed(candidatesLabel: string): Promise<boolean> { + console.log(`\n ${candidatesLabel}`); + if (hasYesFlag()) return true; + const ok = await confirm('\nProceed? (y/N) '); + if (!ok) console.log('Aborted.'); + return ok; +} + +/** + * Iterate candidate row ids one at a time (the recomputed blobs can be + * hundreds of MB decompressed — serial processing keeps memory bounded), + * logging per-row progress and a final summary. `processRow` returns 'ok' + * (counts toward the ✓ log) or 'skipped' (e.g. row vanished — the callback + * logs its own warning); throwing marks the row failed. Sets + * `process.exitCode = 1` when any row failed. + */ +export async function runPerIdBackfill( + ids: readonly number[], + processRow: (id: number) => Promise<'ok' | 'skipped'>, +): Promise<void> { + let ok = 0; + let failed = 0; + const t0 = Date.now(); + for (const id of ids) { + const start = Date.now(); + try { + if ((await processRow(id)) === 'skipped') continue; + ok++; + const elapsed = Math.round((Date.now() - start) / 1000); + const elapsedTotal = Math.round((Date.now() - t0) / 1000); + console.log(` ✓ id=${id} (${elapsed}s, ${ok}/${ids.length} done, ${elapsedTotal}s total)`); + } catch (error) { + failed++; + console.error(` ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`); + } + } + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`); + if (failed > 0) process.exitCode = 1; +} + +/** + * jsonb parameter for a freshly computed value. `structuredClone` strips + * class instances/prototypes so postgres.js serializes plain data only — + * matches what the inline ingest path stores. + */ +export function jsonbParam(sql: Sql, value: unknown): ReturnType<Sql['json']> { + return sql.json(structuredClone(value) as unknown as Parameters<Sql['json']>[0]); +} + +/** Standard `main().catch(…).finally(sql.end())` trailer for backfill CLIs.
*/ +export function runBackfillMain(name: string, sql: Sql, main: () => Promise<void>): void { + main() + .catch((error) => { + console.error(`${name} failed:`, error); + process.exitCode = 1; + }) + .finally(() => sql.end()); +} diff --git a/packages/db/src/lib/github-artifacts.test.ts b/packages/db/src/lib/github-artifacts.test.ts new file mode 100644 index 00000000..571643a5 --- /dev/null +++ b/packages/db/src/lib/github-artifacts.test.ts @@ -0,0 +1,42 @@ +import { describe, expect, it } from 'vitest'; + +import { RUNNER_SUFFIX_RE, dedupeArtifactsByLogicalName } from './github-artifacts.js'; + +const art = (name: string, created_at: string) => ({ + name, + archive_download_url: `https://api.github.com/${name}`, + created_at, +}); + +describe('RUNNER_SUFFIX_RE', () => { + it('strips the trailing runner-pool + attempt token', () => { + expect('bmk_dsr1_conc4_h200-cw_00'.replace(RUNNER_SUFFIX_RE, '')).toBe('bmk_dsr1_conc4'); + expect('bmk_dsr1_conc4_h200-dgxc-slurm_1'.replace(RUNNER_SUFFIX_RE, '')).toBe('bmk_dsr1_conc4'); + }); + + it('does not over-match across earlier underscore separators', () => { + // The (conc, offload) variant tokens must survive — only the final + // `_<runner-pool>_<attempt>` pair is stripped.
+ expect('bmk_agentic_glm5_offload_on_b200-nb_2'.replace(RUNNER_SUFFIX_RE, '')).toBe( + 'bmk_agentic_glm5_offload_on', + ); + expect('server_logs_glm5'.replace(RUNNER_SUFFIX_RE, '')).toBe('server_logs_glm5'); + }); +}); + +describe('dedupeArtifactsByLogicalName', () => { + it('keeps only the most recent artifact per logical name', () => { + const deduped = dedupeArtifactsByLogicalName([ + art('bmk_dsr1_conc4_h200-cw_00', '2026-06-01T00:00:00Z'), + art('bmk_dsr1_conc4_h200-dgxc-slurm_1', '2026-06-02T00:00:00Z'), + art('bmk_dsr1_conc8_h200-cw_00', '2026-06-01T00:00:00Z'), + ]); + expect([...deduped.keys()].toSorted()).toEqual(['bmk_dsr1_conc4', 'bmk_dsr1_conc8']); + expect(deduped.get('bmk_dsr1_conc4')?.name).toBe('bmk_dsr1_conc4_h200-dgxc-slurm_1'); + }); + + it('passes through names without a runner suffix unchanged', () => { + const deduped = dedupeArtifactsByLogicalName([art('run-stats', '2026-06-01T00:00:00Z')]); + expect(deduped.get('run-stats')?.name).toBe('run-stats'); + }); +}); diff --git a/packages/db/src/lib/github-artifacts.ts b/packages/db/src/lib/github-artifacts.ts new file mode 100644 index 00000000..291740cf --- /dev/null +++ b/packages/db/src/lib/github-artifacts.ts @@ -0,0 +1,86 @@ +/** + * GitHub Actions artifact helpers shared by `ingest-ci-run.ts` (download + * mode) and `backfill-agentic-server-logs.ts`. All calls shell out to the + * `gh` CLI, which picks up GITHUB_TOKEN from the environment. + */ + +import { execSync } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; + +export interface ArtifactMeta { + name: string; + archive_download_url: string; + created_at: string; +} + +/** + * Strips the trailing `_<runner-pool>_<attempt>` token from an + * artifact name so retries on different runners collapse to one logical + * artifact. Without this, two artifacts produced for the same logical + * config (e.g.
`…_h200-cw_00` and `…_h200-dgxc-slurm_1`) both land in the + * DB and the failed one's empty metrics can overwrite the good one via + * ON CONFLICT DO UPDATE. + * + * The runner pool name itself has no underscores (`h200-cw`, + * `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip + * bounded — using `\w` here would over-match across earlier `_` separators + * and collapse different (conc, offload) variants into the same logical + * name. + */ +export const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/u; + +/** List a workflow run's artifacts via `gh api` (paginated). Malformed lines are skipped. */ +export function listRunArtifacts(repo: string, runId: string): ArtifactMeta[] { + const json = execSync( + `gh api "repos/${repo}/actions/runs/${runId}/artifacts" --paginate --jq '.artifacts[]'`, + { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 }, + ); + const out: ArtifactMeta[] = []; + for (const line of json.trim().split('\n')) { + if (!line) continue; + try { + out.push(JSON.parse(line) as ArtifactMeta); + } catch { + // skip malformed line + } + } + return out; +} + +/** + * Group artifacts by their runner-suffix-stripped logical name, keeping only + * the most recent (`created_at`) per group. + */ +export function dedupeArtifactsByLogicalName( + artifacts: readonly ArtifactMeta[], +): Map<string, ArtifactMeta> { + const byLogical = new Map<string, ArtifactMeta>(); + for (const a of artifacts) { + const key = a.name.replace(RUNNER_SUFFIX_RE, ''); + const existing = byLogical.get(key); + if (!existing || a.created_at > existing.created_at) byLogical.set(key, a); + } + return byLogical; +} + +/** Download + unzip one artifact into `<destRoot>/<artifact.name>`; returns that dir.
*/ +export function downloadArtifact(artifact: ArtifactMeta, destRoot: string): string { + const zipPath = path.join(destRoot, 'artifact.zip'); + execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { + stdio: ['pipe', 'pipe', 'inherit'], + }); + const destDir = path.join(destRoot, artifact.name); + fs.mkdirSync(destDir, { recursive: true }); + execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' }); + fs.unlinkSync(zipPath); + return destDir; +} + +/** Fetch a run's current attempt number via `gh api` (defaults to 1 if missing/non-numeric). */ +export function fetchRunAttempt(repo: string, runId: string): number { + const attemptStr = execSync(`gh api "repos/${repo}/actions/runs/${runId}" --jq '.run_attempt'`, { + encoding: 'utf8', + }).trim(); + return Number.parseInt(attemptStr, 10) || 1; +} diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts new file mode 100644 index 00000000..529306cf --- /dev/null +++ b/packages/db/src/queries/agentic-aggregates.test.ts @@ -0,0 +1,113 @@ +import { describe, expect, it } from 'vitest'; + +import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates'; + +describe('percentilesOf', () => { + it('returns null for empty input', () => { + expect(percentilesOf([])).toBeNull(); + expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull(); + }); + + it('computes percentiles for a simple integer range', () => { + // 1..100, evenly spaced — linear quantile is straightforward. + const xs = Array.from({ length: 100 }, (_, i) => i + 1); + const p = percentilesOf(xs); + expect(p).not.toBeNull(); + expect(p!.n).toBe(100); + expect(p!.mean).toBeCloseTo(50.5, 6); + expect(p!.p50).toBeCloseTo(50.5, 6); + // For 100 sorted values, p75 = sorted[0.75 * 99] = sorted[74.25] interp.
+ expect(p!.p75).toBeCloseTo(75.25, 6); + expect(p!.p90).toBeCloseTo(90.1, 6); + expect(p!.p99).toBeCloseTo(99.01, 6); + }); + + it('filters out non-finite values before computing', () => { + const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]); + expect(p?.n).toBe(4); + expect(p?.mean).toBeCloseTo(2.5, 6); + }); +}); + +describe('extractIslOsl', () => { + it('reads input/output sequence length from profiling records', () => { + const lines = [ + JSON.stringify({ + metadata: { benchmark_phase: 'profiling' }, + metrics: { + input_sequence_length: { value: 100, unit: 'tokens' }, + output_sequence_length: { value: 50, unit: 'tokens' }, + }, + }), + JSON.stringify({ + metadata: { benchmark_phase: 'profiling' }, + metrics: { + input_sequence_length: { value: 200, unit: 'tokens' }, + output_sequence_length: { value: 75, unit: 'tokens' }, + }, + }), + // warmup record — should be ignored + JSON.stringify({ + metadata: { benchmark_phase: 'warmup' }, + metrics: { + input_sequence_length: { value: 9999, unit: 'tokens' }, + output_sequence_length: { value: 9999, unit: 'tokens' }, + }, + }), + ]; + const { isl, osl } = extractIslOsl(lines.join('\n')); + expect(isl).toEqual([100, 200]); + expect(osl).toEqual([50, 75]); + }); +}); + +describe('extractServerMetricSamples', () => { + it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => { + const json = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1, avg: 0.1 }, + { start_ns: 1, end_ns: 2, avg: 0.5 }, + { start_ns: 2, end_ns: 3, avg: 0.9 }, + ], + }, + ], + }, + 'vllm:prefix_cache_hits': { + series: [ + { + timeslices: [ + { start_ns: 0, rate: 80 }, + { start_ns: 1, rate: 50 }, + { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0 + ], + }, + ], + }, + 'vllm:prefix_cache_queries': { + series: [ + { + timeslices: [ + { start_ns: 0, rate: 100 }, // hit rate = 0.8 + { start_ns: 1, rate: 100 
}, // hit rate = 0.5 + { start_ns: 2, rate: 0 }, + ], + }, + ], + }, + }, + }); + const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json); + expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]); + expect(prefixCacheHitRate).toEqual([0.8, 0.5]); + }); + + it('returns empty arrays when the JSON lacks the expected metric series', () => { + const out = extractServerMetricSamples(JSON.stringify({ metrics: {} })); + expect(out.kvCacheUtil).toEqual([]); + expect(out.prefixCacheHitRate).toEqual([]); + }); +}); diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts new file mode 100644 index 00000000..72faa148 --- /dev/null +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -0,0 +1,406 @@ +/** + * Per-id aggregate stats for the "Aggregates across configs" view on the + * agentic detail page. Each id contributes one summary number per metric per + * percentile so the frontend can plot how each metric varies across the + * SKU's parallelism + concurrency configs. + * + * Sources: + * - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase) + * - `server_metrics_json` → time-series of KV cache utilization + + * prefix-cache hit rate per scrape interval + * + * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing + * or has no usable samples — frontend treats those as "no data". 
+ */ + +import { Readable } from 'node:stream'; +import { createGunzip, gunzipSync } from 'node:zlib'; + +import { chain } from 'stream-chain'; + +import { parser } from 'stream-json'; +import { pick } from 'stream-json/filters/pick.js'; +import { streamObject } from 'stream-json/streamers/stream-object.js'; + +import type { DbClient } from '../connection.js'; +import { + fetchAggregateStatsRows, + percentilesOf, + readNum, + type MetricPercentiles, +} from './agentic-shared'; + +// Percentile math + envelope reader live in agentic-shared.ts; re-exported +// here because etl/compute-aggregate-stats and the API layer import them +// from this module. +export { percentilesOf, type MetricPercentiles } from './agentic-shared'; + +/** + * Bump when the aggregate-stats computation algorithm changes — the backfill + * script recomputes any row whose stored `aggregate_stats.version` is older. + * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular + * import: the compute helper depends on the extractors below. + * + * v2: aggregate vllm gauges/counters across all engine series (was reading + * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). + * + * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate + * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way + * they do for vllm runs. + * + * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL. + */ +export const STATS_VERSION = 4; + +export interface AgenticAggregate { + id: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; +} + +export type AgenticAggregateMap = Record<number, AgenticAggregate>; + +/** + * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per + * round-trip.
`server_metrics_json_gz` is much bigger (~17 MB compressed + * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire + * size, so two of those = ~50 MB and three already trips the 64 MB cap). + * We fetch the two blob types in separate queries with different chunk + * sizes. + */ +const PROFILE_CHUNK_SIZE = 8; +const SERVER_CHUNK_SIZE = 1; + +interface ProfileRecord { + metadata?: { benchmark_phase?: string }; + metrics?: { + input_sequence_length?: { value?: number } | number; + output_sequence_length?: { value?: number } | number; + }; +} + +/** Parse the profile_export.jsonl → per-request ISL + OSL arrays; bad/null lines are skipped. */ +export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } { + const isl: number[] = []; + const osl: number[] = []; + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: ProfileRecord; + try { + rec = (JSON.parse(line) ?? {}) as ProfileRecord; + } catch { + continue; + } + if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue; + const m = rec.metrics ?? {}; + const i = readNum(m.input_sequence_length); + const o = readNum(m.output_sequence_length); + if (typeof i === 'number') isl.push(i); + if (typeof o === 'number') osl.push(o); + } + return { isl, osl }; +} + +interface TimeSlice { + start_ns?: number; + end_ns?: number; + avg?: number; + rate?: number; + count?: number; + sum?: number; +} +interface Series { + labels?: Record<string, string>; + timeslices?: TimeSlice[]; +} +interface MetricMeta { + series?: Series[]; +} +interface MetricsJson { + metrics?: Record<string, MetricMeta>; +} + +/** + * Aggregate a per-timeslice field across all series of a metric, indexed by + * the timeslice's `start_ns`. vllm reports one series per engine on + * multi-engine DP/PP deployments, so we sum (or average) across engines to + * get the cluster-wide value at each timeslice. + * + * `field` selects which numeric field on a timeslice to read (`avg` for + * gauges, `rate` for counter deltas).
`combine` controls cross-engine math: + * 'sum' for running/waiting/throughput counters where the cluster total is + * the sum; 'avg' for KV cache utilization, which is bounded [0, 1] per + * engine and should be averaged across engines for the cluster view. + */ +function aggregateSeriesByStart( + metricSeries: readonly Series[] | undefined, + field: 'avg' | 'rate', + combine: 'sum' | 'avg', +): Map<number, number> { + const sums = new Map<number, number>(); + const counts = new Map<number, number>(); + for (const s of metricSeries ?? []) { + for (const ts of s.timeslices ?? []) { + if (typeof ts.start_ns !== 'number') continue; + const v = ts[field]; + if (typeof v !== 'number' || !Number.isFinite(v)) continue; + sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v); + counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1); + } + } + if (combine === 'sum') return sums; + const out = new Map<number, number>(); + for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1)); + return out; +} + +/** First metric whose series array is non-empty; supports vllm/sglang fallback. */ +function pickFirstNonEmpty( + metrics: Record<string, MetricMeta>, + ...names: string[] +): Series[] | undefined { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; +} + +/** + * Parse the server_metrics_json → time-series arrays for KV cache util and + * prefix cache hit rate (per-interval, computed from the prometheus + * counters the same way trace-server-metrics does it). + * + * Aggregates across all engine series so multi-engine DP/PP deployments are + * counted correctly (previously we only read engine 0). + */ +export function extractServerMetricSamples(json: string): { + kvCacheUtil: number[]; + prefixCacheHitRate: number[]; +} { + const parsed = JSON.parse(json) as MetricsJson; + const metrics = parsed.metrics ?? {}; + + // KV cache util — per-engine gauge in [0, 1]. Average across engines so the + // value stays a percentage; summing would give meaningless 0..N.
+ const kvSeriesAll = pickFirstNonEmpty( + metrics, + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); + const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()]; + + // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across + // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens. + const hitsAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_hits', + 'vllm:gpu_prefix_cache_hits', + 'sglang:cached_tokens', + ); + const queriesAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); + const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum'); + const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum'); + const prefixCacheHitRate: number[] = []; + for (const [t, h] of hitsByT) { + const q = qByT.get(t); + if (q !== undefined && q > 0) prefixCacheHitRate.push(h / q); + } + + return { kvCacheUtil, prefixCacheHitRate }; +} + +/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */ +const TARGET_METRIC_KEYS = new Set([ + // vLLM + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'vllm:prefix_cache_hits', + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_hits', + 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', +]); + +/** + * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics + * we need. Avoids the Node 512 MB string cap that JSON.parse hits on + * server_metrics blobs from high-conc TP+EP runs (which can decompress to + * >500 MB because vllm dumps `cache_config_info` every scrape interval). + * + * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') → + * StreamObject (one metric per chunk) → keep only the keys we care about. 
+ * + * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the + * synchronous fast path so callers can use either interchangeably. + */ +async function streamExtractServerMetricSamples( + buffer: Buffer, +): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> { + const collected: Record<string, MetricMeta> = {}; + // stream-json's TypeScript types don't compose cleanly with node:stream's + // pipeline() generic, and several `.pipe()`/event APIs are typed loosely — + // cast to any for this local pipe chain. It works at runtime. + // stream-json composes transforms via stream-chain. `pick`/`streamObject` + // each return a Transform when called; `chain([...])` wires them. + /* eslint-disable @typescript-eslint/no-explicit-any */ + const pipeline = chain([ + Readable.from(buffer), + createGunzip(), + parser(), + pick({ filter: 'metrics' }), + streamObject(), + ]); + await new Promise((resolve, reject) => { + (pipeline as any).on('data', (chunk: unknown) => { + const { key, value } = chunk as { key: string; value: MetricMeta }; + if (TARGET_METRIC_KEYS.has(key)) collected[key] = value; + }); + (pipeline as any).on('end', resolve); + (pipeline as any).on('error', reject); + }); + /* eslint-enable @typescript-eslint/no-explicit-any */ + return extractServerMetricSamples(JSON.stringify({ metrics: collected })); +} + +export async function getAgenticAggregates( + sql: DbClient, + benchmarkResultIds: number[], +): Promise<AgenticAggregateMap> { + if (benchmarkResultIds.length === 0) return {}; + + const result: AgenticAggregateMap = {}; + + // Fast path: read the pre-computed `aggregate_stats` JSONB written by the + // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One + // round-trip pulls everything we need for every requested id with no blob + // decompression, so the slow blob-parsing fallback only runs for ids + // whose stats are missing or were produced by an older `STATS_VERSION`.
+ const statsRows = await fetchAggregateStatsRows<AggregateStatsRow>(sql, benchmarkResultIds); + + const idsNeedingProfile: number[] = []; + const idsNeedingServer: number[] = []; + for (const row of statsRows) { + const id = Number(row.benchmark_result_id); + const agg = blankAggregate(id); + if (row.stats && Number(row.stats.version) === STATS_VERSION) { + agg.isl = row.stats.isl ?? null; + agg.osl = row.stats.osl ?? null; + agg.kvCacheUtil = row.stats.kvCacheUtil ?? null; + agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null; + } else { + // No stats (or stale version) — schedule the blob-parse fallback below + // so the response still surfaces data. Backfill should drain these. + idsNeedingProfile.push(id); + idsNeedingServer.push(id); + } + result[id] = agg; + } + // Also fall back for ids that didn't return a row at all (no trace_replay + // link) — keep the caller contract: every id we know about lands in the map. + for (const id of benchmarkResultIds) { + if (!(id in result)) result[id] = blankAggregate(id); + } + + if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) { + return result; + } + + // ── Fallback Pass 1: profile_export blobs (cheap; large batches).
────── + for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) { + const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE); + const rows = (await sql` + select + br.id as benchmark_result_id, + atr.profile_export_jsonl_gz as profile_blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + `) as { benchmark_result_id: number; profile_blob: Buffer | null }[]; + for (const row of rows) { + const id = Number(row.benchmark_result_id); + result[id] ??= blankAggregate(id); + if (row.profile_blob) { + try { + const jsonl = gunzipSync(row.profile_blob).toString('utf8'); + const { isl, osl } = extractIslOsl(jsonl); + result[id].isl = percentilesOf(isl); + result[id].osl = percentilesOf(osl); + } catch { + // ignore malformed blob + } + } + } + } + // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ─────── + // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row + // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow + // path runs at most once per sibling set. 
+ for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) { + const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE); + const rows = (await sql` + select + br.id as benchmark_result_id, + atr.server_metrics_json_gz as server_blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + `) as { benchmark_result_id: number; server_blob: Buffer | null }[]; + for (const row of rows) { + const id = Number(row.benchmark_result_id); + result[id] ??= blankAggregate(id); + if (!row.server_blob) continue; + let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null; + try { + const json = gunzipSync(row.server_blob).toString('utf8'); + parsed = extractServerMetricSamples(json); + } catch (error) { + // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose + // server_metrics_json decompresses past Node's max string length. + // Stream-parse to extract just the metric subtrees we care about. + const code = error && (error as NodeJS.ErrnoException).code; + const msg = error instanceof Error ? error.message : String(error); + if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) { + try { + parsed = await streamExtractServerMetricSamples(row.server_blob); + } catch { + // stream fallback failed too — leave nulls + } + } + } + if (parsed) { + result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil); + result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate); + } + } + } + return result; +} + +/** Shape of the JSONB column when read back via postgres-js. 
*/ +interface AggregateStatsRow { + version: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; + normalizedSessionTimeS: number | null; + p90PrefillTpsPerUser: number | null; +} + +function blankAggregate(id: number): AgenticAggregate { + return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null }; +} diff --git a/packages/db/src/queries/agentic-shared.ts b/packages/db/src/queries/agentic-shared.ts new file mode 100644 index 00000000..e8a639e7 --- /dev/null +++ b/packages/db/src/queries/agentic-shared.ts @@ -0,0 +1,81 @@ +/** + * Helpers shared by the agentic per-point queries (`agentic-aggregates.ts`, + * `derived-agentic-metrics.ts`): percentile math over aiperf samples, + * the `{value, unit}` metric-envelope reader, and the single-round-trip + * `aggregate_stats` fetch both fast paths start from. + */ + +import type { DbClient } from '../connection.js'; + +export interface MetricPercentiles { + mean: number; + p50: number; + p75: number; + p90: number; + p99: number; + /** Sample count used to compute the percentiles. */ + n: number; +} + +/** Linear-interpolated percentile (matches numpy's default linear method). */ +export function quantile(sortedAsc: number[], q: number): number { + if (sortedAsc.length === 0) return Number.NaN; + if (sortedAsc.length === 1) return sortedAsc[0]!; + const pos = (sortedAsc.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sortedAsc[lo]!; + return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo); +} + +export function meanOf(xs: number[]): number { + if (xs.length === 0) return Number.NaN; + let s = 0; + for (const x of xs) s += x; + return s / xs.length; +} + +/** Compute the percentile bundle for an array of samples; null if empty. 
*/ +export function percentilesOf(samples: number[]): MetricPercentiles | null { + const clean = samples.filter((v) => Number.isFinite(v)); + if (clean.length === 0) return null; + const sorted = clean.toSorted((a, b) => a - b); + return { + mean: meanOf(sorted), + p50: quantile(sorted, 0.5), + p75: quantile(sorted, 0.75), + p90: quantile(sorted, 0.9), + p99: quantile(sorted, 0.99), + n: sorted.length, + }; +} + +/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */ +export function readNum(v: unknown): number | undefined { + if (typeof v === 'number') return v; + if (v && typeof v === 'object' && 'value' in v) { + const inner = (v as { value?: unknown }).value; + if (typeof inner === 'number' && Number.isFinite(inner)) return inner; + } + return undefined; +} + +/** + * One round-trip fetch of the pre-computed `aggregate_stats` JSONB for a set + * of benchmark_results ids (via their trace_replay link). Both agentic fast + * paths read from this; ids without a trace_replay row simply don't appear. + * `Stats` is the caller's view of the JSONB shape.
+ */ +export async function fetchAggregateStatsRows<Stats>( + sql: DbClient, + benchmarkResultIds: readonly number[], +): Promise<{ benchmark_result_id: number; stats: Stats | null }[]> { + return (await sql` + select + br.id as benchmark_result_id, + atr.aggregate_stats as stats + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${benchmarkResultIds}::bigint[]) + `) as unknown as { benchmark_result_id: number; stats: Stats | null }[]; +} diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts new file mode 100644 index 00000000..2d36eb22 --- /dev/null +++ b/packages/db/src/queries/benchmark-siblings.ts @@ -0,0 +1,169 @@ +/** + * Find all benchmark_results that share the same SKU (hardware + framework + + * model + precision + spec_method + disagg + benchmark_type + workflow_run) + * as the given point. Used by the detail page to render a "switch between + * concs / parallelisms" navigator within a single run. + */ + +import type { DbClient } from '../connection.js'; + +export interface BenchmarkSibling { + id: number; + conc: number; + /** "on" | "off" | null. */ + offload_mode: string | null; + decode_tp: number; + decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; + prefill_tp: number; + prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; + num_prefill_gpu: number; + num_decode_gpu: number; + disagg: boolean; + is_multinode: boolean; + /** Throughput per GPU (tok/s/gpu) for this point; null if the metric is absent. */ + tput_per_gpu: number | null; + /** + * Total requests for this point — `total_requests_completed` (aiperf runner) + * falling back to the legacy `num_requests_total`; null if neither is present. + */ + total_requests: number | null; + /** True if this row IS the point passed in. */ + is_current: boolean; + /** Whether the row has a stored trace_replay blob (for navigation hint).
*/ + has_trace: boolean; +} + +export interface BenchmarkSku { + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + benchmark_type: string; + /** Human-readable workflow_run summary so the page header can hint at provenance. */ + github_run_id: number; + date: string; + /** Slug of the source dataset this run replayed (run_datasets), or null. */ + dataset_slug: string | null; +} + +export interface BenchmarkSiblings { + sku: BenchmarkSku; + siblings: BenchmarkSibling[]; +} + +export async function getBenchmarkSiblings( + sql: DbClient, + benchmarkResultId: number, +): Promise { + // Step 1: resolve the SKU defining fields for the requested point. + const seed = (await sql` + select + c.hardware, c.framework, c.model, c.precision, c.spec_method, + br.benchmark_type, br.workflow_run_id, br.date::text, + wr.github_run_id, rd.dataset_slug + from benchmark_results br + join configs c on c.id = br.config_id + join workflow_runs wr on wr.id = br.workflow_run_id + left join run_datasets rd on rd.workflow_run_id = br.workflow_run_id + where br.id = ${benchmarkResultId} + `) as unknown as { + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + benchmark_type: string; + workflow_run_id: number; + date: string; + github_run_id: number; + dataset_slug: string | null; + }[]; + const root = seed[0]; + if (!root) return null; + + // Step 2: pull every sibling row sharing the SKU within the same workflow_run. 
+ const rows = (await sql` + select + br.id, br.conc, br.offload_mode, + c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers, + c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers, + c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode, + (br.metrics->>'tput_per_gpu')::float8 as tput_per_gpu, + coalesce( + (br.metrics->>'total_requests_completed')::float8, + (br.metrics->>'num_requests_total')::float8 + ) as total_requests, + (br.trace_replay_id is not null) as has_trace + from benchmark_results br + join configs c on c.id = br.config_id + where br.workflow_run_id = ${root.workflow_run_id} + and br.benchmark_type = ${root.benchmark_type} + and c.hardware = ${root.hardware} + and c.framework = ${root.framework} + and c.model = ${root.model} + and c.precision = ${root.precision} + and c.spec_method = ${root.spec_method} + order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc + `) as unknown as { + id: number; + conc: number; + offload_mode: string | null; + decode_tp: number; + decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; + prefill_tp: number; + prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; + num_prefill_gpu: number; + num_decode_gpu: number; + disagg: boolean; + is_multinode: boolean; + tput_per_gpu: number | null; + total_requests: number | null; + has_trace: boolean; + }[]; + + const siblings: BenchmarkSibling[] = rows.map((r) => ({ + id: Number(r.id), + conc: r.conc, + offload_mode: r.offload_mode, + decode_tp: r.decode_tp, + decode_ep: r.decode_ep, + decode_dp_attention: r.decode_dp_attention, + decode_num_workers: r.decode_num_workers, + prefill_tp: r.prefill_tp, + prefill_ep: r.prefill_ep, + prefill_dp_attention: r.prefill_dp_attention, + prefill_num_workers: r.prefill_num_workers, + num_prefill_gpu: r.num_prefill_gpu, + num_decode_gpu: r.num_decode_gpu, + disagg: r.disagg, + is_multinode: r.is_multinode, + tput_per_gpu: 
r.tput_per_gpu === null ? null : Number(r.tput_per_gpu), + total_requests: r.total_requests === null ? null : Number(r.total_requests), + is_current: Number(r.id) === benchmarkResultId, + has_trace: r.has_trace, + })); + + return { + sku: { + hardware: root.hardware, + framework: root.framework, + model: root.model, + precision: root.precision, + spec_method: root.spec_method, + benchmark_type: root.benchmark_type, + github_run_id: Number(root.github_run_id), + date: root.date, + dataset_slug: root.dataset_slug ?? null, + }, + siblings, + }; +} diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index d99a1da1..37301e2b 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -11,6 +11,8 @@ import type { WorkerPower } from '../etl/benchmark-mapper.js'; export type BenchmarkWorkerRow = WorkerPower; export interface BenchmarkRow { + /** Stable benchmark_results id used for agentic detail lookups. */ + id: number; hardware: string; framework: string; model: string; @@ -28,9 +30,11 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + isl: number | null; + osl: number | null; conc: number; + offload_mode: string; image: string | null; metrics: Record; /** @@ -50,7 +54,7 @@ export interface BenchmarkRow { * `['glm5', 'glm5.1']` unions both buckets under the one display. * * Selection unit is the LINE, not the point: for each line - * `(config_id, benchmark_type, isl, osl)` we pick the single newest workflow run that + * `(config_id, benchmark_type, isl, osl, offload_mode)` we pick the single newest workflow run that * produced data for it (newest date, then latest sweep, then highest run id) and return * EVERY concurrency that one run measured — and nothing from any other run. 
A partial * re-sweep therefore truncates the line to its own concurrencies rather than stitching the @@ -93,7 +97,8 @@ export async function getLatestBenchmarks( ) )` : sql``; - // winners: the single newest run per LINE (config_id, benchmark_type, isl, osl) under the + // winners: the single newest run per LINE + // (config_id, benchmark_type, isl, osl, offload_mode) under the // date/run cutoff. br.date is a calendar day, so two same-day sweeps tie on date — break // by wr.run_started_at (latest sweep wins), then br.workflow_run_id so exactly one run wins // even when run_started_at is equal/null. The outer join then pulls EVERY concurrency that @@ -101,8 +106,8 @@ export async function getLatestBenchmarks( // of concurrencies a partial re-sweep skipped). const rows = await sql` WITH winners AS ( - SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl) - br.config_id, br.benchmark_type, br.isl, br.osl, + SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode) + br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, br.workflow_run_id AS winning_run_id FROM benchmark_results br JOIN configs c ON c.id = br.config_id @@ -111,10 +116,11 @@ export async function getLatestBenchmarks( AND br.error IS NULL AND ${dateFilter} ${runFilter} - ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, + ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, br.date DESC, wr.run_started_at DESC NULLS LAST, br.workflow_run_id DESC ) SELECT + br.id, c.hardware, c.framework, c.model, @@ -132,6 +138,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -148,6 +156,7 @@ export async function getLatestBenchmarks( AND w.benchmark_type = br.benchmark_type AND w.isl IS NOT DISTINCT FROM br.isl AND w.osl IS NOT DISTINCT FROM br.osl + AND w.offload_mode = br.offload_mode AND w.winning_run_id = 
br.workflow_run_id WHERE br.error IS NULL ORDER BY br.config_id, br.conc, br.isl, br.osl @@ -158,6 +167,7 @@ export async function getLatestBenchmarks( // No date filter: use materialized view for instant lookups const rows = await sql` SELECT + lb.id, c.hardware, c.framework, c.model, @@ -175,6 +185,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, lb.conc, @@ -207,6 +219,7 @@ export async function getBenchmarksForRun( const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; const rows = await sql` SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + br.id, c.hardware, c.framework, c.model, @@ -224,6 +237,8 @@ export async function getBenchmarksForRun( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -257,6 +272,7 @@ export async function getAllBenchmarksForHistory( const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; const rows = await sql` SELECT + br.id, c.hardware, c.framework, c.model, @@ -274,9 +290,12 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, + br.image, br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics, br.workers, br.date::text, diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts new file mode 100644 index 00000000..c1676445 --- /dev/null +++ b/packages/db/src/queries/datasets.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; + +import type { DbClient } from '../connection.js'; +import { getConversation, listConversations, listDatasets } from './datasets.js'; + +/** + * Mock DbClient: returns canned result sets in call order. 
Each call to the + * tagged-template `sql` shifts the next queued rows array. The query text is + * ignored — these tests assert the JS-side shaping/coercion, not SQL. + */ +function mockSql(queue: unknown[][]): DbClient { + const responses = [...queue]; + return (() => Promise.resolve(responses.shift() ?? [])) as unknown as DbClient; +} + +describe('listDatasets', () => { + it('coerces conversation_count to a number', async () => { + const sql = mockSql([ + [ + { + id: 'a/b', + slug: 'b', + label: 'B', + variant: 'full', + conversation_count: '393', + summary: {}, + }, + ], + ]); + const out = await listDatasets(sql); + expect(out).toHaveLength(1); + expect(out[0].conversation_count).toBe(393); + expect(typeof out[0].conversation_count).toBe('number'); + }); +}); + +describe('listConversations', () => { + it('returns null when the dataset slug is unknown', async () => { + const sql = mockSql([[]]); // datasets lookup → no rows + expect(await listConversations(sql, 'missing')).toBeNull(); + }); + + it('returns total + numerically-coerced items', async () => { + const sql = mockSql([ + [{ id: 'ds-id' }], // datasets lookup + [{ n: 2 }], // count + [ + { + conv_id: 'c1', + models: ['m'], + num_turns: '5', + num_subagent_groups: '1', + total_in: '1000', + total_out: '200', + total_cached: '900', + }, + ], // items + ]); + const out = await listConversations(sql, 'b', { sort: 'tokens' }); + expect(out).not.toBeNull(); + expect(out!.total).toBe(2); + expect(out!.items[0]).toMatchObject({ + conv_id: 'c1', + num_turns: 5, + num_subagent_groups: 1, + total_in: 1000, + total_out: 200, + total_cached: 900, + }); + expect(typeof out!.items[0].total_in).toBe('number'); + }); +}); + +describe('getConversation', () => { + it('returns null when the conversation is missing', async () => { + const sql = mockSql([[]]); + expect(await getConversation(sql, 'b', 'nope')).toBeNull(); + }); + + it('coerces counts and passes through the structure', async () => { + const structure = { 
blockSize: 64, nodes: [], totals: {} }; + const sql = mockSql([ + [ + { + conv_id: 'c1', + models: ['m'], + num_turns: '3', + num_subagent_groups: '0', + total_in: '500', + total_out: '100', + total_cached: '450', + structure, + }, + ], + ]); + const out = await getConversation(sql, 'b', 'c1'); + expect(out).not.toBeNull(); + expect(out!.num_turns).toBe(3); + expect(out!.total_cached).toBe(450); + expect(out!.structure).toBe(structure); + }); +}); diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts new file mode 100644 index 00000000..cfefe391 --- /dev/null +++ b/packages/db/src/queries/datasets.ts @@ -0,0 +1,213 @@ +/** + * Read queries for the agentic-benchmark source datasets (the HF cc-traces-weka + * corpora ingested by ingest-weka-dataset.ts). Back the /datasets area: + * - listDatasets → registry cards (no per-conversation rows) + * - getDataset → one dataset incl. precomputed chart_data + * - listConversations → paginated conversation list (counts only, no structure) + * - getConversation → one conversation's flamegraph structure + */ + +import type { DbClient } from '../connection.js'; +import type { ConversationStructure } from '../etl/weka-structure.js'; + +export interface DatasetSummary { + blockSize?: number; + hashIdScope?: string | null; + totalIn?: number; + totalOut?: number; + totalCached?: number; + cachedPct?: number; + mainTurns?: number; + subagentGroups?: number; + subagentTurns?: number; + meanRequestsPerConversation?: number; + medianRequestsPerConversation?: number; + meanSubagentsPerTrace?: number; + medianSubagentsPerTrace?: number; + modelMix?: Record; + [k: string]: unknown; +} + +export interface DatasetRecord { + id: string; + slug: string; + label: string; + variant: string; + description: string | null; + hf_url: string | null; + license: string | null; + conversation_count: number; + summary: DatasetSummary; + ingested_at: string; +} + +export interface DatasetDetail extends DatasetRecord { + 
/** Precomputed distribution bins + stats keyed by metric (see ingest buildChartData). */ + chart_data: Record; +} + +export interface ConversationListItem { + conv_id: string; + models: string[]; + num_turns: number; + num_subagent_groups: number; + total_in: number; + total_out: number; + total_cached: number; +} + +export interface ConversationList { + total: number; + items: ConversationListItem[]; +} + +export interface ConversationDetail { + conv_id: string; + models: string[]; + num_turns: number; + num_subagent_groups: number; + total_in: number; + total_out: number; + total_cached: number; + structure: ConversationStructure; +} + +/** All ingested datasets, newest first. Excludes the (large) chart_data blob. */ +export async function listDatasets(sql: DbClient): Promise { + const rows = (await sql` + select id, slug, label, variant, description, hf_url, license, + conversation_count, summary, ingested_at::text + from datasets + order by ingested_at desc, slug asc + `) as unknown as DatasetRecord[]; + return rows.map((r) => ({ ...r, conversation_count: Number(r.conversation_count) })); +} + +/** One dataset by slug, including chart_data. Null if not found. */ +export async function getDataset(sql: DbClient, slug: string): Promise { + const rows = (await sql` + select id, slug, label, variant, description, hf_url, license, + conversation_count, summary, chart_data, ingested_at::text + from datasets + where slug = ${slug} + `) as unknown as DatasetDetail[]; + const row = rows[0]; + if (!row) return null; + return { ...row, conversation_count: Number(row.conversation_count) }; +} + +export interface ListConversationsOpts { + search?: string; + limit?: number; + offset?: number; + /** 'tokens' (total_in desc), 'turns' (num_turns desc), or 'id' (conv_id asc). */ + sort?: 'tokens' | 'turns' | 'subagents' | 'id'; +} + +const MAX_LIMIT = 200; + +/** + * Paginated conversation list for a dataset (by slug). 
Returns counts only — + * the per-conversation `structure` blob is fetched separately by + * getConversation so the list stays light. + */ +export async function listConversations( + sql: DbClient, + slug: string, + opts: ListConversationsOpts = {}, +): Promise { + const ds = (await sql`select id from datasets where slug = ${slug}`) as unknown as { + id: string; + }[]; + const datasetId = ds[0]?.id; + if (!datasetId) return null; + + const limit = Math.min(MAX_LIMIT, Math.max(1, opts.limit ?? 50)); + const offset = Math.max(0, opts.offset ?? 0); + const search = opts.search?.trim(); + const like = search ? `%${search}%` : null; + + const totalRows = (await sql` + select count(*)::int as n + from dataset_conversations + where dataset_id = ${datasetId} + and (${like}::text is null or conv_id ilike ${like}) + `) as unknown as { n: number }[]; + const total = totalRows[0]?.n ?? 0; + + // Separate queries per sort (literal ORDER BY) — the neon HTTP driver doesn't + // compose nested sql fragments the way postgres.js does, so we can't splice an + // order-by fragment. The sort key is an enum, never raw user input. + const sort = opts.sort ?? 
'tokens'; + let items: ConversationListItem[]; + if (sort === 'turns') { + items = (await sql` + select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached + from dataset_conversations + where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like}) + order by num_turns desc, conv_id asc + limit ${limit} offset ${offset} + `) as unknown as ConversationListItem[]; + } else if (sort === 'subagents') { + items = (await sql` + select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached + from dataset_conversations + where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like}) + order by num_subagent_groups desc, conv_id asc + limit ${limit} offset ${offset} + `) as unknown as ConversationListItem[]; + } else if (sort === 'id') { + items = (await sql` + select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached + from dataset_conversations + where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like}) + order by conv_id asc + limit ${limit} offset ${offset} + `) as unknown as ConversationListItem[]; + } else { + items = (await sql` + select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached + from dataset_conversations + where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like}) + order by total_in desc, conv_id asc + limit ${limit} offset ${offset} + `) as unknown as ConversationListItem[]; + } + + return { + total, + items: items.map((r) => ({ + ...r, + num_turns: Number(r.num_turns), + num_subagent_groups: Number(r.num_subagent_groups), + total_in: Number(r.total_in), + total_out: Number(r.total_out), + total_cached: Number(r.total_cached), + })), + }; +} + +/** One conversation's full flamegraph structure. Null if dataset/conv missing. 
*/ +export async function getConversation( + sql: DbClient, + slug: string, + convId: string, +): Promise { + const rows = (await sql` + select dc.conv_id, dc.models, dc.num_turns, dc.num_subagent_groups, + dc.total_in, dc.total_out, dc.total_cached, dc.structure + from dataset_conversations dc + join datasets d on d.id = dc.dataset_id + where d.slug = ${slug} and dc.conv_id = ${convId} + `) as unknown as ConversationDetail[]; + const row = rows[0]; + if (!row) return null; + return { + ...row, + num_turns: Number(row.num_turns), + num_subagent_groups: Number(row.num_subagent_groups), + total_in: Number(row.total_in), + total_out: Number(row.total_out), + total_cached: Number(row.total_cached), + }; +} diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts new file mode 100644 index 00000000..afc5b22d --- /dev/null +++ b/packages/db/src/queries/derived-agentic-metrics.test.ts @@ -0,0 +1,111 @@ +import { describe, expect, it } from 'vitest'; + +import { computeDerivedFromBlob } from './derived-agentic-metrics.js'; + +/** Build one aiperf JSONL record for the synthetic fixture. 
*/ +function rec( + conversation_id: string, + turn_index: number, + fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number }, +): string { + return JSON.stringify({ + metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' }, + metrics: { + request_latency: { value: fields.latency_ms, unit: 'ms' }, + time_to_first_token: { value: fields.ttft_ms, unit: 'ms' }, + input_sequence_length: { value: fields.isl, unit: 'tokens' }, + output_sequence_length: { value: fields.osl, unit: 'tokens' }, + }, + }); +} + +describe('computeDerivedFromBlob', () => { + it('returns nulls when no usable records', () => { + const out = computeDerivedFromBlob(''); + expect(out.normalized_session_time_s).toBeNull(); + expect(out.p90_prefill_tps_per_user).toBeNull(); + expect(out.normalized_e2e_400).toBeNull(); + }); + + it('normalizes each request to 400 output tokens before taking percentiles', () => { + const jsonl = [ + // Both requests have TTFT=2s and ITL=20ms, despite very different OSL/E2E. + rec('s1', 0, { isl: 100, osl: 100, ttft_ms: 2000, latency_ms: 3980 }), + rec('s2', 0, { isl: 100, osl: 1000, ttft_ms: 2000, latency_ms: 21_980 }), + ].join('\n'); + + const out = computeDerivedFromBlob(jsonl); + // 2s TTFT + 399 × 20ms ITL = 9.98s for both requests. + expect(out.normalized_e2e_400?.n).toBe(2); + expect(out.normalized_e2e_400?.p75).toBeCloseTo(9.98, 8); + expect(out.normalized_e2e_400?.p90).toBeCloseTo(9.98, 8); + }); + + it('rescales single-session time and computes P90 prefill', () => { + // One session, two turns. load = (100+50) + (200+50) = 400. + // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s. 
+ const jsonl = [ + rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }), + ].join('\n'); + const out = computeDerivedFromBlob(jsonl); + expect(out.normalized_session_time_s).toBeCloseTo(3, 6); + // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200. + expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6); + }); + + it('rescales times across sessions with unequal load', () => { + // s1: 1 turn, load = 100, T = 1s + // s2: 1 turn, load = 300, T = 3s + // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2 + // Mean T̃ = 2.0 + const jsonl = [ + rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }), + rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }), + ].join('\n'); + const out = computeDerivedFromBlob(jsonl); + expect(out.normalized_session_time_s).toBeCloseTo(2, 6); + }); + + it('drops records missing required fields and skips non-profiling phase', () => { + const lines = [ + rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + // missing TTFT — should be skipped + JSON.stringify({ + metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' }, + metrics: { + request_latency: { value: 1000, unit: 'ms' }, + input_sequence_length: { value: 100, unit: 'tokens' }, + output_sequence_length: { value: 50, unit: 'tokens' }, + }, + }), + // warmup phase — should be skipped + JSON.stringify({ + metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' }, + metrics: { + request_latency: { value: 9999, unit: 'ms' }, + time_to_first_token: { value: 9999, unit: 'ms' }, + input_sequence_length: { value: 100, unit: 'tokens' }, + output_sequence_length: { value: 50, unit: 'tokens' }, + }, + }), + ]; + const out = computeDerivedFromBlob(lines.join('\n')); + expect(out.normalized_session_time_s).toBeCloseTo(1, 6); + expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6); + }); + + 
it('p90 across turns: 10-turn session picks the right rank', () => { + // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910. + const turns = Array.from({ length: 10 }, (_, i) => + rec('s1', i, { + isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens + osl: 10, + ttft_ms: 1000, // 1 second → rates: 100..1000 tps + latency_ms: 1500, + }), + ); + const out = computeDerivedFromBlob(turns.join('\n')); + expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6); + }); +}); diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts new file mode 100644 index 00000000..8e5d15c9 --- /dev/null +++ b/packages/db/src/queries/derived-agentic-metrics.ts @@ -0,0 +1,268 @@ +/** + * Live-computed per-point metrics derived from the stored aiperf + * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB + * because they require grouping by `conversation_id` and aggregating per + * session — work that's cheap once per agentic point but adds up to be + * meaningful only when actually plotted. + * + * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal + * (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of + * per-turn `request_latency` per session (inter-turn tool/thinking gaps are + * inherently excluded since we only sum the active GPU time, not wallclock). + * Each session's time is rescaled by `mean_load / session_load`, where load + * is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions. + * + * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart. + * Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn + * in every session — the per-session percentile + cross-session mean + * sandwich was discarded because it just dampens tail behavior. 
+ */ + +import { gunzipSync } from 'node:zlib'; + +import { NORMALIZED_E2E_OUTPUT_TOKENS } from '@semianalysisai/inferencex-constants'; + +import type { DbClient } from '../connection.js'; +import { STATS_VERSION } from './agentic-aggregates'; +import { + fetchAggregateStatsRows, + meanOf, + percentilesOf, + quantile, + readNum, + type MetricPercentiles, +} from './agentic-shared'; + +export interface DerivedAgenticMetric { + /** benchmark_results.id this entry belongs to. */ + id: number; + /** Mean normalized session time in seconds. */ + normalized_session_time_s: number | null; + /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */ + p90_prefill_tps_per_user: number | null; + /** P75 normalized per-request E2E at a fixed 400-token output length. */ + p75_normalized_e2e_400_s: number | null; + /** P90 normalized per-request E2E at a fixed 400-token output length. */ + p90_normalized_e2e_400_s: number | null; +} + +export type DerivedAgenticMetricMap = Record; + +/** + * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless + * HTTP driver caps responses at 64 MB — chunk to stay well under. + */ +const QUERY_CHUNK_SIZE = 6; + +interface RecordMetrics { + request_latency?: { value?: number; unit?: string } | number; + time_to_first_token?: { value?: number; unit?: string } | number; + input_sequence_length?: { value?: number } | number; + output_sequence_length?: { value?: number } | number; +} + +interface RecordMetadata { + conversation_id?: string; + turn_index?: number; + benchmark_phase?: string; +} + +interface ProfileRecord { + metadata?: RecordMetadata; + metrics?: RecordMetrics; +} + +interface TurnFields { + request_latency_ms: number; + ttft_ms: number; + isl: number; + osl: number; +} + +function extractTurn(rec: ProfileRecord): TurnFields | null { + const m = rec.metrics ?? 
{}; + const rl = readNum(m.request_latency); + const tt = readNum(m.time_to_first_token); + const isl = readNum(m.input_sequence_length); + const osl = readNum(m.output_sequence_length); + if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null; + if (rl <= 0 || tt <= 0 || isl <= 0) return null; + return { request_latency_ms: rl, ttft_ms: tt, isl, osl }; +} + +/** + * Parse one point's JSONL and return the two derived metrics. Returns + * `{ session_time: null, prefill: null }` if the blob has no usable records. + */ +export function computeDerivedFromBlob(jsonl: string): { + normalized_session_time_s: number | null; + p90_prefill_tps_per_user: number | null; + normalized_e2e_400: MetricPercentiles | null; +} { + // Group records by conversation_id, filter to the profiling phase. + const bySession = new Map(); + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: ProfileRecord; + try { + rec = JSON.parse(line) as ProfileRecord; + } catch { + continue; + } + if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue; + const sid = rec.metadata?.conversation_id; + if (!sid) continue; + const turn = extractTurn(rec); + if (!turn) continue; + let list = bySession.get(sid); + if (!list) { + list = []; + bySession.set(sid, list); + } + list.push(turn); + } + if (bySession.size === 0) { + return { + normalized_session_time_s: null, + p90_prefill_tps_per_user: null, + normalized_e2e_400: null, + }; + } + + // Per-session aggregates for session time; per-turn prefill rates pool into + // a single global array so the percentile sees the full distribution. 
+ const sessionTimesS: number[] = []; + const sessionLoads: number[] = []; + const allPrefillRates: number[] = []; + const allNormalizedE2eS: number[] = []; + for (const turns of bySession.values()) { + let timeMs = 0; + let load = 0; + for (const t of turns) { + timeMs += t.request_latency_ms; + load += t.isl + t.osl; + const ttftSec = t.ttft_ms / 1000; + if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec); + + // Keep the observed TTFT, then project the request's mean decode + // interval to a fixed output length. Do this per request before taking + // percentiles so long original outputs do not dominate the tail. + const observedDecodeIntervals = Math.max(t.osl - 1, 1); + const itlMs = (t.request_latency_ms - t.ttft_ms) / observedDecodeIntervals; + const normalizedMs = t.ttft_ms + (NORMALIZED_E2E_OUTPUT_TOKENS - 1) * itlMs; + if ( + Number.isFinite(itlMs) && + itlMs >= 0 && + Number.isFinite(normalizedMs) && + normalizedMs > 0 + ) { + allNormalizedE2eS.push(normalizedMs / 1000); + } + } + if (load > 0) { + sessionTimesS.push(timeMs / 1000); + sessionLoads.push(load); + } + } + + // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean. + let normalized: number | null = null; + if (sessionTimesS.length > 0) { + const meanLoad = meanOf(sessionLoads); + if (meanLoad > 0) { + const scaled: number[] = []; + for (let i = 0; i < sessionTimesS.length; i++) { + const ti = sessionTimesS[i]!; + const li = sessionLoads[i]!; + if (li > 0) scaled.push(ti * (meanLoad / li)); + } + normalized = scaled.length > 0 ? 
meanOf(scaled) : null; + } + } + + let prefill: number | null = null; + if (allPrefillRates.length > 0) { + allPrefillRates.sort((a, b) => a - b); + prefill = quantile(allPrefillRates, 0.9); + } + + return { + normalized_session_time_s: normalized, + p90_prefill_tps_per_user: prefill, + normalized_e2e_400: percentilesOf(allNormalizedE2eS), + }; +} + +export async function getDerivedAgenticMetrics( + sql: DbClient, + benchmarkResultIds: number[], +): Promise { + if (benchmarkResultIds.length === 0) return {}; + + const result: DerivedAgenticMetricMap = {}; + + // Fast path: read the pre-computed values out of `aggregate_stats`. The + // ingest pipeline computes both metrics in the same pass that produces the + // percentile bundles, so a single SQL round-trip covers most ids without + // touching the gzipped profile blob. + const statsRows = await fetchAggregateStatsRows<{ + version?: number; + normalizedSessionTimeS?: number | null; + p90PrefillTpsPerUser?: number | null; + normalizedE2e400?: MetricPercentiles | null; + }>(sql, benchmarkResultIds); + + const idsNeedingBlob: number[] = []; + for (const row of statsRows) { + const id = Number(row.benchmark_result_id); + if (row.stats && Number(row.stats.version) === STATS_VERSION) { + result[id] = { + id, + normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null, + p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null, + p75_normalized_e2e_400_s: row.stats.normalizedE2e400?.p75 ?? null, + p90_normalized_e2e_400_s: row.stats.normalizedE2e400?.p90 ?? null, + }; + } else { + idsNeedingBlob.push(id); + } + } + + if (idsNeedingBlob.length === 0) return result; + + // Fallback: parse the profile blob directly. Used for rows whose + // `aggregate_stats` is null or computed by an older STATS_VERSION; the + // backfill script drains the population so this path should be rare. 
+ const rows: { benchmark_result_id: number; blob: Buffer }[] = []; + for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) { + const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE); + const chunkRows = (await sql` + select + br.id as benchmark_result_id, + atr.profile_export_jsonl_gz as blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + and atr.profile_export_jsonl_gz is not null + `) as { benchmark_result_id: number; blob: Buffer }[]; + rows.push(...chunkRows); + } + + for (const row of rows) { + try { + const jsonl = gunzipSync(row.blob).toString('utf8'); + const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } = + computeDerivedFromBlob(jsonl); + result[Number(row.benchmark_result_id)] = { + id: Number(row.benchmark_result_id), + normalized_session_time_s, + p90_prefill_tps_per_user, + p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null, + p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null, + }; + } catch { + // Skip malformed blobs silently — frontend treats missing ids as "no data". + } + } + return result; +} diff --git a/packages/db/src/queries/request-timeline.test.ts b/packages/db/src/queries/request-timeline.test.ts new file mode 100644 index 00000000..62ba5385 --- /dev/null +++ b/packages/db/src/queries/request-timeline.test.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; +import type { DbClient } from '../connection.js'; + +import { getRequestTimeline } from './request-timeline'; + +function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } { + const responses = [...queue]; + const calls: string[] = []; + const sql = ((strings: TemplateStringsArray) => { + calls.push(strings.join('?')); + return Promise.resolve(responses.shift() ?? 
[]); + }) as unknown as DbClient; + return { sql, calls }; +} + +const timeline: RequestTimeline = { + version: REQUEST_TIMELINE_VERSION, + startNs: 100, + endNs: 200, + durationS: 0.0000001, + requests: [], +}; + +describe('getRequestTimeline', () => { + it('returns the current precomputed timeline without selecting the raw profile blob', async () => { + const { sql, calls } = mockSql([ + [{ trace_replay_id: 870, has_blob: true, request_timeline: timeline }], + ]); + + await expect(getRequestTimeline(sql, 422991)).resolves.toEqual(timeline); + expect(calls).toHaveLength(1); + expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob'); + }); + + it('does not fetch a blob when neither a current timeline nor a blob exists', async () => { + const { sql, calls } = mockSql([ + [{ trace_replay_id: 870, has_blob: false, request_timeline: null }], + ]); + + await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull(); + expect(calls).toHaveLength(1); + }); +}); diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts new file mode 100644 index 00000000..2a6bb40c --- /dev/null +++ b/packages/db/src/queries/request-timeline.ts @@ -0,0 +1,64 @@ +/** + * Per-request timeline for the agentic detail page's Gantt view. + * + * Backed by `agentic_trace_replay.request_timeline` (pre-computed at + * ingest time, see `etl/compute-request-timeline.ts`). The fast path is + * a single SQL row read; the slow path re-computes from + * `profile_export_jsonl_gz` and is only taken when the column is missing + * or the stored `REQUEST_TIMELINE_VERSION` is stale. 
+ */ + +import { + REQUEST_TIMELINE_VERSION, + computeRequestTimeline, + type RequestTimeline, +} from '../etl/compute-request-timeline'; + +import type { DbClient } from '../connection.js'; + +export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline'; + +interface RawMetaRow { + trace_replay_id: number; + has_blob: boolean; + request_timeline: RequestTimeline | null; +} + +interface RawBlobRow { + blob: Buffer | null; +} + +export async function getRequestTimeline( + sql: DbClient, + benchmarkResultId: number, +): Promise { + const rows = (await sql` + select + atr.id as trace_replay_id, + (atr.profile_export_jsonl_gz is not null) as has_blob, + atr.request_timeline + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = ${benchmarkResultId} + `) as unknown as RawMetaRow[]; + const row = rows[0]; + if (!row) return null; + + // Fast path: pre-computed timeline at the current version. + if (row.request_timeline && Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION) { + return row.request_timeline; + } + + if (!row.has_blob) return null; + + // Slow path only: fetch the large profile blob after establishing that the + // pre-computed timeline is stale or missing. Long trace runs can have blobs + // large enough to exceed Neon's 64 MiB encoded-response limit, so the fast + // path must never select the blob alongside request_timeline. + const blobRows = (await sql` + select profile_export_jsonl_gz as blob + from agentic_trace_replay + where id = ${row.trace_replay_id} + `) as unknown as RawBlobRow[]; + return computeRequestTimeline(blobRows[0]?.blob ?? null); +} diff --git a/packages/db/src/queries/trace-availability.ts b/packages/db/src/queries/trace-availability.ts new file mode 100644 index 00000000..155b3d4c --- /dev/null +++ b/packages/db/src/queries/trace-availability.ts @@ -0,0 +1,34 @@ +/** + * Bulk "does this point have a trace_replay blob?" lookup. 
Used by the + * inference scatter chart to decide whether to render a "View charts" + * button in the pinned tooltip — a pure presence check that doesn't need + * the multi-megabyte blob payload `getTraceHistograms` ships. + * + * Going through `trace-histograms` for this trips Neon's 64 MB + * per-HTTP-response cap as soon as one chunk's combined gzip payload + * exceeds the cap (high-conc 8×8 rows can be 13 MB compressed each). + */ + +import type { DbClient } from '../connection.js'; + +/** Map of `benchmark_results.id` → true for each id that has a trace_replay blob. */ +export type TraceAvailabilityMap = Record; + +export async function getTraceAvailability( + sql: DbClient, + benchmarkResultIds: number[], +): Promise { + if (benchmarkResultIds.length === 0) return {}; + + const rows = (await sql` + select br.id + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${benchmarkResultIds}::bigint[]) + and atr.profile_export_jsonl_gz is not null + `) as { id: number }[]; + + const result: TraceAvailabilityMap = {}; + for (const row of rows) result[Number(row.id)] = true; + return result; +} diff --git a/packages/db/src/queries/trace-histograms.test.ts b/packages/db/src/queries/trace-histograms.test.ts new file mode 100644 index 00000000..c3c6ec8a --- /dev/null +++ b/packages/db/src/queries/trace-histograms.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; +import type { DbClient } from '../connection.js'; + +import { getTraceHistograms } from './trace-histograms'; + +function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } { + const responses = [...queue]; + const calls: string[] = []; + const sql = ((strings: TemplateStringsArray) => { + calls.push(strings.join('?')); + return Promise.resolve(responses.shift() ?? 
[]); + }) as unknown as DbClient; + return { sql, calls }; +} + +const timeline: RequestTimeline = { + version: REQUEST_TIMELINE_VERSION, + startNs: 0, + endNs: 10, + durationS: 0.00000001, + requests: [ + { + cid: 'session-1', + ti: 0, + wid: '0', + ad: 0, + phase: 'profiling', + credit: 0, + start: 1, + ack: 2, + end: 3, + ttftMs: 1, + tpotMs: 2, + isl: 4096, + osl: 512, + cancelled: false, + }, + { + cid: 'session-1', + ti: 1, + wid: '0', + ad: 0, + phase: 'profiling', + credit: 4, + start: 5, + ack: 6, + end: 7, + ttftMs: 1, + tpotMs: 2, + isl: null, + osl: 128, + cancelled: false, + }, + ], +}; + +describe('getTraceHistograms', () => { + it('builds distributions from the precomputed timeline without selecting the raw blob', async () => { + const { sql, calls } = mockSql([ + [ + { + benchmark_result_id: 422991, + trace_replay_id: 870, + request_timeline: timeline, + has_blob: true, + }, + ], + ]); + + await expect(getTraceHistograms(sql, [422991])).resolves.toEqual({ + 422991: { id: 422991, isl: [4096], osl: [512, 128] }, + }); + expect(calls).toHaveLength(1); + expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob'); + }); +}); diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts new file mode 100644 index 00000000..24b96c35 --- /dev/null +++ b/packages/db/src/queries/trace-histograms.ts @@ -0,0 +1,134 @@ +/** + * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl` + * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller + * passes the set of `benchmark_results.id`s it wants and receives one entry + * per id that actually has a trace_replay blob (others are silently skipped). + * + * The JSONL has one JSON object per request with the shape: + * { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... 
} } + * + * Returns raw arrays rather than pre-binned histograms — payload stays tiny + * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin + * however it wants. + */ + +import { gunzipSync } from 'node:zlib'; + +import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; + +import type { DbClient } from '../connection.js'; + +export interface TraceHistogramPoint { + /** benchmark_results.id this entry belongs to. */ + id: number; + /** Input sequence length (tokens) per completed request. */ + isl: number[]; + /** Output sequence length (tokens) per completed request. */ + osl: number[]; +} + +export type TraceHistogramMap = Record; + +const QUERY_CHUNK_SIZE = 12; +// Bytea values expand in Neon's JSON-over-HTTP response. Keep raw fallback +// reads comfortably below its 64 MiB response cap; current ingests should use +// request_timeline instead and never need this path. +const MAX_FALLBACK_BLOB_BYTES = 24 * 1024 * 1024; + +interface TimelineRow { + benchmark_result_id: number; + trace_replay_id: number; + request_timeline: RequestTimeline | null; + has_blob: boolean; +} + +function histogramFromTimeline(id: number, timeline: RequestTimeline): TraceHistogramPoint { + const isl: number[] = []; + const osl: number[] = []; + for (const request of timeline.requests) { + if (typeof request.isl === 'number' && Number.isFinite(request.isl)) isl.push(request.isl); + if (typeof request.osl === 'number' && Number.isFinite(request.osl)) osl.push(request.osl); + } + return { id, isl, osl }; +} + +export async function getTraceHistograms( + sql: DbClient, + benchmarkResultIds: number[], +): Promise { + if (benchmarkResultIds.length === 0) return {}; + + const result: TraceHistogramMap = {}; + const fallbackRows: TimelineRow[] = []; + for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) { + const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE); + const chunkRows = (await sql` + select + 
br.id as benchmark_result_id, + atr.id as trace_replay_id, + atr.request_timeline, + (atr.profile_export_jsonl_gz is not null) as has_blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + `) as unknown as TimelineRow[]; + for (const row of chunkRows) { + const id = Number(row.benchmark_result_id); + if ( + row.request_timeline && + Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION + ) { + result[id] = histogramFromTimeline(id, row.request_timeline); + } else if (row.has_blob) { + fallbackRows.push(row); + } + } + } + + // Compatibility fallback for pre-timeline rows. Fetch one small blob at a + // time; oversized legacy rows are omitted instead of turning the whole API + // response into a 507. + for (const row of fallbackRows) { + const blobRows = (await sql` + select profile_export_jsonl_gz as blob + from agentic_trace_replay + where id = ${row.trace_replay_id} + and octet_length(profile_export_jsonl_gz) <= ${MAX_FALLBACK_BLOB_BYTES} + `) as unknown as { blob: Buffer }[]; + const blob = blobRows[0]?.blob; + if (!blob) continue; + try { + const jsonl = gunzipSync(blob).toString('utf8'); + const isl: number[] = []; + const osl: number[] = []; + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: { metrics?: Record }; + try { + rec = JSON.parse(line); + } catch { + continue; + } + const m = rec.metrics ?? {}; + const islVal = readMetric(m['input_sequence_length']); + const oslVal = readMetric(m['output_sequence_length']); + if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal); + if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal); + } + result[Number(row.benchmark_result_id)] = { + id: Number(row.benchmark_result_id), + isl, + osl, + }; + } catch { + // Drop malformed blobs silently — caller treats missing ids as "no data". 
+ } + } + return result; +} + +function readMetric(v: { value?: number } | number | undefined): number | undefined { + if (v === undefined || v === null) return undefined; + if (typeof v === 'number') return v; + return v.value; +} diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts new file mode 100644 index 00000000..f045dfda --- /dev/null +++ b/packages/db/src/queries/trace-server-metrics.test.ts @@ -0,0 +1,105 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { CHART_SERIES_VERSION, type ChartSeries } from '../etl/compute-chart-series'; +import type { DbClient } from '../connection.js'; + +import { getTraceServerMetrics } from './trace-server-metrics'; + +function currentSeries(): ChartSeries { + return { + version: CHART_SERIES_VERSION, + startNs: 0, + endNs: 1e9, + durationS: 1, + timeslicesCount: 1, + kvCacheUsage: [], + prefixCacheHitRate: [], + queueDepth: [], + promptTokensBySource: {}, + prefillTps: [{ t: 0, value: 100 }], + decodeTps: [], + prefixCacheHitsTps: [], + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], + metricSources: [], + }; +} + +function metaRow(overrides: Record = {}) { + return { + id: 42, + trace_replay_id: 7, + has_blob: true, + chart_series: currentSeries(), + hardware: 'gb200', + framework: 'dynamo-vllm', + model: 'deepseek-r1-0528', + precision: 'fp8', + spec_method: 'none', + disagg: true, + conc: 128, + offload_mode: 'off', + isl: null, + osl: null, + benchmark_type: 'agentic_traces', + date: '2026-06-23', + run_url: null, + server_gpu_cache_hit_rate: null, + server_cpu_cache_hit_rate: null, + kv_cache_pool_tokens: null, + ...overrides, + }; +} + +function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } { + const responses = [...queue]; + const calls: string[] = []; + const sql = ((strings: TemplateStringsArray) => { + calls.push(strings.join('?')); + return Promise.resolve(responses.shift() 
?? []); + }) as unknown as DbClient; + return { sql, calls }; +} + +describe('getTraceServerMetrics', () => { + it('returns current precomputed series without selecting the raw blob', async () => { + const { sql, calls } = mockSql([[metaRow()]]); + + const result = await getTraceServerMetrics(sql, 42); + + expect(result?.prefillTps).toEqual([{ t: 0, value: 100 }]); + expect(calls).toHaveLength(1); + expect(calls[0]).not.toContain('server_metrics_json_gz as blob'); + }); + + it('fetches and computes the raw blob only when chart_series is stale', async () => { + const raw = gzipSync( + Buffer.from( + JSON.stringify({ + metrics: { + 'vllm:prompt_tokens': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 321 }] }], + }, + }, + }), + ), + ); + const stale = { ...currentSeries(), version: CHART_SERIES_VERSION - 1 }; + const { sql, calls } = mockSql([[metaRow({ chart_series: stale })], [{ blob: raw }]]); + + const result = await getTraceServerMetrics(sql, 42); + + expect(result?.prefillTps).toEqual([{ t: 0, value: 321 }]); + expect(calls).toHaveLength(2); + expect(calls[1]).toContain('server_metrics_json_gz as blob'); + }); + + it('returns null without a blob and does not issue a second query', async () => { + const { sql, calls } = mockSql([[metaRow({ has_blob: false, chart_series: null })]]); + + await expect(getTraceServerMetrics(sql, 42)).resolves.toBeNull(); + expect(calls).toHaveLength(1); + }); +}); diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts new file mode 100644 index 00000000..d24d0879 --- /dev/null +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -0,0 +1,211 @@ +/** + * Time-series view of one agentic benchmark point: chart-ready arrays for + * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS, + * and per-source prompt-token counts. 
+ * + * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest + * time, see `etl/compute-chart-series.ts`). The fast path is a single SQL + * row read; the slow path re-computes from `server_metrics_json_gz` and is + * only taken when the column is missing or the stored + * `CHART_SERIES_VERSION` is stale (the backfill script should drain that). + */ + +import { + CHART_SERIES_VERSION, + computeChartSeries, + type ChartSeries, + type MetricSourceSeries, + type QueueDepthPoint, + type TimeSeriesPoint, +} from '../etl/compute-chart-series'; + +import type { DbClient } from '../connection.js'; + +export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series'; + +export interface PointMeta { + id: number; + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + disagg: boolean; + conc: number; + offload_mode: string | null; + isl: number | null; + osl: number | null; + benchmark_type: string; + date: string; + /** GitHub Actions run URL for jumping to the source. */ + run_url: string | null; + /** Cumulative end-of-run cache-hit number the dashboard already shows. */ + server_gpu_cache_hit_rate: number | null; + /** Cumulative end-of-run CPU offload cache-hit. */ + server_cpu_cache_hit_rate: number | null; +} + +export interface TraceServerMetrics { + /** Point context — hardware, model, conc, etc. for the page header. */ + meta: PointMeta; + /** ns wall-clock of the first window's start; for debugging only. */ + startNs: number; + /** ns wall-clock of the last window's end. */ + endNs: number; + /** Total benchmark window in seconds. */ + durationS: number; + /** Number of 1Hz windows captured. */ + timeslicesCount: number; + /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */ + kvCacheUsage: TimeSeriesPoint[]; + /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). 
*/ + prefixCacheHitRate: TimeSeriesPoint[]; + /** Request queue depth: running, waiting, total per scrape. */ + queueDepth: QueueDepthPoint[]; + /** + * Per-source prompt-token counts over time (counter rate per scrape). + * Keyed by the value of the `source` label (typically `local_cache_hit`, + * `external_cache_hit`, `miss`, etc.). Plot as stacked area. + */ + promptTokensBySource: Record; + /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. */ + prefillTps: TimeSeriesPoint[]; + /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */ + decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ + prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */ + hostKvCacheUsage: TimeSeriesPoint[]; + /** + * Per-DP-rank KV cache utilization. Empty for single-engine deployments — + * the cluster-average `kvCacheUsage` line covers that case alone. + */ + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** + * Total KV-cache pool size in tokens (num_gpu_blocks × block_size, summed + * across engines). vLLM only — null for SGLang/TRT or older rows. + */ + kvCachePoolTokens: number | null; + /** Orchestrator-normalized metrics grouped by endpoint/worker. */ + metricSources: MetricSourceSeries[]; +} + +interface RawMetaRow extends PointMeta { + trace_replay_id: number | null; + has_blob: boolean; + chart_series: ChartSeries | null; + /** Derived at server-log ingest from "GPU KV cache size: N tokens" lines. 
*/ + kv_cache_pool_tokens: string | null; +} + +interface RawBlobRow { + blob: Buffer | null; +} + +function buildMeta(row: RawMetaRow): PointMeta { + return { + id: Number(row.id), + hardware: row.hardware, + framework: row.framework, + model: row.model, + precision: row.precision, + spec_method: row.spec_method, + disagg: row.disagg, + conc: row.conc, + offload_mode: row.offload_mode, + isl: row.isl, + osl: row.osl, + benchmark_type: row.benchmark_type, + date: row.date, + run_url: row.run_url, + server_gpu_cache_hit_rate: + row.server_gpu_cache_hit_rate === null ? null : Number(row.server_gpu_cache_hit_rate), + server_cpu_cache_hit_rate: + row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate), + }; +} + +function merge( + meta: PointMeta, + series: ChartSeries, + kvCachePoolTokens: number | null, +): TraceServerMetrics { + return { + meta, + kvCachePoolTokens, + startNs: series.startNs, + endNs: series.endNs, + durationS: series.durationS, + timeslicesCount: series.timeslicesCount, + kvCacheUsage: series.kvCacheUsage, + prefixCacheHitRate: series.prefixCacheHitRate, + queueDepth: series.queueDepth, + promptTokensBySource: series.promptTokensBySource, + prefillTps: series.prefillTps, + decodeTps: series.decodeTps, + // v2 chart_series rows pre-backfill don't have this field — default to [] + prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], + hostKvCacheUsage: series.hostKvCacheUsage ?? [], + // v8+ field; older chart_series rows lack it → omit per-engine overlay. + kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [], + // v9+ field; old rows are served without a source selector until backfilled. + metricSources: series.metricSources ?? 
[], + }; +} + +export async function getTraceServerMetrics( + sql: DbClient, + benchmarkResultId: number, +): Promise { + const rows = (await sql` + select + br.trace_replay_id, + (atr.server_metrics_json_gz is not null) as has_blob, + atr.chart_series, + br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg, + br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type, + br.date::text, + case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url, + (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate, + (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate, + (br.metrics ->> 'kv_cache_pool_tokens')::numeric as kv_cache_pool_tokens + from benchmark_results br + join configs c on c.id = br.config_id + join workflow_runs wr on wr.id = br.workflow_run_id + left join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = ${benchmarkResultId} + `) as unknown as RawMetaRow[]; + const row = rows[0]; + if (!row) return null; + if (!row.has_blob || row.trace_replay_id === null) return null; + const meta = buildMeta(row); + const kvCachePoolTokens = + row.kv_cache_pool_tokens === null ? null : Number(row.kv_cache_pool_tokens); + + // Fast path: pre-computed chart_series at the current version. + if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) { + return merge(meta, row.chart_series, kvCachePoolTokens); + } + + // Slow path only: fetch the large raw blob after establishing that the + // pre-computed series is missing or stale. Disaggregated blobs can be tens + // of MB compressed, so selecting this in the metadata query defeats the + // fast path even when chart_series is current. 
+ const blobRows = (await sql` + select server_metrics_json_gz as blob + from agentic_trace_replay + where id = ${row.trace_replay_id} + `) as unknown as RawBlobRow[]; + const blob = blobRows[0]?.blob; + if (!blob) return null; + + // `computeChartSeries` handles + // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP + // rows succeed even before the backfill drains them. + const series = await computeChartSeries(blob, { + framework: row.framework, + disagg: row.disagg, + }); + if (!series) return null; + return merge(meta, series, kvCachePoolTokens); +} diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts index dfcb9e9f..01e13dd8 100644 --- a/packages/db/src/queries/workflow-info.ts +++ b/packages/db/src/queries/workflow-info.ts @@ -129,20 +129,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise { const rows = await sql` - SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text + SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text FROM availability a WHERE EXISTS ( SELECT 1 @@ -153,8 +155,9 @@ export async function getAvailabilityData(sql: DbClient): Promise Date: Thu, 2 Jul 2026 14:11:41 -0500 Subject: [PATCH 05/40] ci: agentic ingest dispatch workflow and ingest agent docs --- .claude/agents/ingest.md | 196 +++++++++++++++++++ .github/workflows/ingest-agentic-results.yml | 180 +++++++++++++++++ 2 files changed, 376 insertions(+) create mode 100644 .claude/agents/ingest.md create mode 100644 .github/workflows/ingest-agentic-results.yml diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md new file mode 100644 index 00000000..10e37d6c --- /dev/null +++ b/.claude/agents/ingest.md @@ -0,0 +1,196 @@ +--- +name: ingest +description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. 
The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL. +tools: Bash, Read, Edit, Write +--- + +You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`. + +## Environment + +- **Repo root**: `/Users/quilicic/InferenceX-app` +- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements: + - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname. + - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`. +- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000) +- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app` +- **INVALIDATE_SECRET** lives in repo root `.env` under that key. +- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var. 
+ +## Standard ingest + +```bash +cd /Users/quilicic/InferenceX-app/packages/db +DATABASE_WRITE_URL='<write-url>' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts --download <run-id> SemiAnalysisAI/InferenceX +``` + +Then refresh the materialized view (the script's auto-refresh sometimes races): +`REFRESH MATERIALIZED VIEW latest_benchmarks;` + +## Cache purge (always do after any DB mutation) + +```bash +SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2 | tr -d '"') +# Localhost (port 3002, NOT 3000) +curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate +# Preview +mkdir -p /tmp/vp && cd /tmp/vp \ + && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \ + && vercel curl /api/v1/invalidate \ + --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \ + --yes -- -sS -X POST -H "Authorization: Bearer $SECRET" +rm -rf /tmp/vp +``` + +## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision)) + +```sql +BEGIN; +DELETE FROM benchmark_results br USING configs c +WHERE c.id = br.config_id + AND c.model = '<model>' AND c.hardware = '<hardware>' AND c.framework = '<framework>' + AND c.precision = '<precision>' AND br.benchmark_type = '<benchmark-type>'; +DELETE FROM availability +WHERE model = '<model>' AND hardware = '<hardware>' AND framework = '<framework>' + AND precision = '<precision>' AND benchmark_type = '<benchmark-type>'; +COMMIT; +``` + +If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. Don't do this unless asked. + +## AIPerf tagging — DO NOT use by default + +AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name.
Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision). + +Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`. + +<details>
+<summary>Explicit-request-only: how to tag a run as `spec_decoding='aiperf'`</summary> + +```bash +RID=<run-id> +TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX) +cd $TMPDIR + +# 1. Logical-name dedup + download +gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \ + --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \ + | python3 -c " +import sys, re, collections +seen = collections.OrderedDict() +for line in sys.stdin: + name, url, created = line.rstrip('\n').split('\t') + key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name) + if key not in seen or seen[key][2] < created: + seen[key] = (name, url, created) +for _, (name, url, _) in seen.items(): + print(f'{name}\t{url}') +" > artifacts.tsv +while IFS=$'\t' read -r name url; do + mkdir -p "$name" + gh api "$url" > "$name/a.zip" 2>/dev/null + unzip -oq "$name/a.zip" -d "$name" 2>/dev/null + rm "$name/a.zip" +done < artifacts.tsv + +# 2. Patch every benchmark JSON to set spec_decoding=aiperf +find $TMPDIR -name "*.json" | python3 -c " +import sys, json +for fn in (l.strip() for l in sys.stdin): + try: + with open(fn) as f: d = json.load(f) + except Exception: continue + rows = d if isinstance(d, list) else [d] + if not rows or not isinstance(rows[0], dict): continue + changed = False + for row in rows: + if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row): + row['spec_decoding'] = 'aiperf' + changed = True + if changed: + with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f) +" + +# 3. Ingest in CI mode (reads INGEST_* env vars) +cd /Users/quilicic/InferenceX-app/packages/db +INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \ +DATABASE_WRITE_URL='<write-url>' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts +rm -rf $TMPDIR +``` + +The `spec_method` column has a lowercase check constraint — always lowercase. + +</details>
+ +## Don't auto-mention "AIPerf" in changelog entries + +Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`. + +## Adding a perf changelog entry — MANDATORY for every ingest + +**You MUST ALWAYS add a changelog entry for every run you ingest. This is not optional.** Every standard ingest, delete+reingest, and partial ingest gets exactly one changelog entry. Never finish an ingest without one. + +- If the user gave changelog text, use it verbatim (substitute `<HW>` with the run's hardware SKU when the text contains that placeholder). +- If the user did NOT specify text, DO NOT skip the changelog — derive a sensible description from the run name (see convention below) and add it anyway, then tell the user what you used so they can adjust. + +Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `<model>-<precision>-<hardware>-<framework>` (matches what the user actually sees in the filter chain). + +```sql +INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link) +SELECT id, date, '', '', ARRAY['<model>-<precision>-<hardware>-<framework>'], '<description>', NULL +FROM latest_workflow_runs WHERE github_run_id = <run-id> +RETURNING id, workflow_run_id, date::text, description; +``` + +Description convention from prior entries: `<HW> <Model> Ingest #<N> (<note>)` — e.g. + +- `B200 Kimi Ingest #1` +- `MI355X Kimi Ingest #2` +- `H200 Kimi Ingest #1 (mmap cache)` + +If the user doesn't specify a description, DO NOT skip the entry and DO NOT block on asking — derive a description from the run name, add the entry, and report what you used so the user can adjust.
+ +## Common gotchas + +- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <workflow_run_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`. +- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT. +- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites. +- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the `_<runner>_<n>` suffix. +- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection. +- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with the prior full sweep via `UPDATE benchmark_results SET date = '<prior-sweep-date>' WHERE …` so the frontend's max-date-per-group dedup doesn't drop the older sweep. +- **Agentic interactivity normalization (`*_intvty`)**: for `agentic_traces` runs, interactivity MUST be the slow-tail reciprocal of the ITL percentile — `*_intvty = 1/*_itl` (so `p90_intvty = 1/p90_itl`). Some harness versions emit `*_intvty` as `p(1/ITL)` instead (fast-tail — inverts percentile order, e.g. p90 shows ~`1/p10(ITL)`), which silently contaminates cross-run Pareto comparisons. The ingest mapper (`benchmark-mapper.ts`) now **derives `*_intvty` from `*_itl` and discards the artifact's value** for agentic rows, so a normal ingest is self-correcting — no manual step needed.
The frontend `agenticAliases` does the same for overlay / `?unofficialrun=` rows. If you ever load agentic data through a path that bypasses the mapper, run `pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes` (idempotent; rewrites `<stat>_intvty = 1/<stat>_itl` for the mean/p75/p90/p95 stats) then refresh the MV + purge cache. `std_intvty` is intentionally left alone (the reciprocal of a std is meaningless; the API strips it anyway). + +## Process + +1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<run_id> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips). +2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding. +3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line. +4. **Refresh materialized view**. +5. **Add changelog entry — ALWAYS, MANDATORY.** Every ingest gets exactly one changelog entry (see "Adding a perf changelog entry — MANDATORY"). Use the user's text if given (substituting `<HW>` with the run's hardware SKU); otherwise derive one from the run name and add it anyway. Never skip this step. +6. **Purge both caches** (localhost 3002 + preview — never port 3000). +7. **Report** the row count, date, hardware, run id, and the changelog id (always present). + +## Related: ingesting agentic _datasets_ (not benchmark runs) + +This agent ingests **benchmark runs**.
The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow: + +```bash +cd packages/db && DATABASE_WRITE_URL='' \ + pnpm exec tsx src/ingest-weka-dataset.ts \ + [--label "…"] [--variant full|256k] [--description "…"] [--limit N] +``` + +It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). + +New agentic benchmark artifacts preserve AIPerf's `metadata.dataset` provenance as a top-level `dataset` object. Standard benchmark ingest automatically derives the dataset slug from `dataset.hf_dataset_name` and upserts `run_datasets`; do not manually backfill that mapping for new-format runs. Manual mapping is only needed for legacy artifacts that do not contain dataset provenance. + +## Don't + +- Don't push to git unless the user asked. +- Don't ingest without permission if it's a delete+reingest of existing data. +- Don't hit port 3000 for cache purge — it's a different project. +- Don't capitalize `spec_method` values (DB has a lowercase check constraint). 
diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml new file mode 100644 index 00000000..cf8366ea --- /dev/null +++ b/.github/workflows/ingest-agentic-results.yml @@ -0,0 +1,180 @@ +name: Ingest Agentic Benchmark Results + +# Dispatched from the main InferenceX repo at the end of an agentic (AgentX +# trace-replay) sweep, mirroring the fixed-seq-len `ingest-results` dispatch: +# +# curl -sSf -X POST \ +# -H "Authorization: Bearer $INFX_FRONTEND_PAT" \ +# -H "Accept: application/vnd.github+v3+json" \ +# https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \ +# -d '{"event_type": "ingest-agentic-results", +# "client_payload": {"run-id": "", "run-attempt": ""}}' +# +# The ingest script (packages/db/src/ingest-ci-run.ts) auto-detects agentic +# artifacts: benchmark rows land in benchmark_results (benchmark_type= +# 'agentic_traces'), raw profile exports + server metrics land in the +# agentic_trace_replay sidecar with precomputed chart/timeline JSONBs, the +# run is linked to its dataset in run_datasets, and changelog-metadata is +# ingested when present. This is a separate workflow from ingest-results.yml +# because agentic ingests are blob-heavy (100MB+ gzipped profile exports per +# high-concurrency point) and need a much longer timeout, plus +# agentic-specific alerting (missing dataset slug). + +on: + repository_dispatch: + types: [ingest-agentic-results] + +jobs: + ingest: + # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes + # far longer than a fixed-seq-len ingest. 
+ timeout-minutes: 60 + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Wait for source run to finish + run: sleep 300 + + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8 + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: '24' + cache: pnpm + - name: Install dependencies + run: pnpm install --filter @semianalysisai/inferencex-db... + env: + CYPRESS_INSTALL_BINARY: '0' + + - name: Run migrations + env: + DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + run: pnpm admin:db:migrate --yes + + - name: Download artifacts from InferenceX run + env: + GH_TOKEN: ${{ secrets.INFX_MAIN_PAT }} + RUN_ID: ${{ github.event.client_payload.run-id }} + ARTIFACTS_PATH: ${{ github.workspace }}/artifacts + run: | + mkdir -p "$ARTIFACTS_PATH" + + # Download all artifacts for the run, deduplicated by name (keep latest). + gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/${RUN_ID}/artifacts" --paginate \ + | jq -r ' + [.artifacts[]] + | group_by(.name) | map(sort_by(.created_at) | last)[] + | "\(.name)\t\(.archive_download_url)"' \ + | while IFS=$'\t' read -r name url; do + echo "Downloading artifact: ${name}" + ok=false + for attempt in 1 2 3; do + if gh api "${url}" > artifact.zip; then + ok=true + break + fi + echo " Attempt ${attempt}/3 failed, retrying in ${attempt}s..." + sleep "$attempt" + done + if [ "$ok" = false ]; then + echo "::warning::Failed to download artifact after 3 attempts: ${name} — skipping" + rm -f artifact.zip + echo "$name" >> "$ARTIFACTS_PATH/.failures" + continue + fi + mkdir -p "${ARTIFACTS_PATH}/${name}" + if ! 
unzip -o artifact.zip -d "${ARTIFACTS_PATH}/${name}"; then + echo "::warning::Failed to extract artifact: ${name} — skipping" + rm -rf "${ARTIFACTS_PATH:?}/${name}" + echo "$name" >> "$ARTIFACTS_PATH/.failures" + fi + rm -f artifact.zip + done + + if [ -f "$ARTIFACTS_PATH/.failures" ]; then + count=$(wc -l < "$ARTIFACTS_PATH/.failures") + rm "$ARTIFACTS_PATH/.failures" + echo "::warning::${count} artifact(s) failed to download; ingesting what's available" + fi + + echo "Downloaded artifacts:" + ls "$ARTIFACTS_PATH/" + + if [ -z "$(ls -A "$ARTIFACTS_PATH")" ]; then + echo "::error::No artifacts could be downloaded from run ${RUN_ID}" + exit 1 + fi + + - name: Ingest results to DB + env: + DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }} + INGEST_RUN_ID: ${{ github.event.client_payload.run-id }} + INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt }} + INGEST_ARTIFACTS_PATH: ${{ github.workspace }}/artifacts + INGEST_REPO: SemiAnalysisAI/InferenceX + UNMAPPED_ENTITIES_OUTPUT: ${{ github.workspace }}/unmapped-entities.json + run: pnpm admin:db:ingest:ci + + - name: Apply run overrides + env: + DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + run: pnpm admin:db:apply-overrides --yes + + - name: Verify database + env: + DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + run: pnpm admin:db:verify + + - name: Invalidate Vercel cache + env: + VERCEL_INVALIDATE_SECRET: ${{ secrets.VERCEL_INVALIDATE_SECRET }} + run: | + curl -sSf -X POST "https://inferencex.semianalysis.com/api/v1/invalidate" \ + -H "Authorization: Bearer $VERCEL_INVALIDATE_SECRET" || true + + - name: Check for unmapped entities + if: always() + id: unmapped + run: | + f="${{ github.workspace }}/unmapped-entities.json" + if [ -f "$f" ]; then + echo "found=true" >> "$GITHUB_OUTPUT" + models=$(jq -r '.models // [] | join(", ")' "$f") + hardware=$(jq -r '.hardware // [] | join(", ")' "$f") + precisions=$(jq -r '.precisions // [] | 
join(", ")' "$f") + datasets=$(jq -r '.datasets // [] | join(", ")' "$f") + msg="" + [ -n "$models" ] && msg="${msg}Models: ${models}\n" + [ -n "$hardware" ] && msg="${msg}Hardware: ${hardware}\n" + [ -n "$precisions" ] && msg="${msg}Precisions: ${precisions}\n" + [ -n "$datasets" ] && msg="${msg}Datasets missing from datasets table (run ingest-weka-dataset): ${datasets}\n" + { + echo 'summary<> "$GITHUB_OUTPUT" + fi + + - name: Notify Slack on unmapped entities + if: steps.unmapped.outputs.found == 'true' + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL }} + webhook-type: incoming-webhook + payload: | + { + "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" + } + + - name: Notify Slack on failure + if: failure() + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL }} + webhook-type: incoming-webhook + payload: | + { + "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" + } From 311eb3cc5d158887e3a075b20066458ea0261563 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:11:46 -0500 Subject: [PATCH 06/40] feat(api): v1 agentic + datasets endpoints and React Query hooks --- .../app/src/app/api/unofficial-run/route.ts | 6 + .../app/api/v1/agentic-aggregates/route.ts | 34 +++ .../app/api/v1/benchmark-siblings/route.ts | 28 +++ .../[slug]/conversations/[convId]/route.ts | 33 +++ .../v1/datasets/[slug]/conversations/route.ts | 53 +++++ .../src/app/api/v1/datasets/[slug]/route.ts | 29 +++ packages/app/src/app/api/v1/datasets/route.ts 
| 24 +++ .../api/v1/derived-agentic-metrics/route.ts | 43 ++++ packages/app/src/app/api/v1/id-routes.test.ts | 136 ++++++++++++ packages/app/src/app/api/v1/id-routes.ts | 85 ++++++++ .../src/app/api/v1/request-timeline/route.ts | 30 +++ .../app/api/v1/trace-availability/route.ts | 29 +++ .../src/app/api/v1/trace-histograms/route.ts | 34 +++ .../app/api/v1/trace-server-metrics/route.ts | 30 +++ .../src/hooks/api/benchmark-id-query.test.ts | 37 ++++ .../app/src/hooks/api/benchmark-id-query.ts | 59 ++++++ .../src/hooks/api/use-agentic-aggregates.ts | 31 +++ .../src/hooks/api/use-benchmark-siblings.ts | 44 ++++ packages/app/src/hooks/api/use-benchmarks.ts | 12 +- packages/app/src/hooks/api/use-datasets.ts | 199 ++++++++++++++++++ .../api/use-derived-agentic-metrics.test.ts | 13 ++ .../hooks/api/use-derived-agentic-metrics.ts | 55 +++++ .../app/src/hooks/api/use-request-timeline.ts | 53 +++++ .../src/hooks/api/use-trace-availability.ts | 15 ++ .../app/src/hooks/api/use-trace-histograms.ts | 25 +++ .../src/hooks/api/use-trace-server-metrics.ts | 96 +++++++++ packages/app/src/hooks/useChartContext.ts | 12 +- packages/app/src/hooks/useThemeColors.test.ts | 28 +++ 28 files changed, 1269 insertions(+), 4 deletions(-) create mode 100644 packages/app/src/app/api/v1/agentic-aggregates/route.ts create mode 100644 packages/app/src/app/api/v1/benchmark-siblings/route.ts create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/route.ts create mode 100644 packages/app/src/app/api/v1/datasets/route.ts create mode 100644 packages/app/src/app/api/v1/derived-agentic-metrics/route.ts create mode 100644 packages/app/src/app/api/v1/id-routes.test.ts create mode 100644 packages/app/src/app/api/v1/id-routes.ts create mode 100644 packages/app/src/app/api/v1/request-timeline/route.ts create mode 100644 
packages/app/src/app/api/v1/trace-availability/route.ts create mode 100644 packages/app/src/app/api/v1/trace-histograms/route.ts create mode 100644 packages/app/src/app/api/v1/trace-server-metrics/route.ts create mode 100644 packages/app/src/hooks/api/benchmark-id-query.test.ts create mode 100644 packages/app/src/hooks/api/benchmark-id-query.ts create mode 100644 packages/app/src/hooks/api/use-agentic-aggregates.ts create mode 100644 packages/app/src/hooks/api/use-benchmark-siblings.ts create mode 100644 packages/app/src/hooks/api/use-datasets.ts create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.ts create mode 100644 packages/app/src/hooks/api/use-request-timeline.ts create mode 100644 packages/app/src/hooks/api/use-trace-availability.ts create mode 100644 packages/app/src/hooks/api/use-trace-histograms.ts create mode 100644 packages/app/src/hooks/api/use-trace-server-metrics.ts diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 072c99f1..304ccb0b 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -33,6 +33,10 @@ export function normalizeArtifactRows( if (!params) continue; const { config } = params; results.push({ + // Synthetic id — overlay rows aren't persisted, so trace_replay lookups + // (keyed on benchmark_results.id) will always miss, which is the + // intended behaviour: overlays never have stored trace_replay blobs. 
+ id: 0, hardware: config.hardware, framework: config.framework, model: config.model, @@ -50,6 +54,8 @@ export function normalizeArtifactRows( decode_num_workers: config.decodeNumWorkers, num_prefill_gpu: config.numPrefillGpu, num_decode_gpu: config.numDecodeGpu, + benchmark_type: params.benchmarkType, + offload_mode: params.offloadMode, isl: params.isl, osl: params.osl, conc: params.conc, diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts new file mode 100644 index 00000000..63fd2512 --- /dev/null +++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts @@ -0,0 +1,34 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getAgenticAggregates, + type AgenticAggregateMap, +} from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: response stays small (a few numbers per id), but generating it +// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the +// "Aggregates" toggle stays snappy. +const getCachedAgenticAggregates = cachedQuery( + (ids: number[]): Promise => getAgenticAggregates(getDb(), ids), + 'agentic-aggregates', + { blobOnly: true }, +); + +/** + * GET /api/v1/agentic-aggregates?ids=1,2,3 + * + * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization, + * and prefix cache hit rate — computed live from the stored aiperf + * profile_export.jsonl + server_metrics_json blobs. Ids without a + * trace_replay blob (or with no usable samples) get nulls. 
+ */ +export const GET = idsQueryRoute({ + maxIds: 200, + logLabel: 'agentic aggregates', + fetch: getCachedAgenticAggregates, +}); diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts new file mode 100644 index 00000000..38e79c23 --- /dev/null +++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts @@ -0,0 +1,28 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getBenchmarkSiblings, + type BenchmarkSiblings, +} from '@semianalysisai/inferencex-db/queries/benchmark-siblings'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +const getCachedSiblings = cachedQuery( + (id: number): Promise => getBenchmarkSiblings(getDb(), id), + 'benchmark-siblings', +); + +/** + * GET /api/v1/benchmark-siblings?id=N + * + * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the + * benchmark_result + all sibling rows that share that SKU within the same + * workflow_run. Used by the agentic detail page to render a navigator. 
+ */ +export const GET = idQueryRoute({ + logLabel: 'benchmark siblings', + fetch: getCachedSiblings, +}); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts new file mode 100644 index 00000000..84cc15e3 --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts @@ -0,0 +1,33 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getConversation, + type ConversationDetail, +} from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedConversation = cachedQuery( + (slug: string, convId: string): Promise => + getConversation(getDb(), slug, convId), + 'dataset-conversation', +); + +/** GET /api/v1/datasets/[slug]/conversations/[convId] — flamegraph structure. 
*/ +export async function GET( + _request: NextRequest, + { params }: { params: Promise<{ slug: string; convId: string }> }, +) { + const { slug, convId } = await params; + try { + const data = await getCachedConversation(slug, decodeURIComponent(convId)); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching dataset conversation:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts new file mode 100644 index 00000000..62b9e5b7 --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts @@ -0,0 +1,53 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + listConversations, + type ConversationList, + type ListConversationsOpts, +} from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const SORTS = new Set(['tokens', 'turns', 'subagents', 'id']); + +const getCachedConversations = cachedQuery( + ( + slug: string, + search: string, + limit: number, + offset: number, + sort: string, + ): Promise => + listConversations(getDb(), slug, { + search: search || undefined, + limit, + offset, + sort: sort as ListConversationsOpts['sort'], + }), + 'dataset-conversations', +); + +/** + * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort= + * Paginated conversation list (counts only, no flamegraph structure). + */ +export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) { + const { slug } = await params; + const sp = request.nextUrl.searchParams; + const search = sp.get('search') ?? 
''; + const limit = Math.min(200, Math.max(1, Number(sp.get('limit')) || 50)); + const offset = Math.max(0, Number(sp.get('offset')) || 0); + const sortParam = sp.get('sort') ?? 'tokens'; + const sort = SORTS.has(sortParam) ? sortParam : 'tokens'; + try { + const data = await getCachedConversations(slug, search, limit, offset, sort); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching dataset conversations:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts new file mode 100644 index 00000000..9e4af580 --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts @@ -0,0 +1,29 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedDataset = cachedQuery( + (slug: string): Promise => getDataset(getDb(), slug), + 'dataset', +); + +/** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. 
*/ +export async function GET( + _request: NextRequest, + { params }: { params: Promise<{ slug: string }> }, +) { + const { slug } = await params; + try { + const data = await getCachedDataset(slug); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching dataset:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts new file mode 100644 index 00000000..f0acca3c --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/route.ts @@ -0,0 +1,24 @@ +import { NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedDatasets = cachedQuery( + (): Promise => listDatasets(getDb()), + 'datasets', +); + +/** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards). 
*/ +export async function GET() { + try { + const data = await getCachedDatasets(); + return cachedJson(data); + } catch (error) { + console.error('Error fetching datasets:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts new file mode 100644 index 00000000..836a8d93 --- /dev/null +++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts @@ -0,0 +1,43 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getDerivedAgenticMetrics, + type DerivedAgenticMetricMap, +} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: the response is one entry per id with a handful of numbers, but the +// derivation work parses thousands of JSONL records per blob — cache the +// computed result so a chart-refresh hits the warm path. +// Bumped to v3 for per-request normalized-E2E @ 400 output tokens. +// Stale cache entries from earlier key versions return undefined for the new +// field and silently blank the chart with "No data available". +const getCachedDerivedAgenticMetrics = cachedQuery( + (ids: number[]): Promise => getDerivedAgenticMetrics(getDb(), ids), + 'derived-agentic-metrics-v3', + { blobOnly: true }, +); + +/** + * GET /api/v1/derived-agentic-metrics?ids=1,2,3 + * + * Returns per-id derived metrics computed live from the stored aiperf + * profile_export.jsonl blobs: + * - normalized_session_time_s: mean across sessions of session e2e time + * (Σ per-turn request_latency) rescaled by mean_load / session_load. + * - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT) + * across every turn in every session.
+ * - p75/p90_normalized_e2e_400_s: percentile of per-request + * TTFT + 399 × observed ITL. + * + * Ids without a trace_replay blob or with unparseable records are omitted. + */ +export const GET = idsQueryRoute({ + maxIds: 200, + logLabel: 'derived agentic metrics', + fetch: getCachedDerivedAgenticMetrics, +}); diff --git a/packages/app/src/app/api/v1/id-routes.test.ts b/packages/app/src/app/api/v1/id-routes.test.ts new file mode 100644 index 00000000..32499e99 --- /dev/null +++ b/packages/app/src/app/api/v1/id-routes.test.ts @@ -0,0 +1,136 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +vi.mock('@/lib/api-cache', () => ({ + cachedJson: (data: unknown) => Response.json(data), +})); + +import { NextRequest, NextResponse } from 'next/server'; + +import { idQueryRoute, idsQueryRoute, parseIdsParam } from './id-routes'; + +function req(url: string): NextRequest { + return new NextRequest(new URL(url, 'http://localhost')); +} + +beforeEach(() => { + vi.clearAllMocks(); +}); + +describe('parseIdsParam', () => { + it('parses, dedupes, and sorts ids ascending', () => { + const result = parseIdsParam(req('/x?ids=3, 1,2,3'), 200); + expect(result).toEqual([1, 2, 3]); + }); + + it('drops non-finite and non-positive ids', () => { + const result = parseIdsParam(req('/x?ids=abc,-1,0,5'), 200); + expect(result).toEqual([5]); + }); + + it('returns 400 when the param is missing', async () => { + const result = parseIdsParam(req('/x'), 200); + expect(result).toBeInstanceOf(NextResponse); + const res = result as NextResponse; + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('ids query param is required'); + }); + + it('returns 400 when no valid ids remain', async () => { + const result = parseIdsParam(req('/x?ids=abc,-2'), 200); + expect(result).toBeInstanceOf(NextResponse); + const res = result as NextResponse; + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('no valid ids 
provided'); + }); + + it('returns 400 when the id count exceeds maxIds', async () => { + const result = parseIdsParam(req('/x?ids=1,2,3'), 2); + expect(result).toBeInstanceOf(NextResponse); + const res = result as NextResponse; + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('too many ids (max 2)'); + }); +}); + +describe('idsQueryRoute', () => { + it('fetches with sorted deduped ids and returns the payload', async () => { + const fetch = vi.fn().mockResolvedValue({ 1: 'a', 2: 'b' }); + const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch }); + + const res = await GET(req('/x?ids=2,1,2')); + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ 1: 'a', 2: 'b' }); + expect(fetch).toHaveBeenCalledWith([1, 2]); + }); + + it('returns 400 without calling fetch when ids are invalid', async () => { + const fetch = vi.fn(); + const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch }); + + const res = await GET(req('/x')); + expect(res.status).toBe(400); + expect(fetch).not.toHaveBeenCalled(); + }); + + it('returns 500 and logs when the fetch throws', async () => { + const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + const fetch = vi.fn().mockRejectedValue(new Error('boom')); + const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch }); + + const res = await GET(req('/x?ids=1')); + expect(res.status).toBe(500); + const body = await res.json(); + expect(body.error).toBe('Internal server error'); + expect(consoleSpy).toHaveBeenCalledWith('Error fetching things:', expect.any(Error)); + consoleSpy.mockRestore(); + }); +}); + +describe('idQueryRoute', () => { + it('fetches by id and returns the payload', async () => { + const fetch = vi.fn().mockResolvedValue({ value: 42 }); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req('/x?id=7')); + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ value: 42 }); + 
expect(fetch).toHaveBeenCalledWith(7); + }); + + it.each(['/x', '/x?id=abc', '/x?id=0'])('returns 400 for %s', async (url) => { + const fetch = vi.fn(); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req(url)); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('id is required (benchmark_result_id)'); + expect(fetch).not.toHaveBeenCalled(); + }); + + it('returns 404 when the fetch yields null', async () => { + const fetch = vi.fn().mockResolvedValue(null); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req('/x?id=7')); + expect(res.status).toBe(404); + const body = await res.json(); + expect(body.error).toBe('Not found'); + }); + + it('returns 500 and logs when the fetch throws', async () => { + const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + const fetch = vi.fn().mockRejectedValue(new Error('boom')); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req('/x?id=7')); + expect(res.status).toBe(500); + const body = await res.json(); + expect(body.error).toBe('Internal server error'); + expect(consoleSpy).toHaveBeenCalledWith('Error fetching thing:', expect.any(Error)); + consoleSpy.mockRestore(); + }); +}); diff --git a/packages/app/src/app/api/v1/id-routes.ts b/packages/app/src/app/api/v1/id-routes.ts new file mode 100644 index 00000000..fea9221b --- /dev/null +++ b/packages/app/src/app/api/v1/id-routes.ts @@ -0,0 +1,85 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { cachedJson } from '@/lib/api-cache'; + +/** + * Shared GET-handler factories for the agentic benchmark routes, which all + * key off `benchmark_results.id`. 
Two shapes exist: + * - bulk `?ids=1,2,3` routes returning a map keyed by id + * - single `?id=N` routes returning one payload or 404 + * + * Both preserve the v1 error contract: 400 with `{error}` for bad params, + * 404 `{error: 'Not found'}` when a single-id lookup misses, and 500 + * `{error: 'Internal server error'}` (with a console.error) on query failure. + * Success payloads go through `cachedJson` for CDN caching + gzip. + */ + +/** + * Parse, dedupe, validate, and ascending-sort the `ids` query param. + * Sorted so the same set of ids in any order hits the same cache entry. + * Returns a NextResponse (400) when the param is missing, contains no valid + * (finite, positive) ids, or lists more than `maxIds` distinct ids. + */ +export function parseIdsParam(request: NextRequest, maxIds: number): number[] | NextResponse { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > maxIds) { + return NextResponse.json({ error: `too many ids (max ${maxIds})` }, { status: 400 }); + } + return ids.toSorted((a, b) => a - b); +} + +/** Build a GET handler for a bulk `?ids=…` route. */ +export function idsQueryRoute(options: { + maxIds: number; + /** Human-readable name used in the 500-path console.error.
*/ + logLabel: string; + fetch: (ids: number[]) => Promise; +}): (request: NextRequest) => Promise { + const { maxIds, logLabel, fetch } = options; + return async (request: NextRequest) => { + const ids = parseIdsParam(request, maxIds); + if (ids instanceof NextResponse) return ids; + try { + return cachedJson(await fetch(ids)); + } catch (error) { + console.error(`Error fetching ${logLabel}:`, error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } + }; +} + +/** Build a GET handler for a single `?id=N` route (404 when the fetch misses). */ +export function idQueryRoute(options: { + logLabel: string; + fetch: (id: number) => Promise; +}): (request: NextRequest) => Promise { + const { logLabel, fetch } = options; + return async (request: NextRequest) => { + const id = Number(request.nextUrl.searchParams.get('id')); + if (!id || !Number.isFinite(id)) { + return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 }); + } + try { + const data = await fetch(id); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error(`Error fetching ${logLabel}:`, error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } + }; +} diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts new file mode 100644 index 00000000..9a3750d6 --- /dev/null +++ b/packages/app/src/app/api/v1/request-timeline/route.ts @@ -0,0 +1,30 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getRequestTimeline, + type RequestTimeline, +} from '@semianalysisai/inferencex-db/queries/request-timeline'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +const getCachedRequestTimeline = cachedQuery( + (id: number): Promise => 
getRequestTimeline(getDb(), id), + 'request-timeline', + { blobOnly: true }, +); + +/** + * GET /api/v1/request-timeline?id=N + * + * Returns the per-request Gantt timeline for one agentic benchmark point. + * Each request entry has ns-from-start offsets for credit/start/ack/end, + * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 404 if the + * point has no stored profile_export.jsonl blob. + */ +export const GET = idQueryRoute({ + logLabel: 'request timeline', + fetch: getCachedRequestTimeline, +}); diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts new file mode 100644 index 00000000..45eafef4 --- /dev/null +++ b/packages/app/src/app/api/v1/trace-availability/route.ts @@ -0,0 +1,29 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceAvailability, + type TraceAvailabilityMap, +} from '@semianalysisai/inferencex-db/queries/trace-availability'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +const getCachedTraceAvailability = cachedQuery( + (ids: number[]): Promise => getTraceAvailability(getDb(), ids), + 'trace-availability', +); + +/** + * GET /api/v1/trace-availability?ids=1,2,3 + * + * Returns `{[id]: true}` for ids that have a stored trace_replay blob. + * Lightweight presence check used by the scatter tooltip to decide whether + * to render the "View charts" button — see queries/trace-availability.ts. 
+ */ +export const GET = idsQueryRoute({ + maxIds: 500, + logLabel: 'trace availability', + fetch: getCachedTraceAvailability, +}); diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts new file mode 100644 index 00000000..131010ff --- /dev/null +++ b/packages/app/src/app/api/v1/trace-histograms/route.ts @@ -0,0 +1,34 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceHistograms, + type TraceHistogramMap, +} from '@semianalysisai/inferencex-db/queries/trace-histograms'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB +// unstable_cache limit (each point carries one int per request, ~500-1000+ +// requests for agentic), which manifests as a 500 from the route. Blob +// storage lets us cache the larger response without losing the warm-cache hit. +const getCachedTraceHistograms = cachedQuery( + (ids: number[]): Promise => getTraceHistograms(getDb(), ids), + 'trace-histograms', + { blobOnly: true }, +); + +/** + * GET /api/v1/trace-histograms?ids=1,2,3 + * + * Returns per-request ISL/OSL arrays parsed from the stored aiperf + * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`. + * Ids without a trace_replay blob are omitted from the response. 
+ */ +export const GET = idsQueryRoute({ + maxIds: 200, + logLabel: 'trace histograms', + fetch: getCachedTraceHistograms, +}); diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts new file mode 100644 index 00000000..a759e6dc --- /dev/null +++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts @@ -0,0 +1,30 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceServerMetrics, + type TraceServerMetrics, +} from '@semianalysisai/inferencex-db/queries/trace-server-metrics'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +const getCachedTraceServerMetrics = cachedQuery( + (id: number): Promise => getTraceServerMetrics(getDb(), id), + 'trace-server-metrics', + { blobOnly: true }, +); + +/** + * GET /api/v1/trace-server-metrics?id=N + * + * Returns parsed time-series for the agentic detail view: KV cache usage, + * prefix cache hit rate per interval, queue depth, and per-source prompt + * token rates. Times are in seconds from benchmark start. 404 if the point + * has no stored server_metrics_export.json blob. 
+ */ +export const GET = idQueryRoute({ + logLabel: 'trace server metrics', + fetch: getCachedTraceServerMetrics, +}); diff --git a/packages/app/src/hooks/api/benchmark-id-query.test.ts b/packages/app/src/hooks/api/benchmark-id-query.test.ts new file mode 100644 index 00000000..c7d951f4 --- /dev/null +++ b/packages/app/src/hooks/api/benchmark-id-query.test.ts @@ -0,0 +1,37 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; + +import { bulkIdsFetcher } from './benchmark-id-query'; + +afterEach(() => { + vi.unstubAllGlobals(); +}); + +describe('bulkIdsFetcher', () => { + it('returns an empty map without fetching for an empty id set', async () => { + const fetchMock = vi.fn(); + vi.stubGlobal('fetch', fetchMock); + + const result = await bulkIdsFetcher('trace-availability')([]); + expect(result).toEqual({}); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it('fetches the endpoint with comma-joined ids and returns the parsed map', async () => { + const fetchMock = vi.fn().mockResolvedValue(Response.json({ 1: true, 3: true })); + vi.stubGlobal('fetch', fetchMock); + + const result = await bulkIdsFetcher('trace-availability')([1, 3]); + expect(result).toEqual({ 1: true, 3: true }); + expect(fetchMock).toHaveBeenCalledWith('/api/v1/trace-availability?ids=1,3', { + signal: undefined, + }); + }); + + it('throws with the endpoint name and status on a non-ok response', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(new Response('nope', { status: 500 }))); + + await expect(bulkIdsFetcher('trace-histograms')([1])).rejects.toThrow( + 'trace-histograms 500', + ); + }); +}); diff --git a/packages/app/src/hooks/api/benchmark-id-query.ts b/packages/app/src/hooks/api/benchmark-id-query.ts new file mode 100644 index 00000000..0aa50687 --- /dev/null +++ b/packages/app/src/hooks/api/benchmark-id-query.ts @@ -0,0 +1,59 @@ +import { useQuery } from '@tanstack/react-query'; + +/** + * Shared React Query plumbing for the agentic endpoints keyed by + * 
`benchmark_results.id` (`/api/v1/<endpoint>?ids=…` bulk maps and
+ * `/api/v1/<endpoint>?id=N` single lookups).
+ *
+ * Conventions kept identical across all of these hooks:
+ *   - queryKey = [endpoint, sorted-deduped-ids-comma-joined] so any
+ *     permutation of the same id set hits the same cache entry
+ *   - staleTime = 5 minutes (the underlying blobs are immutable per run)
+ *   - bulk queries disabled for empty id sets; single queries 404 → null
+ */
+
+const STALE_TIME_MS = 5 * 60 * 1000;
+
+/** Bulk fetcher: GET `/api/v1/<endpoint>?ids=…` → map; `[]` → `{}` with no request, non-OK throws. */
+export function bulkIdsFetcher(
+  endpoint: string,
+): (ids: number[], signal?: AbortSignal) => Promise> {
+  return async (ids, signal) => {
+    if (ids.length === 0) return {};
+    const res = await fetch(`/api/v1/${endpoint}?ids=${ids.join(',')}`, { signal });
+    if (!res.ok) throw new Error(`${endpoint} ${res.status}`);
+    return (await res.json()) as Record;
+  };
+}
+
+/** Bulk map query over a set of benchmark_results ids (deduped + sorted for a stable key). */
+export function useBulkIdsQuery(
+  endpoint: string,
+  ids: number[],
+  enabled: boolean,
+  fetchByIds: (ids: number[], signal?: AbortSignal) => Promise,
+) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: [endpoint, sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchByIds(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: STALE_TIME_MS,
+  });
+}
+
+/** Single-payload query for one benchmark_results id; 404 resolves to null.
*/ +export function useByIdQuery(endpoint: string, id: number | null, enabled: boolean) { + return useQuery({ + queryKey: [endpoint, id] as const, + queryFn: async ({ signal }): Promise => { + if (!id) return null; + const res = await fetch(`/api/v1/${endpoint}?id=${id}`, { signal }); + if (res.status === 404) return null; + if (!res.ok) throw new Error(`${endpoint} ${res.status}`); + return (await res.json()) as T; + }, + enabled, + staleTime: STALE_TIME_MS, + }); +} diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts new file mode 100644 index 00000000..7ca029cf --- /dev/null +++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts @@ -0,0 +1,31 @@ +import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query'; + +export interface MetricPercentiles { + mean: number; + p50: number; + p75: number; + p90: number; + p99: number; + n: number; +} + +export interface AgenticAggregate { + id: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; +} + +export type AgenticAggregateMap = Record; + +const fetchAgenticAggregates = bulkIdsFetcher('agentic-aggregates'); + +/** + * Fetch per-id aggregate stats (mean/p50/p75/p90/p99) for ISL, OSL, KV + * cache utilization, and prefix cache hit rate. Used by the "Aggregates + * across configs" view on the agentic detail page. 
+ */ +export function useAgenticAggregates(ids: number[], enabled = true) { + return useBulkIdsQuery('agentic-aggregates', ids, enabled, fetchAgenticAggregates); +} diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts new file mode 100644 index 00000000..58469c26 --- /dev/null +++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts @@ -0,0 +1,44 @@ +import { useByIdQuery } from './benchmark-id-query'; + +export interface BenchmarkSibling { + id: number; + conc: number; + offload_mode: string | null; + decode_tp: number; + decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; + prefill_tp: number; + prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; + num_prefill_gpu: number; + num_decode_gpu: number; + disagg: boolean; + is_multinode: boolean; + tput_per_gpu: number | null; + total_requests: number | null; + is_current: boolean; + has_trace: boolean; +} + +export interface BenchmarkSku { + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + benchmark_type: string; + github_run_id: number; + date: string; + dataset_slug: string | null; +} + +export interface BenchmarkSiblings { + sku: BenchmarkSku; + siblings: BenchmarkSibling[]; +} + +export function useBenchmarkSiblings(id: number | null) { + return useByIdQuery('benchmark-siblings', id, id !== null && id > 0); +} diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts index a8d634f1..095cf192 100644 --- a/packages/app/src/hooks/api/use-benchmarks.ts +++ b/packages/app/src/hooks/api/use-benchmarks.ts @@ -28,6 +28,14 @@ export function benchmarkQueryOptions( }; } -export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) { - return useQuery(benchmarkQueryOptions(model, date ?? 
'latest', enabled, undefined, runId)); +export function useBenchmarks( + model: string, + date?: string, + enabled = true, + runId?: string, + exactRun?: boolean, +) { + return useQuery( + benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId, exactRun), + ); } diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts new file mode 100644 index 00000000..ea1b17cf --- /dev/null +++ b/packages/app/src/hooks/api/use-datasets.ts @@ -0,0 +1,199 @@ +import { useQuery, keepPreviousData } from '@tanstack/react-query'; + +import type { + ConversationStructure, + StructureNode, +} from '@semianalysisai/inferencex-db/etl/weka-structure'; + +export type { ConversationStructure, StructureNode }; + +export interface DatasetSummary { + blockSize?: number; + hashIdScope?: string | null; + totalIn?: number; + totalOut?: number; + totalCached?: number; + cachedPct?: number; + mainTurns?: number; + subagentGroups?: number; + subagentTurns?: number; + meanRequestsPerConversation?: number; + medianRequestsPerConversation?: number; + meanSubagentsPerTrace?: number; + medianSubagentsPerTrace?: number; + modelMix?: Record; + [k: string]: unknown; +} + +export interface DatasetRecord { + id: string; + slug: string; + label: string; + variant: string; + description: string | null; + hf_url: string | null; + license: string | null; + conversation_count: number; + summary: DatasetSummary; + ingested_at: string; +} + +export interface HistogramBin { + x0: number; + x1: number; + count: number; +} + +export interface DistributionStats { + count: number; + min: number; + max: number; + mean: number; + median: number; + /** Added in chart_data v2. */ + p75?: number; + p90: number; + /** Added in chart_data v2. 
*/ + p95?: number; +} + +export interface Distribution { + bins: HistogramBin[]; + stats: DistributionStats; +} + +export interface DatasetChartData { + version?: number; + inputTokensPerTurn?: Distribution; + uncachedInputTokensPerTurn?: Distribution; + outputTokensPerTurn?: Distribution; + subagentInputTokensPerRequest?: Distribution; + subagentOutputTokensPerRequest?: Distribution; + turnsPerConversation?: Distribution; + subagentGroupsPerConversation?: Distribution; + cachedFractionPerTurn?: Distribution; + [k: string]: unknown; +} + +export interface DatasetDetail extends DatasetRecord { + chart_data: DatasetChartData; +} + +export interface ConversationListItem { + conv_id: string; + models: string[]; + num_turns: number; + num_subagent_groups: number; + total_in: number; + total_out: number; + total_cached: number; +} + +export interface ConversationList { + total: number; + items: ConversationListItem[]; +} + +export interface ConversationDetail { + conv_id: string; + models: string[]; + num_turns: number; + num_subagent_groups: number; + total_in: number; + total_out: number; + total_cached: number; + structure: ConversationStructure; +} + +export type ConversationSort = 'tokens' | 'turns' | 'subagents' | 'id'; + +// Dataset contents only change on (rare) re-ingest, so cache aggressively. +const DAY = 24 * 60 * 60 * 1000; + +/** Shared fetch for the per-dataset endpoints: 404 → null, other errors throw. */ +async function fetchJsonOr404( + url: string, + label: string, + signal: AbortSignal, +): Promise { + const res = await fetch(url, { signal }); + if (res.status === 404) return null; + if (!res.ok) throw new Error(`${label} ${res.status}`); + return (await res.json()) as T; +} + +/** All ingested datasets (registry cards). 
*/ +export function useDatasets() { + return useQuery({ + queryKey: ['datasets'] as const, + queryFn: async ({ signal }) => { + const res = await fetch('/api/v1/datasets', { signal }); + if (!res.ok) throw new Error(`datasets ${res.status}`); + return (await res.json()) as DatasetRecord[]; + }, + staleTime: DAY, + }); +} + +/** One dataset incl. chart_data. */ +export function useDataset(slug: string | null) { + return useQuery({ + queryKey: ['dataset', slug] as const, + queryFn: ({ signal }) => + fetchJsonOr404(`/api/v1/datasets/${slug}`, 'dataset', signal), + enabled: Boolean(slug), + staleTime: DAY, + }); +} + +export interface UseConversationsArgs { + slug: string | null; + search?: string; + limit?: number; + offset?: number; + sort?: ConversationSort; +} + +/** Paginated conversation list for a dataset (counts only). */ +export function useDatasetConversations({ + slug, + search = '', + limit = 50, + offset = 0, + sort = 'tokens', +}: UseConversationsArgs) { + return useQuery({ + queryKey: ['dataset-conversations', slug, search, limit, offset, sort] as const, + queryFn: ({ signal }) => { + const qs = new URLSearchParams({ + limit: String(limit), + offset: String(offset), + sort, + }); + if (search) qs.set('search', search); + return fetchJsonOr404( + `/api/v1/datasets/${slug}/conversations?${qs.toString()}`, + 'dataset-conversations', + signal, + ); + }, + enabled: Boolean(slug), + placeholderData: keepPreviousData, + staleTime: DAY, + }); +} + +/** One conversation's flamegraph structure. */ +export function useDatasetConversation(slug: string | null, convId: string | null) { + return useQuery({ + queryKey: ['dataset-conversation', slug, convId] as const, + queryFn: ({ signal }) => + fetchJsonOr404( + `/api/v1/datasets/${slug}/conversations/${encodeURIComponent(convId ?? 
'')}`, + 'dataset-conversation', + signal, + ), + enabled: Boolean(slug) && Boolean(convId), + staleTime: DAY, + }); +} diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts new file mode 100644 index 00000000..2e54f418 --- /dev/null +++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { chunkDerivedAgenticMetricIds } from './use-derived-agentic-metrics'; + +describe('chunkDerivedAgenticMetricIds', () => { + it('keeps every id while respecting the API limit', () => { + const ids = Array.from({ length: 401 }, (_, index) => index + 1); + const chunks = chunkDerivedAgenticMetricIds(ids); + + expect(chunks.map((chunk) => chunk.length)).toEqual([200, 200, 1]); + expect(chunks.flat()).toEqual(ids); + }); +}); diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts new file mode 100644 index 00000000..388563d9 --- /dev/null +++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts @@ -0,0 +1,55 @@ +import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query'; + +export interface DerivedAgenticMetric { + id: number; + /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled + * by mean_load / session_load. Null when the JSONL had no usable records. */ + normalized_session_time_s: number | null; + /** P90 of per-turn ISL/TTFT across every turn in every session. + * Null when no prefill rates could be computed. */ + p90_prefill_tps_per_user: number | null; + /** P75 normalized per-request E2E at a fixed 400-token output length. */ + p75_normalized_e2e_400_s: number | null; + /** P90 normalized per-request E2E at a fixed 400-token output length. 
*/ + p90_normalized_e2e_400_s: number | null; +} + +export type DerivedAgenticMetricMap = Record; + +const MAX_IDS_PER_REQUEST = 200; + +export function chunkDerivedAgenticMetricIds(ids: number[]): number[][] { + const chunks: number[][] = []; + for (let i = 0; i < ids.length; i += MAX_IDS_PER_REQUEST) { + chunks.push(ids.slice(i, i + MAX_IDS_PER_REQUEST)); + } + return chunks; +} + +const fetchChunk = bulkIdsFetcher('derived-agentic-metrics'); + +// Unlike the other bulk endpoints, dashboards can put >200 agentic points on +// screen at once, so this fetcher splits the id set across parallel requests +// to stay under the route's MAX_IDS_PER_REQUEST. +async function fetchDerivedAgenticMetrics( + ids: number[], + signal?: AbortSignal, +): Promise { + if (ids.length === 0) return {}; + const maps = await Promise.all( + chunkDerivedAgenticMetricIds(ids).map((chunk) => fetchChunk(chunk, signal)), + ); + return Object.assign({}, ...maps) as DerivedAgenticMetricMap; +} + +/** + * Fetch per-id derived agentic metrics (session time + p90 prefill TPS/user) + * computed live from the stored aiperf profile_export.jsonl. Used to drive + * the "Session Time" and "Prefill TPS/user" chart variants. + * + * Ids without a trace_replay blob (older or non-aiperf agentic runs) are + * silently omitted from the response. + */ +export function useDerivedAgenticMetrics(ids: number[], enabled = true) { + return useBulkIdsQuery('derived-agentic-metrics', ids, enabled, fetchDerivedAgenticMetrics); +} diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts new file mode 100644 index 00000000..6f43de25 --- /dev/null +++ b/packages/app/src/hooks/api/use-request-timeline.ts @@ -0,0 +1,53 @@ +import { useByIdQuery } from './benchmark-id-query'; + +export interface RequestRecord { + /** Conversation id (groups turns of one agent session). */ + cid: string; + /** Zero-based turn index within the conversation. 
*/ + ti: number; + /** Source trace id from the original raw dataset, when provided by AIPerf. */ + srcTrace?: string; + /** Original raw top-level request index within srcTrace. */ + srcOuter?: number; + /** Original nested request index within srcOuter, for subagent children. */ + srcInner?: number; + /** Loader-specific source kind, e.g. weka_main or weka_flat. */ + srcKind?: string; + /** Worker id (concurrency slot that handled this request). */ + wid: string; + /** Sub-agent depth (0 = top-level). */ + ad: number; + /** `warmup` or `profiling`. */ + phase: string; + /** ns offset from timeline.startNs. Load gen decided to dispatch. */ + credit: number; + /** ns offset from timeline.startNs. HTTP send started. */ + start: number; + /** ns offset from timeline.startNs. First server acknowledgement (or null). */ + ack: number | null; + /** ns offset from timeline.startNs. Last byte received. */ + end: number; + ttftMs: number | null; + /** Time per output token in ms. */ + tpotMs: number | null; + isl: number | null; + osl: number | null; + cancelled: boolean; +} + +export interface RequestTimeline { + version: number; + startNs: number; + endNs: number; + durationS: number; + requests: RequestRecord[]; +} + +/** + * Lazy-fetch the per-request Gantt timeline for one agentic point. + * Enabled only when the caller opts in (e.g. the timeline view becomes + * active), so the payload (~30 KB per point) isn't paid for every page load. 
+ */ +export function useRequestTimeline(id: number | null, enabled = false) { + return useByIdQuery('request-timeline', id, enabled && Boolean(id)); +} diff --git a/packages/app/src/hooks/api/use-trace-availability.ts b/packages/app/src/hooks/api/use-trace-availability.ts new file mode 100644 index 00000000..24e4c067 --- /dev/null +++ b/packages/app/src/hooks/api/use-trace-availability.ts @@ -0,0 +1,15 @@ +import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query'; + +export type TraceAvailabilityMap = Record; + +const fetchTraceAvailability = bulkIdsFetcher('trace-availability'); + +/** + * Bulk presence lookup: which of the given `benchmark_results.id`s have a + * stored trace_replay blob. Used by the scatter chart to decide whether to + * surface the "View charts" button — cheap boolean per id instead of + * shipping multi-MB profile blobs just for the check. + */ +export function useTraceAvailability(ids: number[], enabled = true) { + return useBulkIdsQuery('trace-availability', ids, enabled, fetchTraceAvailability); +} diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts new file mode 100644 index 00000000..8197147a --- /dev/null +++ b/packages/app/src/hooks/api/use-trace-histograms.ts @@ -0,0 +1,25 @@ +import { bulkIdsFetcher, useBulkIdsQuery } from './benchmark-id-query'; + +export interface TraceHistogramPoint { + id: number; + /** Input sequence length (tokens) per completed request. */ + isl: number[]; + /** Output sequence length (tokens) per completed request. */ + osl: number[]; +} + +export type TraceHistogramMap = Record; + +const fetchTraceHistograms = bulkIdsFetcher('trace-histograms'); + +/** + * Fetch per-request ISL/OSL arrays for a set of benchmark_results.id values. + * Ids without a stored trace_replay blob are silently omitted from the response. + * + * Caller passes the agentic id set currently on screen; React Query handles + * dedup + stale-while-revalidate. 
Cache key is sorted-ids-comma-joined so + * any permutation of the same set hits the same cache entry. + */ +export function useTraceHistograms(ids: number[], enabled = true) { + return useBulkIdsQuery('trace-histograms', ids, enabled, fetchTraceHistograms); +} diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts new file mode 100644 index 00000000..47cf66a6 --- /dev/null +++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts @@ -0,0 +1,96 @@ +import { useByIdQuery } from './benchmark-id-query'; + +export interface TimeSeriesPoint { + /** Seconds from benchmark start. */ + t: number; + value: number; +} +export interface QueueDepthPoint { + t: number; + running: number; + waiting: number; + total: number; +} +export interface PointMeta { + id: number; + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + disagg: boolean; + conc: number; + offload_mode: string | null; + isl: number | null; + osl: number | null; + benchmark_type: string; + date: string; + run_url: string | null; + server_gpu_cache_hit_rate: number | null; + server_cpu_cache_hit_rate: number | null; +} + +export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown'; + +export interface MetricSource { + id: string; + adapter: string; + role: MetricSourceRole; + endpointUrl: string | null; + nativeRole: string | null; + workerId: string | null; + dpRank: string | null; + engine: string | null; +} + +export interface MetricSourceSeries { + source: MetricSource; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + promptTps: TimeSeriesPoint[]; + generationTps: TimeSeriesPoint[]; + prefixCacheHitsTps: TimeSeriesPoint[]; + hostKvCacheUsage: TimeSeriesPoint[]; + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; +} + +export interface 
TraceServerMetrics { + meta: PointMeta; + startNs: number; + endNs: number; + durationS: number; + timeslicesCount: number; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + prefillTps: TimeSeriesPoint[]; + decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ + prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */ + hostKvCacheUsage: TimeSeriesPoint[]; + /** + * Per-DP-rank KV cache utilization. Empty for single-engine deployments — + * the cluster-average `kvCacheUsage` line covers that case alone. + */ + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** + * Total KV-cache pool size in tokens (num_gpu_blocks × block_size, summed + * across engines). vLLM only — null for SGLang/TRT or older rows. + */ + kvCachePoolTokens: number | null; + /** Orchestrator-normalized metrics grouped by endpoint/worker. */ + metricSources: MetricSourceSeries[]; +} + +/** + * Lazy-fetch parsed server-metric time-series for one agentic point. + * Enabled only when the caller passes `enabled=true` (the detail panel opens), + * so we don't pay the parse cost on every hover. + */ +export function useTraceServerMetrics(id: number | null, enabled = false) { + return useByIdQuery('trace-server-metrics', id, enabled && Boolean(id)); +} diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts index 49812c3e..be095430 100644 --- a/packages/app/src/hooks/useChartContext.ts +++ b/packages/app/src/hooks/useChartContext.ts @@ -37,6 +37,12 @@ export function reconcileActiveSet( interface UseChartStateConfig { /** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */ urlPrefix: string; + /** + * Initial high-contrast value when the URL has no `hc` param. 
+ * Defaults to false; the inference chart opts in to true. A `hc=0` + * URL param overrides it back off. + */ + defaultHighContrast?: boolean; } /** @@ -44,7 +50,7 @@ interface UseChartStateConfig { * Includes mobile-specific legend collapse behavior. */ export function useChartUIState(config: UseChartStateConfig) { - const { urlPrefix } = config; + const { urlPrefix, defaultHighContrast = false } = config; const { getUrlParam } = useUrlState(); const hcParam = `${urlPrefix}hc` as any; @@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) { // Initialize with safe defaults that match SSR output to avoid hydration mismatches. // URL-param values are applied in a mount effect so the state is only set client-side. - const [highContrast, setHighContrast] = useState(false); + const [highContrast, setHighContrast] = useState(defaultHighContrast); const [isLegendExpanded, setIsLegendExpanded] = useState(true); const didInit = useRef(false); @@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) { if (didInit.current) return; didInit.current = true; const hcVal = getUrlParam(hcParam); + // Respect both overrides so the toggle round-trips regardless of the default. if (hcVal === '1') setHighContrast(true); + else if (hcVal === '0') setHighContrast(false); const legendVal = getUrlParam(legendParam); if (legendVal === '0') setIsLegendExpanded(false); }, [getUrlParam, hcParam, legendParam]); diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts index 7275e384..11050d19 100644 --- a/packages/app/src/hooks/useThemeColors.test.ts +++ b/packages/app/src/hooks/useThemeColors.test.ts @@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => { } unmountOn(); }); + + // Regression: deselecting a legend line must not recolor the remaining lines. 
+ // The HC palette is sized/indexed by the key set it's generated over, so when + // it was generated over the *active* subset (no hcKeys), shrinking the + // selection re-sized the palette and shifted every remaining line's hue (most + // visible on single-vendor agentic runs spanning the full wheel). Passing a + // stable `hcKeys` (the full set with data) fixes each line's color. + it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => { + const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison + + const all = renderHook(() => + useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }), + ); + const b200WithBoth = all.result.current.resolveColor('b200'); + const b300Color = all.result.current.resolveColor('b300'); + all.unmount(); + + // b300 deselected → only b200 active, but hcKeys is still the full set. + const subset = renderHook(() => + useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }), + ); + const b200Alone = subset.result.current.resolveColor('b200'); + subset.unmount(); + + expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu); + expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues + expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200 + }); }); From bd3089418bffa12540cf864d9f52a7be0544066b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:12:03 -0500 Subject: [PATCH 07/40] =?UTF-8?q?feat(datasets):=20dataset=20browser=20?= =?UTF-8?q?=E2=80=94=20conversation=20flamegraph,=20distributions,=20deep-?= =?UTF-8?q?link=20targets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../[slug]/conversations/[convId]/page.tsx | 35 ++ packages/app/src/app/datasets/[slug]/page.tsx | 32 ++ packages/app/src/app/datasets/page.tsx | 99 ++++ .../components/datasets/conversation-view.tsx | 109 +++++ .../components/datasets/dataset-detail.tsx | 312 +++++++++++++ 
.../src/components/datasets/dataset-list.tsx | 86 ++++ .../components/datasets/distribution-card.tsx | 220 +++++++++ .../app/src/components/datasets/format.ts | 28 ++ packages/app/src/components/datasets/stat.tsx | 9 + .../datasets/trace-flamegraph-model.ts | 422 +++++++++++++++++ .../datasets/trace-flamegraph.test.ts | 246 ++++++++++ .../components/datasets/trace-flamegraph.tsx | 439 ++++++++++++++++++ 12 files changed, 2037 insertions(+) create mode 100644 packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx create mode 100644 packages/app/src/app/datasets/[slug]/page.tsx create mode 100644 packages/app/src/app/datasets/page.tsx create mode 100644 packages/app/src/components/datasets/conversation-view.tsx create mode 100644 packages/app/src/components/datasets/dataset-detail.tsx create mode 100644 packages/app/src/components/datasets/dataset-list.tsx create mode 100644 packages/app/src/components/datasets/distribution-card.tsx create mode 100644 packages/app/src/components/datasets/format.ts create mode 100644 packages/app/src/components/datasets/stat.tsx create mode 100644 packages/app/src/components/datasets/trace-flamegraph-model.ts create mode 100644 packages/app/src/components/datasets/trace-flamegraph.test.ts create mode 100644 packages/app/src/components/datasets/trace-flamegraph.tsx diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx new file mode 100644 index 00000000..83eb56a0 --- /dev/null +++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx @@ -0,0 +1,35 @@ +import { Suspense } from 'react'; +import type { Metadata } from 'next'; + +import { ConversationView } from '@/components/datasets/conversation-view'; +import { SITE_URL } from '@semianalysisai/inferencex-constants'; + +interface Props { + params: Promise<{ slug: string; convId: string }>; +} + +export async function generateMetadata({ params }: Props): 
Promise { + const { slug, convId } = await params; + const short = convId.slice(0, 12); + const title = `Conversation ${short} | ${slug}`; + const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`; + return { + title, + description, + alternates: { canonical: `${SITE_URL}/datasets/${slug}/conversations/${convId}` }, + robots: { index: false }, // per-conversation pages are too numerous to index + }; +} + +export default async function ConversationPage({ params }: Props) { + const { slug, convId } = await params; + return ( +
+
+ + + +
+
+ ); +} diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx new file mode 100644 index 00000000..f32e3fa6 --- /dev/null +++ b/packages/app/src/app/datasets/[slug]/page.tsx @@ -0,0 +1,32 @@ +import type { Metadata } from 'next'; + +import { DatasetDetail } from '@/components/datasets/dataset-detail'; +import { SITE_URL } from '@semianalysisai/inferencex-constants'; + +interface Props { + params: Promise<{ slug: string }>; +} + +export async function generateMetadata({ params }: Props): Promise { + const { slug } = await params; + const title = `${slug} | Agentic Datasets`; + const description = `Distributions, token statistics, and per-conversation flamegraphs for the ${slug} agentic trace dataset.`; + return { + title, + description, + alternates: { canonical: `${SITE_URL}/datasets/${slug}` }, + openGraph: { title: `${title} | InferenceX`, description, url: `${SITE_URL}/datasets/${slug}` }, + twitter: { title: `${title} | InferenceX`, description }, + }; +} + +export default async function DatasetDetailPage({ params }: Props) { + const { slug } = await params; + return ( +
+
+ +
+
+ ); +} diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx new file mode 100644 index 00000000..7fe46b93 --- /dev/null +++ b/packages/app/src/app/datasets/page.tsx @@ -0,0 +1,99 @@ +import type { Metadata } from 'next'; + +import { Card } from '@/components/ui/card'; +import { JsonLd } from '@/components/json-ld'; +import { DatasetList } from '@/components/datasets/dataset-list'; +import { SITE_URL } from '@semianalysisai/inferencex-constants'; + +const DESCRIPTION = + 'The real Claude Code agentic conversation traces that the InferenceX agentic benchmark replays — methodology, distributions, and per-conversation flamegraphs.'; + +export const metadata: Metadata = { + title: 'Agentic Datasets', + description: DESCRIPTION, + alternates: { canonical: `${SITE_URL}/datasets` }, + openGraph: { + title: 'Agentic Datasets | InferenceX', + description: DESCRIPTION, + url: `${SITE_URL}/datasets`, + }, + twitter: { title: 'Agentic Datasets | InferenceX', description: DESCRIPTION }, +}; + +const jsonLd = { + '@context': 'https://schema.org', + '@type': 'CollectionPage', + name: 'InferenceX Agentic Datasets', + description: DESCRIPTION, + url: `${SITE_URL}/datasets`, +}; + +export default function DatasetsPage() { + return ( +
+ +
+
+ +

+ Agentic Benchmark Datasets +

+

+ InferenceX's agentic benchmark doesn't replay synthetic prompts — it replays + real Claude Code coding sessions captured as conversation traces. + Each trace is a full multi-turn session: the main agent's turns plus any + subagents it spawned, with per-turn input/output token counts and the 64-token + KV-cache block hashes needed to reconstruct prefix-cache reuse. The traces are + published openly on HuggingFace under semianalysisai/cc-traces-weka-*{' '} + (apache-2.0). +

+ +

+ How traces are captured +

+

+ Production Claude Code sessions are recorded through a logging proxy that captures + every API request: its input and output token counts, the model used, timing (TTFT, + inter-token latency), and a list of hash_ids — one per 64-token KV block + of the request's input. Subagent invocations are grouped under their parent turn. + No prompt or completion text is stored; only token counts and block hashes, so the + corpus is shareable while remaining a faithful workload for replay. +

+ +

+ Cached prefix vs uncached suffix +

+

+ Agentic workloads are dominated by prefix reuse: each turn resends the growing + conversation, so most of its input is already in the KV cache from prior turns. We + reconstruct this exactly. Walking a conversation in order under an idealized infinite + cache, a turn's cached prefix is its longest run of leading{' '} + hash_ids already seen; the rest is the uncached suffix{' '} + that must be (re)computed. Blocks are 64 tokens; the split is clamped so cached + + uncached equals the turn's effective input even on a partial final block. + Subagents run against a snapshot of the parent cache at spawn (their context is + separate and is not folded back into the parent). +

+ +

Dataset variants

+
    +
  • + full — every captured request, unmodified. +
  • +
  • + 256k — requests whose input + output exceeds 256,000 tokens are + dropped so every turn fits a 256k context window (used when benchmarking engines + configured for a 256k max context). +
  • +
+
+
+ +
+

Datasets

+ +
+
+
+ ); +} diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx new file mode 100644 index 00000000..415a430d --- /dev/null +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -0,0 +1,109 @@ +'use client'; + +import Link from 'next/link'; +import { useSearchParams } from 'next/navigation'; + +import { Card } from '@/components/ui/card'; +import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; +import { useDatasetConversation } from '@/hooks/api/use-datasets'; +import { compact, formatShare } from './format'; +import { Stat } from './stat'; + +export function ConversationView({ slug, convId }: { slug: string; convId: string }) { + const { data, isLoading, isError } = useDatasetConversation(slug, convId); + + // Deep-link target from a request-timeline click: ?raw= or ?turn=[&sa=]. + // useSearchParams (not a one-shot window.location read) so the params are + // present on the very first client-side navigation, not just after a reload. + const params = useSearchParams(); + const turnRaw = params.get('turn'); + const sourceRaw = params.get('raw'); + const sourceInner = params.get('inner'); + const highlight = { + turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null, + raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null, + inner: sourceInner !== null && /^\d+$/u.test(sourceInner) ? Number(sourceInner) : null, + agent: params.get('sa'), + }; + + if (isLoading) { + return ( +
Loading conversation…
+ ); + } + if (isError || !data) { + return ( +
+ Conversation not found.{' '} + + Back to dataset + +
+ ); + } + + const cachedPct = formatShare(data.total_cached, data.total_in); + + return ( +
+
+
+ + Datasets + + / + + {slug} + + / + conversation +
+

+ {data.conv_id} +

+ {data.models.length > 0 && ( +
+ {data.models.map((m) => ( + + {m} + + ))} +
+ )} +
+ + +
+ + + + + + +
+
+ + +

Flamegraph

+

+ One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default — + click a group to expand it. Each bar splits input into cached prefix and uncached suffix, + plus generated output. Timestamps are elapsed from conversation start; subagent headers + show their full active range. A colored bracket on the left groups requests in the same + main-agent or subagent scope whose original execution intervals overlapped (ran in + parallel). +

+ +
+
+ ); +} diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx new file mode 100644 index 00000000..051e7457 --- /dev/null +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -0,0 +1,312 @@ +'use client'; + +import { useState } from 'react'; +import Link from 'next/link'; + +import { Card } from '@/components/ui/card'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { DistributionCard } from '@/components/datasets/distribution-card'; +import { + useDataset, + useDatasetConversations, + type ConversationSort, +} from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; +import { compact, formatPct, formatShare, perConversation } from './format'; +import { Stat } from './stat'; + +const PAGE = 50; + +const SORTS: { value: ConversationSort; label: string }[] = [ + { value: 'tokens', label: 'Total input ↓' }, + { value: 'turns', label: 'Turns ↓' }, + { value: 'subagents', label: 'Subagent groups ↓' }, + { value: 'id', label: 'Conversation ID' }, +]; + +export function DatasetDetail({ slug }: { slug: string }) { + const { data: dataset, isLoading, isError } = useDataset(slug); + const [search, setSearch] = useState(''); + const [sort, setSort] = useState('tokens'); + const [page, setPage] = useState(0); + + const { data: convs, isFetching } = useDatasetConversations({ + slug, + search, + sort, + limit: PAGE, + offset: page * PAGE, + }); + + if (isLoading) { + return
Loading dataset…
; + } + if (isError || !dataset) { + return ( +
+ Dataset not found.{' '} + + Back to datasets + +
+ ); + } + + const s = dataset.summary ?? {}; + const cd = dataset.chart_data ?? {}; + const total = convs?.total ?? 0; + const pageCount = Math.ceil(total / PAGE); + + return ( +
+ {/* header */} +
+
+ + ← Datasets + +
+ + {dataset.description && ( +

{dataset.description}

+ )} +
+ + {/* summary stats */} + +
+ + + + + + + + +
+ {s.modelMix && Object.keys(s.modelMix).length > 0 && ( +
+
+ Model mix (turns) +
+
+ {Object.entries(s.modelMix) + .toSorted((a, b) => b[1] - a[1]) + .map(([model, count]) => ( + + {model} {compact(count)} + + ))} +
+
+ )} +
+ + {/* distribution cards */} +
+

Distributions

+
+ + + + + + + +
+
+ + {/* conversation list */} +
+
+

+ Conversations{' '} + ({total}) +

+
+ { + setSearch(e.target.value); + setPage(0); + }} + placeholder="Search by ID…" + className="h-8 w-40 rounded-md border border-border/40 bg-background px-2 text-xs outline-none focus:border-primary" + /> + +
+
+ + + + + + + + + + + + + + + {(convs?.items ?? []).map((c) => { + const cachedPct = formatShare(c.total_cached, c.total_in); + return ( + + + + + + + + + ); + })} + {!isFetching && (convs?.items.length ?? 0) === 0 && ( + + + + )} + +
ConversationTurnsSubagentsInputOutputCached
+ track('datasets_conversation_clicked', { slug })} + className="font-mono text-xs text-primary hover:underline" + > + {c.conv_id.slice(0, 20)}… + + {c.models.length > 0 && ( + + {c.models.length} model{c.models.length === 1 ? '' : 's'} + + )} + {c.num_turns}{c.num_subagent_groups}{compact(c.total_in)}{compact(c.total_out)} + {cachedPct} +
+ No conversations match. +
+
+ + {pageCount > 1 && ( +
+ + + Page {page + 1} of {pageCount} + + +
+ )} +
+
+ ); +} diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx new file mode 100644 index 00000000..d85d7eaa --- /dev/null +++ b/packages/app/src/components/datasets/dataset-list.tsx @@ -0,0 +1,86 @@ +'use client'; + +import Link from 'next/link'; + +import { Card } from '@/components/ui/card'; +import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; +import { compact, formatPct, perConversation } from './format'; + +function DatasetCard({ d }: { d: DatasetRecord }) { + const s = d.summary ?? {}; + const cachedPct = formatPct(s.cachedPct); + return ( + track('datasets_card_clicked', { slug: d.slug })} + className="block transition-colors hover:[&_*]:border-primary/40" + > + +
+

{d.label}

+ + {d.variant} + +
+ {d.description && ( +

{d.description}

+ )} +
+ + + + + + + + +
+
View dataset →
+
+ + ); +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} + +export function DatasetList() { + const { data, isLoading, isError } = useDatasets(); + + if (isLoading) { + return
Loading datasets…
; + } + if (isError || !data) { + return ( +
Failed to load datasets.
+ ); + } + if (data.length === 0) { + return ( +
+ No datasets ingested yet. +
+ ); + } + + return ( +
+ {data.map((d) => ( + + ))} +
+ ); +} diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx new file mode 100644 index 00000000..8adc02ee --- /dev/null +++ b/packages/app/src/components/datasets/distribution-card.tsx @@ -0,0 +1,220 @@ +'use client'; + +import { useMemo } from 'react'; + +import { Card } from '@/components/ui/card'; +import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover'; +import type { Distribution } from '@/hooks/api/use-datasets'; +import { compact } from './format'; + +interface DistributionCardProps { + title: string; + subtitle?: string; + unit: string; + distribution?: Distribution; + scale?: 'log' | 'linear'; + /** Format the x value (defaults to compact). e.g. percent for cached fraction. */ + formatValue?: (v: number) => string; +} + +const W = 720; +const H = 240; +const PAD = { top: 12, right: 16, bottom: 48, left: 52 }; + +/** + * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a + * themeable bar chart with p50/p75/p90/p95 guide lines and a hover tooltip. Bars are + * drawn at equal visual width; for log-scaled bins the edge labels are already + * log-spaced so the shape reads as a log histogram. + */ +export function DistributionCard({ + title, + subtitle, + unit, + distribution, + scale = 'linear', + formatValue = compact, +}: DistributionCardProps) { + const computed = useMemo(() => { + const bins = distribution?.bins ?? []; + if (bins.length === 0) return null; + const maxCount = Math.max(1, ...bins.map((b) => b.count)); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const n = bins.length; + const barW = innerW / n; + // Map a data value to an x pixel by locating its bin (positional — works for + // both linear and log bins since the edges are precomputed at ingest). + // Out-of-range values clamp to the first/last bin. 
+ const valueToX = (v: number): number => { + for (let i = 0; i < n; i++) { + if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) { + return PAD.left + (i + 0.5) * barW; + } + } + if (v <= bins[0].x0) return PAD.left + 0.5 * barW; + return PAD.left + (n - 0.5) * barW; + }; + return { bins, maxCount, innerW, innerH, n, barW, valueToX }; + }, [distribution]); + + if (!computed) { + return ( + +
{title}
+
+ No data +
+
+ ); + } + + const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed; + const stats = distribution?.stats; + + const guides: { label: string; value: number; color: string }[] = stats + ? [ + { label: 'p50', value: stats.median, color: '#3b82f6' }, + ...(typeof stats.p75 === 'number' + ? [{ label: 'p75', value: stats.p75, color: '#22c55e' }] + : []), + { label: 'p90', value: stats.p90, color: '#f59e0b' }, + ...(typeof stats.p95 === 'number' + ? [{ label: 'p95', value: stats.p95, color: '#ef4444' }] + : []), + ] + : []; + + // X tick labels from a few bin edges. + const tickIdxs = [0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1]; + + const resolve = (fraction: number) => { + const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n))); + const b = bins[i]; + const items: HoverItem[] = [ + { + color: 'currentColor', + label: 'Range', + value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`, + }, + { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() }, + ]; + return { items }; + }; + + return ( + +
+ {title} + {scale === 'log' && ( + + log scale + + )} +
+ {subtitle &&
{subtitle}
} + {stats && ( +
+ n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)} + {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}} · p90{' '} + {formatValue(stats.p90)} + {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}} · max{' '} + {formatValue(stats.max)} {unit} +
+ )} +
+ + {/* bars */} + {bins.map((b, i) => { + const h = (b.count / maxCount) * innerH; + const x = PAD.left + i * barW; + const y = PAD.top + (innerH - h); + return ( + + ); + })} + + {/* guide lines */} + {guides.map((g) => { + const x = valueToX(g.value); + return ( + + ); + })} + + {/* x axis */} + + {tickIdxs.map((i, k) => { + const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle'; + const x = PAD.left + (i + 0.5) * barW; + return ( + + {formatValue(bins[i].x0)} + + ); + })} + + {unit} + + + {/* guide legend */} + {guides.map((g, i) => ( + + + + {g.label} {formatValue(g.value)} + + + ))} + +
+
+ ); +} diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts new file mode 100644 index 00000000..fd526d12 --- /dev/null +++ b/packages/app/src/components/datasets/format.ts @@ -0,0 +1,28 @@ +/** + * Compact number formatter for dataset token/count displays: + * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82". + */ +export function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + if (abs > 0 && abs < 1) return n.toFixed(2); + return String(Math.round(n)); +} + +/** Format a per-conversation count without hiding a meaningful fractional mean. */ +export function perConversation(n: number | undefined): string { + if (typeof n !== 'number' || !Number.isFinite(n)) return '—'; + return n.toLocaleString(undefined, { maximumFractionDigits: 1 }); +} + +/** Format a 0–1 fraction as a whole percent ("42%"), em dash when absent. */ +export function formatPct(fraction: number | undefined): string { + return typeof fraction === 'number' ? `${(fraction * 100).toFixed(0)}%` : '—'; +} + +/** Percent share of `part` in `total` ("42%"), em dash when `total` is 0. */ +export function formatShare(part: number, total: number): string { + return total > 0 ? `${((part / total) * 100).toFixed(0)}%` : '—'; +} diff --git a/packages/app/src/components/datasets/stat.tsx b/packages/app/src/components/datasets/stat.tsx new file mode 100644 index 00000000..3fb6a32a --- /dev/null +++ b/packages/app/src/components/datasets/stat.tsx @@ -0,0 +1,9 @@ +/** Label/value pair for the summary
grids on dataset and conversation pages. */ +export function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph-model.ts b/packages/app/src/components/datasets/trace-flamegraph-model.ts new file mode 100644 index 00000000..2aff9ac3 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph-model.ts @@ -0,0 +1,422 @@ +/** + * Pure logic for the trace flamegraph: overlap detection, deep-link resolution, + * visible-row construction, and bracket-lane layout. No React/DOM — everything + * here is unit-testable directly. Rendering lives in trace-flamegraph.tsx. + */ + +import type { StructureNode } from '@/hooks/api/use-datasets'; + +// Kept distinct from token-segment colors. A row can carry multiple rails when +// it overlaps different requests during different parts of its lifetime. +export const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] as const; + +// Cap on simultaneously-drawn bracket lanes. A pathological conversation (e.g. a +// long-running session whose subagent fans out into hundreds of children with +// 15+ concurrent requests) can require dozens of lanes; left unbounded the +// gutter grows wide enough to push the bars off-screen AND emits one DOM node +// per lane per row (tens of thousands of empty divs). We bound it: lanes beyond +// the cap fold into the last "dense" lane, which stays readable for the common +// case (≤6 concurrent) and degrades gracefully for the outliers. +export const MAX_LANES = 6; + +export interface TimedRequest { + key: string; + startS?: number; + endS?: number; +} + +export interface RequestOverlapGroup { + id: string; + requestKeys: string[]; + startS: number; + endS: number; +} + +/** + * Find maximal sets of requests that were simultaneously in flight. + * Intervals are half-open, so one request ending exactly when another begins + * is serialized rather than parallel. 
Maximal-set filtering prevents a nested + * A/B pair from duplicating an A/B/C marker, while preserving A/B and B/C as + * separate groups when their overlaps happen at different times. + */ +export function findRequestOverlapGroups( + requests: TimedRequest[], + scopeKey = 'scope', +): RequestOverlapGroup[] { + const valid = requests.filter( + (request): request is TimedRequest & { startS: number; endS: number } => + Number.isFinite(request.startS) && + Number.isFinite(request.endS) && + request.endS! > request.startS!, + ); + const boundaries = [ + ...new Set(valid.flatMap((request) => [request.startS, request.endS])), + ].toSorted((a, b) => a - b); + const candidates = new Map>(); + + for (let i = 0; i < boundaries.length - 1; i++) { + const startS = boundaries[i]!; + const endS = boundaries[i + 1]!; + if (endS <= startS) continue; + const requestKeys = valid + .filter((request) => request.startS <= startS && request.endS >= endS) + .map((request) => request.key) + .toSorted(); + if (requestKeys.length < 2) continue; + const key = requestKeys.join('\u0000'); + const existing = candidates.get(key); + candidates.set(key, { + requestKeys, + startS: existing ? Math.min(existing.startS, startS) : startS, + endS: existing ? 
Math.max(existing.endS, endS) : endS, + }); + } + + const maximal = [...candidates.values()].filter( + (candidate, _, all) => + !all.some( + (other) => + other.requestKeys.length > candidate.requestKeys.length && + candidate.requestKeys.every((key) => other.requestKeys.includes(key)), + ), + ); + + return maximal + .toSorted( + (a, b) => + a.startS - b.startS || + a.endS - b.endS || + a.requestKeys.join(',').localeCompare(b.requestKeys.join(',')), + ) + .map((group, index) => ({ ...group, id: `${scopeKey}-${index + 1}` })); +} + +export interface RowOverlap { + id: string; + label: string; + color: string; + startS: number; + endS: number; + peerCount: number; +} + +export interface VisibleRow { + key: string; + label: string; + sublabel?: string; + timeLabel?: string; + cached: number; + uncached: number; + output: number; + total: number; + indent: number; + isGroup: boolean; + isExpanded: boolean; + groupIndex?: number; + overlaps: RowOverlap[]; +} + +/** Format seconds from conversation start as a compact elapsed timestamp. */ +export function formatElapsedTime(seconds: number): string { + const total = Math.max(0, Math.round(seconds)); + const hours = Math.floor(total / 3600); + const minutes = Math.floor((total % 3600) / 60); + const secs = total % 60; + const mm = String(minutes).padStart(2, '0'); + const ss = String(secs).padStart(2, '0'); + return hours > 0 ? `${hours}:${mm}:${ss}` : `${mm}:${ss}`; +} + +/** Elapsed-interval label for a row ("+MM:SS–MM:SS"), or undefined when untimed. 
*/ +export function timeLabel(startS?: number, endS?: number): string | undefined { + if (startS === undefined || !Number.isFinite(startS)) return undefined; + const start = formatElapsedTime(startS); + if (endS === undefined || !Number.isFinite(endS) || endS <= startS) return `+${start}`; + return `+${start}–${formatElapsedTime(endS)}`; +} + +export interface DeepLinkHighlight { + turn?: number | null; + raw?: number | null; + inner?: number | null; + agent?: string | null; +} + +export interface DeepLinkTarget { + rowKey: string; + expandGroup: number | null; +} + +/** + * Resolve a request-timeline deep link to a flamegraph row key (+ the subagent + * group that must be expanded to show it). Raw Weka source coordinates are + * exact and take precedence: + * raw= -> top-level Weka request + * raw=&inner= -> subagent child inside that top-level marker + * Otherwise main turns match by main-turn ordinal and subagent turns match the + * group by agentId, then the ti-th child. + * + * `buildConversationStructure` emits exactly one node per raw Weka entry (and + * one child per nested entry), so a node's array position IS its raw index. + * Structures ingested before rawIndex/innerIndex were stored omit the explicit + * fields — fall back to the array position so deep links keep resolving against + * those older rows instead of silently doing nothing. + */ +export function resolveDeepLinkTarget( + nodes: readonly StructureNode[], + highlight: DeepLinkHighlight, +): DeepLinkTarget | null { + const { turn, raw, inner, agent } = highlight; + if (typeof raw === 'number' && raw >= 0) { + if (typeof inner === 'number' && inner >= 0) { + const gi = nodes.findIndex( + (node, i) => node.kind === 'subagent' && (node.rawIndex ?? i) === raw, + ); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + const ci = group.children.findIndex((child, i) => (child.innerIndex ?? 
i) === inner); + if (ci === -1) return null; + return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi }; + } + const i = nodes.findIndex( + (node, idx) => node.kind === 'turn' && (node.rawIndex ?? idx) === raw, + ); + if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null }; + return null; + } + if (typeof turn !== 'number' || turn < 0) return null; + if (agent) { + const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === agent); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + if (turn >= group.children.length) return null; + return { rowKey: `g-${gi}-c-${turn}`, expandGroup: gi }; + } + let ordinal = 0; + for (let i = 0; i < nodes.length; i++) { + if (nodes[i].kind === 'turn') { + if (ordinal === turn) return { rowKey: `t-${i}`, expandGroup: null }; + ordinal += 1; + } + } + return null; +} + +/** + * Overlap groups per row key. Main-agent turns and each subagent's children are + * separate scopes — parallelism is only meaningful within one agent's stream. + */ +export function buildRowOverlaps(nodes: readonly StructureNode[]): Map { + const mainGroups = findRequestOverlapGroups( + nodes.flatMap((node, i) => + node.kind === 'turn' ? [{ key: `t-${i}`, startS: node.startS, endS: node.endS }] : [], + ), + 'main', + ); + const subagentGroups = nodes.flatMap((node, i) => + node.kind === 'subagent' + ? findRequestOverlapGroups( + node.children.map((child, ci) => ({ + key: `g-${i}-c-${ci}`, + startS: child.startS, + endS: child.endS, + })), + `subagent-${i}`, + ) + : [], + ); + const groups: RequestOverlapGroup[] = [...mainGroups, ...subagentGroups]; + + const byRow = new Map(); + groups.forEach((group, groupIndex) => { + const overlap = { + id: group.id, + label: `P${groupIndex + 1}`, + color: OVERLAP_COLORS[groupIndex % OVERLAP_COLORS.length]!, + startS: group.startS, + endS: group.endS, + peerCount: group.requestKeys.length - 1, + }; + group.requestKeys.forEach((key) => byRow.set(key, [...(byRow.get(key) ?? 
[]), overlap])); + }); + return byRow; +} + +/** + * Flatten structure nodes into the rows currently visible: one row per main + * turn, one header per subagent group, plus indented children for expanded + * groups. Row keys (`t-`, `g-`, `g--c-`) index by node position so + * they stay stable across expand/collapse. + */ +export function buildVisibleRows( + nodes: readonly StructureNode[], + expanded: ReadonlySet, + overlapsByRow: ReadonlyMap, +): VisibleRow[] { + const out: VisibleRow[] = []; + let turnNo = 0; + nodes.forEach((node: StructureNode, i) => { + if (node.kind === 'turn') { + turnNo += 1; + out.push({ + key: `t-${i}`, + label: `Turn ${turnNo}`, + sublabel: node.model ?? undefined, + timeLabel: timeLabel(node.startS, node.endS), + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: false, + isExpanded: false, + overlaps: overlapsByRow.get(`t-${i}`) ?? [], + }); + } else { + const isExpanded = expanded.has(i); + out.push({ + key: `g-${i}`, + label: `${node.label}`, + sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${ + node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '' + }`, + timeLabel: timeLabel(node.startS, node.endS), + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: true, + isExpanded, + groupIndex: i, + overlaps: [], + }); + if (isExpanded) { + node.children.forEach((child, ci) => { + out.push({ + key: `g-${i}-c-${ci}`, + label: `↳ subturn ${ci + 1}`, + sublabel: child.model ?? undefined, + timeLabel: timeLabel(child.startS, child.endS), + cached: child.cached, + uncached: child.uncached, + output: child.out, + total: child.in + child.out, + indent: 1, + isGroup: false, + isExpanded: false, + overlaps: overlapsByRow.get(`g-${i}-c-${ci}`) ?? 
[], + }); + }); + } + } + }); + return out; +} + +export interface BraceSeg { + role: 'first' | 'middle' | 'last' | 'through'; + isMember: boolean; + color: string; + groupId: string; + peerCount: number; + startS: number; + endS: number; +} + +export interface BraceLayout { + laneCount: number; + overflowLanes: number; + /** Per visible row: only the lanes that actually carry a bracket segment. */ + rowSegs: { lane: number; seg: BraceSeg }[][]; +} + +/** + * Geometry for the parallel-group brackets drawn in the left gutter. Each + * overlap group becomes a vertical bracket spanning from its first to its last + * visible member row, with a right-pointing tick on the exact member rows. + * Non-transitive chains (a row in two groups) get separate lanes so their + * brackets sit side by side. `through` = a row inside a group's span that is + * NOT itself a member (the aux-stream edge case) — drawn as a faint connector + * with no tick. + */ +export function computeBraceLayout(rows: readonly VisibleRow[]): BraceLayout { + const groupMap = new Map< + string, + { id: string; color: string; peerCount: number; startS: number; endS: number; idxs: number[] } + >(); + rows.forEach((r, idx) => { + for (const ov of r.overlaps) { + const g = groupMap.get(ov.id) ?? { + id: ov.id, + color: ov.color, + peerCount: ov.peerCount, + startS: ov.startS, + endS: ov.endS, + idxs: [], + }; + g.idxs.push(idx); + groupMap.set(ov.id, g); + } + }); + const groups = [...groupMap.values()] + .filter((g) => g.idxs.length >= 2) // need ≥2 visible members to bracket + .map((g) => ({ + ...g, + min: Math.min(...g.idxs), + max: Math.max(...g.idxs), + members: new Set(g.idxs), + })) + .toSorted((a, b) => a.min - b.min || a.max - b.max); + + // Greedy lane assignment: a group reuses a lane whose previous group ended + // before this one starts. 
+ const laneEnd: number[] = []; + const laneOf = new Map(); + for (const g of groups) { + let lane = laneEnd.findIndex((end) => end < g.min); + if (lane === -1) { + lane = laneEnd.length; + laneEnd.push(g.max); + } else { + laneEnd[lane] = g.max; + } + laneOf.set(g.id, lane); + } + const rawLaneCount = laneEnd.length; + // Bound the gutter (see MAX_LANES). Lanes past the cap collapse onto the last + // visible lane, so every parallel row still carries a marker but the gutter + // width and DOM-node count stay bounded regardless of how parallel the + // conversation is. + const laneCount = Math.min(rawLaneCount, MAX_LANES); + const displayLane = (lane: number) => Math.min(lane, laneCount - 1); + + // Sparse per-row segments: only lanes that actually carry a bracket on a row + // are stored (and later rendered). The previous dense matrix emitted one DOM + // node per lane per row — catastrophic at 49 lanes × 2k rows. + const rowSegs: { lane: number; seg: BraceSeg }[][] = rows.map(() => []); + for (const g of groups) { + const lane = displayLane(laneOf.get(g.id)!); + for (let idx = g.min; idx <= g.max; idx++) { + const isMember = g.members.has(idx); + const role = + idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through'; + const seg: BraceSeg = { + role, + isMember, + color: g.color, + groupId: g.id, + peerCount: g.peerCount, + startS: g.startS, + endS: g.endS, + }; + const cell = rowSegs[idx]!; + const existing = cell.find((c) => c.lane === lane); + // Collisions only happen in the folded overflow lane. Prefer a real + // member marker over a faint pass-through connector. 
+ if (!existing) cell.push({ lane, seg }); + else if (seg.isMember && !existing.seg.isMember) existing.seg = seg; + } + } + return { laneCount, overflowLanes: rawLaneCount - laneCount, rowSegs }; +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts new file mode 100644 index 00000000..0af344f1 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts @@ -0,0 +1,246 @@ +import { describe, expect, it } from 'vitest'; + +import type { + StructureNode, + SubagentNode, + TurnNode, +} from '@semianalysisai/inferencex-db/etl/weka-structure'; + +import { + buildRowOverlaps, + buildVisibleRows, + computeBraceLayout, + findRequestOverlapGroups, + formatElapsedTime, + resolveDeepLinkTarget, + timeLabel, +} from './trace-flamegraph-model'; + +describe('formatElapsedTime', () => { + it('formats elapsed seconds below and above one hour', () => { + expect(formatElapsedTime(0)).toBe('00:00'); + expect(formatElapsedTime(65.4)).toBe('01:05'); + expect(formatElapsedTime(3661.6)).toBe('1:01:02'); + expect(formatElapsedTime(86_541.149)).toBe('24:02:21'); + }); + + it('clamps negative offsets to the conversation origin', () => { + expect(formatElapsedTime(-5)).toBe('00:00'); + }); +}); + +describe('timeLabel', () => { + it('renders a range when the end is after the start, a point otherwise', () => { + expect(timeLabel(65, 130)).toBe('+01:05–02:10'); + expect(timeLabel(65)).toBe('+01:05'); + expect(timeLabel(65, 65)).toBe('+01:05'); + expect(timeLabel(undefined, 130)).toBeUndefined(); + expect(timeLabel(Number.NaN, 130)).toBeUndefined(); + }); +}); + +describe('findRequestOverlapGroups', () => { + it('keeps non-transitive overlap chains as separate groups', () => { + const groups = findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 8 }, + { key: 'B', startS: 5, endS: 11 }, + { key: 'C', startS: 9, endS: 15 }, + ]); + + expect(groups.map((group) => 
group.requestKeys)).toEqual([ + ['A', 'B'], + ['B', 'C'], + ]); + expect(groups.map(({ startS, endS }) => [startS, endS])).toEqual([ + [5, 8], + [9, 11], + ]); + }); + + it('does not consider touching or invalid intervals parallel', () => { + expect( + findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 5 }, + { key: 'B', startS: 5, endS: 8 }, + { key: 'missing-end', startS: 3 }, + { key: 'zero-duration', startS: 4, endS: 4 }, + ]), + ).toEqual([]); + }); + + it('returns only the maximal simultaneous set for nested intervals', () => { + const groups = findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 10 }, + { key: 'B', startS: 2, endS: 8 }, + { key: 'C', startS: 3, endS: 7 }, + ]); + expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]); + }); +}); + +const turn = (turnIndex: number, extra: Partial = {}): TurnNode => ({ + kind: 'turn', + turnIndex, + in: 100, + out: 10, + cached: 0, + uncached: 100, + ...extra, +}); +const subagent = (children: TurnNode[], extra: Partial = {}): SubagentNode => ({ + kind: 'subagent', + label: 'Subagent', + in: 100, + out: 10, + cached: 0, + uncached: 100, + children, + ...extra, +}); + +describe('resolveDeepLinkTarget', () => { + // Node layout mirroring a real Weka conversation: raw entries + // 0: turn, 1: subagent (2 children), 2: turn + const withRawIndexes: StructureNode[] = [ + turn(0, { rawIndex: 0 }), + subagent([turn(1, { rawIndex: 1, innerIndex: 0 }), turn(2, { rawIndex: 1, innerIndex: 1 })], { + agentId: 'subagent_001_abcd1234', + rawIndex: 1, + }), + turn(3, { rawIndex: 2 }), + ]; + // The same conversation as stored by the pre-rawIndex ingest (fields absent). 
+ const legacy: StructureNode[] = [ + turn(0), + subagent([turn(1), turn(2)], { agentId: 'subagent_001_abcd1234' }), + turn(3), + ]; + + it('resolves raw source coordinates against explicit rawIndex fields', () => { + expect(resolveDeepLinkTarget(withRawIndexes, { raw: 2 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(withRawIndexes, { raw: 1, inner: 1 })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + }); + + it('falls back to node array position for structures ingested before rawIndex existed', () => { + // One node per raw entry means position === raw index, so the deep link + // must still resolve exactly (regression: it previously returned null and + // the flamegraph neither scrolled nor highlighted anything). + expect(resolveDeepLinkTarget(legacy, { raw: 2, turn: 1 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(legacy, { raw: 0, turn: 0 })).toEqual({ + rowKey: 't-0', + expandGroup: null, + }); + }); + + it('resolves subagent children positionally when innerIndex is absent', () => { + expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 1, turn: 1 })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + }); + + it('returns null for out-of-range raw coordinates instead of guessing', () => { + expect(resolveDeepLinkTarget(legacy, { raw: 9 })).toBeNull(); + expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 5 })).toBeNull(); + // raw pointing at a subagent marker without inner does not match a turn. 
+ expect(resolveDeepLinkTarget(legacy, { raw: 1 })).toBeNull(); + }); + + it('keeps the positional turn/agent fallback for links without raw coordinates', () => { + expect(resolveDeepLinkTarget(legacy, { turn: 1 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(legacy, { turn: 1, agent: 'subagent_001_abcd1234' })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + expect(resolveDeepLinkTarget(legacy, {})).toBeNull(); + }); +}); + +describe('buildVisibleRows', () => { + const nodes: StructureNode[] = [ + turn(0, { startS: 0, endS: 10, model: 'claude' }), + subagent([turn(1), turn(2)], { label: 'Subagent: search', durationMs: 12_000 }), + turn(3), + ]; + + it('hides collapsed subagent children and keys rows by node position', () => { + const rows = buildVisibleRows(nodes, new Set(), new Map()); + expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 't-2']); + expect(rows[0]).toMatchObject({ + label: 'Turn 1', + sublabel: 'claude', + timeLabel: '+00:00–00:10', + total: 110, + isGroup: false, + }); + expect(rows[1]).toMatchObject({ + label: 'Subagent: search', + sublabel: '2 turns · 12s', + isGroup: true, + isExpanded: false, + groupIndex: 1, + }); + }); + + it('inserts indented child rows for expanded groups and attaches overlaps', () => { + const overlap = { + id: 'main-1', + label: 'P1', + color: '#06b6d4', + startS: 0, + endS: 1, + peerCount: 1, + }; + const rows = buildVisibleRows(nodes, new Set([1]), new Map([['g-1-c-0', [overlap]]])); + expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 'g-1-c-0', 'g-1-c-1', 't-2']); + expect(rows[2]).toMatchObject({ label: '↳ subturn 1', indent: 1, overlaps: [overlap] }); + expect(rows[3]!.overlaps).toEqual([]); + }); +}); + +describe('buildRowOverlaps and computeBraceLayout', () => { + it('brackets parallel main turns and spans a non-member row as pass-through', () => { + const nodes: StructureNode[] = [ + turn(0, { startS: 0, endS: 10 }), + turn(1), // untimed — sits inside the 
bracket span without being a member + turn(2, { startS: 5, endS: 30 }), // overlaps turn 0 and turn 3 + turn(3, { startS: 28, endS: 40 }), + ]; + const overlaps = buildRowOverlaps(nodes); + expect([...overlaps.keys()].toSorted()).toEqual(['t-0', 't-2', 't-3']); + + const rows = buildVisibleRows(nodes, new Set(), overlaps); + const layout = computeBraceLayout(rows); + // Two overlap groups sharing rows 0–2 and 2–3 need two side-by-side lanes. + expect(layout.laneCount).toBe(2); + expect(layout.overflowLanes).toBe(0); + const roles = layout.rowSegs.map((segs) => + segs.map(({ lane, seg }) => `${lane}:${seg.role}${seg.isMember ? '' : ':nonmember'}`), + ); + expect(roles[0]).toEqual(['0:first']); + expect(roles[1]).toEqual(['0:through:nonmember']); + expect(roles[2]!.toSorted()).toEqual(['0:last', '1:first']); + expect(roles[3]).toEqual(['1:last']); + }); + + it('reports no lanes for a fully serial conversation', () => { + const nodes: StructureNode[] = [ + turn(0, { startS: 0, endS: 5 }), + turn(1, { startS: 5, endS: 9 }), + ]; + const rows = buildVisibleRows(nodes, new Set(), buildRowOverlaps(nodes)); + expect(computeBraceLayout(rows)).toEqual({ laneCount: 0, overflowLanes: 0, rowSegs: [[], []] }); + }); +}); diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx new file mode 100644 index 00000000..d63cc691 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -0,0 +1,439 @@ +'use client'; + +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { createPortal } from 'react-dom'; + +import type { ConversationStructure } from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; +import { compact, formatShare } from './format'; +import { + buildRowOverlaps, + buildVisibleRows, + computeBraceLayout, + formatElapsedTime, + MAX_LANES, + OVERLAP_COLORS, + resolveDeepLinkTarget, + type VisibleRow, +} from 
'./trace-flamegraph-model'; + +// Pure logic lives in trace-flamegraph-model.ts; re-exported here so this file +// stays the module entry point for the flamegraph's public API. +export { + findRequestOverlapGroups, + formatElapsedTime, + resolveDeepLinkTarget, +} from './trace-flamegraph-model'; +export type { + DeepLinkHighlight, + DeepLinkTarget, + RequestOverlapGroup, + TimedRequest, +} from './trace-flamegraph-model'; + +// Stacked-bar segment colors. Cached prefix vs uncached input vs output — +// fixed hues (theme-independent) so the meaning is stable in light/dark. +const SEG = { + cached: '#10b981', // emerald-500 — input served from prefix cache + uncached: '#f59e0b', // amber-500 — input that must be (re)computed + output: '#8b5cf6', // violet-500 — generated tokens +} as const; + +const LEGEND = [ + { key: 'cached', label: 'Cached prefix', color: SEG.cached }, + { key: 'uncached', label: 'Uncached input', color: SEG.uncached }, + { key: 'output', label: 'Output', color: SEG.output }, +] as const; + +// Width (px) of one parallel-group bracket lane in the left gutter. Overlapping +// groups (non-transitive chains) get their own lane so their brackets sit +// side-by-side instead of stacking visually. +const LANE_W = 14; + +interface TooltipState { + x: number; + y: number; + row: VisibleRow; +} + +/** + * Per-conversation flamegraph driven by the precomputed `structure` JSONB. + * One row per turn; subagent groups render a collapsible header with indented + * children (collapsed by default). Each bar stacks cached-prefix + uncached + * input + output, scaled to the widest visible turn. + */ +export function TraceFlamegraph({ + structure, + highlightTurn, + highlightRawIndex, + highlightInnerIndex, + highlightAgentId, +}: { + structure: ConversationStructure; + /** Turn index to scroll to / highlight (from a request-timeline deep link). */ + highlightTurn?: number | null; + /** Raw Weka top-level request index to scroll to / highlight. 
*/ + highlightRawIndex?: number | null; + /** Raw Weka nested request index under highlightRawIndex, for subagent children. */ + highlightInnerIndex?: number | null; + /** Subagent id when the highlighted turn is inside a subagent group. */ + highlightAgentId?: string | null; +}) { + const nodes = structure.nodes; + + // Resolve the deep-link target to a row key (+ the group that must be open to + // show it). See resolveDeepLinkTarget for the matching rules. + const target = useMemo( + () => + resolveDeepLinkTarget(nodes, { + turn: highlightTurn, + raw: highlightRawIndex, + inner: highlightInnerIndex, + agent: highlightAgentId, + }), + [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId], + ); + + // Subagent groups collapsed by default — except the deep-link target's group. + const [expanded, setExpanded] = useState>(() => + typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(), + ); + const [tooltip, setTooltip] = useState(null); + const scrollRef = useRef(null); + + // Portal target only exists after mount (the tooltip is portaled to body so + // its position:fixed is viewport-relative, immune to ancestor transforms). + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + + // The deep-link target row gets a state-driven highlight (ring + bg flash) + // that fades out — state-driven so a re-render can't clobber it, and so the + // fade is a real CSS transition rather than an abrupt classList removal. + const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null); + + // When the deep-link target resolves/changes: expand its subagent group, then + // (after the row renders) scroll it into view and flash the highlight. Runs on + // first load and on any later target change (e.g. clicking another bar into + // the same conversation). The row query/scroll is deferred to the next frame + // so the just-expanded child row exists in the DOM. 
+ useEffect(() => { + if (!target) return; + if (typeof target.expandGroup === 'number') { + const gi = target.expandGroup; + setExpanded((prev) => (prev.has(gi) ? prev : new Set(prev).add(gi))); + } + setHighlightKey(target.rowKey); + const raf = requestAnimationFrame(() => { + scrollRef.current + ?.querySelector(`[data-rowkey="${target.rowKey}"]`) + ?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + }); + const t = setTimeout(() => setHighlightKey(null), 2200); + return () => { + cancelAnimationFrame(raf); + clearTimeout(t); + }; + }, [target]); + + const groupIndexes = useMemo(() => { + const out: number[] = []; + nodes.forEach((node, i) => { + if (node.kind === 'subagent') out.push(i); + }); + return out; + }, [nodes]); + + const toggle = useCallback((i: number) => { + setExpanded((prev) => { + const next = new Set(prev); + if (next.has(i)) next.delete(i); + else next.add(i); + return next; + }); + }, []); + + const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]); + const collapseAll = useCallback(() => setExpanded(new Set()), []); + + const overlapsByRow = useMemo(() => buildRowOverlaps(nodes), [nodes]); + + const rows = useMemo( + () => buildVisibleRows(nodes, expanded, overlapsByRow), + [nodes, expanded, overlapsByRow], + ); + + // Two scales: leaf turns/subturns share a per-turn axis (the primary signal — + // how cached/uncached evolves), while subagent group headers carry aggregates + // orders of magnitude larger, so they get their own axis to stay comparable to + // each other. Group bars render slim + muted, so the mixed scale reads as a + // distinct "group summary" track rather than a contradiction. 
+ const maxTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)), + [rows], + ); + const maxGroupTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)), + [rows], + ); + + const braces = useMemo(() => computeBraceLayout(rows), [rows]); + + const onMove = (e: React.MouseEvent, row: VisibleRow) => { + setTooltip({ x: e.clientX, y: e.clientY, row }); + }; + + return ( +
+
+
+ {LEGEND.map((l) => ( + + + {l.label} + + ))} + + + Bracketed rows ran in parallel + +
+ {groupIndexes.length > 0 && ( +
+ + +
+ )} +
+ + {braces.overflowLanes > 0 && ( +

+ Dense parallel region — bracket lanes capped at {MAX_LANES}; {braces.overflowLanes}{' '} + further overlapping {braces.overflowLanes === 1 ? 'group is' : 'groups are'} folded into + the last lane. +

+ )} + +
+ {/* gap-0 so the per-row bracket segments connect into a continuous + vertical rail across the rows of a parallel group. */} +
+ {rows.map((row, idx) => { + // Group headers use the group axis; turns/subturns use the per-turn + // axis. Clamp to the track width either way. + const denom = row.isGroup ? maxGroupTotal : maxTotal; + const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100)); + const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; + const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; + const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; + const isHighlighted = row.key === highlightKey; + const segs = braces.rowSegs[idx]!; + return ( +
+ {/* Parallel-group bracket gutter (only rendered when the + conversation has any overlaps, so non-overlap traces keep a + flush-left layout with no dead space). Segments are sparse and + absolutely positioned per lane so a row only pays for the + lanes it actually touches. */} + {braces.laneCount > 0 && ( +
+ {segs.map(({ lane, seg }) => { + const top = seg.role === 'first' ? '50%' : '0'; + const bottom = seg.role === 'last' ? '50%' : '0'; + return ( +
+ {/* vertical rail */} +
+ {/* right-pointing tick marking an actual member row */} + {seg.isMember && ( +
+ )} +
+ ); + })} +
+ )} + + {/* row content (indented for subagent children) */} +
+ {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
+ + {/* Original interval, measured from conversation start. */} +
+ {row.timeLabel ?? '—'} +
+ + {/* stacked bar — group headers render as a slim muted summary + strip so they read as aggregates, not individual turns. */} +
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} + > +
+
+
+
+
+
+ + {/* total */} +
+ {compact(row.total)} +
+
+
+ ); + })} +
+
+ + {tooltip && + mounted && + createPortal( +
+
+ {tooltip.row.label} + {tooltip.row.sublabel ? ( + + {tooltip.row.sublabel} + + ) : null} +
+
+ Cached prefix + + {compact(tooltip.row.cached)} + + Uncached input + + {compact(tooltip.row.uncached)} + + Output + + {compact(tooltip.row.output)} + + Cached % + + {formatShare(tooltip.row.cached, tooltip.row.cached + tooltip.row.uncached)} + + From start + + {tooltip.row.timeLabel ?? '—'} + +
+
, + document.body, + )} +
+ ); +} From 4d5bb87496131bd8a037f91c2123b495abb97ce4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:12:09 -0500 Subject: [PATCH 08/40] =?UTF-8?q?feat(agentic):=20per-point=20detail=20?= =?UTF-8?q?=E2=80=94=20request=20timeline,=20time-series=20charts,=20aggre?= =?UTF-8?q?gates,=20distributions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../inference/agentic/[id]/page.tsx | 17 + .../agentic-point/agentic-point-detail.tsx | 334 ++++++++++ .../agentic-point/aggregate-chart.tsx | 286 +++++++++ .../agentic-point/aggregates-grid.tsx | 104 ++++ .../inference/agentic-point/chart-hover.tsx | 148 +++++ .../inference/agentic-point/chart-shared.tsx | 57 ++ .../agentic-point/dataset-conv-id.test.ts | 53 ++ .../inference/agentic-point/distribution.tsx | 233 +++++++ .../agentic-point/expandable-chart.tsx | 56 ++ .../agentic-point/metric-source-toolbar.tsx | 130 ++++ .../agentic-point/phase-slice.test.ts | 212 +++++++ .../inference/agentic-point/phase-slice.ts | 188 ++++++ .../inference/agentic-point/point-summary.tsx | 50 ++ .../agentic-point/request-metric-cards.tsx | 223 +++++++ .../agentic-point/request-timeline.test.ts | 378 ++++++++++++ .../agentic-point/request-timeline.tsx | 581 ++++++++++++++++++ .../agentic-point/server-metric-cards.tsx | 474 ++++++++++++++ .../inference/agentic-point/sibling-nav.tsx | 247 ++++++++ .../agentic-point/time-series-chart.tsx | 526 ++++++++++++++++ .../agentic-point/time-series-math.test.ts | 457 ++++++++++++++ .../agentic-point/time-series-math.ts | 491 +++++++++++++++ .../inference/agentic-point/timeline-bars.tsx | 252 ++++++++ .../timeline-cursor-stats.test.ts | 69 +++ .../agentic-point/timeline-cursor-stats.ts | 57 ++ .../agentic-point/timeline-format.ts | 15 + .../agentic-point/timeline-layout.ts | 21 + .../inference/agentic-point/timeline-rows.ts | 476 ++++++++++++++ .../agentic-point/timeline-tooltips.tsx | 143 +++++ .../agentic-point/timeline-view-snapshot.ts | 
108 ++++ 29 files changed, 6386 insertions(+) create mode 100644 packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx create mode 100644 packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx create mode 100644 packages/app/src/components/inference/agentic-point/aggregate-chart.tsx create mode 100644 packages/app/src/components/inference/agentic-point/aggregates-grid.tsx create mode 100644 packages/app/src/components/inference/agentic-point/chart-hover.tsx create mode 100644 packages/app/src/components/inference/agentic-point/chart-shared.tsx create mode 100644 packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts create mode 100644 packages/app/src/components/inference/agentic-point/distribution.tsx create mode 100644 packages/app/src/components/inference/agentic-point/expandable-chart.tsx create mode 100644 packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.test.ts create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.ts create mode 100644 packages/app/src/components/inference/agentic-point/point-summary.tsx create mode 100644 packages/app/src/components/inference/agentic-point/request-metric-cards.tsx create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.test.ts create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.tsx create mode 100644 packages/app/src/components/inference/agentic-point/server-metric-cards.tsx create mode 100644 packages/app/src/components/inference/agentic-point/sibling-nav.tsx create mode 100644 packages/app/src/components/inference/agentic-point/time-series-chart.tsx create mode 100644 packages/app/src/components/inference/agentic-point/time-series-math.test.ts create mode 100644 packages/app/src/components/inference/agentic-point/time-series-math.ts create mode 100644 
packages/app/src/components/inference/agentic-point/timeline-bars.tsx create mode 100644 packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts create mode 100644 packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts create mode 100644 packages/app/src/components/inference/agentic-point/timeline-format.ts create mode 100644 packages/app/src/components/inference/agentic-point/timeline-layout.ts create mode 100644 packages/app/src/components/inference/agentic-point/timeline-rows.ts create mode 100644 packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx create mode 100644 packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx new file mode 100644 index 00000000..77f29805 --- /dev/null +++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx @@ -0,0 +1,17 @@ +import type { Metadata } from 'next'; + +import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail'; + +export const metadata: Metadata = { + title: 'Agentic trace detail | InferenceX', + robots: { index: false }, +}; + +export default async function AgenticPointDetailPage({ + params, +}: { + params: Promise<{ id: string }>; +}) { + const { id } = await params; + return ; +} diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx new file mode 100644 index 00000000..64742acd --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -0,0 +1,334 @@ +'use client'; + +import Link from 'next/link'; +import { usePathname, useRouter, useSearchParams } from 'next/navigation'; +import { useCallback, useMemo, useState } from 'react'; +import { ArrowLeft } from 'lucide-react'; + +import { 
useAgenticAggregates } from '@/hooks/api/use-agentic-aggregates'; +import { useRequestTimeline } from '@/hooks/api/use-request-timeline'; +import { useTraceServerMetrics } from '@/hooks/api/use-trace-server-metrics'; +import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings'; +import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from '@/lib/analytics'; + +import { AggregatesGrid } from './aggregates-grid'; +import { MetricSourceToolbar } from './metric-source-toolbar'; +import { + phaseBoundarySec, + sliceServerSeriesByPhase, + sliceTimelineByPhase, + timelineHasWarmup, + type ServerSeriesLike, + type StagePhase, +} from './phase-slice'; +import { PointSummary } from './point-summary'; +import { RequestMetricOverTime, SequenceMetricCard } from './request-metric-cards'; +import { RequestTimelineView } from './request-timeline'; +import { + CumulativeUniqueInputTokensCard, + InflightUniqueTokensCard, + KvCacheUtilizationCard, + PrefixCacheHitRateCard, + PromptTokenSourceCard, + RequestActivityCard, + ThroughputCard, + type RequestActivityView, +} from './server-metric-cards'; +import { SiblingNav } from './sibling-nav'; +import type { ThroughputSeriesKey } from './time-series-math'; + +interface Props { + id: number; +} + +type DetailView = 'point' | 'timeline' | 'aggregates'; + +const VIEW_OPTIONS: SegmentedToggleOption[] = [ + { value: 'point', label: 'Per-point', testId: 'detail-view-point' }, + { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' }, + { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' }, +]; + +const isDetailView = (value: string | null): value is DetailView => + value === 'point' || value === 'timeline' || value === 'aggregates'; + +/** URL-persisted detail view (`?view=`; per-point is the unadorned default). 
*/ +function useDetailView(): [DetailView, (nextView: DetailView) => void] { + const router = useRouter(); + const pathname = usePathname(); + const searchParams = useSearchParams(); + const requestedView = searchParams.get('view'); + const view: DetailView = isDetailView(requestedView) ? requestedView : 'point'; + const setView = useCallback( + (nextView: DetailView) => { + const nextParams = new URLSearchParams(searchParams.toString()); + if (nextView === 'point') nextParams.delete('view'); + else nextParams.set('view', nextView); + const query = nextParams.toString(); + router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false }); + track('inference_agentic_detail_view_changed', { view: nextView }); + }, + [pathname, router, searchParams], + ); + return [view, setView]; +} + +export function AgenticPointDetail({ id }: Props) { + const router = useRouter(); + const metricsQuery = useTraceServerMetrics(id, true); + const siblingsQuery = useBenchmarkSiblings(id); + + const metrics = metricsQuery.data; + const siblingsData = siblingsQuery.data; + + const [view, setView] = useDetailView(); + const [metricSourceId, setMetricSourceId] = useState('all'); + const [requestActivityView, setRequestActivityView] = useState('queue'); + const [throughputSeries, setThroughputSeries] = useState>( + () => new Set(['input', 'decode']), + ); + // Fetch aggregates only when the aggregates view is active. Uses the full + // sibling set (across parallelism + concurrency configs) so each chart + // shows how the metric varies across the SKU. + const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? []; + const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates'); + // Per-request timeline used by the timeline view AND every per-point + // request-derived chart (ISL/OSL, latency-over-time, in-flight), so fetch + // whenever we're on either view. 
+ const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point'); + const timeline = timelineQuery.data; + + // Warmup vs profiling stage. Only meaningful when the point actually has a + // warmup phase (older runs are profiling-only) — when absent the toggle is + // hidden and everything falls back to the full (profiling) run. + const [phase, setPhase] = useState('profiling'); + const hasWarmup = useMemo(() => timelineHasWarmup(timeline), [timeline]); + const effectivePhase: StagePhase = hasWarmup ? phase : 'profiling'; + + // Server-metric boundary on the chart's own t-axis (rebased through absolute + // ns — see phase-slice header for the origin-gap invariant). Request charts + // get a phase-scoped timeline (filtered + rebased) so they share a 0-based + // axis with the server charts for the selected phase. + const boundarySec = useMemo(() => phaseBoundarySec(metrics, timeline), [metrics, timeline]); + const phaseTimeline = useMemo( + () => (timeline ? sliceTimelineByPhase(timeline, effectivePhase) : null), + [timeline, effectivePhase], + ); + + const metricSources = metrics?.metricSources ?? []; + const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId); + const baseServerSeries: ServerSeriesLike | undefined = useMemo(() => { + const src = metrics?.metricSources?.find((m) => m.source.id === metricSourceId); + if (src) { + return { + kvCacheUsage: src.kvCacheUsage, + prefixCacheHitRate: src.prefixCacheHitRate, + queueDepth: src.queueDepth, + promptTokensBySource: src.promptTokensBySource, + prefillTps: src.promptTps, + decodeTps: src.generationTps, + prefixCacheHitsTps: src.prefixCacheHitsTps, + hostKvCacheUsage: src.hostKvCacheUsage, + kvCacheUsageByEngine: src.kvCacheUsageByEngine, + }; + } + return metrics ?? undefined; + }, [metrics, metricSourceId]); + // Phase-sliced server series (+ matching durationS) consumed by every server + // chart. Null only when there are no server metrics at all. 
+ const sliced = useMemo( + () => + baseServerSeries + ? sliceServerSeriesByPhase( + baseServerSeries, + effectivePhase, + boundarySec, + metrics?.durationS ?? 0, + ) + : null, + [baseServerSeries, effectivePhase, boundarySec, metrics?.durationS], + ); + // Some runs only scrape server metrics during profiling — `chart_series` + // starts at the profiling boundary, so the warmup slice collapses to ~0–1 + // points (just the t=0 origin) even though request-level warmup data exists. + // Require ≥2 points in some series to count as real warmup coverage; otherwise + // show an explanatory note instead of six silently-blank charts. + const slicedHasServerData = + (sliced?.series.kvCacheUsage.length ?? 0) > 1 || + (sliced?.series.queueDepth.length ?? 0) > 1 || + (sliced?.series.prefillTps.length ?? 0) > 1 || + (sliced?.series.prefixCacheHitRate.length ?? 0) > 1; + + return ( +
+
+ + · + + Inference chart + +
+ + {siblingsData ? ( + + ) : siblingsQuery.isLoading ? ( +
Loading SKU navigator…
+ ) : null} + + {metrics ? ( + + ) : metricsQuery.isLoading ? ( +
Loading point metadata…
+ ) : null} + + {metricsQuery.isError && ( +
+ Failed to load trace data for benchmark point #{id}. +
+ )} + {metricsQuery.data === null && !metricsQuery.isLoading && ( +
+ No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf + time-series capture, or its source artifacts have expired on GitHub. +
+ )} + +
+ + {view === 'aggregates' && ( + + {siblingIds.length} configs in SKU + {aggregatesQuery.isLoading ? ' · loading…' : ''} + + )} + {view === 'timeline' && timelineQuery.data && ( + + {timelineQuery.data.requests.length} requests + + )} +
+ + {view === 'point' && (metricSources.length > 1 || hasWarmup) && ( + + )} + + {view === 'aggregates' ? ( + + ) : view === 'timeline' ? ( + timelineQuery.isLoading ? ( +
+ Loading request timeline… +
+ ) : timelineQuery.data ? ( + + ) : ( +
+ No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact + isn't stored for this row. +
+ ) + ) : ( + <> + {effectivePhase === 'warmup' && ( +

+ Showing the warmup phase — a + cache-warming pass whose outputs are capped at 1 token. Warmup OSL ≈ 1, and + interactivity/decode are blank (single-token outputs have no inter-token latency). + {!slicedHasServerData && + ' Warmup server-side metrics aren’t available for this point, so the server charts below are empty — the request-level charts above still reflect warmup.'} +

+ )} +
+ + + + + + + + + + + + + + + + + + + + +
+ + )} +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx new file mode 100644 index 00000000..d4526d24 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx @@ -0,0 +1,286 @@ +'use client'; + +import { useMemo } from 'react'; + +import { ChartHover, type HoverItem } from './chart-hover'; +import { ChartEmpty, PERCENTILE_COLORS } from './chart-shared'; + +export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99'; + +interface PercentileLine { + key: PercentileKey; + /** Display label in legend / tooltip. */ + label: string; + color: string; +} + +const PERCENTILE_LINES: PercentileLine[] = [ + { key: 'mean', label: 'Mean', color: PERCENTILE_COLORS.mean }, + { key: 'p50', label: 'P50', color: PERCENTILE_COLORS.p50 }, + { key: 'p75', label: 'P75', color: PERCENTILE_COLORS.p75 }, + { key: 'p90', label: 'P90', color: PERCENTILE_COLORS.p90 }, + { key: 'p99', label: 'P99', color: PERCENTILE_COLORS.p99 }, +]; + +// Wider bottom/left padding than CHART_PAD — the x-axis carries rotated +// per-config labels instead of time ticks. +const PAD = { top: 16, right: 16, bottom: 90, left: 64 }; + +export interface AggregatePoint { + /** Sibling label rendered on x-axis (e.g. "TP8 • c=8"). */ + label: string; + /** Per-percentile value; missing percentiles are dropped from the plot. */ + values: Partial>; + /** Sibling id — purely informational, used in the tooltip title. */ + id?: number; +} + +/** + * Multi-line chart: one x-position per sibling config, one line per + * percentile (mean/p50/p75/p90/p99). Designed for the "Aggregates across + * configs" view on the agentic detail page. + */ +export function AggregateChart({ + points, + unit, + yMax, + yFmt, + width = 720, + height = 320, +}: { + points: readonly AggregatePoint[]; + unit: string; + /** Optional fixed y-axis upper bound (e.g. 1 for percentages). 
*/ + yMax?: number; + /** Optional value formatter (e.g. percentage → "30%"). */ + yFmt?: (v: number) => string; + width?: number; + height?: number; +}) { + const W = width; + const H = height; + const fmt = (v: number) => + yFmt + ? yFmt(v) + : v >= 10000 + ? new Intl.NumberFormat('en-US').format(Math.round(v)) + : v.toFixed(v < 10 ? 2 : 0); + + const computed = useMemo(() => { + if (points.length === 0) return null; + let yMaxComputed = 0; + for (const p of points) { + for (const line of PERCENTILE_LINES) { + const v = p.values[line.key]; + if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v; + } + } + const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + return { yTop, innerW, innerH }; + }, [points, W, H, yMax]); + + if (!computed) { + return ; + } + const { yTop, innerW, innerH } = computed; + + // X positions: evenly spaced across the inner width. + const xOf = (i: number) => + points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW; + const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH; + + // 5 y-axis ticks evenly between 0 and yTop. + const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4); + + // Resolve hover: snap to nearest sibling index and emit all percentiles + // that have data at that x. + const resolve = (fraction: number) => { + const idx = Math.round(fraction * (points.length - 1)); + const p = points[Math.max(0, Math.min(points.length - 1, idx))]; + if (!p) return null; + const items: HoverItem[] = []; + for (const line of PERCENTILE_LINES) { + const v = p.values[line.key]; + if (typeof v !== 'number' || !Number.isFinite(v)) continue; + items.push({ color: line.color, label: line.label, value: fmt(v) }); + } + return { items, title: p.label }; + }; + + return ( +
+
+ {PERCENTILE_LINES.map((line) => ( +
+ + {line.label} +
+ ))} + + {points.length} configs · units: {unit} + +
+ + {/* y-axis ticks + gridlines */} + {yTicks.map((v, i) => { + const y = yOf(v); + return ( + + + + {fmt(v)} + + + ); + })} + + {/* X-axis tick labels — one per sibling, rotated 30° to fit. */} + {points.map((p, i) => { + const x = xOf(i); + return ( + + + + {p.label} + + + ); + })} + + {/* X axis baseline */} + + + {/* Horizontal connecting lines per percentile — faint backdrop so the + eye can follow how each percentile changes across configs. */} + {PERCENTILE_LINES.map((line) => { + const segments: { x1: number; y1: number; x2: number; y2: number }[] = []; + let prev: { x: number; y: number } | null = null; + for (let i = 0; i < points.length; i++) { + const v = points[i]!.values[line.key]; + if (typeof v !== 'number' || !Number.isFinite(v)) { + prev = null; + continue; + } + const x = xOf(i); + const y = yOf(v); + if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y }); + prev = { x, y }; + } + return ( + + {segments.map((s, j) => ( + + ))} + + ); + })} + + {/* Per-sibling vertical bar spanning the percentile range, with a + colored tick at each percentile level. Mean rendered as a small + diamond to distinguish from the percentile ticks. */} + {points.map((p, i) => { + const x = xOf(i); + // Collect percentile values present for this sibling. + const present = PERCENTILE_LINES.filter( + (line) => + typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!), + ).map((line) => ({ ...line, value: p.values[line.key]! })); + if (present.length === 0) return null; + // Only the *percentile* values define the bar extent; mean might be + // outside the percentile span on weird distributions. + const pctlOnly = present.filter((p2) => p2.key !== 'mean'); + const bandValues = pctlOnly.length > 0 ? 
pctlOnly : present; + const bandYs = bandValues.map((b) => yOf(b.value)); + const yLo = Math.min(...bandYs); + const yHi = Math.max(...bandYs); + return ( + + + {present.map((b) => { + const ty = yOf(b.value); + if (b.key === 'mean') { + // Diamond marker for mean. + const s = 4; + return ( + + ); + } + // Horizontal tick at each percentile. + return ( + + ); + })} + + ); + })} + +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx b/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx new file mode 100644 index 00000000..09252940 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/aggregates-grid.tsx @@ -0,0 +1,104 @@ +'use client'; + +import type { AgenticAggregateMap, MetricPercentiles } from '@/hooks/api/use-agentic-aggregates'; +import type { BenchmarkSibling } from '@/hooks/api/use-benchmark-siblings'; + +import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart'; +import { CHART_SIZES } from './chart-shared'; +import { ExpandableChart } from './expandable-chart'; +import { chipLabel } from './sibling-nav'; + +/** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */ +function toAggPoint( + sibling: { id: number; label: string }, + pct: MetricPercentiles | null | undefined, +): AggregatePoint { + const values: Partial> = {}; + if (pct) { + values.mean = pct.mean; + values.p50 = pct.p50; + values.p75 = pct.p75; + values.p90 = pct.p90; + values.p99 = pct.p99; + } + return { id: sibling.id, label: sibling.label, values }; +} + +/** "Aggregates across configs" view: ISL/OSL/KV/prefix stats per SKU sibling. */ +export function AggregatesGrid({ + siblings, + aggregates, + isLoading, +}: { + siblings: BenchmarkSibling[]; + aggregates: AgenticAggregateMap | undefined; + isLoading: boolean; +}) { + if (siblings.length === 0) { + return ( +
+ SKU sibling list not loaded yet — open a point to populate. +
+ ); + } + if (isLoading && !aggregates) { + return ( +
+ Computing aggregates across {siblings.length} configs… (parsing trace blobs) +
+ ); + } + const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s) })); + const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl)); + const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl)); + const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil)); + const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate)); + return ( +
+ ( + + )} + /> + ( + + )} + /> + ( + `${(v * 100).toFixed(0)}%`} + {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)} + /> + )} + /> + ( + `${(v * 100).toFixed(0)}%`} + {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)} + /> + )} + /> +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx new file mode 100644 index 00000000..24270122 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx @@ -0,0 +1,148 @@ +'use client'; + +import { useState, type ReactNode } from 'react'; + +/** Vertical crosshair + floating value tooltip overlay shared by every chart. */ +export interface HoverItem { + /** Color swatch to render next to the label. */ + color: string; + label: string; + value: string; + /** Optional faint secondary line (e.g. timestamp under main values). */ + hint?: string; +} + +interface ChartHoverProps { + /** Padding inside the SVG; matches the chart's CHART_PAD. */ + pad: { top: number; right: number; bottom: number; left: number }; + /** SVG viewBox dimensions used to render the chart. */ + width: number; + height: number; + /** + * Called with the cursor's normalized x in [0..1] across the plot area. + * Returns `null` to hide the tooltip (e.g. cursor outside data range). + */ + resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null; + children: ReactNode; +} + +/** + * Wrap a chart's render to add mouse-driven crosshair + tooltip. + * + * The chart owner renders its bars / lines / axes via `children`; this wrapper + * adds an invisible across the plot area to capture pointer events, a + * vertical line that follows the cursor, and a floating tooltip on the right + * of the cursor (auto-flipping to the left when it would overflow). 
+ */ +export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) { + const [hover, setHover] = useState<{ + xPx: number; + yPx: number; + fraction: number; + items: HoverItem[]; + title?: string; + } | null>(null); + + const innerW = width - pad.left - pad.right; + const innerH = height - pad.top - pad.bottom; + + const onMove = (e: React.MouseEvent) => { + const svg = e.currentTarget.ownerSVGElement; + if (!svg) return; + const rect = svg.getBoundingClientRect(); + // Convert client coords → SVG viewBox coords. + const sx = ((e.clientX - rect.left) * width) / rect.width; + const sy = ((e.clientY - rect.top) * height) / rect.height; + const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW)); + const resolved = resolve(fraction); + if (!resolved) { + setHover(null); + return; + } + setHover({ xPx: sx, yPx: sy, fraction, items: resolved.items, title: resolved.title }); + }; + + const onLeave = () => setHover(null); + + return ( +
+ + {children} + {hover && ( + + )} + + + {hover && hover.items.length > 0 && ( + + )} +
+ ); +} + +function HoverTooltip({ + xFraction, + containerWidth, + padLeft, + innerW, + title, + items, +}: { + xFraction: number; + containerWidth: number; + padLeft: number; + innerW: number; + title?: string; + items: HoverItem[]; +}) { + // Position tooltip near the crosshair as a % of the container. + // We flip to the cursor's left side when it would overflow the right edge. + const xPx = padLeft + xFraction * innerW; + const onRight = xPx < containerWidth * 0.55; + const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto'; + const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`; + return ( +
+ {title &&
{title}
} + {items.map((it, i) => ( +
+ + {it.label} + {it.value} +
+ ))} +
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/chart-shared.tsx b/packages/app/src/components/inference/agentic-point/chart-shared.tsx
new file mode 100644
index 00000000..f00f4532
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-shared.tsx
@@ -0,0 +1,67 @@
+'use client';
+
+/**
+ * Shared presentational constants and helpers for the agentic point-detail
+ * charts (time-series, stacked-area, distribution, aggregate). These charts
+ * are hand-rolled SVG (not the d3-chart library) and share axis padding,
+ * tick formatting, and empty/loading states.
+ */
+
+/** Axis padding shared by the time-series, stacked-area, and distribution charts. */
+export const CHART_PAD = { top: 12, right: 16, bottom: 56, left: 60 } as const;
+
+/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */
+export const CHART_SIZES = {
+  inline: { width: 720, height: 260 },
+  expanded: { width: 1300, height: 520 },
+} as const;
+
+/**
+ * Guide-line colors per percentile, shared by the aggregate chart's lines and
+ * the distribution chart's vertical guides so the same percentile reads as the
+ * same color across the detail page.
+ */
+export const PERCENTILE_COLORS = {
+  mean: '#ef4444',
+  p50: '#3b82f6',
+  p75: '#22c55e',
+  p90: '#f59e0b',
+  p95: '#ef4444',
+  p99: '#a855f7',
+} as const;
+
+/**
+ * Integer tick label: thousands separators only once the value reaches 10000.
+ * The value is rounded BEFORE the threshold test, so 9999.6 renders as
+ * "10,000" — the same way an exact 10000 does — instead of a bare "10000".
+ */
+export const fmtCount = (n: number): string => {
+  const rounded = Math.round(n);
+  return rounded >= 10000 ? new Intl.NumberFormat('en-US').format(rounded) : String(rounded);
+};
+
+/**
+ * Seconds → "42s" / "3m 20s" time-axis tick label.
+ * Rounds to whole seconds first and only then splits into minutes/seconds, so
+ * 59.6 renders as "1m 0s" and 119.6 as "2m 0s" (never "60s" or "1m 60s").
+ */
+export const fmtSeconds = (s: number): string => {
+  const total = Math.round(s);
+  if (total < 60) return `${total}s`;
+  const minutes = Math.floor(total / 60);
+  return `${minutes}m ${total % 60}s`;
+};
+
+/** "No data" placeholder sized to match the chart it replaces. */
+export function ChartEmpty({ height = 260 }: { height?: number }) {
+  return (
+
+ No data +
+ ); +} + +/** Loading placeholder for a chart card. */ +export function ChartSkeleton() { + return
; +} diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts new file mode 100644 index 00000000..f55d6131 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts @@ -0,0 +1,53 @@ +import { describe, expect, it } from 'vitest'; + +import { datasetConvId, subagentIdOf } from './request-timeline'; + +describe('datasetConvId', () => { + it('returns a plain conversation id unchanged', () => { + expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602')).toBe( + '002001296e8a8c38ad9d7cc436d691afc602', + ); + }); + + it('strips a ::sa: subagent suffix to the parent conv id', () => { + expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe( + '002001296e8a8c38ad9d7cc436d691afc602', + ); + }); + + it('strips a ::fa: forked-agent suffix', () => { + expect(datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBe( + '02bc0afb13f7a2d9efa86c28511261d85c0e', + ); + }); + + it('strips at the first :: even with a trailing stream index', () => { + expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc'); + }); +}); + +describe('subagentIdOf', () => { + it('returns null for a main-conversation cid', () => { + expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602')).toBeNull(); + }); + + it('extracts the subagent id from a ::sa: cid', () => { + expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe( + 'subagent_004_27c95af7', + ); + }); + + it('drops a trailing :s index from the subagent id', () => { + expect(subagentIdOf('abc::sa:subagent_001_f552fe6f:s3')).toBe('subagent_001_f552fe6f'); + }); + + it('drops an :aux: stream suffix from the subagent id', () => { + expect(subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011')).toBe( + 'subagent_001_b00fdc12', + ); + }); + + it('returns null for a ::fa: forked-agent cid (no matching subagent 
group)', () => { + expect(subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBeNull(); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx new file mode 100644 index 00000000..6573d60c --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/distribution.tsx @@ -0,0 +1,233 @@ +'use client'; + +import { useMemo } from 'react'; + +import { ChartHover, type HoverItem } from './chart-hover'; +import { CHART_PAD, ChartEmpty, PERCENTILE_COLORS, fmtCount } from './chart-shared'; +import { quantile } from './time-series-math'; + +const PAD = CHART_PAD; + +const GUIDES = [ + { label: 'p50', q: 0.5, color: PERCENTILE_COLORS.p50 }, + { label: 'p75', q: 0.75, color: PERCENTILE_COLORS.p75 }, + { label: 'p90', q: 0.9, color: PERCENTILE_COLORS.p90 }, + { label: 'p95', q: 0.95, color: PERCENTILE_COLORS.p95 }, +] as const; + +/** + * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the + * detail-page card — fills its container width via `viewBox` + 100% width. + * Hover shows the bin range + count + cumulative percentile. 
+ */ +export function Distribution({ + values, + unit, + width = 720, + height = 260, +}: { + values: readonly number[]; + unit: string; + width?: number; + height?: number; +}) { + const W = width; + const H = height; + + const computed = useMemo(() => { + if (values.length === 0) return null; + const sorted = [...values].toSorted((a, b) => a - b); + const min = sorted[0]!; + const max = sorted.at(-1)!; + const range = Math.max(1e-9, max - min); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length)))); + const counts: number[] = Array.from({ length: nBins }, () => 0); + for (const v of values) { + const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins)); + counts[i]!++; + } + return { sorted, min, max, range, innerW, innerH, nBins, counts }; + }, [values, W, H]); + + if (!computed) { + return ; + } + const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed; + const maxCount = Math.max(...counts, 1); + const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW; + const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH; + const barW = innerW / nBins; + + const fmt = fmtCount; + + // Hover: report the bin range under cursor, its count, and what percentile + // the bin's midpoint represents in the empirical distribution. + const resolve = (fraction: number) => { + const v = min + fraction * range; + const binIdx = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins)); + const binLo = min + (binIdx * range) / nBins; + const binHi = min + ((binIdx + 1) * range) / nBins; + const count = counts[binIdx] ?? 0; + // Cumulative % at the bin's right edge. + let cumCount = 0; + for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 
0; + const cumPct = (cumCount / values.length) * 100; + const items: HoverItem[] = [ + { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` }, + { color: 'currentColor', label: 'Count', value: count.toLocaleString() }, + { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` }, + ]; + return { items }; + }; + + const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max]; + const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4); + + return ( +
+
+ {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit} +
+ + {/* y-axis gridlines + labels */} + {yTickVals.map((v, i) => { + const y = yScale(v); + return ( + + + + {fmt(v)} + + + ); + })} + + {/* Bars */} + {counts.map((c, i) => { + const h = (c / maxCount) * innerH; + const x = PAD.left + i * barW; + const y = PAD.top + (innerH - h); + return ( + + ); + })} + + {/* Percentile guide lines */} + {GUIDES.map(({ q, color }) => { + const v = quantile(sorted, q); + const x = xScale(v); + return ( + + ); + })} + + {/* X axis */} + + {xTickVals.map((v, i) => { + const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle'; + return ( + + {fmt(v)} + + ); + })} + + value ({unit}) + + + count + + + {/* Percentile legend chips */} + {(() => { + const chipY = H - 8; + const chipW = innerW / GUIDES.length; + return GUIDES.map(({ label: ql, q, color }, i) => { + const v = quantile(sorted, q); + const x = PAD.left + i * chipW; + return ( + + + + {ql} {fmt(v)} + + + ); + }); + })()} + +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx new file mode 100644 index 00000000..cb5987ec --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx @@ -0,0 +1,56 @@ +'use client'; + +import { useState, type ReactNode } from 'react'; +import { Maximize2 } from 'lucide-react'; + +import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog'; + +/** + * Wraps a chart in a card with a header + expand button. Click the button to + * open the chart in a large dialog. The `render` prop receives `expanded:true` + * inside the dialog so charts can pick larger width/height. + */ +export function ExpandableChart({ + title, + render, + controls, + testId, +}: { + title: string; + render: (expanded: boolean) => ReactNode; + controls?: ReactNode; + testId?: string; +}) { + const [open, setOpen] = useState(false); + + return ( +
+
+

{title}

+
+ {controls} + +
+
+ {render(false)} + + + +
+ {title} + {controls} +
+
+
{render(true)}
+
+
+
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx b/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
new file mode 100644
index 00000000..e56ddeee
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/metric-source-toolbar.tsx
@@ -0,0 +1,138 @@
+'use client';
+
+import type { MetricSource, MetricSourceSeries } from '@/hooks/api/use-trace-server-metrics';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
+
+import type { StagePhase } from './phase-slice';
+
+const SOURCE_ROLE_LABEL: Record = {
+  router: 'Router',
+  prefill: 'Prefill',
+  decode: 'Decode',
+  combined: 'Combined',
+  unknown: 'Unknown',
+};
+
+/**
+ * "Role · instance" label for one server-metrics endpoint.
+ *
+ * The instance part is the first non-null of: `workerId`, "DP <dpRank>",
+ * `endpointUrl`, "engine <engine>". When that resolves to nothing (or to an
+ * empty string) the bare role label is returned.
+ */
+export function metricSourceLabel(source: MetricSource): string {
+  const role = SOURCE_ROLE_LABEL[source.role];
+  // Presence check rather than truthiness: a rank of 0 must still render as
+  // "DP 0" instead of silently falling through to the endpoint URL.
+  const hasDpRank = source.dpRank != null && String(source.dpRank) !== '';
+  const instance =
+    source.workerId ??
+    (hasDpRank ? `DP ${source.dpRank}` : null) ??
+    source.endpointUrl ??
+    (source.engine ? `engine ${source.engine}` : null);
+  return instance ? `${role} · ${instance}` : role;
+}
+
+// Warmup vs profiling stage selector. Drives the server-metric charts AND the
+// request-derived charts (ISL/OSL, latency-over-time, in-flight). Only shown
+// when the point actually has a warmup phase.
+const STAGE_PHASE_OPTIONS: SegmentedToggleOption[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'stage-phase-profiling' },
+  { value: 'warmup', label: 'Warmup', testId: 'stage-phase-warmup' },
+];
+
+/**
+ * Sticky per-point toolbar: warmup/profiling stage toggle (when the point has
+ * a warmup phase) and the server-metrics endpoint selector (when the point has
+ * more than one source). The parent decides when to render it at all.
+ */ +export function MetricSourceToolbar({ + hasWarmup, + phase, + onPhaseChange, + metricSources, + selectedSource, + onSourceChange, + fallbackAdapter, +}: { + hasWarmup: boolean; + phase: StagePhase; + onPhaseChange: (phase: StagePhase) => void; + metricSources: MetricSourceSeries[]; + selectedSource: MetricSourceSeries | undefined; + onSourceChange: (id: string) => void; + /** Adapter reported in analytics when the selected source lookup misses. */ + fallbackAdapter: string | undefined; +}) { + return ( +
+ {hasWarmup ? ( +
+ Stage + { + onPhaseChange(value); + track('inference_agentic_phase_changed', { phase: value }); + }} + ariaLabel="Stage phase" + testId="stage-phase-toggle" + buttonClassName="px-2.5 py-1 text-xs" + /> +
+ ) : ( + + )} + {metricSources.length > 1 ? ( +
+ Server metrics + +
+ ) : null} +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.test.ts b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts new file mode 100644 index 00000000..ef6cdaab --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts @@ -0,0 +1,212 @@ +import { describe, expect, it } from 'vitest'; + +import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline'; +import { + phaseBoundaryNs, + phaseBoundarySec, + requestsForPhase, + sliceServerSeriesByPhase, + sliceTimelineByPhase, + timelineHasWarmup, + type ServerSeriesLike, +} from './phase-slice'; + +function req(overrides: Partial): RequestRecord { + return { + cid: 'c', + ti: 0, + wid: 'w', + ad: 0, + phase: 'profiling', + credit: 0, + start: 0, + ack: null, + end: 1, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + cancelled: false, + ...overrides, + }; +} + +function timeline(requests: RequestRecord[], startNs = 1_000): RequestTimeline { + return { version: 3, startNs, endNs: startNs + 1, durationS: 1, requests }; +} + +function makeSeries(ts: number[]): ServerSeriesLike { + const pts = ts.map((t) => ({ t, value: t * 10 })); + return { + kvCacheUsage: pts, + prefixCacheHitRate: pts, + queueDepth: ts.map((t) => ({ t, running: t, waiting: t + 1, total: 2 * t + 1 })), + promptTokensBySource: { src: pts }, + prefillTps: pts, + decodeTps: pts, + prefixCacheHitsTps: pts, + hostKvCacheUsage: pts, + kvCacheUsageByEngine: [{ engineLabel: 'e0', points: pts }], + }; +} + +describe('phaseBoundaryNs', () => { + it('returns null when there are no profiling requests', () => { + expect(phaseBoundaryNs(timeline([req({ phase: 'warmup', start: 5 })]))).toBeNull(); + }); + + it('returns null when there are no warmup requests', () => { + expect(phaseBoundaryNs(timeline([req({ phase: 'profiling', start: 5 })]))).toBeNull(); + }); + + it('returns startNs + earliest profiling start when both phases present', () => { + const t 
= timeline( + [ + req({ phase: 'warmup', start: 0 }), + req({ phase: 'profiling', start: 900 }), + req({ phase: 'profiling', start: 700 }), + ], + 1_000, + ); + expect(phaseBoundaryNs(t)).toBe(1_700); + }); + + it('returns null for nullish timeline', () => { + expect(phaseBoundaryNs(null)).toBeNull(); + expect(phaseBoundaryNs(undefined)).toBeNull(); + }); +}); + +describe('phaseBoundarySec', () => { + it('rebases through absolute ns by subtracting serverMetrics.startNs (origin gap)', () => { + // timeline origin and server-metrics origin differ — the classic ~124s gap. + const tl = timeline( + [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 600 * 1e9 })], + 200 * 1e9, // timeline.startNs + ); + // boundaryNs = 200e9 + 600e9 = 800e9 ; serverMetrics origin = 124e9 earlier + const boundarySec = phaseBoundarySec({ startNs: 76 * 1e9 }, tl); + // (800e9 - 76e9)/1e9 = 724 + expect(boundarySec).toBe(724); + }); + + it('clamps a negative mapping to 0', () => { + const tl = timeline( + [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 0 })], + 0, + ); + expect(phaseBoundarySec({ startNs: 5 * 1e9 }, tl)).toBe(0); + }); + + it('returns null when serverMetrics missing or no split', () => { + const tl = timeline( + [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 1e9 })], + 0, + ); + expect(phaseBoundarySec(null, tl)).toBeNull(); + expect(phaseBoundarySec({ startNs: 0 }, timeline([req({ phase: 'profiling' })]))).toBeNull(); + }); +}); + +describe('timelineHasWarmup', () => { + it('detects warmup presence', () => { + expect(timelineHasWarmup(timeline([req({ phase: 'profiling' })]))).toBe(false); + expect(timelineHasWarmup(timeline([req({ phase: 'warmup' })]))).toBe(true); + expect(timelineHasWarmup(null)).toBe(false); + }); +}); + +describe('sliceServerSeriesByPhase', () => { + it('is an identity passthrough (full duration) when boundary is null', () => { + const s = makeSeries([0, 1, 2]); + const out = 
sliceServerSeriesByPhase(s, 'profiling', null, 99); + expect(out.series).toBe(s); + expect(out.durationS).toBe(99); + }); + + it('warmup keeps t < boundary, no rebase, durationS = boundary', () => { + const s = makeSeries([0, 1, 2, 3, 4]); + const out = sliceServerSeriesByPhase(s, 'warmup', 2, 5); + expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1]); // excludes t===2 + expect(out.durationS).toBe(2); + }); + + it('profiling keeps t >= boundary and rebases to start at 0', () => { + const s = makeSeries([0, 1, 2, 3, 4]); + const out = sliceServerSeriesByPhase(s, 'profiling', 2, 5); + expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1, 2]); // 2,3,4 -> 0,1,2 + expect(out.series.kvCacheUsage.map((p) => p.value)).toEqual([20, 30, 40]); // values preserved + expect(out.durationS).toBe(3); // 5 - 2 + }); + + it('slices queueDepth, promptTokensBySource, and kvCacheUsageByEngine; preserves queue fields', () => { + const s = makeSeries([0, 1, 2, 3]); + const out = sliceServerSeriesByPhase(s, 'profiling', 2, 4); + expect(out.series.queueDepth).toEqual([ + { t: 0, running: 2, waiting: 3, total: 5 }, + { t: 1, running: 3, waiting: 4, total: 7 }, + ]); + expect(out.series.promptTokensBySource.src.map((p) => p.t)).toEqual([0, 1]); + expect(out.series.kvCacheUsageByEngine[0]!.points.map((p) => p.t)).toEqual([0, 1]); + expect(out.series.kvCacheUsageByEngine[0]!.engineLabel).toBe('e0'); + }); + + it('does not mutate the input series', () => { + const s = makeSeries([0, 1, 2]); + const before = s.kvCacheUsage.map((p) => p.t); + sliceServerSeriesByPhase(s, 'profiling', 1, 3); + expect(s.kvCacheUsage.map((p) => p.t)).toEqual(before); + }); +}); + +describe('requestsForPhase', () => { + const rs = [ + req({ phase: 'warmup', isl: 1 }), + req({ phase: 'profiling', isl: 2 }), + req({ phase: 'unknown', isl: 3 }), + ]; + + it('profiling selects only profiling rows', () => { + expect(requestsForPhase(rs, 'profiling').map((r) => r.isl)).toEqual([2]); + }); + + 
it('warmup selects everything that is not profiling', () => { + expect(requestsForPhase(rs, 'warmup').map((r) => r.isl)).toEqual([1, 3]); + }); +}); + +describe('sliceTimelineByPhase', () => { + // startNs origin = 1000; warmup request at offset 0..50, profiling at 100..300. + const tl = timeline( + [ + req({ phase: 'warmup', credit: 0, start: 0, ack: 10, end: 50, isl: 1 }), + req({ phase: 'profiling', credit: 90, start: 100, ack: 120, end: 300, isl: 2 }), + ], + 1_000, + ); + // tl.durationS default = 1 from helper; override for window math. + const tlDur: RequestTimeline = { ...tl, durationS: 3 }; + + it('returns the input unchanged for a single-phase timeline', () => { + const single = timeline([req({ phase: 'profiling', start: 5 })]); + expect(sliceTimelineByPhase(single, 'profiling')).toBe(single); + }); + + it('warmup keeps pre-boundary requests, no rebase, startNs unchanged', () => { + const out = sliceTimelineByPhase(tlDur, 'warmup'); + expect(out.requests.map((r) => r.isl)).toEqual([1]); + expect(out.requests[0]!.start).toBe(0); // not rebased + expect(out.startNs).toBe(1_000); + }); + + it('profiling keeps post-boundary requests and rebases offsets + startNs', () => { + const out = sliceTimelineByPhase(tlDur, 'profiling'); + expect(out.requests.map((r) => r.isl)).toEqual([2]); + // boundary offset = 100 → rebased: start 100→0, end 300→200, ack 120→20, credit 90→-10 + expect(out.requests[0]!.start).toBe(0); + expect(out.requests[0]!.end).toBe(200); + expect(out.requests[0]!.ack).toBe(20); + // startNs shifts forward by the boundary offset so absolute time is preserved + expect(out.startNs).toBe(1_100); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.ts b/packages/app/src/components/inference/agentic-point/phase-slice.ts new file mode 100644 index 00000000..e6e17719 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/phase-slice.ts @@ -0,0 +1,188 @@ +/** + * Warmup vs profiling phase slicing for 
the agentic per-point detail page. + * + * Agentic trace-replay runs have two phases: a warmup (cache-warming) pass, then + * the measured profiling window. The server-metric time-series (`chart_series`) + * spans the whole run with no per-point phase label, but the per-request + * `request_timeline` IS phase-tagged. We derive the warmup→profiling boundary + * from the timeline and slice the server series at it. + * + * ⚠️ ORIGIN-GAP INVARIANT: the two payloads share the aiperf clock but have + * DIFFERENT zero origins — `serverMetrics.startNs` is the first server scrape, + * `timeline.startNs` is the first request's credit (observed ~124 s apart in + * real runs). The boundary must therefore be rebased through absolute ns by + * subtracting `serverMetrics.startNs`; a same-axis offset comparison would be + * off by the origin gap. This rebasing lives in `phaseBoundarySec` only. + */ + +import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline'; +import type { + QueueDepthPoint, + TimeSeriesPoint, + TraceServerMetrics, +} from '@/hooks/api/use-trace-server-metrics'; + +export type StagePhase = 'warmup' | 'profiling'; + +/** + * The subset of server-metric series the per-point charts render. Both the + * top-level `TraceServerMetrics` and a per-source object (after the detail page + * remaps `promptTps`→`prefillTps`, `generationTps`→`decodeTps`) are assignable. + */ +export interface ServerSeriesLike { + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + prefillTps: TimeSeriesPoint[]; + decodeTps: TimeSeriesPoint[]; + prefixCacheHitsTps: TimeSeriesPoint[]; + hostKvCacheUsage: TimeSeriesPoint[]; + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; +} + +/** True when the timeline contains at least one non-profiling (warmup) request. 
*/ +export function timelineHasWarmup(timeline: RequestTimeline | null | undefined): boolean { + return Boolean(timeline?.requests.some((r) => r.phase !== 'profiling')); +} + +/** + * Absolute-ns wall-clock instant where the profiling phase begins + * = `timeline.startNs + earliest profiling request's start offset`. + * Returns null unless BOTH a warmup and a profiling request exist (nothing to + * split otherwise). + */ +export function phaseBoundaryNs(timeline: RequestTimeline | null | undefined): number | null { + if (!timeline) return null; + let hasWarmup = false; + let minProfilingStart: number | null = null; + for (const r of timeline.requests) { + if (r.phase === 'profiling') { + if (minProfilingStart === null || r.start < minProfilingStart) minProfilingStart = r.start; + } else { + hasWarmup = true; + } + } + if (!hasWarmup || minProfilingStart === null) return null; + return timeline.startNs + minProfilingStart; +} + +/** + * The profiling-start boundary expressed on the SERVER-METRIC chart's own t-axis + * (seconds from `serverMetrics.startNs`). See the origin-gap invariant at the top + * of the file — the `- serverMetrics.startNs` subtraction is mandatory. + * + * Returns null when there's no warmup/profiling split, or `serverMetrics` is + * absent (→ callers fall back to the full-run series). 
+ */ +export function phaseBoundarySec( + serverMetrics: Pick | null | undefined, + timeline: RequestTimeline | null | undefined, +): number | null { + if (!serverMetrics) return null; + const boundaryNs = phaseBoundaryNs(timeline); + if (boundaryNs === null) return null; + return Math.max(0, (boundaryNs - serverMetrics.startNs) / 1e9); +} + +export interface PhaseSlicedSeries { + series: S; + durationS: number; +} + +/** + * Slice every server-metric series to one phase: + * - warmup: keep points with `t < boundary`, no rebase, `durationS = boundary` + * - profiling: keep points with `t >= boundary`, rebased so `t` starts at 0, + * `durationS = max(1, full - boundary)` (the profiling window is floored at 1) + * + * A point exactly at `t === boundary` belongs to profiling. Null boundary + * (single-phase point, or no server metrics) → identity passthrough with the + * full `durationS`. Pure — returns new objects, never mutates the input. + * + * NOTE: rebasing the profiling slice to start at 0 makes the cumulative charts + * (prompt-token source, unique-input-tokens) read as "since profiling start" + * rather than "since run start" — intended. + */ +export function sliceServerSeriesByPhase( + series: S, + phase: StagePhase, + boundarySec: number | null, + fullDurationS: number, +): PhaseSlicedSeries { + if (boundarySec === null) return { series, durationS: fullDurationS }; + const b = boundarySec; + const keep = phase === 'warmup' ? (t: number) => t < b : (t: number) => t >= b; + const rebase = phase === 'profiling' ? 
(t: number) => t - b : (t: number) => t; + + const sliceTs = (pts: TimeSeriesPoint[]): TimeSeriesPoint[] => + pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) })); + const sliceQd = (pts: QueueDepthPoint[]): QueueDepthPoint[] => + pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) })); + const sliceRecord = ( + rec: Record, + ): Record => { + const out: Record = {}; + for (const [k, v] of Object.entries(rec)) out[k] = sliceTs(v); + return out; + }; + + const slicedFields: ServerSeriesLike = { + kvCacheUsage: sliceTs(series.kvCacheUsage), + prefixCacheHitRate: sliceTs(series.prefixCacheHitRate), + queueDepth: sliceQd(series.queueDepth), + promptTokensBySource: sliceRecord(series.promptTokensBySource), + prefillTps: sliceTs(series.prefillTps), + decodeTps: sliceTs(series.decodeTps), + prefixCacheHitsTps: sliceTs(series.prefixCacheHitsTps), + hostKvCacheUsage: sliceTs(series.hostKvCacheUsage), + kvCacheUsageByEngine: series.kvCacheUsageByEngine.map((e) => ({ + engineLabel: e.engineLabel, + points: sliceTs(e.points), + })), + }; + + const durationS = phase === 'warmup' ? b : Math.max(1, fullDurationS - b); + return { series: { ...series, ...slicedFields } as S, durationS }; +} + +/** Filter request-timeline records to one phase (warmup = anything not profiling). */ +export function requestsForPhase(requests: RequestRecord[], phase: StagePhase): RequestRecord[] { + return phase === 'warmup' + ? requests.filter((r) => r.phase !== 'profiling') + : requests.filter((r) => r.phase === 'profiling'); +} + +/** + * Scope a whole request timeline to one phase: keep only that phase's requests + * and, for profiling, rebase every ns offset (and `startNs`) so the phase starts + * at t=0 — mirroring `sliceServerSeriesByPhase` so the request-derived charts and + * the server charts share a 0-based axis for the same phase. `durationS` becomes + * the phase window. Returns the input unchanged when there's no warmup/profiling + * split (single-phase point). 
Pure — new object, original untouched. + * + * The boundary here is on the REQUEST clock (offset from `timeline.startNs`), so + * we use `phaseBoundaryNs` minus `timeline.startNs` rather than the server-axis + * `phaseBoundarySec` (different origin — see the file header). + * + * Requests are assigned by their `start` offset relative to that boundary, not + * by their `phase` tag (unlike `requestsForPhase`): a non-profiling request that + * starts at or after the boundary lands in the profiling slice. The profiling + * `durationS` is floored at 1; the warmup `durationS` is the boundary offset + * converted from ns. A null `ack` stays null when offsets are rebased. + */ +export function sliceTimelineByPhase( + timeline: RequestTimeline, + phase: StagePhase, +): RequestTimeline { + const boundaryNs = phaseBoundaryNs(timeline); + if (boundaryNs === null) return timeline; + const boundaryOff = boundaryNs - timeline.startNs; // ns offset on the request clock + const inPhase = (r: RequestRecord) => + phase === 'warmup' ? r.start < boundaryOff : r.start >= boundaryOff; + const shift = phase === 'profiling' ? boundaryOff : 0; + const requests = timeline.requests.filter(inPhase).map((r) => ({ + ...r, + credit: r.credit - shift, + start: r.start - shift, + ack: r.ack === null ? null : r.ack - shift, + end: r.end - shift, + })); + const durationS = + phase === 'warmup' ? boundaryOff / 1e9 : Math.max(1, timeline.durationS - boundaryOff / 1e9); + return { ...timeline, startNs: timeline.startNs + shift, requests, durationS }; +} diff --git a/packages/app/src/components/inference/agentic-point/point-summary.tsx b/packages/app/src/components/inference/agentic-point/point-summary.tsx new file mode 100644 index 00000000..8a777baa --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/point-summary.tsx @@ -0,0 +1,50 @@ +'use client'; + +import type { ReactNode } from 'react'; + +import type { PointMeta } from '@/hooks/api/use-trace-server-metrics'; + +const fmtPct = (v: number | null | undefined): string => + v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`; + +function MetaLine({ label, value }: { label: string; value: ReactNode }) { + return ( 
+ {label} + {value} +
+ ); +} + +/** Selected-point header: config facts (offload, concurrency, cache hit rates, ISL/OSL). */ +export function PointSummary({ meta }: { meta: PointMeta }) { + return ( +
+
+

+ Selected point + {meta.disagg ? ' · disagg' : ''} + {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''} +

+ {meta.run_url && ( + + GitHub Actions run → + + )} +
+
+ + + + + {meta.isl !== null && } + {meta.osl !== null && } +
+
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx b/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx new file mode 100644 index 00000000..8ca85ac9 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/request-metric-cards.tsx @@ -0,0 +1,223 @@ +'use client'; + +import { useState } from 'react'; + +import type { RequestTimeline } from '@/hooks/api/use-request-timeline'; +import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from '@/lib/analytics'; + +import { CHART_SIZES, ChartEmpty, ChartSkeleton } from './chart-shared'; +import { Distribution } from './distribution'; +import { ExpandableChart } from './expandable-chart'; +import { TimeSeriesChart } from './time-series-chart'; +import { + averageSequenceLengthInFlight, + rollingRequestMetric, + timeRollingAverage, + type RequestMetric, + type RequestPercentile, +} from './time-series-math'; + +const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'p75', label: 'P75' }, + { value: 'p90', label: 'P90' }, +]; + +const LATENCY_METRIC_OPTIONS: SegmentedToggleOption<'ttft' | 'e2e'>[] = [ + { value: 'ttft', label: 'TTFT', testId: 'latency-metric-ttft' }, + { value: 'e2e', label: 'E2E', testId: 'latency-metric-e2e' }, +]; + +type SequenceMetricView = 'distribution' | 'inflight'; + +const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption[] = [ + { value: 'distribution', label: 'Distribution' }, + { value: 'inflight', label: 'In-flight avg' }, +]; + +// Unofficial-run overlays cannot open this persisted point-detail route: they +// have no benchmark_results id or stored request timeline. These charts are +// therefore intentionally limited to DB-backed agentic points. 
+export function RequestMetricOverTime({ + title, + metric, + timeline, + isLoading, + latencySelector = false, +}: { + title: string; + metric: RequestMetric; + timeline: RequestTimeline | null | undefined; + isLoading: boolean; + latencySelector?: boolean; +}) { + const [percentile, setPercentile] = useState('p90'); + const [latencyMetric, setLatencyMetric] = useState<'ttft' | 'e2e'>('ttft'); + const selectedMetric = latencySelector ? latencyMetric : metric; + const result = timeline + ? rollingRequestMetric(timeline.requests, selectedMetric, percentile, 50) + : null; + const metricLabel = + selectedMetric === 'ttft' ? 'TTFT' : selectedMetric === 'e2e' ? 'E2E latency' : 'Interactivity'; + const color = + selectedMetric === 'ttft' ? '#f59e0b' : selectedMetric === 'e2e' ? '#a855f7' : '#06b6d4'; + const pointCount = result?.raw.length; + const isLatency = selectedMetric !== 'interactivity'; + + const controls = ( +
+ {latencySelector && ( + { + setLatencyMetric(value); + track('inference_agentic_latency_metric_changed', { metric: value }); + }} + ariaLabel="Latency metric" + testId="latency-metric-toggle" + /> + )} + + {pointCount === undefined + ? '— points' + : `${pointCount.toLocaleString()} ${pointCount === 1 ? 'point' : 'points'}`} + + { + setPercentile(value); + track('inference_agentic_percentile_changed', { + metric: selectedMetric, + percentile: value, + }); + }} + ariaLabel={`${metricLabel} percentile`} + testId={`${selectedMetric}-percentile-toggle`} + /> +
+ ); + + return ( + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!timeline) return isLoading ? : ; + return ( + `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s` + : (value) => `${value.toFixed(0)}` + } + yAxisLabel={isLatency ? `${metricLabel} (s)` : 'Interactivity (tok/s/user)'} + {...size} + /> + ); + }} + /> + ); +} + +export function SequenceMetricCard({ + metric, + timeline, + timelineLoading, +}: { + metric: 'isl' | 'osl'; + /** Phase-scoped timeline — distribution values + in-flight are both derived from it. */ + timeline: RequestTimeline | null | undefined; + timelineLoading: boolean; +}) { + const [view, setView] = useState('distribution'); + const acronym = metric.toUpperCase(); + const fullName = metric === 'isl' ? 'Input sequence length' : 'Output sequence length'; + const testPrefix = `${metric}-metric`; + // Per-request ISL/OSL for the selected phase (request_timeline carries both, + // so the distribution honours the warmup/profiling toggle for free). + const values = timeline + ? timeline.requests + .map((r) => r[metric]) + .filter((v): v is number => typeof v === 'number' && Number.isFinite(v)) + : undefined; + return ( + ({ + ...option, + testId: `${testPrefix}-${option.value}`, + }))} + onValueChange={(value) => { + setView(value); + track('inference_agentic_sequence_metric_view_changed', { metric, view: value }); + }} + ariaLabel={`${acronym} chart view`} + testId={`${testPrefix}-toggle`} + buttonClassName="px-2 py-1 text-xs" + /> + } + render={(expanded) => { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (view === 'distribution') { + if (values && values.length > 0) + return ; + return timelineLoading ? : ; + } + if (!timeline) return timelineLoading ? : ; + const raw = averageSequenceLengthInFlight(timeline.requests, metric); + return ( +
+ {metric === 'osl' && ( +

+ Retrospective: final observed OSL is assigned across each request's lifetime. +

+ )} + +
+ ); + }} + /> + ); +} diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts new file mode 100644 index 00000000..cf43f5ae --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -0,0 +1,378 @@ +import { describe, expect, it } from 'vitest'; + +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { + buildRequestTimelineRows, + computeStableRowIndex, + conversationHref, + parseTimelineViewSnapshot, + requestIdleStats, + splitTimelineCid, + type TimelineViewSnapshot, +} from './request-timeline'; + +const request = (start: number, end: number): RequestRecord => ({ + cid: 'conversation', + ti: start, + wid: 'worker', + ad: 0, + phase: 'profiling', + credit: start, + start, + ack: null, + end, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + cancelled: false, +}); + +describe('requestIdleStats', () => { + it('sums only gaps where no requests overlap', () => { + expect( + requestIdleStats([ + request(0, 10), + request(5, 20), + request(30, 40), + request(35, 50), + request(70, 80), + ]), + ).toEqual({ idleNs: 30, spanNs: 80 }); + }); + + it('handles unsorted and nested requests without double-counting busy time', () => { + expect(requestIdleStats([request(20, 30), request(0, 100), request(10, 40)])).toEqual({ + idleNs: 0, + spanNs: 100, + }); + }); + + it('does not count time before the first start or after the final end', () => { + expect(requestIdleStats([request(100, 200), request(300, 400)])).toEqual({ + idleNs: 100, + spanNs: 300, + }); + }); + + it('returns zeroes for an empty timeline', () => { + expect(requestIdleStats([])).toEqual({ idleNs: 0, spanNs: 0 }); + }); +}); + +describe('subagent timeline hierarchy', () => { + it('parses aux lanes separately from their parent subagent id', () => { + expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:011')).toEqual({ + 
parent: 'conv', + subagentBase: 'subagent_001_abcd', + stream: null, + aux: '011', + }); + }); + + it('renders aux requests as always-visible children of their subagent', () => { + const records = [ + { ...request(0, 10), cid: 'conv' }, + { ...request(10, 30), cid: 'conv::sa:subagent_001_abcd' }, + { ...request(12, 20), cid: 'conv::sa:subagent_001_abcd:aux:011' }, + { ...request(14, 24), cid: 'conv::sa:subagent_001_abcd:aux:012' }, + { ...request(40, 50), cid: 'conv::sa:subagent_002_ef01' }, + ]; + + const rows = buildRequestTimelineRows(records, 'conversation', new Set()); + expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([ + { kind: 'parent', depth: 0 }, + { kind: 'subagent', depth: 1 }, + { kind: 'aux', depth: 2 }, + { kind: 'aux', depth: 2 }, + { kind: 'subagent', depth: 1 }, + ]); + expect(rows[1]!.requests.map((record) => record.cid)).toEqual(['conv::sa:subagent_001_abcd']); + expect(rows[1]!.auxCount).toBe(2); + expect(rows[2]!.label).toBe('aux 011 · parallel'); + expect(rows[3]!.label).toBe('aux 012 · parallel'); + }); + + it('keeps aux lanes visible while primary streams remain collapsed', () => { + const records = [ + { ...request(10, 20), cid: 'conv::sa:subagent_001_abcd:s0' }, + { ...request(12, 22), cid: 'conv::sa:subagent_001_abcd:s1' }, + { ...request(14, 18), cid: 'conv::sa:subagent_001_abcd:aux:001' }, + ]; + + const rows = buildRequestTimelineRows(records, 'conversation', new Set()); + expect(rows.map((row) => row.kind)).toEqual(['parent', 'subagent', 'aux']); + expect(rows[1]!.requests).toHaveLength(2); + expect(rows[2]!.requests).toHaveLength(1); + }); + + it('parses aux lanes hanging directly off the main conversation', () => { + expect(splitTimelineCid('conv::aux:000')).toEqual({ + parent: 'conv', + subagentBase: null, + stream: null, + aux: '000', + }); + expect(splitTimelineCid('conv::aux:red:002')).toEqual({ + parent: 'conv', + subagentBase: null, + stream: null, + aux: 'red:002', + }); + 
expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:red:002')).toEqual({ + parent: 'conv', + subagentBase: 'subagent_001_abcd', + stream: null, + aux: 'red:002', + }); + }); + + it('nests main-agent aux lanes under the parent conversation row', () => { + const records = [ + { ...request(0, 10), cid: 'conv' }, + { ...request(2, 8), cid: 'conv::aux:001' }, + { ...request(4, 12), cid: 'conv::aux:red:002' }, + { ...request(20, 30), cid: 'conv::sa:subagent_001_abcd' }, + ]; + + const rows = buildRequestTimelineRows(records, 'conversation', new Set()); + expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([ + { kind: 'parent', depth: 0 }, + { kind: 'aux', depth: 1 }, + { kind: 'aux', depth: 1 }, + { kind: 'subagent', depth: 1 }, + ]); + expect(rows[0]!.requests.map((record) => record.cid)).toEqual(['conv']); + expect(rows[1]!.label).toBe('aux 001 · parallel'); + expect(rows[1]!.parentRowKey).toBe('conv'); + expect(rows[2]!.label).toBe('aux red:002 · parallel'); + // Aux lanes inherit the parent conversation's color. + expect(rows[1]!.color).toBe(rows[0]!.color); + expect(rows[2]!.color).toBe(rows[0]!.color); + }); + + it('groups main-agent aux requests with their parent for stable order/color', () => { + const records = [ + { ...request(50, 60), cid: 'other' }, + { ...request(0, 10), cid: 'conv::aux:000' }, + { ...request(5, 15), cid: 'conv' }, + ]; + const index = computeStableRowIndex(records, 'conversation'); + // 'conv' groups with its aux lane (earliest start 0) and sorts before 'other'. 
+ expect([...index.keys()].toSorted()).toEqual(['conv', 'other']); + expect(index.get('conv')).toBe(0); + expect(index.get('other')).toBe(1); + }); + + it('deep-links a main-agent aux request to the parent conversation without sa', () => { + expect(conversationHref('slug', { ...request(0, 10), cid: 'abc123::aux:red:002', ti: 3 })).toBe( + '/datasets/slug/conversations/abc123?turn=3', + ); + }); +}); + +describe('conversationHref', () => { + it('builds a turn-carrying dataset link for a main-conversation request', () => { + expect( + conversationHref('cc-traces-weka-062126', { ...request(0, 10), cid: 'abc123', ti: 4 }), + ).toBe('/datasets/cc-traces-weka-062126/conversations/abc123?turn=4'); + }); + + it('carries the subagent id and strips the ::sa suffix from the conv id', () => { + expect( + conversationHref('slug', { + ...request(0, 10), + cid: 'abc123::sa:subagent_001_bf1c5c16:s2', + ti: 7, + }), + ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16'); + }); + + it('uses raw source provenance for flattened-agent dataset links', () => { + expect( + conversationHref('slug', { + ...request(0, 10), + cid: '02bc0afb13f7a2d9efa86c28511261d85c0e::fa:003', + ti: 3, + srcTrace: '02bc0afb13f7a2d9efa86c28511261d85c0e', + srcOuter: 204, + srcKind: 'weka_flat', + }), + ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204'); + }); + + it('uses raw nested source provenance for subagent child links', () => { + expect( + conversationHref('slug', { + ...request(0, 10), + cid: '117ebe75819d050f308a0a81647893abd02d::sa:subagent_010_32ee2daa', + ti: 16, + srcTrace: '117ebe75819d050f308a0a81647893abd02d', + srcOuter: 39, + srcInner: 16, + srcKind: 'weka_subagent', + }), + ).toBe( + '/datasets/slug/conversations/117ebe75819d050f308a0a81647893abd02d?turn=16&raw=39&inner=16', + ); + }); +}); + +describe('stable row order + color across phase filters', () => { + // Same conversations appear in both warmup and profiling. 
Their global + // first-start order is A (0) < B (10) < C (only profiling, 50). The bug: + // filtering to a phase re-sorted + re-colored by the visible subset, so a + // conversation jumped rows and swapped color when toggling phases. + const rec = ( + cid: string, + phase: RequestRecord['phase'], + start: number, + end: number, + ): RequestRecord => ({ ...request(start, end), cid, phase }); + const full: RequestRecord[] = [ + rec('A', 'warmup', 0, 5), + rec('A', 'profiling', 100, 110), + rec('B', 'warmup', 10, 15), + rec('B', 'profiling', 120, 130), + rec('C', 'profiling', 50, 60), // profiling-only; earliest profiling start + ]; + + it('keeps each conversation in the same position and color when the phase changes', () => { + const index = computeStableRowIndex(full, 'conversation'); + const warmupRows = buildRequestTimelineRows( + full.filter((r) => r.phase === 'warmup'), + 'conversation', + new Set(), + index, + ).filter((r) => r.kind === 'parent'); + const profilingRows = buildRequestTimelineRows( + full.filter((r) => r.phase === 'profiling'), + 'conversation', + new Set(), + index, + ).filter((r) => r.kind === 'parent'); + + // Position: A before B in both phases (C only shows in profiling, and sorts + // after A/B by its global index — NOT first by its earlier profiling start). + expect(warmupRows.map((r) => r.label)).toEqual(['A', 'B']); + expect(profilingRows.map((r) => r.label)).toEqual(['A', 'B', 'C']); + + // Color: identical per conversation across phases, distinct between them. 
+ const warmupColors = Object.fromEntries(warmupRows.map((r) => [r.label, r.color])); + const profilingColors = Object.fromEntries(profilingRows.map((r) => [r.label, r.color])); + expect(warmupColors.A).toBe(profilingColors.A); + expect(warmupColors.B).toBe(profilingColors.B); + expect(warmupColors.A).not.toBe(warmupColors.B); + }); + + it('phase-spanning conversations occupy the same ABSOLUTE row in both phase views', () => { + // Warmup-only conversations start earliest — under a plain global-start + // ordering they'd sit above the shared ones in the warmup view but be + // absent from the profiling view, sliding every shared row up when the + // toggle flips. Spanning conversations must sort first so the leading block + // is identical in both views and a carried-over conversation never moves. + const data: RequestRecord[] = [ + rec('W1', 'warmup', 0, 2), + rec('W2', 'warmup', 3, 4), + rec('A', 'warmup', 5, 8), + rec('A', 'profiling', 100, 110), + rec('B', 'warmup', 10, 15), + rec('B', 'profiling', 120, 130), + rec('P', 'profiling', 50, 60), + ]; + const index = computeStableRowIndex(data, 'conversation'); + const parentLabels = (phase: RequestRecord['phase']) => + buildRequestTimelineRows( + data.filter((r) => r.phase === phase), + 'conversation', + new Set(), + index, + ) + .filter((r) => r.kind === 'parent') + .map((r) => r.label); + // Shared block [A, B] leads both views at rows 0 and 1; phase-unique + // conversations fill in below. + expect(parentLabels('warmup')).toEqual(['A', 'B', 'W1', 'W2']); + expect(parentLabels('profiling')).toEqual(['A', 'B', 'P']); + }); + + it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => { + // Sanity: the legacy self-contained path (no index arg) orders by the + // subset's own first-start, which is exactly why the shared index is needed. 
+ const profilingOnly = buildRequestTimelineRows( + full.filter((r) => r.phase === 'profiling'), + 'conversation', + new Set(), + ).filter((r) => r.kind === 'parent'); + // C (start 50) sorts first here, ahead of A (100) and B (120). + expect(profilingOnly.map((r) => r.label)).toEqual(['C', 'A', 'B']); + }); +}); + +describe('parseTimelineViewSnapshot', () => { + const full: TimelineViewSnapshot = { + viewStart: 1_000, + viewEnd: 5_000, + rowMode: 'worker', + phaseFilter: 'warmup', + expanded: ['conv::sa:subagent_001_abcd'], + scrollTop: 240, + scrollLeft: 80, + }; + + it('round-trips a full snapshot', () => { + expect(parseTimelineViewSnapshot(JSON.stringify(full))).toEqual(full); + }); + + it('round-trips the profiling phase and rejects the removed "all" value', () => { + expect( + parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'profiling' }))?.phaseFilter, + ).toBe('profiling'); + // 'all' is no longer a valid phase — coerces back to the profiling default. + expect( + parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'all' }))?.phaseFilter, + ).toBe('profiling'); + }); + + it('returns null for absent or unparseable input', () => { + expect(parseTimelineViewSnapshot(null)).toBeNull(); + expect(parseTimelineViewSnapshot('')).toBeNull(); + expect(parseTimelineViewSnapshot('{not json')).toBeNull(); + expect(parseTimelineViewSnapshot('42')).toBeNull(); + }); + + it('preserves a null viewEnd (not zoomed) and rejects non-finite viewEnd', () => { + const restored = parseTimelineViewSnapshot(JSON.stringify({ ...full, viewEnd: null })); + expect(restored?.viewEnd).toBeNull(); + // NaN / Infinity don't survive JSON, but a malformed string value must coerce to null. 
+ expect(parseTimelineViewSnapshot('{"viewEnd":"oops"}')?.viewEnd).toBeNull(); + }); + + it('falls back to defaults for invalid enums and missing numbers', () => { + expect(parseTimelineViewSnapshot('{}')).toEqual({ + viewStart: 0, + viewEnd: null, + rowMode: 'conversation', + phaseFilter: 'profiling', + expanded: [], + scrollTop: 0, + scrollLeft: 0, + }); + const bogus = parseTimelineViewSnapshot( + JSON.stringify({ rowMode: 'nope', phaseFilter: 'nope', viewStart: 'x', scrollTop: null }), + )!; + expect(bogus.rowMode).toBe('conversation'); + expect(bogus.phaseFilter).toBe('profiling'); + expect(bogus.viewStart).toBe(0); + expect(bogus.scrollTop).toBe(0); + }); + + it('drops non-string entries from the expanded list', () => { + expect(parseTimelineViewSnapshot('{"expanded":["a",1,null,"b"]}')!.expanded).toEqual([ + 'a', + 'b', + ]); + expect(parseTimelineViewSnapshot('{"expanded":"nope"}')!.expanded).toEqual([]); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx new file mode 100644 index 00000000..1786c74d --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -0,0 +1,581 @@ +'use client'; + +import { useCallback, useLayoutEffect, useMemo, useRef, useState } from 'react'; +import { useRouter } from 'next/navigation'; + +import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline'; +import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from '@/lib/analytics'; + +import { requestsForPhase } from './phase-slice'; +import { TimelineBars } from './timeline-bars'; +import { formatDuration } from './timeline-format'; +import { + CHART_WIDTH, + HEADER_HEIGHT, + LABEL_WIDTH, + PADDING_RIGHT, + ROW_GAP, + ROW_HEIGHT, + TIMELINE_BODY_MAX_HEIGHT, + timelineSvgHeight, +} from './timeline-layout'; +import { + 
buildRequestTimelineRows, + computeStableRowIndex, + conversationHref, + requestIdleStats, + type RequestTimelineRow, + type RowMode, +} from './timeline-rows'; +import type { SortedRequestTimes } from './timeline-cursor-stats'; +import { + consumeTimelineViewSnapshot, + saveTimelineViewSnapshot, + type PhaseFilter, +} from './timeline-view-snapshot'; +import { + CursorPopover, + TimelineTooltip, + type CursorState, + type TooltipData, +} from './timeline-tooltips'; + +// Stable public API: pure helpers and types live in focused modules, but +// external consumers (detail page, tests) import them from here. +export { + buildRequestTimelineRows, + computeStableRowIndex, + conversationHref, + datasetConvId, + requestIdleStats, + splitTimelineCid, + subagentIdOf, +} from './timeline-rows'; +export type { RequestIdleStats, RequestTimelineRow } from './timeline-rows'; +export { parseTimelineViewSnapshot } from './timeline-view-snapshot'; +export type { TimelineViewSnapshot } from './timeline-view-snapshot'; + +/** + * Gantt-style request timeline for one agentic benchmark point. + * + * Rows are conversations (or workers — toggle in the header). Bars are + * individual HTTP requests, drawn from request_start to request_end with a + * thin lead-in segment from credit_issued (load gen queue). Shift+scroll + * zooms, drag pans, hover shows per-request stats. + * + * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy. 
+ */ + +const ROW_MODE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' }, + { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' }, +]; + +const PHASE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' }, + { value: 'warmup', label: 'Warmup', testId: 'timeline-phase-warmup' }, +]; + +const PLOT_WIDTH = CHART_WIDTH - PADDING_RIGHT; + +export function RequestTimelineView({ + data, + datasetSlug, + pointId, +}: { + data: RequestTimeline; + /** Source dataset slug for this run; enables click-to-conversation deep links. */ + datasetSlug?: string | null; + /** benchmark_results.id — keys the per-point view-state snapshot for restore. */ + pointId: number; +}) { + const router = useRouter(); + const [rowMode, setRowMode] = useState('conversation'); + const [phaseFilter, setPhaseFilter] = useState('profiling'); + const [tooltip, setTooltip] = useState(null); + + // The scroll container (vertical row scroll + horizontal chart scroll) and a + // ref mirror of the live view state, so click-through can snapshot the exact + // position without rebuilding openConversation on every zoom/pan tick. + const scrollRef = useRef(null); + const liveStateRef = useRef<{ + viewStart: number; + viewEnd: number | null; + rowMode: RowMode; + phaseFilter: PhaseFilter; + expandedSubagents: ReadonlySet; + }>({ + viewStart: 0, + viewEnd: null, + rowMode: 'conversation', + phaseFilter: 'profiling', + expandedSubagents: new Set(), + }); + + const openConversation = useCallback( + (req: RequestRecord) => { + if (!datasetSlug) return; + // Snapshot the current zoom/scroll/filter position so the browser back + // button restores it (see the restore effect below). 
+ if (scrollRef.current) { + const live = liveStateRef.current; + saveTimelineViewSnapshot(pointId, { + viewStart: live.viewStart, + viewEnd: live.viewEnd, + rowMode: live.rowMode, + phaseFilter: live.phaseFilter, + expanded: [...live.expandedSubagents], + scrollTop: scrollRef.current.scrollTop, + scrollLeft: scrollRef.current.scrollLeft, + }); + } + track('agentic_timeline_to_dataset', { slug: datasetSlug }); + router.push(conversationHref(datasetSlug, req)); + }, + [datasetSlug, router, pointId], + ); + // Which multi-stream subagents currently have their per-stream rows + // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). + const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set()); + const toggleSubagent = useCallback((key: string) => { + setExpandedSubagents((prev) => { + const next = new Set(prev); + if (next.has(key)) next.delete(key); + else next.add(key); + return next; + }); + }, []); + const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); + + // The phase toggle only means something when warmup requests are actually + // present. aiperf's profile_export only contains profiling-phase requests, so + // in practice every record is `profiling` and the toggle is a no-op — hide it + // unless a non-profiling request exists (keeps it working if warmup is ever + // exported). + const hasWarmup = useMemo( + () => data.requests.some((r) => r.phase !== 'profiling'), + [data.requests], + ); + + // Apply phase filter, then group into rows. With no warmup data the filter + // collapses to "profiling" regardless of the (hidden) toggle state. + const filtered = useMemo( + () => requestsForPhase(data.requests, hasWarmup ? 
phaseFilter : 'profiling'), + [data.requests, phaseFilter, hasWarmup], + ); + // Stable order/color per conversation (or worker), computed over the FULL + // request set — NOT the phase-filtered subset — so a row keeps its position + // and color when the user toggles between warmup and profiling. + const stableRowIndex = useMemo( + () => computeStableRowIndex(data.requests, rowMode), + [data.requests, rowMode], + ); + const rows = useMemo( + () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents, stableRowIndex), + [filtered, rowMode, expandedSubagents, stableRowIndex], + ); + const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]); + + // Pre-sort the timestamp columns so the cursor-time stats popover can + // count "running / waiting at time t" in O(log n). With a few hundred + // requests this is overkill — but it stays smooth on huge runs too. + const sortedTimes = useMemo(() => { + const credits = filtered.map((r) => r.credit).toSorted((a, b) => a - b); + const starts = filtered.map((r) => r.start).toSorted((a, b) => a - b); + const ends = filtered.map((r) => r.end).toSorted((a, b) => a - b); + return { credits, starts, ends }; + }, [filtered]); + + // Cursor state (vertical line + stats popover). null when the mouse + // isn't over the chart. xPx is svg-local; tNs is the ns offset from + // dataStart that the cursor is pointing at. + const [cursor, setCursor] = useState(null); + + // Timeline extent (clamped to actual data — if we filtered out warmup + // the visible window should shrink to just the profiling phase). 
+ const { dataStart, dataEnd } = useMemo(() => { + if (filtered.length === 0) return { dataStart: 0, dataEnd: 1 }; + let min = Number.POSITIVE_INFINITY; + let max = Number.NEGATIVE_INFINITY; + for (const r of filtered) { + if (r.credit < min) min = r.credit; + if (r.end > max) max = r.end; + } + return { dataStart: min, dataEnd: max }; + }, [filtered]); + const totalNs = Math.max(dataEnd - dataStart, 1); + + // Visible window state (ns offsets, relative to dataStart). + const [viewStart, setViewStart] = useState(0); + const [viewEnd, setViewEnd] = useState(null); + const vStart = viewStart; + const vEnd = viewEnd ?? totalNs; + const visibleDur = Math.max(vEnd - vStart, 1); + const isZoomed = viewEnd !== null; + + // Mirror the live view state into a ref so the click-through snapshot reads + // the latest values without rebuilding openConversation on every zoom tick. + liveStateRef.current = { viewStart, viewEnd, rowMode, phaseFilter, expandedSubagents }; + + // Restore the snapshot written on click-through (e.g. open a request in the + // dataset flamegraph, then hit the browser back button). Runs once per mount, + // keyed by point id; the snapshot is consumed so a later reload starts fresh. + // Scroll is applied after the restored filters/expansions re-render the rows + // (rAF fires after that synchronous commit, before paint — no visible jump). + useLayoutEffect(() => { + const snapshot = consumeTimelineViewSnapshot(pointId); + if (!snapshot) return; + setRowMode(snapshot.rowMode); + setPhaseFilter(snapshot.phaseFilter); + setExpandedSubagents(new Set(snapshot.expanded)); + setViewStart(snapshot.viewStart); + setViewEnd(snapshot.viewEnd); + const target = { top: snapshot.scrollTop, left: snapshot.scrollLeft }; + requestAnimationFrame(() => { + const el = scrollRef.current; + if (!el) return; + el.scrollTop = target.top; + el.scrollLeft = target.left; + }); + // setState setters are stable; only re-run if the point itself changes. 
+ // eslint-disable-next-line react-hooks/exhaustive-deps + }, [pointId]); + + const svgHeight = timelineSvgHeight(rows.length); + + // Native (non-passive) wheel handler: React's synthetic onWheel is attached + // passively, so preventDefault there is silently ignored and shift+scroll + // would zoom AND horizontally pan the scroll container. + const zoomSvgRef = useRef(null); + const handleWheel = useCallback( + (e: WheelEvent) => { + // Zoom only on shift+scroll so plain scrolling keeps its native meaning + // (page / row-container scroll) instead of being hijacked by the chart. + if (!e.shiftKey) return; + e.preventDefault(); + const rect = (e.currentTarget as SVGSVGElement).getBoundingClientRect(); + const mouseX = e.clientX - rect.left; + const mouseRatio = Math.max(0, Math.min(1, mouseX / PLOT_WIDTH)); + const curStart = vStart; + const curEnd = vEnd; + const curDur = curEnd - curStart; + // With shift held, most browsers report the wheel delta on deltaX. + const delta = e.deltaY || e.deltaX; + const factor = delta > 0 ? 
1.2 : 1 / 1.2; + const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs); + const pivot = curStart + mouseRatio * curDur; + let newStart = pivot - mouseRatio * newDur; + let newEnd = pivot + (1 - mouseRatio) * newDur; + if (newStart < 0) { + newEnd -= newStart; + newStart = 0; + } + if (newEnd > totalNs) { + newStart -= newEnd - totalNs; + newEnd = totalNs; + if (newStart < 0) newStart = 0; + } + if (newEnd - newStart >= totalNs * 0.99) { + setViewStart(0); + setViewEnd(null); + } else { + setViewStart(newStart); + setViewEnd(newEnd); + } + }, + [vStart, vEnd, totalNs], + ); + + useLayoutEffect(() => { + const svg = zoomSvgRef.current; + if (!svg) return; + svg.addEventListener('wheel', handleWheel, { passive: false }); + return () => svg.removeEventListener('wheel', handleWheel); + }, [handleWheel]); + + const handleMouseDown = useCallback( + (e: React.MouseEvent) => { + if (e.button !== 0) return; + dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd }; + }, + [vStart, vEnd], + ); + + const handleMouseMove = useCallback( + (e: React.MouseEvent) => { + // Dragging takes precedence over cursor tracking — panning the view. + if (dragRef.current) { + const dx = e.clientX - dragRef.current.startX; + const nsPerPx = visibleDur / PLOT_WIDTH; + const delta = -dx * nsPerPx; + let ns = dragRef.current.vs + delta; + let ne = dragRef.current.ve + delta; + const dur = ne - ns; + if (ns < 0) { + ns = 0; + ne = dur; + } + if (ne > totalNs) { + ne = totalNs; + ns = totalNs - dur; + if (ns < 0) ns = 0; + } + setViewStart(ns); + setViewEnd(ne); + setTooltip(null); + setCursor(null); + return; + } + // Track the cursor position in svg-local px and the matching ns offset + // so the crosshair + stats popover can render. Clamped to the chart + // plot area (don't show a cursor on the axis labels gutter). 
+ const rect = e.currentTarget.getBoundingClientRect(); + const xPx = Math.max(0, Math.min(PLOT_WIDTH, e.clientX - rect.left)); + const nsPerPx = visibleDur / PLOT_WIDTH; + const tNs = vStart + xPx * nsPerPx; + setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY }); + }, + [visibleDur, totalNs, vStart], + ); + + const handleMouseUp = useCallback(() => { + dragRef.current = null; + }, []); + + const handleMouseLeave = useCallback(() => { + dragRef.current = null; + setCursor(null); + }, []); + + const resetZoom = useCallback(() => { + setViewStart(0); + setViewEnd(null); + }, []); + + // Stable bar callbacks so TimelineBars' memo isn't defeated by fresh + // closures on every tooltip/cursor state change. + const handleBarHover = useCallback( + (e: React.MouseEvent, row: RequestTimelineRow, req: RequestRecord) => { + setTooltip({ x: e.clientX, y: e.clientY, row, req }); + }, + [], + ); + const handleBarLeave = useCallback(() => setTooltip(null), []); + const handleBarClick = useCallback( + (e: React.MouseEvent, req: RequestRecord) => { + if (e.metaKey || e.ctrlKey || e.shiftKey || e.altKey || e.button !== 0) return; + e.preventDefault(); + openConversation(req); + }, + [openConversation], + ); + + if (rows.length === 0) { + return ( +
+ No requests in the current filter. +
+ ); + } + + const totalRequests = filtered.length; + + return ( +
+ {/* Controls */} +
+ + {hasWarmup && ( + + )} + + {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '} + {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '} + {formatDuration((dataEnd - dataStart) / 1e6)} ·{' '} + + idle {formatDuration(idleStats.idleNs / 1e6)} + {idleStats.spanNs > 0 + ? ` (${((idleStats.idleNs / idleStats.spanNs) * 100).toFixed(1)}%)` + : ''} + + {isZoomed && ( + <> + {' · '} + + + )} + +
+ + {/* Chart container */} +
+ {/* Fixed-height window: rows scroll vertically and the chart scrolls + horizontally inside it, so the card doesn't grow to fit every + conversation/worker AND the horizontal scrollbar stays pinned to the + window's bottom edge (rather than the bottom of the tall content). */} +
+
+ {/* Label column — pinned left (sticky) so it stays put during + horizontal scroll, while scrolling vertically with the rows. */} +
+
+ + {rowMode === 'conversation' ? 'Conversation' : 'Worker'} + +
+ {rows.map((row) => { + const isSubagentRow = row.kind === 'subagent'; + const isChildRow = row.kind === 'stream' || row.kind === 'aux'; + const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; + const isExpanded = isExpandable && expandedSubagents.has(row.key); + return ( +
+ {isExpandable ? ( + + ) : ( + + )} + + + {row.label} + {isExpandable && ( + ×{row.streamCount} + )} + {isSubagentRow && (row.auxCount ?? 0) > 0 && ( + +{row.auxCount} aux + )} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
+ ); + })} +
+ + {/* Chart column — horizontal scrolling is handled by the window + container above so its scrollbar stays pinned to the window's + bottom edge; double-click anywhere resets the zoom. */} +
+ + + + {/* Cursor crosshair — drawn on top of bars so it stays visible + through dense rows. Stats popover is rendered as fixed + HTML below the SVG block. */} + {cursor && ( + + )} + +
+
+
+
+ + {/* Footer — interaction hint only. */} +
+ + shift+scroll to zoom · drag to pan · double-click to reset + +
+ + {/* Cursor stats popover: count of in-flight / waiting at the cursor's + ns offset. Hidden when the user is hovering an individual bar + (per-request tooltip wins). */} + {cursor && !tooltip && ( + + )} + + {/* Tooltip */} + {tooltip && } +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx b/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx new file mode 100644 index 00000000..6eb109b7 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/server-metric-cards.tsx @@ -0,0 +1,474 @@ +'use client'; + +import type { RequestTimeline } from '@/hooks/api/use-request-timeline'; +import type { MetricSourceSeries, QueueDepthPoint } from '@/hooks/api/use-trace-server-metrics'; +import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from '@/lib/analytics'; + +import { CHART_SIZES, ChartEmpty, ChartSkeleton } from './chart-shared'; +import { ExpandableChart } from './expandable-chart'; +import { metricSourceLabel } from './metric-source-toolbar'; +import type { PhaseSlicedSeries, ServerSeriesLike } from './phase-slice'; +import { StackedAreaChart, TimeSeriesChart } from './time-series-chart'; +import { + cumulativeCompletedRequests, + cumulativeDifferenceMonotonic, + cumulativeTimeAverage, + cumulativeUniqueInputTokens, + buildThroughputChartSeries, + inflightUniqueTokens, + rollingAverage, + timeRollingAverage, + toggleThroughputSeries, + type ThroughputSeriesKey, +} from './time-series-math'; + +/** + * Phase-sliced server series (+ matching durationS). Null while the trace + * blob is loading or absent — cards render a skeleton until it arrives. + */ +type SlicedServerSeries = PhaseSlicedSeries | null; + +export type RequestActivityView = 'queue' | 'completed'; + +const REQUEST_ACTIVITY_OPTIONS: SegmentedToggleOption[] = [ + { value: 'queue', label: 'Queue depth', testId: 'request-activity-queue' }, + { value: 'completed', label: 'Completed', testId: 'request-activity-completed' }, +]; + +/** Compact token count for chart labels: 306808 → "307K tok", 3.2e6 → "3.2M tok". 
*/
+const fmtTokensCompact = (n: number): string => {
+  // Pick the unit from the ROUNDED magnitude, not the raw one: otherwise
+  // 999,500–999,999 prints as "1000K tok" and 999.5–999.99 as "1000 tok"
+  // instead of promoting to "1.0M tok" / "1K tok".
+  if (n >= 1e6 || Math.round(n / 1e3) >= 1000) return `${(n / 1e6).toFixed(1)}M tok`;
+  if (n >= 1e3 || Math.round(n) >= 1000) return `${Math.round(n / 1e3)}K tok`;
+  return `${Math.round(n)} tok`;
+};
+
+// Per-DP-rank color palette for DEP runs (one distinct color per rank in
+// the KV cache utilization overlay). Mirrors the request-timeline row
+// palette so the same DP index reads as the same color across both views.
+// Wraps mod-N if more than 12 ranks ever land.
+const DP_RANK_PALETTE = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
+export function KvCacheUtilizationCard({ sliced }: { sliced: SlicedServerSeries }) {
+  return (
+     {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!sliced) return ;
+        const serverSeries = sliced.series;
+        // For SGLang hicache rows we have both GPU (HBM) util and
+        // host (CPU offload pool) util — overlay them as two lines.
+        const hasHost = serverSeries.hostKvCacheUsage.length > 0;
+        // DEP runs report one series per engine. When there's more
+        // than one, draw one line per rank in distinct colors so
+        // load skew is visible at a glance; cluster-average sits on
+        // top in bold red so it stands out.
+        const perEngine = serverSeries.kvCacheUsageByEngine ?? [];
+        const hasPerEngine = perEngine.length > 1;
+        // Render order matters: per-engine first → average drawn on top.
+        const series = [
+          ...(hasPerEngine
+            ? perEngine.map((e, i) => ({
+                name: `DP ${e.engineLabel}`,
+                data: rollingAverage(e.points, 50),
+                color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
+                // Thin + translucent so the Avg line on top reads as
+                // the headline number, not just one more series.
+                strokeWidth: 1,
+                strokeOpacity: 0.5,
+              }))
+            : []),
+          {
+            name: hasHost ? 'GPU HBM (avg n=50)' : hasPerEngine ?
'Avg' : 'GPU KV cache (avg n=50)', + data: rollingAverage(serverSeries.kvCacheUsage, 50), + // Skip raw scatter when per-engine overlay is on — the + // DP-rank lines already convey the spread, dots would be noise. + rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage, + // Bold red Avg sits on top of the translucent per-DP lines. + // DP 1 in the palette is #ef4444 (lighter red); the darker + // #dc2626 here plus the heavier stroke keeps it distinct. + color: hasPerEngine ? '#dc2626' : '#3b82f6', + strokeWidth: hasPerEngine ? 3.5 : 2, + }, + ...(hasHost + ? [ + { + name: 'CPU offload pool (avg n=50)', + data: rollingAverage(serverSeries.hostKvCacheUsage, 50), + rawData: serverSeries.hostKvCacheUsage, + color: '#f97316', + strokeWidth: 2, + }, + ] + : []), + ]; + return ( + `${(v * 100).toFixed(0)}%`} + yAxisLabel="KV cache (%)" + {...size} + /> + ); + }} + /> + ); +} + +export function RequestActivityCard({ + sliced, + phaseTimeline, + timelineLoading, + view, + onViewChange, +}: { + sliced: SlicedServerSeries; + phaseTimeline: RequestTimeline | null; + timelineLoading: boolean; + view: RequestActivityView; + onViewChange: (view: RequestActivityView) => void; +}) { + return ( + { + onViewChange(value); + track('inference_agentic_request_activity_changed', { view: value }); + }} + ariaLabel="Request activity metric" + testId="request-activity-toggle" + buttonClassName="px-2 py-1 text-xs" + /> + } + render={(expanded) => { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (view === 'completed') { + if (!phaseTimeline) { + return timelineLoading ? 
: ; + } + return ( + + ); + } + if (!sliced) return ; + const serverSeries = sliced.series; + return ( + ({ + t: p.t, + value: p.running, + })), + 50, + ), + color: '#22c55e', + strokeWidth: 2, + }, + { + name: 'Waiting (avg n=50)', + data: rollingAverage( + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ + t: p.t, + value: p.waiting, + })), + 50, + ), + color: '#ef4444', + strokeWidth: 2, + }, + { + name: 'Total (avg n=50)', + data: rollingAverage( + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ + t: p.t, + value: p.total, + })), + 50, + ), + color: '#3b82f6', + strokeWidth: 2, + }, + ]} + durationS={sliced.durationS} + yAxisLabel="Requests" + {...size} + /> + ); + }} + /> + ); +} + +export function PrefixCacheHitRateCard({ sliced }: { sliced: SlicedServerSeries }) { + return ( + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!sliced) return ; + const serverSeries = sliced.series; + return ( + `${(v * 100).toFixed(0)}%`} + yAxisLabel="Hit rate (%)" + {...size} + /> + ); + }} + /> + ); +} + +export function ThroughputCard({ + sliced, + selectedSource, + selected, + onSelectedChange, +}: { + sliced: SlicedServerSeries; + selectedSource: MetricSourceSeries | undefined; + selected: ReadonlySet; + onSelectedChange: (next: ReadonlySet) => void; +}) { + return ( + + {( + [ + ['input', 'Input'], + ['decode', 'Decode'], + ] as const + ).map(([key, label]) => { + const active = selected.has(key); + const isOnlyActive = active && selected.size === 1; + return ( + + ); + })} +
+ } + render={(expanded) => { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!sliced) return ; + const serverSeries = sliced.series; + return ( + + ); + }} + /> + ); +} + +export function PromptTokenSourceCard({ sliced }: { sliced: SlicedServerSeries }) { + return ( + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!sliced) return ; + return ( + + ); + }} + /> + ); +} + +export function CumulativeUniqueInputTokensCard({ sliced }: { sliced: SlicedServerSeries }) { + return ( + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!sliced) return ; + const serverSeries = sliced.series; + // Unique = total prompt tokens received minus tokens served from + // any cache tier — i.e. the freshly prefill-computed tokens. Prefer + // the promptTokensBySource breakdown (its buckets sum to the real + // prompt-token total, so subtracting cache tiers is exact). Fall + // back to cumsum(prefillTps - prefixCacheHitsTps) only for older + // data without the breakdown: vllm:prefix_cache_hits re-counts + // tokens across scheduler passes, so its cumulative can exceed the + // prompt tokens received, driving the diff negative and freezing + // the monotonic-clamped line after a few seconds. + const uniqueFromBreakdown = cumulativeUniqueInputTokens(serverSeries.promptTokensBySource); + const uniqueData = + uniqueFromBreakdown.length > 0 + ? uniqueFromBreakdown + : cumulativeDifferenceMonotonic( + serverSeries.prefillTps, + serverSeries.prefixCacheHitsTps, + ); + return ( + + ); + }} + /> + ); +} + +export function InflightUniqueTokensCard({ + phaseTimeline, + timelineLoading, + kvCachePoolTokens, +}: { + phaseTimeline: RequestTimeline | null; + timelineLoading: boolean; + /** KV-cache pool size in tokens (vLLM only) — drawn as a constant ceiling. */ + kvCachePoolTokens: number | null; +}) { + return ( + { + const size = expanded ? 
CHART_SIZES.expanded : CHART_SIZES.inline; + if (!phaseTimeline) { + return timelineLoading ? : ; + } + // Step function: at each request start/end, sum the ISLs of + // currently-active requests across distinct cids. Within one + // cid turns are sequential so each cid contributes at most + // one in-flight ISL; across cids we treat content as + // independent (cross-conv prefix sharing adds <1pp in + // practice). Smooth with a 30s time-weighted rolling average + // so brief turn-handoff dips don't dominate the chart. + const raw = inflightUniqueTokens(phaseTimeline.requests); + const smoothed = timeRollingAverage(raw, 30); + // KV-cache pool size (vLLM only) drawn as a constant ceiling so + // you can see how close the working set gets to eviction + // pressure. Phase-independent — it's a static config value. + const pool = kvCachePoolTokens; + return ( + 0 + ? [{ value: pool, label: `KV cache pool · ${fmtTokensCompact(pool)}` }] + : undefined + } + {...size} + /> + ); + }} + /> + ); +} diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx new file mode 100644 index 00000000..a1a5d1ab --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx @@ -0,0 +1,247 @@ +'use client'; + +import { useMemo, useState } from 'react'; +import { useRouter } from 'next/navigation'; +import { ChevronLeft, ChevronRight } from 'lucide-react'; + +import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings'; +import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { track } from '@/lib/analytics'; + +const HW_LABELS: Record = { + b200: 'B200', + b300: 'B300', + gb200: 'GB200', + gb300: 'GB300', + h100: 'H100', + h200: 'H200', + mi300x: 'MI300X', + mi325x: 'MI325X', + mi355x: 
'MI355X', +}; + +const MODEL_LABELS: Record = { + dsr1: 'DeepSeek R1', + dsv4: 'DeepSeek V4 Pro', + glm5: 'GLM-5', + 'glm5.1': 'GLM-5.1', + gptoss120b: 'gpt-oss 120B', + kimik2: 'Kimi K2', + 'kimik2.5': 'Kimi K2.5', + 'kimik2.6': 'Kimi K2.6', + llama70b: 'Llama 3.3 70B', + 'minimaxm2.5': 'MiniMax M2.5', + 'minimaxm2.7': 'MiniMax M2.7', + 'qwen3.5': 'Qwen 3.5', +}; + +function hwLabel(hw: string) { + return HW_LABELS[hw] ?? hw.toUpperCase(); +} +function modelLabel(m: string) { + return MODEL_LABELS[m] ?? m; +} +function frameworkLabel(fw: string) { + if (fw === 'vllm') return 'vLLM'; + if (fw === 'sglang') return 'SGLang'; + if (fw === 'trt') return 'TRT'; + if (fw === 'mori-sglang') return 'Mori-SGLang'; + if (fw.startsWith('dynamo-')) return `Dynamo ${fw.slice('dynamo-'.length).toUpperCase()}`; + return fw; +} + +/** Short label for a sibling chip: parallelism + concurrency. */ +export function chipLabel(s: BenchmarkSibling): string { + // Same parallelism labeler the chart points use (TP/EP/TEP/DEP/DPA…). + const parallel = parallelismLabel({ + tp: s.decode_tp, + ep: s.decode_ep, + dpAttention: s.decode_dp_attention, + disagg: s.disagg, + isMultinode: s.is_multinode, + prefillTp: s.prefill_tp, + prefillEp: s.prefill_ep, + prefillDpAttention: s.prefill_dp_attention, + prefillNumWorkers: s.prefill_num_workers, + decodeTp: s.decode_tp, + decodeEp: s.decode_ep, + decodeDpAttention: s.decode_dp_attention, + decodeNumWorkers: s.decode_num_workers, + }); + const offload = s.offload_mode === 'on' ? 
' • off=ON' : ''; + return `${parallel} • c=${s.conc}${offload}`; +} + +type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests'; + +const SORT_OPTIONS: { value: SortMode; label: string }[] = [ + { value: 'default', label: 'Default' }, + { value: 'conc', label: 'Concurrency ↑' }, + { value: 'parallelism', label: 'Parallelism' }, + { value: 'tput', label: 'Throughput/GPU ↓' }, + { value: 'requests', label: 'Total requests ↓' }, +]; + +// Group key for the "parallelism" sort: ep first (so TP/EP1 sorts ahead of +// EP/TEP/DEP groups), then tp, then dp-attention, then disagg — every config +// of one parallelism lands together, ordered by concurrency within. +const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => [ + s.decode_ep ?? 0, + s.decode_tp ?? 0, + s.decode_dp_attention ? 1 : 0, + s.disagg ? 1 : 0, +]; + +function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] { + if (mode === 'default') return siblings; + const out = [...siblings]; + if (mode === 'conc') { + out.sort((a, b) => a.conc - b.conc); + } else if (mode === 'tput') { + // Highest throughput/GPU first; rows missing the metric sink to the end. + out.sort((a, b) => (b.tput_per_gpu ?? -Infinity) - (a.tput_per_gpu ?? -Infinity)); + } else if (mode === 'requests') { + // Most total requests first; rows missing the metric sink to the end. + out.sort((a, b) => (b.total_requests ?? -Infinity) - (a.total_requests ?? -Infinity)); + } else { + out.sort((a, b) => { + const ra = parallelRank(a); + const rb = parallelRank(b); + for (let i = 0; i < ra.length; i++) { + if (ra[i] !== rb[i]) return ra[i] - rb[i]; + } + // Within a parallelism group: offload off before on, then concurrency. + const oa = a.offload_mode === 'on' ? 1 : 0; + const ob = b.offload_mode === 'on' ? 
1 : 0; + return oa - ob || a.conc - b.conc; + }); + } + return out; +} + +const isSortMode = (v: string | null): v is SortMode => + v !== null && SORT_OPTIONS.some((o) => o.value === v); + +export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) { + const router = useRouter(); + // Persist the sort in the URL so clicking a point (which remounts this + // component on the new route) keeps the chosen order instead of resetting. + // Read it once from the URL on mount — this component only renders after the + // client-side siblings query resolves, so `window` is always available here + // (no SSR/hydration mismatch). Matches the app's window-based url-state read. + const [sortMode, setSortMode] = useState(() => { + if (typeof window === 'undefined') return 'default'; + const v = new URLSearchParams(window.location.search).get('sort'); + return isSortMode(v) ? v : 'default'; + }); + + const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]); + + // prev/next follow the displayed (sorted) order so navigation matches the row. + const currentIdx = sorted.findIndex((s) => s.is_current); + const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null; + const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null; + + // Carry the active sort through every point-to-point link. + const hrefFor = (id: number) => + sortMode === 'default' + ? `/inference/agentic/${id}` + : `/inference/agentic/${id}?sort=${sortMode}`; + + const currentId = siblings.find((s) => s.is_current)?.id; + + const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`; + + return ( +
+
+

{skuLabel}

+ + {siblings.length} point{siblings.length === 1 ? '' : 's'} in this run · {sku.date} + +
+
+
+ Sort by + +
+ +
+ {sorted.map((s) => { + const active = s.is_current; + return ( + + ); + })} +
+ +
+
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx new file mode 100644 index 00000000..2131c82e --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -0,0 +1,526 @@ +'use client'; + +import { useMemo } from 'react'; + +import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics'; + +import { ChartHover, type HoverItem } from './chart-hover'; +import { CHART_PAD, ChartEmpty, fmtCount, fmtSeconds } from './chart-shared'; +import { interpAt, type ChartSeries } from './time-series-math'; + +// Historical entry point: the pure data-shaping helpers lived in this module +// before being extracted; re-export them so both import paths stay valid. +export * from './time-series-math'; + +/** A constant horizontal reference line (e.g. a capacity ceiling). */ +export interface ReferenceLine { + value: number; + label: string; + /** Line + label color. Defaults to a muted emerald. */ + color?: string; +} + +interface TimeSeriesChartProps { + series: ChartSeries[]; + durationS: number; + yMax?: number; + yFmt?: (v: number) => string; + yAxisLabel?: string; + width?: number; + height?: number; + /** + * Horizontal reference lines drawn across the plot. Their values are folded + * into the auto y-max so the line stays on-chart even when it exceeds the + * data (e.g. a KV-cache pool ceiling well above the working set). 
+   */
+  refLines?: readonly ReferenceLine[];
+}
+
+const NO_REF_LINES: readonly ReferenceLine[] = [];
+
+const PAD = CHART_PAD;
+
+/**
+ * Line chart for one or more time series that share a time axis.
+ *
+ * The x-axis spans `[0, max(durationS, 1)]`. The y-axis tops out at `yMax`
+ * when the caller supplies one; otherwise at the largest finite value found
+ * in the series data or the reference lines, floored at 1e-9 so the auto
+ * scale never divides by zero.
+ */
+export function TimeSeriesChart({
+  series,
+  durationS,
+  yMax: yMaxOpt,
+  yFmt = fmtCount,
+  yAxisLabel,
+  width = 720,
+  height = 260,
+  refLines = NO_REF_LINES,
+}: TimeSeriesChartProps) {
+  const W = width;
+  const H = height;
+
+  const layout = useMemo(() => {
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    // Auto y-max: the largest finite value across the data AND the reference
+    // lines, so a ceiling above the data (e.g. KV-cache pool >> working set)
+    // still renders inside the plot. Scanned with plain loops instead of
+    // `Math.max(...values)`: spreading a long series into an argument list can
+    // exceed the engine's argument limit and throw a RangeError. Non-finite
+    // samples are skipped so a single NaN/Infinity cannot poison the scale
+    // (the render path already ignores non-finite reference lines).
+    let autoMax = 1e-9;
+    for (const ref of refLines) {
+      if (Number.isFinite(ref.value) && ref.value > autoMax) autoMax = ref.value;
+    }
+    for (const s of series) {
+      for (const d of s.data) {
+        if (Number.isFinite(d.value) && d.value > autoMax) autoMax = d.value;
+      }
+    }
+    const yMax = yMaxOpt ?? autoMax;
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
+    return { innerW, innerH, xMax, yMax, xScale, yScale };
+  }, [series, durationS, yMaxOpt, refLines, W, H]);
+
+  const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
+
+  // Thin a series to roughly one sample per horizontal pixel by keeping every
+  // `stride`-th point; short series are returned untouched.
+  const subsample = (arr: TimeSeriesPoint[]) => {
+    if (arr.length === 0) return arr;
+    const stride = Math.max(1, Math.floor(arr.length / innerW));
+    return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+  };
+
+  // Pre-format axis ticks.
+ const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4); + const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4); + + const resolve = (fraction: number) => { + const t = fraction * xMax; + const items: HoverItem[] = []; + for (const s of series) { + if (s.hideFromHover) continue; + const v = interpAt(s.data, t); + if (v === null || !Number.isFinite(v)) continue; + items.push({ color: s.color, label: s.name, value: yFmt(v) }); + } + if (items.length === 0) return null; + return { items, title: fmtSeconds(t) }; + }; + + if (series.every((s) => s.data.length === 0)) { + return ; + } + + return ( + + {/* y-axis gridlines + labels */} + {yTickVals.map((v, i) => { + const y = yScale(v); + return ( + + + + {yFmt(v)} + + + ); + })} + + {/* Raw scatter underlay */} + {series + .filter((s) => s.rawData && s.rawData.length > 0) + .map((s, si) => + subsample(s.rawData!).map((d, i) => ( + + )), + )} + + {/* Lines */} + {series.map((s, si) => { + if (s.data.length === 0) return null; + const sampled = subsample(s.data); + const path = sampled + .map( + (d, i) => + `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`, + ) + .join(' '); + return ( + + ); + })} + + {/* Horizontal reference lines (e.g. KV-cache pool ceiling). Drawn on top + of the data lines, with a label pinned to the right edge. */} + {refLines.map((ref, i) => { + if (!Number.isFinite(ref.value) || ref.value < 0 || ref.value > yMax) return null; + const y = yScale(ref.value); + const color = ref.color ?? '#16a34a'; + return ( + + + + {ref.label} + + + ); + })} + + {/* X-axis */} + + {xTickVals.map((v, i) => { + const x = xScale(v); + const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle'; + return ( + + {fmtSeconds(v)} + + ); + })} + + time + + + {yAxisLabel && ( + + {yAxisLabel} + + )} + + {/* Legend — skip series flagged hideFromHover so per-engine + underlays don't clutter the chip row. 
*/} + {(() => { + const visible = series.filter((s) => !s.hideFromHover); + const chipY = H - 8; + const chipW = innerW / Math.max(1, visible.length); + return visible.map((s, i) => { + const x = PAD.left + i * chipW; + return ( + + + + {s.name} + + + ); + }); + })()} + + ); +} + +// Fixed colors for the token-source names the chart-series builder emits +// (vLLM names first, then the SGLang names compute-chart-series produces). +const KNOWN_SOURCE_COLORS: Record = { + local_compute: '#f97316', + local_cache_hit: '#3b82f6', + external_kv_transfer: '#22c55e', + miss: '#f97316', + 'cache hit (HBM)': '#3b82f6', + 'cache hit (CPU offload)': '#22c55e', + 'cache hit': '#3b82f6', + 'compute (miss)': '#f97316', +}; + +const SOURCE_LABELS: Record = { + local_compute: 'Prefill', + local_cache_hit: 'HBM Cache Hit', + external_kv_transfer: 'Offload Cache Hit', + miss: 'Miss', +}; + +// Fallback palette for any source name not in KNOWN_SOURCE_COLORS so we never +// emit two layers in the same shade. Cycles by stack (insertion) order. +const FALLBACK_PALETTE = [ + '#3b82f6', + '#f97316', + '#22c55e', + '#a855f7', + '#ef4444', + '#06b6d4', + '#f59e0b', + '#ec4899', +]; + +/** Stacked-area chart for token-source share over time. */ +export function StackedAreaChart({ + sourceSeries, + durationS, + width = 720, + height = 260, +}: { + sourceSeries: Record; + durationS: number; + width?: number; + height?: number; +}) { + const W = width; + const H = height; + + const computed = useMemo(() => { + const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); + if (entries.length === 0) return null; + + // Different sources can land on different scrape timestamps + // (SGLang's hits/misses fire on alternating ticks), so we MUST + // align across all sources before computing shares — otherwise the + // share calculation indexes into each source's own time axis and + // mixes values from different moments. 
+ // + // Approach: union all timestamps across sources, then for each + // unique timestamp carry forward the cumulative sum for every + // source (a source that didn't report at time t holds its previous + // cumulative value rather than dropping to 0). + const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted( + (a, b) => a - b, + ); + + // For each source, walk its (sorted) array and produce a parallel + // cumulative-sum array indexed against `tValues` via carry-forward. + const cum: Record = {}; + for (const [name, arr] of entries) { + const valByT = new Map(arr.map((p) => [p.t, p.value])); + const out: number[] = Array.from({ length: tValues.length }); + let acc = 0; + for (let i = 0; i < tValues.length; i++) { + const v = valByT.get(tValues[i]!); + if (v !== undefined) acc += v; + out[i] = acc; + } + cum[name] = out; + } + + const shares: Record = {}; + for (const name of Object.keys(cum)) shares[name] = []; + for (let i = 0; i < tValues.length; i++) { + const total = entries.reduce((s, [name]) => s + (cum[name]?.[i] ?? 0), 0); + for (const [name] of entries) { + shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0); + } + } + return { tValues, shares }; + }, [sourceSeries]); + + if (!computed) { + return ; + } + const { tValues, shares } = computed; + + const stackOrder = Object.keys(shares); + + // Assign colors once per render in stack order so the layers and the hover + // tooltip always agree, including for unknown source names on the fallback + // palette. + const colorByName = new Map(); + let fallbackIdx = 0; + for (const name of stackOrder) { + const known = KNOWN_SOURCE_COLORS[name]; + colorByName.set(name, known ?? FALLBACK_PALETTE[fallbackIdx++ % FALLBACK_PALETTE.length]!); + } + const colorFor = (name: string): string => colorByName.get(name) ?? 
FALLBACK_PALETTE[0]!; + + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const xMax = Math.max(durationS, 1); + const xScale = (t: number) => PAD.left + (t / xMax) * innerW; + const yScale = (v: number) => PAD.top + (1 - v) * innerH; + + const lower: number[] = Array.from({ length: tValues.length }, () => 0); + const layers = stackOrder.map((name) => { + const upper = shares[name]!.map((v, i) => lower[i]! + v); + const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]); + const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]); + const d = `${top + .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`) + .join(' ')} ${[...bottom] + .toReversed() + .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`) + .join(' ')} Z`; + const color = colorFor(name); + for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; + return { name, color, d }; + }); + + const resolve = (fraction: number) => { + const t = fraction * xMax; + // Find the closest tValue index. + let idx = 0; + let bestDist = Infinity; + for (let i = 0; i < tValues.length; i++) { + const d = Math.abs(tValues[i]! - t); + if (d < bestDist) { + bestDist = d; + idx = i; + } + } + const items: HoverItem[] = stackOrder.map((name) => ({ + color: colorFor(name), + label: SOURCE_LABELS[name] ?? name, + value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`, + })); + return { items, title: fmtSeconds(t) }; + }; + + const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4); + const yTickVals = [0, 0.25, 0.5, 0.75, 1]; + + return ( + + {yTickVals.map((v, i) => { + const y = yScale(v); + return ( + + + + {(v * 100).toFixed(0)}% + + + ); + })} + {layers.map((l, i) => ( + + ))} + + {xTickVals.map((v, i) => { + const x = xScale(v); + const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 
'end' : 'middle'; + return ( + + {fmtSeconds(v)} + + ); + })} + + time + + + % of prefill tokens + + {(() => { + const chipY = H - 8; + const chipW = innerW / Math.max(1, layers.length); + return layers.map((l, i) => { + const x = PAD.left + i * chipW; + return ( + + + + {SOURCE_LABELS[l.name] ?? l.name} + + + ); + }); + })()} + + ); +} diff --git a/packages/app/src/components/inference/agentic-point/time-series-math.test.ts b/packages/app/src/components/inference/agentic-point/time-series-math.test.ts new file mode 100644 index 00000000..d92fc9ba --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/time-series-math.test.ts @@ -0,0 +1,457 @@ +import { describe, expect, it } from 'vitest'; + +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { + averageSequenceLengthInFlight, + buildThroughputChartSeries, + cumulativeAverage, + cumulativeCompletedRequests, + cumulativeDifferenceMonotonic, + cumulativeTimeAverage, + cumulativeUniqueInputTokens, + inflightUniqueTokens, + interpAt, + rollingAverage, + rollingRequestMetric, + timeRollingAverage, + toggleThroughputSeries, +} from './time-series-math'; + +const request = ( + endS: number, + ttftMs: number | null, + tpotMs: number | null, + overrides: Partial = {}, +): RequestRecord => ({ + cid: 'conversation', + ti: endS, + wid: 'worker', + ad: 0, + phase: 'profiling', + credit: 0, + start: 0, + ack: null, + end: endS * 1e9, + ttftMs, + tpotMs, + isl: 100, + osl: 10, + cancelled: false, + ...overrides, +}); + +describe('rollingRequestMetric', () => { + it('computes a trailing P75 TTFT over the requested window', () => { + const result = rollingRequestMetric( + [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)], + 'ttft', + 'p75', + 3, + ); + + expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 }); + expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]); + expect(result.cumulative.map((point) => 
point.value)).toEqual([0.1, 0.175, 0.25, 0.325]); + }); + + it('inverts the rolling TPOT percentile for interactivity', () => { + const result = rollingRequestMetric( + [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)], + 'interactivity', + 'p90', + 3, + ); + + expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]); + expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8); + expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]); + }); + + it('computes E2E latency from request start through request end', () => { + const result = rollingRequestMetric( + [request(2, 100, 10, { start: 500_000_000 }), request(4, 200, 20, { start: 1_000_000_000 })], + 'e2e', + 'p90', + 50, + ); + + expect(result.raw).toEqual([ + { t: 2, value: 1.5 }, + { t: 4, value: 3 }, + ]); + expect(result.trend.at(-1)?.value).toBeCloseTo(2.85, 8); + expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8); + }); + + it('drops cancelled, missing, and non-positive samples (phase is the caller’s concern)', () => { + const result = rollingRequestMetric( + [ + request(1, 100, 10), + request(2, 200, 20, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline + request(3, 300, 30, { cancelled: true }), + request(4, null, null), + request(5, 0, 0), + ], + 'ttft', + 'p90', + ); + + expect(result.raw).toEqual([ + { t: 1, value: 0.1 }, + { t: 2, value: 0.2 }, + ]); + }); +}); + +describe('timeRollingAverage', () => { + it('integrates the step function over the trailing window', () => { + const result = timeRollingAverage( + [ + { t: 0, value: 10 }, + { t: 2, value: 20 }, + { t: 4, value: 40 }, + ], + 4, + ); + + // t=0: zero-length window → raw value. t=2: 10 held on [0,2) → 10. + // t=4: 10 on [0,2) + 20 on [2,4) = 60 area / 4 s = 15. 
+ expect(result).toEqual([ + { t: 0, value: 10 }, + { t: 2, value: 10 }, + { t: 4, value: 15 }, + ]); + }); + + it('carries the pre-window step value into a clipped window', () => { + const result = timeRollingAverage( + [ + { t: 0, value: 10 }, + { t: 2, value: 20 }, + { t: 4, value: 40 }, + ], + 2, + ); + + // Window [2,4): value 20 held throughout (the t=0 sample sets the step + // value at the window start via carry-forward of data[j-1]). + expect(result.at(-1)).toEqual({ t: 4, value: 20 }); + }); + + it('passes through empty input and non-positive windows', () => { + expect(timeRollingAverage([], 30)).toEqual([]); + const data = [{ t: 0, value: 1 }]; + expect(timeRollingAverage(data, 0)).toBe(data); + }); +}); + +describe('rollingAverage', () => { + it('averages a centered window clipped at the edges', () => { + const data = [1, 2, 3, 4].map((value, i) => ({ t: i, value })); + expect(rollingAverage(data, 3).map((p) => p.value)).toEqual([1.5, 2, 3, 3.5]); + }); + + it('passes through window sizes of 1 or less', () => { + const data = [{ t: 0, value: 5 }]; + expect(rollingAverage(data, 1)).toBe(data); + }); +}); + +describe('cumulativeAverage', () => { + it('hides the startup interval without removing it from later averages', () => { + const result = cumulativeAverage( + [ + { t: 0, value: 300 }, + { t: 30, value: 0 }, + { t: 60, value: 0 }, + { t: 90, value: 100 }, + ], + 60, + ); + + expect(result).toEqual([ + { t: 60, value: 100 }, + { t: 90, value: 100 }, + ]); + }); + + it('preserves the original behavior when no burn-in is requested', () => { + expect( + cumulativeAverage([ + { t: 0, value: 10 }, + { t: 1, value: 20 }, + ]), + ).toEqual([ + { t: 0, value: 10 }, + { t: 1, value: 15 }, + ]); + }); +}); + +describe('cumulativeTimeAverage', () => { + it('computes a run-to-date time-weighted average for a step series', () => { + expect( + cumulativeTimeAverage([ + { t: 0, value: 100 }, + { t: 1, value: 300 }, + { t: 3, value: 100 }, + { t: 4, value: 0 }, + ]), 
+ ).toEqual([ + { t: 0, value: 100 }, + { t: 1, value: 100 }, + { t: 3, value: 700 / 3 }, + { t: 4, value: 200 }, + ]); + }); + + it('coalesces same-time request events to their final step value', () => { + expect( + cumulativeTimeAverage([ + { t: 0, value: 0 }, + { t: 0, value: 100 }, + { t: 2, value: 0 }, + ]), + ).toEqual([ + { t: 0, value: 100 }, + { t: 2, value: 100 }, + ]); + }); +}); + +describe('cumulativeCompletedRequests', () => { + it('sorts completions and excludes cancelled requests (phase is the caller’s concern)', () => { + expect( + cumulativeCompletedRequests([ + request(4, 100, 10), + request(2, 100, 10), + request(1, 100, 10, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline + request(3, 100, 10, { cancelled: true }), + ]), + ).toEqual([ + { t: 0, value: 0 }, + { t: 1, value: 1 }, + { t: 2, value: 2 }, + { t: 4, value: 3 }, + ]); + }); + + it('returns no series when there are no successful completions', () => { + expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]); + }); +}); + +describe('averageSequenceLengthInFlight', () => { + it('computes the event-time average across overlapping profiling requests', () => { + expect( + averageSequenceLengthInFlight( + [ + request(4, 100, 10, { start: 0, end: 4_000_000_000, isl: 100 }), + request(3, 100, 10, { start: 1_000_000_000, end: 3_000_000_000, isl: 300 }), + ], + 'isl', + ), + ).toEqual([ + { t: 0, value: 100 }, + { t: 1, value: 200 }, + { t: 3, value: 100 }, + { t: 4, value: 0 }, + ]); + }); + + it('excludes cancelled and missing sequence lengths (phase is the caller’s concern)', () => { + // Only the null-osl and cancelled rows are dropped; the warmup row is kept + // (the caller passes a phase-scoped timeline), so it produces a step series. 
+ expect( + averageSequenceLengthInFlight( + [ + request(1, 100, 10, { osl: null }), + request(2, 100, 10, { osl: 20, cancelled: true }), + request(3, 100, 10, { osl: 30, phase: 'warmup', start: 0, end: 3_000_000_000 }), + ], + 'osl', + ), + ).toEqual([ + { t: 0, value: 30 }, + { t: 3, value: 0 }, + ]); + }); +}); + +describe('toggleThroughputSeries', () => { + it('allows either series to be hidden when both are selected', () => { + expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'input')]).toEqual(['decode']); + expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'decode')]).toEqual(['input']); + }); + + it('does not allow the final visible series to be hidden', () => { + const selected = new Set<'input' | 'decode'>(['decode']); + expect(toggleThroughputSeries(selected, 'decode')).toBe(selected); + }); + + it('allows the hidden series to be restored', () => { + expect([...toggleThroughputSeries(new Set(['decode']), 'input')]).toEqual(['decode', 'input']); + }); + + it('only includes the total running average when both series are visible', () => { + const input = [{ t: 0, value: 10 }]; + const decode = [{ t: 0, value: 20 }]; + + expect( + buildThroughputChartSeries(input, decode, new Set(['input', 'decode'])).map( + ({ name }) => name, + ), + ).toEqual(['Input (avg n=50)', 'Decode (avg n=50)', 'Total running avg (60s burn-in)']); + expect( + buildThroughputChartSeries(input, decode, new Set(['input'])).map(({ name }) => name), + ).toEqual(['Input (avg n=50)']); + expect( + buildThroughputChartSeries(input, decode, new Set(['decode'])).map(({ name }) => name), + ).toEqual(['Decode (avg n=50)']); + }); +}); + +describe('cumulativeUniqueInputTokens', () => { + it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => { + const out = cumulativeUniqueInputTokens({ + local_compute: [ + { t: 0, value: 100 }, + { t: 1, value: 50 }, + ], + local_cache_hit: [ + { t: 0, value: 900 }, + { t: 1, value: 950 }, + ], + 
external_kv_transfer: [ + { t: 0, value: 5000 }, + { t: 1, value: 6000 }, + ], + }); + expect(out).toEqual([ + { t: 0, value: 100 }, + { t: 1, value: 150 }, + ]); + }); + + it('recognizes the sglang compute/cache labels the builder emits', () => { + const out = cumulativeUniqueInputTokens({ + 'compute (miss)': [ + { t: 0, value: 10 }, + { t: 2, value: 20 }, + ], + 'cache hit (HBM)': [{ t: 0, value: 999 }], + 'cache hit (CPU offload)': [{ t: 2, value: 999 }], + }); + expect(out).toEqual([ + { t: 0, value: 10 }, + { t: 2, value: 30 }, + ]); + }); + + it('sums multiple non-cache buckets at the same timestamp', () => { + const out = cumulativeUniqueInputTokens({ + local_compute: [{ t: 0, value: 100 }], + miss: [{ t: 0, value: 25 }], + }); + expect(out).toEqual([{ t: 0, value: 125 }]); + }); + + it('is monotonic non-decreasing (no clamp needed — values are rates ≥ 0)', () => { + const out = cumulativeUniqueInputTokens({ + local_compute: [ + { t: 0, value: 300 }, + { t: 1, value: 0 }, + { t: 2, value: 10 }, + ], + }); + expect(out.map((p) => p.value)).toEqual([300, 300, 310]); + }); + + it('returns [] when there is no breakdown so the caller can fall back', () => { + expect(cumulativeUniqueInputTokens(undefined)).toEqual([]); + expect(cumulativeUniqueInputTokens({})).toEqual([]); + }); + + it('returns [] when every bucket is a cache tier (no computed signal)', () => { + expect( + cumulativeUniqueInputTokens({ + local_cache_hit: [{ t: 0, value: 100 }], + 'cache hit': [{ t: 0, value: 100 }], + }), + ).toEqual([]); + }); +}); + +describe('inflightUniqueTokens', () => { + it('sums active ISLs across cids as a step series (ends before starts on ties)', () => { + const out = inflightUniqueTokens([ + { cid: 'a', start: 0, end: 2e9, isl: 100 }, + { cid: 'a', start: 2e9, end: 4e9, isl: 150 }, // turn handoff at t=2 + { cid: 'b', start: 1e9, end: 3e9, isl: 200 }, + ]); + expect(out).toEqual([ + { t: 0, value: 0 }, + { t: 0, value: 100 }, + { t: 1, value: 300 }, + { t: 2, value: 
200 }, // end of a's turn 1 processed first — no double count + { t: 2, value: 350 }, + { t: 3, value: 150 }, + { t: 4, value: 0 }, + ]); + }); + + it('counts one in-flight ISL per cid even when its requests overlap', () => { + const out = inflightUniqueTokens([ + { cid: 'a', start: 0, end: 3e9, isl: 100 }, + { cid: 'a', start: 1e9, end: 2e9, isl: 50 }, + ]); + expect(out).toEqual([ + { t: 0, value: 0 }, + { t: 0, value: 100 }, + { t: 1, value: 100 }, // nested request folded into the cid's max ISL + { t: 2, value: 0 }, + { t: 3, value: 0 }, + ]); + }); + + it('skips requests without a positive ISL and empty input', () => { + expect(inflightUniqueTokens([])).toEqual([]); + expect(inflightUniqueTokens([{ cid: 'a', start: 0, end: 1e9, isl: null }])).toEqual([]); + expect(inflightUniqueTokens([{ cid: 'a', start: 0, end: 1e9, isl: 0 }])).toEqual([]); + }); +}); + +describe('cumulativeDifferenceMonotonic', () => { + it('unions timestamps and clamps the difference to its running max', () => { + expect( + cumulativeDifferenceMonotonic( + [ + { t: 0, value: 10 }, + { t: 1, value: 10 }, + ], + [ + { t: 0, value: 5 }, + { t: 2, value: 20 }, // drives the raw diff negative — clamp holds + ], + ), + ).toEqual([ + { t: 0, value: 5 }, + { t: 1, value: 15 }, + { t: 2, value: 15 }, + ]); + }); +}); + +describe('interpAt', () => { + it('linearly interpolates between samples and clamps outside the range', () => { + const data = [ + { t: 0, value: 0 }, + { t: 10, value: 100 }, + ]; + expect(interpAt(data, 5)).toBe(50); + expect(interpAt(data, -1)).toBe(0); + expect(interpAt(data, 11)).toBe(100); + expect(interpAt([], 5)).toBeNull(); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/time-series-math.ts b/packages/app/src/components/inference/agentic-point/time-series-math.ts new file mode 100644 index 00000000..7242db4d --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/time-series-math.ts @@ -0,0 +1,491 @@ +/** + * Pure data-shaping 
helpers behind the agentic point-detail time-series + * charts: rolling/cumulative aggregations over `TimeSeriesPoint[]` server + * scrapes and per-request timeline records. No React, no SVG — everything + * here is unit-testable in isolation (see time-series-math.test.ts). + */ + +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; +import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics'; + +/** One drawable line in a TimeSeriesChart. */ +export interface ChartSeries { + name: string; + /** The line to draw (caller pre-smooths if desired). */ + data: TimeSeriesPoint[]; + /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */ + rawData?: TimeSeriesPoint[]; + color: string; + /** Override default stroke width (1.8). Use higher values for emphasis lines. */ + strokeWidth?: number; + /** Stroke opacity (0..1). Use < 1 for background/underlay lines. */ + strokeOpacity?: number; + /** Hide from the hover legend (e.g. per-engine underlay lines that + * would clutter the tooltip). The path still renders. */ + hideFromHover?: boolean; +} + +export type RequestMetric = 'interactivity' | 'ttft' | 'e2e'; +export type RequestPercentile = 'p75' | 'p90'; +export type ThroughputSeriesKey = 'input' | 'decode'; + +/** Toggle one throughput series while preserving the at-least-one invariant. */ +export function toggleThroughputSeries( + selected: ReadonlySet, + key: ThroughputSeriesKey, +): ReadonlySet { + if (selected.has(key) && selected.size === 1) return selected; + const next = new Set(selected); + if (next.has(key)) next.delete(key); + else next.add(key); + return next; +} + +/** Linear-interpolated percentile (matches numpy's default method). 
*/ +export function quantile(sortedAsc: number[], q: number): number { + if (sortedAsc.length === 1) return sortedAsc[0]!; + const pos = (sortedAsc.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sortedAsc[lo]!; + return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo); +} + +/** Linear-interpolated value at time `t` from a time-sorted series. */ +export function interpAt(data: TimeSeriesPoint[], t: number): number | null { + if (data.length === 0) return null; + if (t <= data[0]!.t) return data[0]!.value; + if (t >= data.at(-1)!.t) return data.at(-1)!.value; + // Binary search + let lo = 0; + let hi = data.length - 1; + while (hi - lo > 1) { + const mid = (lo + hi) >> 1; + if (data[mid]!.t <= t) lo = mid; + else hi = mid; + } + const a = data[lo]!; + const b = data[hi]!; + if (b.t === a.t) return a.value; + const frac = (t - a.t) / (b.t - a.t); + return a.value + (b.value - a.value) * frac; +} + +/** + * Build raw request samples plus a trailing request-count percentile. E2E + * latency is measured from HTTP request start through final response byte. + * + * The percentile is computed in latency space. Interactivity then inverts + * the selected TPOT percentile, matching the aggregate chart convention: + * P90 interactivity = 1 / P90 TPOT (a conservative tail-latency view). + */ +export function rollingRequestMetric( + requests: readonly RequestRecord[], + metric: RequestMetric, + percentile: RequestPercentile, + windowSize = 50, +): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } { + const q = percentile === 'p75' ? 0.75 : 0.9; + // Phase is the caller's concern — the agentic detail page passes a + // phase-scoped (warmup or profiling) timeline. Here we only drop cancelled + // requests and samples without a usable latency value. 
+ const samples = requests + .filter((request) => !request.cancelled) + .flatMap((request) => { + const latencyMs = + metric === 'ttft' + ? request.ttftMs + : metric === 'e2e' + ? (request.end - request.start) / 1e6 + : request.tpotMs; + if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return []; + return [{ t: request.end / 1e9, latencyMs }]; + }) + .toSorted((a, b) => a.t - b.t); + + const raw = samples.map(({ t, latencyMs }) => ({ + t, + value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000, + })); + const trend = samples.map(({ t }, i) => { + const start = Math.max(0, i - Math.max(1, windowSize) + 1); + const sorted = samples + .slice(start, i + 1) + .map((sample) => sample.latencyMs) + .toSorted((a, b) => a - b); + const latencyMs = quantile(sorted, q); + return { t, value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000 }; + }); + const prefixLatencies: number[] = []; + const cumulative = samples.map(({ t, latencyMs }) => { + let lo = 0; + let hi = prefixLatencies.length; + while (lo < hi) { + const mid = (lo + hi) >> 1; + if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1; + else hi = mid; + } + prefixLatencies.splice(lo, 0, latencyMs); + const cumulativeLatencyMs = quantile(prefixLatencies, q); + return { + t, + value: metric === 'interactivity' ? 1000 / cumulativeLatencyMs : cumulativeLatencyMs / 1000, + }; + }); + + return { raw, trend, cumulative }; +} + +/** + * Time-weighted rolling average over a `windowS`-second trailing window. + * Treats the input as a step function (value held constant between + * samples) and integrates over the trailing window, dividing by the + * window length. Good for smoothing irregularly-sampled event series + * (e.g. request start/end events) where the regular sample-count + * `rollingAverage` would over-weight bursts of close-together events. 
+ */ +export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] { + if (data.length === 0 || windowS <= 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + for (let i = 0; i < data.length; i++) { + const tEnd = data[i]!.t; + const tStart = Math.max(0, tEnd - windowS); + // Find the first sample j whose t is >= tStart; the step value at + // tStart is data[j-1].value if j > 0, else data[0].value. + let j = 0; + while (j < data.length && data[j]!.t < tStart) j++; + let prevT = tStart; + let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value; + let area = 0; + for (; j <= i; j++) { + const curT = data[j]!.t; + area += prevV * (curT - prevT); + prevT = curT; + prevV = data[j]!.value; + } + const dur = tEnd - tStart; + out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value }; + } + return out; +} + +/** Centered rolling average over `windowSize` samples. */ +export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] { + if (data.length === 0 || windowSize <= 1) return data; + const half = Math.floor(windowSize / 2); + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + for (let i = 0; i < data.length; i++) { + const start = Math.max(0, i - half); + const end = Math.min(data.length, i + half + 1); + let sum = 0; + let n = 0; + for (let j = start; j < end; j++) { + sum += data[j]!.value; + n++; + } + out[i] = { t: data[i]!.t, value: n > 0 ? sum / n : 0 }; + } + return out; +} + +/** + * Expanding-window cumulative mean from index 0..i. + * + * `burnInS` suppresses rendering during the unstable startup interval while + * retaining those samples in every later average. This avoids visually + * promoting a single bursty counter bucket without changing the run-to-date + * meaning of the line once it appears. 
+ */ +export function cumulativeAverage(data: TimeSeriesPoint[], burnInS = 0): TimeSeriesPoint[] { + if (data.length === 0) return data; + const out: TimeSeriesPoint[] = []; + const firstT = data[0]!.t; + let sum = 0; + for (let i = 0; i < data.length; i++) { + sum += data[i]!.value; + if (data[i]!.t - firstT >= burnInS) { + out.push({ t: data[i]!.t, value: sum / (i + 1) }); + } + } + return out; +} + +/** + * Run-to-date time-weighted average of a step series. + * + * Duplicate timestamps are coalesced to their final value before integration; + * this is important for request handoffs where several start/end events occur + * at the same instant. Each value is held until the next timestamp. + */ +export function cumulativeTimeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { + if (data.length === 0) return []; + const points: TimeSeriesPoint[] = []; + for (const point of data.toSorted((a, b) => a.t - b.t)) { + if (!Number.isFinite(point.t) || !Number.isFinite(point.value)) continue; + const previous = points.at(-1); + if (previous?.t === point.t) previous.value = point.value; + else points.push({ ...point }); + } + if (points.length === 0) return []; + + const firstT = points[0]!.t; + let previousT = firstT; + let previousValue = points[0]!.value; + let area = 0; + return points.map((point, index) => { + if (index === 0) return { t: point.t, value: point.value }; + area += previousValue * (point.t - previousT); + const duration = point.t - firstT; + previousT = point.t; + previousValue = point.value; + return { t: point.t, value: duration > 0 ? area / duration : point.value }; + }); +} + +/** + * Cumulative count of successfully completed (non-cancelled) requests by end + * time. Phase is the caller's concern — pass a phase-scoped timeline. 
+ */ +export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] { + const completionTimes = requests + .filter((request) => !request.cancelled) + .map((request) => request.end / 1e9) + .filter(Number.isFinite) + .toSorted((a, b) => a - b); + if (completionTimes.length === 0) return []; + return [{ t: 0, value: 0 }, ...completionTimes.map((t, index) => ({ t, value: index + 1 }))]; +} + +/** + * Retrospective average sequence length among requests active at each event. + * OSL uses the request's final observed length across its whole lifetime. + */ +export function averageSequenceLengthInFlight( + requests: readonly RequestRecord[], + metric: 'isl' | 'osl', +): TimeSeriesPoint[] { + const events = new Map(); + const addEvent = (t: number, tokenDelta: number, countDelta: number) => { + const current = events.get(t) ?? { tokenDelta: 0, countDelta: 0 }; + current.tokenDelta += tokenDelta; + current.countDelta += countDelta; + events.set(t, current); + }; + + // Phase is the caller's concern — pass a phase-scoped timeline. + for (const request of requests) { + const tokens = request[metric]; + if ( + request.cancelled || + tokens === null || + !Number.isFinite(tokens) || + tokens < 0 || + request.end < request.start + ) { + continue; + } + addEvent(request.start / 1e9, tokens, 1); + addEvent(request.end / 1e9, -tokens, -1); + } + + let tokensInFlight = 0; + let requestsInFlight = 0; + return [...events.entries()] + .toSorted((a, b) => a[0] - b[0]) + .map(([t, event]) => { + tokensInFlight += event.tokenDelta; + requestsInFlight += event.countDelta; + return { t, value: requestsInFlight > 0 ? tokensInFlight / requestsInFlight : 0 }; + }); +} + +// A promptTokensBySource bucket label denotes tokens served from some cache +// tier (local prefix cache, offloaded/host KV, remote KV transfer) rather than +// freshly computed. 
Matches vllm labels (`local_cache_hit`,
// `external_kv_transfer`) and the sglang labels the chart-series builder emits
// (`cache hit (HBM)`, `cache hit (CPU offload)`, `cache hit`).
const CACHE_SOURCE_RE = /cache|hit|transfer|reuse/iu;

/**
 * Cumulative "unique" (freshly prefill-computed) input tokens from the
 * promptTokensBySource breakdown: total prompt tokens minus everything served
 * from a cache tier. The breakdown's buckets sum to the real prompt-token
 * total per scrape, so this is internally consistent and naturally monotonic.
 *
 * Preferred over `cumulativeDifferenceMonotonic(prefillTps, prefixCacheHitsTps)`
 * because `vllm:prefix_cache_hits` re-counts tokens across chunked-prefill /
 * preemption scheduler passes — its cumulative routinely exceeds the prompt
 * tokens ever received, which drove the difference deeply negative and froze
 * the monotonic-clamped curve at whatever it reached in the first few seconds.
 *
 * Any bucket whose label isn't recognizably a cache tier counts as computed
 * (the safe direction for "unique"): a new fresh-compute label over-reports
 * unique slightly rather than silently freezing the line.
 *
 * @param promptTokensBySource per-source point series keyed by source label.
 * @returns running sum of the non-cache buckets, one point per distinct `t`
 *   in ascending order; `[]` when there is no breakdown or every bucket is a
 *   cache tier, so the caller can fall back.
 */
export function cumulativeUniqueInputTokens(
  promptTokensBySource: Record<string, TimeSeriesPoint[]> | undefined,
): TimeSeriesPoint[] {
  if (!promptTokensBySource) return [];
  // Per-timestamp total across every non-cache source.
  const computedByT = new Map<number, number>();
  let sawComputed = false;
  for (const [source, series] of Object.entries(promptTokensBySource)) {
    if (CACHE_SOURCE_RE.test(source)) continue;
    sawComputed = true;
    for (const p of series) computedByT.set(p.t, (computedByT.get(p.t) ?? 0) + p.value);
  }
  if (!sawComputed) return [];
  const out: TimeSeriesPoint[] = [];
  let sum = 0;
  for (const t of [...computedByT.keys()].toSorted((x, y) => x - y)) {
    sum += computedByT.get(t)!;
    out.push({ t, value: sum });
  }
  return out;
}

/**
 * Per-event step series: at each request start/end, sum the ISLs of
 * currently-active requests across distinct `cid`s. Within a single
 * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N),
 * so each cid contributes at most one in-flight ISL at a time. Across
 * different cids we assume content is independent (parent ↔ subagent
 * and conv ↔ conv share negligible prefix in practice — cross-conv
 * dedup added ~0.25 pp to theoretical hit rate, so treating them as
 * independent is a tight approximation of the true in-flight unique
 * token count).
 *
 * Output is a step function: one point per event, value held constant
 * until the next event. Timestamps are converted from ns to seconds but NOT
 * re-based — they stay on the request timeline's own origin — and the series
 * is seeded with a zero point at t = 0. Requests with a null or non-positive
 * ISL contribute no events.
 */
export function inflightUniqueTokens(
  requests: readonly { cid: string; start: number; end: number; isl: number | null }[],
): TimeSeriesPoint[] {
  if (requests.length === 0) return [];
  // The request_timeline timestamps are ns-relative to its own origin.
  // Convert events to seconds and emit a step series.
  interface Event {
    tNs: number;
    kind: 'start' | 'end';
    cid: string;
    isl: number;
  }
  const events: Event[] = [];
  for (const r of requests) {
    const isl = r.isl ?? 0;
    if (isl <= 0) continue;
    events.push(
      { tNs: r.start, kind: 'start', cid: r.cid, isl },
      { tNs: r.end, kind: 'end', cid: r.cid, isl },
    );
  }
  if (events.length === 0) return [];
  // Sort by time; on ties, process 'end' before 'start' so a same-instant
  // turn handoff within one cid doesn't transiently double-count. Two events
  // of the SAME kind at the same instant compare equal: the comparator must be
  // antisymmetric, otherwise the resulting order is implementation-defined.
  events.sort((a, b) => {
    if (a.tNs !== b.tNs) return a.tNs - b.tNs;
    if (a.kind === b.kind) return 0;
    return a.kind === 'end' ? -1 : 1;
  });

  // Active ISL per cid (max in case the same cid somehow has overlapping
  // events; in practice it's always 0 or 1 request at a time per cid).
  const activeByCid = new Map<string, number>();
  let total = 0;
  const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }];
  for (const e of events) {
    const tSec = e.tNs / 1e9;
    if (e.kind === 'start') {
      const prev = activeByCid.get(e.cid) ?? 0;
      const next = Math.max(prev, e.isl);
      activeByCid.set(e.cid, next);
      total += next - prev;
    } else {
      const cur = activeByCid.get(e.cid) ?? 0;
      if (cur > 0) {
        total -= cur;
        activeByCid.delete(e.cid);
      }
    }
    // Clamp so bookkeeping on malformed input never plots a negative count.
    out.push({ t: tSec, value: Math.max(0, total) });
  }
  return out;
}

/**
 * Monotonic-non-decreasing cumulative difference of two rate series:
 * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce
 * a running max so the curve never dips below its prior value.
 *
 * Use this to plot things like "cumulative cache-missed tokens" where the
 * true value can only ever grow, but the underlying per-tick rates can
 * temporarily look negative due to counter timing skew between scrapes
 * (vllm's `prefix_cache_hits` and `prompt_tokens` counters can lag each
 * other by ~5-10 s in our data even though their lifetime totals agree).
 *
 * `a` and `b` may have different (or overlapping) timestamp sets — both
 * are unioned and walked in time order. Output has one point per unique
 * timestamp present in either input. The running max starts at 0, so the
 * result is never negative. If one input repeats a timestamp, the last value
 * for that timestamp wins.
 */
export function cumulativeDifferenceMonotonic(
  a: TimeSeriesPoint[],
  b: TimeSeriesPoint[],
): TimeSeriesPoint[] {
  const aByT = new Map<number, number>(a.map((p) => [p.t, p.value]));
  const bByT = new Map<number, number>(b.map((p) => [p.t, p.value]));
  const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y);
  const out: TimeSeriesPoint[] = Array.from({ length: allT.length });
  let cumA = 0;
  let cumB = 0;
  let runningMax = 0;
  for (let i = 0; i < allT.length; i++) {
    const t = allT[i]!;
    cumA += aByT.get(t) ?? 0;
    cumB += bByT.get(t) ?? 0;
    const diff = cumA - cumB;
    if (diff > runningMax) runningMax = diff;
    out[i] = { t, value: runningMax };
  }
  return out;
}

/**
 * Pointwise sum of two arrays sharing the same t index. Points are paired by
 * position (not by matching `t`); the output takes `t` from `a` and is
 * truncated to the shorter input.
 */
function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
  const n = Math.min(a.length, b.length);
  const out: TimeSeriesPoint[] = Array.from({ length: n });
  for (let i = 0; i < n; i++) {
    out[i] = { t: a[i]!.t, value: a[i]!.value + b[i]!.value };
  }
  return out;
}

/**
 * Build throughput lines from the currently visible input/decode signals.
 *
 * @param input input-throughput points.
 * @param decode decode-throughput points.
 * @param selected which signals are visible; the keys read are `'input'` and
 *   `'decode'`.
 * @returns one smoothed line per selected signal, plus the combined running
 *   average when both are selected.
 */
export function buildThroughputChartSeries(
  input: TimeSeriesPoint[],
  decode: TimeSeriesPoint[],
  selected: ReadonlySet<string>,
): ChartSeries[] {
  const showInput = selected.has('input');
  const showDecode = selected.has('decode');
  const series: ChartSeries[] = [];
  if (showInput) {
    series.push({
      name: 'Input (avg n=50)',
      data: rollingAverage(input, 50),
      color: '#3b82f6',
      strokeWidth: 1.6,
    });
  }
  if (showDecode) {
    series.push({
      name: 'Decode (avg n=50)',
      data: rollingAverage(decode, 50),
      color: '#f97316',
      strokeWidth: 1.6,
    });
  }
  // The total only makes sense when both components are on screen. Test the
  // two keys explicitly rather than the set's size, so an unrelated entry in
  // `selected` can neither fake nor suppress the total line.
  if (showInput && showDecode) {
    series.push({
      name: 'Total running avg (60s burn-in)',
      data: cumulativeAverage(sumSeries(input, decode), 60),
      color: '#ef4444',
      strokeWidth: 3,
    });
  }
  return series;
}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-bars.tsx b/packages/app/src/components/inference/agentic-point/timeline-bars.tsx new file mode 100644 index 00000000..a5444cb2 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-bars.tsx @@ -0,0 +1,252 @@ +'use client'; + +import { memo } from 'react'; + +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { + CHART_WIDTH, + HEADER_HEIGHT, + PADDING_RIGHT, + ROW_GAP, + ROW_HEIGHT, + timelineSvgHeight, +} from './timeline-layout'; +import { formatTickLabel } from './timeline-format'; +import { conversationHref, type RequestTimelineRow
} from './timeline-rows'; + +/** Phase color overlay drawn as a thin strip at the bottom of each bar. */ +const PHASE_COLORS: Record = { + profiling: '#22c55e', + warmup: '#94a3b8', + unknown: '#64748b', +}; + +// Time-axis tick spacing candidates (~8 ticks across the visible window, +// snapped to the first nice multiple that fits). +const NICE_TICK_MS = [ + 100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000, +]; + +export interface TimelineBarsProps { + rows: RequestTimelineRow[]; + expandedSubagents: ReadonlySet; + /** Absolute ns timestamp of the visible data's origin (min credit). */ + dataStart: number; + /** Visible window (ns offsets from dataStart). */ + vStart: number; + vEnd: number; + datasetSlug?: string | null; + onBarHover: (e: React.MouseEvent, row: RequestTimelineRow, req: RequestRecord) => void; + onBarLeave: () => void; + /** Plain left-click SPA navigation; modified clicks fall through to the href. */ + onBarClick: (e: React.MouseEvent, req: RequestRecord) => void; +} + +/** + * The static SVG content of the timeline: time axis, row separators, and every + * request bar. Memoized so tooltip/cursor mousemove state changes in the parent + * (which fire on every pointer move) don't re-render thousands of bar rects — + * only zoom/pan, filter, and expansion changes reach this subtree. + */ +export const TimelineBars = memo( + ({ + rows, + expandedSubagents, + dataStart, + vStart, + vEnd, + datasetSlug, + onBarHover, + onBarLeave, + onBarClick, + }: TimelineBarsProps) => { + const svgHeight = timelineSvgHeight(rows.length); + const visibleDur = Math.max(vEnd - vStart, 1); + const scale = (CHART_WIDTH - PADDING_RIGHT) / visibleDur; + // Local coords: convert ns offset from dataStart to x px. + const xOf = (ns: number) => (ns - dataStart - vStart) * scale; + + // Time-axis ticks (~8 across visible window, snapped to nice second multiples). 
+ const targetMs = visibleDur / 1e6 / 8; + const tickMs = NICE_TICK_MS.find((n) => n >= targetMs) ?? targetMs; + const tickNs = tickMs * 1e6; + const ticks: number[] = []; + const tickStart = Math.floor(vStart / tickNs) * tickNs; + for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) { + if (t >= vStart && t <= vEnd) ticks.push(t); + } + + return ( + <> + {/* Header / time-axis baseline */} + + + {/* Time axis ticks */} + {ticks.map((t) => { + // Convert visible-window ns offset → x px (the tick array + // is already in dataStart-relative coords). + const x = (t - vStart) * scale; + return ( + + + + {formatTickLabel(t)} + + + ); + })} + + {/* Row separators */} + {rows.map((row, idx) => ( + + ))} + + {/* Request bars */} + {rows.map((row, rowIdx) => { + const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; + const barH = ROW_HEIGHT - 4; + // For multi-stream subagent containers, suppress the union + // bars when expanded — the child stream rows draw them + // individually instead, so we'd double-draw otherwise. + if ( + row.kind === 'subagent' && + (row.streamCount ?? 1) > 1 && + expandedSubagents.has(row.key) + ) { + return null; + } + return row.requests.map((req) => { + const xCredit = xOf(req.credit); + const xStart = xOf(req.start); + const xEnd = xOf(req.end); + // Cull bars entirely outside the visible window so big + // benchmarks don't render thousands of zero-width rects. + if (xEnd < -2 || xCredit > CHART_WIDTH + 2) return null; + const runW = Math.max(xEnd - xStart, 1); + const queueW = Math.max(xStart - xCredit, 0); + const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; + const barKey = `${req.cid}-${req.ti}-${req.start}`; + const barChildren = ( + <> + {/* Queue lead-in (faint) — only drawn when noticeable. */} + {queueW >= 1 && ( + + )} + {/* Main bar — opacity stepped down with depth so + parent > subagent > stream reads visually. 
*/} + + {/* Phase strip at bottom */} + + {/* Cancelled X overlay */} + {req.cancelled && runW > 6 && ( + + )} + + ); + // No source dataset → not linkable; plain group. + if (!datasetSlug) { + return ( + onBarHover(e, row, req)} + onMouseLeave={onBarLeave} + > + {barChildren} + + ); + } + // Linkable: render a real SVG anchor with the conversation + // href so the browser's native "open in new tab" works + // (right-click menu, ⌘/Ctrl-click, middle-click). Plain + // left-click stays an in-app navigation; modified or + // non-primary clicks fall through to the browser. Suppress + // the native link drag so it doesn't fight the pan gesture. + return ( + onBarHover(e, row, req)} + onMouseLeave={onBarLeave} + onClick={(e) => onBarClick(e, req)} + onDragStart={(e) => e.preventDefault()} + style={{ cursor: 'pointer' }} + > + {barChildren} + + ); + }); + })} + + ); + }, +); diff --git a/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts new file mode 100644 index 00000000..47c0f034 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.test.ts @@ -0,0 +1,69 @@ +import { describe, expect, it } from 'vitest'; + +import { countLeq, countLt, cursorStatsAt } from './timeline-cursor-stats'; + +describe('countLeq / countLt', () => { + const sorted = [1, 3, 3, 5, 9]; + + it('counts values <= / < target with binary search', () => { + expect(countLeq(sorted, 3)).toBe(3); + expect(countLt(sorted, 3)).toBe(1); + expect(countLeq(sorted, 0)).toBe(0); + expect(countLt(sorted, 0)).toBe(0); + expect(countLeq(sorted, 9)).toBe(5); + expect(countLt(sorted, 9)).toBe(4); + expect(countLeq(sorted, 100)).toBe(5); + }); + + it('handles empty arrays', () => { + expect(countLeq([], 1)).toBe(0); + expect(countLt([], 1)).toBe(0); + }); +}); + +describe('cursorStatsAt', () => { + // Three requests on a shared clock: + // A: credit 0, start 2, end 
10 + // B: credit 1, start 5, end 8 + // C: credit 12, start 14, end 20 + const times = { + credits: [0, 1, 12], + starts: [2, 5, 14], + ends: [8, 10, 20], + }; + + it('counts running, waiting, and completed at an instant', () => { + // t=3: A running, B credited but not started, C not yet credited. + expect(cursorStatsAt(times, 3)).toEqual({ + running: 1, + waiting: 1, + completed: 0, + inflight: 2, + }); + // t=6: A and B running. + expect(cursorStatsAt(times, 6)).toEqual({ + running: 2, + waiting: 0, + completed: 0, + inflight: 2, + }); + // t=13: A and B done, C waiting in queue. + expect(cursorStatsAt(times, 13)).toEqual({ + running: 0, + waiting: 1, + completed: 2, + inflight: 1, + }); + }); + + it('counts a request as still running at its exact end instant', () => { + // end < t (strict) excludes the request from "ended", so at t === end it + // still counts as running — matches the popover's documented semantics. + expect(cursorStatsAt(times, 8).running).toBe(2); + expect(cursorStatsAt(times, 8).completed).toBe(1); + }); + + it('never returns negative counts on inconsistent columns', () => { + expect(cursorStatsAt({ credits: [], starts: [0], ends: [] }, 5).waiting).toBe(0); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts new file mode 100644 index 00000000..801cec95 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-cursor-stats.ts @@ -0,0 +1,57 @@ +/** + * Pure math behind the cursor stats popover: count how many requests are + * running / waiting / completed at a given instant, in O(log n) per query via + * binary search over pre-sorted timestamp columns. + */ + +/** Pre-sorted (ascending) timestamp columns for one filtered request set. 
*/
export interface SortedRequestTimes {
  credits: number[];
  starts: number[];
  ends: number[];
}

/** Request counts at one instant, as returned by `cursorStatsAt`. */
export interface CursorStats {
  running: number;
  waiting: number;
  completed: number;
  /** Always `running + waiting`. */
  inflight: number;
}

/**
 * Length of the leading run of `sorted` whose values satisfy `inPrefix`, found
 * by binary search. `inPrefix` must be true for a prefix of the array and
 * false for the rest, which holds for `<= target` / `< target` on ascending
 * data.
 */
function prefixLength(sorted: number[], inPrefix: (value: number) => boolean): number {
  let low = 0;
  let high = sorted.length;
  while (low < high) {
    const middle = (low + high) >>> 1;
    if (inPrefix(sorted[middle]!)) low = middle + 1;
    else high = middle;
  }
  return low;
}

/** How many entries of an ascending array are less than or equal to `target`. */
export function countLeq(sorted: number[], target: number): number {
  return prefixLength(sorted, (value) => value <= target);
}

/** How many entries of an ascending array are strictly less than `target`. */
export function countLt(sorted: number[], target: number): number {
  return prefixLength(sorted, (value) => value < target);
}

/**
 * Request counts at instant `t` (same axis as the sorted columns):
 *   running   = #(start <= t) - #(end < t)
 *   waiting   = #(credit <= t) - #(start <= t)
 *   completed = #(end <= t)
 * A request whose end equals `t` is therefore counted as both running and
 * completed. `running` and `waiting` are clamped at zero so inconsistent
 * columns never yield a negative count.
 */
export function cursorStatsAt(times: SortedRequestTimes, t: number): CursorStats {
  const started = countLeq(times.starts, t);
  const credited = countLeq(times.credits, t);
  const endedBefore = countLt(times.ends, t);
  const completed = countLeq(times.ends, t);

  const running = Math.max(0, started - endedBefore);
  const waiting = Math.max(0, credited - started);
  return { running, waiting, completed, inflight: running + waiting };
}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-format.ts b/packages/app/src/components/inference/agentic-point/timeline-format.ts new file mode 100644 index 00000000..1c0020f3 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-format.ts @@ -0,0 +1,15 @@ +/** Time formatting shared by the timeline axis, header stats, and tooltips. */ + +/** Format ns offset → "+12.3s" / "+1.2m".
*/
export function formatTickLabel(ns: number): string {
  const ms = ns / 1e6;
  let label: string;
  if (ms < 1000) {
    label = `+${ms.toFixed(0)}ms`;
  } else if (ms < 60_000) {
    // One decimal below 10 s, whole seconds from there up.
    const decimals = ms < 10_000 ? 1 : 0;
    label = `+${(ms / 1000).toFixed(decimals)}s`;
  } else {
    label = `+${(ms / 60_000).toFixed(1)}m`;
  }
  return label;
}

/**
 * Format a millisecond duration: whole milliseconds below one second, then
 * seconds or minutes with two decimals.
 */
export function formatDuration(ms: number): string {
  let label: string;
  if (ms < 1000) {
    label = `${ms.toFixed(0)}ms`;
  } else if (ms < 60_000) {
    label = `${(ms / 1000).toFixed(2)}s`;
  } else {
    label = `${(ms / 60_000).toFixed(2)}m`;
  }
  return label;
}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-layout.ts b/packages/app/src/components/inference/agentic-point/timeline-layout.ts new file mode 100644 index 00000000..7043e487 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-layout.ts @@ -0,0 +1,21 @@
/** Layout constants shared by the timeline component and its SVG content. */

// The timeline body is capped at this height and scrolls internally, so a run
// with many conversations/workers doesn't make the card grow unbounded and push
// the rest of the detail page down. Sized to show ~16 rows + the header.
export const TIMELINE_BODY_MAX_HEIGHT = 480;

// Wide enough for a full 36-char conversation id at 10px font, plus the
// indent + color stripe + count badge. Subagent rows inherit the same
// width but truncate the longer "↳ subagent N · hash" tail with ellipsis.
export const LABEL_WIDTH = 360;
export const ROW_HEIGHT = 22;
export const ROW_GAP = 3;
export const HEADER_HEIGHT = 24;
export const PADDING_RIGHT = 12;
export const CHART_WIDTH = 920;

/** Chart height for a given row count (header + rows + bottom padding).
*/
export function timelineSvgHeight(rowCount: number): number {
  // Vertical pitch of one row: the bar itself plus the gap below it.
  const rowPitch = ROW_HEIGHT + ROW_GAP;
  return HEADER_HEIGHT + rowCount * rowPitch + 6;
}
diff --git a/packages/app/src/components/inference/agentic-point/timeline-rows.ts b/packages/app/src/components/inference/agentic-point/timeline-rows.ts new file mode 100644 index 00000000..14bda4ae --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-rows.ts @@ -0,0 +1,476 @@
/**
 * Pure row-building logic for the request timeline: cid parsing, deep-link
 * hrefs, stable ordering/coloring, and grouping requests into Gantt rows.
 * No React — everything here is unit-testable data transformation.
 */

import type { RequestRecord } from '@/hooks/api/use-request-timeline';

export type RowMode = 'conversation' | 'worker';

/**
 * The dataset conversation id for a request: the cid with any subagent/forked
 * suffix (`::sa:…`, `::fa:…`) stripped, i.e. everything before the first `::`.
 * This is exactly the `conv_id` stored in dataset_conversations, so it
 * deep-links into /datasets/<slug>/conversations/<conv_id>.
 */
export function datasetConvId(cid: string): string {
  const cut = cid.indexOf('::');
  if (cut === -1) return cid;
  return cid.slice(0, cut);
}

/**
 * The subagent id encoded in a cid (`…::sa:<agentId>[:s<N>|:aux:<id>]`), or
 * null for a main-conversation request. The harness fans a single subagent
 * into parallel streams with a `:s<N>` or `:aux:<id>` suffix; the dataset
 * SubagentNode.agentId is the bare base (e.g. `subagent_001_b00fdc12`). Agent
 * ids never contain a colon, so the base is everything up to the first one.
 */
export function subagentIdOf(cid: string): string | null {
  const marker = '::sa:';
  const markerAt = cid.indexOf(marker);
  if (markerAt === -1) return null;
  const tail = cid.slice(markerAt + marker.length);
  const firstColon = tail.indexOf(':');
  if (firstColon === -1) return tail;
  return tail.slice(0, firstColon);
}

/**
 * Deep-link URL for the dataset conversation a request maps to.
Carries the turn
 * (and, for subagent requests, the subagent id) so the flamegraph can scroll to
 * / highlight the exact node. Used both for SPA navigation on click and as the
 * real `href` on the request bar so the browser's native "open in new tab"
 * (right-click, ⌘/Ctrl-click, middle-click) works.
 *
 * Query parameters, in order: `turn` (always); `raw` when `srcOuter` is a
 * non-negative integer; `inner` when, additionally, `srcInner` is one too;
 * `sa` when the cid names a subagent and no `inner` was set.
 */
export function conversationHref(datasetSlug: string, req: RequestRecord): string {
  const isIndex = (value: unknown): value is number =>
    typeof value === 'number' && Number.isInteger(value) && value >= 0;

  const query = new URLSearchParams({ turn: String(req.ti) });
  const hasRaw = isIndex(req.srcOuter);
  // `inner` is only meaningful underneath a valid `raw` index.
  const hasInner = hasRaw && isIndex(req.srcInner);
  if (hasRaw) query.set('raw', String(req.srcOuter));
  if (hasInner) query.set('inner', String(req.srcInner));

  const subagentId = subagentIdOf(req.cid);
  if (subagentId && !hasInner) query.set('sa', subagentId);

  // Only the conversation id is URI-encoded; the slug is inserted as given.
  const conversationId = req.srcTrace ?? datasetConvId(req.cid);
  return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(conversationId)}?${query.toString()}`;
}

/** Human label for where a request came from (raw trace index or replay turn). */
export function requestSourceLabel(req: RequestRecord): string {
  if (typeof req.srcOuter !== 'number') return `replay turn ${req.ti + 1}`;
  return typeof req.srcInner === 'number'
    ? `raw ${req.srcOuter} / child ${req.srcInner}`
    : `raw ${req.srcOuter}`;
}

export interface RequestIdleStats {
  /** Total time between the first start and last end with no request running. */
  idleNs: number;
  /** Wall-clock span from the first request start to the final request end. */
  spanNs: number;
}

/**
 * Merge request intervals and sum the gaps between them. Queue time before a
 * request starts is intentionally excluded: "in flight" means [start, end].
*/
export function requestIdleStats(requests: readonly RequestRecord[]): RequestIdleStats {
  // Drop records with non-finite or inverted bounds, then order by start (end
  // breaks ties) so one forward pass can merge overlapping intervals.
  const intervals = requests
    .filter(({ start, end }) => Number.isFinite(start) && Number.isFinite(end) && end >= start)
    .map(({ start, end }) => ({ start, end }))
    .toSorted((a, b) => a.start - b.start || a.end - b.end);
  const first = intervals[0];
  if (first === undefined) return { idleNs: 0, spanNs: 0 };

  let mergedEnd = first.end;
  let idleNs = 0;
  for (const { start, end } of intervals.slice(1)) {
    // A start beyond everything merged so far opens a gap with nothing running.
    if (start > mergedEnd) idleNs += start - mergedEnd;
    if (end > mergedEnd) mergedEnd = end;
  }
  return { idleNs, spanNs: mergedEnd - first.start };
}

/** A stable color palette indexed by row-key hash. */
const ROW_COLORS = [
  '#3b82f6',
  '#ef4444',
  '#10b981',
  '#f59e0b',
  '#a855f7',
  '#06b6d4',
  '#f97316',
  '#84cc16',
  '#ec4899',
  '#14b8a6',
  '#8b5cf6',
  '#eab308',
];

/**
 * Row kinds:
 *   parent   — top-level conversation (depth 0)
 *   worker   — worker swimlane (depth 0, worker mode)
 *   subagent — a subagent invocation (depth 1). Either a single
 *              stream (renders its own bars), or a multi-stream
 *              container whose bars are the union of its streams
 *              when collapsed.
 *   stream   — one :sN stream of a multi-stream subagent (depth 2).
 *              Hidden by default; toggled in via the parent's chevron.
 *   aux      — one :aux:N parallel lane (depth 2). Always visible
 *              beneath its owning subagent.
 */
type RowKind = 'parent' | 'worker' | 'subagent' | 'stream' | 'aux';

export interface RequestTimelineRow {
  key: string;
  label: string;
  color: string;
  requests: RequestRecord[];
  depth: number;
  kind: RowKind;
  /** Number of streams under this subagent (>=1). Only set for subagent rows. */
  streamCount?: number;
  /**
   * For stream and aux rows: the key of the row they nest under (the parent
   * subagent for streams, where it drives expand/collapse).
   */
  parentRowKey?: string;
  /** Number of always-visible auxiliary lanes under this subagent. */
  auxCount?: number;
}

/**
 * Conversation ids for subagent calls look like
 *   <parent>::sa:<subagentBase>[:s<N>|:aux:<auxId>]
 * The optional `:s<N>` suffix is set when the harness fans a single
 * subagent into multiple parallel "streams" (interval-graph
 * decomposition in weka_trace._pack_into_streams). We split it off so
 * we can group every parallel lane under a single subagent header row.
 *
 * Aux lanes can also hang directly off the main conversation (no `::sa:`
 * segment): `<parent>::aux:<auxId>`, where everything after the marker
 * (e.g. `red:<N>`) is returned verbatim as `aux`.
 * These are parallel requests belonging to the main agent itself, so they
 * nest under the parent conversation row rather than forming their own
 * top-level group.
 *
 * @returns `parent` (text before the first marker), `subagentBase` (null
 *   without a `::sa:` segment), `stream` (the numeric `:s<N>` suffix, else
 *   null) and `aux` (the aux lane id, else null).
 */
export function splitTimelineCid(cid: string): {
  parent: string;
  subagentBase: string | null;
  stream: number | null;
  aux: string | null;
} {
  const SUBAGENT_MARKER = '::sa:';
  const AUX_MARKER = '::aux:';

  const sep = cid.indexOf(SUBAGENT_MARKER);
  if (sep === -1) {
    const auxSep = cid.indexOf(AUX_MARKER);
    if (auxSep === -1) return { parent: cid, subagentBase: null, stream: null, aux: null };
    return {
      parent: cid.slice(0, auxSep),
      subagentBase: null,
      stream: null,
      aux: cid.slice(auxSep + AUX_MARKER.length),
    };
  }

  const parent = cid.slice(0, sep);
  // Offset derived from the marker itself instead of a hard-coded 5.
  const raw = cid.slice(sep + SUBAGENT_MARKER.length);

  // Aux lane of a subagent: the base has no colon, the aux id may.
  const auxMatch = /^(?<base>[^:]+):aux:(?<aux>.+)$/.exec(raw);
  if (auxMatch?.groups) {
    return {
      parent,
      subagentBase: auxMatch.groups.base!,
      stream: null,
      aux: auxMatch.groups.aux!,
    };
  }

  // Numbered stream of a multi-stream subagent.
  const streamMatch = /^(?<base>.*):s(?<stream>\d+)$/.exec(raw);
  if (streamMatch?.groups) {
    return {
      parent,
      subagentBase: streamMatch.groups.base!,
      stream: Number(streamMatch.groups.stream),
      aux: null,
    };
  }

  return { parent, subagentBase: raw, stream: null, aux: null };
}

/**
 * Stable order/color index for the top-level row groups (conversations in
 * conversation mode, workers in worker mode), keyed by group id and computed
 * over the FULL (unfiltered) request set.
Both the row ordering and the color
 * palette are driven by this index, so a conversation/worker keeps the same
 * position and color when the phase filter changes the visible subset — without
 * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset
 * is showing, making rows jump and swap colors.
 *
 * Groups that span BOTH phases sort first. The shared set is by definition
 * present in either phase's view, so this leading block renders identically in
 * both — a conversation that carries over from warmup into profiling stays on
 * the exact same row when the toggle flips. Phase-exclusive groups follow, and
 * only they reflow between views. Within each block the order key is the
 * group's earliest request start across all phases; ties break on the group id
 * for determinism.
 *
 * @param requests the full, unfiltered request set.
 * @param mode `'conversation'` groups by the cid's parent conversation; any
 *   other mode groups by worker id (`wid`).
 * @returns group id → zero-based position in the stable order.
 */
export function computeStableRowIndex(
  requests: readonly RequestRecord[],
  mode: RowMode,
): Map<string, number> {
  // Earliest request start seen for each group.
  const firstStart = new Map<string, number>();
  // Which phases each group appears in. Mirrors requestsForPhase's split:
  // 'profiling' is exact, anything else counts as warmup.
  const inProfiling = new Set<string>();
  const inWarmup = new Set<string>();
  for (const r of requests) {
    const key = mode === 'conversation' ? splitTimelineCid(r.cid).parent : r.wid;
    const cur = firstStart.get(key);
    if (cur === undefined || r.start < cur) firstStart.set(key, r.start);
    if (r.phase === 'profiling') inProfiling.add(key);
    else inWarmup.add(key);
  }

  const spansBoth = (key: string): boolean => inProfiling.has(key) && inWarmup.has(key);
  const keys = [...firstStart.keys()].toSorted(
    (a, b) =>
      // 1) both-phase groups first, 2) earliest start, 3) group id.
      Number(spansBoth(b)) - Number(spansBoth(a)) ||
      firstStart.get(a)! - firstStart.get(b)! ||
      (a < b ? -1 : a > b ? 1 : 0),
  );

  const index = new Map<string, number>();
  keys.forEach((key, position) => index.set(key, position));
  return index;
}

/**
 * Group requests into rows.
In conversation mode, output order is:
 *   parent_conv
 *     subagent_001          (collapsed by default, container)
 *       :s0                 (hidden unless expanded)
 *       :s1
 *       aux 011 · parallel  (always visible)
 *     subagent_002
 *     ...
 *
 * `expandedSubagents` controls which subagent containers reveal their
 * stream children. Bars on a collapsed subagent are the UNION of all its
 * streams' requests — overlapping bars visually communicate the
 * stream-level parallelism without expanding.
 *
 * `stableRowIndex` (optional) pins the top-level order + color per group so they
 * survive phase-filter changes; when omitted it's derived from `requests` (the
 * legacy self-contained behavior, used by unit tests).
 *
 * Lanes within a conversation (aux lanes, subagents) are ordered by their
 * earliest request start, computed over the whole lane so the result does not
 * depend on the order of `requests`.
 */
export function buildRequestTimelineRows(
  requests: RequestRecord[],
  mode: RowMode,
  expandedSubagents: ReadonlySet<string>,
  stableRowIndex?: ReadonlyMap<string, number>,
): RequestTimelineRow[] {
  const index = stableRowIndex ?? computeStableRowIndex(requests, mode);
  // Double modulo keeps the palette index in range even for a negative index.
  const colorFor = (key: string) =>
    ROW_COLORS[
      (((index.get(key) ?? 0) % ROW_COLORS.length) + ROW_COLORS.length) % ROW_COLORS.length
    ]!;
  // Groups missing from the index sort after every indexed group.
  const orderOf = (key: string) => index.get(key) ?? Number.POSITIVE_INFINITY;
  const byStart = (a: RequestRecord, b: RequestRecord) => a.start - b.start;
  // Earliest start in a lane; +Infinity for an empty lane. Scans the whole
  // lane rather than trusting element 0, because lanes are filled in input
  // order and only sorted later.
  const earliestStart = (reqs: readonly RequestRecord[]) =>
    reqs.reduce((min, r) => Math.min(min, r.start), Number.POSITIVE_INFINITY);
  // Append `r` to the list stored under `key`, creating the list on first use.
  const appendTo = <K>(lanes: Map<K, RequestRecord[]>, key: K, r: RequestRecord) => {
    const list = lanes.get(key);
    if (list) list.push(r);
    else lanes.set(key, [r]);
  };

  if (mode !== 'conversation') {
    // Worker mode: flat rows, sorted by first activity.
    const groups = new Map<string, RequestRecord[]>();
    for (const r of requests) appendTo(groups, r.wid, r);
    const rows: RequestTimelineRow[] = [];
    for (const [key, list] of groups) {
      list.sort(byStart);
      rows.push({
        key,
        label: shortenWid(key),
        color: colorFor(key),
        requests: list,
        depth: 0,
        kind: 'worker',
      });
    }
    rows.sort(
      (a, b) => orderOf(a.key) - orderOf(b.key) || a.requests[0]!.start - b.requests[0]!.start,
    );
    return rows;
  }

  // Conversation mode — tree: parent → subagent → stream/aux lane.
  interface SubagentLanes {
    // Stream index → requests; null is the lane of a cid with no `:s<N>` suffix.
    streams: Map<number | null, RequestRecord[]>;
    aux: Map<string, RequestRecord[]>;
  }
  interface Tree {
    parentCid: string;
    parentReqs: RequestRecord[];
    // Aux lanes hanging directly off the main agent (`<parent>::aux:…`).
    parentAux: Map<string, RequestRecord[]>;
    // subagentBase → primary streams + always-visible auxiliary lanes.
    subagents: Map<string, SubagentLanes>;
    firstStart: number;
  }
  const trees = new Map<string, Tree>();
  for (const r of requests) {
    const { parent, subagentBase, stream, aux } = splitTimelineCid(r.cid);
    let tree = trees.get(parent);
    if (!tree) {
      tree = {
        parentCid: parent,
        parentReqs: [],
        parentAux: new Map(),
        subagents: new Map(),
        firstStart: Number.POSITIVE_INFINITY,
      };
      trees.set(parent, tree);
    }
    if (subagentBase === null && aux !== null) {
      appendTo(tree.parentAux, aux, r);
    } else if (subagentBase === null) {
      tree.parentReqs.push(r);
    } else {
      let lanes = tree.subagents.get(subagentBase);
      if (!lanes) {
        lanes = { streams: new Map(), aux: new Map() };
        tree.subagents.set(subagentBase, lanes);
      }
      if (aux === null) appendTo(lanes.streams, stream, r);
      else appendTo(lanes.aux, aux, r);
    }
    if (r.start < tree.firstStart) tree.firstStart = r.start;
  }

  const sortedTrees = [...trees.values()].toSorted(
    (a, b) => orderOf(a.parentCid) - orderOf(b.parentCid) || a.firstStart - b.firstStart,
  );
  const rows: RequestTimelineRow[] = [];
  for (const tree of sortedTrees) {
    const color = colorFor(tree.parentCid);
    // Parent row (use a placeholder key if the parent itself wasn't replayed).
    tree.parentReqs.sort(byStart);
    const parentRowKey =
      tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`;
    rows.push({
      key: parentRowKey,
      label: tree.parentCid,
      color,
      requests: tree.parentReqs,
      depth: 0,
      kind: 'parent',
    });

    // Aux lanes belonging to the main agent itself (`<parent>::aux:…`), nested
    // directly beneath the parent row. Always visible, like subagent aux lanes.
    const parentAuxEntries = [...tree.parentAux.entries()].toSorted(
      (a, b) => earliestStart(a[1]) - earliestStart(b[1]),
    );
    for (const [auxId, reqs] of parentAuxEntries) {
      reqs.sort(byStart);
      rows.push({
        key: `${tree.parentCid}::aux:${auxId}`,
        label: `aux ${auxId} · parallel`,
        color,
        requests: reqs,
        depth: 1,
        kind: 'aux',
        parentRowKey,
      });
    }

    // One subagent row per base (which may contain N streams), ordered by the
    // earliest start across all of its stream and aux lanes.
    const subagentStart = (lanes: SubagentLanes) =>
      Math.min(...[...lanes.streams.values(), ...lanes.aux.values()].map(earliestStart));
    const subagentEntries = [...tree.subagents.entries()].toSorted(
      (a, b) => subagentStart(a[1]) - subagentStart(b[1]),
    );
    for (const [saBase, lanes] of subagentEntries) {
      const subagentKey = `${tree.parentCid}::sa:${saBase}`;
      // Union of primary stream requests for collapsed-view bars. Aux lanes
      // stay separate so their overlap remains visible as parallel work.
      const allReqs: RequestRecord[] = [];
      for (const reqs of lanes.streams.values()) allReqs.push(...reqs);
      allReqs.sort(byStart);
      const streamCount = lanes.streams.size;
      rows.push({
        key: subagentKey,
        label: `↳ ${formatSubagentLabel(saBase)}`,
        color,
        requests: allReqs,
        depth: 1,
        kind: 'subagent',
        streamCount,
        auxCount: lanes.aux.size,
      });

      // Stream children only when expanded AND there's more than one
      // stream (a single-stream subagent has nothing extra to show).
      if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
        // Sort by stream index (null first as the "default" stream).
        const streamEntries = [...lanes.streams.entries()].toSorted(
          (a, b) => (a[0] ?? -1) - (b[0] ?? -1),
        );
        for (const [streamIdx, reqs] of streamEntries) {
          reqs.sort(byStart);
          rows.push({
            key: `${subagentKey}:s${streamIdx ?? '∅'}`,
            label: `stream ${streamIdx ?? '∅'}`,
            color,
            requests: reqs,
            depth: 2,
            kind: 'stream',
            parentRowKey: subagentKey,
          });
        }
      }

      // Aux lanes encode concurrent requests within the subagent. Keep them
      // visible even when primary streams are collapsed so parallelism is not
      // hidden behind an interaction.
      const auxEntries = [...lanes.aux.entries()].toSorted(
        (a, b) => earliestStart(a[1]) - earliestStart(b[1]),
      );
      for (const [auxId, reqs] of auxEntries) {
        reqs.sort(byStart);
        rows.push({
          key: `${subagentKey}:aux:${auxId}`,
          label: `aux ${auxId} · parallel`,
          color,
          requests: reqs,
          depth: 2,
          kind: 'aux',
          parentRowKey: subagentKey,
        });
      }
    }
  }
  return rows;
}

/** `subagent_001_bf1c5c16` → `subagent 001 · bf1c` (compact, readable).
*/ +function formatSubagentLabel(raw: string): string { + const m = /^subagent_(?\d+)_(?[0-9a-f]+)$/iu.exec(raw); + if (!m) return raw; + return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`; +} + +/** `worker_4ae87bea` → `w_4ae8` (compact worker swimlane label). */ +export function shortenWid(wid: string): string { + return wid.replace(/^worker_/, 'w_').slice(0, 12); +} diff --git a/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx b/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx new file mode 100644 index 00000000..7aa63efc --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-tooltips.tsx @@ -0,0 +1,143 @@ +'use client'; + +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { formatDuration, formatTickLabel } from './timeline-format'; +import { cursorStatsAt, type SortedRequestTimes } from './timeline-cursor-stats'; +import { requestSourceLabel, shortenWid, type RequestTimelineRow } from './timeline-rows'; + +export interface TooltipData { + x: number; + y: number; + row: RequestTimelineRow; + req: RequestRecord; +} + +/** Per-request hover tooltip (fixed-position, follows the mouse). */ +export function TimelineTooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) { + const { row, req } = data; + const totalMs = (req.end - req.start) / 1e6; + const queueMs = (req.start - req.credit) / 1e6; + return ( +
+
+ + {row.label} + · {requestSourceLabel(req)} + {req.cancelled && · cancelled} +
+
+ Total + {formatDuration(totalMs)} + Queue wait + + {queueMs > 0.5 ? formatDuration(queueMs) : '—'} + + {req.ttftMs !== null && ( + <> + TTFT + + {formatDuration(req.ttftMs)} + + + )} + {req.isl !== null && ( + <> + ISL + + {req.isl.toLocaleString()} + + + )} + {req.osl !== null && ( + <> + OSL + + {req.osl.toLocaleString()} + + + )} + Phase + {req.phase} + {req.ad > 0 && ( + <> + Agent depth + {req.ad} + + )} + Worker + {shortenWid(req.wid)} +
+
+ Started at {formatTickLabel(req.start)} +
+ {linkable && ( +
+ Click to view this conversation in the dataset → +
+ )} +
+ ); +} + +export interface CursorState { + /** Cursor x in svg-local px (drives the crosshair line). */ + xPx: number; + /** ns offset from dataStart the cursor points at. */ + tNs: number; + clientX: number; + clientY: number; +} + +/** Cursor stats popover: requests in flight / waiting / completed at time t. */ +export function CursorPopover({ + cursor, + dataStart, + times, +}: { + cursor: CursorState; + dataStart: number; + times: SortedRequestTimes; +}) { + const t = cursor.tNs; + const { running, waiting, completed, inflight } = cursorStatsAt(times, t); + // Seconds elapsed since the timeline origin (dataStart); t is an offset, not an absolute time. + const tSec = t / 1e9; + // Position the popover near the cursor without overflowing the viewport. + // Reserves 220 px for it; flips to the left of the cursor if it would clip the right. + const wantLeft = cursor.clientX + 14; + const left = + typeof window === 'undefined' || wantLeft + 220 < window.innerWidth + ? wantLeft + : cursor.clientX - 220; + return ( +
+
+ t = + + {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`} + +
+
+ In flight + {inflight} + running + {running} + waiting + {waiting} + Completed + {completed} +
+ {/* dataStart is informational — the displayed t is relative to it. */} +
+ relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock) +
+
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts b/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts new file mode 100644 index 00000000..631bdd94 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/timeline-view-snapshot.ts @@ -0,0 +1,108 @@ +/** + * Persisted view-state snapshot for the request timeline (zoom window, row + * mode, phase filter, expansions, scroll offsets). Written to sessionStorage on + * click-through to a dataset conversation, consumed once on the next mount so + * the browser back button restores the user's exact position. + */ + +import type { StagePhase } from './phase-slice'; +import type { RowMode } from './timeline-rows'; + +// Two phases shown separately (no combined view) — matches the per-point detail +// stage toggle. Reuses StagePhase so the filter predicate is shared. +export type PhaseFilter = StagePhase; + +/** + * Persisted snapshot of the timeline's view state, used to restore the user's + * zoom / scroll / filter position when they return to the page (e.g. clicking a + * request to open the dataset flamegraph, then hitting the browser back button). + * Stored in sessionStorage keyed by point id; written on click-through and + * consumed once on the next mount. + */ +export interface TimelineViewSnapshot { + /** Zoom-pan window start (ns offset from dataStart). */ + viewStart: number; + /** Zoom-pan window end, or null when not zoomed (full extent). */ + viewEnd: number | null; + rowMode: RowMode; + phaseFilter: PhaseFilter; + /** Keys of expanded multi-stream subagent rows. */ + expanded: string[]; + /** Scroll container offsets (vertical row scroll + horizontal). 
*/ + scrollTop: number; + scrollLeft: number; +} + +const TIMELINE_VIEW_SNAPSHOT_PREFIX = 'agentic-timeline-view:'; +const ROW_MODE_VALUES: readonly RowMode[] = ['conversation', 'worker']; +const PHASE_FILTER_VALUES: readonly PhaseFilter[] = ['warmup', 'profiling']; + +const finiteOr = (value: unknown, fallback: number): number => + typeof value === 'number' && Number.isFinite(value) ? value : fallback; + +/** + * Parse a persisted snapshot, coercing/validating each field and falling back + * to defaults so a malformed or stale blob can never break restore. Returns null + * only when the input is absent, is not parseable JSON, or does not parse to an object. + */ +export function parseTimelineViewSnapshot(raw: string | null): TimelineViewSnapshot | null { + if (!raw) return null; + let parsed: unknown; + try { + parsed = JSON.parse(raw); + } catch { + return null; + } + if (!parsed || typeof parsed !== 'object') return null; + const record = parsed as Record; + const rowMode = ROW_MODE_VALUES.includes(record.rowMode as RowMode) + ? (record.rowMode as RowMode) + : 'conversation'; + const phaseFilter = PHASE_FILTER_VALUES.includes(record.phaseFilter as PhaseFilter) + ? (record.phaseFilter as PhaseFilter) + : 'profiling'; + const viewEnd = + typeof record.viewEnd === 'number' && Number.isFinite(record.viewEnd) ? record.viewEnd : null; + const expanded = Array.isArray(record.expanded) + ?
record.expanded.filter((entry): entry is string => typeof entry === 'string') + : []; + return { + viewStart: finiteOr(record.viewStart, 0), + viewEnd, + rowMode, + phaseFilter, + expanded, + scrollTop: finiteOr(record.scrollTop, 0), + scrollLeft: finiteOr(record.scrollLeft, 0), + }; +} + +function timelineSnapshotKey(pointId: number): string { + return `${TIMELINE_VIEW_SNAPSHOT_PREFIX}${pointId}`; +} + +export function saveTimelineViewSnapshot(pointId: number, snapshot: TimelineViewSnapshot): void { + if (typeof window === 'undefined') return; + try { + window.sessionStorage.setItem(timelineSnapshotKey(pointId), JSON.stringify(snapshot)); + } catch { + // sessionStorage can throw (private mode / quota exceeded) — restore is + // best-effort, so a failed write just means no restore next time. + } +} + +/** + * Read AND remove the snapshot (one-shot): we only want to restore once per + * click-through, so a later reload of the same point starts from defaults. + */ +export function consumeTimelineViewSnapshot(pointId: number): TimelineViewSnapshot | null { + if (typeof window === 'undefined') return null; + try { + const key = timelineSnapshotKey(pointId); + const raw = window.sessionStorage.getItem(key); + window.sessionStorage.removeItem(key); + return parseTimelineViewSnapshot(raw); + } catch { + return null; + } +} From 068c5b21d80ea8cbfcde288ad6b368fe5ba596f4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:12:15 -0500 Subject: [PATCH 09/40] feat(inference): agentic-traces integration in the dashboard chart and filters --- .../src/components/GlobalFilterContext.tsx | 27 +- packages/app/src/components/header/header.tsx | 6 + .../components/inference/InferenceContext.tsx | 202 ++++- .../inference/hooks/useChartData.ts | 267 ++++++- .../inference/inference-chart-config.json | 10 +- .../inference/replay/buildReplayTimeline.ts | 3 +- .../app/src/components/inference/types.ts | 79 ++ .../components/inference/ui/ChartControls.tsx | 38 +- 
.../components/inference/ui/ChartDisplay.tsx | 724 ++++++++++-------- .../src/components/inference/ui/GPUGraph.tsx | 76 +- .../ui/ScatterGraph.decoration.test.tsx | 7 + .../components/inference/ui/ScatterGraph.tsx | 421 +++++++--- .../inference/ui/UnofficialChartDisplay.tsx | 4 +- .../src/components/inference/utils.test.ts | 78 +- .../app/src/components/inference/utils.ts | 29 +- .../inference/utils/parallelism-label.test.ts | 58 ++ .../inference/utils/parallelism-label.ts | 79 ++ .../inference/utils/tooltip-utils.test.ts | 32 + .../inference/utils/tooltipUtils.ts | 202 +++-- .../app/src/components/ui/chart-legend.tsx | 26 + .../app/src/components/ui/chart-selectors.tsx | 150 ++++ .../src/components/ui/d3-chart-wrapper.tsx | 53 +- .../unofficial-run-provider.test.ts | 3 + .../components/unofficial-run-provider.tsx | 4 +- packages/app/src/lib/api.ts | 15 +- .../app/src/lib/benchmark-transform.test.ts | 95 ++- packages/app/src/lib/benchmark-transform.ts | 121 ++- packages/app/src/lib/chart-utils.test.ts | 39 +- packages/app/src/lib/chart-utils.ts | 33 +- .../app/src/lib/compare-pair-defaults.test.ts | 3 + packages/app/src/lib/compare-pair-defaults.ts | 1 + packages/app/src/lib/compare-ssr.test.ts | 7 + .../d3-chart/layers/scatter-points.test.ts | 50 +- .../src/lib/d3-chart/layers/scatter-points.ts | 97 ++- packages/app/src/lib/data-mappings.ts | 68 +- packages/app/src/lib/energy-metrics.test.ts | 20 + packages/app/src/lib/url-state.ts | 8 +- 37 files changed, 2469 insertions(+), 666 deletions(-) create mode 100644 packages/app/src/components/inference/utils/parallelism-label.test.ts create mode 100644 packages/app/src/components/inference/utils/parallelism-label.ts diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 6e7afb0b..fddf7871 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -12,6 +12,8 @@ import { useState, } 
from 'react'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; + // useLayoutEffect warns during SSR; alias to useEffect on the server (no-op there anyway). const useIsomorphicLayoutEffect = typeof window === 'undefined' ? useEffect : useLayoutEffect; @@ -22,8 +24,6 @@ function isEnumValue>(e: T, v: string): v is T[ const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; - import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; import { useUrlState } from '@/hooks/useUrlState'; @@ -100,7 +100,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record { const runs: Record = {}; for (const run of data.runs) { const runId = String(run.github_run_id); - const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id); + const runChangelogs = data.changelogs.filter( + (c) => String(c.workflow_run_id) === String(run.github_run_id), + ); runs[runId] = { runId, runDate: run.created_at, @@ -147,7 +149,11 @@ export function GlobalFilterProvider({ const [selectedSequence, setSelectedSequence] = useState(() => { if (initialSequence) return initialSequence; - return Sequence.EightK_OneK; + const urlSeq = getUrlParam('i_seq'); + if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence; + // Prefer Agentic Traces by default when the selected model has it; the + // effectiveSequence fallback below handles models without agentic data. + return Sequence.AgenticTraces; }); const initialValidPrecisions = useMemo( @@ -277,9 +283,7 @@ export function GlobalFilterProvider({ if (!availabilityRows) { return unofficialSeqs.length > 0 ? 
[...new Set(unofficialSeqs)] : SEQUENCE_OPTIONS; } - const dbSeqs = modelRows - .map((r) => islOslToSequence(r.isl, r.osl)) - .filter((s): s is Sequence => s !== null); + const dbSeqs = modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null); const merged = [...new Set([...dbSeqs, ...unofficialSeqs])]; return merged.length > 0 ? merged : SEQUENCE_OPTIONS; }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]); @@ -298,7 +302,7 @@ export function GlobalFilterProvider({ if (!availabilityRows) { return unofficialPrecs.length > 0 ? [...new Set(unofficialPrecs)].toSorted() : ['fp4']; } - const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const dbPrecs = rows.map((r) => r.precision); const merged = [...new Set([...dbPrecs, ...unofficialPrecs])].toSorted(); return merged.length > 0 ? merged : ['fp4']; @@ -307,10 +311,7 @@ export function GlobalFilterProvider({ // Curve count per precision (distinct hw/framework/spec/disagg series) for the // selected model + sequence — drives the auto default toward the densest one. 
const precisionCurveCounts = useMemo( - () => - countCurvesByPrecision( - modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence), - ), + () => countCurvesByPrecision(modelRows.filter((r) => rowToSequence(r) === effectiveSequence)), [modelRows, effectiveSequence], ); @@ -346,7 +347,7 @@ export function GlobalFilterProvider({ // Dates available for selected model + sequence + precisions const availableDates = useMemo(() => { if (!availabilityRows) return []; - const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision)); if (rows.length === 0) { return [...new Set(seqRows.map((r) => r.date))].toSorted(); diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx index 8fbf52ac..95fc0acb 100644 --- a/packages/app/src/components/header/header.tsx +++ b/packages/app/src/components/header/header.tsx @@ -46,6 +46,12 @@ const NAV_LINKS = [ testId: 'nav-link-supporters', event: 'header_supporters_clicked', }, + { + href: '/datasets', + label: 'Datasets', + testId: 'nav-link-datasets', + event: 'header_datasets_clicked', + }, { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' }, { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' }, ] as const; diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 796a8eed..98962126 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -12,7 +12,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from 
'@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import { FAVORITE_PRESETS, @@ -44,7 +44,7 @@ import { import { useUrlState } from '@/hooks/useUrlState'; import { buildAvailabilityHwKey } from '@/lib/chart-utils'; import { getHardwareConfig, getModelSortIndex, isKnownGpu, TABLEAU_10 } from '@/lib/constants'; -import { getModelExclusion, MODEL_PREFIX_MAPPING } from '@/lib/data-mappings'; +import { getModelExclusion, MODEL_PREFIX_MAPPING, sequenceKind } from '@/lib/data-mappings'; import { MtpEngineConflictToast, type MtpEngineConflictDetail, @@ -57,7 +57,12 @@ import { } from '@/lib/exclusion'; import { filterRunsByModel, getDisplayLabel } from '@/lib/utils'; -import { useChartData } from './hooks/useChartData'; +import { + isAgenticOnlyXAxisMode, + useChartData, + X_AXIS_MODES, + type XAxisMode, +} from './hooks/useChartData'; import { resolveComparisonEntries } from './utils/comparisonEntry'; import { EMPTY_QUICK_FILTERS, @@ -150,10 +155,44 @@ export function InferenceProvider({ () => getUrlParam('i_metric') || initialYAxisMetric || 'y_tpPerGpu', ); const [selectedXAxisMetric, setSelectedXAxisMetric] = useState( - () => getUrlParam('i_xmetric') || 'p99_ttft', + () => getUrlParam('i_xmetric') || 'p90_ttft', ); const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState( - () => getUrlParam('i_e2e_xmetric') || null, + () => getUrlParam('i_e2e_xmetric') || 'p90_ttft', + ); + // Selected chart variant. Initialize from URL only — SSR cannot read URL, so + // computing a kind-based default here would diverge between server and client + // and cause a hydration mismatch. The scenario-kind default is applied in a + // post-mount effect below (and a ref tracks whether the user has overridden). 
+ // + // SSR has no URL access, so seed with a fixed default and apply the URL + // value (if any) in a post-mount effect — keeps server + client first render + // identical and avoids "didn't match" hydration warnings when the URL holds + // a non-default mode. + const [selectedXAxisMode, setSelectedXAxisMode] = useState('ttft'); + const xAxisModeFromUrlRef = useRef(false); + useEffect(() => { + if (xAxisModeFromUrlRef.current) return; + const v = getUrlParam('i_xmode'); + if (v && (X_AXIS_MODES as readonly string[]).includes(v)) { + xAxisModeFromUrlRef.current = true; + setSelectedXAxisMode(v as XAxisMode); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + // Wrap the setter so every explicit mode change (button click or reconcile effect) + // also sets xAxisModeFromUrlRef; it does not touch selectedE2eXAxisMetric itself. + const handleSetXAxisMode = useCallback((mode: XAxisMode) => { + xAxisModeFromUrlRef.current = true; + setSelectedXAxisMode(mode); + // The e2e chart's x-axis metric is reconciled in a separate effect below, + // because it depends on sequence kind (fixed-seq has no p90_* metrics) and + // the agentic percentile, both of which can change independently. + }, []); + // Latency percentile applied to the chart x-axis for agentic scenarios. + // Values: 'p90' | 'p99'. Non-agentic charts ignore it. + const [selectedPercentile, setSelectedPercentile] = useState( + () => getUrlParam('i_pctl') || 'p90', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', @@ -201,6 +240,8 @@ export function InferenceProvider({ const dataQuickFilters = activeTab === 'historical' ? EMPTY_QUICK_FILTERS : quickFilters; const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({ urlPrefix: 'i_', + // Inference chart defaults to high contrast (?i_hc=0 overrides off).
+ defaultHighContrast: true, }); const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0'); @@ -208,21 +249,22 @@ export function InferenceProvider({ // Legacy `?i_nolabel=1` from before the rename: keep hiding point labels // explicitly so the share link's intent survives future default changes. if (getUrlParam('i_nolabel') === '1') return false; + if (getUrlParam('i_label') === '0') return false; if (getUrlParam('i_label') === '1') return true; - // Old share links set `?i_advlabel=1` while keeping the labels default - // (shown). Mirror the toggle's auto-enable side-effect on load so those - // links still render advanced labels under the new default-off behavior. - if (getUrlParam('i_advlabel') === '1') return true; - return false; + // Default on: parallelism labels (also default on) are point labels and + // are pointless without them shown. + return true; }); const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1'); + // Parallelism labels default on (?i_advlabel=0 overrides off). const [useAdvancedLabels, setUseAdvancedLabels] = useState( - () => getUrlParam('i_advlabel') === '1', + () => getUrlParam('i_advlabel') !== '0', ); const [showGradientLabels, setShowGradientLabels] = useState( () => getUrlParam('i_gradlabel') === '1', ); - const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0'); + // Line labels default off (?i_linelabel=1 overrides on). + const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1'); const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1'); const [showMinecraftOverlay, setShowMinecraftOverlay] = useState( () => getUrlParam('i_mc') === '1', @@ -291,13 +333,68 @@ export function InferenceProvider({ return ids.length > 0 ? ids.reduce((max, id) => (id > max ? 
id : max), ids[0]) : ''; }, [filteredAvailableRuns]); - // Only constrain the query when an earlier-than-latest run is selected; otherwise - // the chart shows the full latest view (and reuses the materialized-view fast path). + // Only constrain the base query when an earlier-than-latest run is selected. const asOfRunId = effectiveSelectedRunId && latestRunIdForModel && effectiveSelectedRunId !== latestRunIdForModel ? effectiveSelectedRunId : undefined; + // Run-selector scoping: only constrain benchmark data to a specific run when + // there's actually a disambiguation to make for the CURRENT model. The + // raw `availableRuns` is across ALL models on the date, so the picker may + // auto-select a run that produced nothing for the current model — passing + // that runId would return zero rows and hide the chart entirely. + // Compute the set of runs whose CHANGELOG explicitly mentions this model + + // precision. We can't reuse `filterRunsByModel` here because it has a + // fallback that returns all runs when nothing matches (so the picker still + // renders) — which would make us pass a runId that produced no rows for + // the current model, hiding the chart. + // Map each FULL config_key (model-precision-hardware-framework) a run's + // changelog claims to the set of runs claiming it. Single-run scoping should + // only kick in when two runs contest the SAME full key — e.g. a same-day + // re-run of one hardware — because then a DISTINCT ON merge could mix them + // and the user needs to pick which run wins. Runs covering DIFFERENT hardware + // of the same model (e.g. a B300 run and a B200 run on the same date) are + // complementary: both must render via carry-forward. Matching on model+ + // precision alone (the old behavior) wrongly treated those as alternatives + // and scoped the chart to one run, hiding the other GPU's curve. 
+ const contestedRunIds = useMemo(() => { + const runsByConfigKey = new Map>(); + if (availableRuns) { + for (const [runId, runInfo] of Object.entries(availableRuns)) { + if (!runInfo.changelog) continue; + for (const entry of runInfo.changelog.entries) { + for (const key of entry.config_keys) { + const parts = key.split('-'); + if (modelPrefixes.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) { + let runs = runsByConfigKey.get(key); + if (!runs) { + runs = new Set(); + runsByConfigKey.set(key, runs); + } + runs.add(runId); + } + } + } + } + } + // A run is "contested" only if some full config_key it claims is also claimed + // by another run. Only then does picking a run disambiguate anything. + // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the + // WHOLE chart to the run: only the configs the run actually produced are + // pinned to it, and every other config (e.g. another framework's same-day + // run) still carries forward from the normal latest-per-config rows. + const contested = new Set(); + for (const runs of runsByConfigKey.values()) { + if (runs.size > 1) for (const r of runs) contested.add(r); + } + return contested; + }, [availableRuns, modelPrefixes, effectivePrecisions]); + const benchmarkRunId = + effectiveSelectedRunId && contestedRunIds.has(String(effectiveSelectedRunId)) + ? String(effectiveSelectedRunId) + : undefined; + const { graphs, loading: chartDataLoading, @@ -319,7 +416,10 @@ export function InferenceProvider({ effectiveRunDate, isActive, latestDate, + selectedPercentile, compareGpuPair ?? 
null, + benchmarkRunId, + selectedXAxisMode, asOfRunId, dataQuickFilters, ); @@ -335,7 +435,7 @@ export function InferenceProvider({ if (!availabilityRows) return availableDates; const rows = availabilityRows.filter((r) => { if (!dbModelKeys.includes(r.model)) return false; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false; + if (rowToSequence(r) !== effectiveSequence) return false; if (!effectivePrecisions.includes(r.precision)) return false; if (!r.hardware) return false; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -360,7 +460,7 @@ export function InferenceProvider({ const hwKeys = new Set(); for (const r of availabilityRows) { if (!dbModelKeys.includes(r.model)) continue; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue; + if (rowToSequence(r) !== effectiveSequence) continue; if (!effectivePrecisions.includes(r.precision)) continue; if (!r.hardware) continue; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -432,6 +532,60 @@ export function InferenceProvider({ setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev)); }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]); + // Reconcile the x-axis mode with the scenario kind: + // - On mount with no `i_xmode` URL param: snap to the kind's natural default + // (interactivity for both agentic and fixed-sequence scenarios). The state was initialized + // to a SSR-stable constant so server and client render the same DOM; this + // effect fixes it up after hydration. + // - When the user later switches sequence kinds: snap to the new kind's + // natural default (the prior selection was for a different kind, so it + // doesn't carry over). 
+ const lastSeqKindRef = useRef | null>(null); + useEffect(() => { + const kind = sequenceKind(effectiveSequence); + const isInitialMount = lastSeqKindRef.current === null; + const isAgenticOnlyMode = isAgenticOnlyXAxisMode(selectedXAxisMode); + // On a stale render where kind hasn't changed, bail unless the current + // mode is agentic-only and we just landed on a fixed-seq scenario — in + // that case force the snap so the chart doesn't try to plot trace-derived + // metrics against rows that have no trace_replay. + if (!isInitialMount && lastSeqKindRef.current === kind) { + if (kind === 'fixed-seq' && isAgenticOnlyMode) { + handleSetXAxisMode('interactivity'); + } + return; + } + lastSeqKindRef.current = kind; + if ( + isInitialMount && + xAxisModeFromUrlRef.current && + !(kind === 'fixed-seq' && isAgenticOnlyMode) + ) { + // URL-restored agentic-only mode on a fixed-seq sequence makes no sense + // — fall through to the default snap below. + return; + } + handleSetXAxisMode('interactivity'); + }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]); + + // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or + // agentic percentile changes. For fixed-seq the JSONB only carries + // median_* / p99_* (no p90_*), so the TTFT button there has to point at + // median_ttft — otherwise the chart goes blank. For agentic, we point at + // the user's chosen percentile so the dropdown actually drives the axis. + useEffect(() => { + const isAgentic = sequenceKind(effectiveSequence) === 'agentic'; + if (selectedXAxisMode === 'ttft') { + setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft'); + } else if (selectedXAxisMode === 'e2e') { + // null = use the chart-config natural x (median_e2el), which useChartData + // rewrites to _e2el for agentic via withPercentile(). 
+ setSelectedE2eXAxisMetric(null); + } + // 'interactivity' mode renders the interactivity chart, which keys off + // selectedXAxisMetric (not the e2e one), so nothing to do here. + }, [selectedXAxisMode, effectiveSequence, selectedPercentile]); + // Ref guard: when true, filter changes don't clear the active preset. // FavoritePresetsDropdown sets this while applying a preset so its own // programmatic setter calls don't accidentally deactivate it. @@ -875,21 +1029,23 @@ export function InferenceProvider({ useUrlStateSync( { i_metric: selectedYAxisMetric, + i_pctl: selectedPercentile, i_gpus: selectedGPUs.join(','), i_dates: selectedDates.join(','), i_dstart: selectedDateRange.startDate, i_dend: selectedDateRange.endDate, i_optimal: hideNonOptimal ? '' : '0', - i_label: showPointLabels ? '1' : '', - i_hc: highContrast ? '1' : '', + i_label: showPointLabels ? '' : '0', + i_hc: highContrast ? '' : '0', i_log: logScale ? '1' : '', i_xmetric: selectedXAxisMetric || '', i_e2e_xmetric: selectedE2eXAxisMetric || '', + i_xmode: selectedXAxisMode, i_scale: scaleType, i_legend: isLegendExpanded ? '' : '0', - i_advlabel: useAdvancedLabels ? '1' : '', + i_advlabel: useAdvancedLabels ? '' : '0', i_gradlabel: showGradientLabels ? '1' : '', - i_linelabel: showLineLabels ? '' : '0', + i_linelabel: showLineLabels ? '1' : '', i_speed: showSpeedOverlay ? '1' : '', i_mc: showMinecraftOverlay ? 
'1' : '', i_active: iActiveStr, @@ -902,6 +1058,7 @@ export function InferenceProvider({ selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, + selectedXAxisMode, scaleType, selectedGPUs, selectedDates, @@ -1066,6 +1223,8 @@ export function InferenceProvider({ setSelectedXAxisMetric, selectedE2eXAxisMetric, setSelectedE2eXAxisMetric, + selectedXAxisMode, + setSelectedXAxisMode: handleSetXAxisMode, scaleType, setScaleType, quickFilters, @@ -1079,6 +1238,8 @@ export function InferenceProvider({ workflowInfo, selectedYAxisMetric, setSelectedYAxisMetric: setSelectedYAxisMetricAndClear, + selectedPercentile, + setSelectedPercentile, selectedGPUs, setSelectedGPUs: setSelectedGPUsAndClear, availableGPUs, @@ -1143,6 +1304,7 @@ export function InferenceProvider({ selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, + selectedXAxisMode, scaleType, quickFilters, availableQuickFilters, diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 8e894d0e..183641d4 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -1,7 +1,7 @@ import { useMemo, useRef } from 'react'; import { useQueries } from '@tanstack/react-query'; -import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants'; +import { rowToSequence } from '@semianalysisai/inferencex-constants'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { @@ -23,9 +23,14 @@ import { getModelSortIndex, hardwareKeyMatchesAnyBase, } from '@/lib/constants'; -import { transformBenchmarkRows } from '@/lib/benchmark-transform'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { + mergeRunScopedRows, + transformBenchmarkRows, + withPercentile, +} from '@/lib/benchmark-transform'; +import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, 
calculatePowerForGpus } from '@/lib/utils'; +import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils'; import { applyQuickFilters, computeAvailableQuickFilters, @@ -33,6 +38,90 @@ import { type QuickFilters, } from '@/components/inference/utils/quickFilters'; +/** + * Chart x-axis variant selected by the mode buttons above the plot. This is + * the single definition — InferenceContext (URL/state) and ChartDisplay + * (buttons, derived-metric remapping) import it from here. + */ +export type XAxisMode = + | 'ttft' + | 'e2e' + | 'normalized-e2e' + | 'interactivity' + | 'session-time' + | 'prefill-tps'; + +export const X_AXIS_MODES: readonly XAxisMode[] = [ + 'ttft', + 'e2e', + 'normalized-e2e', + 'interactivity', + 'session-time', + 'prefill-tps', +]; + +/** + * Modes whose x metric is derived from persisted per-request traces — + * these only exist for agentic scenarios (fixed-seq rows have no + * trace_replay blob to derive them from). + */ +export function isAgenticOnlyXAxisMode(mode: XAxisMode): boolean { + return mode === 'normalized-e2e' || mode === 'session-time' || mode === 'prefill-tps'; +} + +/** + * Compute the set of benchmark_results.id values that sit on the + * (e2e_latency, y) Pareto frontier within each (hwKey, precision, date) + * group. Used to restrict the non-e2e xmode charts (ttft, interactivity, + * session-time, prefill-tps) so they show *only* the points that win on + * end-to-end latency — preventing benchmark-hacking where a config tops + * one axis while tanking the other. + * + * Returns null when the y-metric has no roofline direction declared on + * the e2e chart (caller falls back to no filtering in that case). 
+ */ +function e2eParetoIds( + points: InferenceData[], + selectedYAxisMetric: string, + percentile: string, +): Set | null { + const e2eChartDef = (chartDefinitions as ChartDefinition[]).find((c) => c.chartType === 'e2e'); + if (!e2eChartDef) return null; + const dir = e2eChartDef[`${selectedYAxisMetric}_roofline` as keyof ChartDefinition] as + | ParetoDirection + | undefined; + if (!dir) return null; + const frontierFn = paretoFrontForDirection(dir); + // Percentile-prefixed e2e-latency field name (e.g. 'p90_e2el'). + const e2elField = withPercentile('median_e2el', percentile); + const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; + + // Re-frame each candidate point in (e2el, y) space, then compute the + // pareto per (hwKey, precision, date) bucket — frontiers don't span dates + // (a May 17 point can't dominate a May 15 plot). + const byGroup = new Map(); + for (const p of points) { + const yValue = (p[metricKey] as { y?: number } | undefined)?.y; + const xValue = (p as unknown as Record)[e2elField]; + if (typeof xValue !== 'number' || !Number.isFinite(xValue)) continue; + if (typeof yValue !== 'number' || !Number.isFinite(yValue)) continue; + const key = `${p.hwKey}|${p.precision}|${p.date}`; + let bucket = byGroup.get(key); + if (!bucket) { + bucket = []; + byGroup.set(key, bucket); + } + bucket.push({ ...p, x: xValue, y: yValue }); + } + const ids = new Set(); + for (const bucket of byGroup.values()) { + for (const f of frontierFn(bucket)) { + if (typeof f.id === 'number') ids.add(f.id); + } + } + return ids; +} + /** Build deduplicated comparison dates, excluding the main run date. */ export function buildComparisonDates( selectedGPUs: string[], @@ -92,11 +181,26 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, + selectedPercentile = 'p90', /** When set, only series for these two registry GPU keys are shown (compare pages). 
*/ compareGpuPair?: readonly [string, string] | null, /** - * GitHub run id for the "as of run" view. Set only when an earlier-than-latest - * run is selected; the chart then shows the data as it stood at that run. + * Exact GitHub run id used to pin contested configs while carrying forward + * configs that the selected run did not produce. + */ + selectedRunId?: string, + /** + * Current x-axis mode. When set to anything other than 'e2e', the displayed + * data is filtered to the (e2e-latency, y) Pareto frontier so the ttft / + * interactivity / session-time / prefill-tps charts show only points that + * also win on end-to-end latency — preventing benchmark-hacking where a + * config tops one metric while tanking the other. The 'e2e' mode is the + * source of truth and keeps the full point set. + */ + selectedXAxisMode: XAxisMode = 'e2e', + /** + * GitHub run id for the "as of run" base view. Set only when an + * earlier-than-latest run is selected. */ asOfRunId?: string, /** @@ -118,11 +222,35 @@ export function useChartData( ? '' : selectedRunDate; + // Two queries: the normal latest-per-config view (always), plus the + // run-scoped rows when a specific workflow run is selected. The merged + // result pins ONLY the configs the selected run produced to that run, and + // carries every other config forward from the base rows — selecting one of + // two same-day vLLM runs must not hide the day's SGLang curve just because + // it lives in a different workflow run. The base query is the default view + // query, so it's almost always already in the React Query cache. 
const { - data: allRows, - isLoading: queryLoading, - error: queryError, + data: baseRows, + isLoading: baseLoading, + error: baseError, } = useBenchmarks(selectedModel, queryDate, enabled, asOfRunId); + const { + data: runRows, + isLoading: runLoading, + error: runError, + } = useBenchmarks(selectedModel, '', enabled && Boolean(selectedRunId), selectedRunId, true); + + const allRows = useMemo(() => { + if (!selectedRunId) return baseRows; + // Wait for the run rows before rendering a scoped view — rendering base + // rows first would flash the un-scoped chart, then swap contested points. + if (!runRows) return undefined; + if (!baseRows) return runRows; + return mergeRunScopedRows(runRows, baseRows); + }, [selectedRunId, runRows, baseRows]); + + const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading); + const queryError = baseError ?? (selectedRunId ? runError : null); // GPU comparison: fetch data for each additional comparison date const comparisonDates = useMemo( @@ -155,11 +283,13 @@ export function useChartData( // Merge main rows with comparison date rows. // Stamp each row with the *requested* date (not the actual DB date) so that // GPUGraph's activeDates filter (keyed by user-selected date) matches the points. - const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]); + // + // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via + // benchmark_type), so one filter covers every scenario. 
const rows = useMemo(() => { - if (!allRows || !sequenceIslOsl) return []; - const seqFilter = (r: { isl: number; osl: number }) => - r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl; + if (!allRows) return []; + const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) => + rowToSequence(r) === selectedSequence; const seqFiltered = allRows.filter(seqFilter); // For each (hw, framework, spec_method, disagg, precision) group, keep only @@ -186,14 +316,14 @@ export function useChartData( .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })), ); return [...mainRows, ...extraRows]; - }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]); + }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]); // Transform filtered rows into chart data const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => { if (rows.length === 0) return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig }; - return transformBenchmarkRows(rows); - }, [rows]); + return transformBenchmarkRows(rows, selectedPercentile); + }, [rows, selectedPercentile]); // Sort hardware config — stabilize reference when keys haven't changed. // Different sequences for the same model often have the same GPU configs, @@ -241,8 +371,11 @@ export function useChartData( (chartDefinitions as ChartDefinition[]).map((chartDef) => { const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; - // Determine dynamic x-axis - let xAxisField: keyof AggDataEntry = chartDef.x; + // Default x-axis = chart's natural latency metric, percentile-adjusted + // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic + // scenarios `withPercentile` is a no-op when percentile === 'median'. 
+ const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry; + let xAxisField: keyof AggDataEntry = naturalX; let xAxisLabel = chartDef.x_label; const metricTitle = @@ -252,14 +385,25 @@ export function useChartData( // Resolve the effective x-axis override per chart type const effectiveXMetric = chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric; + // The TTFT override is now any *_ttft metric (not just p90_ttft) — the + // x-axis-mode picker reconciles the percentile prefix based on sequence + // kind (fixed-seq → median, agentic → user-picked percentile). const isTtftOverride = - effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft'; - const ttftLabel = - effectiveXMetric === 'p99_ttft' - ? 'P99 Time To First Token (s)' - : 'Median Time To First Token (s)'; - - if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) { + typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft'); + const ttftPctl = isTtftOverride + ? (effectiveXMetric as string).replace(/_ttft$/u, '') + : 'p90'; + const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase(); + const ttftLabel = `${ttftPctlWord} Time To First Token (s)`; + + const isAgentic = selectedSequence === Sequence.AgenticTraces; + + if ( + effectiveXMetric && + chartDef.chartType === 'interactivity' && + isInputMetric && + !isAgentic + ) { xAxisField = effectiveXMetric as keyof AggDataEntry; const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) { @@ -268,6 +412,10 @@ export function useChartData( xAxisLabel = isTtftOverride ? 
ttftLabel : chartDef.x_label; } } else if (chartDef.chartType === 'interactivity' && isInputMetric) { + // Agentic falls through here too — the manual X-axis dropdown is + // hidden in agentic mode (would double up with the percentile + // selector), so the config default + percentile post-processing + // below drives the x axis. const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition; const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x; @@ -277,12 +425,35 @@ export function useChartData( xAxisLabel = ttftLabel; } + // Agentic: rewrite the resolved x metric to the chosen percentile, + // and relabel accordingly. Both have to be updated unconditionally — + // xAxisField may already be percentile-adjusted (via naturalX) while + // xAxisLabel still carries the raw chartDef.x_label prefix. + // The chart heading ("vs. ") is also rewritten to include + // the percentile so the title above the plot reflects what's drawn. + const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition; + let chartHeading = (chartDef[headingKey] as string) || chartDef.heading; + if (isAgentic) { + xAxisField = withPercentile( + xAxisField as string, + selectedPercentile, + ) as keyof AggDataEntry; + const pctlWord = selectedPercentile.toUpperCase(); + xAxisLabel = xAxisLabel.replace(/^(?:Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord); + chartHeading = chartHeading.replace( + /^(?vs\.\s+)(?:(?:Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu, + `$1${pctlWord} `, + ); + } + // The x-axis is "flipped" only when the good-direction reverses // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). // E2EL → TTFT keeps the same direction ("lower is better" for both), // so no roofline flip is needed for the e2e chart. + // Compare against `naturalX` (percentile-adjusted) — switching the + // percentile of the same logical metric is NOT a flip. 
const xAxisFlipped = - xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride); + xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride); const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition; const dynamicYLabel = chartDef[yLabelKey]; @@ -303,6 +474,7 @@ export function useChartData( chartDefinition: { ...chartDef, ...rooflineOverrides, + heading: chartHeading, x_label: xAxisLabel, y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel), }, @@ -310,7 +482,13 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric], + [ + selectedYAxisMetric, + selectedXAxisMetric, + selectedE2eXAxisMetric, + selectedPercentile, + selectedSequence, + ], ); // Build renderable graphs (data processing + stable chart definitions) @@ -344,9 +522,30 @@ export function useChartData( filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric); + // For AGENTIC workloads only: when the user is NOT viewing the + // e2e latency chart, mark each point with whether it sits on the + // (e2e_latency, y) Pareto frontier for its (hwKey, precision, + // date) group. The chart still renders every point as scatter — + // only e2e-Pareto winners feed the roofline (ScatterGraph honors + // the flag). Prevents benchmark-hacking the TTFT / interactivity + // line by tanking decode (or vice versa) without hiding the + // non-optimal configs from view. + // + // Fixed-seq workloads keep the existing per-axis Pareto since + // there's no separate "session-time" notion of total latency — + // their e2e IS the request latency, so a TTFT hack there reads + // honestly on e2e too. The anti-hack constraint is specifically + // about multi-turn agentic where TTFT measures a tiny fraction + // of the user-visible session time. 
+ const isAgentic = selectedSequence === Sequence.AgenticTraces; + const e2eParetoSet = + isAgentic && selectedXAxisMode !== 'e2e' + ? e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile) + : null; + // Filter to points that have the selected metric, then remap x/y const hasMetric = filteredData.some((d) => metricKey in d); - const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft'; + const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft'); const processedData = hasMetric ? filteredData .filter((d) => metricKey in d) @@ -359,18 +558,26 @@ export function useChartData( // d.x would otherwise mask the regression). const xCandidate = (d as Partial)[xAxisField]; const xValue = typeof xCandidate === 'number' ? xCandidate : d.x; + const isOnE2eFrontier = + e2eParetoSet === null + ? undefined + : typeof d.id === 'number' && e2eParetoSet.has(d.id); return { ...d, x: xValue, y: yValue, roof, + isOnE2eFrontier, }; }) - // When TTFT is on the x-axis, apply the latency limit to filter overload outliers - // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left) + // When TTFT is on the x-axis, apply the latency limit to filter + // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that + // compress all real data to the far left). Skip for agentic — long + // TTFTs there reflect real workloads (multi-turn, big prompts). 
.filter( (d) => !isTtftX || + isAgentic || !chartDefinition.y_latency_limit || d.x <= chartDefinition.y_latency_limit, ) @@ -395,6 +602,8 @@ export function useChartData( userPowers, stableChartDefinitions, compareGpuPair, + selectedXAxisMode, + selectedPercentile, quickFilters, ]); diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json index d9a29181..9617638f 100644 --- a/packages/app/src/components/inference/inference-chart-config.json +++ b/packages/app/src/components/inference/inference-chart-config.json @@ -13,9 +13,9 @@ "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)", "y_inputTputPerGpu_title": "Input Token Throughput per GPU", "y_inputTputPerGpu_roofline": "upper_left", - "y_inputTputPerGpu_x": "p99_ttft", - "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)", - "y_inputTputPerGpu_heading": "vs. P99 Time To First Token", + "y_inputTputPerGpu_x": "p90_ttft", + "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)", + "y_inputTputPerGpu_heading": "vs. 
P90 Time To First Token", "y_outputTputPerGpu": "outputTputPerGpu.y", "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)", "y_outputTputPerGpu_title": "Output Token Throughput per GPU", @@ -126,8 +126,8 @@ "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)", "y_inputTputPerGpu_title": "Input Token Throughput per GPU", "y_inputTputPerGpu_roofline": "upper_right", - "y_inputTputPerGpu_x": "p99_ttft", - "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)", + "y_inputTputPerGpu_x": "p90_ttft", + "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)", "y_outputTputPerGpu": "outputTputPerGpu.y", "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)", "y_outputTputPerGpu_title": "Output Token Throughput per GPU", diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts index 91db3d40..91761604 100644 --- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts +++ b/packages/app/src/components/inference/replay/buildReplayTimeline.ts @@ -107,8 +107,7 @@ function resolveXAxisField( const metricTitle = (chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || ''; const isInputMetric = metricTitle.toLowerCase().includes('input'); - const isTtftOverride = - selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + const isTtftOverride = selectedXAxisMetric === 'p90_ttft'; if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { return selectedXAxisMetric; diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index ecf2fe33..5d0981b8 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -80,6 +80,8 @@ export interface WorkerPower { * @property {number} p99_e2el - 99th percentile of End-to-End Latency. 
*/ export interface AggDataEntry { + /** Stable per-point id from benchmark_results — for trace_replay lookups. */ + id?: number; hw: string; mtp?: string; hwKey: string; @@ -94,23 +96,43 @@ export interface AggDataEntry { mean_ttft: number; median_ttft: number; std_ttft: number; + p75_ttft: number; + p90_ttft: number; + p95_ttft: number; p99_ttft: number; + 'p99.9_ttft': number; mean_tpot: number; mean_intvty: number; median_tpot: number; median_intvty: number; std_tpot: number; std_intvty: number; + p75_tpot: number; + p75_intvty: number; + p90_tpot: number; + p90_intvty: number; + p95_tpot: number; + p95_intvty: number; p99_tpot: number; p99_intvty: number; + 'p99.9_tpot': number; + 'p99.9_intvty': number; mean_itl: number; median_itl: number; std_itl: number; + p75_itl: number; + p90_itl: number; + p95_itl: number; p99_itl: number; + 'p99.9_itl': number; mean_e2el: number; median_e2el: number; std_e2el: number; + p75_e2el: number; + p90_e2el: number; + p95_e2el: number; p99_e2el: number; + 'p99.9_e2el': number; // Measured GPU telemetry (emitted by runner's aggregate_power.py). // Optional because historical runs predate the fields. avg_power_w?: number; @@ -162,6 +184,29 @@ export interface AggDataEntry { actualDate?: string; /** URL to the GitHub Actions workflow run that produced this data point. */ run_url?: string; + /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */ + benchmark_type?: string; + /** ISL in tokens — null for agentic_traces. */ + isl?: number | null; + /** OSL in tokens — null for agentic_traces. */ + osl?: number | null; + // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ── + /** "on" | "off" — whether KV cache offload to CPU was enabled. */ + offload_mode?: string; + /** Actual server-observed GPU prefix-cache hit rate (0..1). */ + server_gpu_cache_hit_rate?: number; + /** Actual server-observed CPU prefix-cache hit rate (0..1). 
*/ + server_cpu_cache_hit_rate?: number; + /** Infinite-cache theoretical hit rate (0..1) computed from trace. */ + theoretical_cache_hit_rate?: number; + /** Total requests attempted during the window. */ + num_requests_total?: number; + /** Requests that completed successfully. */ + num_requests_successful?: number; + /** Total prompt tokens served. */ + total_prompt_tokens?: number; + /** Total generated (output) tokens. */ + total_generation_tokens?: number; } /** @@ -187,6 +232,17 @@ export interface InferenceData extends Partial void; + /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */ + selectedPercentile: string; + setSelectedPercentile: (p: string) => void; selectedXAxisMetric: string | null; setSelectedXAxisMetric: (metric: string | null) => void; selectedE2eXAxisMetric: string | null; setSelectedE2eXAxisMetric: (metric: string | null) => void; + /** + * Which chart variant the user wants to see — the inference card shows one chart + * at a time, picked by the big buttons above the chart. 
+ * - 'ttft' → e2e chartType with x-axis forced to p90_ttft + * - 'e2e' → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el) + * - 'normalized-e2e'→ agentic-only; x = per-request E2E normalized to 400 output tokens + * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty) + * - 'session-time' → agentic-only; x = mean-normalized session time (live-computed from trace blobs) + * - 'prefill-tps' → agentic-only; x = mean of P90 prefill TPS/user per session + */ + selectedXAxisMode: + | 'ttft' + | 'e2e' + | 'normalized-e2e' + | 'interactivity' + | 'session-time' + | 'prefill-tps'; + setSelectedXAxisMode: ( + mode: 'ttft' | 'e2e' | 'normalized-e2e' | 'interactivity' | 'session-time' | 'prefill-tps', + ) => void; scaleType: 'auto' | 'linear' | 'log'; setScaleType: (type: 'auto' | 'linear' | 'log') => void; /** Coarse vendor / framework / agg-disagg / mtp-stp filters applied to the chart point set. */ diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 84db5e1f..9f333482 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -1,6 +1,6 @@ 'use client'; -import { useMemo, useState } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import { track } from '@/lib/analytics'; import { useFeatureGate } from '@/lib/use-feature-gate'; @@ -9,7 +9,8 @@ import { cn } from '@/lib/utils'; import { useInference } from '@/components/inference/InferenceContext'; import { ModelSelector, - SequenceSelector, + ScenarioSelector, + PercentileSelector, PrecisionSelector, } from '@/components/ui/chart-selectors'; import { DateRangePicker } from '@/components/ui/date-range-picker'; @@ -28,7 +29,7 @@ import { Button } from '@/components/ui/button'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { ChartDefinition, 
DisaggMode, SpecMode } from '@/components/inference/types'; import { FRAMEWORK_FAMILIES } from '@/components/inference/utils/quickFilters'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model, type Percentile } from '@/lib/data-mappings'; /** * Y-axis metric options from static chart config JSON — available immediately, no API wait. @@ -109,6 +110,13 @@ interface ChartControlsProps { } export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) { + // The percentile selector is rendered conditionally on `selectedSequence`, + // which on the client is hydrated from URL params. SSR doesn't see the URL, + // so deferring the conditional until after mount keeps the initial DOM + // identical between server and client (avoids hydration warnings). + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const [openDropdown, setOpenDropdown] = useState(null); const handleDropdownOpenChange = (dropdownKey: string) => (open: boolean) => { if (open) { @@ -117,6 +125,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro } setOpenDropdown((current) => (current === dropdownKey ? 
null : current)); }; + const { selectedModel, setSelectedModel, @@ -126,6 +135,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro setSelectedPrecisions, selectedYAxisMetric, setSelectedYAxisMetric, + selectedPercentile, + setSelectedPercentile, graphs, selectedGPUs, setSelectedGPUs, @@ -354,14 +365,21 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro availableModels={availableModels} data-testid="model-selector" /> - + {mounted && selectedSequence === Sequence.AgenticTraces && ( + setSelectedPercentile(p)} + data-testid="percentile-selector" + /> + )} {graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') && - isInputMetric && ( + isInputMetric && + selectedSequence !== Sequence.AgenticTraces && (
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 882b6f93..6952f439 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -1,9 +1,12 @@ 'use client'; -import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants'; +import { + DISPLAY_MODEL_TO_DB, + NORMALIZED_E2E_OUTPUT_TOKENS, +} from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import dynamic from 'next/dynamic'; import { useEffect, useMemo, useRef, useState } from 'react'; -import { BarChart3, ChevronDown, Table2, X } from 'lucide-react'; +import { BarChart3, Table2, X } from 'lucide-react'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import { useInference } from '@/components/inference/InferenceContext'; @@ -14,7 +17,10 @@ import type { OverlayData, TrendDataPoint, } from '@/components/inference/types'; -import { processOverlayChartData } from '@/components/inference/utils'; +import { + processOverlayChartData, + selectUnofficialOverlayForMode, +} from '@/components/inference/utils'; import { isRunComparisonEntry, makeRunComparisonEntry, @@ -38,7 +44,6 @@ import { DialogHeader, DialogTitle, } from '@/components/ui/dialog'; -import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; import { Skeleton } from '@/components/ui/skeleton'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { @@ -48,8 +53,14 @@ import { getModelLabel, getPrecisionLabel, getSequenceLabel, + sequenceKind, } from '@/lib/data-mappings'; import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs'; +import { + useDerivedAgenticMetrics, + type DerivedAgenticMetric, +} from '@/hooks/api/use-derived-agentic-metrics'; +import { isAgenticOnlyXAxisMode, type XAxisMode } from '@/components/inference/hooks/useChartData'; 
import { useTrendData } from '@/components/inference/hooks/useTrendData'; import { getHardwareConfig, hardwareKeyMatchesAnyBase } from '@/lib/constants'; @@ -67,55 +78,58 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra }); import WorkflowInfoDisplay from './WorkflowInfoDisplay'; -/** Controlled popover dropdown for the e2e chart x-axis toggle. */ -function E2eXAxisDropdown({ - xAxisLabel, - xAxisOptions, - selectedValue, - onSelect, -}: { - xAxisLabel: string; - xAxisOptions: { value: string | null; label: string }[]; - selectedValue: string | null; - onSelect: (value: string | null) => void; -}) { - const [open, setOpen] = useState(false); - return ( - - - - - - {xAxisOptions.map((opt) => ( - - ))} - - - ); +type InferenceViewMode = 'chart' | 'table'; + +const X_AXIS_MODE_BUTTONS: { value: XAxisMode; label: string }[] = [ + { value: 'ttft', label: 'TTFT' }, + { value: 'e2e', label: 'E2E Latency' }, + { value: 'normalized-e2e', label: 'Normalized E2E' }, + { value: 'interactivity', label: 'Interactivity' }, + { value: 'session-time', label: 'Session Time' }, + { value: 'prefill-tps', label: 'Prefill TPS / user' }, +]; + +/** + * Presentation + data plumbing for the trace-derived x-axis modes (the + * agentic-only modes). One spec per mode keeps the x-label, chart heading, + * roofline corner, and derived-metric accessor in sync instead of scattering + * `selectedXAxisMode === …` conditionals through the render. + */ +interface DerivedXModeSpec { + xLabel: (percentileLabel: string) => string; + /** Chart heading suffix ("vs. …") shown above the plot. */ + heading: (percentileLabel: string) => string; + rooflineCorner: 'upper_right' | 'upper_left'; + /** Pull the raw metric for this mode off the derived-metrics payload. */ + value: (m: DerivedAgenticMetric | undefined, percentile: string) => number | null | undefined; + /** Convert the raw metric to the plotted x value. 
*/ + toX: (raw: number) => number; } -type InferenceViewMode = 'chart' | 'table'; +const DERIVED_X_MODE_SPECS: Partial> = { + 'session-time': { + xLabel: () => 'Mean Normalized Session Time (min)', + heading: () => 'vs. Mean Normalized Session Time', + rooflineCorner: 'upper_right', + value: (m) => m?.normalized_session_time_s, + toX: (raw) => raw / 60, + }, + 'normalized-e2e': { + xLabel: (pctl) => `${pctl} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens (s)`, + heading: (pctl) => `vs. ${pctl} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens`, + rooflineCorner: 'upper_right', + value: (m, percentile) => + percentile === 'p75' ? m?.p75_normalized_e2e_400_s : m?.p90_normalized_e2e_400_s, + toX: (raw) => raw, + }, + 'prefill-tps': { + xLabel: () => 'P90 Prefill TPS per user (tok/s)', + heading: () => 'vs. P90 Prefill TPS / user', + rooflineCorner: 'upper_left', + value: (m) => m?.p90_prefill_tps_per_user, + toX: (raw) => raw, + }, +}; const VIEW_MODE_OPTIONS: SegmentedToggleOption[] = [ { @@ -161,8 +175,10 @@ export default function ChartDisplay() { logScale, activeHwTypes, activeDates, - setSelectedE2eXAxisMetric, + selectedPercentile, compareGpuPair, + selectedXAxisMode, + setSelectedXAxisMode, } = useInference(); const { @@ -171,6 +187,9 @@ export default function ChartDisplay() { totalDatesQueried, } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates); + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const modelDbKeys = useMemo( () => DISPLAY_MODEL_TO_DB[selectedModel] ?? 
[selectedModel], [selectedModel], @@ -278,6 +297,7 @@ export default function ChartDisplay() { chartType, selectedYAxisMetric, effectiveXMetric, + { isAgentic: sequenceKind(selectedSequence) === 'agentic' }, ); let overlayPoints = processed; @@ -395,238 +415,267 @@ export default function ChartDisplay() { })); }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]); - const displayGraphs = isFirstLoad - ? Array.from({ length: 2 }).map((_, index) => ( - - - - - - )) - : effectiveGraphs.length === 0 - ? [] - : effectiveGraphs.map((graph, graphIndex) => { - const isTimelineMode = Boolean( - selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, - ); - const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; - return ( -
-
- handleViewModeChange(graphIndex, v)} - ariaLabel="View mode" - testId={`inference-view-toggle-${graphIndex}`} - /> - } - hideImageExport={getViewMode(graphIndex) === 'table'} - setIsLegendExpanded={setIsLegendExpanded} - exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} - onExportMp4={ - replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined - } - onExportCsv={() => { - const visibleData = graph.data.filter((d) => + const visibleGraphs = useMemo(() => { + const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e'; + const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType); + return filtered.length > 0 ? filtered : effectiveGraphs; + }, [effectiveGraphs, selectedXAxisMode]); + + const isAgenticSequence = sequenceKind(selectedSequence) === 'agentic'; + const useDerived = isAgenticSequence && isAgenticOnlyXAxisMode(selectedXAxisMode); + const derivedTargetIds = useMemo(() => { + if (!useDerived) return [] as number[]; + const ids = new Set(); + for (const graph of visibleGraphs) { + for (const point of graph.data) { + if (point.benchmark_type === 'agentic_traces' && typeof point.id === 'number') { + ids.add(point.id); + } + } + } + return [...ids]; + }, [useDerived, visibleGraphs]); + const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived); + const derivedMetrics = derivedQuery.data; + const isDerivedLoading = + useDerived && + derivedTargetIds.length > 0 && + (derivedQuery.isPending || derivedQuery.isFetching) && + !derivedMetrics; + + // Set only when the user is on a derived (agentic-only) x-axis mode; the + // specs are module constants so this is referentially stable per mode. + const derivedSpec = useDerived ? 
DERIVED_X_MODE_SPECS[selectedXAxisMode] : undefined; + + const renderableGraphs = useMemo(() => { + if (!derivedSpec) return visibleGraphs; + if (!derivedMetrics) return visibleGraphs.map((graph) => ({ ...graph, data: [] })); + const xLabel = derivedSpec.xLabel(selectedPercentile.toUpperCase()); + return visibleGraphs.map((graph) => { + const chartDefinition = { + ...graph.chartDefinition, + x_label: xLabel, + y_latency_limit: undefined, + [`${selectedYAxisMetric}_roofline` as keyof typeof graph.chartDefinition]: + derivedSpec.rooflineCorner, + }; + const data = graph.data + .map((point) => { + if (typeof point.id !== 'number') return null; + const raw = derivedSpec.value(derivedMetrics[point.id], selectedPercentile); + if (raw === null || raw === undefined || !Number.isFinite(raw)) return null; + return { ...point, x: derivedSpec.toX(raw) }; + }) + .filter((point): point is NonNullable => point !== null); + return { ...graph, chartDefinition, data }; + }); + }, [derivedSpec, visibleGraphs, derivedMetrics, selectedYAxisMetric, selectedPercentile]); + + const displayGraphs = + isFirstLoad || isDerivedLoading + ? [ + + + + + , + ] + : renderableGraphs.length === 0 + ? [] + : renderableGraphs.map((graph, graphIndex) => { + const isTimelineMode = Boolean( + selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, + ); + const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; + return ( +
+
+ - activeOverlayHwTypes.has(p.hwKey as string) && - selectedPrecisions.includes(p.precision), - ); - const issueNotes = matchKnownConfigIssues(graph.model, [ - ...visibleData, - ...visibleOverlayRows, - ]).map((issue) => - knownIssueCsvNote( - issue, - getDisplayLabel(getHardwareConfig(issue.hwKey, graph.model)), - ), - ); - exportToCsv( - `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, - headers, - rows, - issueNotes, - ); - }} - /> - - {(() => { - const chartCaption = ( - <> -

- { - graph.chartDefinition[ - `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] - }{' '} - {(() => { - // For Input metrics with dynamic x-axis, use dynamic heading - const metricTitle = - (graph.chartDefinition[ + ? 'gpu_timeseries' + : graph.chartDefinition.chartType === 'e2e' + ? 'latency' + : 'interactivity' + } + leadingControls={ + handleViewModeChange(graphIndex, v)} + ariaLabel="View mode" + testId={`inference-view-toggle-${graphIndex}`} + /> + } + hideImageExport={getViewMode(graphIndex) === 'table'} + setIsLegendExpanded={setIsLegendExpanded} + exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} + onExportMp4={ + replayAvailable + ? () => replayHandlesRef.current[graphIndex]?.open() + : undefined + } + onExportCsv={() => { + const visibleData = graph.data.filter((d) => + isTimelineMode + ? activeDates.has(`${d.date}_${d.hwKey}`) + : activeHwTypes.has(d.hwKey as string) && + selectedPrecisions.includes(d.precision), + ); + const { headers, rows } = inferenceChartToCsv( + visibleData, + graph.model, + graph.sequence, + ); + // Match warnings against the same series the chart annotates, + // including visible unofficial-run overlay series. + const overlay = selectUnofficialOverlayForMode( + selectedXAxisMode, + graph.chartDefinition.chartType, + overlayDataByChartType, + ); + const visibleOverlayRows = isTimelineMode + ? [] + : (overlay?.data ?? []).filter( + (p) => + activeOverlayHwTypes.has(p.hwKey as string) && + selectedPrecisions.includes(p.precision), + ); + const issueNotes = matchKnownConfigIssues(graph.model, [ + ...visibleData, + ...visibleOverlayRows, + ]).map((issue) => + knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))), + ); + exportToCsv( + `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, + headers, + rows, + issueNotes, + ); + }} + /> + + {(() => { + const chartCaption = ( + <> +

+ { + graph.chartDefinition[ `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] as string) || ''; - const isInputMetric = metricTitle.toLowerCase().includes('input'); - if ( - graph.chartDefinition.chartType === 'interactivity' && - isInputMetric && - selectedXAxisMetric - ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; + ] + }{' '} + {(() => { + // For Input metrics with dynamic x-axis, use dynamic heading + const metricTitle = + (graph.chartDefinition[ + `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition + ] as string) || ''; + const isInputMetric = metricTitle.toLowerCase().includes('input'); + if ( + graph.chartDefinition.chartType === 'interactivity' && + isInputMetric && + selectedXAxisMetric + ) { + if (selectedXAxisMetric === 'p99_ttft') { + return 'vs. P99 Time To First Token'; + } else if (selectedXAxisMetric === 'median_ttft') { + return 'vs. Median Time To First Token'; + } + } + + // The e2e chart heading follows the branch-level x-axis mode + // selector, including agentic-only derived metrics. + if (graph.chartDefinition.chartType === 'e2e') { + const modeSpec = DERIVED_X_MODE_SPECS[selectedXAxisMode]; + if (modeSpec) { + return modeSpec.heading(selectedPercentile.toUpperCase()); + } + if (selectedE2eXAxisMetric?.endsWith('_ttft')) { + const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); + const word = + percentile === 'median' ? 'Median' : percentile.toUpperCase(); + return `vs. ${word} Time To First Token`; + } + return isAgenticSequence + ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency` + : 'vs. End-to-end Latency'; } - } - // For e2e chart: render clickable inline dropdown for x-axis - if (graph.chartDefinition.chartType === 'e2e') { - const xAxisLabel = - selectedE2eXAxisMetric === 'p99_ttft' - ? 
'P99 TTFT' - : selectedE2eXAxisMetric === 'median_ttft' - ? 'Median TTFT' - : 'End-to-end Latency'; - const xAxisOptions = [ - { value: null, label: 'End-to-end Latency' }, - { value: 'p99_ttft', label: 'P99 TTFT' }, - { value: 'median_ttft', label: 'Median TTFT' }, - ]; - const zoomPrefix = - selectedDateRange.startDate && - selectedDateRange.endDate && - selectedGPUs.length > 0 - ? 'gpu_timeseries' - : 'latency'; + // Fall back to configured heading return ( - { - setSelectedE2eXAxisMetric(value); - track('latency_x_axis_metric_selected', { - metric: value ?? 'median_e2el', - }); - window.dispatchEvent( - new CustomEvent( - `${zoomPrefix}_zoom_reset_chart-${graphIndex}`, - ), - ); - }} - /> + graph.chartDefinition[ + `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition + ] || graph.chartDefinition.heading ); - } - - // Fall back to configured heading - return ( - graph.chartDefinition[ - `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition - ] || graph.chartDefinition.heading - ); - })()} -

-

- {getModelLabel(graph.model as Model)} •{' '} - {selectedPrecisions - .map((prec) => getPrecisionLabel(prec as Precision)) - .join(', ')}{' '} - • {getSequenceLabel(graph.sequence as Sequence)} •{' '} - {isUnofficialRun - ? 'Source: UNOFFICIAL' - : 'Source: SemiAnalysis InferenceX™'} - {selectedRunDate && ( - <> - {' '} - • Updated:{' '} - {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( - 'en-US', - { - year: 'numeric', - month: '2-digit', - day: '2-digit', - timeZone: 'UTC', - }, - )} - + })()} +

+

+ {getModelLabel(graph.model as Model)} •{' '} + {selectedPrecisions + .map((prec) => getPrecisionLabel(prec as Precision)) + .join(', ')}{' '} + • {getSequenceLabel(graph.sequence as Sequence)} •{' '} + {isUnofficialRun + ? 'Source: UNOFFICIAL' + : 'Source: SemiAnalysis InferenceX™'} + {selectedRunDate && ( + <> + {' '} + • Updated:{' '} + {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( + 'en-US', + { + year: 'numeric', + month: '2-digit', + day: '2-digit', + timeZone: 'UTC', + }, + )} + + )} +

+ + {isUnofficialRun && selectedXAxisMode === 'normalized-e2e' && ( +

+ Normalized E2E requires persisted per-request traces, so + unofficial-run overlays are unavailable for this experimental view. +

)} -

- - - - ); - - if (getViewMode(graphIndex) === 'table') { - const overlay = - graph.chartDefinition.chartType === 'e2e' - ? overlayDataByChartType.e2e - : overlayDataByChartType.interactivity; - const overlayRows = (overlay?.data ?? []).filter((p) => - selectedPrecisions.includes(p.precision), - ); - return ( - <> - {chartCaption} - 0 ? [...graph.data, ...overlayRows] : graph.data - } - chartDefinition={graph.chartDefinition} - selectedYAxisMetric={selectedYAxisMetric} - /> + ); - } - return selectedGPUs.length > 0 && - ((selectedDateRange.startDate && selectedDateRange.endDate) || - selectedDates.length > 0) ? ( - - ) : ( -
- + selectedPrecisions.includes(p.precision), + ); + return ( + <> + {chartCaption} + 0 + ? [...graph.data, ...overlayRows] + : graph.data + } + chartDefinition={graph.chartDefinition} + selectedYAxisMetric={selectedYAxisMetric} + /> + + ); + } + + return selectedGPUs.length > 0 && + ((selectedDateRange.startDate && selectedDateRange.endDate) || + selectedDates.length > 0) ? ( + - {selectedGPUs.length > 0 && - (!selectedDateRange.startDate || !selectedDateRange.endDate) && - selectedDates.length === 0 && ( -
-

- Select a date range or add a run to view GPU comparison -

-
- )} -
- ); - })()} - {replayAvailable && ( - { - replayHandlesRef.current[graphIndex] = handle; - }} - parentChartId={`chart-${graphIndex}`} - chartDefinition={graph.chartDefinition} - yLabel={`${ - graph.chartDefinition[ - `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition - ] - }`} - xLabel={graph.chartDefinition.x_label} - /> - )} -
-
-
- ); - }); + ) : ( +
+ + {selectedGPUs.length > 0 && + (!selectedDateRange.startDate || !selectedDateRange.endDate) && + selectedDates.length === 0 && ( +
+

+ Select a date range or add a run to view GPU comparison +

+
+ )} +
+ ); + })()} + {replayAvailable && ( + { + replayHandlesRef.current[graphIndex] = handle; + }} + parentChartId={`chart-${graphIndex}`} + chartDefinition={graph.chartDefinition} + yLabel={`${ + graph.chartDefinition[ + `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition + ] + }`} + xLabel={graph.chartDefinition.x_label} + /> + )} + +
+
+ ); + }); return (
@@ -733,6 +800,41 @@ export default function ChartDisplay() { )} +
+ {X_AXIS_MODE_BUTTONS.filter(({ value }) => { + if (!isAgenticOnlyXAxisMode(value)) return true; + // Before mount, render all buttons so SSR and first client render match. + if (!mounted) return true; + return isAgenticSequence; + }).map(({ value, label }) => { + const isActive = selectedXAxisMode === value; + return ( + + ); + })} +
{displayGraphs}
{/* Performance Over Time — Modal Drill-Down */} diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index df22b8f5..a8cfed48 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -12,6 +12,7 @@ import { getChartWatermark } from '@/lib/data-mappings'; import { generateGpuDateColors } from '@/lib/dynamic-colors'; import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils'; import { useThemeColors } from '@/hooks/useThemeColors'; +import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; import { D3Chart } from '@/lib/d3-chart/D3Chart'; import type { CustomLayerConfig, @@ -26,6 +27,7 @@ import { formatLargeNumber, getShapeKeyForPrecision, logTickFormat, + POINT_SIZE, } from '@/lib/chart-rendering'; import { paretoFrontLowerLeft, @@ -259,6 +261,20 @@ const GPUGraph = React.memo( return pts; }, [groupedData, activeDates, hideNonOptimal, optimalPointKeys]); + // GPU comparison currently renders official DB-backed points only. Unofficial + // overlays have no benchmark_results id or persisted trace, so they cannot + // open the dedicated per-point charts route. + const agenticIds = useMemo( + () => + filteredData.flatMap((point) => + point.benchmark_type === 'agentic_traces' && typeof point.id === 'number' + ? [point.id] + : [], + ), + [filteredData], + ); + const { data: traceAvailability } = useTraceAvailability(agenticIds); + // Warning annotations for visible series with known upstream issues — // same treatment the scatter view gets, applied to the date-comparison view. // Lines here are colored per (gpu, date) pair, so take the first active @@ -755,7 +771,11 @@ const GPUGraph = React.memo( config: { getColor, hideLabels: !showPointLabels, - getLabelText: (d) => (useAdvancedLabels ? 
getPointLabel(d) : String(d.tp)), + // Match ScatterGraph: append the concurrency (C=) to the + // parallelism/tp label so compare-mode points are annotated the + // same way as the single-run scatter chart. + getLabelText: (d) => + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { series: (d) => `${d.date}_${d.hwKey}`, @@ -794,6 +814,7 @@ const GPUGraph = React.memo( selectedYAxisMetric, hardwareConfig, runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined, + hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false, }), getRulerX: (d, xScale) => (xScale as d3.ScaleLinear)(d.x), getRulerY: (d, yScale) => (yScale as d3.ScaleLinear)(d.y), @@ -807,6 +828,37 @@ const GPUGraph = React.memo( sel.select('.visible-shape') as any, getShapeKeyForPrecision(d.precision, selectedPrecisions), ), + onPointClick: (d: InferenceData) => { + track('gpu_timeseries_data_point_clicked', { + id: d.id, + hw: String(d.hwKey), + x: d.x, + y: d.y, + }); + const tooltipEl = chartRef.current?.getTooltipElement(); + if (!tooltipEl) return; + const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]'); + if (!viewBtn || typeof d.id !== 'number') return; + viewBtn.addEventListener('click', (event) => { + event.stopPropagation(); + track('gpu_timeseries_view_charts_opened', { + id: d.id, + hwKey: String(d.hwKey), + conc: d.conc, + }); + }); + // Pinning updates D3Chart's React state. GPU comparison rebuilds + // several inline layer configs on that render, whose cleanup can + // briefly hide the otherwise-pinned portal tooltip. Restore its + // pinned visibility after that render settles. 
+ requestAnimationFrame(() => { + const pinnedTooltip = chartRef.current?.getTooltipElement(); + if (!pinnedTooltip || chartRef.current?.getPinnedPoint() !== d) return; + pinnedTooltip.style.opacity = '1'; + pinnedTooltip.style.display = 'block'; + pinnedTooltip.style.pointerEvents = 'auto'; + }); + }, attachToLayer: 1, }} onRender={(ctx: RenderContext) => { @@ -819,6 +871,28 @@ const GPUGraph = React.memo( } // Set foreground color on scatter point labels ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)'); + + // Offload halo: dashed ring on every point that used KV offload + // (mirrors ScatterGraph so compare mode shows the same CPU-offload + // indicator). The ring is a child of the dot-group, so it travels + // with the point on zoom/pan without a separate onZoom pass. + ctx.layout.zoomGroup + .selectAll('.dot-group') + .each(function (d) { + const showHalo = d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); }} legendElement={ ({ useUnofficialRun: () => overlayState.current, })); +// ScatterGraph calls useTraceAvailability (a useQuery) for the agentic "View +// charts" tooltip button. Stub it so these decoration tests don't need a +// QueryClientProvider — trace presence is irrelevant to the toggle path. 
+vi.mock('@/hooks/api/use-trace-availability', () => ({ + useTraceAvailability: () => ({ data: undefined }), +})); + import ScatterGraph from './ScatterGraph'; // ── Environment stubs ──────────────────────────────────────────────────────── diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 64a8b218..fe4ca820 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -6,6 +6,7 @@ import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef } from import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; +import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; import { pointNearestX } from '@/components/inference/ui/line-label-anchor'; import { labelOpacityForActiveState, @@ -15,7 +16,12 @@ import ChartLegend from '@/components/ui/chart-legend'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { computeToggle } from '@/hooks/useTogglableSet'; import { getHardwareConfig, getModelSortIndex } from '@/lib/constants'; -import { getChartWatermark, getPrecisionLabel, type Precision } from '@/lib/data-mappings'; +import { + getChartWatermark, + getPrecisionLabel, + type Precision, + Sequence, +} from '@/lib/data-mappings'; import { matchKnownConfigIssues, pointMatchesIssue } from '@/lib/known-issues'; import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils'; import { D3Chart } from '@/lib/d3-chart/D3Chart'; @@ -44,12 +50,7 @@ import { getShapeKeyForPrecision, } from '@/lib/chart-rendering'; import { useThemeColors } from '@/hooks/useThemeColors'; -import { - paretoFrontLowerLeft, - paretoFrontLowerRight, - paretoFrontUpperLeft, - paretoFrontUpperRight, -} from '@/lib/chart-utils'; +import { paretoFrontForDirection, type ParetoDirection } from 
'@/lib/chart-utils'; import { type RooflineDirection, getSpeedOverlayCorners } from '@/lib/speed-overlay'; import type { ChartDefinition, @@ -76,6 +77,96 @@ import { } from '@/components/inference/utils/knownIssueAnnotations'; import { matchesQuickFilters } from '@/components/inference/utils/quickFilters'; +// Greedy label-collision avoidance. +// Each candidate is the y-position of the FIRST baseline (relative to point +// center) which we apply via the first tspan's `dy` — later tspans cascade +// down by 1.1em. We try above/below at primary and secondary offsets, and +// hide the label if all four positions collide. +function avoidLabelCollisions( + zoomGroup: d3.Selection, +): void { + interface LabelInfo { + el: SVGTextElement; + firstTspan: SVGTSpanElement; + cx: number; + cy: number; + w: number; + nLines: number; + defaultFirstY: number; + } + const labels: LabelInfo[] = []; + const ASCENT = 9; + const DESCENT = 3; + const LINE_H = 11; + + zoomGroup.selectAll('.dot-group').each(function () { + const labelEl = this.querySelector('.point-label'); + if (!labelEl) return; + if ((this as SVGGElement).style.opacity === '0') return; + const tspans = labelEl.querySelectorAll('tspan'); + if (tspans.length === 0) return; + const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; + const m = transform.match(/translate\((?[^,]+),(?[^)]+)\)/u); + if (!m) return; + const cx = parseFloat(m[1]); + const cy = parseFloat(m[2]); + const nLines = tspans.length; + const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point + // Reset to default before measuring so prior positioning doesn't bias bbox + tspans[0].setAttribute('dy', `${defaultFirstY}px`); + labelEl.style.opacity = '1'; + const bbox = labelEl.getBBox(); + labels.push({ + el: labelEl, + firstTspan: tspans[0], + cx, + cy, + w: bbox.width, + nLines, + defaultFirstY, + }); + }); + + labels.sort((a, b) => a.cx - b.cx); + const placed: { left: number; right: number; top: number; bottom: number }[] = []; + const pad = 2; + + for (const lab of labels) { + const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT; + const aboveFirstY = lab.defaultFirstY; + const belowFirstY = 14; // first baseline 14px below point center + const candidates = [ + aboveFirstY, + belowFirstY, + aboveFirstY - blockH - 2, + belowFirstY + blockH + 2, + ]; + let chosenY: number | null = null; + let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; + for (const firstY of candidates) { + const top = lab.cy + firstY - ASCENT - pad; + const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad; + const left = lab.cx - lab.w / 2 - pad; + const right = lab.cx + lab.w / 2 + pad; + const collides = placed.some( + (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), + ); + if (!collides) { + chosenY = firstY; + chosenBox = { left, right, top, bottom }; + break; + } + } + if (chosenY !== null && chosenBox) { + lab.firstTspan.setAttribute('dy', `${chosenY}px`); + lab.el.style.opacity = '1'; + placed.push(chosenBox); + } else { + lab.el.style.opacity = '0'; + } + } +} + // X-shape path for overlay (unofficial) data points const X_SIZE = 5; const X_HOVER_SIZE = 7; @@ -108,6 +199,32 @@ const formatChangelogDescription = 
(desc: string | string[]): React.JSX.Element const CHART_MARGIN = { top: 24, right: 10, bottom: 60, left: 60 }; +/** + * Bucket points by their (requested) date. Comparison overlays put multiple + * dates under one legend key, and rooflines / gradient paths must never span + * dates — a May 15 point can't dominate a May 17 plot. + */ +function groupPointsByDate(points: InferenceData[]): Map { + const byDate = new Map(); + for (const p of points) { + let bucket = byDate.get(p.date); + if (!bucket) { + bucket = []; + byDate.set(p.date, bucket); + } + bucket.push(p); + } + return byDate; +} + +/** Identity key for "is this point on a roofline" lookups (scoped per date). */ +const optimalPointKey = (d: InferenceData): string => + `${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`; + +/** Point label lines: TP (or full parallelism label) plus the C= concurrency. */ +const pointLabelText = (d: InferenceData, advanced: boolean): string => + advanced ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`; + // Referentially stable "no overlay data" result (see processedOverlayData). const EMPTY_OVERLAY_DATA: InferenceData[] = []; @@ -214,6 +331,8 @@ const ScatterGraph = React.memo( trackedConfigs, addTrackedConfig, removeTrackedConfig, + selectedXAxisMode, + selectedSequence, quickFilters, } = useInference(); @@ -289,10 +408,18 @@ const ScatterGraph = React.memo( () => [...effectiveOfficialHwTypes], [effectiveOfficialHwTypes], ); + // High-contrast palette is keyed off the FULL set of official hw types with + // data, not the active subset. Otherwise deselecting a line shrinks the key + // set, which re-sizes the iwanthue palette and shifts every remaining line's + // hue (most visible for single-vendor agentic runs that span the full wheel — + // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the + // stable full set fixes each hw's color so toggling only hides/shows lines. 
+ const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]); const { resolveColor, getCssColor } = useThemeColors({ highContrast, identifiers: activeHwKeys, activeKeys: activeOfficialKeys, + hcKeys: stableHcKeys, }); // --- Changelog --- @@ -328,34 +455,40 @@ const ScatterGraph = React.memo( ); const rooflines = useMemo(() => { + // Frontier scope is (hw, precision, date) — points from different dates + // can never share a frontier (a May 15 point can't dominate a May 17 plot). + // The legend grouping is still by (hw, precision); we just split the + // pareto compute per date and re-merge into the legend bucket. const result: Record = {}; const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition; - const dir = chartDefinition[rooflineKey] as - | 'upper_right' - | 'upper_left' - | 'lower_left' - | 'lower_right' - | undefined; - for (const hw of Object.keys(groupedData)) { - const front = - dir === 'upper_right' - ? paretoFrontUpperRight(groupedData[hw]) - : dir === 'upper_left' - ? paretoFrontUpperLeft(groupedData[hw]) - : dir === 'lower_left' - ? paretoFrontLowerLeft(groupedData[hw]) - : paretoFrontLowerRight(groupedData[hw]); - front.sort((a, b) => a.x - b.x); - result[hw] = front; + const dir = chartDefinition[rooflineKey] as ParetoDirection | undefined; + const frontierFn = paretoFrontForDirection(dir ?? 'lower_right'); + for (const hwKey of Object.keys(groupedData)) { + const combined: InferenceData[] = []; + for (const datePoints of groupPointsByDate(groupedData[hwKey]).values()) { + // In non-e2e xmodes, useChartData stamps every point with an + // `isOnE2eFrontier` flag so the line is restricted to the + // e2e-Pareto winners — same set of points across every chart, + // just re-plotted at the chosen x metric. When the flag is + // present on ANY point in the bucket, narrow to the winners + // before paretoing (otherwise we'd recompute a fresh frontier + // on the swapped x axis and reintroduce the benchmark hack). 
+ const flagged = datePoints.some((p) => p.isOnE2eFrontier !== undefined); + const seedPoints = flagged + ? datePoints.filter((p) => p.isOnE2eFrontier === true) + : datePoints; + if (seedPoints.length === 0) continue; + combined.push(...frontierFn(seedPoints)); + } + combined.sort((a, b) => a.x - b.x); + result[hwKey] = combined; } return result; }, [groupedData, selectedYAxisMetric, chartDefinition]); const optimalPointKeys = useMemo(() => { const keys = new Set(); - Object.values(rooflines).forEach((pts) => - pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)), - ); + Object.values(rooflines).forEach((pts) => pts.forEach((p) => keys.add(optimalPointKey(p)))); return keys; }, [rooflines]); @@ -381,6 +514,10 @@ const ScatterGraph = React.memo( const buildPointConfigId = useCallback((point: InferenceData): string => { let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`; if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`; + // Agentic runs emit two rows per (config, conc) — one offload=on, one off. + // Without this suffix, d3's data join treats them as the same point and + // drops one variant (along with its halo). + if (point.offload_mode) key += `|offload-${point.offload_mode}`; return key; }, []); @@ -454,22 +591,11 @@ const ScatterGraph = React.memo( {} as Record, ); const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition; - const dir = chartDefinition[rooflineKey] as - | 'upper_right' - | 'upper_left' - | 'lower_left' - | 'lower_right' - | undefined; + const dir = chartDefinition[rooflineKey] as ParetoDirection | undefined; + const frontierFn = paretoFrontForDirection(dir ?? 'lower_right'); const result: Record = {}; for (const [key, group] of Object.entries(grouped)) { - const front = - dir === 'upper_right' - ? paretoFrontUpperRight(group.points) - : dir === 'upper_left' - ? 
paretoFrontUpperLeft(group.points) - : dir === 'lower_left' - ? paretoFrontLowerLeft(group.points) - : paretoFrontLowerRight(group.points); + const front = frontierFn(group.points); front.sort((a, b) => a.x - b.x); result[key] = { hwKey: group.hwKey, runIndex: group.runIndex, points: front }; } @@ -479,6 +605,20 @@ const ScatterGraph = React.memo( // All official points for rendering (unfiltered — visibility via opacity) const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]); + // Bulk presence lookup for agentic points: which ids have a stored + // trace_replay blob → controls the "View charts" button in the pinned + // tooltip. We deliberately don't fetch the histograms themselves here; + // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through + // Neon's HTTP API and trip its 64 MB per-response cap. + const agenticIds = useMemo(() => { + const ids: number[] = []; + for (const p of pointsData) { + if (p.benchmark_type === 'agentic_traces' && typeof p.id === 'number') ids.push(p.id); + } + return ids; + }, [pointsData]); + const { data: traceAvailability } = useTraceAvailability(agenticIds); + // Gradient label data const allPointLabelsByKey = useMemo(() => { const globalLabelColorMap = new Map(); @@ -518,7 +658,7 @@ const ScatterGraph = React.memo( const visiblePoints = useMemo(() => { let pts = filteredData; if (hideNonOptimal) { - pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)); + pts = pts.filter((d) => optimalPointKeys.has(optimalPointKey(d))); } return processedOverlayData.length > 0 ? 
[...pts, ...processedOverlayData] : pts; }, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]); @@ -607,7 +747,7 @@ const ScatterGraph = React.memo( (d: InferenceData) => effectiveActiveHwTypes.has(d.hwKey as string) && selectedPrecisions.includes(d.precision) && - (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)), + (!hideNonOptimal || optimalPointKeys.has(optimalPointKey(d))), [effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys], ); @@ -755,6 +895,7 @@ const ScatterGraph = React.memo( d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any, ); } + avoidLabelCollisions(ctx.layout.zoomGroup); }, }), [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type], @@ -774,6 +915,7 @@ const ScatterGraph = React.memo( hardwareConfig, isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)), runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined, + hasTrace: typeof d.id === 'number' ? 
traceAvailability?.[d.id] === true : false, }), getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x), getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y), @@ -789,26 +931,39 @@ const ScatterGraph = React.memo( ), onPointClick: (d: InferenceData) => { track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y }); - // Attach track-over-time button handler in the tooltip const tooltipEl = chartRef.current?.getTooltipElement(); - if (tooltipEl) { - const btn = tooltipEl.querySelector('[data-action="track-over-time"]'); - if (btn) { - btn.addEventListener('click', (btnEvent) => { - btnEvent.stopPropagation(); - const configId = buildPointConfigId(d); - if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId); - else addTrackedConfig(d, chartDefinition.chartType); - chartRef.current?.dismissTooltip(); - chartRef.current?.hideTooltip(); - track('latency_point_tracked_via_tooltip', { - hwKey: String(d.hwKey), - tp: d.tp, - conc: d.conc, - precision: d.precision, - }); + if (!tooltipEl) return; + + // ── Summary-page actions ────────────────────────────────────────── + const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]'); + if (trackBtn) { + trackBtn.addEventListener('click', (btnEvent) => { + btnEvent.stopPropagation(); + const configId = buildPointConfigId(d); + if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId); + else addTrackedConfig(d, chartDefinition.chartType); + chartRef.current?.dismissTooltip(); + chartRef.current?.hideTooltip(); + track('latency_point_tracked_via_tooltip', { + hwKey: String(d.hwKey), + tp: d.tp, + conc: d.conc, + precision: d.precision, }); - } + }); + } + + // ── "View charts" real link (supports browser open-in-new-tab) ─── + const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]'); + if (viewBtn && typeof d.id === 'number') { + viewBtn.addEventListener('click', (btnEvent) => { + 
btnEvent.stopPropagation(); + track('latency_view_charts_opened', { + id: d.id, + hwKey: String(d.hwKey), + conc: d.conc, + }); + }); } }, attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0) @@ -822,6 +977,11 @@ const ScatterGraph = React.memo( addTrackedConfig, removeTrackedConfig, chartDefinition.chartType, + // selectedPrecisions is read via interactionRef.current in the hover + // handlers, so it isn't a dep. traceAvailability IS read directly in the + // tooltip content closure (the "View charts" button), so rebuild the + // config when the presence fetch resolves. + traceAvailability, ], ); @@ -876,35 +1036,56 @@ const ScatterGraph = React.memo( const precision = key.split('_').pop()!; const visible = ir.effectiveActiveHwTypes.has(hw) && ir.selectedPrecisions.includes(precision); - let stroke = ir.getCssColor(ir.resolveColor(hw)); - - if (showGradientLabels) { - const pointLabels = allPointLabelsByKey[key]; - if (pointLabels) { - const stops = computeGradientStops(pointLabels, xScale); - if (stops) { - const gid = `roofline-gradient-${chartId}-${key}`; - activeGradientIds.add(gid); - let gradient = defs.select(`#${CSS.escape(gid)}`); - if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid); - gradient - .attr('gradientUnits', 'userSpaceOnUse') - .attr('x1', xScale(pts[0].x)) - .attr('y1', 0) - .attr('x2', xScale(pts.at(-1)!.x)) - .attr('y2', 0); - gradient - .selectAll('stop') - .data(stops) - .join('stop') - .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`) - .attr('stop-color', (s) => s.color); - stroke = `url(#${gid})`; + const baseStroke = ir.getCssColor(ir.resolveColor(hw)); + + // Split into per-date sub-paths so the line never crosses dates. + // (When only one date is present the loop runs once with the full set.) 
+ const byDate = groupPointsByDate(pts); + const singleDate = byDate.size === 1; + + for (const [date, datePoints] of byDate) { + if (datePoints.length <= 1) continue; + const entryKey = singleDate ? key : `${key}__${date}`; + let stroke = baseStroke; + + // Gradient labels only apply in the single-date case; mapping the + // (key-wide) ParetoPointLabel array onto per-date sub-segments is + // ambiguous and the comparison-date overlay is a rare combo. + if (singleDate && showGradientLabels) { + const pointLabels = allPointLabelsByKey[key]; + if (pointLabels) { + const stops = computeGradientStops(pointLabels, xScale); + if (stops) { + const gid = `roofline-gradient-${chartId}-${entryKey}`; + activeGradientIds.add(gid); + let gradient = defs.select(`#${CSS.escape(gid)}`); + if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid); + gradient + .attr('gradientUnits', 'userSpaceOnUse') + .attr('x1', xScale(datePoints[0].x)) + .attr('y1', 0) + .attr('x2', xScale(datePoints.at(-1)!.x)) + .attr('y2', 0); + gradient + .selectAll('stop') + .data(stops) + .join('stop') + .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`) + .attr('stop-color', (s) => s.color); + stroke = `url(#${gid})`; + } } } - } - entries.push({ key, hw, precision, points: pts, stroke, visible }); + entries.push({ + key: entryKey, + hw, + precision, + points: datePoints, + stroke, + visible, + }); + } }); // Remove stale gradients @@ -1346,11 +1527,18 @@ const ScatterGraph = React.memo( .y((d) => newYScale(d.y)) .curve(d3.curveMonotoneX); - // Update roofline paths + // Update roofline paths — must split per-date so the zoom redraw + // matches the per-date sub-paths created in the initial render. 
Object.entries(rooflines).forEach(([key, pts]) => { if (pts.length < 2) return; - const sel = zoomGroup.select(`.roofline-${key}`); - if (!sel.empty()) sel.attr('d', lineGen(pts) as string); + const byDate = groupPointsByDate(pts); + const singleDate = byDate.size === 1; + for (const [date, datePoints] of byDate) { + if (datePoints.length < 2) continue; + const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`; + const sel = zoomGroup.select(`.${CSS.escape(cls)}`); + if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string); + } }); // Update gradient coordinates @@ -1578,7 +1766,8 @@ const ScatterGraph = React.memo( getOpacity: (d) => (interactionRef.current.isPointVisible(d) ? 1 : 0), getPointerEvents: (d) => (interactionRef.current.isPointVisible(d) ? 'auto' : 'none'), hideLabels: !showPointLabels || showGradientLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + // Keep the concurrency (C=) annotation from the agentx scatter labels. + getLabelText: (d) => pointLabelText(d, useAdvancedLabels), foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1679,17 +1868,26 @@ const ScatterGraph = React.memo( // Labels const showLabels = showPointLabels && !showGradientLabels; overlayPoints.each(function (d) { - d3.select(this) + const lines = showLabels ? pointLabelText(d, useAdvancedLabels).split('\n') : []; + const text = d3 + .select(this) .selectAll('.overlay-label') .data(showLabels ? [true] : []) .join('text') .attr('class', 'overlay-label') - .attr('dy', -10) .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') - .attr('pointer-events', 'none') - .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp)); + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + const firstDy = -(1 + (lines.length - 1) * 1.1); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? 
`${firstDy}em` : '1.1em')) + .text((l) => l); }); // Overlay tooltip handlers @@ -2007,6 +2205,23 @@ const ScatterGraph = React.memo( .attr('pointer-events', 'none'); }); + // Offload halo: dashed ring on every point that used KV offload (Pareto or not) + zoomGroup.selectAll('.dot-group').each(function (d) { + const showHalo = d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); + // Double-click to track/untrack zoomGroup .selectAll('.dot-group') @@ -2041,6 +2256,8 @@ const ScatterGraph = React.memo( }); }); + avoidLabelCollisions(zoomGroup); + // Log tick formatting on initial render if (xScaleConfig._isLog) { const xScale = ctx.xScale as d3.ScaleLogarithmic; @@ -2063,6 +2280,9 @@ const ScatterGraph = React.memo( chartDefinition.chartType, xScaleConfig._isLog, yScaleConfig.type, + optimalPointKeys, + getCssColor, + resolveColor, ], ); @@ -2373,6 +2593,17 @@ const ScatterGraph = React.memo( setHideNonOptimal(checked); track('latency_hide_non_optimal_toggled', { enabled: checked }); }, + // On agentic + non-e2e chart, "optimal" means "on the + // e2e-latency Pareto frontier" (not a per-axis Pareto on the + // current x metric). Explain that so users don't wonder why + // a point sitting above the line is still considered + // dominated. + ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e' + ? { + infoTooltip: + "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. 
Off-frontier points may appear above the line.", + } + : {}), }, { id: 'scatter-point-labels', diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx index 799854d7..f18903ea 100644 --- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx @@ -194,9 +194,7 @@ export function UnofficialChartDisplay() { `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition ] }{' '} - {graph.chartDefinition[ - `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition - ] || graph.chartDefinition.heading} + {graph.chartDefinition.heading}

{graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence} diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts index 8f8705e1..7d5b1482 100644 --- a/packages/app/src/components/inference/utils.test.ts +++ b/packages/app/src/components/inference/utils.test.ts @@ -1,7 +1,26 @@ import { describe, it, expect } from 'vitest'; import type { ChartDefinition, InferenceData } from '@/components/inference/types'; -import { filterDataByCostLimit, processOverlayChartData } from '@/components/inference/utils'; +import { + filterDataByCostLimit, + processOverlayChartData, + selectUnofficialOverlayForMode, +} from '@/components/inference/utils'; + +describe('selectUnofficialOverlayForMode', () => { + const overlays = { e2e: { id: 'e2e' }, interactivity: { id: 'interactivity' } }; + + it('suppresses raw unofficial E2E data for normalized E2E mode', () => { + expect(selectUnofficialOverlayForMode('normalized-e2e', 'e2e', overlays)).toBeNull(); + }); + + it('preserves matching unofficial overlays for supported modes', () => { + expect(selectUnofficialOverlayForMode('e2e', 'e2e', overlays)).toBe(overlays.e2e); + expect(selectUnofficialOverlayForMode('interactivity', 'interactivity', overlays)).toBe( + overlays.interactivity, + ); + }); +}); // --------------------------------------------------------------------------- // fixture factories @@ -157,12 +176,12 @@ describe('processOverlayChartData', () => { }); it('remaps x to config override for input metrics on interactivity chart', () => { - // inputTputPerGpu has x override to p99_ttft on interactivity chart + // inputTputPerGpu has x override to p90_ttft on interactivity chart const data = [ pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_intvty: 50, } as any), ]; @@ -176,16 +195,11 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - median_ttft: 0.1, + p90_ttft: 
0.1, median_intvty: 50, } as any), ]; - const result = processOverlayChartData( - data, - 'interactivity', - 'y_inputTputPerGpu', - 'median_ttft', - ); + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.1); }); @@ -195,76 +209,62 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_e2el: 2.5, } as any), ]; const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null); expect(result).toHaveLength(1); - // e2e uses median_e2el as x (from chart config default), not p99_ttft + // e2e uses median_e2el as x (from chart config default), not p90_ttft expect(result[0].x).toBe(2.5); }); - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => { - const data = [ - pt({ - x: 100, - tpPerGpu: { y: 42, roof: false }, - p99_ttft: 0.35, - median_e2el: 2.5, - } as any), - ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); - expect(result).toHaveLength(1); - expect(result[0].x).toBe(0.35); - }); - - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => { + it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => { const data = [ pt({ x: 100, tpPerGpu: { y: 42, roof: false }, - median_ttft: 0.12, + p90_ttft: 0.12, median_e2el: 2.5, } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.12); }); it('filters e2e TTFT outliers exceeding y_latency_limit', () => { const data = [ - pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any), - pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any), + pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 
0.5, median_e2el: 1 } as any), + pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); it('does not filter interactivity points by latency limit when x-axis is default', () => { - // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity + // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity // chart's x-axis stays median_intvty for non-input metrics. The latency limit // (60) must NOT apply to median_intvty values. const data = [ pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any), pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(2); }); it('applies latency limit on interactivity only when x-axis is actually overridden', () => { - // When an input metric IS selected and x-axis overrides to p99_ttft, + // When an input metric IS selected and x-axis overrides to p90_ttft, // the latency limit should apply. 
const data = [ - pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any), - pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any), + pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any), + pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft'); - // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999 + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); + // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999 expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts index 4b5335b6..f6ebd0f8 100644 --- a/packages/app/src/components/inference/utils.ts +++ b/packages/app/src/components/inference/utils.ts @@ -8,6 +8,20 @@ import chartDefinitions from '@/components/inference/inference-chart-config.json import type { ChartDefinition, InferenceData, YAxisMetricKey } from './types'; +/** + * Select the matching unofficial-run overlay for a chart mode. Normalized E2E + * is intentionally excluded: unofficial benchmark rows do not include the + * persisted per-request trace needed to normalize before taking percentiles. + */ +export function selectUnofficialOverlayForMode( + xAxisMode: string, + chartType: 'e2e' | 'interactivity', + overlays: { e2e: T | null; interactivity: T | null }, +): T | null { + if (xAxisMode === 'normalized-e2e') return null; + return overlays[chartType]; +} + /** * Filters data points based on cost limits defined in the chart definition. 
* Only applies filtering for cost-related metrics, and only filters based on @@ -75,11 +89,13 @@ export function processOverlayChartData( chartType: 'e2e' | 'interactivity', selectedYAxisMetric: string, selectedXAxisMetric: string | null, + options?: { isAgentic?: boolean }, ): InferenceData[] { const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType); if (!chartDef) return []; const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; + const isAgentic = options?.isAgentic === true; // Resolve x-axis field (must match useChartData logic) const metricTitle = @@ -87,9 +103,11 @@ export function processOverlayChartData( const isInputMetric = metricTitle.toLowerCase().includes('input'); let xAxisField: string = chartDef.x; // selectedXAxisMetric is already the effective metric for this chart type - // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric) + // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric). + // Match any *_ttft metric — the x-axis-mode picker can now select any + // percentile (median/p75/p90/p99) depending on sequence kind. const isTtftOverride = - selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft'); if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { xAxisField = selectedXAxisMetric; @@ -109,7 +127,12 @@ export function processOverlayChartData( }) .filter( (d) => - xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit, + // Skip the latency limit for the natural x-axis or for agentic + // (long TTFTs are normal there, not overload outliers). 
+ xAxisField === chartDef.x || + isAgentic || + !chartDef.y_latency_limit || + d.x <= chartDef.y_latency_limit, ); return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric); diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts new file mode 100644 index 00000000..aaf715d3 --- /dev/null +++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; + +import { configSegmentLabel, parallelismLabel } from './parallelism-label'; + +describe('configSegmentLabel', () => { + it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => { + expect(configSegmentLabel(8, 8, false)).toBe('TEP8'); + expect(configSegmentLabel(8, 8, true)).toBe('DEP8'); + }); + + it('uses EP / DPAEP when ep>1 and tp!==ep', () => { + expect(configSegmentLabel(4, 16, false)).toBe('EP16'); + expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16'); + }); + + it('uses TP / DPATP when ep<=1 or absent', () => { + expect(configSegmentLabel(8, 1, false)).toBe('TP8'); + expect(configSegmentLabel(8, undefined, false)).toBe('TP8'); + expect(configSegmentLabel(8, 1, true)).toBe('DPATP8'); + }); +}); + +describe('parallelismLabel', () => { + it('falls back to bare tp when no ep data', () => { + expect(parallelismLabel({ tp: 8 })).toBe('8'); + }); + + it('labels a single-segment config', () => { + expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8'); + expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8'); + }); + + it('builds multinode-disagg per-role worker segments', () => { + expect( + parallelismLabel({ + tp: 8, + ep: 4, + disagg: true, + isMultinode: true, + prefillTp: 4, + prefillEp: 4, + prefillDpAttention: false, + prefillNumWorkers: 2, + decodeTp: 8, + decodeEp: 8, + decodeDpAttention: true, + decodeNumWorkers: 1, + }), + ).toBe('2xTEP4+1xDEP8'); + }); + + 
it('single-node disagg uses the single (decode) segment, not worker syntax', () => { + // is_multinode false → no "NxPrefill+MxDecode" expansion. + expect( + parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }), + ).toBe('TEP8'); + }); +}); diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts new file mode 100644 index 00000000..98207110 --- /dev/null +++ b/packages/app/src/components/inference/utils/parallelism-label.ts @@ -0,0 +1,79 @@ +/** + * Shared parallelism-config labeling — the single source of truth for the + * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels. + * + * Used by the scatter/GPU chart point labels (via getPointLabel) and the + * agentic detail page's sibling navigator chips, so both surfaces describe a + * config identically. + */ + +/** + * Generates a short config segment label from parallelism params. + * - tp == ep and dp-attn false: "TEP{N}" + * - tp == ep and dp-attn true: "DEP{N}" + * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}" + * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}" + */ +export const configSegmentLabel = ( + tp: number, + ep: number | undefined, + dpAttention: boolean | undefined, +): string => { + if (ep !== null && ep !== undefined && ep > 1 && tp === ep) { + return dpAttention ? `DEP${tp}` : `TEP${tp}`; + } + const dpaPrefix = dpAttention ? 'DPA' : ''; + if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`; + return `${dpaPrefix}EP${ep}`; +}; + +/** Parallelism params for one benchmark config, framework-agnostic. 
*/ +export interface ParallelismFields { + tp: number; + ep?: number; + dpAttention?: boolean; + disagg?: boolean; + isMultinode?: boolean; + prefillTp?: number; + prefillEp?: number; + prefillDpAttention?: boolean; + prefillNumWorkers?: number; + decodeTp?: number; + decodeEp?: number; + decodeDpAttention?: boolean; + decodeNumWorkers?: number; +} + +/** + * Returns the short parallelism label for a config. + * - No EP data (old rows): falls back to the bare tp value (e.g. "8"). + * - Multinode disagg: per-role segments with worker counts, + * e.g. "2xEP4+1xDPAEP32". + * - Otherwise: a single segment from (tp, ep, dpAttention). + */ +export const parallelismLabel = (f: ParallelismFields): string => { + if ( + (f.ep === null || f.ep === undefined) && + (f.prefillEp === null || f.prefillEp === undefined) + ) { + return String(f.tp); + } + + if (f.isMultinode && f.disagg) { + const prefillLabel = configSegmentLabel( + f.prefillTp ?? f.tp, + f.prefillEp ?? f.ep, + f.prefillDpAttention ?? f.dpAttention, + ); + const decodeLabel = configSegmentLabel( + f.decodeTp ?? f.tp, + f.decodeEp ?? f.ep, + f.decodeDpAttention ?? f.dpAttention, + ); + const pw = f.prefillNumWorkers ?? 1; + const dw = f.decodeNumWorkers ?? 
1; + return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`; + } + + return configSegmentLabel(f.tp, f.ep, f.dpAttention); +}; diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts index 5a5bd7e9..e4b9d31f 100644 --- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts +++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts @@ -150,6 +150,15 @@ describe('getPointLabel', () => { // generateTooltipContent // =========================================================================== describe('generateTooltipContent', () => { + it('renders View charts as a same-tab anchor so browsers offer open-in-new-tab', () => { + const html = generateTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }), + ); + expect(html).toContain(' { const html = generateTooltipContent(tooltipConfig()); expect(html).toContain('H100'); @@ -365,4 +374,27 @@ describe('generateGPUGraphTooltipContent', () => { ); expect(html).toContain('vllm-v0.6.0
abc123'); }); + + it('shows View charts only for pinned points with stored trace data', () => { + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }), + ), + ).toContain('data-action="view-charts"'); + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }), + ), + ).toContain('href="/inference/agentic/1"'); + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: false, hasTrace: true }), + ), + ).not.toContain('data-action="view-charts"'); + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: false }), + ), + ).not.toContain('data-action="view-charts"'); + }); }); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index 9143f40f..e3f0de6d 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -1,6 +1,7 @@ import { formatNumber, getDisplayLabel } from '@/lib/utils'; import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types'; +import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; export interface TooltipConfig { /** The data point to display */ @@ -19,6 +20,14 @@ export interface TooltipConfig { isTracked?: boolean; /** URL to the GitHub Actions workflow run */ runUrl?: string; + /** + * Whether this agentic point has a stored trace_replay blob. Controls + * visibility of the "View charts" button — the actual distributions are + * rendered on the detail page, not inline, so all the tooltip needs is a + * presence boolean (sourced from the bulk `/api/v1/trace-availability` + * call so we don't ship megabytes of profile JSONL just for this check). 
+ */ + hasTrace?: boolean; } export interface OverlayTooltipConfig extends TooltipConfig { @@ -26,57 +35,37 @@ export interface OverlayTooltipConfig extends TooltipConfig { overlayData: OverlayData; } -/** - * Generates a short config segment label from parallelism params. - * - tp == ep and dp-attn false: "TEP{N}" - * - tp == ep and dp-attn true: "DEP{N}" - * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}" - * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}" - */ -const configSegmentLabel = ( - tp: number, - ep: number | undefined, - dpAttention: boolean | undefined, -): string => { - if (ep !== null && ep !== undefined && ep > 1 && tp === ep) { - return dpAttention ? `DEP${tp}` : `TEP${tp}`; - } - const dpaPrefix = dpAttention ? 'DPA' : ''; - if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`; - return `${dpaPrefix}EP${ep}`; -}; +// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the +// transform narrows "true"/"false" → boolean). Coerce to a plain boolean for +// the shared labeler, treating the legacy string form correctly. +const asBool = (v: boolean | string | undefined): boolean | undefined => + typeof v === 'string' ? v === 'true' : v; /** * Returns the short label for a data point on the chart. * - Non-multinode: e.g. "TP8", "EP8", "TEP8", "DEP8", "DPAEP8" * - Multinode disagg: e.g. "2xEP4+1xDPAEP32" * - Old data (no ep field): falls back to tp value + * + * Delegates to the shared {@link parallelismLabel} so the chart points and the + * agentic sibling navigator describe a config identically. */ -export const getPointLabel = (d: InferenceData): string => { - if ( - (d.ep === null || d.ep === undefined) && - (d.prefill_ep === null || d.prefill_ep === undefined) - ) - return String(d.tp); - - if (d.is_multinode && d.disagg) { - const prefillLabel = configSegmentLabel( - d.prefill_tp ?? d.tp, - d.prefill_ep ?? d.ep, - d.prefill_dp_attention ?? 
d.dp_attention, - ); - const decodeLabel = configSegmentLabel( - d.decode_tp ?? d.tp, - d.decode_ep ?? d.ep, - d.decode_dp_attention ?? d.dp_attention, - ); - const pw = d.prefill_num_workers ?? 1; - const dw = d.decode_num_workers ?? 1; - return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`; - } - - return configSegmentLabel(d.tp, d.ep, d.dp_attention); -}; +export const getPointLabel = (d: InferenceData): string => + parallelismLabel({ + tp: d.tp, + ep: d.ep, + dpAttention: asBool(d.dp_attention), + disagg: d.disagg, + isMultinode: d.is_multinode, + prefillTp: d.prefill_tp, + prefillEp: d.prefill_ep, + prefillDpAttention: asBool(d.prefill_dp_attention), + prefillNumWorkers: d.prefill_num_workers, + decodeTp: d.decode_tp, + decodeEp: d.decode_ep, + decodeDpAttention: asBool(d.decode_dp_attention), + decodeNumWorkers: d.decode_num_workers, + }); const runLinkHTML = (runUrl?: string) => runUrl @@ -88,6 +77,78 @@ const runLinkHTML = (runUrl?: string) => const tooltipLine = (label: string, value: string | number) => `

`; +const formatPct = (v: number | undefined): string | null => + v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; + +/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */ +const fmt = (v: number): string => { + if (!Number.isFinite(v)) return String(v); + const rounded = parseFloat(v.toFixed(3)); + if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded); + return String(rounded); +}; + +/** + * Agentic-only tooltip rows: offload mode, KV cache hit rates, request + * success, token totals. Returns an empty string for non-agentic rows. + */ +const generateAgenticHTML = (d: InferenceData): string => { + if (d.benchmark_type !== 'agentic_traces') return ''; + + const parts: string[] = []; + if (d.offload_mode) { + parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase())); + } + + const gpuHit = formatPct(d.server_gpu_cache_hit_rate); + const cpuHit = formatPct(d.server_cpu_cache_hit_rate); + const theoHit = formatPct(d.theoretical_cache_hit_rate); + if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit)); + if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit)); + if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit)); + + if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) { + const successPct = + d.num_requests_total > 0 + ? 
` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)` + : ''; + parts.push( + tooltipLine( + 'Requests', + `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`, + ), + ); + } + + if (d.total_prompt_tokens !== undefined) { + parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens))); + } + if (d.total_generation_tokens !== undefined) { + parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens))); + } + + // Histograms + time-series live on the dedicated detail page now; the + // "View charts" button (rendered by the wrapper when pinned + has trace + // data) takes the user there. + + return parts.join(''); +}; + +/** "View charts" link — only visible when the tooltip is pinned and the + * point has stored trace data. Wired up by the scatter/GPU graph click handlers. */ +const viewChartsButtonHTML = ( + isPinned: boolean, + hasTraceData: boolean, + pointId: number | undefined, +): string => { + if (!isPinned || !hasTraceData || typeof pointId !== 'number') return ''; + return `View charts →`; +}; + const shortenSha = (image: string) => image.replaceAll(/(?sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$…'); @@ -139,7 +200,16 @@ const generateParallelismHTML = (d: InferenceData): string => { * @returns HTML string for the tooltip content */ export const generateTooltipContent = (config: TooltipConfig): string => { - const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config; + const { + data: d, + isPinned, + xLabel, + yLabel, + selectedYAxisMetric, + hardwareConfig, + runUrl, + hasTrace, + } = config; return `
@@ -157,16 +227,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => { : '' }
- ${xLabel}: ${formatNumber(d.x)} + ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)} + ${yLabel}: ${fmt(d.y)}
${ selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu'] ? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)} + Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
` : '' } @@ -174,7 +244,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => { selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu'] ? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)} + Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
` : '' } @@ -183,10 +253,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} + ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)} ${ isPinned ? `
- ${yLabel}: ${formatNumber(d.y)} + ${yLabel}: ${fmt(d.y)}
${tooltipLine('Total GPUs', d.tp)} ${generateParallelismHTML(d)}
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`; }; @@ -254,7 +327,16 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str * @returns HTML string for the tooltip content */ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => { - const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config; + const { + data: d, + isPinned, + xLabel, + yLabel, + selectedYAxisMetric, + hardwareConfig, + runUrl, + hasTrace, + } = config; return `
@@ -272,16 +354,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => : '' }
- ${xLabel}: ${formatNumber(d.x)} + ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)} + ${yLabel}: ${fmt(d.y)}
${ selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu'] ? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)} + Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
` : '' } @@ -289,7 +371,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu'] ? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)} + Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
` : '' } @@ -298,10 +380,12 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} + ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
`; }; diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx index 25238522..ca7424bf 100644 --- a/packages/app/src/components/ui/chart-legend.tsx +++ b/packages/app/src/components/ui/chart-legend.tsx @@ -8,6 +8,7 @@ import { ChevronRight, Circle, Diamond, + Info, Square, Triangle, X, @@ -38,6 +39,8 @@ export interface LegendSwitchConfig { label: string; checked: boolean; onCheckedChange: (checked: boolean) => void; + /** Optional explainer rendered as an info-icon tooltip next to the label. */ + infoTooltip?: React.ReactNode; advanced?: boolean; } @@ -279,6 +282,29 @@ export default function ChartLegend({ > {sw.label} + {sw.infoTooltip && ( + + + + + + + {sw.infoTooltip} + + + + )}
))}
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index de18da09..6aee97dd 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -5,17 +5,30 @@ import { Info } from 'lucide-react'; import { LabelWithTooltip } from '@/components/ui/label-with-tooltip'; import { track } from '@/lib/analytics'; import { MultiSelect } from '@/components/ui/multi-select'; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectLabel, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; import { TooltipContent, TooltipRoot, TooltipTrigger } from '@/components/ui/tooltip'; import { type Model, type Precision, type Sequence, + type Percentile, + PERCENTILE_OPTIONS, getModelCategory, getModelLabel, + getPercentileLabel, getPrecisionLabel, getSequenceCategory, getSequenceLabel, groupByCategory, + sequenceKind, } from '@/lib/data-mappings'; function CategorySectionTitle({ label, reason }: { label: string; reason: string }) { @@ -228,6 +241,143 @@ export function SequenceSelector({ ); } +interface ScenarioSelectorProps { + id?: string; + value: string; + onChange: (value: Sequence) => void; + open?: boolean; + onOpenChange?: (open: boolean) => void; + availableSequences: string[]; + 'data-testid'?: string; +} + +/** + * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length", + * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL + * framing only applies to the fixed-seq subset). 
+ */ +export function ScenarioSelector({ + id = 'scenario-select', + value, + onChange, + open, + onOpenChange, + availableSequences, + 'data-testid': testId, +}: ScenarioSelectorProps) { + const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq'); + const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic'); + const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence)); + + return ( +
+ + +
+ ); +} + +interface PercentileSelectorProps { + id?: string; + value: string; + onChange: (value: Percentile) => void; + 'data-testid'?: string; +} + +/** + * Latency percentile selector for agentic-trace charts. The selected value + * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so + * picking p99 plots p99 e2e latency / interactivity instead of the median. + */ +export function PercentileSelector({ + id = 'percentile-select', + value, + onChange, + 'data-testid': testId, +}: PercentileSelectorProps) { + return ( +
+ + +
+ ); +} + interface PrecisionSelectorProps { id?: string; value: string[]; diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx index 0392ac10..44013b1b 100644 --- a/packages/app/src/components/ui/d3-chart-wrapper.tsx +++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx @@ -1,6 +1,41 @@ 'use client'; -import React from 'react'; +import React, { useEffect, useState } from 'react'; +import { createPortal } from 'react-dom'; + +/** + * Renders the d3 tooltip element via React Portal to document.body so it + * escapes any parent stacking context (e.g. the chart Card's backdrop-filter + * creates one, trapping z-index inside it). Position is set as viewport + * coordinates by the d3 layer. + */ +function PortalTooltip({ + tooltipRef, + pinned, +}: { + tooltipRef: React.RefObject; + pinned: boolean; +}) { + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const node = ( +
+ ); + if (!mounted || typeof document === 'undefined') return node; + return createPortal(node, document.body); +} export interface D3ChartWrapperProps { chartId: string; @@ -72,17 +107,11 @@ export function D3ChartWrapper({ } }} /> -
+ {/* Tooltip is portalled to document.body with position:fixed so it can + rise above sibling chart cards' stacking contexts. The d3 layer + writes viewport-coords into style.left/top — see + computeTooltipPosition. */} + {noDataOverlay}

{instructions}

diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts index 1863060d..3c24d32b 100644 --- a/packages/app/src/components/unofficial-run-provider.test.ts +++ b/packages/app/src/components/unofficial-run-provider.test.ts @@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r /** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */ function stubRow(overrides: Partial = {}): BenchmarkRow { return { + id: 1, hardware: 'h200', framework: 'sglang', model: 'dsr1', @@ -29,6 +30,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 128, diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index 310a4d1a..54b470ff 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -12,7 +12,7 @@ import { import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types'; import { UnofficialBanner } from '@/components/ui/unofficial-banner'; -import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants'; import { computeToggle } from '@/hooks/useTogglableSet'; import type { BenchmarkRow, EvalRow } from '@/lib/api'; import { normalizeEvalHardwareKey } from '@/lib/chart-utils'; @@ -110,7 +110,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData const groups = new Map(); for (const row of benchmarks) { const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? 
row.model; - const sequence = islOslToSequence(row.isl, row.osl); + const sequence = rowToSequence(row); if (!sequence) continue; const key = `${displayModel}_${sequence}`; if (!groups.has(key)) groups.set(key, []); diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 0dac5883..a9d66715 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types'; import type { SubmissionsResponse } from './submissions-types'; export interface BenchmarkRow { + /** Stable per-point id from benchmark_results; used for agentic detail lookups. */ + id: number; hardware: string; framework: string; model: string; @@ -25,9 +27,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows. 
*/ + offload_mode: string; image: string | null; metrics: Record; /** @@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index 8f27cc8f..648ebaae 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -2,10 +2,15 @@ import { describe, it, expect, vi } from 'vitest'; import type { BenchmarkRow } from '@/lib/api'; -import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform'; +import { + mergeRunScopedRows, + rowToAggDataEntry, + transformBenchmarkRows, +} from './benchmark-transform'; function makeRow(overrides: Partial = {}): BenchmarkRow { return { + id: 1, hardware: 'h200', framework: 'trt', model: 'dsr1', @@ -23,6 +28,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 64, @@ -793,3 +800,89 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => { expect(point.decode_dp_attention).toBe(true); }); }); + +describe('mergeRunScopedRows', () => { + const vllmRun = (over: Partial = {}) => + makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over }); + const sglangBase = (over: Partial = {}) => + makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over }); + + it('pins configs the run covers to the run rows, replacing base rows', () => { + const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })]; + const baseRows = 
[vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })]; + const merged = mergeRunScopedRows(runRows, baseRows); + // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a + // partial-sweep run must fully own its config or the DISTINCT-ON mixing + // the scoping exists to prevent comes right back. + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]); + }); + + it('carries forward configs the run does not cover (the same-day other-framework curve)', () => { + const runRows = [vllmRun({ id: 10 })]; + const baseRows = [ + vllmRun({ id: 90 }), + sglangBase({ id: 91 }), + sglangBase({ id: 92, conc: 128 }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]); + }); + + it('keeps base rows of other hardware / precision / model untouched', () => { + const runRows = [vllmRun({ id: 10 })]; + const baseRows = [ + vllmRun({ id: 90, hardware: 'b200' }), + vllmRun({ id: 91, precision: 'fp8' }), + vllmRun({ id: 92, model: 'kimik2.5' }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]); + }); + + it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => { + const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })]; + const baseRows = [ + vllmRun({ id: 90, benchmark_type: 'agentic_traces' }), + vllmRun({ id: 91, benchmark_type: 'single_turn' }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]); + }); + + it('returns base rows unchanged when the run produced nothing', () => { + const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })]; + expect(mergeRunScopedRows([], baseRows)).toBe(baseRows); + }); +}); + +describe('rowToAggDataEntry — agentic interactivity invariant', () => { + // Agentic artifacts have 
shipped *_intvty under two definitions across harness + // versions (slow-tail 1/p(ITL) vs fast-tail p(1/ITL)). The chart's + // interactivity selector is slow-tail, so we always derive intvty = 1/itl and + // discard the artifact value. Mirrors the ingest mapper + backfill. + const agentic = (metrics: Record) => + rowToAggDataEntry(makeRow({ benchmark_type: 'agentic_traces', isl: null, osl: null, metrics })); + + it('overrides an artifact-supplied (fast-tail) *_intvty with 1/*_itl', () => { + const entry = agentic({ + p90_itl: 0.0893, // slow-tail 1/itl ≈ 11.198 + p90_intvty: 23.91, // fast-tail contamination — must be discarded + p75_itl: 0.0692, + p75_intvty: 19, // must be discarded + }); + expect(entry.p90_intvty).toBeCloseTo(1 / 0.0893, 6); + expect(entry.p75_intvty).toBeCloseTo(1 / 0.0692, 6); + expect(entry.p90_intvty).not.toBeCloseTo(23.91, 1); + }); + + it('derives intvty from itl when the artifact omits intvty entirely', () => { + const entry = agentic({ p90_itl: 0.1, p95_itl: 0.2 }); + expect(entry.p90_intvty).toBeCloseTo(10, 6); + expect(entry.p95_intvty).toBeCloseTo(5, 6); + }); + + it('does not invert interactivity for single_turn rows', () => { + const entry = rowToAggDataEntry(makeRow({ metrics: { p90_itl: 0.05, p90_intvty: 999 } })); + expect(entry.p90_intvty).toBe(999); + }); +}); diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index ac806b79..cb8e3ceb 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -15,10 +15,47 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils'; import { getHardwareConfig } from '@/lib/constants'; import type { BenchmarkRow } from '@/lib/api'; +/** + * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl + * but not the intvty/e2el/tpot keys the chart pipeline expects. 
Bridge them here: + * e2el ≡ ttlt (time-to-last-token == end-to-end latency) + * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output) + * intvty ≡ 1/itl (tok/s from the user's perspective) + * + * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS derived from itl + * (whenever itl > 0), overriding any artifact-supplied value: the harness definition of + * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile + * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. This + * matches the ingest mapper + backfill-agentic-intvty for official rows; doing it + * here keeps overlay / `?unofficialrun=` rows (transformed live from raw + * artifacts, never through the DB) on the same definition. + */ +function agenticAliases(m: Record): Record { + const out: Record = {}; + for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { + const itl = m[`${suffix}_itl`]; + const ttlt = m[`${suffix}_ttlt`]; + if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; + if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl; + if (itl !== undefined && itl > 0) out[`${suffix}_intvty`] = 1 / itl; + } + return out; +} + /** Convert a DB benchmark row to an AggDataEntry. */ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { - const m = row.metrics; + const isAgentic = row.benchmark_type === 'agentic_traces'; + const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics; + // Prefer the dedicated column (added in migration 004); fall back to the + // legacy stash inside `metrics` for any rows ingested before that column + // existed. + const rawMetrics = row.metrics as Record; + const offloadMode = + row.offload_mode ?? + (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined); return { + // Coerce: Postgres bigint comes through the SQL client as a string. 
+ id: typeof row.id === 'number' ? row.id : Number(row.id), hw: row.hardware, framework: row.framework, model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model, @@ -32,23 +69,43 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { mean_ttft: m.mean_ttft ?? 0, median_ttft: m.median_ttft ?? 0, std_ttft: m.std_ttft ?? 0, + p75_ttft: m.p75_ttft ?? 0, + p90_ttft: m.p90_ttft ?? 0, + p95_ttft: m.p95_ttft ?? 0, p99_ttft: m.p99_ttft ?? 0, + 'p99.9_ttft': m['p99.9_ttft'] ?? 0, mean_tpot: m.mean_tpot ?? 0, median_tpot: m.median_tpot ?? 0, std_tpot: m.std_tpot ?? 0, + p75_tpot: m.p75_tpot ?? 0, + p90_tpot: m.p90_tpot ?? 0, + p95_tpot: m.p95_tpot ?? 0, p99_tpot: m.p99_tpot ?? 0, + 'p99.9_tpot': m['p99.9_tpot'] ?? 0, mean_intvty: m.mean_intvty ?? 0, median_intvty: m.median_intvty ?? 0, std_intvty: m.std_intvty ?? 0, + p75_intvty: m.p75_intvty ?? 0, + p90_intvty: m.p90_intvty ?? 0, + p95_intvty: m.p95_intvty ?? 0, p99_intvty: m.p99_intvty ?? 0, + 'p99.9_intvty': m['p99.9_intvty'] ?? 0, mean_itl: m.mean_itl ?? 0, median_itl: m.median_itl ?? 0, std_itl: m.std_itl ?? 0, + p75_itl: m.p75_itl ?? 0, + p90_itl: m.p90_itl ?? 0, + p95_itl: m.p95_itl ?? 0, p99_itl: m.p99_itl ?? 0, + 'p99.9_itl': m['p99.9_itl'] ?? 0, mean_e2el: m.mean_e2el ?? 0, median_e2el: m.median_e2el ?? 0, std_e2el: m.std_e2el ?? 0, + p75_e2el: m.p75_e2el ?? 0, + p90_e2el: m.p90_e2el ?? 0, + p95_e2el: m.p95_e2el ?? 0, p99_e2el: m.p99_e2el ?? 0, + 'p99.9_e2el': m['p99.9_e2el'] ?? 0, // Measured GPU telemetry (runner's aggregate_power.py). Left undefined for // rows predating the field so downstream chart code can distinguish // "no measurement" from "0 W" via createChartDataPoint's typeof guard. @@ -91,6 +148,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { date: row.date, actualDate: (row as any).actualDate ?? row.date, run_url: row.run_url ?? 
undefined, + benchmark_type: row.benchmark_type, + isl: row.isl, + osl: row.osl, + offload_mode: offloadMode, + server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate, + server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate, + theoretical_cache_hit_rate: m.theoretical_cache_hit_rate, + num_requests_total: m.num_requests_total, + num_requests_successful: m.num_requests_successful, + total_prompt_tokens: m.total_prompt_tokens, + total_generation_tokens: m.total_generation_tokens, }; } @@ -100,13 +168,59 @@ interface PreparedEntry { date: string; } +/** + * Rewrite a chart x-axis key to use a different latency percentile prefix + * (`median_` → `p99_` etc). Only touches keys that start with a known + * percentile prefix; leaves everything else alone. + */ +export function withPercentile(key: string, percentile: string): string { + return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`); +} + +// Replacement granularity for single-run scoping: the changelog config_key +// tuple (model-precision-hardware-framework) plus benchmark_type, so an +// agentic-only run never hides the same config's fixed-seq carry-forward. +const runScopeKey = (r: BenchmarkRow): string => + `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`; + +/** + * Merge run-scoped benchmark rows with the normal latest-per-config rows. + * + * When the user picks a specific workflow run (to disambiguate two same-day + * sweeps of the same config), only the configs that run actually produced + * should be pinned to it — every other config must keep its normal + * carry-forward rows. Scoping the whole chart to the run (the old behavior) + * silently hid complementary configs that happened to land on the same date, + * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve + * vanish because it lived in a different workflow run. 
+ * + * Run rows win for every (model, precision, hardware, framework, + * benchmark_type) group they cover; base rows fill in the rest. + */ +export function mergeRunScopedRows( + runRows: BenchmarkRow[], + baseRows: BenchmarkRow[], +): BenchmarkRow[] { + if (runRows.length === 0) return baseRows; + const claimed = new Set(runRows.map(runScopeKey)); + return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))]; +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). * * Converts rows to AggDataEntry once, then reuses for each chart definition. + * + * @param percentile Optional latency percentile for the chart x-axis + * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart + * definition for the chosen percentile — only agentic rows carry the + * full set (median/p75/p90/p95/p99/p99.9) so this mainly affects that scenario. */ -export function transformBenchmarkRows(rows: BenchmarkRow[]): { +export function transformBenchmarkRows( + rows: BenchmarkRow[], + percentile = 'median', +): { chartData: InferenceData[][]; hardwareConfig: HardwareConfig; } { @@ -132,13 +246,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): { // Phase 2: Build chart data per chart definition (reusing prepared entries) const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => { + const xKey = withPercentile(chartDef.x, percentile); const groupedByHw: Record = {}; for (const { entry, hwKey, date } of prepared) { const dataPoint = createChartDataPoint( date, entry, - chartDef.x as keyof AggDataEntry, + xKey as keyof AggDataEntry, chartDef.y as keyof AggDataEntry, hwKey, ); diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts index db569118..052d498f 100644 --- a/packages/app/src/lib/chart-utils.test.ts +++ b/packages/app/src/lib/chart-utils.test.ts @@ -353,30 +353,29 @@ 
describe('generateHighContrastColors', () => { expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(',')); }); - // ---------- Tier 1: few items → brand zone ---------- - - it('3 NVIDIA GPUs are not red', () => { + // ---------- Single vendor: full wheel for maximum contrast ---------- + // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the + // vendors stay visually separable). With a single vendor there's no rival to + // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed + // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case + // where every series otherwise collapsed into the green brand band). + + it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => { const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark'); - for (const color of Object.values(result)) { - expect(isNotReddish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(3); assertMinDist(result, 30); }); - it('2 AMD GPUs are not green', () => { + it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => { const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark'); - for (const color of Object.values(result)) { - expect(isNotGreenish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(2); assertMinDist(result, 30); }); - it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => { + it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => { const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm']; const result = generateHighContrastColors(keys, 'dark'); - for (const color of Object.values(result)) { - expect(isNotReddish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(4); assertMinDist(result, 25); }); @@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => { 
assertMinDist(result, 25); }); - // ---------- Tier 2: moderate items → full wheel minus rival color ---------- + // ---------- Single vendor, many items → full wheel, best spacing ---------- - it('10 NVIDIA GPUs: no red hues, still distinguishable', () => { + it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => { const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200']; const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]); const result = generateHighContrastColors(keys, 'dark'); - // Should not be reddish (banned) - for (const color of Object.values(result)) { - const rgb = parseRgb(color); - // Not red-dominant with low green — i.e. not in the red/pink zone - const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150; - expect(isRedPink).toBe(false); - } + expect(Object.keys(result)).toHaveLength(10); assertMinDist(result, 20); }); diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts index 33a5b4e3..ce903fe0 100644 --- a/packages/app/src/lib/chart-utils.ts +++ b/packages/app/src/lib/chart-utils.ts @@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map(); /** * Generates high-contrast colors using iwanthue (k-means in CIELab space). * - * Tiered strategy per vendor: + * Tiered strategy per vendor (only when >1 vendor is present): * ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red) * ≤ BAN_MAX → full wheel minus rival's brand color * > BAN_MAX → full wheel, no restrictions, best spacing wins + * + * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 × + * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a + * glance, but with one vendor there's no rival — clamping every series into the + * same narrow hue band just collapses the contrast HC is supposed to maximize. + * So skip both restrictions and use the full wheel, giving the series the widest + * possible separation. 
*/ export const generateHighContrastColors = ( keys: string[], @@ -91,6 +98,12 @@ export const generateHighContrastColors = ( list.push(key); } + // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a + // single vendor present there's nothing to separate from, so those + // restrictions only shrink the usable hue range and kill contrast — open the + // full wheel instead (the common all-NVIDIA agentic comparison case). + const multiVendor = groups.size > 1; + for (const [vendor, vendorKeys] of groups) { const count = vendorKeys.length; const isBanned = BANNED_HUE_TEST[vendor] ?? null; @@ -99,8 +112,8 @@ export const generateHighContrastColors = ( // Tier 1: few items → brand zone only // Tier 2: moderate → full wheel minus rival color // Tier 3: many → full wheel, no restrictions - const usePreferred = preferred && count <= PREFERRED_MAX; - const useBan = !usePreferred && isBanned && count <= BAN_MAX; + const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX; + const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX; // Everything iwanthue's output depends on (the ban filter and preferred // zone are functions of vendor; the seed is vendor+theme). @@ -579,6 +592,20 @@ export const paretoFrontLowerRight = (points: InferenceData[]): InferenceData[] return front; }; +const PARETO_BY_DIRECTION = { + upper_right: paretoFrontUpperRight, + upper_left: paretoFrontUpperLeft, + lower_left: paretoFrontLowerLeft, + lower_right: paretoFrontLowerRight, +} as const; + +export type ParetoDirection = keyof typeof PARETO_BY_DIRECTION; + +/** Look up the Pareto frontier function for a roofline direction. */ +export const paretoFrontForDirection = ( + dir: ParetoDirection, +): ((points: InferenceData[]) => InferenceData[]) => PARETO_BY_DIRECTION[dir]; + /** * Calculates the roofline for a given set of points. 
*/ diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts index f0f1ef5b..da81ca0e 100644 --- a/packages/app/src/lib/compare-pair-defaults.test.ts +++ b/packages/app/src/lib/compare-pair-defaults.test.ts @@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults'; function makeRow(overrides: Partial): BenchmarkRow { return { + id: 1, hardware: 'h100', framework: 'sglang', model: 'dsr1', @@ -30,6 +31,8 @@ function makeRow(overrides: Partial): BenchmarkRow { metrics: { tput_per_gpu: 100 }, date: '2026-01-01', run_url: null, + benchmark_type: 'single_turn', + offload_mode: 'off', ...overrides, }; } diff --git a/packages/app/src/lib/compare-pair-defaults.ts b/packages/app/src/lib/compare-pair-defaults.ts index be6450ad..f5a37e1f 100644 --- a/packages/app/src/lib/compare-pair-defaults.ts +++ b/packages/app/src/lib/compare-pair-defaults.ts @@ -14,6 +14,7 @@ export function pickPairDefaults( const seenB = new Map>(); for (const row of rows) { if (row.hardware !== a && row.hardware !== b) continue; + if (row.isl === null || row.osl === null) continue; const seq = islOslToSequence(row.isl, row.osl); if (!seq) continue; const key = `${seq}|${row.precision}`; diff --git a/packages/app/src/lib/compare-ssr.test.ts b/packages/app/src/lib/compare-ssr.test.ts index 5f2828ea..4bf99f89 100644 --- a/packages/app/src/lib/compare-ssr.test.ts +++ b/packages/app/src/lib/compare-ssr.test.ts @@ -4,8 +4,13 @@ import type { BenchmarkRow } from '@/lib/api'; import { computeCompareImageRows } from './compare-ssr'; +// BenchmarkRow.id is required (stable per-point id from benchmark_results); +// hand out a fresh one per stub so id-keyed logic can't collide across rows. 
+let nextStubId = 1; + function stubRow(overrides: Partial = {}): BenchmarkRow { return { + id: nextStubId++, hardware: 'h200', framework: 'sglang', model: 'dsr1', @@ -23,6 +28,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 128, diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts index debbb788..8b691ee4 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts @@ -4,7 +4,7 @@ import { describe, expect, it } from 'vitest'; import type { ShapeKey } from '@/lib/chart-rendering'; -import { renderScatterPoints, syncPointShape } from './scatter-points'; +import { computeTooltipPosition, renderScatterPoints, syncPointShape } from './scatter-points'; interface TestPoint { hwKey: string; @@ -163,3 +163,51 @@ describe('syncPointShape', () => { expect(g.selectAll('.visible-shape').size()).toBe(1); }); }); + +describe('computeTooltipPosition', () => { + it('keeps a tall pinned tooltip inside the visible viewport', () => { + const tooltipNode = document.createElement('div'); + document.body.append(tooltipNode); + Object.defineProperty(tooltipNode, 'getBoundingClientRect', { + value: () => ({ + width: 300, + height: 400, + left: 0, + top: 0, + right: 300, + bottom: 400, + x: 0, + y: 0, + toJSON: () => ({}), + }), + }); + + const container = document.createElement('div'); + Object.defineProperties(container, { + clientWidth: { value: 800 }, + clientHeight: { value: 600 }, + getBoundingClientRect: { + value: () => ({ + width: 800, + height: 600, + left: 100, + top: 600, + right: 900, + bottom: 1200, + x: 100, + y: 600, + toJSON: () => ({}), + }), + }, + }); + Object.defineProperties(document.documentElement, { + clientWidth: { configurable: true, value: 1280 
}, + clientHeight: { configurable: true, value: 720 }, + }); + + expect(computeTooltipPosition(450, 100, d3.select(tooltipNode), container)).toEqual({ + left: 560, + top: 316, + }); + }); +}); diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 0c316366..433ed6d1 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -107,17 +107,33 @@ export function renderScatterPoints { + text + .append('tspan') + .attr('x', 0) + .attr('dy', i === 0 ? `${firstDy}em` : '1.1em') + .text(line); + }); + }); } // Exit: remove stale points @@ -150,20 +166,32 @@ export function renderScatterPoints('.point-label') + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .selectAll('.point-label') .data([true]) .join('text') .attr('class', 'point-label') - .attr('dy', -8) .attr('text-anchor', 'middle') .attr('fill', config.foreground!) .attr('font-size', '10px') - .attr('pointer-events', 'none') - .text(config.getLabelText!(d)); + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + const firstDy = -(0.8 + (lines.length - 1) * 1.1); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) + .text((l) => l); }); } else { points.selectAll('.point-label').remove(); @@ -283,7 +311,22 @@ export function attachScatterTooltipHandlers< }); } -/** Compute tooltip left/top, flipping when it would overflow the chart container. */ +/** + * Compute tooltip left/top **in viewport coordinates** so the tooltip can be + * rendered via portal with `position: fixed`. Callers still pass cursor coords + * relative to `container` (matching `d3.pointer(event, container)`). + * + * Why viewport coords: the chart cards use `backdrop-filter`, which creates + * a stacking context. 
A tooltip painted inside the upper card's stacking + * context cannot rise above the lower card's stacking context regardless of + * its z-index. Portalling to document.body + `position: fixed` sidesteps the + * whole problem; we just need the coordinates in viewport space. + * + * Strategy: pick preferred side (right/below cursor), flip if it overflows the + * container, then clamp the final fixed coordinates to the viewport. The + * viewport clamp matters when a chart continues below the fold: container- + * local coordinates can otherwise place a pinned tooltip's actions offscreen. + */ export function computeTooltipPosition( mx: number, my: number, @@ -302,11 +345,27 @@ export function computeTooltipPosition( // Force reflow so we get real dimensions const tw = node.getBoundingClientRect().width || node.offsetWidth; const th = node.getBoundingClientRect().height || node.offsetHeight; + const rect = container.getBoundingClientRect(); const cw = container.clientWidth; const ch = container.clientHeight; + const EDGE_PAD = 4; + + // Prefer right of cursor; flip to left if no room. + let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw; + left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left)); + + // Prefer below cursor; flip above if no room. + let top = my + offset + th <= ch ? my + offset : my - offset - th; + top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top)); - const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset; - const top = my + offset + th > ch ? my - offset - th : my + offset; + // Convert container-local coords → viewport coords for `position: fixed`, + // then keep the complete tooltip visible when its dimensions permit it. 
+ const viewportWidth = document.documentElement.clientWidth || window.innerWidth; + const viewportHeight = document.documentElement.clientHeight || window.innerHeight; + left += rect.left; + top += rect.top; + left = Math.max(EDGE_PAD, Math.min(viewportWidth - tw - EDGE_PAD, left)); + top = Math.max(EDGE_PAD, Math.min(viewportHeight - th - EDGE_PAD, top)); return { left, top }; } diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 62208aa7..e217afbd 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -180,17 +180,73 @@ export enum Sequence { OneK_OneK = '1k/1k', OneK_EightK = '1k/8k', EightK_OneK = '8k/1k', + AgenticTraces = 'agentic-traces', } -const SEQUENCE_CONFIG: Record = - { - [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' }, - [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' }, - [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' }, - }; +/** + * Top-level scenario kind. Fixed-seq sequences cluster under a single group + * in the selector; agentic traces sit alongside as their own kind. + */ +export type ScenarioKind = 'fixed-seq' | 'agentic'; + +export function sequenceKind(seq: Sequence): ScenarioKind { + return seq === Sequence.AgenticTraces ? 
'agentic' : 'fixed-seq'; +} + +const SEQUENCE_CONFIG: Record< + Sequence, + { label: string; compact: string; category: CategoryTag; kind: ScenarioKind } +> = { + [Sequence.OneK_OneK]: { + label: '1K / 1K', + compact: '1k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.OneK_EightK]: { + label: '1K / 8K', + compact: '1k8k', + category: 'deprecated', + kind: 'fixed-seq', + }, + [Sequence.EightK_OneK]: { + label: '8K / 1K', + compact: '8k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.AgenticTraces]: { + label: 'Agentic Traces', + compact: 'agentic', + category: 'default', + kind: 'agentic', + }, +}; export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; +/** + * Percentile of the latency distribution used for the chart x-axis when + * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9 + * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl); + * p75 and p90 are surfaced in the UI. + */ +export enum Percentile { + P75 = 'p75', + P90 = 'p90', +} + +const PERCENTILE_CONFIG: Record = { + [Percentile.P75]: { label: 'p75' }, + [Percentile.P90]: { label: 'p90' }, +}; + +export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; + +export function getPercentileLabel(p: Percentile): string { + return PERCENTILE_CONFIG[p]?.label ?? 
p; +} + export const DEPRECATED_SEQUENCES: ReadonlySet = new Set( (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][]) .filter(([, c]) => c.category === 'deprecated') diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts index 28cc1e36..2f5844c1 100644 --- a/packages/app/src/lib/energy-metrics.test.ts +++ b/packages/app/src/lib/energy-metrics.test.ts @@ -57,23 +57,43 @@ function makeEntry(overrides: Partial = {}): AggDataEntry { mean_ttft: 0.5, median_ttft: 0.4, std_ttft: 0.1, + p75_ttft: 0.65, + p90_ttft: 0.7, + p95_ttft: 0.75, p99_ttft: 0.8, + 'p99.9_ttft': 0.9, mean_tpot: 0.02, mean_intvty: 45, median_tpot: 0.02, median_intvty: 44, std_tpot: 0.005, std_intvty: 5, + p75_tpot: 0.022, + p75_intvty: 50, + p90_tpot: 0.025, + p90_intvty: 55, + p95_tpot: 0.028, + p95_intvty: 58, p99_tpot: 0.03, p99_intvty: 60, + 'p99.9_tpot': 0.035, + 'p99.9_intvty': 65, mean_itl: 0.01, median_itl: 0.01, std_itl: 0.002, + p75_itl: 0.012, + p90_itl: 0.013, + p95_itl: 0.014, p99_itl: 0.015, + 'p99.9_itl': 0.018, mean_e2el: 5, median_e2el: 4.8, std_e2el: 0.5, + p75_e2el: 5.2, + p90_e2el: 5.5, + p95_e2el: 5.8, p99_e2el: 6, + 'p99.9_e2el': 6.5, disagg: false, num_prefill_gpu: 0, num_decode_gpu: 0, diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts index c78bf588..1c8cab81 100644 --- a/packages/app/src/lib/url-state.ts +++ b/packages/app/src/lib/url-state.ts @@ -22,8 +22,10 @@ const URL_STATE_KEYS = [ 'i_seq', 'i_prec', 'i_metric', + 'i_pctl', 'i_xmetric', 'i_e2e_xmetric', + 'i_xmode', 'i_scale', 'i_gpus', 'i_dates', @@ -78,8 +80,10 @@ export const PARAM_DEFAULTS: Record = { // "default") or it would silently revert to the per-model auto default on reload. 
i_prec: '', i_metric: 'y_tpPerGpu', - i_xmetric: 'p99_ttft', - i_e2e_xmetric: '', + i_pctl: 'p90', + i_xmetric: 'p90_ttft', + i_e2e_xmetric: 'p90_ttft', + i_xmode: '', i_scale: 'auto', i_gpus: '', i_dates: '', From 760026f5837960cfa9b9ecc2bd5333c109a2e306 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:12:20 -0500 Subject: [PATCH 10/40] test: agentic e2e and component coverage; subsystem docs --- docs/data-pipeline.md | 12 + .../kv-cache-hit-rate-anomaly.md | 113 +++++++ .../app/cypress/component/dataset-list.cy.tsx | 93 +++++ .../component/distribution-card.cy.tsx | 82 +++++ .../component/inference-chart-controls.cy.tsx | 4 +- .../cypress/component/trace-flamegraph.cy.tsx | 86 +++++ .../e2e/agentic-point-time-series.cy.ts | 320 ++++++++++++++++++ .../cypress/e2e/datasets-distributions.cy.ts | 133 ++++++++ .../e2e/datasets-flamegraph-time.cy.ts | 127 +++++++ .../app/cypress/e2e/dropdown-switching.cy.ts | 4 +- .../e2e/gpu-compare-agentic-detail.cy.ts | 54 +++ .../app/cypress/e2e/gradient-labels.cy.ts | 16 +- .../app/cypress/e2e/historical-trends.cy.ts | 4 +- packages/app/cypress/e2e/line-labels.cy.ts | 31 +- .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 108 ++++-- packages/app/cypress/e2e/url-params.cy.ts | 24 +- packages/app/cypress/support/mock-data.ts | 4 + 17 files changed, 1148 insertions(+), 67 deletions(-) create mode 100644 docs/investigations/kv-cache-hit-rate-anomaly.md create mode 100644 packages/app/cypress/component/dataset-list.cy.tsx create mode 100644 packages/app/cypress/component/distribution-card.cy.tsx create mode 100644 packages/app/cypress/component/trace-flamegraph.cy.tsx create mode 100644 packages/app/cypress/e2e/agentic-point-time-series.cy.ts create mode 100644 packages/app/cypress/e2e/datasets-distributions.cy.ts create mode 100644 packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts create mode 100644 packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts diff --git a/docs/data-pipeline.md 
b/docs/data-pipeline.md index 38e7d471..bc439e8a 100644 --- a/docs/data-pipeline.md +++ b/docs/data-pipeline.md @@ -62,6 +62,18 @@ Configs are preloaded into an in-memory Map at ingest start. `getOrCreateConfig( Unmapped models/hardware are tracked (not silently dropped) so operators can see what new GPU or model names appeared in CI artifacts. This is how new GPUs get added to the system — the skip tracker acts as a change detection mechanism. +### Server-Metric Orchestrator Adapters + +AIPerf defines the `server_metrics_export.json` envelope, but labels such as worker role and rank belong to the serving orchestrator. The chart-series ETL therefore normalizes raw series through an orchestrator-specific adapter before exposing per-worker metrics. For example, the Dynamo adapter maps `dynamo_component=prefill|backend` to canonical `prefill|decode` roles and uses the endpoint, worker ID, DP rank, and engine together as the source identity. + +Adapters are selected from the benchmark's canonical framework, and per-worker series are only emitted for disaggregated configs with a recognized adapter. Unknown orchestrators and non-disaggregated configs retain their aggregate-only series; roles are never guessed from ports or metric names. The frontend only consumes the canonical source identity and never interprets orchestrator-native labels. + +### Agentic Dataset Provenance + +AIPerf exports public-dataset provenance in `metadata.dataset`, including the Hugging Face dataset ID. InferenceX preserves that object as `dataset` on each agentic aggregate benchmark row. During benchmark ingest, `ingest-ci-run.ts` derives the dashboard slug from `hf_dataset_name` (for example, `semianalysisai/cc-traces-weka-062126` becomes `cc-traces-weka-062126`) and upserts `run_datasets` for the workflow run. + +Legacy artifacts without provenance leave any existing mapping untouched. 
A workflow run can map to only one dataset; conflicting dataset IDs fail ingest rather than silently linking the run to an arbitrary dataset. + ## Frontend Transform Pipeline ### Why transformBenchmarkRows Exists diff --git a/docs/investigations/kv-cache-hit-rate-anomaly.md b/docs/investigations/kv-cache-hit-rate-anomaly.md new file mode 100644 index 00000000..61ffee42 --- /dev/null +++ b/docs/investigations/kv-cache-hit-rate-anomaly.md @@ -0,0 +1,113 @@ +# KV cache hit-rate anomaly on agentic benchmarks (dsv4, b200, vllm) + +## Core issue + +vLLM's prefix cache should be hitting at ~98% on multi-turn agentic conversation replay (each turn extends the prior turn's context). It isn't. Something in the **dataset definition** or **aiperf replay** is producing requests whose token streams aren't actually prefix-compatible turn-to-turn. + +| Concurrency | Theoretical max hit % | vLLM actual hit % | +| ----------: | --------------------: | ----------------: | +| 1 | 97.45% | 83.15% | +| 2 | 98.34% | 46.78% | +| 4 | 97.99% | 12.43% | + +This is **not** a capacity problem. KV cache is sized at 3.29M tokens (12,868 blocks × 256). The conc=4 workload's unique-content footprint is **~1.11M DSV4 tokens** — would fit in ~34% util. Observed peak util is 49.8%, so the cache is holding more blocks than the workload needs, yet vLLM can't find them on lookup. + +## Data sources + +- **Benchmark points**: + - http://localhost:3002/inference/agentic/206252 (conc=1) + - http://localhost:3002/inference/agentic/206245 (conc=2) + - http://localhost:3002/inference/agentic/206247 (conc=4) +- **Neon DB**: project `silent-pond-29172997`, branch `br-cold-sky-ai0c09cy` (agentx-dev). Connection via `DATABASE_WRITE_URL` in `.env`. 
Console: https://console.neon.tech/app/projects/silent-pond-29172997/branches/br-cold-sky-ai0c09cy + - `agentic_trace_replay.profile_export_jsonl_gz` — gzipped aiperf per-request records + - `agentic_trace_replay.server_metrics_json_gz` — gzipped vllm per-scrape prometheus metrics + - `agentic_trace_replay.request_timeline` (jsonb) — pre-computed per-request timeline used by the simulation +- **Trace replay dataset** (the source-of-truth for "what should be cacheable"): https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-051926. Each row has pre-computed 64-token block `hash_ids` per turn; `hash_id_scope: 'local'` (per-conversation). + +## Theoretical max simulation + +For each replayed request, look up the matching turn in the HF dataset and walk a per-conversation trie of 64-token block hash IDs. Hits = longest contiguous prefix from block 0 that has appeared in any prior request (mirrors vLLM's chained-hash semantics). + +Confirms: the workload IS prefix-cacheable end-to-end. Theoretical max ≈ 98% across all three concurrency levels — same dataset, same conversations, just different dispatch order. + +## Why this points at the dataset/replay, not vLLM + +- **Capacity is not the bottleneck.** Cache holds ~3× the unique content of the workload. Cache util tops out below capacity. +- **The metric isn't lying.** vLLM's own counters cross-check: `prefill_kv_computed_tokens + prefix_cache_hits ≈ request_prompt_tokens` (67.85M + 9.61M ≈ 77.47M for conc=4). +- **It's not a tokenizer artifact.** DSV4 tokens are ~54% the count of Claude tokens, but BPE is left-monotonic on stable text — hit-rate ratio is invariant to tokenizer choice for prefix-growth workloads. +- **It's not the multi-engine DP bug** we found earlier (commit `f2618f4`) — this deployment has 1 engine. + +What's left: the bytes that vLLM actually receives turn-to-turn are not the same prefix + delta that the dataset's `hash_ids` describe. Most likely culprits: + +1. 
**aiperf isn't sending the cumulative chat history** the way the dataset assumes — each turn is being assembled differently than the previous, breaking the byte-level prefix. +2. **Something in the request payload varies per request** (timestamps, request IDs, tool result serialization order, etc.) — invalidates block 0's hash, cascades to every subsequent block via vLLM's chained hashing. +3. **BPE re-merging across message boundaries** when aiperf re-tokenizes the full history each turn instead of appending tokens. + +## Root cause: `ConversationReconstructor` strips the prev user's `partial_tail` every turn + +The bug is in `utils/aiperf/src/aiperf/dataset/loader/weka_synth_buf.py` — specifically the **boundary case** in `truncate_synth_buf_at_block` (lines 453–464) combined with `turn_delta`'s reset logic (lines 354–360). + +What happens turn-to-turn: + +1. `init_turn_0` builds a trailing user segment whose `tokens` = `[block_aligned_tokens] + [partial_tail_tokens]` where `partial_tail_n = in_tokens % bs`. The wire prompt for turn 0 includes these tail tokens. +2. `advance_turn` computes `lcp = longest_common_prefix(prev_hash_ids, curr_hash_ids)`. When the LCP equals the prev turn's total block count (the normal append-only case), `truncate_synth_buf_at_block` hits its boundary branch: `cursor + seg.block_count == target_blocks`. +3. That branch **strips `prev_partial_tail` tokens off the trailing user segment in place** and re-decodes its `content`. This sets `_last_disturbance_at = i` (the index of the prev trailing user segment). +4. New `assistant` + `user` segments are appended. +5. `turn_delta` sees `_last_disturbance_at < _emitted_segment_count` and forces `reset_context=True`, re-emitting **the whole conversation** with the now-stripped trailing user. + +The endpoint (`utils/aiperf/src/aiperf/endpoints/base_endpoint.py:110-140`) honors `reset_context=True` via `messages = list(turn.raw_messages)` instead of `messages.extend(...)`. 
+ +Result: every turn sends the full chat history, but the bytes of the prev user message differ from what was sent the turn before — the trailing `partial_tail` chars are missing. vLLM tokenizes the new prompt, hashes 256-token blocks, and the chained-hash invariant breaks at the first block containing the trimmed boundary. That block + every subsequent block of the new turn miss the cache. + +### Empirical confirmation + +Reproducer at `/tmp/test-reconstructor.py` instantiates `ConversationReconstructor` with mock decoders and walks a synthetic 3-turn conversation: + +``` +=== Turn 0 === + delta msgs: 2, reset=False + wire len: 21683 + +=== Turn 1 === + delta msgs: 4, reset=True ← every turn resets + wire len: 25307 + +=== DIFF turn 0 vs turn 1 (wire-level) === + common prefix chars: 21549 / wire0 21683 (99.4%) + wire0[...] = '... 983406 12 1 133 184 16 57 71 155 37 ' ← partial_tail decoded + wire1[...] = '... 983406<|im_end|>\n<|im_start|>assista' ← stripped, template marker next + turn0 user content len: 19812, turn1 user[0] content len: 19711 ← 101 chars stripped +``` + +Across the conc=1 run (point 206252), **280/280 (100%)** consecutive turn-pairs have `prev_in_tokens % bs != 0` — i.e., every single turn hits this boundary disturbance. + +### Why the gap widens with concurrency + +At conc=1 the gap (97.45% − 83.15% = 14pp) is roughly the fraction of each turn's blocks lost to the trimmed-tail invalidation (last user block + chat-template delta). At higher conc: + +- `reset_context=True` makes every request re-send the **entire** conversation prompt, so wire bandwidth + prefill work scale superlinearly per turn. +- Concurrent conversations all do this simultaneously; each writes long sequences of "new" blocks past their respective divergence points, evicting other conversations' usable prefix blocks even though aggregate unique content (1.11M tokens) fits comfortably in the 3.29M-token cache. 
+ +### Fix sketch + +The boundary-cut strip exists to keep the next turn's `assistant` segment block-aligned. Two viable fixes: + +1. **Don't mutate the prev trailing user segment.** Leave its `partial_tail` tokens intact; append the new asst+user as strict-append (no reset_context). The wire-prefix becomes byte-stable turn-to-turn. Cost: the new asst content's block_start no longer aligns to the prev_hash_ids tail, so hash_id accounting for asst blocks loses 1 block of fidelity per turn. +2. **Track `partial_tail` separately** from the prev user segment so the segment's emitted content stays byte-stable, and only the trailing tail (which is regenerated each turn anyway) is allowed to vary. + +Option 1 is the minimal change. Validate with the reproducer above — remove the strip in `truncate_synth_buf_at_block`'s boundary case and re-run; turn N+1's wire prefix should equal turn N's wire byte-for-byte up to the end of the prev assistant template. + +## Re-running the simulation + +```bash +# 1. dump request timelines from DB +pnpm --filter @semianalysisai/inferencex-db exec dotenv -e ../../.env -- tsx /tmp/dump-rt-multi.ts + +# 2. run analysis (needs `pip3 install --break-system-packages --user datasets`) +python3 /tmp/cache-sim-multi.py + +# 3. reproduce the partial_tail strip +python3 /tmp/test-reconstructor.py +``` + +Scripts live in `/tmp/` from this session; recreate from inline code in the previous version of this doc if missing. 
diff --git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx new file mode 100644 index 00000000..f7cfcb9a --- /dev/null +++ b/packages/app/cypress/component/dataset-list.cy.tsx @@ -0,0 +1,93 @@ +import { QueryClient, QueryClientProvider } from '@tanstack/react-query'; +import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime'; + +import { DatasetList } from '@/components/datasets/dataset-list'; +import type { DatasetRecord } from '@/hooks/api/use-datasets'; + +const datasets: DatasetRecord[] = [ + { + id: 'ds-1', + slug: 'cc-traces-weka-full', + label: 'cc-traces-weka (full)', + variant: 'full', + description: 'Every captured request, unmodified.', + hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full', + license: 'apache-2.0', + conversation_count: 1234, + summary: { + totalIn: 5_000_000, + totalOut: 250_000, + cachedPct: 0.82, + mainTurns: 9800, + subagentGroups: 540, + }, + ingested_at: '2026-06-20T00:00:00Z', + }, + { + id: 'ds-2', + slug: 'cc-traces-weka-256k', + label: 'cc-traces-weka (256k)', + variant: '256k', + description: 'Turns trimmed to a 256k context window.', + hf_url: null, + license: 'apache-2.0', + conversation_count: 980, + summary: { + totalIn: 3_200_000, + totalOut: 180_000, + cachedPct: 0.79, + mainTurns: 7600, + subagentGroups: 410, + }, + ingested_at: '2026-06-19T00:00:00Z', + }, +]; + +function createMockRouter() { + return { + push: cy.stub(), + replace: cy.stub(), + refresh: cy.stub(), + back: cy.stub(), + forward: cy.stub(), + prefetch: cy.stub().resolves(), + }; +} + +function mountList() { + const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } }); + cy.mount( + + + + + , + ); +} + +describe('DatasetList', () => { + it('renders a card per dataset with its summary stats', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list'); + mountList(); + 
cy.wait('@list'); + cy.contains('cc-traces-weka (full)').should('be.visible'); + cy.contains('cc-traces-weka (256k)').should('be.visible'); + cy.contains('1,234').should('be.visible'); // conversation_count, localized + cy.contains('82%').should('be.visible'); // cachedPct + cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist'); + }); + + it('shows the empty state when no datasets are ingested', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty'); + mountList(); + cy.wait('@empty'); + cy.contains('No datasets ingested yet.').should('be.visible'); + }); + + it('shows the error state when the request fails', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err'); + mountList(); + cy.wait('@err'); + cy.contains('Failed to load datasets.').should('be.visible'); + }); +}); diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx new file mode 100644 index 00000000..511505b9 --- /dev/null +++ b/packages/app/cypress/component/distribution-card.cy.tsx @@ -0,0 +1,82 @@ +import { DistributionCard } from '@/components/datasets/distribution-card'; +import type { Distribution } from '@/hooks/api/use-datasets'; + +const distribution: Distribution = { + bins: [ + { x0: 0, x1: 100, count: 5 }, + { x0: 100, x1: 200, count: 20 }, + { x0: 200, x1: 300, count: 12 }, + { x0: 300, x1: 400, count: 3 }, + ], + stats: { + count: 40, + min: 10, + max: 390, + mean: 180, + median: 175, + p75: 250, + p90: 320, + p95: 360, + }, +}; + +describe('DistributionCard', () => { + it('renders the title, summary stats, and one bar per bin', () => { + cy.mount( + , + ); + cy.contains('Input tokens per turn').should('be.visible'); + cy.contains('n=40').should('be.visible'); + cy.contains('p50 175').should('be.visible'); + cy.contains('p75 250').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + cy.contains('p95 
360').should('be.visible'); + cy.get( + 'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]', + ).should('have.length', 8); + // One filled bar rect per bin (ChartHover may add a transparent overlay rect). + cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length); + }); + + it('shows a "No data" placeholder when no distribution is provided', () => { + cy.mount(); + cy.contains('Empty metric').should('be.visible'); + cy.contains('No data').should('be.visible'); + cy.get('rect[class*="fill-primary"]').should('not.exist'); + }); + + it('marks the chart as log scale when scale="log"', () => { + cy.mount( + , + ); + cy.contains('log scale').should('be.visible'); + }); + + it('renders older v1 stats without unavailable percentile guides', () => { + cy.mount( + , + ); + cy.contains('p50 175').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + cy.contains('NaN').should('not.exist'); + }); +}); diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx index 03e6a50c..5a6311f4 100644 --- a/packages/app/cypress/component/inference-chart-controls.cy.tsx +++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx @@ -14,8 +14,8 @@ describe('Inference ChartControls', () => { it('renders the sequence selector with the current sequence', () => { // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K" - cy.get('#sequence-select').should('be.visible'); - cy.get('#sequence-select').should('contain.text', '8K / 1K'); + cy.get('#scenario-select').should('be.visible'); + cy.get('#scenario-select').should('contain.text', '8K / 1K'); }); it('renders the precision multi-select with the current precision', () => { diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx new file mode 100644 index 00000000..1be90e0c --- 
/dev/null +++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx @@ -0,0 +1,86 @@ +import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; +import type { ConversationStructure } from '@/hooks/api/use-datasets'; + +// Two main turns followed by one subagent group with two child turns. +// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`). +const structure: ConversationStructure = { + blockSize: 64, + nodes: [ + { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2000, + out: 300, + cached: 1500, + uncached: 500, + }, + { + kind: 'subagent', + label: 'Subagent: search', + agentId: 'agent-1', + durationMs: 12000, + in: 5000, + out: 800, + cached: 3000, + uncached: 2000, + children: [ + { + kind: 'turn', + turnIndex: 0, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + ], + }, + ], + totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 }, +}; + +describe('TraceFlamegraph', () => { + it('renders the legend, main-turn rows, and the subagent group header', () => { + cy.mount(); + cy.contains('Cached prefix').should('be.visible'); + cy.contains('Uncached input').should('be.visible'); + cy.contains('Output').should('be.visible'); + cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1'); + cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2'); + cy.contains('Subagent: search').should('be.visible'); + }); + + it('keeps subagent children collapsed until the group is expanded', () => { + cy.mount(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + cy.contains('button', 'Subagent: search').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + 
cy.get('[data-rowkey="g-2-c-1"]').should('be.visible'); + }); + + it('expand all / collapse all toggles every subagent group', () => { + cy.mount(); + cy.contains('button', 'Expand all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.contains('button', 'Collapse all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + }); + + it('auto-expands and highlights the target group child for a request-timeline deep link', () => { + cy.mount( + , + ); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary'); + }); +}); diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts new file mode 100644 index 00000000..4a450f7c --- /dev/null +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -0,0 +1,320 @@ +const timelineRequest = ( + index: number, + ttftMs: number, + tpotMs: number, + overrides: Record = {}, +) => ({ + cid: 'conversation-1', + ti: index, + wid: 'worker-1', + ad: 0, + phase: 'profiling', + credit: index * 1_000_000_000, + start: index * 1_000_000_000, + ack: null, + end: (index + 1) * 1_000_000_000, + ttftMs, + tpotMs, + isl: 1024, + osl: 128, + cancelled: false, + ...overrides, +}); + +describe('Agentic point request metric time series', () => { + before(() => { + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/request-timeline*', { + body: { + version: 3, + startNs: 0, + endNs: 7_000_000_000, + durationS: 7, + requests: [ + timelineRequest(0, 100, 10), + timelineRequest(1, 200, 20), + timelineRequest(2, 400, 25), + timelineRequest(3, 800, 40), + timelineRequest(4, 1600, 80), + timelineRequest(5, 3200, 160, { phase: 'warmup' }), + timelineRequest(6, 6400, 320, { cancelled: true }), + timelineRequest(7, 0, 0, { + 
cid: 'conversation-1::sa:subagent_001_abcd', + credit: 1_100_000_000, + start: 1_100_000_000, + end: 1_900_000_000, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + }), + timelineRequest(8, 0, 0, { + cid: 'conversation-1::sa:subagent_001_abcd:aux:011', + credit: 1_200_000_000, + start: 1_200_000_000, + end: 1_800_000_000, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + }), + ], + }, + }); + cy.visit('/inference/agentic/206885'); + }); + + it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('h2', 'Interactivity over time').should('be.visible'); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P90'); + cy.get('[data-testid="interactivity-point-count"]').should('have.text', '5 points'); + cy.get('svg circle').should('have.length', 5); + cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.contains('h2', 'TTFT over time').should('be.visible'); + cy.get('[data-testid="ttft-point-count"]').should('have.text', '5 points'); + cy.get('svg circle').should('have.length', 5); + cy.get('svg').should('contain.text', 'TTFT (s)'); + cy.get('svg').should('contain.text', 'Cumulative P90 TTFT'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + }); + + it('switches ISL and OSL cards from distributions to in-flight averages', () => { + cy.get('[data-testid="isl-metric-chart"]').within(() => { + cy.get('[data-testid="isl-metric-inflight"]').click(); + cy.contains('h2', 'Average ISL in flight').should('be.visible'); + cy.get('svg').should('contain.text', 'Average ISL in flight (30s avg)'); + }); + 
cy.get('[data-testid="osl-metric-chart"]').within(() => { + cy.get('[data-testid="osl-metric-inflight"]').click(); + cy.contains('h2', 'Average OSL in flight').should('be.visible'); + cy.contains('Retrospective: final observed OSL').should('be.visible'); + cy.get('svg').should('contain.text', 'Average OSL in flight (30s avg)'); + }); + }); + + it('switches the TTFT chart to E2E request latency over time', () => { + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="latency-metric-e2e"]').click(); + cy.contains('h2', 'E2E latency over time').should('be.visible'); + cy.get('[data-testid="e2e-point-count"]').should('have.text', '7 points'); + cy.get('svg circle').should('have.length', 7); + cy.get('svg').should('contain.text', 'E2E latency (s)'); + cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency'); + + cy.get('[data-testid="latency-metric-ttft"]').click(); + cy.contains('h2', 'TTFT over time').should('be.visible'); + }); + }); + + it('switches each chart independently from P90 to P75', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('svg', 'P90 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .as('p90Path'); + cy.contains('button', 'P75').click(); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P75'); + cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT'); + cy.contains('svg', 'P75 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .then(function (p75Path) { + expect(p75Path).not.to.equal(this.p90Path); + }); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="ttft-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P90'); + cy.contains('button', 'P75').click(); + cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); + 
cy.get('svg').should('contain.text', 'Cumulative P75 TTFT'); + }); + }); + + it('switches the request activity card from queue depth to cumulative completions', () => { + cy.get('[data-testid="request-activity-chart"]').within(() => { + cy.contains('h2', 'Request queue depth').should('be.visible'); + cy.get('[data-testid="request-activity-completed"]').click(); + cy.contains('h2', 'Cumulative completed requests').should('be.visible'); + cy.get('svg').should('contain.text', 'Completed requests'); + cy.get('svg').should('contain.text', 'Requests'); + cy.get('[data-testid="request-activity-queue"]').click(); + cy.contains('h2', 'Request queue depth').should('be.visible'); + }); + }); + + it('shows total time with no requests in flight on the request timeline', () => { + cy.get('[data-testid="detail-view-timeline"]').click(); + cy.location('search').should('contain', 'view=timeline'); + cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 1.00s (14.3%)'); + cy.get('[data-timeline-row-kind="aux"]') + .should('have.css', 'padding-left', '24px') + .and('contain.text', 'aux 011 · parallel'); + }); + + it('restores the request timeline view after browser Back from a dataset route', () => { + cy.window().then((win) => { + win.history.pushState({}, '', '/datasets/test-dataset/conversations/conversation-1'); + }); + cy.go('back'); + cy.location('pathname').should('eq', '/inference/agentic/206885'); + cy.location('search').should('contain', 'view=timeline'); + cy.get('[data-testid="detail-view-timeline"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="timeline-total-idle-time"]').should('be.visible'); + }); + + it('shows a cumulative average for unique input tokens in flight', () => { + cy.get('[data-testid="detail-view-point"]').click(); + cy.get('[data-testid="unique-input-inflight-chart"]').within(() => { + cy.get('svg').should('contain.text', 'Cumulative average'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 
1); + }); + }); +}); + +const pointMeta = { + id: 206885, + hardware: 'gb200', + framework: 'dynamo-vllm', + model: 'deepseek-r1-0528', + precision: 'fp8', + spec_method: 'none', + disagg: true, + conc: 128, + offload_mode: 'off', + isl: null, + osl: null, + benchmark_type: 'agentic_traces', + date: '2026-06-23', + run_url: null, + server_gpu_cache_hit_rate: 0.5, + server_cpu_cache_hit_rate: null, +}; + +const sourceSeries = (source: Record, prompt: number, generation: number) => ({ + source, + kvCacheUsage: [ + { t: 0, value: 0.25 }, + { t: 1, value: 0.5 }, + ], + prefixCacheHitRate: [{ t: 0, value: 0.5 }], + queueDepth: [{ t: 0, running: 2, waiting: 1, total: 3 }], + promptTokensBySource: { miss: [{ t: 0, value: prompt }] }, + promptTps: [{ t: 0, value: prompt }], + generationTps: [{ t: 0, value: generation }], + prefixCacheHitsTps: [{ t: 0, value: prompt / 2 }], + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], +}); + +describe('Agentic point orchestrator metric sources', () => { + beforeEach(() => { + const prefill = sourceSeries( + { + id: 'dynamo|prefill|10.30.1.56:7500|prefill-a|0|0', + adapter: 'dynamo', + role: 'prefill', + endpointUrl: '10.30.1.56:7500', + nativeRole: 'prefill', + workerId: 'prefill-a', + dpRank: '0', + engine: '0', + }, + 100, + 1, + ); + const decode = sourceSeries( + { + id: 'dynamo|decode|10.30.1.206:7516|decode-a|0|0', + adapter: 'dynamo', + role: 'decode', + endpointUrl: '10.30.1.206:7516', + nativeRole: 'backend', + workerId: 'decode-a', + dpRank: '0', + engine: '0', + }, + 300, + 400, + ); + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/request-timeline*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { + body: { + meta: pointMeta, + startNs: 0, + endNs: 2_000_000_000, + durationS: 2, + timeslicesCount: 2, + kvCacheUsage: prefill.kvCacheUsage, + prefixCacheHitRate: 
prefill.prefixCacheHitRate, + queueDepth: prefill.queueDepth, + promptTokensBySource: prefill.promptTokensBySource, + prefillTps: prefill.promptTps, + decodeTps: decode.generationTps, + prefixCacheHitsTps: prefill.prefixCacheHitsTps, + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], + metricSources: [prefill, decode], + }, + }); + cy.visit('/inference/agentic/206885'); + }); + + it('switches every server chart to an orchestrator-normalized worker', () => { + cy.get('[data-testid="metric-source-toolbar"]') + .should('have.css', 'position', 'sticky') + .and('have.css', 'top', '64px'); + cy.get('[data-testid="metric-source-select"]').should('contain.text', 'All endpoints').click(); + cy.contains('[role="option"]', 'Decode · decode-a').click(); + + cy.get('[data-testid="metric-source-select"]').should('contain.text', 'Decode · decode-a'); + cy.contains('h2', 'Throughput · Decode · decode-a').should('be.visible'); + cy.contains('svg', 'Decode (avg n=50)').should('be.visible'); + + cy.get('[data-testid="metric-source-select"]').click(); + cy.contains('[role="option"]', 'Prefill · prefill-a').click(); + cy.contains('h2', 'Throughput · Prefill · prefill-a').should('be.visible'); + }); + + it('toggles input and decode independently while keeping one visible', () => { + cy.get('[data-testid="throughput-series-input"]') + .should('have.attr', 'aria-pressed', 'true') + .and('not.be.disabled'); + cy.get('[data-testid="throughput-series-decode"]') + .should('have.attr', 'aria-pressed', 'true') + .and('not.be.disabled'); + cy.contains('svg', 'Input (avg n=50)').should('be.visible'); + cy.contains('svg', 'Decode (avg n=50)').should('be.visible'); + cy.contains('svg', 'Total running avg (60s burn-in)').should('be.visible'); + + cy.get('[data-testid="throughput-series-input"]').click(); + cy.get('[data-testid="throughput-series-input"]').should('have.attr', 'aria-pressed', 'false'); + cy.get('[data-testid="throughput-series-decode"]').should('be.disabled'); + cy.contains('svg', 
'Input (avg n=50)').should('not.exist'); + cy.contains('svg', 'Total running avg (60s burn-in)').should('not.exist'); + + cy.get('[data-testid="throughput-series-input"]').click(); + cy.get('[data-testid="throughput-series-decode"]').click(); + cy.get('[data-testid="throughput-series-input"]').should('be.disabled'); + cy.get('[data-testid="throughput-series-decode"]').should('have.attr', 'aria-pressed', 'false'); + }); +}); diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts new file mode 100644 index 00000000..6ce4bc34 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts @@ -0,0 +1,133 @@ +const distribution = (values: { + median: number; + p75: number; + p90: number; + p95: number; + max: number; +}) => ({ + bins: [ + { x0: 0, x1: 10, count: 5 }, + { x0: 10, x1: 100, count: 15 }, + ], + stats: { + count: 20, + min: 0, + mean: 40, + ...values, + }, +}); + +describe('Dataset distribution percentiles', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset', { + body: { + id: 'test-dataset', + slug: 'test-dataset', + label: 'Test dataset', + variant: 'full', + description: null, + hf_url: null, + license: 'apache-2.0', + conversation_count: 1, + summary: { + mainTurns: 20, + subagentGroups: 0, + subagentTurns: 0, + medianRequestsPerConversation: 12, + meanRequestsPerConversation: 14.6, + medianSubagentsPerTrace: 3, + meanSubagentsPerTrace: 4.8, + cachedPct: 0.5, + totalIn: 1000, + totalOut: 200, + }, + chart_data: { + version: 2, + inputTokensPerTurn: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), + outputTokensPerTurn: distribution({ + median: 10, + p75: 20, + p90: 30, + p95: 40, + max: 50, + }), + uncachedInputTokensPerTurn: distribution({ + median: 0, + p75: 64, + p90: 128, + p95: 256, + max: 512, + }), + subagentInputTokensPerRequest: distribution({ + median: 1000, + p75: 2000, + p90: 3000, + p95: 4000, + max: 
5000, + }), + subagentOutputTokensPerRequest: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), + }, + ingested_at: '2026-06-23T00:00:00Z', + }, + }); + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', { + body: { total: 0, items: [] }, + }); + cy.visit('/datasets/test-dataset'); + }); + + it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => { + const expected = [ + ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']], + ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']], + ] as const; + + for (const [title, percentiles] of expected) { + cy.contains('[data-slot="card"]', title).within(() => { + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + cy.get('svg line[stroke="#3b82f6"]').should('exist'); + cy.get('svg line[stroke="#22c55e"]').should('exist'); + cy.get('svg line[stroke="#f59e0b"]').should('exist'); + cy.get('svg line[stroke="#ef4444"]').should('exist'); + }); + } + }); + + it('shows median and mean model requests per conversation', () => { + cy.contains('dt', 'Median requests / convo').next('dd').should('have.text', '12'); + cy.contains('dt', 'Mean requests / convo').next('dd').should('have.text', '14.6'); + }); + + it('summarizes subagents per trace instead of charting group counts', () => { + cy.contains('dt', 'Median subagents / trace').next('dd').should('have.text', '3'); + cy.contains('dt', 'Mean subagents / trace').next('dd').should('have.text', '4.8'); + cy.contains('Subagent groups per conversation').should('not.exist'); + }); + + it('shows ISL and OSL distributions for inner subagent requests only', () => { + const expected = [ + ['Subagent request ISL', ['p50 1.0k', 'p75 2.0k', 'p90 3.0k', 'p95 4.0k']], + ['Subagent request OSL', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ] as const; + + for (const [title, percentiles] of 
expected) { + cy.contains('[data-slot="card"]', title).within(() => { + cy.contains('Inner subagent requests only').should('be.visible'); + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + }); + } + }); +}); diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts new file mode 100644 index 00000000..58d95c27 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts @@ -0,0 +1,127 @@ +describe('Dataset conversation flamegraph timing', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', { + body: { + conv_id: 'conversation-1', + models: ['model-a'], + num_turns: 2, + num_subagent_groups: 1, + total_in: 1000, + total_out: 100, + total_cached: 500, + structure: { + blockSize: 64, + totals: { + in: 1000, + out: 100, + cached: 500, + uncached: 500, + numTurns: 2, + numSubagentGroups: 1, + }, + nodes: [ + { + kind: 'turn', + turnIndex: 0, + startS: 0, + endS: 1.2, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + { + kind: 'subagent', + label: 'Explore', + agentId: 'agent-1', + startS: 3661.2, + endS: 3782.6, + durationMs: 121_400, + in: 800, + out: 80, + cached: 500, + uncached: 300, + children: [ + { + kind: 'turn', + turnIndex: 1, + startS: 3661.2, + endS: 3668.2, + model: 'model-a', + in: 300, + out: 30, + cached: 150, + uncached: 150, + }, + { + kind: 'turn', + turnIndex: 2, + startS: 3665.2, + endS: 3671.2, + model: 'model-a', + in: 300, + out: 30, + cached: 200, + uncached: 100, + }, + { + kind: 'turn', + turnIndex: 3, + startS: 3670.2, + endS: 3675.2, + model: 'model-a', + in: 200, + out: 20, + cached: 150, + uncached: 50, + }, + ], + }, + { + kind: 'turn', + turnIndex: 2, + startS: 65.4, + endS: 67.4, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + ], + }, + }, + }); + 
cy.visit('/datasets/test-dataset/conversations/conversation-1'); + }); + + it('shows turn offsets and a collapsed subagent time range', () => { + cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00–00:01'); + cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05–01:07'); + cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03'); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist'); + }); + + it('shows subturn offsets when the subagent group is expanded', () => { + cy.contains('button', 'Explore').click(); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01–1:01:08'); + // Parallel groups render as left-gutter brackets; each member row carries + // one bracket segment per group it belongs to (non-transitive chains keep + // their own segments/lanes). + cy.get('[data-testid="flamegraph-overlap-g-1-c-0"]') + .should('have.length', 1) + .and('have.attr', 'data-overlap-group', 'subagent-1-1'); + cy.get('[data-testid="flamegraph-overlap-g-1-c-1"]') + .should('have.length', 2) + .then(($segs) => { + expect([...$segs].map((seg) => seg.dataset.overlapGroup).toSorted()).to.deep.equal([ + 'subagent-1-1', + 'subagent-1-2', + ]); + }); + cy.get('[data-testid="flamegraph-overlap-g-1-c-2"]') + .should('have.length', 1) + .and('have.attr', 'data-overlap-group', 'subagent-1-2'); + }); +}); diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts index 34d95ec3..93658af0 100644 --- a/packages/app/cypress/e2e/dropdown-switching.cy.ts +++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts @@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => { cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); - cy.get('[data-testid="sequence-selector"]').click(); + 
cy.get('[data-testid="scenario-selector"]').click(); cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false'); - cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true'); + cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); }); diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts new file mode 100644 index 00000000..d574dd2a --- /dev/null +++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts @@ -0,0 +1,54 @@ +describe('GPU comparison agentic point detail', () => { + it('exposes the per-point charts as a normal browser link', () => { + cy.intercept('GET', '/api/v1/trace-availability*', (request) => { + const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? []; + if (ids.length < 20) request.alias = 'gpuTraceAvailability'; + request.continue(); + }); + + cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }, + }); + + cy.get('[data-testid="gpu-multiselect"] [role="combobox"]').click({ force: true }); + cy.get('[role="option"]').first().click(); + cy.contains('button', 'Select date range').click(); + cy.get('body').then(($body) => { + if ($body.text().includes('View anyway')) { + cy.contains('button', 'View anyway').click(); + } else { + cy.contains('button', 'Max Range').click(); + cy.contains('button', 'Apply').click(); + } + }); + + cy.get('[data-testid="gpu-graph"]').first().should('be.visible'); + cy.wait('@gpuTraceAvailability'); + cy.wait(100); + cy.get('[data-testid="gpu-graph"]') + .first() + .find('svg .dot-group') + .should('have.length.greaterThan', 0) + .first() + .then(($point) => { + const point = $point[0] as unknown as SVGElement & { + __data__: { 
benchmark_type?: string; id?: number }; + }; + expect(point.__data__.benchmark_type).to.equal('agentic_traces'); + expect(point.__data__.id).to.be.a('number'); + cy.wrap($point).find('.visible-shape').click({ force: true }); + }); + + cy.get('[data-chart-tooltip]:visible').should('have.length', 1); + cy.get('[data-chart-tooltip]:visible [data-action="view-charts"]') + .should('be.visible') + .then(($link) => { + expect($link).to.match('a'); + expect($link).not.to.have.attr('target'); + expect($link.attr('href')).to.match(/^\/inference\/agentic\/\d+$/u); + }); + cy.location('pathname').should('eq', '/inference'); + }); +}); diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts index 333baa6d..a0753e90 100644 --- a/packages/app/cypress/e2e/gradient-labels.cy.ts +++ b/packages/app/cypress/e2e/gradient-labels.cy.ts @@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => { cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels'); }); - it('Parallelism Labels toggle is off by default', () => { - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked'); + it('Parallelism Labels toggle is on by default', () => { + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); }); it('per-point labels are visible by default (gradient labels off)', () => { @@ -60,21 +60,19 @@ describe('Gradient Labels Toggle', () => { }); it('both toggles can be enabled simultaneously', () => { - // Turn on Gradient Labels (off by default) + // Parallelism Labels is on by default; ensure it's on, then turn on Gradient. 
+ cy.get('#scatter-parallelism-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('#scatter-gradient-labels').click(); cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); - // Turn on Parallelism Labels - cy.get('#scatter-parallelism-labels').click(); - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Both should be checked cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Reset for next tests + // Reset gradient for next tests (parallelism stays at its default-on). cy.get('#scatter-gradient-labels').click(); - cy.get('#scatter-parallelism-labels').click(); }); it('URL param i_gradlabel=1 enables gradient labels on load', () => { diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts index f0a70a56..55b0e274 100644 --- a/packages/app/cypress/e2e/historical-trends.cy.ts +++ b/packages/app/cypress/e2e/historical-trends.cy.ts @@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => { delete doc.body.dataset.scrollLocked; doc.body.style.removeProperty('pointer-events'); }); - cy.get('[data-testid="sequence-selector"]').should('be.visible'); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[role="option"]').should('have.length.greaterThan', 0); cy.get('body').type('{esc}'); }); diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts index 84e655f8..23b372df 100644 --- a/packages/app/cypress/e2e/line-labels.cy.ts +++ b/packages/app/cypress/e2e/line-labels.cy.ts @@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => { 
cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels'); }); - it('Line Labels toggle is on by default', () => { - cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); - - // Line labels render without any interaction - cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); - }); - - it('toggling Line Labels off then back on removes and restores label elements', () => { - // On by default — turn it off first. - cy.get('#scatter-line-labels').click(); + it('Line Labels toggle is off by default', () => { cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + + // No line labels render without interaction cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); + }); - // Turn it back on — labels return. + it('toggling Line Labels on then back off adds and removes label elements', () => { + // Off by default — turn it on first. cy.get('#scatter-line-labels').click(); cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); + + // Turn it back off — labels disappear. + cy.get('#scatter-line-labels').click(); + cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); }); it('line labels have colored background rects and text', () => { + // Off by default — ensure on (idempotent; prior test left them off). 
+ cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); // Each line label group should contain a background rect and text cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should( 'have.length.greaterThan', @@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => { }); it('line labels render in the foreground, after the scatter points', () => { - // Labels were toggled on in the test above and remain on here. + // Off by default — ensure on (idempotent; previous test leaves them on). + cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); cy.get('[data-testid="scatter-graph"] svg').then(($svg) => { diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index e17a4aff..924ff9a9 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -1,46 +1,90 @@ -describe('TTFT X-Axis Toggle (E2E chart)', () => { +const interceptDerivedMetrics = () => { + cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => { + const ids = new URL(request.url).searchParams.get('ids')?.split(',').filter(Boolean) ?? 
[]; + request.reply({ + body: Object.fromEntries( + ids.map((id, index) => [ + id, + { + id: Number(id), + normalized_session_time_s: 60 + index, + p90_prefill_tps_per_user: 100 + index, + p75_normalized_e2e_400_s: 8 + index, + p90_normalized_e2e_400_s: 12 + index, + }, + ]), + ), + }); + }).as('derivedAgenticMetrics'); +}; + +describe('X-Axis Mode Toggle (inference chart)', () => { before(() => { - cy.window().then((win) => { - win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + cy.visit('/inference', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }, }); - cy.visit('/inference'); - cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2); + cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible'); + cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('shows the x-axis dropdown in the e2e chart heading', () => { - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2 button') - .should('contain.text', 'vs.') - .and('contain.text', 'Latency'); + it('shows Interactivity by default for the agentic view', () => { + cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces'); + cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-interactivity"]') + .should('be.visible') + .and('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); - it('opens popover with three x-axis options', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').within(() => { - cy.contains('End-to-end Latency').should('exist'); - cy.contains('P99 TTFT').should('exist'); - cy.contains('Median 
TTFT').should('exist'); - }); + it('switches the x-axis to TTFT and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').click(); + cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token'); }); - it('switches x-axis to P99 TTFT and updates the heading', () => { - cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT'); + it('switches the x-axis to E2E Latency and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-e2e"]').click(); + cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency'); }); - it('switches x-axis to Median TTFT and updates the heading', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('Median TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT'); + it('switches to request-level normalized E2E at 400 output tokens', () => { + interceptDerivedMetrics(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click(); + cy.wait('@derivedAgenticMetrics'); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should( + 'contain.text', + 'P90 Normalized E2E @ 400 output tokens', + ); + cy.get('[data-testid="chart-figure"] svg').should( + 'contain.text', + 'P90 Normalized E2E @ 400 output tokens (s)', + ); + + cy.get('[data-testid="percentile-selector"]').click(); + cy.contains('[role="option"]', 'p75').click(); + cy.get('[data-testid="chart-figure"] h2').should( + 'contain.text', + 'P75 Normalized E2E @ 400 output tokens', + ); }); - 
it('switches back to End-to-end Latency', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click(); - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2') - .should('contain.text', 'End-to-end Latency'); + it('switches back to Interactivity', () => { + cy.get('[data-testid="x-axis-mode-interactivity"]').click(); + cy.get('[data-testid="x-axis-mode-interactivity"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); }); diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 33282b9c..927aee5f 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => { }; const assertNoHydrationMismatch = () => { - cy.get('[data-testid="sequence-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); cy.get('@consoleError').then((spy) => { const calls = (spy as unknown as { args: unknown[][] }).args; const hydration = calls.filter((args) => @@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => { it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => { visitWithErrorSpy('/inference?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); @@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => { // Visit the canonical model-prefixed slug so the assertion is directly // about the rendered page, not about a bare-slug redirect interleaving. 
visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => { visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk'); - cy.get('[data-testid="sequence-selector"]') + cy.get('[data-testid="scenario-selector"]') .invoke('text') .should('not.contain', 'junk') .and('match', /[18]K . [18]K/u); @@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => { // `effectivePrecisions` intersects the selection with available precisions // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported. visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek'); cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8'); assertNoHydrationMismatch(); @@ -236,9 +236,15 @@ describe('URL Parameter Persistence', () => { }); describe('High contrast mode', () => { - it('page loads without high contrast by default', () => { + it('inference loads with high contrast on by default', () => { visitWithDismissedModal('/inference'); cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked'); + }); + + it('i_hc=0 disables high contrast on load', () => { + visitWithDismissedModal('/inference?i_hc=0'); + cy.get('[data-testid="scatter-graph"]').should('exist'); cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); }); @@ -267,10 +273,12 @@ describe('URL Parameter Persistence', () => { 
cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); - it('historical trends tab has high contrast switch off by default', () => { + it('historical trends tab shares the inference high-contrast default (on)', () => { + // Historical reads highContrast from the same InferenceContext as the + // scatter chart, so it inherits the default-on behavior. visitWithDismissedModal('/historical'); cy.get('[data-testid="historical-trends-display"]').should('exist'); - cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); + cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); it('i_hc=1 enables historical trends high contrast', () => { diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index bcdfe21b..b2164bcc 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,10 +189,14 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), + selectedPercentile: 'p90', + setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), selectedE2eXAxisMetric: null, setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'), + selectedXAxisMode: 'interactivity' as const, + setSelectedXAxisMode: namedStub('setSelectedXAxisMode'), scaleType: 'auto', setScaleType: namedStub('setScaleType'), quickFilters: { vendors: [], frameworks: [], disagg: [], spec: [] }, From 3efd6b87e475d67339b69e60ef6c13f1620e289d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:21:14 -0500 Subject: [PATCH 11/40] chore: drop completed one-shot backfills and investigation doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit backfill-agentic-intvty, 
backfill-agentic-server-logs, and backfill-kv-pool were one-time data repairs whose fixes now run inline during ingest; all existing rows have been repaired. The version-driven backfills (chart-series, request-timeline, aggregate-stats, dataset-stats) remain — they re-materialize stored payloads whenever a version constant bumps. --- .../kv-cache-hit-rate-anomaly.md | 113 --------- packages/app/src/lib/benchmark-transform.ts | 2 +- packages/db/package.json | 3 - packages/db/src/backfill-agentic-intvty.ts | 107 --------- .../db/src/backfill-agentic-server-logs.ts | 215 ------------------ packages/db/src/backfill-kv-pool.ts | 103 --------- packages/db/src/etl/benchmark-mapper.ts | 2 +- packages/db/src/lib/github-artifacts.ts | 2 +- 8 files changed, 3 insertions(+), 544 deletions(-) delete mode 100644 docs/investigations/kv-cache-hit-rate-anomaly.md delete mode 100644 packages/db/src/backfill-agentic-intvty.ts delete mode 100644 packages/db/src/backfill-agentic-server-logs.ts delete mode 100644 packages/db/src/backfill-kv-pool.ts diff --git a/docs/investigations/kv-cache-hit-rate-anomaly.md b/docs/investigations/kv-cache-hit-rate-anomaly.md deleted file mode 100644 index 61ffee42..00000000 --- a/docs/investigations/kv-cache-hit-rate-anomaly.md +++ /dev/null @@ -1,113 +0,0 @@ -# KV cache hit-rate anomaly on agentic benchmarks (dsv4, b200, vllm) - -## Core issue - -vLLM's prefix cache should be hitting at ~98% on multi-turn agentic conversation replay (each turn extends the prior turn's context). It isn't. Something in the **dataset definition** or **aiperf replay** is producing requests whose token streams aren't actually prefix-compatible turn-to-turn. - -| Concurrency | Theoretical max hit % | vLLM actual hit % | -| ----------: | --------------------: | ----------------: | -| 1 | 97.45% | 83.15% | -| 2 | 98.34% | 46.78% | -| 4 | 97.99% | 12.43% | - -This is **not** a capacity problem. KV cache is sized at 3.29M tokens (12,868 blocks × 256). 
The conc=4 workload's unique-content footprint is **~1.11M DSV4 tokens** — would fit in ~34% util. Observed peak util is 49.8%, so the cache is holding more blocks than the workload needs, yet vLLM can't find them on lookup. - -## Data sources - -- **Benchmark points**: - - http://localhost:3002/inference/agentic/206252 (conc=1) - - http://localhost:3002/inference/agentic/206245 (conc=2) - - http://localhost:3002/inference/agentic/206247 (conc=4) -- **Neon DB**: project `silent-pond-29172997`, branch `br-cold-sky-ai0c09cy` (agentx-dev). Connection via `DATABASE_WRITE_URL` in `.env`. Console: https://console.neon.tech/app/projects/silent-pond-29172997/branches/br-cold-sky-ai0c09cy - - `agentic_trace_replay.profile_export_jsonl_gz` — gzipped aiperf per-request records - - `agentic_trace_replay.server_metrics_json_gz` — gzipped vllm per-scrape prometheus metrics - - `agentic_trace_replay.request_timeline` (jsonb) — pre-computed per-request timeline used by the simulation -- **Trace replay dataset** (the source-of-truth for "what should be cacheable"): https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-051926. Each row has pre-computed 64-token block `hash_ids` per turn; `hash_id_scope: 'local'` (per-conversation). - -## Theoretical max simulation - -For each replayed request, look up the matching turn in the HF dataset and walk a per-conversation trie of 64-token block hash IDs. Hits = longest contiguous prefix from block 0 that has appeared in any prior request (mirrors vLLM's chained-hash semantics). - -Confirms: the workload IS prefix-cacheable end-to-end. Theoretical max ≈ 98% across all three concurrency levels — same dataset, same conversations, just different dispatch order. - -## Why this points at the dataset/replay, not vLLM - -- **Capacity is not the bottleneck.** Cache holds ~3× the unique content of the workload. Cache util tops out below capacity. 
-- **The metric isn't lying.** vLLM's own counters cross-check: `prefill_kv_computed_tokens + prefix_cache_hits ≈ request_prompt_tokens` (67.85M + 9.61M ≈ 77.47M for conc=4). -- **It's not a tokenizer artifact.** DSV4 tokens are ~54% the count of Claude tokens, but BPE is left-monotonic on stable text — hit-rate ratio is invariant to tokenizer choice for prefix-growth workloads. -- **It's not the multi-engine DP bug** we found earlier (commit `f2618f4`) — this deployment has 1 engine. - -What's left: the bytes that vLLM actually receives turn-to-turn are not the same prefix + delta that the dataset's `hash_ids` describe. Most likely culprits: - -1. **aiperf isn't sending the cumulative chat history** the way the dataset assumes — each turn is being assembled differently than the previous, breaking the byte-level prefix. -2. **Something in the request payload varies per request** (timestamps, request IDs, tool result serialization order, etc.) — invalidates block 0's hash, cascades to every subsequent block via vLLM's chained hashing. -3. **BPE re-merging across message boundaries** when aiperf re-tokenizes the full history each turn instead of appending tokens. - -## Root cause: `ConversationReconstructor` strips the prev user's `partial_tail` every turn - -The bug is in `utils/aiperf/src/aiperf/dataset/loader/weka_synth_buf.py` — specifically the **boundary case** in `truncate_synth_buf_at_block` (line 453–464) combined with `turn_delta`'s reset logic (line 354–360). - -What happens turn-to-turn: - -1. `init_turn_0` builds a trailing user segment whose `tokens` = `[block_aligned_tokens] + [partial_tail_tokens]` where `partial_tail_n = in_tokens % bs`. The wire prompt for turn 0 includes these tail tokens. -2. `advance_turn` computes `lcp = longest_common_prefix(prev_hash_ids, curr_hash_ids)`. 
When the LCP equals the prev turn's total block count (the normal append-only case), `truncate_synth_buf_at_block` hits its boundary branch: `cursor + seg.block_count == target_blocks`. -3. That branch **strips `prev_partial_tail` tokens off the trailing user segment in place** and re-decodes its `content`. This sets `_last_disturbance_at = i` (the index of the prev trailing user segment). -4. New `assistant` + `user` segments are appended. -5. `turn_delta` sees `_last_disturbance_at < _emitted_segment_count` and forces `reset_context=True`, re-emitting **the whole conversation** with the now-stripped trailing user. - -The endpoint (`utils/aiperf/src/aiperf/endpoints/base_endpoint.py:110-140`) honors `reset_context=True` via `messages = list(turn.raw_messages)` instead of `messages.extend(...)`. - -Result: every turn sends the full chat history, but the bytes of the prev user message differ from what was sent the turn before — the trailing `partial_tail` chars are missing. vLLM tokenizes the new prompt, hashes 256-token blocks, and the chained-hash invariant breaks at the first block containing the trimmed boundary. That block + every subsequent block of the new turn miss the cache. - -### Empirical confirmation - -Reproducer at `/tmp/test-reconstructor.py` instantiates `ConversationReconstructor` with mock decoders and walks a synthetic 3-turn conversation: - -``` -=== Turn 0 === - delta msgs: 2, reset=False - wire len: 21683 - -=== Turn 1 === - delta msgs: 4, reset=True ← every turn resets - wire len: 25307 - -=== DIFF turn 0 vs turn 1 (wire-level) === - common prefix chars: 21549 / wire0 21683 (99.4%) - wire0[...] = '... 983406 12 1 133 184 16 57 71 155 37 ' ← partial_tail decoded - wire1[...] = '... 
983406<|im_end|>\n<|im_start|>assista' ← stripped, template marker next - turn0 user content len: 19812, turn1 user[0] content len: 19711 ← 101 chars stripped -``` - -Across the conc=1 run (point 206252), **280/280 (100%)** consecutive turn-pairs have `prev_in_tokens % bs != 0` — i.e., every single turn hits this boundary disturbance. - -### Why the gap widens with concurrency - -At conc=1 the gap (97.45% − 83.15% = 14pp) is roughly the fraction of each turn's blocks lost to the trimmed-tail invalidation (last user block + chat-template delta). At higher conc: - -- `reset_context=True` makes every request re-send the **entire** conversation prompt, so wire bandwidth + prefill work scale superlinearly per turn. -- Concurrent conversations all do this simultaneously; each writes long sequences of "new" blocks past their respective divergence points, evicting other conversations' usable prefix blocks even though aggregate unique content (1.11M tokens) fits comfortably in the 3.29M-token cache. - -### Fix sketch - -The boundary-cut strip exists to keep the next turn's `assistant` segment block-aligned. Two viable fixes: - -1. **Don't mutate the prev trailing user segment.** Leave its `partial_tail` tokens intact; append the new asst+user as strict-append (no reset_context). The wire-prefix becomes byte-stable turn-to-turn. Cost: the new asst content's block_start no longer aligns to the prev_hash_ids tail, so hash_id accounting for asst blocks loses 1 block of fidelity per turn. -2. **Track `partial_tail` separately** from the prev user segment so the segment's emitted content stays byte-stable, and only the trailing tail (which is regenerated each turn anyway) is allowed to vary. - -Option 1 is the minimal change. Validate with the reproducer above — remove the strip in `truncate_synth_buf_at_block`'s boundary case and re-run; turn N+1's wire prefix should equal turn N's wire byte-for-byte up to the end of the prev assistant template. 
- -## Re-running the simulation - -```bash -# 1. dump request timelines from DB -pnpm --filter @semianalysisai/inferencex-db exec dotenv -e ../../.env -- tsx /tmp/dump-rt-multi.ts - -# 2. run analysis (needs `pip3 install --break-system-packages --user datasets`) -python3 /tmp/cache-sim-multi.py - -# 3. reproduce the partial_tail strip -python3 /tmp/test-reconstructor.py -``` - -Scripts live in `/tmp/` from this session; recreate from inline code in the previous version of this doc if missing. diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index cb8e3ceb..df1d328e 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -26,7 +26,7 @@ import type { BenchmarkRow } from '@/lib/api'; * itl, overriding any artifact-supplied value: the harness definition of * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. This - * matches the ingest mapper + backfill-agentic-intvty for official rows; doing it + * matches the ingest mapper for official rows; doing it * here keeps overlay / `?unofficialrun=` rows (transformed live from raw * artifacts, never through the DB) on the same definition. 
*/ diff --git a/packages/db/package.json b/packages/db/package.json index c7836df4..2c8dc067 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -19,12 +19,9 @@ "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts", "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts", "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts", - "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts", "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts", "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts", - "db:backfill-agentic-server-logs": "dotenv -e ../../.env -- tsx src/backfill-agentic-server-logs.ts", "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts", - "db:backfill-kv-pool": "dotenv -e ../../.env -- tsx src/backfill-kv-pool.ts", "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts", "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts", "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts", diff --git a/packages/db/src/backfill-agentic-intvty.ts b/packages/db/src/backfill-agentic-intvty.ts deleted file mode 100644 index a8eebdba..00000000 --- a/packages/db/src/backfill-agentic-intvty.ts +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Backfill: enforce the slow-tail interactivity invariant on agentic rows. - * - * Agentic trace-replay artifacts emit both `*_itl` and `*_intvty`. Historically - * the harness wrote `*_intvty = 1/p(ITL)` (slow-tail — "interactivity at the - * p-th latency"), which is what the inference chart's interactivity selector - * and the detail time-series both assume. A later "timing fix" harness started - * emitting `*_intvty = p(1/ITL)` instead (fast-tail — equivalent to - * `1/p(100-x)(ITL)`), because taking the reciprocal reverses percentile order. 
- * Ingest stores every metric verbatim, so those runs landed in the DB with the - * opposite definition — e.g. p90 reading 23.9 instead of 11.2 for the same - * point — contaminating cross-run Pareto comparisons. - * - * This rewrites `mean/p75/p90/p95 _intvty = 1/_itl` for every agentic row so the - * stored value always matches the slow-tail definition the charts use. It is - * idempotent: rows already on the correct definition are left untouched (guarded - * by a relative-deviation check). `std_intvty` is intentionally NOT touched — - * the reciprocal of a standard deviation is meaningless, and the API strips it. - * The prior fast-tail value is discarded on purpose (p10_itl isn't stored, so it - * isn't recoverable anyway, and per project policy fast-tail must not back a - * slow-tail selector). - * - * Usage: - * pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes - */ - -import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; -import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js'; - -// Percentile-style keys whose interactivity is the reciprocal of the matching -// ITL percentile. `std` is excluded by design (not a reciprocal); `median`/`p99` -// are absent from agentic artifacts so they never appear here. -const KEYS = ['mean', 'p75', 'p90', 'p95'] as const; - -// Relative tolerance: skip rows already within 1e-6 of 1/itl so correct rows -// keep their original full-precision value and the change counts are accurate. -const REL_TOL = 1e-6; - -const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); - -async function contaminationCounts(): Promise> { - const out: Record = {}; - for (const k of KEYS) { - const rows = await sql.unsafe(` - SELECT count(*)::int AS n - FROM benchmark_results - WHERE benchmark_type = 'agentic_traces' - AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0 - AND metrics ? 
'${k}_intvty' - AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric) - > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric) - `); - out[k] = (rows[0] as unknown as { n: number }).n; - } - return out; -} - -async function main(): Promise { - const total = await sql<{ n: number }[]>` - SELECT count(*)::int AS n FROM benchmark_results WHERE benchmark_type = 'agentic_traces' - `; - console.log(`Agentic rows: ${total[0]!.n}`); - - const before = await contaminationCounts(); - console.log('Contaminated (intvty != 1/itl) before:', JSON.stringify(before)); - if (KEYS.every((k) => before[k] === 0)) { - console.log('Nothing to backfill — all agentic rows already satisfy intvty = 1/itl.'); - await sql.end(); - return; - } - - if (!hasYesFlag() && !(await confirm('Rewrite *_intvty = 1/*_itl for these rows? (y/N) '))) { - await sql.end(); - return; - } - - let totalUpdated = 0; - for (const k of KEYS) { - // keys are from a fixed trusted const — safe to interpolate. - const res = await sql.unsafe(` - UPDATE benchmark_results - SET metrics = jsonb_set(metrics, '{${k}_intvty}', to_jsonb(1.0 / (metrics->>'${k}_itl')::numeric)) - WHERE benchmark_type = 'agentic_traces' - AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0 - AND metrics ? '${k}_intvty' - AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric) - > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric) - `); - console.log(` ${k}_intvty: updated ${res.count} row(s)`); - totalUpdated += res.count; - } - - const after = await contaminationCounts(); - console.log('Contaminated after:', JSON.stringify(after)); - if (!KEYS.every((k) => after[k] === 0)) { - throw new Error('Backfill incomplete — some rows still deviate. Aborting before MV refresh.'); - } - - await refreshLatestBenchmarks(sql); - console.log(`Done. 
Rewrote ${totalUpdated} metric value(s) across agentic rows.`); - await sql.end(); -} - -main().catch((error) => { - console.error(error); - process.exit(1); -}); diff --git a/packages/db/src/backfill-agentic-server-logs.ts b/packages/db/src/backfill-agentic-server-logs.ts deleted file mode 100644 index 37157861..00000000 --- a/packages/db/src/backfill-agentic-server-logs.ts +++ /dev/null @@ -1,215 +0,0 @@ -/** - * Backfill server logs (and the derived KV-cache pool size) for AGENTIC - * benchmark points. - * - * Agentic runs upload their vLLM server log as a `server_logs_` artifact, - * but the ingest path historically failed to link it to agentic rows (the - * `bmk_agentic_` → `server_logs_` key mismatch, now fixed in - * ingest-ci-run). As a result the agentic server log text was never stored, so - * `kv_cache_pool_tokens` cannot be derived from the DB — we must re-fetch the - * artifacts from GitHub. - * - * For each agentic workflow run this: - * 1. lists the run's artifacts and keeps only `server_logs_*` + `bmk_agentic_*` - * (dedup by logical name, mirroring ingest's runner-suffix collapse), - * 2. downloads + unzips just those (small — skips the multi-MB trace dirs), - * 3. maps each `bmk_agentic_` JSON → config → benchmark_results rows via - * the same mapBenchmarkRow/config-cache logic ingest uses, - * 4. calls insertServerLog(), which stores+links the log AND derives - * `kv_cache_pool_tokens` into benchmark_results.metrics. - * - * Idempotent: insertServerLog only links rows whose server_log_id is null. 
- * - * Usage: - * pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-server-logs - * [--limit N] only process the first N workflow runs - * [--yes] skip the confirmation prompt - */ - -import fs from 'node:fs'; -import os from 'node:os'; -import path from 'node:path'; - -import { hasNoSslFlag } from './cli-utils'; -import { insertServerLog } from './etl/benchmark-ingest'; -import { mapBenchmarkRow } from './etl/benchmark-mapper'; -import { createConfigCache } from './etl/config-cache'; -import { createAdminSql } from './etl/db-utils'; -import { createSkipTracker } from './etl/skip-tracker'; -import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner'; -import { - RUNNER_SUFFIX_RE, - dedupeArtifactsByLogicalName, - downloadArtifact, - listRunArtifacts, - type ArtifactMeta, -} from './lib/github-artifacts'; - -const REPO = 'SemiAnalysisAI/InferenceX'; - -const flags = parseLimitForceFlags(); -const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); - -/** - * List the run's `server_logs_*` / `bmk_agentic_*` artifacts, deduped by - * runner-suffix-stripped logical name (matches ingest's collapse). - */ -function listArtifacts(githubRunId: string): Map { - return dedupeArtifactsByLogicalName( - listRunArtifacts(REPO, githubRunId).filter( - (a) => a.name.startsWith('server_logs_') || a.name.startsWith('bmk_agentic_'), - ), - ); -} - -/** Logical key shared by a server_logs_/bmk_agentic_ artifact pair. */ -function logicalKey(name: string): string { - return name - .replace(/^server_logs_/u, '') - .replace(/^bmk_agentic_/u, '') - .replace(RUNNER_SUFFIX_RE, ''); -} - -/** - * Read up to `maxBytes` of a (possibly huge) server log as UTF-8, stripping NUL - * bytes. vLLM's "GPU KV cache size" startup lines are near the top, so a head - * read is enough to derive the KV pool — and it caps storage for the rare - * multi-hundred-MB logs that exceed V8's ~512 MB string limit. 
- */ -const stripNul = (s: string): string => s.replaceAll(String.fromCodePoint(0), ''); - -function readServerLogCapped(p: string, maxBytes = 64 * 1024 * 1024): string { - if (fs.statSync(p).size <= maxBytes) return stripNul(fs.readFileSync(p, 'utf8')); - const fd = fs.openSync(p, 'r'); - try { - const buf = Buffer.allocUnsafe(maxBytes); - const n = fs.readSync(fd, buf, 0, maxBytes, 0); - return stripNul(buf.subarray(0, n).toString('utf8')); - } finally { - fs.closeSync(fd); - } -} - -function findJsonFiles(dir: string): string[] { - const out: string[] = []; - const walk = (d: string) => { - for (const e of fs.readdirSync(d, { withFileTypes: true })) { - const p = path.join(d, e.name); - if (e.isDirectory()) walk(p); - else if (e.name.endsWith('.json')) out.push(p); - } - }; - walk(dir); - return out; -} - -async function main(): Promise { - console.log('=== backfill-agentic-server-logs ==='); - console.log(` limit = ${flags.limit ?? 'none'}`); - - // Agentic workflow runs that still have unlinked server logs. - const runs = await sql<{ github_run_id: string; workflow_run_id: number }[]>` - select distinct wr.github_run_id::text as github_run_id, wr.id as workflow_run_id - from benchmark_results br - join workflow_runs wr on wr.id = br.workflow_run_id - where br.benchmark_type = 'agentic_traces' - and br.server_log_id is null - order by wr.id - ${flags.limit ? 
sql`limit ${flags.limit}` : sql``} - `; - - if (runs.length === 0) { - console.log('\n Nothing to do — all agentic rows already have a server log.'); - return; - } - if (!(await confirmProceed(`${runs.length} agentic workflow run(s) to process.`))) return; - - const cache = createConfigCache(sql); - await cache.preloadConfigs(); - const tracker = createSkipTracker(); - - let linkedRows = 0; - let runsOk = 0; - let runsFailed = 0; - const t0 = Date.now(); - - for (const { github_run_id: githubRunId, workflow_run_id: wrId } of runs) { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `kvpool-${githubRunId}-`)); - try { - const artifacts = listArtifacts(githubRunId); - // server log path by logical key - const serverLogByKey = new Map(); - const bmkDirs: string[] = []; - for (const art of artifacts.values()) { - const dir = downloadArtifact(art, tmp); - if (art.name.startsWith('server_logs_')) { - const logPath = path.join(dir, 'server.log'); - if (fs.existsSync(logPath)) serverLogByKey.set(logicalKey(art.name), logPath); - } else { - bmkDirs.push(dir); - } - } - - let runLinked = 0; - for (const bmkDir of bmkDirs) { - const key = logicalKey(path.basename(bmkDir)); - const logPath = serverLogByKey.get(key); - if (!logPath) continue; - for (const file of findJsonFiles(bmkDir)) { - let raw: unknown; - try { - raw = JSON.parse(fs.readFileSync(file, 'utf8')); - } catch { - continue; - } - const rows = Array.isArray(raw) ? 
raw : [raw]; - for (const row of rows) { - if (!row || typeof row !== 'object') continue; - const mapped = mapBenchmarkRow(row as Record, tracker); - if (!mapped || mapped.benchmarkType !== 'agentic_traces') continue; - const configId = await cache.getOrCreateConfig(mapped.config); - const ids = await sql<{ id: number }[]>` - select id from benchmark_results - where workflow_run_id = ${wrId} - and config_id = ${configId} - and conc = ${mapped.conc} - and benchmark_type = 'agentic_traces' - and server_log_id is null - `; - if (ids.length === 0) continue; - const serverLog = readServerLogCapped(logPath); - await insertServerLog( - sql, - ids.map((r) => r.id), - serverLog, - ); - runLinked += ids.length; - } - } - } - linkedRows += runLinked; - runsOk++; - const elapsed = Math.round((Date.now() - t0) / 1000); - console.log( - ` ✓ run ${githubRunId}: ${serverLogByKey.size} log(s), linked ${runLinked} row(s) ` + - `(${runsOk}/${runs.length}, ${elapsed}s total)`, - ); - } catch (error) { - runsFailed++; - console.error( - ` ✗ run ${githubRunId}: ${error instanceof Error ? (error.stack ?? error.message) : String(error)}`, - ); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - } - - const totalSec = Math.round((Date.now() - t0) / 1000); - console.log( - `\n=== complete: ${linkedRows} row(s) linked across ${runsOk} run(s) ` + - `(${runsFailed} failed) in ${totalSec}s ===`, - ); - if (runsFailed > 0) process.exitCode = 1; -} - -runBackfillMain('backfill-agentic-server-logs', sql, main); diff --git a/packages/db/src/backfill-kv-pool.ts b/packages/db/src/backfill-kv-pool.ts deleted file mode 100644 index efa04c81..00000000 --- a/packages/db/src/backfill-kv-pool.ts +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Backfill `benchmark_results.metrics->kv_cache_pool_tokens` from the captured - * server logs. 
The value is parsed from vLLM's authoritative - * "GPU KV cache size: N tokens" startup line(s), summed across data-parallel - * engine cores (see {@link kvCachePoolTokensFromServerLog}). - * - * The ingest path now derives this inline in `insertServerLog`, but existing - * rows need this one-time pass. Idempotent: re-running only touches rows that - * still lack the value (unless --force). - * - * Usage: - * pnpm --filter @semianalysisai/inferencex-db db:backfill-kv-pool - * [--limit N] only process the first N candidate server logs - * [--force] recompute even when the value is already set - * [--yes] skip the confirmation prompt - */ - -import { hasNoSslFlag } from './cli-utils.js'; -import { createAdminSql } from './etl/db-utils.js'; -import { kvCachePoolTokensFromServerLog } from './etl/server-log-metrics.js'; -import { confirmProceed, parseLimitForceFlags, runBackfillMain } from './lib/backfill-runner.js'; - -const flags = parseLimitForceFlags(); - -const sql = createAdminSql({ - noSsl: hasNoSslFlag(), - max: 1, - onnotice: () => {}, -}); - -async function main(): Promise { - console.log('=== backfill-kv-pool ==='); - console.log(` force = ${flags.force}`); - console.log(` limit = ${flags.limit ?? 'none'}`); - - // One server log can be linked to several benchmark_results (multiple - // concurrency points share a server). Group by log id so we parse each log - // once and fan the value out to all its rows. - const candidates = flags.force - ? await sql<{ server_log_id: number }[]>` - select distinct server_log_id - from benchmark_results - where server_log_id is not null - order by server_log_id - ${flags.limit ? sql`limit ${flags.limit}` : sql``} - ` - : await sql<{ server_log_id: number }[]>` - select distinct server_log_id - from benchmark_results - where server_log_id is not null - and metrics->>'kv_cache_pool_tokens' is null - order by server_log_id - ${flags.limit ? 
sql`limit ${flags.limit}` : sql``} - `; - - if (candidates.length === 0) { - console.log('\n Nothing to do — all rows up to date.'); - return; - } - - if (!(await confirmProceed(`${candidates.length} candidate server log(s).`))) return; - - let updated = 0; - let logsWithValue = 0; - let logsNoValue = 0; - let failed = 0; - const t0 = Date.now(); - for (const { server_log_id: logId } of candidates) { - try { - const [row] = await sql<{ server_log: string | null }[]>` - select server_log from server_logs where id = ${logId} - `; - const tokens = kvCachePoolTokensFromServerLog(row?.server_log ?? null); - if (tokens === null) { - logsNoValue++; - continue; // non-vLLM or no startup line — leave unset - } - logsWithValue++; - const targets = flags.force - ? sql`server_log_id = ${logId}` - : sql`server_log_id = ${logId} and metrics->>'kv_cache_pool_tokens' is null`; - const result = await sql` - update benchmark_results - set metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${tokens}::bigint)) - where ${targets} - `; - updated += result.count; - console.log(` ✓ log=${logId}: ${tokens.toLocaleString()} tok → ${result.count} row(s)`); - } catch (error) { - failed++; - console.error(` ✗ log=${logId}: ${error instanceof Error ? 
error.message : String(error)}`); - } - } - - const totalSec = Math.round((Date.now() - t0) / 1000); - console.log( - `\n=== backfill complete: ${updated} row(s) updated from ${logsWithValue} log(s) ` + - `(${logsNoValue} log(s) had no KV-pool line, ${failed} failed) in ${totalSec}s ===`, - ); - if (failed > 0) process.exitCode = 1; -} - -runBackfillMain('backfill-kv-pool', sql, main); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 90c23ef0..caae08c2 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -253,7 +253,7 @@ export function mapBenchmarkRow( // percentile, so we derive it from `*_itl` here rather than trust the artifact, // keeping every agentic row on one definition. `std` is excluded — the // reciprocal of a standard deviation is meaningless. Mirrored in the frontend - // overlay path (agenticAliases) and the one-time backfill-agentic-intvty script. + // overlay path (agenticAliases). if (isAgentic) { for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { const itl = metrics[`${k}_itl`]; diff --git a/packages/db/src/lib/github-artifacts.ts b/packages/db/src/lib/github-artifacts.ts index 291740cf..c96ae830 100644 --- a/packages/db/src/lib/github-artifacts.ts +++ b/packages/db/src/lib/github-artifacts.ts @@ -1,6 +1,6 @@ /** * GitHub Actions artifact helpers shared by `ingest-ci-run.ts` (download - * mode) and `backfill-agentic-server-logs.ts`. All calls shell out to the + * mode). All calls shell out to the * `gh` CLI, which picks up GITHUB_TOKEN from the environment. 
*/ From b84daff751c1030fa9142ee39bd62f1c1a828c70 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 14:53:58 -0500 Subject: [PATCH 12/40] fix(ingest): preserve derived kv_cache_pool_tokens across metrics upserts kv_cache_pool_tokens is derived from the server log at insertServerLog time and exists in no artifact JSON, so the aggregated results_bmk artifact's ON CONFLICT metrics replacement silently wiped it from every row whose per-config artifact was processed first. Carry the existing value through the upsert. Wiped agentic rows re-derived from stored logs (80 repaired). --- packages/db/src/etl/benchmark-ingest.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index a405789d..2a2382c8 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -74,7 +74,13 @@ export async function bulkIngestBenchmarkRows( unnest(${sql.array(workersJsons)}::jsonb[]) on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode) do update set - metrics = excluded.metrics, + -- Replace metrics with the fresh artifact values, but carry over + -- kv_cache_pool_tokens: it is derived from the server log at + -- insertServerLog time (not present in any artifact JSON), so a later + -- upsert from the aggregated results_bmk artifact would silently wipe it. 
+ metrics = excluded.metrics || jsonb_strip_nulls( + jsonb_build_object('kv_cache_pool_tokens', benchmark_results.metrics->'kv_cache_pool_tokens') + ), image = excluded.image, workers = excluded.workers returning (xmax = 0) as inserted, id From 94d19774006f0d28268fe628db52983af3b04dbe Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 15:10:31 -0500 Subject: [PATCH 13/40] chore(db): renumber agentic migrations after master's 007 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Master shipped its own 007 (latest_benchmarks single-run-per-line, #491) while this branch carried 007_agentic — two migrations with the same number. Renumber the branch set to 008_agentic / 009_latest_benchmarks_single_run_per_line / 010_dataset_request_stats so a fresh deploy applies them strictly after master's lineage; 009 supersedes master's 007 with the offload_mode-aware view definition. --- packages/db/migrations/{007_agentic.sql => 008_agentic.sql} | 0 ..._line.sql => 009_latest_benchmarks_single_run_per_line.sql} | 0 ...dataset_request_stats.sql => 010_dataset_request_stats.sql} | 0 packages/db/src/backfill-aggregate-stats.ts | 3 +-- packages/db/src/backfill-chart-series.ts | 2 +- packages/db/src/backfill-request-timeline.ts | 2 +- 6 files changed, 3 insertions(+), 4 deletions(-) rename packages/db/migrations/{007_agentic.sql => 008_agentic.sql} (100%) rename packages/db/migrations/{008_latest_benchmarks_single_run_per_line.sql => 009_latest_benchmarks_single_run_per_line.sql} (100%) rename packages/db/migrations/{009_dataset_request_stats.sql => 010_dataset_request_stats.sql} (100%) diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/008_agentic.sql similarity index 100% rename from packages/db/migrations/007_agentic.sql rename to packages/db/migrations/008_agentic.sql diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql 
b/packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql similarity index 100% rename from packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql rename to packages/db/migrations/009_latest_benchmarks_single_run_per_line.sql diff --git a/packages/db/migrations/009_dataset_request_stats.sql b/packages/db/migrations/010_dataset_request_stats.sql similarity index 100% rename from packages/db/migrations/009_dataset_request_stats.sql rename to packages/db/migrations/010_dataset_request_stats.sql diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts index 2e3a4038..5896529b 100644 --- a/packages/db/src/backfill-aggregate-stats.ts +++ b/packages/db/src/backfill-aggregate-stats.ts @@ -3,8 +3,7 @@ * or were computed by an older `STATS_VERSION`. * * The ingest path now computes stats inline, but existing rows (and rows - * whose computation logic has since changed) still need this pass. Run after - * applying migration 008 and any time `STATS_VERSION` bumps. + * whose computation logic has since changed) still need this pass. Run after the agentic schema migration and any time `STATS_VERSION` bumps. * * Strategy: * - Stream rows one at a time (server_metrics_json_gz can be hundreds of diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts index 94e009cf..94e1700d 100644 --- a/packages/db/src/backfill-chart-series.ts +++ b/packages/db/src/backfill-chart-series.ts @@ -4,7 +4,7 @@ * * The ingest path now computes the time-series inline, but existing rows * (and rows whose computation logic has since changed) still need this - * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION` + * pass. Run after the agentic schema migration and any time `CHART_SERIES_VERSION` * bumps. 
* * Strategy: diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts index 09126654..67291b6c 100644 --- a/packages/db/src/backfill-request-timeline.ts +++ b/packages/db/src/backfill-request-timeline.ts @@ -4,7 +4,7 @@ * * The ingest path now computes the timeline inline, but existing rows * (and rows whose computation logic has since changed) still need this - * pass. Run after applying migration 010 and any time the version bumps. + * pass. Run after the agentic schema migration and any time the version bumps. * * Usage: * pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline From d5c56bf35c016e9f2a4e0cbfd26111d4a82782da Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 15:10:57 -0500 Subject: [PATCH 14/40] feat(inference): per-series points table from the chart legend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each inference legend row gets a table icon (visible on hover/focus, faint otherwise) that opens a dialog listing every currently-visible point for that hardware/framework series: concurrency, parallelism, offload, tput/GPU, p50/p90 interactivity and TTFT, sorted by concurrency with sortable columns. Rows link the same way scatter points do — agentic points to their per-point detail page, fixed-seq points to the GitHub Actions run — as real anchors so open-in-new-tab works. Unofficial-run overlay series get the same table (metrics only; overlay points have no stored benchmark rows) respecting activeOverlayHwTypes and overlayRunColor. 
--- .../app/cypress/component/chart-legend.cy.tsx | 145 +++++ .../inference/ui/LegendPointsDialog.tsx | 212 +++++++ .../components/inference/ui/ScatterGraph.tsx | 598 ++++++++++-------- .../utils/legend-points-table.test.ts | 223 +++++++ .../inference/utils/legend-points-table.ts | 123 ++++ .../inference/utils/tooltipUtils.ts | 5 +- .../src/components/ui/chart-legend-item.tsx | 24 +- .../app/src/components/ui/chart-legend.tsx | 6 +- 8 files changed, 1084 insertions(+), 252 deletions(-) create mode 100644 packages/app/src/components/inference/ui/LegendPointsDialog.tsx create mode 100644 packages/app/src/components/inference/utils/legend-points-table.test.ts create mode 100644 packages/app/src/components/inference/utils/legend-points-table.ts diff --git a/packages/app/cypress/component/chart-legend.cy.tsx b/packages/app/cypress/component/chart-legend.cy.tsx index 4a362c2b..535a0053 100644 --- a/packages/app/cypress/component/chart-legend.cy.tsx +++ b/packages/app/cypress/component/chart-legend.cy.tsx @@ -1,5 +1,8 @@ import { useState } from 'react'; +import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog'; +import type { InferenceData } from '@/components/inference/types'; +import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table'; import ChartLegend, { type CommonLegendItemProps } from '@/components/ui/chart-legend'; const MOCK_ITEMS: CommonLegendItemProps[] = [ @@ -119,4 +122,146 @@ describe('ChartLegend (sidebar variant)', () => { .click(); cy.get('.sidebar-legend').should('not.have.class', 'bg-accent'); }); + + it('renders no points-table icon when items have no onShowPoints handler', () => { + cy.get('[data-testid^="legend-points-"]').should('not.exist'); + }); +}); + +// --------------------------------------------------------------------------- +// Per-series points table (inference legend drill-down) +// --------------------------------------------------------------------------- + +function 
mockPoint(overrides: Partial = {}): InferenceData { + return { + date: '2025-06-15', + x: 100, + y: 500, + tp: 8, + conc: 16, + hwKey: 'b300-sxm', + precision: 'fp4', + tput_per_gpu: 1500.5, + median_intvty: 45.2, + p90_intvty: 38.1, + median_ttft: 0.42, + p90_ttft: 0.87, + tpPerGpu: { y: 1500.5, roof: false }, + tpPerMw: { y: 50, roof: false }, + costh: { y: 1, roof: false }, + costn: { y: 1, roof: false }, + costr: { y: 1, roof: false }, + costhi: { y: 1, roof: false }, + costni: { y: 1, roof: false }, + costri: { y: 1, roof: false }, + ...overrides, + } as InferenceData; +} + +const OFFICIAL_POINTS: InferenceData[] = [ + mockPoint({ conc: 32, benchmark_type: 'agentic_traces', id: 206863, offload_mode: 'on' }), + mockPoint({ conc: 4, benchmark_type: 'agentic_traces', id: 206860, offload_mode: 'off' }), +]; + +const OVERLAY_POINTS: InferenceData[] = [ + mockPoint({ conc: 8, run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1' }), +]; + +/** Mirrors ScatterGraph's wiring: legend rows with onShowPoints → dialog. */ +function LegendWithPointsTable() { + const [openSeries, setOpenSeries] = useState<'official' | 'overlay' | null>(null); + + const items: CommonLegendItemProps[] = [ + { + name: 'b300-sxm', + hw: 'b300-sxm', + label: 'B300 (vLLM)', + color: '#2b83ba', + isActive: true, + onClick: () => {}, + onShowPoints: () => setOpenSeries('official'), + }, + { + name: '✕ unofficial-run-99', + hw: 'overlay-run-99', + label: '✕ my-branch', + color: '#dc2626', + isActive: true, + onClick: () => {}, + onShowPoints: () => setOpenSeries('overlay'), + }, + ]; + + const isOverlay = openSeries === 'overlay'; + return ( + <> + {}} + variant="sidebar" + /> + {openSeries && ( + { + if (!open) setOpenSeries(null); + }} + title={isOverlay ? '✕ my-branch' : 'B300 (vLLM)'} + subtitle="DeepSeek V4 Pro · Agentic Traces" + accentColor={isOverlay ? '#dc2626' : '#2b83ba'} + rows={buildLegendPointsRows(isOverlay ? 
OVERLAY_POINTS : OFFICIAL_POINTS, isOverlay)} + isOverlay={isOverlay} + /> + )} + + ); +} + +describe('ChartLegend points-table icon + dialog', () => { + beforeEach(() => { + cy.mount(); + }); + + it('renders the icon only for rows with an onShowPoints handler', () => { + cy.get('[data-testid="legend-points-b300-sxm"]').should('exist'); + cy.get('[data-testid="legend-points-overlay-run-99"]').should('exist'); + }); + + it('opens the dialog with the series points sorted by concurrency, with row links', () => { + cy.get('[data-testid="legend-points-b300-sxm"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('be.visible'); + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'B300 (vLLM)'); + cy.get('[data-testid="legend-points-dialog"]').should( + 'contain.text', + 'DeepSeek V4 Pro · Agentic Traces', + ); + // Two rows, conc ascending, linked to the agentic detail pages + cy.get('[data-testid="legend-points-row"]').should('have.length', 2); + cy.get('a[data-testid="legend-points-row"]') + .first() + .should('have.attr', 'href', '/inference/agentic/206860'); + cy.get('a[data-testid="legend-points-row"]').first().should('contain.text', '4'); + // Offload column present for agentic rows + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'Offload'); + }); + + it('overlay series opens a link-free table with the metrics-only caption', () => { + cy.get('[data-testid="legend-points-overlay-run-99"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '✕ my-branch'); + cy.get('a[data-testid="legend-points-row"]').should('not.exist'); + cy.get('div[data-testid="legend-points-row"]').should('have.length', 1); + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'metrics only'); + // Metrics still render + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '1500.5'); + }); + + it('dialog closes and can be reopened', () => { + 
cy.get('[data-testid="legend-points-b300-sxm"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('be.visible'); + cy.get('body').type('{esc}'); + cy.get('[data-testid="legend-points-dialog"]').should('not.exist'); + cy.get('[data-testid="legend-points-overlay-run-99"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('be.visible'); + }); }); diff --git a/packages/app/src/components/inference/ui/LegendPointsDialog.tsx b/packages/app/src/components/inference/ui/LegendPointsDialog.tsx new file mode 100644 index 00000000..0546872c --- /dev/null +++ b/packages/app/src/components/inference/ui/LegendPointsDialog.tsx @@ -0,0 +1,212 @@ +'use client'; + +import { ArrowDown, ArrowUp, ExternalLink } from 'lucide-react'; +import { useMemo, useState } from 'react'; + +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { cn } from '@/lib/utils'; + +import { + type LegendPointsSortKey, + type LegendPointsTableRow, + formatRowValue, + sortLegendPointsRows, +} from '@/components/inference/utils/legend-points-table'; + +export interface LegendPointsDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + /** Series label, e.g. "B300 (vLLM)". */ + title: string; + /** Context line, e.g. "DeepSeek V4 Pro · Agentic Traces". */ + subtitle: string; + /** Legend swatch color for this series (overlayRunColor for overlay runs). */ + accentColor: string; + /** Rows from buildLegendPointsRows — already default-sorted by concurrency. */ + rows: LegendPointsTableRow[]; + /** Unofficial-run overlay series: metrics only, no detail links. 
*/ + isOverlay: boolean; + onRowClick?: (row: LegendPointsTableRow) => void; +} + +interface Column { + key: LegendPointsSortKey; + label: string; + numeric: boolean; +} + +const cellValue = (row: LegendPointsTableRow, col: Column): string => { + if (col.key === 'conc') return String(row.conc); + if (col.key === 'parallelism') return row.parallelism; + if (col.key === 'offload') return row.offload ?? '—'; + return formatRowValue(row[col.key]); +}; + +/** + * Per-series drill-down opened from the chart legend: every currently-visible + * point of one hardware/framework series, with the same detail links the + * scatter points offer on click. + */ +export default function LegendPointsDialog({ + open, + onOpenChange, + title, + subtitle, + accentColor, + rows, + isOverlay, + onRowClick, +}: LegendPointsDialogProps) { + const [sort, setSort] = useState<{ key: LegendPointsSortKey; dir: 'asc' | 'desc' } | null>(null); + + const hasOffload = rows.some((r) => r.offload !== null); + const columns = useMemo( + (): Column[] => [ + { key: 'conc', label: 'Conc', numeric: true }, + { key: 'parallelism', label: 'Parallelism', numeric: false }, + ...(hasOffload ? [{ key: 'offload', label: 'Offload', numeric: false } as Column] : []), + { key: 'tputPerGpu', label: 'Tput/GPU', numeric: true }, + { key: 'p50Intvty', label: 'p50 Int', numeric: true }, + { key: 'p90Intvty', label: 'p90 Int', numeric: true }, + { key: 'p50Ttft', label: 'p50 TTFT', numeric: true }, + { key: 'p90Ttft', label: 'p90 TTFT', numeric: true }, + ], + [hasOffload], + ); + + const sortedRows = useMemo( + () => (sort ? sortLegendPointsRows(rows, sort.key, sort.dir) : rows), + [rows, sort], + ); + + const toggleSort = (key: LegendPointsSortKey) => { + setSort((prev) => + prev?.key === key ? (prev.dir === 'asc' ? { key, dir: 'desc' } : null) : { key, dir: 'asc' }, + ); + }; + + // Trailing column reserves space for the detail-link icon. 
+ const gridTemplateColumns = `${columns.map(() => 'auto').join(' ')} min-content`; + + const renderCells = (row: LegendPointsTableRow) => ( + <> + {columns.map((col) => ( + + {cellValue(row, col)} + + ))} + + {row.href && + (row.isExternal ? ( + + + ); + + return ( + + + + + + {subtitle} + + + {sortedRows.length === 0 ? ( +

+ No visible points for this series under the current filters. +

+ ) : ( + // One grid owns the column tracks; every row is a subgrid so cells + // align across ALL rows (per-row grids would auto-size independently + // and produce ragged columns). +
+
+ {columns.map((col) => { + const active = sort?.key === col.key; + return ( + + ); + })} + +
+ {sortedRows.map((row) => + row.href ? ( + onRowClick?.(row)} + className="col-span-full grid grid-cols-subgrid items-center rounded-sm hover:bg-accent whitespace-nowrap" + > + {renderCells(row)} + + ) : ( +
+ {renderCells(row)} +
+ ), + )} +
+ )} + +

+ {isOverlay + ? 'Unofficial overlay points have no stored benchmark records — metrics only, no detail links.' + : 'Click a row for the point detail — agentic points open the trace detail page, fixed-seq points open the GitHub Actions run.'}{' '} + Interactivity in tok/s/user · TTFT in s · throughput in tok/s/gpu. +

+
+
+ ); +} diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index fe4ca820..e12522ce 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -2,7 +2,7 @@ import { track } from '@/lib/analytics'; import * as d3 from 'd3'; -import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef } from 'react'; +import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react'; import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; @@ -19,6 +19,7 @@ import { getHardwareConfig, getModelSortIndex } from '@/lib/constants'; import { getChartWatermark, getPrecisionLabel, + getSequenceLabel, type Precision, Sequence, } from '@/lib/data-mappings'; @@ -62,6 +63,8 @@ import { generateTooltipContent, getPointLabel, } from '@/components/inference/utils/tooltipUtils'; +import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog'; +import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table'; import { type ParetoPointLabel, getParetoLabel, @@ -228,6 +231,11 @@ const pointLabelText = (d: InferenceData, advanced: boolean): string => // Referentially stable "no overlay data" result (see processedOverlayData). const EMPTY_OVERLAY_DATA: InferenceData[] = []; +/** Which legend series' points table is open (per-series drill-down dialog). */ +type LegendPointsTarget = + | { kind: 'official'; hwKey: string } + | { kind: 'overlay'; runIndex: number; runId: number; branch: string }; + // Scale configs are recomputed from the visible points on every render, but a // legend / precision toggle usually leaves the actual domain untouched (x-min // is pinned at 0; extremes are owned by a handful of points). 
Comparing by @@ -619,6 +627,57 @@ const ScatterGraph = React.memo( }, [pointsData]); const { data: traceAvailability } = useTraceAvailability(agenticIds); + // --- Legend points table (per-series drill-down opened from the legend) --- + const [pointsTableTarget, setPointsTableTarget] = useState(null); + + const pointsTable = useMemo(() => { + if (!pointsTableTarget) return null; + if (pointsTableTarget.kind === 'official') { + const { hwKey } = pointsTableTarget; + const hwConfig = hardwareConfig[hwKey]; + // Same visibility filters the chart applies (precision, Optimal Only), + // scoped to the clicked series. + const pts = pointsData.filter( + (p) => + p.hwKey === hwKey && + selectedPrecisions.includes(p.precision) && + (!hideNonOptimal || optimalPointKeys.has(optimalPointKey(p))), + ); + return { + hw: hwKey, + title: hwConfig ? getDisplayLabel(hwConfig) : hwKey, + color: resolveColor(hwKey), + isOverlay: false, + rows: buildLegendPointsRows(pts, false), + }; + } + const { runIndex, runId, branch } = pointsTableTarget; + // Overlay series: this run's points, respecting the overlay hw toggles. + const pts = processedOverlayData.filter( + (p) => + overlayRunIndex(p.run_url ?? 
null, runIndexByUrl) === runIndex && + activeOverlayHwTypes.has(p.hwKey as string), + ); + return { + hw: `overlay-run-${runId}`, + title: `✕ ${branch}`, + color: overlayRunColor(runIndex), + isOverlay: true, + rows: buildLegendPointsRows(pts, true), + }; + }, [ + pointsTableTarget, + hardwareConfig, + pointsData, + selectedPrecisions, + hideNonOptimal, + optimalPointKeys, + resolveColor, + processedOverlayData, + runIndexByUrl, + activeOverlayHwTypes, + ]); + // Gradient label data const allPointLabelsByKey = useMemo(() => { const globalLabelColorMap = new Map(); @@ -2454,267 +2513,310 @@ const ScatterGraph = React.memo( } return ( - - ref={chartRef} - chartId={chartId} - // Stable across toggles: the render effect keys on this for "data - // changed" rebuilds; scale domains come from x/yScaleConfig (computed - // from the visible points), and visibility is applied via opacity. - data={pointsData} - margin={CHART_MARGIN} - watermark={getChartWatermark(isUnofficialRun)} - testId="scatter-graph" - grabCursor={true} - caption={caption} - xScale={xScaleConfig} - yScale={yScaleConfig} - xAxis={xAxisConfig} - yAxis={yAxisConfig} - layers={layers} - zoom={zoomConfig} - tooltip={tooltipConfig} - transitionDuration={transitionDuration} - onRender={onRender} - noDataOverlay={ - filteredData.length === 0 && processedOverlayData.length === 0 ? ( -
-
-

No data available

-

- Please change the model, sequence, precision, date range or GPU selection. -

+ <> + + ref={chartRef} + chartId={chartId} + // Stable across toggles: the render effect keys on this for "data + // changed" rebuilds; scale domains come from x/yScaleConfig (computed + // from the visible points), and visibility is applied via opacity. + data={pointsData} + margin={CHART_MARGIN} + watermark={getChartWatermark(isUnofficialRun)} + testId="scatter-graph" + grabCursor={true} + caption={caption} + xScale={xScaleConfig} + yScale={yScaleConfig} + xAxis={xAxisConfig} + yAxis={yAxisConfig} + layers={layers} + zoom={zoomConfig} + tooltip={tooltipConfig} + transitionDuration={transitionDuration} + onRender={onRender} + noDataOverlay={ + filteredData.length === 0 && processedOverlayData.length === 0 ? ( +
+
+

No data available

+

+ Please change the model, sequence, precision, date range or GPU selection. +

+
-
- ) : undefined - } - legendElement={ - 0 - ? unofficialRunInfos - .map((info, idx) => { - const hasPoints = overlayData.data.some( - (d) => - overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx && - selectedPrecisions.includes(d.precision), - ); - if (!hasPoints) return null; - const branch = info.branch || `run ${info.id}`; - return { - name: `✕ unofficial-run-${info.id}`, - label: `✕ ${branch}`, - color: overlayRunColor(idx), - title: `UNOFFICIAL: ${branch}`, - isHighlighted: true, - hw: `overlay-run-${info.id}`, - isActive: true, - onClick: () => {}, - tooltip: ( -
-
UNOFFICIAL RUN
-
Branch: {branch}
- {info.url && ( - - View workflow run - - )} -
- ), - }; - }) - .filter((x): x is NonNullable => x !== null) - : []), - ...Object.entries(hardwareConfig) - .filter(([key]) => - showAllHardwareTypes ? effectiveActiveHwTypes.has(key) : hwTypesWithData.has(key), - ) - .toSorted( - ([a], [b]) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b), - ) - .map(([key, hwConfig]: [string, any]) => ({ - name: hwConfig.name, - label: getDisplayLabel(hwConfig), - color: resolveColor(key), - title: hwConfig.gpu, - isHighlighted: highlightConfigSuffixes.has(key.replaceAll('_', '-')), - hw: key, - isActive: showAllHardwareTypes ? true : effectiveOfficialHwTypes.has(key), - onClick: showAllHardwareTypes - ? () => {} - : () => { - handleToggleHwType(key); - track('latency_hw_type_toggled', { hw: key }); - }, - tooltip: changelog - ? formatChangelogDescription(changelog.entries[0].description) - : null, - })), - ]} - disableActiveSort={false} - isLegendExpanded={isLegendExpanded} - onExpandedChange={(expanded) => { - setIsLegendExpanded(expanded); - track('latency_legend_expanded', { expanded }); - }} - switches={[ - ...(selectedYAxisMetric === 'y_inputTputPerGpu' - ? [] - : [ - { - id: 'scatter-log-scale', - label: 'Log Scale', - checked: logScale, - onCheckedChange: (checked: boolean) => { - setLogScale(checked); - track('latency_log_scale_toggled', { enabled: checked }); - }, + ) : undefined + } + legendElement={ + 0 + ? unofficialRunInfos + .map((info, idx) => { + const hasPoints = overlayData.data.some( + (d) => + overlayRunIndex(d.run_url ?? 
null, runIndexByUrl) === idx && + selectedPrecisions.includes(d.precision), + ); + if (!hasPoints) return null; + const branch = info.branch || `run ${info.id}`; + return { + name: `✕ unofficial-run-${info.id}`, + label: `✕ ${branch}`, + color: overlayRunColor(idx), + title: `UNOFFICIAL: ${branch}`, + isHighlighted: true, + hw: `overlay-run-${info.id}`, + isActive: true, + onClick: () => {}, + onShowPoints: () => { + setPointsTableTarget({ + kind: 'overlay', + runIndex: idx, + runId: info.id, + branch, + }); + track('inference_legend_points_table_opened', { + hw: `overlay-run-${info.id}`, + framework: 'overlay', + }); + }, + tooltip: ( +
+
UNOFFICIAL RUN
+
Branch: {branch}
+ {info.url && ( + + View workflow run + + )} +
+ ), + }; + }) + .filter((x): x is NonNullable => x !== null) + : []), + ...Object.entries(hardwareConfig) + .filter(([key]) => + showAllHardwareTypes + ? effectiveActiveHwTypes.has(key) + : hwTypesWithData.has(key), + ) + .toSorted( + ([a], [b]) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b), + ) + .map(([key, hwConfig]: [string, any]) => ({ + name: hwConfig.name, + label: getDisplayLabel(hwConfig), + color: resolveColor(key), + title: hwConfig.gpu, + isHighlighted: highlightConfigSuffixes.has(key.replaceAll('_', '-')), + hw: key, + isActive: showAllHardwareTypes ? true : effectiveOfficialHwTypes.has(key), + onClick: showAllHardwareTypes + ? () => {} + : () => { + handleToggleHwType(key); + track('latency_hw_type_toggled', { hw: key }); + }, + onShowPoints: () => { + setPointsTableTarget({ kind: 'official', hwKey: key }); + track('inference_legend_points_table_opened', { + hw: key, + framework: hwConfig.framework ?? '', + }); }, - ]), - { - id: 'scatter-hide-non-optimal', - label: 'Optimal Only', - checked: hideNonOptimal, - onCheckedChange: (checked: boolean) => { - setHideNonOptimal(checked); - track('latency_hide_non_optimal_toggled', { enabled: checked }); + tooltip: changelog + ? formatChangelogDescription(changelog.entries[0].description) + : null, + })), + ]} + disableActiveSort={false} + isLegendExpanded={isLegendExpanded} + onExpandedChange={(expanded) => { + setIsLegendExpanded(expanded); + track('latency_legend_expanded', { expanded }); + }} + switches={[ + ...(selectedYAxisMetric === 'y_inputTputPerGpu' + ? 
[] + : [ + { + id: 'scatter-log-scale', + label: 'Log Scale', + checked: logScale, + onCheckedChange: (checked: boolean) => { + setLogScale(checked); + track('latency_log_scale_toggled', { enabled: checked }); + }, + }, + ]), + { + id: 'scatter-hide-non-optimal', + label: 'Optimal Only', + checked: hideNonOptimal, + onCheckedChange: (checked: boolean) => { + setHideNonOptimal(checked); + track('latency_hide_non_optimal_toggled', { enabled: checked }); + }, + // On agentic + non-e2e chart, "optimal" means "on the + // e2e-latency Pareto frontier" (not a per-axis Pareto on the + // current x metric). Explain that so users don't wonder why + // a point sitting above the line is still considered + // dominated. + ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e' + ? { + infoTooltip: + "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.", + } + : {}), }, - // On agentic + non-e2e chart, "optimal" means "on the - // e2e-latency Pareto frontier" (not a per-axis Pareto on the - // current x metric). Explain that so users don't wonder why - // a point sitting above the line is still considered - // dominated. - ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e' - ? { - infoTooltip: - "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. 
Off-frontier points may appear above the line.", - } - : {}), - }, - { - id: 'scatter-point-labels', - label: 'Labels', - checked: showPointLabels, - onCheckedChange: (checked: boolean) => { - setShowPointLabels(checked); - track('latency_point_labels_toggled', { enabled: checked }); + { + id: 'scatter-point-labels', + label: 'Labels', + checked: showPointLabels, + onCheckedChange: (checked: boolean) => { + setShowPointLabels(checked); + track('latency_point_labels_toggled', { enabled: checked }); + }, }, - }, - { - id: 'scatter-high-contrast', - label: 'High Contrast', - checked: highContrast, - onCheckedChange: (checked: boolean) => { - setHighContrast(checked); - track('latency_high_contrast_toggled', { enabled: checked }); + { + id: 'scatter-high-contrast', + label: 'High Contrast', + checked: highContrast, + onCheckedChange: (checked: boolean) => { + setHighContrast(checked); + track('latency_high_contrast_toggled', { enabled: checked }); + }, }, - }, - { - id: 'scatter-parallelism-labels', - label: 'Parallelism Labels', - checked: useAdvancedLabels, - onCheckedChange: (checked: boolean) => { - setUseAdvancedLabels(checked); - track('latency_advanced_labels_toggled', { enabled: checked }); - // Parallelism labels are point labels; turning them on is - // pointless if labels are hidden, so auto-enable Labels. 
- if (checked && !showPointLabels) setShowPointLabels(true); - if (checked && !showGradientLabels) { - window.dispatchEvent( - new CustomEvent(GRADIENT_NUDGE_EVENT, { - detail: { - enableGradient: () => { - setShowGradientLabels(true); - setUseAdvancedLabels(false); - track('latency_gradient_labels_toggled', { - enabled: true, - source: 'nudge', - }); + { + id: 'scatter-parallelism-labels', + label: 'Parallelism Labels', + checked: useAdvancedLabels, + onCheckedChange: (checked: boolean) => { + setUseAdvancedLabels(checked); + track('latency_advanced_labels_toggled', { enabled: checked }); + // Parallelism labels are point labels; turning them on is + // pointless if labels are hidden, so auto-enable Labels. + if (checked && !showPointLabels) setShowPointLabels(true); + if (checked && !showGradientLabels) { + window.dispatchEvent( + new CustomEvent(GRADIENT_NUDGE_EVENT, { + detail: { + enableGradient: () => { + setShowGradientLabels(true); + setUseAdvancedLabels(false); + track('latency_gradient_labels_toggled', { + enabled: true, + source: 'nudge', + }); + }, }, - }, - }), - ); - } + }), + ); + } + }, }, - }, - { - id: 'scatter-gradient-labels', - label: 'Gradient Labels', - checked: showGradientLabels, - onCheckedChange: (checked: boolean) => { - setShowGradientLabels(checked); - track('latency_gradient_labels_toggled', { enabled: checked }); + { + id: 'scatter-gradient-labels', + label: 'Gradient Labels', + checked: showGradientLabels, + onCheckedChange: (checked: boolean) => { + setShowGradientLabels(checked); + track('latency_gradient_labels_toggled', { enabled: checked }); + }, }, - }, - { - id: 'scatter-line-labels', - label: 'Line Labels', - checked: showLineLabels, - onCheckedChange: (checked: boolean) => { - setShowLineLabels(checked); - track('latency_line_labels_toggled', { enabled: checked }); + { + id: 'scatter-line-labels', + label: 'Line Labels', + checked: showLineLabels, + onCheckedChange: (checked: boolean) => { + setShowLineLabels(checked); + 
track('latency_line_labels_toggled', { enabled: checked }); + }, }, - }, - { - id: 'scatter-speed-overlay', - label: 'Bus / Race Car', - advanced: true, - checked: showSpeedOverlay, - onCheckedChange: (checked: boolean) => { - setShowSpeedOverlay(checked); - track('latency_speed_overlay_toggled', { enabled: checked }); + { + id: 'scatter-speed-overlay', + label: 'Bus / Race Car', + advanced: true, + checked: showSpeedOverlay, + onCheckedChange: (checked: boolean) => { + setShowSpeedOverlay(checked); + track('latency_speed_overlay_toggled', { enabled: checked }); + }, }, - }, - { - id: 'scatter-minecraft-overlay', - label: 'Donkey / Elytra', - advanced: true, - checked: showMinecraftOverlay, - onCheckedChange: (checked: boolean) => { - setShowMinecraftOverlay(checked); - track('latency_minecraft_overlay_toggled', { enabled: checked }); + { + id: 'scatter-minecraft-overlay', + label: 'Donkey / Elytra', + advanced: true, + checked: showMinecraftOverlay, + onCheckedChange: (checked: boolean) => { + setShowMinecraftOverlay(checked); + track('latency_minecraft_overlay_toggled', { enabled: checked }); + }, }, - }, - ]} - onAdvancedExpandedChange={(expanded) => { - track('latency_advanced_controls_toggled', { expanded }); - }} - actions={ - effectiveOfficialHwTypes.size < hwTypesWithData.size || - activeOverlayHwTypes.size < allOverlayHwTypes.size - ? [ - { - id: 'scatter-reset-filter', - label: 'Reset filter', - onClick: () => { - selectAllHwTypes(); - setLocalOfficialOverride(null); - resetOverlayHwTypes(); - track('latency_legend_filter_reset'); + ]} + onAdvancedExpandedChange={(expanded) => { + track('latency_advanced_controls_toggled', { expanded }); + }} + actions={ + effectiveOfficialHwTypes.size < hwTypesWithData.size || + activeOverlayHwTypes.size < allOverlayHwTypes.size + ? 
[ + { + id: 'scatter-reset-filter', + label: 'Reset filter', + onClick: () => { + selectAllHwTypes(); + setLocalOfficialOverride(null); + resetOverlayHwTypes(); + track('latency_legend_filter_reset'); + }, }, - }, - ] - : [] + ] + : [] + } + precisionIndicators={selectedPrecisions} + enableTooltips={true} + /> + } + /> + {pointsTable && ( + { + if (!open) setPointsTableTarget(null); + }} + title={pointsTable.title} + subtitle={`${modelLabel} · ${getSequenceLabel(selectedSequence)}`} + accentColor={pointsTable.color} + rows={pointsTable.rows} + isOverlay={pointsTable.isOverlay} + onRowClick={(row) => + track('inference_legend_points_table_row_clicked', { + hw: pointsTable.hw, + conc: row.conc, + href: row.href ?? '', + }) } - precisionIndicators={selectedPrecisions} - enableTooltips={true} /> - } - /> + )} + ); }, ); diff --git a/packages/app/src/components/inference/utils/legend-points-table.test.ts b/packages/app/src/components/inference/utils/legend-points-table.test.ts new file mode 100644 index 00000000..b29cecbb --- /dev/null +++ b/packages/app/src/components/inference/utils/legend-points-table.test.ts @@ -0,0 +1,223 @@ +import { describe, expect, it } from 'vitest'; + +import type { InferenceData } from '@/components/inference/types'; +import { + buildLegendPointsRows, + formatRowValue, + pointDetailHref, + sortLegendPointsRows, +} from '@/components/inference/utils/legend-points-table'; + +// --------------------------------------------------------------------------- +// fixture factory (mirrors tooltip-utils.test.ts) +// --------------------------------------------------------------------------- +function pt(overrides: Partial = {}): InferenceData { + return { + date: '2025-06-15', + x: 100, + y: 500, + tp: 8, + conc: 64, + hwKey: 'b300_vllm', + precision: 'fp4', + tput_per_gpu: 1234.5678, + median_intvty: 45.2, + p90_intvty: 38.1, + median_ttft: 0.42, + p90_ttft: 0.87, + tpPerGpu: { y: 1000, roof: false }, + tpPerMw: { y: 50, roof: false }, + costh: { y: 
1, roof: false }, + costn: { y: 1, roof: false }, + costr: { y: 1, roof: false }, + costhi: { y: 1, roof: false }, + costni: { y: 1, roof: false }, + costri: { y: 1, roof: false }, + ...overrides, + } as InferenceData; +} + +// =========================================================================== +// pointDetailHref +// =========================================================================== +describe('pointDetailHref', () => { + it('agentic point with numeric id links to the in-app detail page', () => { + const d = pt({ benchmark_type: 'agentic_traces', id: 206863 }); + expect(pointDetailHref(d, false)).toEqual({ + href: '/inference/agentic/206863', + isExternal: false, + }); + }); + + it('fixed-seq point links to its GitHub Actions run (repo URL rewritten)', () => { + const d = pt({ + benchmark_type: 'single_turn', + run_url: 'https://github.com/InferenceMAX/InferenceMAX/actions/runs/123', + }); + expect(pointDetailHref(d, false)).toEqual({ + href: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/123', + isExternal: true, + }); + }); + + it('agentic point without a numeric id falls back to the run URL', () => { + const d = pt({ + benchmark_type: 'agentic_traces', + run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/9', + }); + expect(pointDetailHref(d, false)).toEqual({ + href: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/9', + isExternal: true, + }); + }); + + it('returns no link when there is neither an id nor a run URL', () => { + expect(pointDetailHref(pt(), false)).toEqual({ href: null, isExternal: false }); + }); + + it('overlay points never get a link (no DB benchmark id)', () => { + const d = pt({ + benchmark_type: 'agentic_traces', + id: 42, + run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1', + }); + expect(pointDetailHref(d, true)).toEqual({ href: null, isExternal: false }); + }); +}); + +// =========================================================================== +// 
buildLegendPointsRows +// =========================================================================== +describe('buildLegendPointsRows', () => { + it('maps official point fields onto table rows', () => { + const rows = buildLegendPointsRows( + [pt({ benchmark_type: 'agentic_traces', id: 1, ep: 8, dp_attention: true })], + false, + ); + expect(rows).toHaveLength(1); + expect(rows[0]).toMatchObject({ + conc: 64, + parallelism: 'DEP8', + precision: 'fp4', + offload: null, + tputPerGpu: 1234.5678, + p50Intvty: 45.2, + p90Intvty: 38.1, + p50Ttft: 0.42, + p90Ttft: 0.87, + href: '/inference/agentic/1', + isExternal: false, + }); + }); + + it('default-sorts by concurrency ascending', () => { + const rows = buildLegendPointsRows( + [pt({ conc: 32 }), pt({ conc: 4 }), pt({ conc: 16 })], + false, + ); + expect(rows.map((r) => r.conc)).toEqual([4, 16, 32]); + }); + + it('keeps agentic offload on/off row pairs adjacent and deterministic', () => { + const rows = buildLegendPointsRows( + [ + pt({ conc: 8, offload_mode: 'on' }), + pt({ conc: 4, offload_mode: 'off' }), + pt({ conc: 4, offload_mode: 'on' }), + ], + false, + ); + expect(rows.map((r) => [r.conc, r.offload])).toEqual([ + [4, 'OFF'], + [4, 'ON'], + [8, 'ON'], + ]); + }); + + it('nulls out metrics missing on old points instead of coercing to 0', () => { + const rows = buildLegendPointsRows( + [pt({ tput_per_gpu: undefined, p90_intvty: undefined, p90_ttft: Number.NaN })], + false, + ); + expect(rows[0].tputPerGpu).toBeNull(); + expect(rows[0].p90Intvty).toBeNull(); + expect(rows[0].p90Ttft).toBeNull(); + }); + + it('treats the transform\'s "?? 0" coercion of absent metrics as missing', () => { + // Agentic rows have no median_* keys in metrics JSONB; benchmark-transform + // fills them with 0. These metrics are strictly positive when measured. 
+ const rows = buildLegendPointsRows([pt({ median_intvty: 0, median_ttft: 0 })], false); + expect(rows[0].p50Intvty).toBeNull(); + expect(rows[0].p50Ttft).toBeNull(); + }); + + it('overlay rows carry metrics but no links', () => { + const rows = buildLegendPointsRows( + [pt({ id: 7, benchmark_type: 'agentic_traces', run_url: 'https://github.com/x/y/runs/1' })], + true, + ); + expect(rows[0].href).toBeNull(); + expect(rows[0].tputPerGpu).toBe(1234.5678); + }); +}); + +// =========================================================================== +// sortLegendPointsRows +// =========================================================================== +describe('sortLegendPointsRows', () => { + const rows = buildLegendPointsRows( + [ + pt({ conc: 4, tput_per_gpu: 300 }), + pt({ conc: 16, tput_per_gpu: undefined }), + pt({ conc: 8, tput_per_gpu: 900 }), + ], + false, + ); + + it('sorts numeric columns in both directions', () => { + expect(sortLegendPointsRows(rows, 'tputPerGpu', 'asc').map((r) => r.conc)).toEqual([4, 8, 16]); + expect(sortLegendPointsRows(rows, 'tputPerGpu', 'desc').map((r) => r.conc)).toEqual([8, 4, 16]); + }); + + it('always sorts null metrics last', () => { + for (const dir of ['asc', 'desc'] as const) { + expect(sortLegendPointsRows(rows, 'tputPerGpu', dir).at(-1)?.conc).toBe(16); + } + }); + + it('sorts string columns alphabetically', () => { + const mixed = buildLegendPointsRows( + [pt({ conc: 1, ep: 8 }), pt({ conc: 2, tp: 4, ep: undefined })], + false, + ); + expect(sortLegendPointsRows(mixed, 'parallelism', 'asc').map((r) => r.parallelism)).toEqual([ + '4', + 'TEP8', + ]); + }); + + it('does not mutate the input array', () => { + const before = rows.map((r) => r.conc); + sortLegendPointsRows(rows, 'tputPerGpu', 'desc'); + expect(rows.map((r) => r.conc)).toEqual(before); + }); +}); + +// =========================================================================== +// formatRowValue +// 
=========================================================================== +describe('formatRowValue', () => { + it('renders em dash for missing values', () => { + expect(formatRowValue(null)).toBe('—'); + }); + + it('caps at 3 decimals like the scatter tooltip', () => { + expect(formatRowValue(1234.5678)).toBe('1234.568'); + expect(formatRowValue(0.42)).toBe('0.42'); + }); + + it('comma-formats large values like the scatter tooltip', () => { + expect(formatRowValue(123456.7)).toBe('123,456.7'); + }); +}); diff --git a/packages/app/src/components/inference/utils/legend-points-table.ts b/packages/app/src/components/inference/utils/legend-points-table.ts new file mode 100644 index 00000000..0457e7c2 --- /dev/null +++ b/packages/app/src/components/inference/utils/legend-points-table.ts @@ -0,0 +1,123 @@ +import { updateRepoUrl } from '@/lib/utils'; + +import type { InferenceData } from '@/components/inference/types'; +import { fmt, getPointLabel } from '@/components/inference/utils/tooltipUtils'; + +/** + * One row of the per-series points table opened from the chart legend. + * Metric fields are `null` when the point predates the field (old runs) so the + * table can render an em dash instead of a misleading 0. + */ +export interface LegendPointsTableRow { + /** Stable React key — mirrors the scatter chart's per-point identity fields. */ + key: string; + conc: number; + /** Shared parallelism label (e.g. "TP8", "DPAEP8", "2xEP4+1xDPAEP32"). */ + parallelism: string; + precision: string; + /** Agentic offload mode ("ON" / "OFF"), null for fixed-seq points. */ + offload: string | null; + tputPerGpu: number | null; + p50Intvty: number | null; + p90Intvty: number | null; + p50Ttft: number | null; + p90Ttft: number | null; + /** Detail link — null for overlay points (no DB benchmark id). */ + href: string | null; + /** True when href is an external GitHub Actions run (open in new tab). 
*/ + isExternal: boolean; +} + +export type LegendPointsSortKey = + | 'conc' + | 'parallelism' + | 'offload' + | 'tputPerGpu' + | 'p50Intvty' + | 'p90Intvty' + | 'p50Ttft' + | 'p90Ttft'; + +// benchmark-transform coerces absent metrics to 0 (`m.median_ttft ?? 0`), and +// every column metric here (throughput, interactivity, TTFT) is strictly +// positive in reality — so non-positive means "not recorded", shown as a dash. +const num = (v: number | undefined | null): number | null => + typeof v === 'number' && Number.isFinite(v) && v > 0 ? v : null; + +/** + * Detail-page destination for a point — the EXACT same navigation the scatter + * tooltip offers on point click: agentic points go to the in-app + * `/inference/agentic/` detail page; fixed-seq points open the GitHub + * Actions run that produced them. Overlay (unofficial run) points have no DB + * benchmark id, so they get no link. + */ +export function pointDetailHref( + d: InferenceData, + isOverlay: boolean, +): { href: string | null; isExternal: boolean } { + if (isOverlay) return { href: null, isExternal: false }; + if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') { + return { href: `/inference/agentic/${d.id}`, isExternal: false }; + } + if (d.run_url) return { href: updateRepoUrl(d.run_url), isExternal: true }; + return { href: null, isExternal: false }; +} + +/** + * Shape a series' visible points into table rows, default-sorted by + * concurrency ascending (offload/parallelism tie-breaks keep the agentic + * on/off row pairs adjacent and deterministic). + */ +export function buildLegendPointsRows( + points: InferenceData[], + isOverlay: boolean, +): LegendPointsTableRow[] { + return points + .map((d, i) => { + const { href, isExternal } = pointDetailHref(d, isOverlay); + return { + key: `${d.hwKey}|${d.precision}|${d.conc}|${getPointLabel(d)}|${d.offload_mode ?? ''}|${i}`, + conc: d.conc, + parallelism: getPointLabel(d), + precision: d.precision, + offload: d.offload_mode ? 
d.offload_mode.toUpperCase() : null, + tputPerGpu: num(d.tput_per_gpu), + p50Intvty: num(d.median_intvty), + p90Intvty: num(d.p90_intvty), + p50Ttft: num(d.median_ttft), + p90Ttft: num(d.p90_ttft), + href, + isExternal, + }; + }) + .toSorted( + (a, b) => + a.conc - b.conc || + a.parallelism.localeCompare(b.parallelism) || + (a.offload ?? '').localeCompare(b.offload ?? ''), + ); +} + +/** Column sort with nulls always last; concurrency as the stable tie-break. */ +export function sortLegendPointsRows( + rows: LegendPointsTableRow[], + key: LegendPointsSortKey, + dir: 'asc' | 'desc', +): LegendPointsTableRow[] { + const mul = dir === 'asc' ? 1 : -1; + return rows.toSorted((a, b) => { + const av = a[key]; + const bv = b[key]; + if (av === null && bv === null) return a.conc - b.conc; + if (av === null) return 1; + if (bv === null) return -1; + const cmp = + typeof av === 'string' || typeof bv === 'string' + ? String(av).localeCompare(String(bv)) + : (av as number) - (bv as number); + return mul * cmp || a.conc - b.conc; + }); +} + +/** Table cell formatting — same capping as the scatter tooltip values. */ +export const formatRowValue = (v: number | null): string => (v === null ? '—' : fmt(v)); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index e3f0de6d..8f8ab4df 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -80,8 +80,9 @@ const tooltipLine = (label: string, value: string | number) => const formatPct = (v: number | undefined): string | null => v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; -/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */ -const fmt = (v: number): string => { +/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). 
+ * Exported so the legend points table shows exactly the numbers the tooltip shows. */ +export const fmt = (v: number): string => { if (!Number.isFinite(v)) return String(v); const rounded = parseFloat(v.toFixed(3)); if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded); diff --git a/packages/app/src/components/ui/chart-legend-item.tsx b/packages/app/src/components/ui/chart-legend-item.tsx index fae83360..07344270 100644 --- a/packages/app/src/components/ui/chart-legend-item.tsx +++ b/packages/app/src/components/ui/chart-legend-item.tsx @@ -1,4 +1,4 @@ -import { X } from 'lucide-react'; +import { Table2, X } from 'lucide-react'; import React from 'react'; import { cn } from '@/lib/utils'; @@ -19,6 +19,12 @@ export interface CommonLegendItemProps { isLegendExpanded?: boolean; // Whether the legend is expanded to show full text sidebarMode?: boolean; // Use sidebar-style visual feedback (line-through + faded dot) onRemove?: (name: string) => void; + /** + * When provided, renders a small table icon that opens a per-series points + * table (all data points for this hardware/framework series). Only the + * inference tab's legend passes this — other tabs get no icon. + */ + onShowPoints?: (name: string) => void; } const ChartLegendItem: React.FC = ({ @@ -36,6 +42,7 @@ const ChartLegendItem: React.FC = ({ isLegendExpanded = true, sidebarMode = false, onRemove, + onShowPoints, }) => { const id = `checkbox-${hw || name}`; // Unique ID for accessibility const isLongText = (label ?? '').length > 8; @@ -97,6 +104,20 @@ const ChartLegendItem: React.FC = ({ {label} + {onShowPoints && ( + + )} ); @@ -104,6 +125,7 @@ const ChartLegendItem: React.FC = ({ 'transition-opacity duration-300', isActive ? 'opacity-100' : sidebarMode ? 
'no-export' : 'opacity-50 no-export', isHighlighted && 'text-red-900 dark:text-red-400 font-bold', + onShowPoints && 'group/row flex w-full items-center', ); if (asFragment) { diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx index ca7424bf..86fadfad 100644 --- a/packages/app/src/components/ui/chart-legend.tsx +++ b/packages/app/src/components/ui/chart-legend.tsx @@ -427,6 +427,7 @@ export default function ChartLegend({ onHover={onItemHover} onHoverEnd={onItemHoverEnd} onRemove={effectiveRemove} + onShowPoints={item.onShowPoints} asFragment isLegendExpanded={effectiveExpanded} sidebarMode={isSidebar} @@ -438,7 +439,9 @@ export default function ChartLegend({ {enableTooltips ? ( -
{legendItem}
+ {/* Full width when the row carries a points-table icon so the + ml-auto icon pins to a consistent right-edge column. */} +
{legendItem}
{item.isHighlighted && item.tooltip && ( @@ -521,6 +524,7 @@ export default function ChartLegend({ onHover={onItemHover} onHoverEnd={onItemHoverEnd} onRemove={effectiveRemove} + onShowPoints={item.onShowPoints} sidebarMode={isSidebar} asFragment /> From d6cf3a60a1ddc9d7c22da61668d46baa8242319e Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 15:25:04 -0500 Subject: [PATCH 15/40] chore: exclude package scratch dirs from typecheck --- tsconfig.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsconfig.json b/tsconfig.json index b1541c43..7ff2f0b1 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -17,5 +17,5 @@ } }, "include": ["packages/**/*.ts", "packages/**/*.tsx"], - "exclude": ["packages/*/node_modules", "packages/*/.next"] + "exclude": ["packages/*/node_modules", "packages/*/.next", "packages/*/scratch"] } From 9c4dca063bebf3f185c388298130fd3a77e623a1 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 15:25:12 -0500 Subject: [PATCH 16/40] fix(agentic): exclude osl=0 turns from normalized-E2E derivation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extractTurn guarded isl<=0 but not osl<=0, so cancelled/empty-output turns collapsed the whole decode window into one ITL interval and the @400-token projection became ttft + 399x(latency-ttft) — ~386x inflation baked into stored p75/p90 aggregates (seeded repro: p90 1104.78s -> 6.01s). STATS_VERSION bumped 4->5 so stored payloads recompute via the version fallback. Adds regression test. 
--- packages/db/src/queries/agentic-aggregates.ts | 5 ++- .../queries/derived-agentic-metrics.test.ts | 41 +++++++++++++++++++ .../db/src/queries/derived-agentic-metrics.ts | 2 +- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts index 72faa148..0443398d 100644 --- a/packages/db/src/queries/agentic-aggregates.ts +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -49,8 +49,11 @@ export { percentilesOf, type MetricPercentiles } from './agentic-shared'; * they do for vllm runs. * * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL. + * + * v5: reject osl <= 0 in extractTurn to exclude cancelled/empty-output turns + * whose decode-interval math would explode normalized E2E to thousands of seconds. */ -export const STATS_VERSION = 4; +export const STATS_VERSION = 5; export interface AgenticAggregate { id: number; diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts index afc5b22d..84c09193 100644 --- a/packages/db/src/queries/derived-agentic-metrics.test.ts +++ b/packages/db/src/queries/derived-agentic-metrics.test.ts @@ -108,4 +108,45 @@ describe('computeDerivedFromBlob', () => { const out = computeDerivedFromBlob(turns.join('\n')); expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6); }); + + it('excludes osl=0 (cancelled/empty-output) turns from normalized E2E', () => { + // Two normal turns + one cancelled turn (osl=0, latency=30s, ttft=1s). + // + // The cancelled turn must be excluded because observedDecodeIntervals collapses + // to max(0-1,1)=1, making itlMs=(30000-1000)/1=29000ms and normalizedMs explode + // to ~11 572 s — roughly 386× the real scale. (Pre-fix behavior for reference; + // this number is intentionally not asserted below to avoid enshrining the bug.) 
+ // + // Normal turn A: isl=100, osl=50, ttft=500ms, latency=1000ms + // observedDecodeIntervals = max(49,1) = 49 + // itlMs = (1000-500)/49 + // normalizedMs = 500 + 399*(500/49) + // + // Normal turn B: isl=200, osl=100, ttft=1000ms, latency=3000ms + // observedDecodeIntervals = max(99,1) = 99 + // itlMs = (3000-1000)/99 + // normalizedMs = 1000 + 399*(2000/99) + const normA = (500 + (399 * 500) / 49) / 1000; // seconds + const normB = (1000 + (399 * 2000) / 99) / 1000; // seconds + + const jsonl = [ + rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + rec('s1', 1, { isl: 200, osl: 100, ttft_ms: 1000, latency_ms: 3000 }), + // Cancelled / empty-output turn — osl=0 must be rejected by extractTurn. + rec('s2', 0, { isl: 150, osl: 0, ttft_ms: 1000, latency_ms: 30000 }), + ].join('\n'); + + const out = computeDerivedFromBlob(jsonl); + + // Only the 2 normal turns contribute; osl=0 record is silently excluded. + expect(out.normalized_e2e_400?.n).toBe(2); + + // p90 of [normA, normB] sorted ascending (normA < normB): + // pos = 1*0.9 = 0.9; result = normA + (normB - normA)*0.9 + const expectedP90 = normA + (normB - normA) * 0.9; + expect(out.normalized_e2e_400?.p90).toBeCloseTo(expectedP90, 6); + + // Sanity: p90 should be single-digit seconds, not thousands. 
+ expect(out.normalized_e2e_400!.p90).toBeLessThan(20); + }); }); diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts index 8e5d15c9..24b24cf1 100644 --- a/packages/db/src/queries/derived-agentic-metrics.ts +++ b/packages/db/src/queries/derived-agentic-metrics.ts @@ -86,7 +86,7 @@ function extractTurn(rec: ProfileRecord): TurnFields | null { const isl = readNum(m.input_sequence_length); const osl = readNum(m.output_sequence_length); if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null; - if (rl <= 0 || tt <= 0 || isl <= 0) return null; + if (rl <= 0 || tt <= 0 || isl <= 0 || osl <= 0) return null; return { request_latency_ms: rl, ttft_ms: tt, isl, osl }; } From ef80fef4f210a0afad729ea681e5e1367a9c77d0 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 15:32:52 -0500 Subject: [PATCH 17/40] fix(db): include offload_mode in getBenchmarksForRun dedup key DISTINCT ON (config_id, conc, isl, osl) collapsed agentic offload on/off variants (isl/osl both NULL) into one arbitrary winner, so run views silently dropped half the sweep (seeded repro: 2 rows -> 4). Adds offload_mode to the SQL DISTINCT ON + ORDER BY and to the json-provider dedup key (normalized ?? 'off' to match lineKey). Every other selection path already keyed on it. Adds 4 regression tests. 
--- ...on-provider.get-benchmarks-for-run.test.ts | 151 ++++++++++++++++++ packages/db/src/json-provider.ts | 2 +- packages/db/src/queries/benchmarks.ts | 4 +- 3 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 packages/db/src/json-provider.get-benchmarks-for-run.test.ts diff --git a/packages/db/src/json-provider.get-benchmarks-for-run.test.ts b/packages/db/src/json-provider.get-benchmarks-for-run.test.ts new file mode 100644 index 00000000..cd640f0e --- /dev/null +++ b/packages/db/src/json-provider.get-benchmarks-for-run.test.ts @@ -0,0 +1,151 @@ +import { mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import type { getBenchmarksForRun as GetBenchmarksForRun } from './json-provider.js'; + +/** + * Regression guard for the offload_mode dedup bug in getBenchmarksForRun. + * + * Agentic sweeps that test offload ON and OFF at the same (config, conc, + * isl=NULL, osl=NULL) produce two distinct benchmark_results rows that differ + * only in offload_mode. The old dedup key was: + * + * `${config_id}:${conc}:${isl}:${osl}` + * + * which collapsed both offload variants into one, silently dropping the second. + * The fix appends `?? 'off'` normalised offload_mode: + * + * `${config_id}:${conc}:${isl}:${osl}:${offload_mode ?? 'off'}` + * + * This test seeds two rows differing only in offload_mode at the same + * (config, conc, isl=null, osl=null) and asserts BOTH survive. 
+ */ + +const cfg = (id: number) => ({ + id, + hardware: 'h100', + framework: 'vllm', + model: 'testm', + precision: 'fp8', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 1, + prefill_ep: 1, + prefill_dp_attention: false, + prefill_num_workers: 1, + decode_tp: 1, + decode_ep: 1, + decode_dp_attention: false, + decode_num_workers: 1, + num_prefill_gpu: 0, + num_decode_gpu: 8, +}); + +const run = (id: number, githubId: number, date: string) => ({ + id, + github_run_id: githubId, + run_attempt: 1, + name: `run ${githubId}`, + status: 'completed', + conclusion: 'success', + head_sha: 'sha', + head_branch: 'main', + html_url: `https://github.com/x/runs/${githubId}`, + created_at: `${date}T00:00:00Z`, + run_started_at: `${date}T00:00:00Z`, + date, +}); + +let nextId = 1; +const result = ( + runDbId: number, + configId: number, + date: string, + conc: number, + offloadMode: string | null, + isl: number | null = null, + osl: number | null = null, +) => ({ + id: nextId++, + workflow_run_id: runDbId, + config_id: configId, + benchmark_type: 'agentic', + date, + isl, + osl, + conc, + offload_mode: offloadMode, + image: null, + metrics: { median_tpot: 0.1 }, + error: null, + server_log_id: null, +}); + +const DATE = '2026-07-01'; +const GITHUB_RUN_ID = 9999001; + +let getBenchmarksForRun: typeof GetBenchmarksForRun; + +beforeAll(async () => { + const dir = mkdtempSync(join(tmpdir(), 'infx-get-benchmarks-for-run-')); + writeFileSync(join(dir, 'configs.json'), JSON.stringify([cfg(1)])); + writeFileSync( + join(dir, 'workflow_runs.json'), + JSON.stringify([ + run(1, GITHUB_RUN_ID, DATE), // the agentic sweep run + ]), + ); + writeFileSync( + join(dir, 'benchmark_results.json'), + JSON.stringify([ + // conc=16, offload=off + result(1, 1, DATE, 16, 'off'), + // conc=16, offload=on — same (config, conc, isl=null, osl=null), differs only in offload_mode + result(1, 1, DATE, 16, 'on'), + // conc=64, offload=off + result(1, 1, DATE, 64, 'off'), + // 
conc=64, offload=on + result(1, 1, DATE, 64, 'on'), + ]), + ); + process.env.DUMP_DIR = dir; + const mod = await import('./json-provider.js'); + getBenchmarksForRun = mod.getBenchmarksForRun; +}); + +afterAll(() => { + delete process.env.DUMP_DIR; +}); + +describe('getBenchmarksForRun — offload_mode dedup', () => { + it('returns all 4 rows when an agentic sweep covers offload on+off at both concurrencies', () => { + const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID); + expect(rows).toHaveLength(4); + }); + + it('preserves both offload modes at conc=16', () => { + const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID).filter((r) => r.conc === 16); + expect(rows).toHaveLength(2); + const modes = rows.map((r) => r.offload_mode).toSorted(); + expect(modes).toEqual(['off', 'on']); + }); + + it('preserves both offload modes at conc=64', () => { + const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID).filter((r) => r.conc === 64); + expect(rows).toHaveLength(2); + const modes = rows.map((r) => r.offload_mode).toSorted(); + expect(modes).toEqual(['off', 'on']); + }); + + it('treats null offload_mode as "off" (no double-count with an explicit off row)', () => { + // NOTE: the fixture seeds no offload_mode=null row — both conc=16 rows carry an explicit 'off'/'on'. + const rows = getBenchmarksForRun('testm', GITHUB_RUN_ID).filter((r) => r.conc === 16); + // So this only asserts that exactly one 'off' variant survives dedup; the `?? 'off'` fallback itself is not exercised here. 
+ const nullOrOff = rows.filter((r) => r.offload_mode === null || r.offload_mode === 'off'); + expect(nullOrOff).toHaveLength(1); // exactly one 'off' variant survives dedup + }); +}); diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index b502b243..2d335d17 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -439,7 +439,7 @@ export function getBenchmarksForRun( if (br.workflow_run_id !== run.id) continue; const c = s.configs.get(br.config_id); if (!c || !modelKeys.has(c.model)) continue; - const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`; + const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`; if (!seen.has(key)) seen.set(key, br); } diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 37301e2b..d09f92f4 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -218,7 +218,7 @@ export async function getBenchmarksForRun( ): Promise { const modelKeys = Array.isArray(modelKey) ? 
modelKey : [modelKey]; const rows = await sql` - SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) br.id, c.hardware, c.framework, @@ -253,7 +253,7 @@ export async function getBenchmarksForRun( WHERE c.model = ANY(${modelKeys}) AND br.error IS NULL AND wr.github_run_id = ${Number(githubRunId)} - ORDER BY br.config_id, br.conc, br.isl, br.osl, br.date DESC + ORDER BY br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date DESC `; return rows as unknown as BenchmarkRow[]; } From 338b0df54e53522ee5796d3f657fb38353f9e4d4 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 16:00:57 -0500 Subject: [PATCH 18/40] fix(agentic): version-derived cache keys + self-healing stale recomputes Four blob-cached agentic routes had unversioned cache keys; blobSet is write-once and backfills never purge, so payload-version bumps served stale blobs indefinitely (the DB version check is bypassed on blob hits). Keys now derive from the governing VERSION constants (STATS/REQUEST_TIMELINE/CHART_SERIES), asserted by tests. Stale/missing recomputes now persist their result via a best-effort fire-and-forget ::jsonb write-back (no-ops on read replicas), so one request self-heals a row instead of re-gunzipping the raw blob until a manual backfill. STATS_VERSION moves to the dependency-free agentic-shared leaf to avoid an import cycle. Live-verified: stored payloads healed 4->5 / 11->12 / 4->5 on a single query. 
--- .../app/api/v1/agentic-aggregates/route.ts | 11 +- .../src/app/api/v1/agentic-cache-keys.test.ts | 70 ++++++++ .../api/v1/derived-agentic-metrics/route.ts | 15 +- .../src/app/api/v1/request-timeline/route.ts | 10 +- .../src/app/api/v1/trace-histograms/route.ts | 11 +- .../app/api/v1/trace-server-metrics/route.ts | 10 +- .../db/src/queries/agentic-aggregates.test.ts | 151 +++++++++++++++++- packages/db/src/queries/agentic-aggregates.ts | 147 ++++++++++------- .../db/src/queries/agentic-shared.test.ts | 79 +++++++++ packages/db/src/queries/agentic-shared.ts | 138 +++++++++++++++- .../queries/derived-agentic-metrics.test.ts | 106 +++++++++++- .../db/src/queries/derived-agentic-metrics.ts | 72 +++++++-- .../db/src/queries/request-timeline.test.ts | 52 ++++++ packages/db/src/queries/request-timeline.ts | 14 +- .../src/queries/trace-server-metrics.test.ts | 6 +- .../db/src/queries/trace-server-metrics.ts | 8 + 16 files changed, 815 insertions(+), 85 deletions(-) create mode 100644 packages/app/src/app/api/v1/agentic-cache-keys.test.ts create mode 100644 packages/db/src/queries/agentic-shared.test.ts diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts index 63fd2512..83238e89 100644 --- a/packages/app/src/app/api/v1/agentic-aggregates/route.ts +++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts @@ -1,6 +1,7 @@ import { getDb } from '@semianalysisai/inferencex-db/connection'; import { getAgenticAggregates, + STATS_VERSION, type AgenticAggregateMap, } from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; @@ -13,9 +14,17 @@ export const dynamic = 'force-dynamic'; // blobOnly: response stays small (a few numbers per id), but generating it // parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the // "Aggregates" toggle stays snappy. +// +// Key derived from STATS_VERSION (governs the `aggregate_stats` payload). 
The +// blob cache is write-once with no post-backfill purge, so deriving the key +// from the constant is what rolls the namespace on a version bump — a +// hand-written string would pin the route to stale blob hits forever. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). */ +export const CACHE_KEY_PREFIX = `agentic-aggregates-v${STATS_VERSION}`; + const getCachedAgenticAggregates = cachedQuery( (ids: number[]): Promise => getAgenticAggregates(getDb(), ids), - 'agentic-aggregates', + CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/agentic-cache-keys.test.ts b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts new file mode 100644 index 00000000..58fa194f --- /dev/null +++ b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts @@ -0,0 +1,70 @@ +/** + * Guards that every agentic blob-cache key is DERIVED from the version constant + * that governs its payload — not a hand-written string. `blobSet` is write-once + * and nothing purges the blob cache after a backfill, so an unversioned (or + * hand-bumped) key would serve stale data forever after a payload-version bump. + * Deriving the key from the constant means a future bump rolls the cache + * namespace automatically; these tests fail loudly if a route drifts back to a + * literal string. + */ + +import { describe, expect, it, vi } from 'vitest'; + +// Route modules call getDb() at import time via cachedQuery's closure and pull +// in the blob cache — stub both so importing the route is side-effect-free. +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: vi.fn(() => 'mock-sql'), + JSON_MODE: false, + FIXTURES_MODE: false, +})); + +vi.mock('@/lib/api-cache', () => ({ + // Passthrough so importing the route doesn't touch blob storage; the key is + // still exported as CACHE_KEY_PREFIX for us to assert on. 
+ cachedQuery: (fn: (...args: unknown[]) => unknown) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; +import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series'; +import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; + +import { CACHE_KEY_PREFIX as derivedAgenticMetricsKey } from './derived-agentic-metrics/route'; +import { CACHE_KEY_PREFIX as agenticAggregatesKey } from './agentic-aggregates/route'; +import { CACHE_KEY_PREFIX as requestTimelineKey } from './request-timeline/route'; +import { CACHE_KEY_PREFIX as traceServerMetricsKey } from './trace-server-metrics/route'; +import { CACHE_KEY_PREFIX as traceHistogramsKey } from './trace-histograms/route'; + +describe('agentic blob-cache keys are version-derived', () => { + it('derived-agentic-metrics key embeds STATS_VERSION', () => { + expect(derivedAgenticMetricsKey).toBe(`derived-agentic-metrics-v${STATS_VERSION}`); + }); + + it('agentic-aggregates key embeds STATS_VERSION', () => { + expect(agenticAggregatesKey).toBe(`agentic-aggregates-v${STATS_VERSION}`); + }); + + it('request-timeline key embeds REQUEST_TIMELINE_VERSION', () => { + expect(requestTimelineKey).toBe(`request-timeline-v${REQUEST_TIMELINE_VERSION}`); + }); + + it('trace-server-metrics key embeds CHART_SERIES_VERSION', () => { + expect(traceServerMetricsKey).toBe(`trace-server-metrics-v${CHART_SERIES_VERSION}`); + }); + + it('trace-histograms key embeds REQUEST_TIMELINE_VERSION (its payload is read from request_timeline)', () => { + expect(traceHistogramsKey).toBe(`trace-histograms-v${REQUEST_TIMELINE_VERSION}`); + }); + + it('every key actually contains a version segment (no unversioned literals)', () => { + for (const key of [ + derivedAgenticMetricsKey, + agenticAggregatesKey, + requestTimelineKey, + traceServerMetricsKey, + traceHistogramsKey, + ]) { + 
expect(key).toMatch(/-v\d+$/u); + } + }); +}); diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts index 836a8d93..647b6dda 100644 --- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts +++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts @@ -1,3 +1,4 @@ +import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; import { getDb } from '@semianalysisai/inferencex-db/connection'; import { getDerivedAgenticMetrics, @@ -13,12 +14,18 @@ export const dynamic = 'force-dynamic'; // blobOnly: the response is one entry per id with two numbers, but the // derivation work parses thousands of JSONL records per blob — cache the // computed result so a chart-refresh hits the warm path. -// Bumped to v3 for per-request normalized-E2E @ 400 output tokens. -// Stale v1 cache entries return undefined for the new field and silently -// blank the chart with "No data available". +// +// The cache key is derived from STATS_VERSION (which governs the `aggregate_stats` +// payload these derived metrics are read from). blobSet is write-once and nothing +// purges post-backfill, so a hand-written version string would serve stale +// data forever after a bump — deriving the key from the constant means a +// STATS_VERSION bump automatically rolls the cache namespace. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). 
*/ +export const CACHE_KEY_PREFIX = `derived-agentic-metrics-v${STATS_VERSION}`; + const getCachedDerivedAgenticMetrics = cachedQuery( (ids: number[]): Promise => getDerivedAgenticMetrics(getDb(), ids), - 'derived-agentic-metrics-v3', + CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts index 9a3750d6..bd1d67f5 100644 --- a/packages/app/src/app/api/v1/request-timeline/route.ts +++ b/packages/app/src/app/api/v1/request-timeline/route.ts @@ -1,3 +1,4 @@ +import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; import { getDb } from '@semianalysisai/inferencex-db/connection'; import { getRequestTimeline, @@ -10,9 +11,16 @@ import { idQueryRoute } from '../id-routes'; export const dynamic = 'force-dynamic'; +// Key derived from REQUEST_TIMELINE_VERSION (governs the `request_timeline` +// payload). The blob cache is write-once with no post-backfill purge, so the +// version-derived key is what rolls the namespace on a bump — a hand-written +// string would serve stale blob-cached timelines forever. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). 
*/ +export const CACHE_KEY_PREFIX = `request-timeline-v${REQUEST_TIMELINE_VERSION}`; + const getCachedRequestTimeline = cachedQuery( (id: number): Promise => getRequestTimeline(getDb(), id), - 'request-timeline', + CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts index 131010ff..206205f5 100644 --- a/packages/app/src/app/api/v1/trace-histograms/route.ts +++ b/packages/app/src/app/api/v1/trace-histograms/route.ts @@ -1,3 +1,4 @@ +import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; import { getDb } from '@semianalysisai/inferencex-db/connection'; import { getTraceHistograms, @@ -14,9 +15,17 @@ export const dynamic = 'force-dynamic'; // unstable_cache limit (each point carries one int per request, ~500-1000+ // requests for agentic), which manifests as a 500 from the route. Blob // storage lets us cache the larger response without losing the warm-cache hit. +// +// Key derived from REQUEST_TIMELINE_VERSION: the histograms are read out of the +// `request_timeline` payload (getTraceHistograms keys its fast path off that +// constant). The blob cache is write-once with no post-backfill purge, so the +// version-derived key is what rolls the namespace on a bump — the previously +// unversioned key would serve stale histograms forever. 
+export const CACHE_KEY_PREFIX = `trace-histograms-v${REQUEST_TIMELINE_VERSION}`; + const getCachedTraceHistograms = cachedQuery( (ids: number[]): Promise => getTraceHistograms(getDb(), ids), - 'trace-histograms', + CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts index a759e6dc..149fefbf 100644 --- a/packages/app/src/app/api/v1/trace-server-metrics/route.ts +++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts @@ -1,3 +1,4 @@ +import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series'; import { getDb } from '@semianalysisai/inferencex-db/connection'; import { getTraceServerMetrics, @@ -10,9 +11,16 @@ import { idQueryRoute } from '../id-routes'; export const dynamic = 'force-dynamic'; +// Key derived from CHART_SERIES_VERSION (governs the `chart_series` payload). +// The blob cache is write-once with no post-backfill purge, so the +// version-derived key is what rolls the namespace on a bump — a hand-written +// string would serve stale blob-cached series forever. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). 
*/ +export const CACHE_KEY_PREFIX = `trace-server-metrics-v${CHART_SERIES_VERSION}`; + const getCachedTraceServerMetrics = cachedQuery( (id: number): Promise => getTraceServerMetrics(getDb(), id), - 'trace-server-metrics', + CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts index 529306cf..0c4dbc89 100644 --- a/packages/db/src/queries/agentic-aggregates.test.ts +++ b/packages/db/src/queries/agentic-aggregates.test.ts @@ -1,6 +1,16 @@ +import { gzipSync } from 'node:zlib'; + import { describe, expect, it } from 'vitest'; -import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates'; +import type { DbClient } from '../connection.js'; + +import { + extractIslOsl, + extractServerMetricSamples, + getAgenticAggregates, + percentilesOf, + STATS_VERSION, +} from './agentic-aggregates'; describe('percentilesOf', () => { it('returns null for empty input', () => { @@ -111,3 +121,142 @@ describe('extractServerMetricSamples', () => { expect(out.prefixCacheHitRate).toEqual([]); }); }); + +/** The write-back payload as bound to the UPDATE (a partial aggregate_stats). */ +interface WrittenStats { + version: number; + isl: unknown; + osl: unknown; + kvCacheUtil: { mean: number } | null; + prefixCacheHitRate: unknown; + normalizedSessionTimeS: number | null; + p90PrefillTpsPerUser: number | null; + normalizedE2e400: unknown; +} + +/** Capture SQL template text + bound values for the write-back assertions. */ +function mockSql(queue: unknown[][]): { + sql: DbClient; + calls: { text: string; values: unknown[] }[]; +} { + const responses = [...queue]; + const calls: { text: string; values: unknown[] }[] = []; + const sql = ((strings: TemplateStringsArray, ...values: unknown[]) => { + calls.push({ text: strings.join('?'), values }); + return Promise.resolve(responses.shift() ?? 
[]); + }) as unknown as DbClient; + return { sql, calls }; +} + +/** One aiperf profiling record for the fallback profile blob. */ +function profileRec(fields: { + cid: string; + isl: number; + osl: number; + ttft_ms: number; + latency_ms: number; +}): string { + return JSON.stringify({ + metadata: { conversation_id: fields.cid, turn_index: 0, benchmark_phase: 'profiling' }, + metrics: { + request_latency: { value: fields.latency_ms, unit: 'ms' }, + time_to_first_token: { value: fields.ttft_ms, unit: 'ms' }, + input_sequence_length: { value: fields.isl, unit: 'tokens' }, + output_sequence_length: { value: fields.osl, unit: 'tokens' }, + }, + }); +} + +describe('getAgenticAggregates write-back', () => { + it('recomputes ALL profile+server fields and writes a complete bundle back on the stale path', async () => { + const profileBlob = gzipSync( + Buffer.from( + [ + profileRec({ cid: 's1', isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + profileRec({ cid: 's1', isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }), + ].join('\n'), + ), + ); + const serverBlob = gzipSync( + Buffer.from( + JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [{ timeslices: [{ start_ns: 0, avg: 0.25 }] }], + }, + }, + }), + ), + ); + + // Stale row with server AND derived fields we must NOT trust — the route + // recomputes both from the blobs, so nothing is carried forward. 
+ const staleStats = { + version: STATS_VERSION - 1, + isl: null, + osl: null, + kvCacheUtil: { mean: 0.9, p50: 0.9, p75: 0.9, p90: 0.9, p99: 0.9, n: 1 }, + prefixCacheHitRate: null, + normalizedSessionTimeS: 999, + p90PrefillTpsPerUser: 999, + }; + + const { sql, calls } = mockSql([ + // fetchAggregateStatsRows + [{ benchmark_result_id: 7, stats: staleStats }], + // Pass 1: profile blob (+ trace_replay_id for write-back) + [{ benchmark_result_id: 7, trace_replay_id: 870, profile_blob: profileBlob }], + // Pass 2: server blob + [{ benchmark_result_id: 7, server_blob: serverBlob }], + ]); + + const result = await getAgenticAggregates(sql, [7]); + + // Response reflects the fresh recompute (isl/osl + kv from the blobs). + expect(result[7]?.isl?.n).toBe(2); + expect(result[7]?.kvCacheUtil?.mean).toBeCloseTo(0.25, 6); + + // 4 calls: stats read, profile read, server read, write-back UPDATE. + expect(calls).toHaveLength(4); + expect(calls[3]!.text).toContain('update agentic_trace_replay set aggregate_stats'); + expect(calls[3]!.text).toContain('::jsonb where id'); + + // The payload OBJECT is bound directly (not stringified — that would + // double-encode into a JSONB string). + const [written, traceReplayId] = calls[3]!.values as [WrittenStats, number]; + expect(traceReplayId).toBe(870); + expect(written.version).toBe(STATS_VERSION); + // Server field FRESHLY recomputed (0.25), not the stale 0.9 carried forward. + expect(written.kvCacheUtil?.mean).toBeCloseTo(0.25, 6); + // Derived fields FRESHLY recomputed (not the stale 999s). 
+ expect(written.normalizedSessionTimeS).toBeCloseTo(3, 6); + expect(written.p90PrefillTpsPerUser).toBeCloseTo(200, 6); + expect(written.normalizedE2e400).not.toBeNull(); + expect(written.isl).not.toBeNull(); + }); + + it('does not write back for an id whose profile blob is missing/malformed', async () => { + const staleStats = { + version: STATS_VERSION - 1, + isl: null, + osl: null, + kvCacheUtil: null, + prefixCacheHitRate: null, + normalizedSessionTimeS: null, + p90PrefillTpsPerUser: null, + }; + const { sql, calls } = mockSql([ + [{ benchmark_result_id: 7, stats: staleStats }], + // Pass 1: no profile blob → nothing to recompute, nothing to heal. + [{ benchmark_result_id: 7, trace_replay_id: 870, profile_blob: null }], + // Pass 2: no server blob either. + [{ benchmark_result_id: 7, server_blob: null }], + ]); + + await getAgenticAggregates(sql, [7]); + + // stats read + 2 blob reads only — no write-back (profile parse never succeeded). + expect(calls).toHaveLength(3); + expect(calls.some((c) => c.text.includes('update agentic_trace_replay'))).toBe(false); + }); +}); diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts index 0443398d..73d6ae58 100644 --- a/packages/db/src/queries/agentic-aggregates.ts +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -23,37 +23,26 @@ import { pick } from 'stream-json/filters/pick.js'; import { streamObject } from 'stream-json/streamers/stream-object.js'; import type { DbClient } from '../connection.js'; +import { computeDerivedFromBlob } from './derived-agentic-metrics'; import { + extractIslOsl, fetchAggregateStatsRows, percentilesOf, - readNum, + STATS_VERSION, + writeBackTraceReplayJsonb, type MetricPercentiles, } from './agentic-shared'; -// Percentile math + envelope reader live in agentic-shared.ts; re-exported -// here because etl/compute-aggregate-stats and the API layer import them -// from this module. 
-export { percentilesOf, type MetricPercentiles } from './agentic-shared'; - -/** - * Bump when the aggregate-stats computation algorithm changes — the backfill - * script recomputes any row whose stored `aggregate_stats.version` is older. - * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular - * import: the compute helper depends on the extractors below. - * - * v2: aggregate vllm gauges/counters across all engine series (was reading - * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). - * - * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate - * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way - * they do for vllm runs. - * - * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL. - * - * v5: reject osl <= 0 in extractTurn to exclude cancelled/empty-output turns - * whose decode-interval math would explode normalized E2E to thousands of seconds. - */ -export const STATS_VERSION = 5; +// STATS_VERSION, the profile extractor `extractIslOsl`, and the percentile +// math + envelope reader all live in agentic-shared.ts (the cycle-free leaf). +// Re-exported here because etl/compute-aggregate-stats and the API layer +// import them from this module. +export { + extractIslOsl, + percentilesOf, + STATS_VERSION, + type MetricPercentiles, +} from './agentic-shared'; export interface AgenticAggregate { id: number; @@ -76,36 +65,6 @@ export type AgenticAggregateMap = Record; const PROFILE_CHUNK_SIZE = 8; const SERVER_CHUNK_SIZE = 1; -interface ProfileRecord { - metadata?: { benchmark_phase?: string }; - metrics?: { - input_sequence_length?: { value?: number } | number; - output_sequence_length?: { value?: number } | number; - }; -} - -/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. 
*/ -export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } { - const isl: number[] = []; - const osl: number[] = []; - for (const line of jsonl.split('\n')) { - if (!line) continue; - let rec: ProfileRecord; - try { - rec = JSON.parse(line) as ProfileRecord; - } catch { - continue; - } - if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue; - const m = rec.metrics ?? {}; - const i = readNum(m.input_sequence_length); - const o = readNum(m.output_sequence_length); - if (typeof i === 'number') isl.push(i); - if (typeof o === 'number') osl.push(o); - } - return { isl, osl }; -} - interface TimeSlice { start_ns?: number; end_ns?: number; @@ -322,17 +281,29 @@ export async function getAgenticAggregates( return result; } + // Accumulate a complete, version-stamped `aggregate_stats` bundle per id as + // the two passes recompute it, so we can self-heal the shared JSONB column + // afterward (see the write-back loop below). Only ids whose profile blob + // parsed cleanly get an entry — a null/malformed blob must never overwrite + // good stored data. + const pendingById = new Map(); + // ── Fallback Pass 1: profile_export blobs (cheap; large batches). 
────── for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) { const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE); const rows = (await sql` select br.id as benchmark_result_id, + atr.id as trace_replay_id, atr.profile_export_jsonl_gz as profile_blob from benchmark_results br join agentic_trace_replay atr on atr.id = br.trace_replay_id where br.id = any(${chunk}::bigint[]) - `) as { benchmark_result_id: number; profile_blob: Buffer | null }[]; + `) as { + benchmark_result_id: number; + trace_replay_id: number; + profile_blob: Buffer | null; + }[]; for (const row of rows) { const id = Number(row.benchmark_result_id); result[id] ??= blankAggregate(id); @@ -340,8 +311,29 @@ export async function getAgenticAggregates( try { const jsonl = gunzipSync(row.profile_blob).toString('utf8'); const { isl, osl } = extractIslOsl(jsonl); - result[id].isl = percentilesOf(isl); - result[id].osl = percentilesOf(osl); + const islPct = percentilesOf(isl); + const oslPct = percentilesOf(osl); + result[id].isl = islPct; + result[id].osl = oslPct; + // Recompute the profile-derived fields too (same jsonl, no extra + // read) so the self-healed bundle is a faithful full recompute — not + // a carry-forward of stale derived numbers stamped with a new + // version. Server-derived fields are filled in Pass 2 (or stay null + // when the server blob is absent, which is the correct complete value). 
+ const derived = computeDerivedFromBlob(jsonl); + pendingById.set(id, { + traceReplayId: Number(row.trace_replay_id), + stats: { + version: STATS_VERSION, + isl: islPct, + osl: oslPct, + kvCacheUtil: null, + prefixCacheHitRate: null, + normalizedSessionTimeS: derived.normalized_session_time_s, + p90PrefillTpsPerUser: derived.p90_prefill_tps_per_user, + normalizedE2e400: derived.normalized_e2e_400, + }, + }); } catch { // ignore malformed blob } @@ -385,11 +377,30 @@ export async function getAgenticAggregates( } } if (parsed) { - result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil); - result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate); + const kvPct = percentilesOf(parsed.kvCacheUtil); + const prefixPct = percentilesOf(parsed.prefixCacheHitRate); + result[id].kvCacheUtil = kvPct; + result[id].prefixCacheHitRate = prefixPct; + const pending = pendingById.get(id); + if (pending) { + pending.stats.kvCacheUtil = kvPct; + pending.stats.prefixCacheHitRate = prefixPct; + } } } } + + // Self-heal the shared `aggregate_stats` column: persist the freshly + // recomputed, version-stamped bundle so the next request (this route AND the + // derived-agentic-metrics route, which read the same column) takes the fast + // path instead of re-decompressing these blobs. Only ids whose profile blob + // parsed cleanly are in `pendingById`, so a null/malformed recompute never + // clobbers good data. Fire-and-forget, best-effort (no-ops on a read-only + // replica) — never delays or fails the response. + for (const { traceReplayId, stats } of pendingById.values()) { + writeBackTraceReplayJsonb(sql, 'aggregate_stats', traceReplayId, stats); + } + return result; } @@ -404,6 +415,22 @@ interface AggregateStatsRow { p90PrefillTpsPerUser: number | null; } +/** + * The complete `aggregate_stats` bundle we write back on the fallback path. 
+ * Mirrors `AggregateStats` in etl/compute-aggregate-stats.ts (kept local to + * avoid an import cycle with that module, which depends on this one). + */ +interface FullAggregateStats { + version: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; + normalizedSessionTimeS: number | null; + p90PrefillTpsPerUser: number | null; + normalizedE2e400: MetricPercentiles | null; +} + function blankAggregate(id: number): AgenticAggregate { return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null }; } diff --git a/packages/db/src/queries/agentic-shared.test.ts b/packages/db/src/queries/agentic-shared.test.ts new file mode 100644 index 00000000..35a25d97 --- /dev/null +++ b/packages/db/src/queries/agentic-shared.test.ts @@ -0,0 +1,79 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; + +import type { DbClient } from '../connection.js'; + +import { _resetWriteBackWarned, writeBackTraceReplayJsonb } from './agentic-shared'; + +/** + * Capture every SQL call: the joined template text plus the bound values, so we + * can assert the write-back targets the right column and binds the payload + * OBJECT directly under a `::jsonb` cast (not a pre-stringified JSON string). + */ +function mockSql(reject?: Error): { + sql: DbClient; + calls: { text: string; values: unknown[] }[]; +} { + const calls: { text: string; values: unknown[] }[] = []; + const sql = ((strings: TemplateStringsArray, ...values: unknown[]) => { + calls.push({ text: strings.join('?'), values }); + return reject ? 
Promise.reject(reject) : Promise.resolve([]); + }) as unknown as DbClient; + return { sql, calls }; +} + +afterEach(() => { + _resetWriteBackWarned(); + vi.restoreAllMocks(); +}); + +describe('writeBackTraceReplayJsonb', () => { + it('issues a fixed-column UPDATE binding the payload as ::jsonb + the id', () => { + const { sql, calls } = mockSql(); + writeBackTraceReplayJsonb(sql, 'chart_series', 870, { version: 12, foo: 'bar' }); + + expect(calls).toHaveLength(1); + expect(calls[0]!.text).toContain('update agentic_trace_replay set chart_series'); + expect(calls[0]!.text).toContain('::jsonb where id'); + // The payload OBJECT is bound directly (not JSON.stringify'd — that would + // double-encode into a JSONB string), followed by the id. Only the value + + // id are interpolated; the column name is fully static in the SQL text. + expect(calls[0]!.values).toEqual([{ version: 12, foo: 'bar' }, 870]); + }); + + it('targets the requested column verbatim (no cross-talk between columns)', () => { + const cases: ('aggregate_stats' | 'chart_series' | 'request_timeline')[] = [ + 'aggregate_stats', + 'chart_series', + 'request_timeline', + ]; + for (const column of cases) { + const { sql, calls } = mockSql(); + writeBackTraceReplayJsonb(sql, column, 1, { v: 1 }); + expect(calls[0]!.text).toContain(`update agentic_trace_replay set ${column}`); + } + }); + + it('no-ops on a null/undefined payload (never overwrites good data with a hole)', () => { + const { sql, calls } = mockSql(); + writeBackTraceReplayJsonb(sql, 'aggregate_stats', 1, null); + writeBackTraceReplayJsonb(sql, 'aggregate_stats', 1, undefined); + expect(calls).toHaveLength(0); + }); + + it('swallows a rejected UPDATE (read-only replica) and warns exactly once', async () => { + const warn = vi.spyOn(console, 'warn').mockImplementation(() => {}); + const { sql } = mockSql(new Error('cannot execute UPDATE in a read-only transaction')); + + // Fire twice; the helper is fire-and-forget so neither call throws. 
+ expect(() => writeBackTraceReplayJsonb(sql, 'chart_series', 1, { v: 1 })).not.toThrow(); + expect(() => writeBackTraceReplayJsonb(sql, 'chart_series', 2, { v: 1 })).not.toThrow(); + + // Let the caught rejections settle. + await new Promise((resolve) => { + setTimeout(resolve, 0); + }); + + expect(warn).toHaveBeenCalledTimes(1); + expect(warn.mock.calls[0]![0]).toContain('could not persist chart_series'); + }); +}); diff --git a/packages/db/src/queries/agentic-shared.ts b/packages/db/src/queries/agentic-shared.ts index e8a639e7..d8673a07 100644 --- a/packages/db/src/queries/agentic-shared.ts +++ b/packages/db/src/queries/agentic-shared.ts @@ -1,12 +1,69 @@ /** * Helpers shared by the agentic per-point queries (`agentic-aggregates.ts`, * `derived-agentic-metrics.ts`): percentile math over aiperf samples, - * the `{value, unit}` metric-envelope reader, and the single-round-trip - * `aggregate_stats` fetch both fast paths start from. + * the `{value, unit}` metric-envelope reader, the single-round-trip + * `aggregate_stats` fetch both fast paths start from, and the best-effort + * write-back both use to self-heal a stale precomputed payload. + * + * `STATS_VERSION` and the profile-blob extractor `extractIslOsl` live here (the + * dependency-free leaf) rather than in `agentic-aggregates.ts` so both query + * modules — and `etl/compute-aggregate-stats.ts` — can share them without an + * import cycle: `agentic-aggregates` ⇄ `derived-agentic-metrics` would + * otherwise close a loop once each needs the other's blob helpers for + * write-back. (agentic-aggregates re-exports both for existing importers.) */ import type { DbClient } from '../connection.js'; +/** + * Bump when the aggregate-stats computation algorithm changes — the backfill + * script recomputes any row whose stored `aggregate_stats.version` is older, + * and the read-path fast/slow branches key off it. 
+ * + * v2: aggregate vllm gauges/counters across all engine series (was reading + * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). + * + * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate + * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way + * they do for vllm runs. + * + * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL. + * + * v5: reject osl <= 0 in extractTurn to exclude cancelled/empty-output turns + * whose decode-interval math would explode normalized E2E to thousands of seconds. + */ +export const STATS_VERSION = 5; + +interface ProfileRecord { + metadata?: { benchmark_phase?: string }; + metrics?: { + input_sequence_length?: { value?: number } | number; + output_sequence_length?: { value?: number } | number; + }; +} + +/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */ +export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } { + const isl: number[] = []; + const osl: number[] = []; + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: ProfileRecord; + try { + rec = JSON.parse(line) as ProfileRecord; + } catch { + continue; + } + if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue; + const m = rec.metrics ?? {}; + const i = readNum(m.input_sequence_length); + const o = readNum(m.output_sequence_length); + if (typeof i === 'number') isl.push(i); + if (typeof o === 'number') osl.push(o); + } + return { isl, osl }; +} + export interface MetricPercentiles { mean: number; p50: number; @@ -79,3 +136,80 @@ export async function fetchAggregateStatsRows( where br.id = any(${benchmarkResultIds}::bigint[]) `) as unknown as { benchmark_result_id: number; stats: Stats | null }[]; } + +/** Trace-replay JSONB columns the read path may self-heal after a recompute. 
*/ +export type WriteBackColumn = 'aggregate_stats' | 'chart_series' | 'request_timeline'; + +/** Logged once per process so a read-only connection doesn't spam the console. */ +let writeBackWarned = false; + +/** Reset the once-per-process warning latch (test-only). */ +export function _resetWriteBackWarned(): void { + writeBackWarned = false; +} + +/** + * Issue the fixed-column UPDATE. Kept as one tagged-template call per column so + * the SQL text is fully static — no column name is ever interpolated — which + * keeps it injection-proof and driver-agnostic. The bound value is the plain + * payload OBJECT cast to `::jsonb`: both the neon HTTP driver and postgres.js + * JSON-serialize an object parameter exactly once, so `::jsonb` parses it to a + * JSONB object. (Passing `JSON.stringify(payload)` instead double-encodes into + * a JSONB *string* — `jsonb_typeof` = 'string' — which is why we don't.) The + * abstract `DbClient` doesn't expose postgres.js's `sql.json()`, so this is the + * portable way to write JSONB. + */ +function updateJsonbColumn( + sql: DbClient, + column: WriteBackColumn, + traceReplayId: number, + value: unknown, +): Promise { + switch (column) { + case 'aggregate_stats': { + return sql`update agentic_trace_replay set aggregate_stats = ${value}::jsonb where id = ${traceReplayId}`; + } + case 'chart_series': { + return sql`update agentic_trace_replay set chart_series = ${value}::jsonb where id = ${traceReplayId}`; + } + case 'request_timeline': { + return sql`update agentic_trace_replay set request_timeline = ${value}::jsonb where id = ${traceReplayId}`; + } + } +} + +/** + * Best-effort, fire-and-forget persist of a freshly recomputed versioned + * payload back into an `agentic_trace_replay` JSONB column, so the next request + * takes the precomputed fast path instead of re-gunzipping the raw blob. + * + * The read path runs on the READONLY connection. 
On a true read replica (prod's + * `DATABASE_READONLY_URL`) the UPDATE fails at the wire — this catches the + * rejection and silently no-ops (warning once) so the response is never delayed + * or failed. On local/superuser connections (where the readonly URL is also + * write-capable) it self-heals the stored payload. Callers must only pass a + * COMPLETE recomputed payload — never a partial/null-blob result — so a + * self-heal never clobbers good data with holes. + */ +export function writeBackTraceReplayJsonb( + sql: DbClient, + column: WriteBackColumn, + traceReplayId: number, + payload: unknown, +): void { + if (payload === null || payload === undefined) return; + // structuredClone strips any class prototypes so the driver serializes plain + // data only — matches `jsonbParam` in the backfill runner. + const value = structuredClone(payload); + void updateJsonbColumn(sql, column, traceReplayId, value).catch((error: unknown) => { + if (!writeBackWarned) { + writeBackWarned = true; + console.warn( + `[agentic write-back] could not persist ${column} (read-only connection?) — ` + + `serving recomputed result without caching. ${ + error instanceof Error ? error.message : String(error) + }`, + ); + } + }); +} diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts index 84c09193..a39de670 100644 --- a/packages/db/src/queries/derived-agentic-metrics.test.ts +++ b/packages/db/src/queries/derived-agentic-metrics.test.ts @@ -1,6 +1,11 @@ +import { gzipSync } from 'node:zlib'; + import { describe, expect, it } from 'vitest'; -import { computeDerivedFromBlob } from './derived-agentic-metrics.js'; +import { STATS_VERSION } from './agentic-shared'; +import type { DbClient } from '../connection.js'; + +import { computeDerivedFromBlob, getDerivedAgenticMetrics } from './derived-agentic-metrics.js'; /** Build one aiperf JSONL record for the synthetic fixture. 
*/ function rec( @@ -150,3 +155,102 @@ describe('computeDerivedFromBlob', () => { expect(out.normalized_e2e_400!.p90).toBeLessThan(20); }); }); + +/** Capture SQL template text + bound values for the write-back assertions. */ +function mockSql(queue: unknown[][]): { + sql: DbClient; + calls: { text: string; values: unknown[] }[]; +} { + const responses = [...queue]; + const calls: { text: string; values: unknown[] }[] = []; + const sql = ((strings: TemplateStringsArray, ...values: unknown[]) => { + calls.push({ text: strings.join('?'), values }); + return Promise.resolve(responses.shift() ?? []); + }) as unknown as DbClient; + return { sql, calls }; +} + +describe('getDerivedAgenticMetrics write-back', () => { + it('self-heals aggregate_stats from the profile blob, carrying server fields forward', async () => { + const jsonl = [ + rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }), + ].join('\n'); + const blob = gzipSync(Buffer.from(jsonl)); + + // Stale v(N-1) row that DOES carry server-derived fields — they must be + // preserved in the healed bundle (derived route can't recompute them). + const staleServerKv = { mean: 0.4, p50: 0.4, p75: 0.5, p90: 0.6, p99: 0.7, n: 3 }; + const staleStats = { + version: STATS_VERSION - 1, + isl: null, + osl: null, + kvCacheUtil: staleServerKv, + prefixCacheHitRate: null, + normalizedSessionTimeS: 999, + p90PrefillTpsPerUser: 999, + normalizedE2e400: null, + }; + + const { sql, calls } = mockSql([ + // fetchAggregateStatsRows + [{ benchmark_result_id: 7, stats: staleStats }], + // fallback profile-blob query + [{ benchmark_result_id: 7, trace_replay_id: 870, blob }], + ]); + + const result = await getDerivedAgenticMetrics(sql, [7]); + + // Response is the freshly recomputed value, not the stale 999s. 
+ expect(result[7]?.normalized_session_time_s).toBeCloseTo(3, 6); + expect(result[7]?.p90_prefill_tps_per_user).toBeCloseTo(200, 6); + + // 3 calls: stats read, blob read, write-back UPDATE. + expect(calls).toHaveLength(3); + expect(calls[2]!.text).toContain('update agentic_trace_replay set aggregate_stats'); + expect(calls[2]!.text).toContain('::jsonb where id'); + + // The write-back binds a COMPLETE, version-stamped bundle at the new version, + // recomputing profile fields and carrying server fields forward untouched. + // The payload OBJECT is bound directly (not stringified — that would + // double-encode into a JSONB string). + interface WrittenStats { + version: number; + isl: unknown; + osl: unknown; + kvCacheUtil: unknown; + normalizedSessionTimeS: number | null; + p90PrefillTpsPerUser: number | null; + } + const [written, traceReplayId] = calls[2]!.values as [WrittenStats, number]; + expect(traceReplayId).toBe(870); + expect(written.version).toBe(STATS_VERSION); + expect(written.normalizedSessionTimeS).toBeCloseTo(3, 6); + expect(written.p90PrefillTpsPerUser).toBeCloseTo(200, 6); + expect(written.isl).not.toBeNull(); + expect(written.osl).not.toBeNull(); + // Server-derived field carried forward from the stale row (not re-read). 
+ expect(written.kvCacheUtil).toEqual(staleServerKv); + }); + + it('takes the fast path (no blob read, no write-back) when stats are current', async () => { + const currentStats = { + version: STATS_VERSION, + isl: null, + osl: null, + kvCacheUtil: null, + prefixCacheHitRate: null, + normalizedSessionTimeS: 1.5, + p90PrefillTpsPerUser: 42, + normalizedE2e400: { mean: 1, p50: 1, p75: 1, p90: 2, p99: 3, n: 5 }, + }; + const { sql, calls } = mockSql([[{ benchmark_result_id: 7, stats: currentStats }]]); + + const result = await getDerivedAgenticMetrics(sql, [7]); + + expect(result[7]?.normalized_session_time_s).toBe(1.5); + expect(result[7]?.p90_normalized_e2e_400_s).toBe(2); + // Only the stats read — no fallback blob query, no write-back. + expect(calls).toHaveLength(1); + }); +}); diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts index 24b24cf1..626ab9c7 100644 --- a/packages/db/src/queries/derived-agentic-metrics.ts +++ b/packages/db/src/queries/derived-agentic-metrics.ts @@ -23,13 +23,15 @@ import { gunzipSync } from 'node:zlib'; import { NORMALIZED_E2E_OUTPUT_TOKENS } from '@semianalysisai/inferencex-constants'; import type { DbClient } from '../connection.js'; -import { STATS_VERSION } from './agentic-aggregates'; import { + extractIslOsl, fetchAggregateStatsRows, meanOf, percentilesOf, quantile, readNum, + STATS_VERSION, + writeBackTraceReplayJsonb, type MetricPercentiles, } from './agentic-shared'; @@ -48,6 +50,27 @@ export interface DerivedAgenticMetric { export type DerivedAgenticMetricMap = Record; +/** + * The full `aggregate_stats` JSONB shape (mirrors `AggregateStats` in + * etl/compute-aggregate-stats.ts). Duplicated here rather than imported to keep + * this module off the etl import graph. 
When we self-heal from the profile blob + * alone, the server-derived fields (kvCacheUtil, prefixCacheHitRate) are carried + * forward untouched from the stale row — never re-reading the huge server blob. + * This mirrors the profile-only upgrade `backfill-aggregate-stats.ts` performs; + * the agentic-aggregates route (which does read the server blob) heals those + * server fields. + */ +interface StoredAggregateStats { + version: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; + normalizedSessionTimeS: number | null; + p90PrefillTpsPerUser: number | null; + normalizedE2e400: MetricPercentiles | null; +} + /** * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless * HTTP driver caps responses at 64 MB — chunk to stay well under. @@ -205,14 +228,13 @@ export async function getDerivedAgenticMetrics( // ingest pipeline computes both metrics in the same pass that produces the // percentile bundles, so a single SQL round-trip covers most ids without // touching the gzipped profile blob. - const statsRows = await fetchAggregateStatsRows<{ - version?: number; - normalizedSessionTimeS?: number | null; - p90PrefillTpsPerUser?: number | null; - normalizedE2e400?: MetricPercentiles | null; - }>(sql, benchmarkResultIds); + const statsRows = await fetchAggregateStatsRows(sql, benchmarkResultIds); const idsNeedingBlob: number[] = []; + // Carry each stale/missing row's existing stats into the fallback so a + // self-heal preserves the server-derived fields (kvCacheUtil, + // prefixCacheHitRate) it can't recompute from the profile blob alone. 
+ const staleStatsById = new Map(); for (const row of statsRows) { const id = Number(row.benchmark_result_id); if (row.stats && Number(row.stats.version) === STATS_VERSION) { @@ -225,6 +247,7 @@ export async function getDerivedAgenticMetrics( }; } else { idsNeedingBlob.push(id); + staleStatsById.set(id, row.stats ?? null); } } @@ -233,33 +256,60 @@ export async function getDerivedAgenticMetrics( // Fallback: parse the profile blob directly. Used for rows whose // `aggregate_stats` is null or computed by an older STATS_VERSION; the // backfill script drains the population so this path should be rare. - const rows: { benchmark_result_id: number; blob: Buffer }[] = []; + // `trace_replay_id` comes along on the same join — no extra round-trip; the + // (small) stale `aggregate_stats` was already captured by the stats read above, so we can self-heal after recompute. + const rows: { + benchmark_result_id: number; + trace_replay_id: number; + blob: Buffer; + }[] = []; for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) { const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE); const chunkRows = (await sql` select br.id as benchmark_result_id, + atr.id as trace_replay_id, atr.profile_export_jsonl_gz as blob from benchmark_results br join agentic_trace_replay atr on atr.id = br.trace_replay_id where br.id = any(${chunk}::bigint[]) and atr.profile_export_jsonl_gz is not null - `) as { benchmark_result_id: number; blob: Buffer }[]; + `) as { benchmark_result_id: number; trace_replay_id: number; blob: Buffer }[]; rows.push(...chunkRows); } for (const row of rows) { + const id = Number(row.benchmark_result_id); try { const jsonl = gunzipSync(row.blob).toString('utf8'); const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } = computeDerivedFromBlob(jsonl); - result[Number(row.benchmark_result_id)] = { - id: Number(row.benchmark_result_id), + result[id] = { + id, normalized_session_time_s, p90_prefill_tps_per_user, p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ??
null, p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null, }; + + // Self-heal the shared `aggregate_stats` bundle. We only have the profile + // blob here, so recompute the profile-derived fields (isl/osl + the three + // derived metrics) and carry the stale row's server-derived fields + // forward untouched — the profile-only upgrade the backfill CLI also + // performs. Fire-and-forget, best-effort (no-ops on a read-only replica). + const { isl, osl } = extractIslOsl(jsonl); + const prior = staleStatsById.get(id) ?? null; + const merged: StoredAggregateStats = { + version: STATS_VERSION, + isl: percentilesOf(isl), + osl: percentilesOf(osl), + kvCacheUtil: prior?.kvCacheUtil ?? null, + prefixCacheHitRate: prior?.prefixCacheHitRate ?? null, + normalizedSessionTimeS: normalized_session_time_s, + p90PrefillTpsPerUser: p90_prefill_tps_per_user, + normalizedE2e400: normalized_e2e_400, + }; + writeBackTraceReplayJsonb(sql, 'aggregate_stats', Number(row.trace_replay_id), merged); } catch { // Skip malformed blobs silently — frontend treats missing ids as "no data". 
} diff --git a/packages/db/src/queries/request-timeline.test.ts b/packages/db/src/queries/request-timeline.test.ts index 62ba5385..1f1d58a5 100644 --- a/packages/db/src/queries/request-timeline.test.ts +++ b/packages/db/src/queries/request-timeline.test.ts @@ -1,3 +1,5 @@ +import { gzipSync } from 'node:zlib'; + import { describe, expect, it } from 'vitest'; import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; @@ -42,4 +44,54 @@ describe('getRequestTimeline', () => { await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull(); expect(calls).toHaveLength(1); }); + + it('recomputes from the blob AND writes the fresh timeline back when the stored one is stale', async () => { + const blob = gzipSync( + Buffer.from( + JSON.stringify({ + metadata: { + conversation_id: 'c1', + turn_index: 0, + worker_id: 'w0', + benchmark_phase: 'profiling', + credit_issued_ns: 1000, + request_start_ns: 1100, + request_end_ns: 2000, + }, + metrics: { + time_to_first_token: { value: 50 }, + input_sequence_length: { value: 128 }, + output_sequence_length: { value: 16 }, + }, + }), + ), + ); + const stale = { ...timeline, version: REQUEST_TIMELINE_VERSION - 1 }; + const { sql, calls } = mockSql([ + [{ trace_replay_id: 870, has_blob: true, request_timeline: stale }], + [{ blob }], + ]); + + const result = await getRequestTimeline(sql, 422991); + + expect(result?.version).toBe(REQUEST_TIMELINE_VERSION); + expect(result?.requests).toHaveLength(1); + // 3 calls: meta read, blob read, then the fire-and-forget write-back. 
+ expect(calls).toHaveLength(3); + expect(calls[1]).toContain('profile_export_jsonl_gz as blob'); + expect(calls[2]).toContain('update agentic_trace_replay set request_timeline'); + expect(calls[2]).toContain('::jsonb where id'); + }); + + it('does not write back when the blob is missing (never persists a null timeline)', async () => { + const stale = { ...timeline, version: REQUEST_TIMELINE_VERSION - 1 }; + const { sql, calls } = mockSql([ + [{ trace_replay_id: 870, has_blob: true, request_timeline: stale }], + [{ blob: null }], + ]); + + await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull(); + // meta read + blob read only — no write-back for a null recompute. + expect(calls).toHaveLength(2); + }); }); diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts index 2a6bb40c..9b7cc4b5 100644 --- a/packages/db/src/queries/request-timeline.ts +++ b/packages/db/src/queries/request-timeline.ts @@ -15,6 +15,7 @@ import { } from '../etl/compute-request-timeline'; import type { DbClient } from '../connection.js'; +import { writeBackTraceReplayJsonb } from './agentic-shared'; export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline'; @@ -60,5 +61,16 @@ export async function getRequestTimeline( from agentic_trace_replay where id = ${row.trace_replay_id} `) as unknown as RawBlobRow[]; - return computeRequestTimeline(blobRows[0]?.blob ?? null); + const timeline = computeRequestTimeline(blobRows[0]?.blob ?? null); + + // Self-heal the stored request_timeline so the next request (and the + // trace-histograms route, which reads the same column) takes the fast path. + // Only write a complete recompute — `computeRequestTimeline` returns null for + // a missing/malformed blob, which we must not persist over good data. + // Fire-and-forget, best-effort (no-ops on a read-only replica). 
+ if (timeline !== null) { + writeBackTraceReplayJsonb(sql, 'request_timeline', row.trace_replay_id, timeline); + } + + return timeline; } diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts index f045dfda..77aea28a 100644 --- a/packages/db/src/queries/trace-server-metrics.test.ts +++ b/packages/db/src/queries/trace-server-metrics.test.ts @@ -92,8 +92,12 @@ describe('getTraceServerMetrics', () => { const result = await getTraceServerMetrics(sql, 42); expect(result?.prefillTps).toEqual([{ t: 0, value: 321 }]); - expect(calls).toHaveLength(2); + // 3 calls: meta read, blob read, then the fire-and-forget chart_series + // write-back that self-heals the stale precomputed series. + expect(calls).toHaveLength(3); expect(calls[1]).toContain('server_metrics_json_gz as blob'); + expect(calls[2]).toContain('update agentic_trace_replay set chart_series'); + expect(calls[2]).toContain('::jsonb where id'); }); it('returns null without a blob and does not issue a second query', async () => { diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index d24d0879..dc03129e 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -20,6 +20,7 @@ import { } from '../etl/compute-chart-series'; import type { DbClient } from '../connection.js'; +import { writeBackTraceReplayJsonb } from './agentic-shared'; export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series'; @@ -207,5 +208,12 @@ export async function getTraceServerMetrics( disagg: row.disagg, }); if (!series) return null; + + // Self-heal the stored chart_series so the next request takes the fast path + // instead of re-decompressing this (tens-of-MB) blob. `series` is complete + // and stamped at CHART_SERIES_VERSION here; fire-and-forget and best-effort + // (no-ops on a read-only replica). 
trace_replay_id is non-null on this path. + writeBackTraceReplayJsonb(sql, 'chart_series', row.trace_replay_id, series); + return merge(meta, series, kvCachePoolTokens); } From 2a148011d109682341aac8d0aa10f9901049b90b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 17:05:07 -0500 Subject: [PATCH 19/40] ci: allow manual agentic ingest dispatch --- .github/workflows/ingest-agentic-results.yml | 21 +++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml index cf8366ea..75f5a658 100644 --- a/.github/workflows/ingest-agentic-results.yml +++ b/.github/workflows/ingest-agentic-results.yml @@ -23,6 +23,17 @@ name: Ingest Agentic Benchmark Results on: repository_dispatch: types: [ingest-agentic-results] + workflow_dispatch: + inputs: + run-id: + description: InferenceX Actions run ID to ingest + required: true + type: string + run-attempt: + description: InferenceX Actions run attempt to ingest + required: false + default: '1' + type: string jobs: ingest: @@ -55,7 +66,7 @@ jobs: - name: Download artifacts from InferenceX run env: GH_TOKEN: ${{ secrets.INFX_MAIN_PAT }} - RUN_ID: ${{ github.event.client_payload.run-id }} + RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} ARTIFACTS_PATH: ${{ github.workspace }}/artifacts run: | mkdir -p "$ARTIFACTS_PATH" @@ -110,8 +121,8 @@ jobs: env: DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }} - INGEST_RUN_ID: ${{ github.event.client_payload.run-id }} - INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt }} + INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} + INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }} INGEST_ARTIFACTS_PATH: ${{ github.workspace }}/artifacts INGEST_REPO: SemiAnalysisAI/InferenceX UNMAPPED_ENTITIES_OUTPUT: ${{ github.workspace 
}}/unmapped-entities.json @@ -165,7 +176,7 @@ jobs: webhook-type: incoming-webhook payload: | { - "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" + "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" } - name: Notify Slack on failure @@ -176,5 +187,5 @@ jobs: webhook-type: incoming-webhook payload: | { - "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" + "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" } From a1e94d91b217034a229c9e778cb9e0c2bb626600 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 18:09:30 -0500 Subject: [PATCH 20/40] ci: register agentic ingest workflow --- .github/workflows/ingest-agentic-results.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml index 75f5a658..e1f4a8b8 100644 --- a/.github/workflows/ingest-agentic-results.yml +++ b/.github/workflows/ingest-agentic-results.yml @@ -21,6 +21,10 @@ name: Ingest Agentic Benchmark Results # agentic-specific alerting (missing dataset slug). 
on: + push: + branches: [feat/agentx] + paths: + - .github/workflows/ingest-agentic-results.yml repository_dispatch: types: [ingest-agentic-results] workflow_dispatch: @@ -36,7 +40,14 @@ on: type: string jobs: + register: + if: github.event_name == 'push' + runs-on: ubuntu-latest + steps: + - run: echo "Registering ingest-agentic-results workflow for manual dispatch" + ingest: + if: github.event_name != 'push' # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes # far longer than a fixed-seq-len ingest. timeout-minutes: 60 From 6d55b95789dd04442fd8f7c862568d820539a066 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 18:10:08 -0500 Subject: [PATCH 21/40] ci: use dev database for agentic ingest test --- .github/workflows/ingest-agentic-results.yml | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml index e1f4a8b8..8f84b3ed 100644 --- a/.github/workflows/ingest-agentic-results.yml +++ b/.github/workflows/ingest-agentic-results.yml @@ -21,10 +21,6 @@ name: Ingest Agentic Benchmark Results # agentic-specific alerting (missing dataset slug). on: - push: - branches: [feat/agentx] - paths: - - .github/workflows/ingest-agentic-results.yml repository_dispatch: types: [ingest-agentic-results] workflow_dispatch: @@ -40,14 +36,7 @@ on: type: string jobs: - register: - if: github.event_name == 'push' - runs-on: ubuntu-latest - steps: - - run: echo "Registering ingest-agentic-results workflow for manual dispatch" - ingest: - if: github.event_name != 'push' # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes # far longer than a fixed-seq-len ingest. 
timeout-minutes: 60 @@ -71,7 +60,7 @@ jobs: - name: Run migrations env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} run: pnpm admin:db:migrate --yes - name: Download artifacts from InferenceX run @@ -130,7 +119,7 @@ jobs: - name: Ingest results to DB env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }} INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }} @@ -141,12 +130,12 @@ jobs: - name: Apply run overrides env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} run: pnpm admin:db:apply-overrides --yes - name: Verify database env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_WRITE_URL }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} run: pnpm admin:db:verify - name: Invalidate Vercel cache From cc63a730b0e36338b1e9d850ffe747d59cf26209 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 18:10:33 -0500 Subject: [PATCH 22/40] ci: use dev write database for agentic ingest test --- .github/workflows/ingest-agentic-results.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml index 8f84b3ed..a7d1cd8a 100644 --- a/.github/workflows/ingest-agentic-results.yml +++ b/.github/workflows/ingest-agentic-results.yml @@ -60,7 +60,7 @@ jobs: - name: Run migrations env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} run: pnpm admin:db:migrate --yes - name: Download artifacts from InferenceX run @@ -119,7 +119,7 @@ jobs: - name: Ingest results to DB env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} 
GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }} INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }} @@ -130,12 +130,12 @@ jobs: - name: Apply run overrides env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} run: pnpm admin:db:apply-overrides --yes - name: Verify database env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV }} + DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} run: pnpm admin:db:verify - name: Invalidate Vercel cache From bd0a4905a74cf1d021e7b1bac12fdd15a3ca78ff Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 18:15:09 -0500 Subject: [PATCH 23/40] ci: skip ingest wait for manual dispatch --- .github/workflows/ingest-agentic-results.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml index a7d1cd8a..af94a4c5 100644 --- a/.github/workflows/ingest-agentic-results.yml +++ b/.github/workflows/ingest-agentic-results.yml @@ -45,6 +45,7 @@ jobs: contents: read steps: - name: Wait for source run to finish + if: github.event_name != 'workflow_dispatch' run: sleep 300 - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 From ddd1a267f82af95a616e368a729bd555a8ed79c3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 18:34:22 -0500 Subject: [PATCH 24/40] chore(db): log agentic ingest progress --- packages/db/src/etl/trace-replay-ingest.ts | 61 +++++++++++++++++-- packages/db/src/ingest-ci-run.ts | 70 ++++++++++++++++++++-- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index b50168db..1c739b7d 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -19,6 +19,25 @@ import type { 
ServerMetricsContext } from './server-metrics-adapters'; type Sql = ReturnType; +export interface TraceReplayIngestOptions { + metricsContext?: ServerMetricsContext; + progressLabel?: string; +} + +function formatBytes(bytes: number | null | undefined): string { + if (bytes === null || bytes === undefined) return 'none'; + if (bytes < 1024) return `${bytes} B`; + const kib = bytes / 1024; + if (kib < 1024) return `${kib.toFixed(1)} KiB`; + const mib = kib / 1024; + if (mib < 1024) return `${mib.toFixed(1)} MiB`; + return `${(mib / 1024).toFixed(1)} GiB`; +} + +function elapsed(startMs: number): string { + return `${((Date.now() - startMs) / 1000).toFixed(1)}s`; +} + /** * Persist the per-point trace files and link them to `benchmarkResultIds`. * @@ -34,8 +53,8 @@ type Sql = ReturnType; * @param serverMetricsJson Raw bytes of `server_metrics_export.json` — * per-scrape time-series of every Prometheus metric. * Optional, gzipped before storage (~42x ratio). - * @param metricsContext Canonical framework used to select the - * orchestrator-specific metric-label adapter. + * @param options Canonical framework/disagg context plus optional + * progress label for CI logs. */ export async function insertTraceReplay( sql: Sql, @@ -43,36 +62,65 @@ export async function insertTraceReplay( profileExportJsonl: Buffer | null, serverMetricsCsv: Buffer | null, serverMetricsJson: Buffer | null = null, - metricsContext: ServerMetricsContext = {}, + options: TraceReplayIngestOptions = {}, ): Promise { + const { metricsContext = {}, progressLabel } = options; + const log = (message: string): void => { + if (progressLabel) console.log(` trace_replay ${progressLabel}: ${message}`); + }; + if (benchmarkResultIds.length === 0) return; if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return; // Only link rows that don't already point at a trace_replay row — keeps // re-ingest from inserting duplicate sibling blobs. 
+ const linkStart = Date.now(); + log(`checking ${benchmarkResultIds.length} benchmark row(s) for existing links`); const unlinked = await sql<{ id: number }[]>` select id from benchmark_results where id = any(${sql.array(benchmarkResultIds)}::bigint[]) and trace_replay_id is null `; - if (unlinked.length === 0) return; + log(`found ${unlinked.length} unlinked row(s) (${elapsed(linkStart)})`); + if (unlinked.length === 0) { + log('skipping blob insert; all benchmark rows already linked'); + return; + } + const gzipStart = Date.now(); + log( + `compressing profile=${formatBytes(profileExportJsonl?.length)}, ` + + `server_csv=${formatBytes(serverMetricsCsv?.length)}, ` + + `server_json=${formatBytes(serverMetricsJson?.length)}`, + ); const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null; const profileSize = profileExportJsonl ? profileExportJsonl.length : null; const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null; const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null; const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null; + log( + `compressed profile=${formatBytes(profileGz?.length)}, ` + + `server_json=${formatBytes(metricsJsonGz?.length)} (${elapsed(gzipStart)})`, + ); // Pre-compute aggregate stats + chart-ready time-series + per-request // timeline so the detail page doesn't have to re-parse these blobs on // every request. Each helper tolerates a null blob and falls back to // a streaming parser for oversized server_metrics blobs. + const computeStart = Date.now(); + log('computing aggregate stats, chart series, and request timeline'); const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([ computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }), computeChartSeries(metricsJsonGz, metricsContext), Promise.resolve(computeRequestTimeline(profileGz)), ]); + log( + `computed derived JSON: chart_windows=${chartSeries?.timeslicesCount ?? 
0}, ` + + `timeline_requests=${requestTimeline?.requests.length ?? 0} (${elapsed(computeStart)})`, + ); + const insertStart = Date.now(); + log('inserting trace_replay blob row'); const [{ id: traceReplayId }] = await sql<{ id: number }[]>` insert into agentic_trace_replay ( profile_export_jsonl_gz, @@ -98,12 +146,16 @@ export async function insertTraceReplay( ) returning id `; + log(`inserted trace_replay_id=${traceReplayId} (${elapsed(insertStart)})`); + const updateStart = Date.now(); + log(`linking trace_replay_id=${traceReplayId} to ${unlinked.length} benchmark row(s)`); await sql` update benchmark_results set trace_replay_id = ${traceReplayId} where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; + log(`linked benchmark rows (${elapsed(updateStart)})`); // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang // runs don't populate these in the harness JSON; vLLM runs do but only @@ -146,6 +198,7 @@ export async function insertTraceReplay( ) where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; + log('updated cache-hit metrics from chart series'); } } } diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index d23a8f63..15267622 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -74,6 +74,29 @@ let runAttemptNum: number; let REPO: string; let tempDir: string | null = null; +function formatBytes(bytes: number | null | undefined): string { + if (bytes === null || bytes === undefined) return 'none'; + if (bytes < 1024) return `${bytes} B`; + const kib = bytes / 1024; + if (kib < 1024) return `${kib.toFixed(1)} KiB`; + const mib = kib / 1024; + if (mib < 1024) return `${mib.toFixed(1)} MiB`; + return `${(mib / 1024).toFixed(1)} GiB`; +} + +function elapsed(startMs: number): string { + return `${((Date.now() - startMs) / 1000).toFixed(1)}s`; +} + +function fileSize(pathname: string | null | undefined): number | null { + if (!pathname) return null; + try { 
+ return fs.statSync(pathname).size; + } catch { + return null; + } +} + if (isDownloadMode) { // --download [repo] // Filter out '--' injected by pnpm arg passthrough @@ -378,13 +401,22 @@ async function main(): Promise { const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))]; console.log(` Found ${allBmkFiles.length} benchmark JSON file(s)`); - for (const file of allBmkFiles) { + for (const [fileIndex, file] of allBmkFiles.entries()) { + const fileStart = Date.now(); + const relativeFile = path.relative(artifactsDir, file); + console.log( + ` [${fileIndex + 1}/${allBmkFiles.length}] ${relativeFile} (${formatBytes(fileSize(file))})`, + ); const data = readJson(file); - if (!data) continue; + if (!data) { + console.log(` skipped unreadable JSON (${elapsed(fileStart)})`); + continue; + } const rawRows: Record[] = Array.isArray(data) ? data : [data as Record]; + console.log(` raw rows: ${rawRows.length}`); for (const rawRow of rawRows) { if (!rawRow || typeof rawRow !== 'object') continue; @@ -397,7 +429,11 @@ async function main(): Promise { .map((r) => mapBenchmarkRow(r, tracker)) .filter((r): r is NonNullable => r !== null); - if (rows.length === 0) continue; + console.log(` mapped rows: ${rows.length}`); + if (rows.length === 0) { + console.log(` skipped; no mappable rows (${elapsed(fileStart)})`); + continue; + } const toInsert = []; for (const row of rows) { @@ -408,15 +444,21 @@ async function main(): Promise { tracker.recordDbError(`config for ${path.basename(file)}`, error); } } + console.log(` rows with resolved configs: ${toInsert.length}`); if (toInsert.length > 0) { try { + const insertStart = Date.now(); const { newCount, dupCount, insertedIds } = await bulkIngestBenchmarkRows( sql, toInsert, workflowRunId, date, ); + console.log( + ` benchmark rows: +${newCount} new, ${dupCount} dup, ` + + `${insertedIds.length} id(s) (${elapsed(insertStart)})`, + ); totalNewBmk += newCount; totalDupBmk += dupCount; @@ -448,8 +490,13 @@ async 
function main(): Promise { serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir)); if (logPath) { try { + const serverLogStart = Date.now(); + console.log( + ` server_log ${path.basename(logPath)} (${formatBytes(fileSize(logPath))})`, + ); const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', ''); await insertServerLog(sql, insertedIds, serverLog); + console.log(` server_log linked (${elapsed(serverLogStart)})`); } catch (error: any) { tracker.recordDbError(`server_log for ${configKey}`, error); } @@ -468,6 +515,13 @@ async function main(): Promise { : undefined) ?? traceReplayPaths.get(suffix); if (trace) { try { + const traceStart = Date.now(); + console.log( + ` trace_replay ${suffix}: ` + + `profile=${formatBytes(fileSize(trace.profileJsonl))}, ` + + `server_csv=${formatBytes(fileSize(trace.serverMetricsCsv))}, ` + + `server_json=${formatBytes(fileSize(trace.serverMetricsJson))}`, + ); const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null; const metrics = trace.serverMetricsCsv ? fs.readFileSync(trace.serverMetricsCsv) @@ -476,14 +530,19 @@ async function main(): Promise { ? 
fs.readFileSync(trace.serverMetricsJson) : null; await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, { - framework: toInsert[0]?.config.framework, - disagg: toInsert[0]?.config.disagg, + metricsContext: { + framework: toInsert[0]?.config.framework, + disagg: toInsert[0]?.config.disagg, + }, + progressLabel: suffix, }); totalTraceReplayLinked += insertedIds.length; + console.log(` trace_replay ${suffix}: done (${elapsed(traceStart)})`); } catch (error: any) { tracker.recordDbError(`trace_replay for ${suffix}`, error); } } else { + console.log(` trace_replay ${suffix}: missing sibling artifact`); tracker.skips.traceReplayMissing++; } } @@ -491,6 +550,7 @@ async function main(): Promise { tracker.recordDbError(path.basename(file), error); } } + console.log(` finished ${relativeFile} (${elapsed(fileStart)})`); } console.log(` Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`); if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) { From 5fc051fadf869ef71899633d0dfd37592262fe3e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 18:47:21 -0500 Subject: [PATCH 25/40] ci: select agentic ingest target --- .github/workflows/ingest-agentic-results.yml | 59 +++++++++++++++++--- packages/db/src/etl/workflow-run.ts | 2 + packages/db/src/ingest-ci-run.ts | 26 ++++++++- 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml index af94a4c5..fab99f5d 100644 --- a/.github/workflows/ingest-agentic-results.yml +++ b/.github/workflows/ingest-agentic-results.yml @@ -8,7 +8,8 @@ name: Ingest Agentic Benchmark Results # -H "Accept: application/vnd.github+v3+json" \ # https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \ # -d '{"event_type": "ingest-agentic-results", -# "client_payload": {"run-id": "", "run-attempt": ""}}' +# "client_payload": {"run-id": "", "run-attempt": "", +# "database-target": "production"}}' # # 
The ingest script (packages/db/src/ingest-ci-run.ts) auto-detects agentic # artifacts: benchmark rows land in benchmark_results (benchmark_type= @@ -34,6 +35,15 @@ on: required: false default: '1' type: string + database-target: + description: Database/cache target for the ingest + required: false + default: production + type: choice + options: + - production + - dev + - agentx-v1 jobs: ingest: @@ -59,9 +69,45 @@ jobs: env: CYPRESS_INSTALL_BINARY: '0' - - name: Run migrations + - name: Select ingest target env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} + REQUESTED_DATABASE_TARGET: ${{ github.event.client_payload.database-target || inputs.database-target || 'production' }} + DATABASE_WRITE_URL_PRODUCTION: ${{ secrets.DATABASE_WRITE_URL }} + DATABASE_WRITE_URL_DEV: ${{ secrets.DATABASE_DEV_WRITE_URL }} + DATABASE_WRITE_URL_AGENTX_V1: ${{ secrets.DATABASE_AGENTX_V1_WRITE_URL }} + run: | + case "$REQUESTED_DATABASE_TARGET" in + production) + database_write_url="$DATABASE_WRITE_URL_PRODUCTION" + cache_invalidate_url="https://inferencex.semianalysis.com/api/v1/invalidate" + ;; + dev) + database_write_url="$DATABASE_WRITE_URL_DEV" + cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate" + ;; + agentx-v1) + database_write_url="$DATABASE_WRITE_URL_AGENTX_V1" + cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate" + ;; + *) + echo "::error::Unsupported database-target: $REQUESTED_DATABASE_TARGET" + exit 1 + ;; + esac + + if [ -z "$database_write_url" ]; then + echo "::error::Database secret is empty for target: $REQUESTED_DATABASE_TARGET" + exit 1 + fi + + echo "::add-mask::$database_write_url" + echo "DATABASE_WRITE_URL=$database_write_url" >> "$GITHUB_ENV" + echo "INGEST_DATABASE_TARGET=$REQUESTED_DATABASE_TARGET" >> "$GITHUB_ENV" + echo "CACHE_INVALIDATE_URL=$cache_invalidate_url" >> "$GITHUB_ENV" + echo "Selected ingest target: 
$REQUESTED_DATABASE_TARGET" + echo "Cache invalidate URL: $cache_invalidate_url" + + - name: Run migrations run: pnpm admin:db:migrate --yes - name: Download artifacts from InferenceX run @@ -120,7 +166,6 @@ jobs: - name: Ingest results to DB env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }} INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }} @@ -130,20 +175,16 @@ jobs: run: pnpm admin:db:ingest:ci - name: Apply run overrides - env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} run: pnpm admin:db:apply-overrides --yes - name: Verify database - env: - DATABASE_WRITE_URL: ${{ secrets.DATABASE_DEV_WRITE_URL }} run: pnpm admin:db:verify - name: Invalidate Vercel cache env: VERCEL_INVALIDATE_SECRET: ${{ secrets.VERCEL_INVALIDATE_SECRET }} run: | - curl -sSf -X POST "https://inferencex.semianalysis.com/api/v1/invalidate" \ + curl -sSf -X POST "$CACHE_INVALIDATE_URL" \ -H "Authorization: Bearer $VERCEL_INVALIDATE_SECRET" || true - name: Check for unmapped entities diff --git a/packages/db/src/etl/workflow-run.ts b/packages/db/src/etl/workflow-run.ts index 4097a3c5..28d27c87 100644 --- a/packages/db/src/etl/workflow-run.ts +++ b/packages/db/src/etl/workflow-run.ts @@ -26,6 +26,7 @@ export interface GithubRunInfo { runStartedAt: string | null; headSha: string | null; headBranch: string | null; + headCommitMessage: string | null; runAttempt: number | null; pullRequests: GithubPullRequestRef[]; } @@ -101,6 +102,7 @@ export function createWorkflowRunServices(sql: Sql, githubToken?: string) { runStartedAt: d.run_started_at ? String(d.run_started_at) : null, headSha: d.head_sha ? String(d.head_sha) : null, headBranch: d.head_branch ? String(d.head_branch) : null, + headCommitMessage: d.head_commit?.message ? String(d.head_commit.message) : null, runAttempt: typeof d.run_attempt === 'number' ? 
d.run_attempt : null, pullRequests, }; diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 15267622..8bdb4157 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -58,6 +58,7 @@ import { ingestEvalRow } from './etl/eval-ingest'; import { mapEvalSamples } from './etl/eval-samples-mapper'; import { bulkIngestEvalSamples } from './etl/eval-samples-ingest'; import { + type ChangelogEntry, parseChangelogEntries, ingestChangelogEntries, hasEvalsOnlyFlag, @@ -335,7 +336,7 @@ async function main(): Promise { const parsedChangelogs: { baseRef: string; headRef: string; - entries: ReturnType; + entries: ChangelogEntry[]; }[] = []; for (const file of changelogFiles) { const data = readJson(file) as Record | null; @@ -346,6 +347,29 @@ async function main(): Promise { const entries = parseChangelogEntries(data.entries); if (entries.length > 0) parsedChangelogs.push({ baseRef, headRef, entries }); } + if (parsedChangelogs.length === 0) { + const headRef = workflowGhInfo?.headBranch ?? workflowGhInfo?.headSha ?? 
`run-${runIdStr}`; + const fallbackDescription = + workflowGhInfo?.headCommitMessage?.trim().split('\n')[0]?.trim() || + workflowGhInfo?.name || + `GitHub Actions run ${runIdStr}`; + + parsedChangelogs.push({ + baseRef: 'unknown', + headRef, + entries: [ + { + configKeys: [], + description: fallbackDescription, + prLink: null, + evalsOnly: false, + }, + ], + }); + console.log( + ` No changelog metadata artifact found; using fallback changelog: ${fallbackDescription}`, + ); + } const evalsOnly = hasEvalsOnlyFlag(parsedChangelogs); if (evalsOnly) { console.log('\n ⚠ evals-only run detected — skipping benchmark and stats ingest'); From 71185999946f796dc32799927def0baa61126e00 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 19:50:55 -0500 Subject: [PATCH 26/40] fix(ingest): prefer the workflow name for fallback changelog descriptions The head commit message usually describes an unrelated code change; the workflow display name describes the sweep itself. --- packages/db/src/ingest-ci-run.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 8bdb4157..cada82d6 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -349,9 +349,12 @@ async function main(): Promise { } if (parsedChangelogs.length === 0) { const headRef = workflowGhInfo?.headBranch ?? workflowGhInfo?.headSha ?? `run-${runIdStr}`; + // Prefer the workflow's display name ("e2e Test - B300 DSv4 AgentX vLLM 1h + // + 10m warmup") — it describes the sweep; the head commit message usually + // describes an unrelated code change. 
const fallbackDescription = + workflowGhInfo?.name?.trim() || workflowGhInfo?.headCommitMessage?.trim().split('\n')[0]?.trim() || - workflowGhInfo?.name || `GitHub Actions run ${runIdStr}`; parsedChangelogs.push({ From 2335e2f1caeea4b5faab312c6220ad1fd060ae12 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 20:03:50 -0500 Subject: [PATCH 27/40] fix(ingest): skip failed runs that never issued a request The failed-run guard required num_requests_total > 0, so a config whose server never came up (total = 0, e.g. dep4 conc32 in run 28617267459) slipped through as a dataless point. Any row explicitly reporting zero successful requests is a failure regardless of how many were issued. --- packages/db/src/etl/benchmark-mapper.test.ts | 10 ++++++++++ packages/db/src/etl/benchmark-mapper.ts | 9 +++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts index cde2f74b..bb286734 100644 --- a/packages/db/src/etl/benchmark-mapper.test.ts +++ b/packages/db/src/etl/benchmark-mapper.test.ts @@ -850,6 +850,16 @@ describe('mapBenchmarkRow — v3 agentic nested agg schema', () => { expect(tracker.skips.failedRun).toBe(1); }); + it('skips rows where the server never came up (zero total requests)', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 0 }), + tracker, + ); + expect(result).toBeNull(); + expect(tracker.skips.failedRun).toBe(1); + }); + it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => { const tracker = createSkipTracker(); const result = mapBenchmarkRow( diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index caae08c2..e3fb148e 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -197,14 +197,15 @@ export function mapBenchmarkRow( } // 
Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from - // every runner, including ones with 0 successful requests and null metrics. - // Without this skip, the empty row's nulls overwrite a good row via + // every runner, including failed ones with 0 successful requests and null + // metrics — both the "issued requests but none succeeded" case (total > 0) + // and the "server never came up" case (total === 0). Without this skip the + // empty row lands as a dataless point, or overwrites a good row via // ON CONFLICT DO UPDATE when both share the same (config, conc, offload). if ( typeof row.num_requests_successful === 'number' && row.num_requests_successful === 0 && - typeof row.num_requests_total === 'number' && - row.num_requests_total > 0 + typeof row.num_requests_total === 'number' ) { tracker.skips.failedRun++; return null; From 608867fdec11add64debf628e43a87e8108749ae Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 17:42:41 -0500 Subject: [PATCH 28/40] fix(dump-mode): support agentic + dataset surfaces without a DB The documented DUMP_DIR mode 500'd on every new surface: the four new tables (agentic_trace_replay, datasets, dataset_conversations, run_datasets) were missing from TABLE_INSERT_ORDER so dumps never carried them, json-provider had no mirrors, and ten routes called getDb() with no JSON_MODE guard. Tables added in FK-safe order; bytea blobs round-trip through dump/load (Buffer JSON encoding, ::bytea decode); agentic_trace_replay lazy-loads like server_logs; mirrors reuse the same pure compute helpers as the SQL paths for version-stale fallbacks; all ten routes gain the standard JSON_MODE branch. Verified end-to-end: dump-mode server serves all ten endpoints 200, byte-identical to Postgres on 9/10 (remaining diffs are pre-existing benchmarks-mirror nuances). Adds 21 mirror tests. 
--- packages/app/next.config.ts | 6 + .../app/api/v1/agentic-aggregates/route.ts | 8 +- .../app/api/v1/benchmark-siblings/route.ts | 11 +- .../[slug]/conversations/[convId]/route.ts | 9 +- .../v1/datasets/[slug]/conversations/route.ts | 12 +- .../src/app/api/v1/datasets/[slug]/route.ts | 11 +- packages/app/src/app/api/v1/datasets/route.ts | 11 +- .../api/v1/derived-agentic-metrics/route.ts | 8 +- .../src/app/api/v1/request-timeline/route.ts | 8 +- .../src/app/api/v1/trace-histograms/route.ts | 8 +- .../app/api/v1/trace-server-metrics/route.ts | 8 +- packages/constants/src/tables.ts | 18 + packages/db/src/dump-db.ts | 20 +- .../json-provider.agentic-datasets.test.ts | 592 +++++++++++++++ packages/db/src/json-provider.ts | 709 +++++++++++++++++- packages/db/src/load-dump.ts | 56 +- packages/db/src/reset-db.ts | 10 +- 17 files changed, 1463 insertions(+), 42 deletions(-) create mode 100644 packages/db/src/json-provider.agentic-datasets.test.ts diff --git a/packages/app/next.config.ts b/packages/app/next.config.ts index 39ab4487..32988f05 100644 --- a/packages/app/next.config.ts +++ b/packages/app/next.config.ts @@ -3,6 +3,12 @@ import type { NextConfig } from 'next'; import { allowedDevOriginsFromEnv } from './src/lib/allowed-dev-origins'; const nextConfig: NextConfig = { + // Allow a second, isolated dev server (e.g. a dump-mode instance on another + // port) to run from the same project dir by pointing it at a separate build + // dir via NEXT_DIST_DIR. Defaults to '.next' so the primary server and all + // CI/prod builds are unaffected. Next.js's single-dev-server lock lives under + // distDir, so distinct dirs let the two coexist. 
+ distDir: process.env.NEXT_DIST_DIR || '.next', allowedDevOrigins: allowedDevOriginsFromEnv(), transpilePackages: ['@semianalysisai/inferencex-constants'], serverExternalPackages: ['shiki'], diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts index 83238e89..9cb229d4 100644 --- a/packages/app/src/app/api/v1/agentic-aggregates/route.ts +++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts @@ -1,4 +1,5 @@ -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getAgenticAggregates, STATS_VERSION, @@ -23,7 +24,10 @@ export const dynamic = 'force-dynamic'; export const CACHE_KEY_PREFIX = `agentic-aggregates-v${STATS_VERSION}`; const getCachedAgenticAggregates = cachedQuery( - (ids: number[]): Promise => getAgenticAggregates(getDb(), ids), + (ids: number[]): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getAgenticAggregates(ids)); + return getAgenticAggregates(getDb(), ids); + }, CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts index 38e79c23..0718aae0 100644 --- a/packages/app/src/app/api/v1/benchmark-siblings/route.ts +++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts @@ -1,4 +1,5 @@ -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getBenchmarkSiblings, type BenchmarkSiblings, @@ -10,10 +11,10 @@ import { idQueryRoute } from '../id-routes'; export const dynamic = 'force-dynamic'; -const getCachedSiblings = cachedQuery( - (id: number): Promise => getBenchmarkSiblings(getDb(), id), - 
'benchmark-siblings', -); +const getCachedSiblings = cachedQuery((id: number): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarkSiblings(id)); + return getBenchmarkSiblings(getDb(), id); +}, 'benchmark-siblings'); /** * GET /api/v1/benchmark-siblings?id=N diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts index 84cc15e3..61672759 100644 --- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts @@ -1,6 +1,7 @@ import { type NextRequest, NextResponse } from 'next/server'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getConversation, type ConversationDetail, @@ -11,8 +12,10 @@ import { cachedJson, cachedQuery } from '@/lib/api-cache'; export const dynamic = 'force-dynamic'; const getCachedConversation = cachedQuery( - (slug: string, convId: string): Promise => - getConversation(getDb(), slug, convId), + (slug: string, convId: string): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getConversation(slug, convId)); + return getConversation(getDb(), slug, convId); + }, 'dataset-conversation', ); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts index 62b9e5b7..196c29d6 100644 --- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts @@ -1,6 +1,7 @@ import { type NextRequest, NextResponse } from 'next/server'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from 
'@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { listConversations, type ConversationList, @@ -20,13 +21,16 @@ const getCachedConversations = cachedQuery( limit: number, offset: number, sort: string, - ): Promise => - listConversations(getDb(), slug, { + ): Promise => { + const opts: ListConversationsOpts = { search: search || undefined, limit, offset, sort: sort as ListConversationsOpts['sort'], - }), + }; + if (JSON_MODE) return Promise.resolve(jsonProvider.listConversations(slug, opts)); + return listConversations(getDb(), slug, opts); + }, 'dataset-conversations', ); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts index 9e4af580..e440ff5d 100644 --- a/packages/app/src/app/api/v1/datasets/[slug]/route.ts +++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts @@ -1,16 +1,17 @@ import { type NextRequest, NextResponse } from 'next/server'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets'; import { cachedJson, cachedQuery } from '@/lib/api-cache'; export const dynamic = 'force-dynamic'; -const getCachedDataset = cachedQuery( - (slug: string): Promise => getDataset(getDb(), slug), - 'dataset', -); +const getCachedDataset = cachedQuery((slug: string): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getDataset(slug)); + return getDataset(getDb(), slug); +}, 'dataset'); /** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. 
*/ export async function GET( diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts index f0acca3c..3ad4c15d 100644 --- a/packages/app/src/app/api/v1/datasets/route.ts +++ b/packages/app/src/app/api/v1/datasets/route.ts @@ -1,16 +1,17 @@ import { NextResponse } from 'next/server'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets'; import { cachedJson, cachedQuery } from '@/lib/api-cache'; export const dynamic = 'force-dynamic'; -const getCachedDatasets = cachedQuery( - (): Promise => listDatasets(getDb()), - 'datasets', -); +const getCachedDatasets = cachedQuery((): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.listDatasets()); + return listDatasets(getDb()); +}, 'datasets'); /** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards). 
*/ export async function GET() { diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts index 647b6dda..3afa5d41 100644 --- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts +++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts @@ -1,5 +1,6 @@ import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getDerivedAgenticMetrics, type DerivedAgenticMetricMap, @@ -24,7 +25,10 @@ export const dynamic = 'force-dynamic'; export const CACHE_KEY_PREFIX = `derived-agentic-metrics-v${STATS_VERSION}`; const getCachedDerivedAgenticMetrics = cachedQuery( - (ids: number[]): Promise => getDerivedAgenticMetrics(getDb(), ids), + (ids: number[]): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getDerivedAgenticMetrics(ids)); + return getDerivedAgenticMetrics(getDb(), ids); + }, CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts index bd1d67f5..89b599af 100644 --- a/packages/app/src/app/api/v1/request-timeline/route.ts +++ b/packages/app/src/app/api/v1/request-timeline/route.ts @@ -1,5 +1,6 @@ import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getRequestTimeline, type RequestTimeline, @@ -19,7 +20,10 @@ export const dynamic = 'force-dynamic'; export const CACHE_KEY_PREFIX = 
`request-timeline-v${REQUEST_TIMELINE_VERSION}`; const getCachedRequestTimeline = cachedQuery( - (id: number): Promise => getRequestTimeline(getDb(), id), + (id: number): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getRequestTimeline(id)); + return getRequestTimeline(getDb(), id); + }, CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts index 206205f5..4d3014ab 100644 --- a/packages/app/src/app/api/v1/trace-histograms/route.ts +++ b/packages/app/src/app/api/v1/trace-histograms/route.ts @@ -1,5 +1,6 @@ import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getTraceHistograms, type TraceHistogramMap, @@ -24,7 +25,10 @@ export const dynamic = 'force-dynamic'; export const CACHE_KEY_PREFIX = `trace-histograms-v${REQUEST_TIMELINE_VERSION}`; const getCachedTraceHistograms = cachedQuery( - (ids: number[]): Promise => getTraceHistograms(getDb(), ids), + (ids: number[]): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getTraceHistograms(ids)); + return getTraceHistograms(getDb(), ids); + }, CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts index 149fefbf..2d3554a4 100644 --- a/packages/app/src/app/api/v1/trace-server-metrics/route.ts +++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts @@ -1,5 +1,6 @@ import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series'; -import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { JSON_MODE, getDb } from 
'@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getTraceServerMetrics, type TraceServerMetrics, @@ -19,7 +20,10 @@ export const dynamic = 'force-dynamic'; export const CACHE_KEY_PREFIX = `trace-server-metrics-v${CHART_SERIES_VERSION}`; const getCachedTraceServerMetrics = cachedQuery( - (id: number): Promise => getTraceServerMetrics(getDb(), id), + (id: number): Promise => { + if (JSON_MODE) return jsonProvider.getTraceServerMetrics(id); + return getTraceServerMetrics(getDb(), id); + }, CACHE_KEY_PREFIX, { blobOnly: true }, ); diff --git a/packages/constants/src/tables.ts b/packages/constants/src/tables.ts index 60e85182..f482fd5e 100644 --- a/packages/constants/src/tables.ts +++ b/packages/constants/src/tables.ts @@ -2,6 +2,7 @@ export const TABLE_NAMES = { configs: 'configs', workflowRuns: 'workflow_runs', + agenticTraceReplay: 'agentic_trace_replay', benchmarkResults: 'benchmark_results', serverLogs: 'server_logs', runStats: 'run_stats', @@ -9,21 +10,38 @@ export const TABLE_NAMES = { evalSamples: 'eval_samples', changelogEntries: 'changelog_entries', availability: 'availability', + datasets: 'datasets', + datasetConversations: 'dataset_conversations', + runDatasets: 'run_datasets', schemaMigrations: 'schema_migrations', } as const; /** * Data tables in FK-safe insertion order. * Parents before children — safe for dump, load, and (reversed) reset. 
+ * + * FK edges enforced by this ordering (verified against migration 008_agentic.sql + * and the live schema's pg_constraint): + * - benchmark_results.trace_replay_id → agentic_trace_replay(id) + * ⇒ agentic_trace_replay before benchmark_results + * - dataset_conversations.dataset_id → datasets(id) + * ⇒ datasets before dataset_conversations + * - run_datasets.workflow_run_id → workflow_runs(id) + * ⇒ workflow_runs before run_datasets (run_datasets.dataset_slug is a + * plain slug, NOT an FK to datasets, so it needs no ordering vs datasets) */ export const TABLE_INSERT_ORDER = [ TABLE_NAMES.configs, TABLE_NAMES.serverLogs, TABLE_NAMES.workflowRuns, + TABLE_NAMES.agenticTraceReplay, TABLE_NAMES.benchmarkResults, TABLE_NAMES.evalResults, TABLE_NAMES.evalSamples, TABLE_NAMES.runStats, TABLE_NAMES.changelogEntries, TABLE_NAMES.availability, + TABLE_NAMES.datasets, + TABLE_NAMES.datasetConversations, + TABLE_NAMES.runDatasets, ] as const; diff --git a/packages/db/src/dump-db.ts b/packages/db/src/dump-db.ts index 3810fe7a..d0e315d1 100644 --- a/packages/db/src/dump-db.ts +++ b/packages/db/src/dump-db.ts @@ -18,7 +18,25 @@ const sql = createAdminSql({ noSsl: hasNoSslFlag(), readonly: true, max: 1 }); const CURSOR_BATCH = 100; -/** Stream a table to a JSON file using a cursor, writing row-by-row. */ +/** + * Stream a table to a JSON file using a cursor, writing row-by-row. + * + * BYTEA round-trip: postgres.js decodes a `bytea` column to a Node `Buffer`. + * `JSON.stringify(buffer)` invokes Buffer.prototype.toJSON(), which emits + * `{"type":"Buffer","data":[<byte>, …]}`. That's a lossless byte-array encoding + * (verified: JSON.parse → Buffer.from(obj.data) reproduces the exact bytes), so + * `agentic_trace_replay`'s blob columns (profile_export_jsonl_gz, + * server_metrics_csv, server_metrics_json_gz) survive the dump verbatim. + * load-dump.ts reconstructs the Buffer and casts it back to `::bytea`.
+ * + * Dump-size note: the byte-array encoding is ~2-4× the raw bytea size (each + * byte becomes 1-3 ASCII digits + a comma). For the big compressed blobs + * (server_metrics_json_gz can be ~17 MB compressed on high-conc TP+EP rows) + * the resulting agentic_trace_replay.json is the largest file in the dump — the + * same trade-off server_logs.json already makes. We keep all columns (no + * dropping) so dump mode has full parity with the DB, and json-provider + * lazy-loads this file only when a blob-backed route actually needs a fallback. + */ async function streamTable(table: string, outPath: string): Promise { const out = createWriteStream(outPath); out.write('[\n'); diff --git a/packages/db/src/json-provider.agentic-datasets.test.ts b/packages/db/src/json-provider.agentic-datasets.test.ts new file mode 100644 index 00000000..d6cb6601 --- /dev/null +++ b/packages/db/src/json-provider.agentic-datasets.test.ts @@ -0,0 +1,592 @@ +import { mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { gzipSync } from 'node:zlib'; + +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION } from './etl/compute-request-timeline.js'; +import { STATS_VERSION } from './queries/agentic-shared.js'; +import type * as JsonProvider from './json-provider.js'; + +/** + * Fixture-backed parity tests for the PR348 dump-mode mirrors added to + * json-provider.ts: the 6 agentic per-point queries + the 4 dataset queries. + * + * The store is a lazy singleton keyed off DUMP_DIR, so we write a small dump + * directory, point DUMP_DIR at it, and dynamically import the module once. + * + * Coverage per mirror: + * - fast path: precomputed JSONB (aggregate_stats / chart_series / + * request_timeline) at the CURRENT version is served verbatim. + * - blob fallback: a STALE version forces a re-derive from the (dumped) blob + * using the same pure helper the SQL path uses.
+ * - bytea round-trip: blobs are stored as {type:'Buffer',data:[…]} (what + * dump-db emits) and must gunzip cleanly. + */ + +/** Encode a Buffer the way dump-db.ts does (Buffer.prototype.toJSON()). */ +function byteaJson(buf: Buffer): { type: 'Buffer'; data: number[] } { + return { type: 'Buffer', data: [...buf] }; +} + +// A tiny profiling-phase profile_export.jsonl with two conversations/turns so +// extractIslOsl / computeDerivedFromBlob / computeRequestTimeline all produce +// non-empty output. +const PROFILE_JSONL = [ + JSON.stringify({ + metadata: { + benchmark_phase: 'profiling', + conversation_id: 'convA', + turn_index: 0, + worker_id: 'w0', + credit_issued_ns: 1_000_000_000, + request_start_ns: 1_000_000_000, + request_ack_ns: 1_050_000_000, + request_end_ns: 1_500_000_000, + }, + metrics: { + input_sequence_length: { value: 1000 }, + output_sequence_length: { value: 200 }, + time_to_first_token: { value: 50 }, + request_latency: { value: 500 }, + }, + }), + JSON.stringify({ + metadata: { + benchmark_phase: 'profiling', + conversation_id: 'convB', + turn_index: 0, + worker_id: 'w1', + credit_issued_ns: 2_000_000_000, + request_start_ns: 2_000_000_000, + request_ack_ns: 2_040_000_000, + request_end_ns: 2_800_000_000, + }, + metrics: { + input_sequence_length: { value: 2000 }, + output_sequence_length: { value: 400 }, + time_to_first_token: { value: 40 }, + request_latency: { value: 800 }, + }, + }), +].join('\n'); + +// A minimal server_metrics_json with one KV-cache gauge series so +// extractServerMetricSamples / computeChartSeries yield a value. 
+const SERVER_JSON = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + labels: { engine: '0' }, + timeslices: [ + { start_ns: 0, avg: 0.4 }, + { start_ns: 1_000_000_000, avg: 0.6 }, + ], + }, + ], + }, + }, +}); + +const PROFILE_GZ = gzipSync(Buffer.from(PROFILE_JSONL, 'utf8')); +const SERVER_GZ = gzipSync(Buffer.from(SERVER_JSON, 'utf8')); + +// Precomputed JSONB payloads at the CURRENT versions (fast path). +const CURRENT_AGG_STATS = { + version: STATS_VERSION, + isl: { mean: 1500, p50: 1500, p75: 1750, p90: 1900, p99: 1990, n: 2 }, + osl: { mean: 300, p50: 300, p75: 350, p90: 380, p99: 398, n: 2 }, + kvCacheUtil: { mean: 0.5, p50: 0.5, p75: 0.55, p90: 0.58, p99: 0.6, n: 2 }, + prefixCacheHitRate: null, + normalizedSessionTimeS: 0.65, + p90PrefillTpsPerUser: 42, + normalizedE2e400: { mean: 0.5, p50: 0.5, p75: 0.7, p90: 0.9, p99: 0.99, n: 2 }, +}; + +const CURRENT_TIMELINE = { + version: REQUEST_TIMELINE_VERSION, + startNs: 0, + endNs: 1_000_000, + durationS: 0.001, + requests: [ + { + cid: 'convA', + ti: 0, + wid: 'w0', + ad: 0, + phase: 'profiling', + credit: 0, + start: 0, + ack: 5, + end: 500, + ttftMs: 50, + tpotMs: null, + isl: 1000, + osl: 200, + cancelled: false, + }, + ], +}; + +let jp: typeof JsonProvider; + +beforeAll(async () => { + const dir = mkdtempSync(join(tmpdir(), 'infx-pr348-')); + + // configs / workflow_runs / benchmark_results — enough for the agentic mirrors. 
+ writeFileSync( + join(dir, 'configs.json'), + JSON.stringify([ + { + id: 1, + hardware: 'h100', + framework: 'vllm', + model: 'testm', + precision: 'fp8', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 1, + prefill_ep: 1, + prefill_dp_attention: false, + prefill_num_workers: 1, + decode_tp: 2, + decode_ep: 1, + decode_dp_attention: false, + decode_num_workers: 1, + num_prefill_gpu: 0, + num_decode_gpu: 8, + }, + { + id: 2, + hardware: 'h100', + framework: 'vllm', + model: 'testm', + precision: 'fp8', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 1, + prefill_ep: 1, + prefill_dp_attention: false, + prefill_num_workers: 1, + decode_tp: 4, + decode_ep: 1, + decode_dp_attention: false, + decode_num_workers: 1, + num_prefill_gpu: 0, + num_decode_gpu: 8, + }, + ]), + ); + writeFileSync( + join(dir, 'workflow_runs.json'), + JSON.stringify([ + { + id: 10, + github_run_id: 555, + run_attempt: 1, + name: 'run 555', + status: 'completed', + conclusion: 'success', + head_sha: 'sha', + head_branch: 'main', + html_url: 'https://github.com/x/runs/555', + created_at: '2026-06-14T04:00:00Z', + run_started_at: '2026-06-14T04:00:00Z', + date: '2026-06-14', + }, + ]), + ); + // id 1 → trace_replay 100 (fast-path stats + timeline). id 2 → trace_replay 200 + // (STALE stats + timeline → forces blob fallback). id 3 has no trace_replay. 
+ writeFileSync( + join(dir, 'benchmark_results.json'), + JSON.stringify([ + { + id: 1, + workflow_run_id: 10, + config_id: 1, + benchmark_type: 'agentic_traces', + date: '2026-06-14', + isl: null, + osl: null, + conc: 16, + offload_mode: 'off', + image: null, + metrics: { + tput_per_gpu: 123, + total_requests_completed: 200, + server_gpu_cache_hit_rate: 0.5, + }, + error: null, + server_log_id: null, + trace_replay_id: 100, + }, + { + id: 2, + workflow_run_id: 10, + config_id: 2, + benchmark_type: 'agentic_traces', + date: '2026-06-14', + isl: null, + osl: null, + conc: 32, + offload_mode: 'on', + image: null, + metrics: { tput_per_gpu: 456, num_requests_total: 180 }, + error: null, + server_log_id: null, + trace_replay_id: 200, + }, + { + id: 3, + workflow_run_id: 10, + config_id: 1, + benchmark_type: 'agentic_traces', + date: '2026-06-14', + isl: null, + osl: null, + conc: 8, + offload_mode: 'off', + image: null, + metrics: {}, + error: null, + server_log_id: null, + trace_replay_id: null, + }, + ]), + ); + + // agentic_trace_replay: 100 = current JSONB, 200 = stale JSONB (force blob). 
+ writeFileSync( + join(dir, 'agentic_trace_replay.json'), + JSON.stringify([ + { + id: 100, + profile_export_jsonl_gz: byteaJson(PROFILE_GZ), + profile_export_uncompressed_size: PROFILE_JSONL.length, + server_metrics_csv: null, + server_metrics_csv_size: null, + server_metrics_json_gz: byteaJson(SERVER_GZ), + server_metrics_json_uncompressed_size: SERVER_JSON.length, + aggregate_stats: CURRENT_AGG_STATS, + chart_series: null, // no current chart_series → trace-server-metrics uses blob + request_timeline: CURRENT_TIMELINE, + created_at: '2026-06-14T04:00:00Z', + }, + { + id: 200, + profile_export_jsonl_gz: byteaJson(PROFILE_GZ), + profile_export_uncompressed_size: PROFILE_JSONL.length, + server_metrics_csv: null, + server_metrics_csv_size: null, + server_metrics_json_gz: byteaJson(SERVER_GZ), + server_metrics_json_uncompressed_size: SERVER_JSON.length, + aggregate_stats: { version: 1 }, // stale → force profile-blob fallback + chart_series: { version: 1 }, // stale → force server-blob fallback + request_timeline: { version: 1 }, // stale → force profile-blob fallback + created_at: '2026-06-14T04:00:00Z', + }, + ]), + ); + + // Datasets fixtures. 
+ writeFileSync( + join(dir, 'datasets.json'), + JSON.stringify([ + { + id: 'org/ds-new', + slug: 'ds-new', + label: 'DS New', + variant: 'full', + description: 'newest', + hf_url: null, + license: null, + conversation_count: 3, + summary: { totalIn: 100 }, + chart_data: { hist: [1, 2, 3] }, + dataset_version: 1, + ingested_at: '2026-06-20T00:00:00Z', + }, + { + id: 'org/ds-old', + slug: 'ds-old', + label: 'DS Old', + variant: 'full', + description: 'oldest', + hf_url: null, + license: null, + conversation_count: 0, + summary: {}, + chart_data: {}, + dataset_version: 1, + ingested_at: '2026-06-10T00:00:00Z', + }, + ]), + ); + writeFileSync( + join(dir, 'dataset_conversations.json'), + JSON.stringify([ + { + id: 1, + dataset_id: 'org/ds-new', + conv_id: 'agent-alpha', + models: ['m1'], + num_turns: 5, + num_subagent_groups: 2, + total_in: 300, + total_out: 30, + total_cached: 10, + structure: { nodes: [] }, + }, + { + id: 2, + dataset_id: 'org/ds-new', + conv_id: 'AGENT-beta', + models: ['m2'], + num_turns: 9, + num_subagent_groups: 1, + total_in: 100, + total_out: 20, + total_cached: 5, + structure: { nodes: [{ kind: 'turn' }] }, + }, + { + id: 3, + dataset_id: 'org/ds-new', + conv_id: 'plain-gamma', + models: ['m1'], + num_turns: 2, + num_subagent_groups: 4, + total_in: 200, + total_out: 40, + total_cached: 15, + structure: { nodes: [] }, + }, + ]), + ); + writeFileSync( + join(dir, 'run_datasets.json'), + JSON.stringify([ + { workflow_run_id: 10, dataset_slug: 'ds-new', created_at: '2026-06-14T04:00:00Z' }, + ]), + ); + + // Empty tables the store loads eagerly. 
+ for (const f of [ + 'run_stats.json', + 'eval_results.json', + 'availability.json', + 'changelog_entries.json', + ]) { + writeFileSync(join(dir, f), '[]'); + } + + process.env.DUMP_DIR = dir; + jp = await import('./json-provider.js'); +}); + +afterAll(() => { + delete process.env.DUMP_DIR; +}); + +describe('agentic aggregates mirror', () => { + it('serves precomputed aggregate_stats at the current version (fast path)', () => { + const map = jp.getAgenticAggregates([1]); + expect(map[1]?.isl).toEqual(CURRENT_AGG_STATS.isl); + expect(map[1]?.kvCacheUtil).toEqual(CURRENT_AGG_STATS.kvCacheUtil); + }); + + it('re-derives from the dumped blobs when the stored version is stale', () => { + const map = jp.getAgenticAggregates([2]); + // isl percentiles from the two-turn profile blob (1000, 2000). + expect(map[2]?.isl?.n).toBe(2); + expect(map[2]?.isl?.mean).toBe(1500); + // kv cache util from the server blob (0.4, 0.6). + expect(map[2]?.kvCacheUtil?.n).toBe(2); + expect(map[2]?.kvCacheUtil?.mean).toBeCloseTo(0.5); + }); + + it('returns a blank aggregate for an id with no trace_replay', () => { + const map = jp.getAgenticAggregates([3]); + expect(map[3]).toEqual({ + id: 3, + isl: null, + osl: null, + kvCacheUtil: null, + prefixCacheHitRate: null, + }); + }); +}); + +describe('derived agentic metrics mirror', () => { + it('fast path reads the derived fields out of aggregate_stats', () => { + const map = jp.getDerivedAgenticMetrics([1]); + expect(map[1]?.normalized_session_time_s).toBe(0.65); + expect(map[1]?.p90_prefill_tps_per_user).toBe(42); + expect(map[1]?.p75_normalized_e2e_400_s).toBe(0.7); + }); + + it('blob fallback recomputes via computeDerivedFromBlob', () => { + const map = jp.getDerivedAgenticMetrics([2]); + expect(map[2]?.normalized_session_time_s).not.toBeNull(); + expect(map[2]?.p90_prefill_tps_per_user).not.toBeNull(); + }); + + it('omits ids without a trace_replay (SQL joins on it)', () => { + const map = jp.getDerivedAgenticMetrics([3]); + 
expect(map[3]).toBeUndefined(); + }); +}); + +describe('request timeline mirror', () => { + it('serves the precomputed timeline at the current version', () => { + const t = jp.getRequestTimeline(1); + expect(t?.version).toBe(REQUEST_TIMELINE_VERSION); + expect(t?.requests).toHaveLength(1); + }); + + it('recomputes from the profile blob when stale', () => { + const t = jp.getRequestTimeline(2); + expect(t?.version).toBe(REQUEST_TIMELINE_VERSION); + // Two turns in the fixture blob. + expect(t?.requests).toHaveLength(2); + }); + + it('returns null for an id without a trace_replay', () => { + expect(jp.getRequestTimeline(3)).toBeNull(); + }); +}); + +describe('trace server metrics mirror', () => { + it('computes chart series from the server blob (no current chart_series)', async () => { + const m = await jp.getTraceServerMetrics(1); + expect(m).not.toBeNull(); + expect(m?.meta.hardware).toBe('h100'); + expect(m?.meta.run_url).toBe('https://github.com/x/runs/555/attempts/1'); + expect(m?.kvCacheUsage.length).toBeGreaterThan(0); + }); + + it('returns null for an id without a trace_replay blob', async () => { + expect(await jp.getTraceServerMetrics(3)).toBeNull(); + }); +}); + +describe('trace histograms mirror', () => { + it('extracts isl/osl from the current request_timeline (fast path)', () => { + const map = jp.getTraceHistograms([1]); + expect(map[1]?.isl).toEqual([1000]); + expect(map[1]?.osl).toEqual([200]); + }); + + it('falls back to the profile blob when the timeline is stale', () => { + const map = jp.getTraceHistograms([2]); + expect(map[2]?.isl).toEqual([1000, 2000]); + expect(map[2]?.osl).toEqual([200, 400]); + }); + + it('omits ids without a trace_replay', () => { + const map = jp.getTraceHistograms([3]); + expect(map[3]).toBeUndefined(); + }); +}); + +describe('benchmark siblings mirror', () => { + it('groups rows sharing the SKU within the run, sorted by decode_tp then offload', () => { + const res = jp.getBenchmarkSiblings(1); + 
expect(res).not.toBeNull(); + expect(res?.sku.model).toBe('testm'); + expect(res?.sku.dataset_slug).toBe('ds-new'); // via run_datasets + // ids 1 (tp2/off/conc16), 2 (tp4/on), 3 (tp2/off/conc8) share the SKU. + // ORDER BY decode_tp asc → tp2 group (ids 1,3) before tp4 (id 2); within + // tp2 both are offload 'off', so final tie-break is conc asc → id 3 (conc 8) + // before id 1 (conc 16). Matches the SQL `order by … br.conc`. + const ids = res?.siblings.map((s) => s.id); + expect(ids).toEqual([3, 1, 2]); + expect(res?.siblings.find((s) => s.id === 1)?.is_current).toBe(true); + expect(res?.siblings.find((s) => s.id === 1)?.has_trace).toBe(true); + expect(res?.siblings.find((s) => s.id === 3)?.has_trace).toBe(false); + // total_requests coalesces total_requests_completed then num_requests_total. + expect(res?.siblings.find((s) => s.id === 1)?.total_requests).toBe(200); + expect(res?.siblings.find((s) => s.id === 2)?.total_requests).toBe(180); + }); + + it('returns null for an unknown benchmark id', () => { + expect(jp.getBenchmarkSiblings(9999)).toBeNull(); + }); +}); + +describe('dataset mirrors', () => { + it('listDatasets orders newest ingested first', () => { + const rows = jp.listDatasets(); + expect(rows.map((r) => r.slug)).toEqual(['ds-new', 'ds-old']); + // chart_data is excluded from the list rows (DatasetRecord, not DatasetDetail). + expect((rows[0] as unknown as Record).chart_data).toBeUndefined(); + }); + + it('getDataset returns one dataset including chart_data', () => { + const d = jp.getDataset('ds-new'); + expect(d?.label).toBe('DS New'); + expect(d?.chart_data).toEqual({ hist: [1, 2, 3] }); + expect(jp.getDataset('nope')).toBeNull(); + }); + + it('renders ingested_at in Postgres ::text form (parity with the SQL path)', () => { + // Dump stores ISO ('2026-06-20T00:00:00Z'); the SQL query casts ::text → + // '2026-06-20 00:00:00+00'. The mirror must match, not leak the ISO form. 
+ expect(jp.getDataset('ds-new')?.ingested_at).toBe('2026-06-20 00:00:00+00'); + expect(jp.listDatasets()[0]?.ingested_at).toBe('2026-06-20 00:00:00+00'); + }); + + it('listConversations applies case-insensitive search, sort, and pagination', () => { + // Default sort = tokens (total_in desc): alpha(300), plain(200), AGENT-beta(100). + const all = jp.listConversations('ds-new'); + expect(all?.total).toBe(3); + expect(all?.items.map((c) => c.conv_id)).toEqual(['agent-alpha', 'plain-gamma', 'AGENT-beta']); + + // ILIKE '%agent%' matches 'agent-alpha' and 'AGENT-beta' (case-insensitive). + const search = jp.listConversations('ds-new', { search: 'agent' }); + expect(search?.total).toBe(2); + expect(search?.items.map((c) => c.conv_id).toSorted()).toEqual(['AGENT-beta', 'agent-alpha']); + + // sort=turns desc → beta(9), alpha(5), gamma(2). + const byTurns = jp.listConversations('ds-new', { sort: 'turns' }); + expect(byTurns?.items.map((c) => c.conv_id)).toEqual([ + 'AGENT-beta', + 'agent-alpha', + 'plain-gamma', + ]); + + // sort=subagents desc → gamma(4), alpha(2), beta(1). + const bySub = jp.listConversations('ds-new', { sort: 'subagents' }); + expect(bySub?.items.map((c) => c.conv_id)).toEqual([ + 'plain-gamma', + 'agent-alpha', + 'AGENT-beta', + ]); + + // sort=id asc. Postgres en_US.utf8 collation (verified against the live DB) + // orders 'agent-alpha' before 'AGENT-beta'; String.localeCompare matches. + const byId = jp.listConversations('ds-new', { sort: 'id' }); + expect(byId?.items.map((c) => c.conv_id)).toEqual(['agent-alpha', 'AGENT-beta', 'plain-gamma']); + + // limit + offset. + const paged = jp.listConversations('ds-new', { limit: 1, offset: 1 }); + expect(paged?.total).toBe(3); + expect(paged?.items.map((c) => c.conv_id)).toEqual(['plain-gamma']); + + // Unknown dataset → null. 
+ expect(jp.listConversations('nope')).toBeNull(); + }); + + it('getConversation returns one flamegraph structure', () => { + const c = jp.getConversation('ds-new', 'agent-alpha'); + expect(c?.num_turns).toBe(5); + expect(c?.structure).toEqual({ nodes: [] }); + expect(jp.getConversation('ds-new', 'missing')).toBeNull(); + expect(jp.getConversation('nope', 'agent-alpha')).toBeNull(); + }); +}); diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 2d335d17..f6c626f0 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -12,9 +12,47 @@ import { existsSync, readFileSync } from 'node:fs'; import { dirname, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; +// Runtime-value cross-module imports use extensionless relative paths (the +// convention in etl/queries here), NOT the `.js` type-only style below — the +// app bundler (Turbopack) resolves the former but not a `.js` on a value import. +import { + CHART_SERIES_VERSION, + computeChartSeries, + type ChartSeries, +} from './etl/compute-chart-series'; +import { + REQUEST_TIMELINE_VERSION, + computeRequestTimeline, + type RequestTimeline, +} from './etl/compute-request-timeline'; +import { + extractIslOsl, + extractServerMetricSamples, + percentilesOf, + STATS_VERSION, + type AgenticAggregate, + type AgenticAggregateMap, +} from './queries/agentic-aggregates'; import type { BenchmarkRow, BenchmarkWorkerRow } from './queries/benchmarks.js'; +import type { BenchmarkSiblings } from './queries/benchmark-siblings.js'; +import type { + ConversationDetail, + ConversationList, + ConversationListItem, + DatasetDetail, + DatasetRecord, + ListConversationsOpts, +} from './queries/datasets.js'; +import { + computeDerivedFromBlob, + type DerivedAgenticMetric, + type DerivedAgenticMetricMap, +} from './queries/derived-agentic-metrics'; import type { EvalRow } from './queries/evaluations.js'; import type { ReliabilityRow } from 
'./queries/reliability.js'; +import type { TraceHistogramMap, TraceHistogramPoint } from './queries/trace-histograms.js'; +import type { PointMeta, TraceServerMetrics } from './queries/trace-server-metrics.js'; +import type { ConversationStructure } from './etl/weka-structure.js'; import type { AvailabilityRow, ChangelogRow, @@ -22,6 +60,7 @@ import type { RunConfigRow, WorkflowRunRow, } from './queries/workflow-info.js'; +import { gunzipSync } from 'node:zlib'; // --------------------------------------------------------------------------- // Raw table types (matching dump-db.ts output) @@ -132,6 +171,78 @@ interface RawServerLog { server_log: string; } +/** + * A serialized bytea column from the dump. dump-db.ts writes postgres.js Buffers + * via Buffer.prototype.toJSON() → {"type":"Buffer","data":[…]}. Decode with + * {@link bufferFromJson} back to a Node Buffer for the compute helpers (which + * take the same `Buffer | null` a live DB read would hand them). + */ +interface BufferJson { + type: 'Buffer'; + data: number[]; +} + +/** + * agentic_trace_replay rows. Blob columns are big (server_metrics_json_gz can be + * ~17 MB compressed), so this whole table is lazy-loaded like server_logs. The + * precomputed JSONB columns (aggregate_stats / chart_series / request_timeline) + * are what the fast paths actually serve; the blobs only feed the version-stale + * fallback (reusing the exact same compute helpers the SQL path uses). 
+ */ +interface RawTraceReplay { + id: number; + profile_export_jsonl_gz: BufferJson | null; + profile_export_uncompressed_size: number | null; + server_metrics_csv: BufferJson | null; + server_metrics_csv_size: number | null; + server_metrics_json_gz: BufferJson | null; + server_metrics_json_uncompressed_size: number | null; + aggregate_stats: Record | null; + chart_series: Record | null; + request_timeline: Record | null; + created_at: string; +} + +interface RawDataset { + id: string; + slug: string; + label: string; + variant: string; + description: string | null; + hf_url: string | null; + license: string | null; + conversation_count: number; + summary: Record; + chart_data: Record; + dataset_version: number; + ingested_at: string; +} + +interface RawDatasetConversation { + id: number; + dataset_id: string; + conv_id: string; + models: string[]; + num_turns: number; + num_subagent_groups: number; + total_in: number; + total_out: number; + total_cached: number; + structure: Record; +} + +interface RawRunDataset { + workflow_run_id: number; + dataset_slug: string; + created_at: string; +} + +/** Decode a dumped bytea ({type:'Buffer',data:[…]}) back into a Node Buffer. */ +function bufferFromJson(b: BufferJson | null | undefined): Buffer | null { + if (!b || !Array.isArray(b.data)) return null; + return Buffer.from(b.data); +} + // --------------------------------------------------------------------------- // In-memory store (lazy-loaded singleton) // --------------------------------------------------------------------------- @@ -152,6 +263,24 @@ interface Store { serverLogs: Map | null; /** benchmark_result.id → server_log_id (for server-log lookups) */ benchmarkServerLogMap: Map; + /** benchmark_result.id → trace_replay_id (for agentic blob-backed lookups) */ + benchmarkTraceReplayMap: Map; + /** + * Lazy-loaded: agentic_trace_replay.json holds the big compressed blobs. + * Keyed by trace_replay id. 
Loaded on first agentic-route access, mirroring + * the server_logs lazy pattern. Null until then. + */ + traceReplay: Map | null; + /** Datasets registry (small, eager). */ + datasets: RawDataset[]; + /** dataset id → dataset (fast lookup). */ + datasetsById: Map; + /** dataset slug → dataset (slug is unique). */ + datasetsBySlug: Map; + /** All conversation rows (eager; counts + structure JSONB, no blobs). */ + datasetConversations: RawDatasetConversation[]; + /** workflow_run_id → dataset_slug (for benchmark-siblings SKU deep-link). */ + runDatasetSlugByRunId: Map; } let store: Store | null = null; @@ -192,6 +321,15 @@ function getStore(): Store { const rawEvals = loadTable(resolvedDir, 'eval_results.json'); const rawAvailability = loadTable(resolvedDir, 'availability.json'); const rawChangelog = loadTable(resolvedDir, 'changelog_entries.json'); + // Datasets + run_datasets are small (registry rows + one row per run) and + // dataset_conversations holds only counts + a per-conversation structure + // JSONB — all comfortably eager. agentic_trace_replay is lazy (blobs) below. + const rawDatasets = loadTable(resolvedDir, 'datasets.json'); + const rawDatasetConversations = loadTable( + resolvedDir, + 'dataset_conversations.json', + ); + const rawRunDatasets = loadTable(resolvedDir, 'run_datasets.json'); // Postgres bigserial columns serialize as strings in JSON — coerce to numbers. for (const wr of rawRuns) { @@ -216,6 +354,18 @@ function getStore(): Store { cl.id = Number(cl.id); cl.workflow_run_id = Number(cl.workflow_run_id); } + // Postgres bigint/bigserial + integer columns serialize as strings in JSON — + // coerce to numbers so the mirrors do numeric math and JSON parity matches. 
+ for (const d of rawDatasets) d.conversation_count = Number(d.conversation_count); + for (const dc of rawDatasetConversations) { + dc.id = Number(dc.id); + dc.num_turns = Number(dc.num_turns); + dc.num_subagent_groups = Number(dc.num_subagent_groups); + dc.total_in = Number(dc.total_in); + dc.total_out = Number(dc.total_out); + dc.total_cached = Number(dc.total_cached); + } + for (const rd of rawRunDatasets) rd.workflow_run_id = Number(rd.workflow_run_id); // Build configs index const configs = new Map(); @@ -242,6 +392,26 @@ function getStore(): Store { } } + // Build benchmark → trace_replay_id map. `trace_replay_id` was added by the + // agentic migration; older dumps lack it (undefined → treated as "no trace"). + const benchmarkTraceReplayMap = new Map(); + for (const br of rawBenchmarks) { + const trId = (br as { trace_replay_id?: number | string | null }).trace_replay_id; + if (trId !== null && trId !== undefined) { + benchmarkTraceReplayMap.set(br.id, Number(trId)); + } + } + + // Datasets indexes + const datasetsById = new Map(); + const datasetsBySlug = new Map(); + for (const d of rawDatasets) { + datasetsById.set(d.id, d); + datasetsBySlug.set(d.slug, d); + } + const runDatasetSlugByRunId = new Map(); + for (const rd of rawRunDatasets) runDatasetSlugByRunId.set(rd.workflow_run_id, rd.dataset_slug); + store = { dumpDir: resolvedDir, configs, @@ -254,15 +424,52 @@ function getStore(): Store { changelog: rawChangelog, serverLogs: null, // lazy-loaded on first getServerLog() call (can be multiple GB) benchmarkServerLogMap, + benchmarkTraceReplayMap, + traceReplay: null, // lazy-loaded on first agentic blob-backed access (blobs are big) + datasets: rawDatasets, + datasetsById, + datasetsBySlug, + datasetConversations: rawDatasetConversations, + runDatasetSlugByRunId, }; console.log( - `json-provider: loaded ${rawConfigs.length} configs, ${latestRunsById.size} runs, ${rawBenchmarks.length} benchmarks`, + `json-provider: loaded ${rawConfigs.length} configs, 
${latestRunsById.size} runs, ` + + `${rawBenchmarks.length} benchmarks, ${rawDatasets.length} datasets, ` + + `${rawDatasetConversations.length} conversations`, ); return store; } +/** + * Lazy-load agentic_trace_replay.json on first blob-backed access. Mirrors the + * server_logs lazy pattern — the file carries the big compressed blobs so we + * only pay to parse it when an agentic route actually needs a fallback (most + * routes serve the precomputed JSONB columns and never touch the blobs). The + * blob columns arrive as {type:'Buffer',data:[…]} and are decoded to Buffers on + * demand by the callers that need them. + */ +function getTraceReplay(): Map { + const s = getStore(); + if (s.traceReplay) return s.traceReplay; + console.log('json-provider: loading agentic_trace_replay.json (this may take a moment)...'); + const raw = loadTable(s.dumpDir, 'agentic_trace_replay.json'); + const map = new Map(); + for (const tr of raw) map.set(Number(tr.id), tr); + s.traceReplay = map; + console.log(`json-provider: loaded ${map.size} agentic_trace_replay rows`); + return map; +} + +/** Resolve a benchmark_result id → its agentic_trace_replay row (or null). */ +function traceReplayForBenchmark(benchmarkResultId: number): RawTraceReplay | null { + const s = getStore(); + const trId = s.benchmarkTraceReplayMap.get(benchmarkResultId); + if (trId === null || trId === undefined) return null; + return getTraceReplay().get(trId) ?? null; +} + // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- @@ -272,6 +479,32 @@ function toDateString(d: string): string { return d.slice(0, 10); } +/** + * Render a dumped timestamptz to match Postgres `::text` output, so the + * datasets mirrors are byte-identical to the SQL path. 
postgres.js decodes a + * timestamptz to a JS Date, which the dump serialized as ISO + * ("2026-07-02T09:00:00.000Z"); Postgres `::text` instead yields + * "2026-07-02 09:00:00+00" (space separator, no trailing ".000", "+00" offset, + * fractional seconds only when non-zero). Convert ISO → that form; pass through + * anything already in Postgres form (e.g. a dump produced without the Date step). + */ +const pad = (n: number, w = 2): string => String(n).padStart(w, '0'); + +function pgTimestampText(v: string): string { + // Already Postgres text form (has a space date/time separator, no 'T'). + if (!v.includes('T')) return v; + const d = new Date(v); + if (Number.isNaN(d.getTime())) return v; + const base = + `${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())} ` + + `${pad(d.getUTCHours())}:${pad(d.getUTCMinutes())}:${pad(d.getUTCSeconds())}`; + const ms = d.getUTCMilliseconds(); + // Postgres prints fractional seconds only when non-zero (up to 6 digits; + // a Date carries at most ms precision, and dumps here have zero fractions). + const frac = ms === 0 ? '' : `.${pad(ms, 3).replace(/0+$/u, '')}`; + return `${base}${frac}+00`; +} + function buildRunUrl(wr: RawWorkflowRun): string | null { return wr.html_url ? `${wr.html_url}/attempts/${wr.run_attempt}` : null; } @@ -717,3 +950,477 @@ export function getServerLog(benchmarkResultId: number): string | null { return s.serverLogs.get(logId) ?? null; } + +// --------------------------------------------------------------------------- +// Agentic per-point mirrors (blob-backed; lazy trace_replay) +// +// Parity strategy: the SQL fast path reads the precomputed JSONB column +// (aggregate_stats / chart_series / request_timeline) when its inner `version` +// matches the current constant, else it re-derives from the gzipped blob using +// a shared pure helper (computeChartSeries / computeRequestTimeline / +// extract*+percentilesOf / computeDerivedFromBlob). 
These mirrors take the +// same two branches so dump mode yields the same payloads: serve the stored +// JSONB at the current version, otherwise gunzip the dumped blob and reuse the +// identical helper (the blobs ARE in the dump). Only if a stale/missing JSONB +// row also has no usable blob do we fall through to null — exactly as the SQL +// path does. No version-gated payload is ever served blindly. +// --------------------------------------------------------------------------- + +function blankAggregate(id: number): AgenticAggregate { + return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null }; +} + +/** Read a finite numeric metric out of a benchmark_results.metrics JSONB (or null). */ +function readFiniteMetric(m: Record, key: string): number | null { + const v = m[key]; + return typeof v === 'number' && Number.isFinite(v) ? v : null; +} + +/** + * NULLS-FIRST rank for an offload_mode value, mirroring the SQL + * `order by … br.offload_mode nulls first`: null → rank 0, else rank 1 keyed by + * the string value. + */ +function offloadRank(v: string | null | undefined): [number, string] { + return v === null || v === undefined ? [0, ''] : [1, v]; +} + +/** conv_id ASC tie-break, matching Postgres en_US.utf8 `order by conv_id asc`. */ +function compareConvId(a: RawDatasetConversation, b: RawDatasetConversation): number { + return a.conv_id.localeCompare(b.conv_id); +} + +/** + * Mirror of {@link import('./queries/agentic-aggregates.js').getAgenticAggregates}. + * Fast path: aggregate_stats at the current STATS_VERSION. Fallback: gunzip the + * profile blob for isl/osl percentiles and the server blob for KV/prefix, reusing + * the same extract*+percentilesOf helpers the SQL path uses. 
+ */ +export function getAgenticAggregates(benchmarkResultIds: number[]): AgenticAggregateMap { + if (benchmarkResultIds.length === 0) return {}; + const result: AgenticAggregateMap = {}; + for (const id of benchmarkResultIds) { + const agg = blankAggregate(id); + const tr = traceReplayForBenchmark(id); + if (tr) { + const stats = tr.aggregate_stats as { + version?: number; + isl?: AgenticAggregate['isl']; + osl?: AgenticAggregate['osl']; + kvCacheUtil?: AgenticAggregate['kvCacheUtil']; + prefixCacheHitRate?: AgenticAggregate['prefixCacheHitRate']; + } | null; + if (stats && Number(stats.version) === STATS_VERSION) { + agg.isl = stats.isl ?? null; + agg.osl = stats.osl ?? null; + agg.kvCacheUtil = stats.kvCacheUtil ?? null; + agg.prefixCacheHitRate = stats.prefixCacheHitRate ?? null; + } else { + // Stale/missing precomputed stats → re-derive from the dumped blobs, + // reusing the exact SQL-path helpers (blobs are in the dump). + const profile = bufferFromJson(tr.profile_export_jsonl_gz); + if (profile) { + try { + const jsonl = gunzipSync(profile).toString('utf8'); + const { isl, osl } = extractIslOsl(jsonl); + agg.isl = percentilesOf(isl); + agg.osl = percentilesOf(osl); + } catch { + // malformed blob — leave nulls + } + } + const server = bufferFromJson(tr.server_metrics_json_gz); + if (server) { + try { + const json = gunzipSync(server).toString('utf8'); + const samples = extractServerMetricSamples(json); + agg.kvCacheUtil = percentilesOf(samples.kvCacheUtil); + agg.prefixCacheHitRate = percentilesOf(samples.prefixCacheHitRate); + } catch { + // dump-mode blobs are small (no >512 MB decompress case) — leave nulls + } + } + } + } + result[id] = agg; + } + return result; +} + +/** + * Mirror of {@link import('./queries/derived-agentic-metrics.js').getDerivedAgenticMetrics}. + * Fast path: aggregate_stats at STATS_VERSION. Fallback: computeDerivedFromBlob + * over the gunzipped profile blob (same helper as the SQL path). 
Ids without a + * trace_replay row are omitted, matching the SQL join. + */ +export function getDerivedAgenticMetrics(benchmarkResultIds: number[]): DerivedAgenticMetricMap { + if (benchmarkResultIds.length === 0) return {}; + const result: DerivedAgenticMetricMap = {}; + for (const id of benchmarkResultIds) { + const tr = traceReplayForBenchmark(id); + if (!tr) continue; // SQL joins on trace_replay — no row → omitted + const stats = tr.aggregate_stats as { + version?: number; + normalizedSessionTimeS?: number | null; + p90PrefillTpsPerUser?: number | null; + normalizedE2e400?: { p75?: number | null; p90?: number | null } | null; + } | null; + if (stats && Number(stats.version) === STATS_VERSION) { + result[id] = { + id, + normalized_session_time_s: stats.normalizedSessionTimeS ?? null, + p90_prefill_tps_per_user: stats.p90PrefillTpsPerUser ?? null, + p75_normalized_e2e_400_s: stats.normalizedE2e400?.p75 ?? null, + p90_normalized_e2e_400_s: stats.normalizedE2e400?.p90 ?? null, + }; + continue; + } + // Fallback: re-derive from the dumped profile blob via the shared helper. + const profile = bufferFromJson(tr.profile_export_jsonl_gz); + if (!profile) continue; // SQL fallback requires the blob to be non-null + try { + const jsonl = gunzipSync(profile).toString('utf8'); + const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } = + computeDerivedFromBlob(jsonl); + const entry: DerivedAgenticMetric = { + id, + normalized_session_time_s, + p90_prefill_tps_per_user, + p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null, + p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null, + }; + result[id] = entry; + } catch { + // malformed blob — omit id (SQL treats missing as "no data") + } + } + return result; +} + +/** + * Mirror of {@link import('./queries/request-timeline.js').getRequestTimeline}. + * Fast path: request_timeline at REQUEST_TIMELINE_VERSION. 
Fallback: + * computeRequestTimeline over the profile blob (same helper as the SQL path). + */ +export function getRequestTimeline(benchmarkResultId: number): RequestTimeline | null { + const tr = traceReplayForBenchmark(benchmarkResultId); + if (!tr) return null; + const stored = tr.request_timeline as (RequestTimeline & { version?: number }) | null; + if (stored && Number(stored.version) === REQUEST_TIMELINE_VERSION) return stored; + return computeRequestTimeline(bufferFromJson(tr.profile_export_jsonl_gz)); +} + +/** + * Mirror of {@link import('./queries/trace-server-metrics.js').getTraceServerMetrics}. + * Fast path: chart_series at CHART_SERIES_VERSION. Fallback: computeChartSeries + * over the server blob (same helper as the SQL path). Returns null when the point + * has no server_metrics blob, matching the SQL `has_blob` gate. + */ +export async function getTraceServerMetrics( + benchmarkResultId: number, +): Promise<TraceServerMetrics | null> { + const s = getStore(); + const br = s.benchmarks.find((b) => b.id === benchmarkResultId); + if (!br) return null; + const c = s.configs.get(br.config_id); + const wr = s.latestRunsById.get(br.workflow_run_id) ?? null; + if (!c) return null; + const tr = traceReplayForBenchmark(benchmarkResultId); + // SQL gates on (server_metrics blob present AND trace_replay_id non-null). + const hasServerBlob = tr ? tr.server_metrics_json_gz !== null : false; + if (!tr || !hasServerBlob) return null; + + const num = (key: string): number | null => { + const v = br.metrics?.[key]; + return typeof v === 'number' && Number.isFinite(v) ? v : null; + }; + const meta: PointMeta = { + id: br.id, + hardware: c.hardware, + framework: c.framework, + model: c.model, + precision: c.precision, + spec_method: c.spec_method, + disagg: c.disagg, + conc: br.conc, + offload_mode: (br as { offload_mode?: string | null }).offload_mode ?? null, + isl: br.isl, + osl: br.osl, + benchmark_type: br.benchmark_type ?? 'single_turn', + date: toDateString(br.date), + run_url: wr ?
buildRunUrl(wr) : null, + server_gpu_cache_hit_rate: num('server_gpu_cache_hit_rate'), + server_cpu_cache_hit_rate: num('server_cpu_cache_hit_rate'), + }; + const kvCachePoolTokens = num('kv_cache_pool_tokens'); + + const merge = (series: ChartSeries): TraceServerMetrics => ({ + meta, + kvCachePoolTokens, + startNs: series.startNs, + endNs: series.endNs, + durationS: series.durationS, + timeslicesCount: series.timeslicesCount, + kvCacheUsage: series.kvCacheUsage, + prefixCacheHitRate: series.prefixCacheHitRate, + queueDepth: series.queueDepth, + promptTokensBySource: series.promptTokensBySource, + prefillTps: series.prefillTps, + decodeTps: series.decodeTps, + prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], + hostKvCacheUsage: series.hostKvCacheUsage ?? [], + kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [], + metricSources: series.metricSources ?? [], + }); + + const stored = tr.chart_series as (ChartSeries & { version?: number }) | null; + if (stored && Number(stored.version) === CHART_SERIES_VERSION) return merge(stored); + + const series = await computeChartSeries(bufferFromJson(tr.server_metrics_json_gz), { + framework: c.framework, + disagg: c.disagg, + }); + if (!series) return null; + return merge(series); +} + +/** + * Mirror of {@link import('./queries/trace-histograms.js').getTraceHistograms}. + * Fast path: pull isl/osl out of a current request_timeline. Fallback: parse the + * profile blob's per-request input/output_sequence_length. Ids without a + * trace_replay row are omitted (SQL joins on it). 
+ */ +export function getTraceHistograms(benchmarkResultIds: number[]): TraceHistogramMap { + if (benchmarkResultIds.length === 0) return {}; + const result: TraceHistogramMap = {}; + for (const id of benchmarkResultIds) { + const tr = traceReplayForBenchmark(id); + if (!tr) continue; + const timeline = tr.request_timeline as (RequestTimeline & { version?: number }) | null; + if (timeline && Number(timeline.version) === REQUEST_TIMELINE_VERSION) { + const isl: number[] = []; + const osl: number[] = []; + for (const req of timeline.requests) { + if (typeof req.isl === 'number' && Number.isFinite(req.isl)) isl.push(req.isl); + if (typeof req.osl === 'number' && Number.isFinite(req.osl)) osl.push(req.osl); + } + result[id] = { id, isl, osl } satisfies TraceHistogramPoint; + continue; + } + // Fallback: parse the profile blob (same field extraction the SQL path uses). + const profile = bufferFromJson(tr.profile_export_jsonl_gz); + if (!profile) continue; + try { + const jsonl = gunzipSync(profile).toString('utf8'); + const { isl, osl } = extractIslOsl(jsonl); + result[id] = { id, isl, osl } satisfies TraceHistogramPoint; + } catch { + // malformed blob — omit id + } + } + return result; +} + +/** + * Mirror of {@link import('./queries/benchmark-siblings.js').getBenchmarkSiblings}. + * Plain-row logic: resolve the seed SKU, then every row in the same workflow_run + * sharing hw/framework/model/precision/spec_method/benchmark_type. Sort mirrors + * the SQL `order by decode_tp, decode_ep, offload_mode nulls first, conc`. + */ +export function getBenchmarkSiblings(benchmarkResultId: number): BenchmarkSiblings | null { + const s = getStore(); + const seed = s.benchmarks.find((b) => b.id === benchmarkResultId); + if (!seed) return null; + const seedC = s.configs.get(seed.config_id); + const seedWr = s.latestRunsById.get(seed.workflow_run_id); + // getBenchmarkSiblings joins workflow_runs (inner) for github_run_id — a + // missing run yields no seed row in SQL. 
+ if (!seedC || !seedWr) return null; + const seedType = seed.benchmark_type ?? 'single_turn'; + + const rows = s.benchmarks + .filter((b) => { + if (b.workflow_run_id !== seed.workflow_run_id) return false; + if ((b.benchmark_type ?? 'single_turn') !== seedType) return false; + const c = s.configs.get(b.config_id); + if (!c) return false; + return ( + c.hardware === seedC.hardware && + c.framework === seedC.framework && + c.model === seedC.model && + c.precision === seedC.precision && + c.spec_method === seedC.spec_method + ); + }) + .map((b) => ({ b, c: s.configs.get(b.config_id)! })) + // ORDER BY c.decode_tp, c.decode_ep, br.offload_mode NULLS FIRST, br.conc + .toSorted((x, y) => { + if (x.c.decode_tp !== y.c.decode_tp) return x.c.decode_tp - y.c.decode_tp; + if (x.c.decode_ep !== y.c.decode_ep) return x.c.decode_ep - y.c.decode_ep; + const [xr, xv] = offloadRank((x.b as { offload_mode?: string | null }).offload_mode); + const [yr, yv] = offloadRank((y.b as { offload_mode?: string | null }).offload_mode); + if (xr !== yr) return xr - yr; + if (xv !== yv) return xv.localeCompare(yv); + return x.b.conc - y.b.conc; + }); + + const siblings = rows.map(({ b, c }) => { + const totalRequests = + readFiniteMetric(b.metrics, 'total_requests_completed') ?? + readFiniteMetric(b.metrics, 'num_requests_total'); + return { + id: b.id, + conc: b.conc, + offload_mode: (b as { offload_mode?: string | null }).offload_mode ?? 
null, + decode_tp: c.decode_tp, + decode_ep: c.decode_ep, + decode_dp_attention: c.decode_dp_attention, + decode_num_workers: c.decode_num_workers, + prefill_tp: c.prefill_tp, + prefill_ep: c.prefill_ep, + prefill_dp_attention: c.prefill_dp_attention, + prefill_num_workers: c.prefill_num_workers, + num_prefill_gpu: c.num_prefill_gpu, + num_decode_gpu: c.num_decode_gpu, + disagg: c.disagg, + is_multinode: c.is_multinode, + tput_per_gpu: readFiniteMetric(b.metrics, 'tput_per_gpu'), + total_requests: totalRequests, + is_current: b.id === benchmarkResultId, + has_trace: s.benchmarkTraceReplayMap.has(b.id), + }; + }); + + return { + sku: { + hardware: seedC.hardware, + framework: seedC.framework, + model: seedC.model, + precision: seedC.precision, + spec_method: seedC.spec_method, + benchmark_type: seedType, + github_run_id: seedWr.github_run_id, + date: toDateString(seed.date), + dataset_slug: s.runDatasetSlugByRunId.get(seed.workflow_run_id) ?? null, + }, + siblings, + }; +} + +// --------------------------------------------------------------------------- +// Dataset mirrors (plain-row logic) +// --------------------------------------------------------------------------- + +/** Mirror of {@link import('./queries/datasets.js').listDatasets}: newest first. */ +export function listDatasets(): DatasetRecord[] { + const s = getStore(); + // ORDER BY ingested_at DESC, slug ASC. ingested_at is an ISO string. + const sorted = s.datasets.toSorted((a, b) => { + const t = b.ingested_at.localeCompare(a.ingested_at); + return t === 0 ? a.slug.localeCompare(b.slug) : t; + }); + return sorted.map((d) => ({ + id: d.id, + slug: d.slug, + label: d.label, + variant: d.variant, + description: d.description, + hf_url: d.hf_url, + license: d.license, + conversation_count: Number(d.conversation_count), + summary: d.summary, + ingested_at: pgTimestampText(d.ingested_at), + })); +} + +/** Mirror of {@link import('./queries/datasets.js').getDataset}: one dataset incl. chart_data. 
*/ +export function getDataset(slug: string): DatasetDetail | null { + const s = getStore(); + const d = s.datasetsBySlug.get(slug); + if (!d) return null; + return { + id: d.id, + slug: d.slug, + label: d.label, + variant: d.variant, + description: d.description, + hf_url: d.hf_url, + license: d.license, + conversation_count: Number(d.conversation_count), + summary: d.summary, + chart_data: d.chart_data, + ingested_at: pgTimestampText(d.ingested_at), + }; +} + +const CONVERSATIONS_MAX_LIMIT = 200; + +/** + * Mirror of {@link import('./queries/datasets.js').listConversations}. Applies + * the same ILIKE (case-insensitive substring) search, sort (tokens/turns/ + * subagents/id), limit clamp (1..200), and offset the SQL uses. `total` + * reflects the filtered count before pagination. + */ +export function listConversations( + slug: string, + opts: ListConversationsOpts = {}, +): ConversationList | null { + const s = getStore(); + const dataset = s.datasetsBySlug.get(slug); + if (!dataset) return null; + + const limit = Math.min(CONVERSATIONS_MAX_LIMIT, Math.max(1, opts.limit ?? 50)); + const offset = Math.max(0, opts.offset ?? 0); + const search = opts.search?.trim(); + const needle = search ? search.toLowerCase() : null; + + const filtered = s.datasetConversations.filter( + (dc) => + dc.dataset_id === dataset.id && + (needle === null || dc.conv_id.toLowerCase().includes(needle)), + ); + const total = filtered.length; + + // ORDER BY [DESC], conv_id ASC — replicate the SQL tie-break. + const sort = opts.sort ?? 
'tokens'; + const sorted = filtered.toSorted((a, b) => { + if (sort === 'turns') return b.num_turns - a.num_turns || compareConvId(a, b); + if (sort === 'subagents') + return b.num_subagent_groups - a.num_subagent_groups || compareConvId(a, b); + if (sort === 'id') return compareConvId(a, b); + return b.total_in - a.total_in || compareConvId(a, b); // 'tokens' (default) + }); + + const items: ConversationListItem[] = sorted.slice(offset, offset + limit).map((dc) => ({ + conv_id: dc.conv_id, + models: dc.models, + num_turns: Number(dc.num_turns), + num_subagent_groups: Number(dc.num_subagent_groups), + total_in: Number(dc.total_in), + total_out: Number(dc.total_out), + total_cached: Number(dc.total_cached), + })); + + return { total, items }; +} + +/** Mirror of {@link import('./queries/datasets.js').getConversation}: one flamegraph. */ +export function getConversation(slug: string, convId: string): ConversationDetail | null { + const s = getStore(); + const dataset = s.datasetsBySlug.get(slug); + if (!dataset) return null; + const dc = s.datasetConversations.find( + (r) => r.dataset_id === dataset.id && r.conv_id === convId, + ); + if (!dc) return null; + return { + conv_id: dc.conv_id, + models: dc.models, + num_turns: Number(dc.num_turns), + num_subagent_groups: Number(dc.num_subagent_groups), + total_in: Number(dc.total_in), + total_out: Number(dc.total_out), + total_cached: Number(dc.total_cached), + structure: dc.structure as unknown as ConversationStructure, + }; +} diff --git a/packages/db/src/load-dump.ts b/packages/db/src/load-dump.ts index b1b4af70..108627b6 100644 --- a/packages/db/src/load-dump.ts +++ b/packages/db/src/load-dump.ts @@ -22,20 +22,44 @@ import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils'; const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1 }); -// Tables with serial/bigserial PKs that need sequence resets +// Tables with serial/bigserial PKs that need sequence resets. 
+// (datasets.id is text and run_datasets.workflow_run_id is a plain bigint FK — +// neither owns a sequence, so they're intentionally omitted.) const SEQUENCES: { seq: string; table: string; col: string }[] = [ { seq: 'configs_id_seq', table: TABLE_NAMES.configs, col: 'id' }, { seq: 'server_logs_id_seq', table: TABLE_NAMES.serverLogs, col: 'id' }, { seq: 'workflow_runs_id_seq', table: TABLE_NAMES.workflowRuns, col: 'id' }, + { seq: 'agentic_trace_replay_id_seq', table: TABLE_NAMES.agenticTraceReplay, col: 'id' }, { seq: 'benchmark_results_id_seq', table: TABLE_NAMES.benchmarkResults, col: 'id' }, { seq: 'eval_results_id_seq', table: TABLE_NAMES.evalResults, col: 'id' }, { seq: 'eval_samples_id_seq', table: TABLE_NAMES.evalSamples, col: 'id' }, { seq: 'run_stats_id_seq', table: TABLE_NAMES.runStats, col: 'id' }, { seq: 'changelog_entries_id_seq', table: TABLE_NAMES.changelogEntries, col: 'id' }, + { + seq: 'dataset_conversations_id_seq', + table: TABLE_NAMES.datasetConversations, + col: 'id', + }, ]; const BATCH_SIZE = 500; +/** The JSON shape Buffer.prototype.toJSON() emits (what dump-db writes for bytea). */ +interface BufferJson { + type: 'Buffer'; + data: number[]; +} + +/** True for a `{ type: 'Buffer', data: number[] }` object (a serialized bytea). */ +function isBufferJson(val: unknown): val is BufferJson { + return ( + typeof val === 'object' && + val !== null && + (val as { type?: unknown }).type === 'Buffer' && + Array.isArray((val as { data?: unknown }).data) + ); +} + /** * Stream-parse a JSON array file, yielding objects one at a time. * Avoids loading the entire file into memory. @@ -118,18 +142,36 @@ async function loadTable(dumpDir: string, table: string): Promise { const flush = async () => { if (batch.length === 0 || !columns) return; - // Track which columns have plain-object values (JSONB) for casting - const jsonbCols = new Set(); - const values: unknown[][] = batch.map((row) => + // Track which columns need a per-value cast. 
JSONB columns pass objects + // as-is under a `::jsonb` cast; BYTEA columns are reconstructed into a real + // Node Buffer under a `::bytea` cast. Casts are tracked per (row, col) — + // not per column — because a nullable blob/jsonb column can be null on some + // rows and populated on others within the same batch, and a NULL param + // needs no cast (Postgres would reject `NULL::bytea` from an untyped param + // only in edge cases, but more importantly the cast set must match the + // value actually bound for that cell). + const jsonbCells = new Set(); + const byteaCells = new Set(); + const values: unknown[][] = batch.map((row, rowIdx) => columns!.map((col, colIdx) => { const val = row[col]; if (val === null || val === undefined) return null; // Postgres text[] arrays: convert JSON ["a","b"] → Postgres {a,b} literal if (Array.isArray(val) && val.every((v) => typeof v === 'string')) return `{${(val as string[]).map((v) => `"${v.replaceAll('\\', String.raw`\\`).replaceAll('"', String.raw`\"`)}"`).join(',')}}`; + // BYTEA columns: dump-db.ts serialized the postgres.js Buffer via + // Buffer.prototype.toJSON() → {"type":"Buffer","data":[…]}. Rebuild the + // Buffer and bind it under a ::bytea cast so the blob round-trips + // byte-for-byte (agentic_trace_replay.*_gz / server_metrics_csv). Must + // be checked BEFORE the generic object→jsonb branch, or the blob would + // be mis-cast to jsonb and corrupt on insert. + if (isBufferJson(val)) { + byteaCells.add(`${rowIdx}:${colIdx}`); + return Buffer.from((val as BufferJson).data); + } // JSONB columns: pass objects as-is (sql.unsafe serializes them correctly with ::jsonb cast) if (typeof val === 'object') { - jsonbCols.add(colIdx); + jsonbCells.add(`${rowIdx}:${colIdx}`); return val; } return val as string | number | boolean; @@ -143,7 +185,9 @@ async function loadTable(dumpDir: string, table: string): Promise { `(${columns! .map((_col, j) => { const p = `$${i * columns!.length + j + 1}`; - return jsonbCols.has(j) ? 
`${p}::jsonb` : p; + if (byteaCells.has(`${i}:${j}`)) return `${p}::bytea`; + if (jsonbCells.has(`${i}:${j}`)) return `${p}::jsonb`; + return p; }) .join(', ')})`, ) diff --git a/packages/db/src/reset-db.ts b/packages/db/src/reset-db.ts index a895617c..760eb4c2 100644 --- a/packages/db/src/reset-db.ts +++ b/packages/db/src/reset-db.ts @@ -20,8 +20,9 @@ const sql = createAdminSql({ async function reset(): Promise { console.log('=== db:reset ==='); console.log( - 'This will DROP all tables (configs, workflow_runs, benchmark_results,\n' + - 'server_logs, run_stats, eval_results, changelog_entries, availability, schema_migrations).\n' + + 'This will DROP all tables (configs, workflow_runs, agentic_trace_replay,\n' + + 'benchmark_results, server_logs, run_stats, eval_results, changelog_entries,\n' + + 'availability, datasets, dataset_conversations, run_datasets, schema_migrations).\n' + 'You must run db:migrate after this before ingesting data.\n', ); @@ -37,10 +38,15 @@ async function reset(): Promise { await sql`DROP MATERIALIZED VIEW IF EXISTS latest_benchmarks`; await sql`DROP VIEW IF EXISTS latest_workflow_runs`; + // Child-before-parent order (CASCADE handles the rest, but keep it FK-safe). 
await sql`DROP TABLE IF EXISTS + ${sql(TABLE_NAMES.runDatasets)}, + ${sql(TABLE_NAMES.datasetConversations)}, + ${sql(TABLE_NAMES.datasets)}, ${sql(TABLE_NAMES.changelogEntries)}, ${sql(TABLE_NAMES.evalResults)}, ${sql(TABLE_NAMES.benchmarkResults)}, + ${sql(TABLE_NAMES.agenticTraceReplay)}, ${sql(TABLE_NAMES.serverLogs)}, ${sql(TABLE_NAMES.runStats)}, ${sql(TABLE_NAMES.availability)}, From d1dd59f12d64d5cee38070bbbc20bf6fa08020ec Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 18:35:16 -0500 Subject: [PATCH 29/40] fix(inference): gate agentic default sequence on availability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AgenticTraces default resolved before availability loaded (static SEQUENCE_OPTIONS fallback), so fixed-seq-only models flashed 'Agentic Traces', fired a wasted agentic fetch, then snapped to 1k/1k. New pure resolveEffectiveSequence helper (mirrors default-precisions pattern) returns the real scenario only once availability is known; benchmark fetching gates on the new sequenceResolved flag; non-agentic models fall back to 8k/1k (master's default) when available. Fixes the url-params and historical-trends e2e failures the PR description labels 'pre-existing' — they were caused by this default and now pass with no assertion changes. ttft-x-axis-toggle gets spec-scoped agentic intercepts (shared fixtures have no agentic rows). Verified live: llama70b -> 8K/1K, zero agentic calls, one benchmarks fetch; dsr1 -> Agentic Traces, one fetch. 
--- .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 109 ++++++++++++++++ packages/app/cypress/support/mock-data.ts | 3 + .../src/components/GlobalFilterContext.tsx | 65 +++++++++- .../components/inference/InferenceContext.tsx | 8 +- packages/app/src/lib/default-sequence.test.ts | 119 ++++++++++++++++++ packages/app/src/lib/default-sequence.ts | 52 ++++++++ 6 files changed, 349 insertions(+), 7 deletions(-) create mode 100644 packages/app/src/lib/default-sequence.test.ts create mode 100644 packages/app/src/lib/default-sequence.ts diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index 924ff9a9..c634cd27 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -18,8 +18,117 @@ const interceptDerivedMetrics = () => { }).as('derivedAgenticMetrics'); }; +// This spec exercises the agentic x-axis modes, which only exist when the +// selected model resolves to the Agentic Traces scenario. The default e2e +// fixtures (cypress/fixtures/api/*.json) have NO agentic rows for any model, so +// after the availability-gated effectiveSequence fix the bare-/inference default +// correctly resolves to a fixed-seq scenario. We therefore inject agentic +// availability + benchmark rows for the default model VIA SPEC-SCOPED INTERCEPTS +// (not the shared fixtures) so this test — and only this test — sees the agentic +// view. Scoping to intercepts keeps every other spec's default fixed-seq. +const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro is the default model +const AGENTIC_DATE = '2026-06-12'; + +// Percentile ladder for one metric family (median/p75/p90/p95/p99/std). 
+const percentileLadder = (prefix: string, base: number): Record<string, number> => ({ + [`median_${prefix}`]: base, + [`p75_${prefix}`]: base * 1.2, + [`p90_${prefix}`]: base * 1.5, + [`p95_${prefix}`]: base * 1.7, + [`p99_${prefix}`]: base * 2.2, + [`std_${prefix}`]: base * 0.3, +}); + +const agenticMetrics = (conc: number): Record<string, number> => { + const scale = conc / 16; + const itl = 0.011 * scale; + return { + ...percentileLadder('ttft', 0.4 * scale), + ...percentileLadder('tpot', 0.012 * scale), + ...percentileLadder('itl', itl), + ...percentileLadder('e2el', 8 * scale), + median_intvty: 1 / itl, + p75_intvty: 1 / (itl * 1.2), + p90_intvty: 1 / (itl * 1.5), + p99_intvty: 1 / (itl * 2.2), + std_intvty: (1 / itl) * 0.1, + tput_per_gpu: 950 / Math.sqrt(scale), + output_tput_per_gpu: 210, + input_tput_per_gpu: 740, + total_tput_tps: 7600 * conc * 0.05, + }; +}; + +const agenticGpus = [ + { hardware: 'b200', framework: 'vllm', disagg: false }, + { hardware: 'b300', framework: 'vllm', disagg: false }, +]; + +// Availability: default model has BOTH agentic and fixed-seq, so the default +// resolves to agentic (the product-intended, agentic-preferred behavior).
+const agenticAvailability = [ + ...agenticGpus.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: null, + osl: null, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'agentic_traces', + date: AGENTIC_DATE, + })), + ...agenticGpus.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: 8192, + osl: 1024, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'single_turn', + date: AGENTIC_DATE, + })), +]; + +let benchIdCursor = 900000; +const agenticBenchmarks = agenticGpus.flatMap((g) => + [16, 64, 128].map((conc) => ({ + id: benchIdCursor++, + hardware: g.hardware, + framework: g.framework, + model: DEFAULT_MODEL_DB_KEY, + precision: 'fp4', + spec_method: 'none', + disagg: g.disagg, + is_multinode: false, + prefill_tp: 8, + decode_tp: 8, + num_prefill_gpu: 8, + num_decode_gpu: 8, + isl: null, + osl: null, + conc, + offload_mode: 'off', + benchmark_type: 'agentic_traces', + image: 'vllm/vllm-openai:v0.9.0', + metrics: agenticMetrics(conc), + workers: null, + date: AGENTIC_DATE, + run_url: null, + })), +); + +const interceptAgenticData = () => { + cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as('availability'); + cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('benchmarks'); +}; + describe('X-Axis Mode Toggle (inference chart)', () => { before(() => { + interceptAgenticData(); cy.visit('/inference', { onBeforeLoad(win) { win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index b2164bcc..490fca87 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -423,6 +423,9 @@ export function createMockGlobalFilterContext( selectedPrecisions: [Precision.FP4], setSelectedPrecisions: 
namedStub('setSelectedPrecisions_global'), effectiveSequence: Sequence.EightK_OneK, + // Mocks represent a settled state: availability is known and the sequence is + // resolved. Tests exercising the pre-availability window override this. + sequenceResolved: true, effectivePrecisions: [Precision.FP4], selectedRunDate: '2025-03-01', setSelectedRunDate: namedStub('setSelectedRunDate_global'), diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index fddf7871..e7aa751c 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -24,6 +24,14 @@ function isEnumValue>(e: T, v: string): v is T[ const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; +// Placeholder for the public (non-null) `effectiveSequence` during the window +// before availability has loaded. It must be a fixed-seq scenario — never +// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a +// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is +// the pre-agentic default for non-agentic models. Consumers that must not act on +// an unresolved sequence gate on `sequenceResolved` instead. 
+const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK; + import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; import { useUrlState } from '@/hooks/useUrlState'; @@ -38,6 +46,7 @@ import { } from '@/lib/data-mappings'; import { computeAutoSwitchDecision } from '@/lib/unofficial-run-auto-switch'; import { countCurvesByPrecision, resolveEffectivePrecisions } from '@/lib/default-precisions'; +import { resolveEffectiveSequence } from '@/lib/default-sequence'; import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api'; interface RunInfo { @@ -66,6 +75,15 @@ export interface GlobalFilterContextType { // Effective (validated) values effectiveSequence: Sequence; + /** + * Whether `effectiveSequence` reflects the selected model's real availability + * (DB or unofficial run) rather than the pre-load placeholder. False during + * the brief window before availability loads. Consumers that trigger data + * fetches or render sequence-dependent labels should gate on this so a + * fixed-seq-only model never fires an agentic fetch or flashes "Agentic + * Traces" before availability settles. + */ + sequenceResolved: boolean; effectivePrecisions: string[]; // Run date & run ID @@ -288,11 +306,39 @@ export function GlobalFilterProvider({ return merged.length > 0 ? merged : SEQUENCE_OPTIONS; }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]); - // Synchronously validated sequence - const effectiveSequence = useMemo(() => { - if (availableSequences.includes(selectedSequence)) return selectedSequence; - return availableSequences[0] ?? selectedSequence; - }, [availableSequences, selectedSequence]); + // Whether we actually know the selected model's sequences yet. Availability + // may arrive from the DB (`availabilityRows`) OR from a loaded unofficial run + // (`unofficialAvailable` for this model) — either source lets us resolve a + // trustworthy effectiveSequence. 
Until then `availableSequences` is the static + // SEQUENCE_OPTIONS fallback (which contains AgenticTraces), so resolving + // eagerly would fetch + label an agentic scenario for fixed-seq-only models, + // then snap once availability lands (flash + wasted request). + const availabilityLoaded = useMemo( + () => + availabilityRows !== undefined || unofficialAvailable.some((a) => a.model === selectedModel), + [availabilityRows, unofficialAvailable, selectedModel], + ); + + // Synchronously validated sequence. + // + // `resolveEffectiveSequence` returns null while availability is still loading + // — we surface that as `sequenceResolved` so InferenceContext can gate the + // benchmark fetch until the real sequence is known (no agentic fetch fires for + // a fixed-seq-only model). For the non-null public `effectiveSequence` value + // we substitute a fixed-seq scenario (never AgenticTraces) during that window + // so the scenario selector never flashes "Agentic Traces"; the chart shows its + // normal loading skeleton until `sequenceResolved` flips true. + const resolvedSequence = useMemo( + () => + resolveEffectiveSequence({ + selectedSequence, + availableSequences, + availabilityLoaded, + }), + [selectedSequence, availableSequences, availabilityLoaded], + ); + const sequenceResolved = resolvedSequence !== null; + const effectiveSequence = resolvedSequence ?? PRE_AVAILABILITY_SEQUENCE; // Precisions available for the selected model + sequence (DB ∪ unofficial run) const availablePrecisions = useMemo(() => { @@ -439,7 +485,11 @@ export function GlobalFilterProvider({ g_model: selectedModel, g_rundate: selectedRunDate, g_runid: selectedRunId, - i_seq: effectiveSequence, + // Don't pin the sequence to the URL until it's resolved from real + // availability — writing the pre-load placeholder (8k/1k) would clobber a + // shared `?i_seq=agentic-traces` link before the model's availability + // confirms it has agentic data. + i_seq: sequenceResolved ? 
effectiveSequence : undefined, // Only pin the precision in the URL once chosen explicitly; in auto mode // leave it out so the link keeps following the per-model densest default. i_prec: precisionExplicit ? effectivePrecisions.join(',') : undefined, @@ -449,6 +499,7 @@ export function GlobalFilterProvider({ selectedRunDate, selectedRunId, effectiveSequence, + sequenceResolved, effectivePrecisions, precisionExplicit, setUrlParams, @@ -463,6 +514,7 @@ export function GlobalFilterProvider({ selectedPrecisions, setSelectedPrecisions, effectiveSequence, + sequenceResolved, effectivePrecisions, selectedRunDate: effectiveRunDate, setSelectedRunDate: setSelectedRunDateManual, @@ -485,6 +537,7 @@ export function GlobalFilterProvider({ selectedSequence, selectedPrecisions, effectiveSequence, + sequenceResolved, effectivePrecisions, effectiveRunDate, setSelectedRunDateManual, diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 98962126..b9cbc7ce 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -110,6 +110,7 @@ export function InferenceProvider({ selectedModel, setSelectedModel, effectiveSequence, + sequenceResolved, setSelectedSequence, effectivePrecisions, setSelectedPrecisions, @@ -414,7 +415,12 @@ export function InferenceProvider({ userCosts, userPowers, effectiveRunDate, - isActive, + // Gate benchmark fetching on sequenceResolved: before availability loads we + // don't yet know the model's real sequence, and the selectedSequence default + // is AgenticTraces. Fetching now would fire the agentic data path for a + // fixed-seq-only model, then refetch once availability snaps the sequence. + // The chart's normal loading state covers this brief window. + isActive && sequenceResolved, latestDate, selectedPercentile, compareGpuPair ?? 
null, diff --git a/packages/app/src/lib/default-sequence.test.ts b/packages/app/src/lib/default-sequence.test.ts new file mode 100644 index 00000000..4fd8a6b9 --- /dev/null +++ b/packages/app/src/lib/default-sequence.test.ts @@ -0,0 +1,119 @@ +import { describe, expect, it } from 'vitest'; + +import { Sequence } from './data-mappings'; +import { resolveEffectiveSequence } from './default-sequence'; + +describe('resolveEffectiveSequence', () => { + describe('availability gate (rule 1)', () => { + it('returns null while availability has not loaded, even if the selection looks valid', () => { + // Pre-availability, availableSequences is the static fallback (which + // contains AgenticTraces). Resolving here would fetch + label an agentic + // scenario for a fixed-seq-only model, so we hold off. + expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.AgenticTraces, + availableSequences: [ + Sequence.OneK_OneK, + Sequence.OneK_EightK, + Sequence.EightK_OneK, + Sequence.AgenticTraces, + ], + availabilityLoaded: false, + }), + ).toBeNull(); + }); + + it('returns null pre-availability regardless of the selected sequence', () => { + expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.EightK_OneK, + availableSequences: [Sequence.EightK_OneK], + availabilityLoaded: false, + }), + ).toBeNull(); + }); + }); + + describe('honors a valid selection (rule 2a)', () => { + it('keeps AgenticTraces when the model actually has agentic data (dsr1 case)', () => { + // DeepSeek-R1 in the seeded DB has both agentic and 8k/1k — the agentic + // default must survive so the PR intent (agentic-preferred) holds. 
+ expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.AgenticTraces, + availableSequences: [Sequence.EightK_OneK, Sequence.AgenticTraces], + availabilityLoaded: true, + }), + ).toBe(Sequence.AgenticTraces); + }); + + it('keeps a fixed-seq selection when available', () => { + expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.OneK_OneK, + availableSequences: [Sequence.OneK_OneK, Sequence.EightK_OneK], + availabilityLoaded: true, + }), + ).toBe(Sequence.OneK_OneK); + }); + }); + + describe('fallback ordering when the selection is unavailable (rule 2b/2c)', () => { + it('for a fixed-seq-only model, agentic default falls back to 8k/1k, not the raw first entry (llama70b case)', () => { + // Llama-3.3-70B has only 8k/1k in the seeded DB. The agentic default is + // unavailable, so it must resolve to a fixed-seq scenario — here the sole + // available one. + expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.AgenticTraces, + availableSequences: [Sequence.EightK_OneK], + availabilityLoaded: true, + }), + ).toBe(Sequence.EightK_OneK); + }); + + it('prefers 8k/1k over availableSequences[0] when both 1k/1k and 8k/1k exist', () => { + // DB row order can surface 1k/1k first. Master defaulted non-agentic + // models to 8k/1k, so prefer it rather than snapping to 1k/1k. 
+ expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.AgenticTraces, + availableSequences: [Sequence.OneK_OneK, Sequence.EightK_OneK], + availabilityLoaded: true, + }), + ).toBe(Sequence.EightK_OneK); + }); + + it('falls back to availableSequences[0] when 8k/1k is not available', () => { + expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.AgenticTraces, + availableSequences: [Sequence.OneK_OneK, Sequence.OneK_EightK], + availabilityLoaded: true, + }), + ).toBe(Sequence.OneK_OneK); + }); + + it('never resolves to AgenticTraces via fallback when the model lacks it', () => { + const result = resolveEffectiveSequence({ + selectedSequence: Sequence.AgenticTraces, + availableSequences: [Sequence.OneK_OneK, Sequence.OneK_EightK, Sequence.EightK_OneK], + availabilityLoaded: true, + }); + expect(result).not.toBe(Sequence.AgenticTraces); + expect(result).toBe(Sequence.EightK_OneK); + }); + + it('returns the selection itself when the model has no sequences at all', () => { + // Degenerate case: keeps a non-null value so the type contract holds; the + // chart shows empty. (availabilityLoaded true but zero sequences.) + expect( + resolveEffectiveSequence({ + selectedSequence: Sequence.OneK_OneK, + availableSequences: [], + availabilityLoaded: true, + }), + ).toBe(Sequence.OneK_OneK); + }); + }); +}); diff --git a/packages/app/src/lib/default-sequence.ts b/packages/app/src/lib/default-sequence.ts new file mode 100644 index 00000000..d06a5307 --- /dev/null +++ b/packages/app/src/lib/default-sequence.ts @@ -0,0 +1,52 @@ +import { Sequence } from './data-mappings'; + +/** + * Effective-sequence resolution. + * + * `selectedSequence` defaults to {@link Sequence.AgenticTraces} (a deliberate + * product choice — agentic-preferred), but not every model has agentic data. + * This helper turns the raw user/default selection into the sequence the chart + * should actually render, given what the selected model offers. + * + * Two rules, in order: + * + * 1. 
**Availability gate.** Until availability rows have loaded we do NOT know + * which sequences the model has. Resolving eagerly here would pick the static + * fallback list (which contains AgenticTraces) and make the page fetch + label + * an agentic scenario for fixed-seq-only models (e.g. Llama-3.3-70B), then + * snap to a fixed-seq scenario once availability arrives — a visible flash of + * "Agentic Traces" plus a wasted request. When `availabilityLoaded` is false + * we return `null`; callers gate data fetching and selector display on a + * non-null result (a loading skeleton covers this window, which is short). + * + * 2. **Fallback ordering.** Once availability is known: keep the user's + * `selectedSequence` if the model has it. Otherwise fall back to a sensible + * fixed-seq scenario. `availableSequences[0]` follows DB row order, which can + * surface `1k/1k` even when `8k/1k` exists — but `8k/1k` was the pre-agentic + * default for non-agentic models, so prefer it when present to match that + * long-standing behavior. Only if neither the selection nor `8k/1k` is + * available do we fall to `availableSequences[0]`. + */ +export function resolveEffectiveSequence({ + selectedSequence, + availableSequences, + availabilityLoaded, +}: { + selectedSequence: Sequence; + availableSequences: Sequence[]; + availabilityLoaded: boolean; +}): Sequence | null { + // Rule 1: do not commit to a sequence before we know what the model has. + if (!availabilityLoaded) return null; + + // Rule 2a: honor the user's / default selection when the model supports it. + if (availableSequences.includes(selectedSequence)) return selectedSequence; + + // Rule 2b: prefer 8k/1k (the pre-agentic default for non-agentic models) over + // whatever availableSequences[0] happens to be (DB row order can yield 1k/1k). 
+ if (availableSequences.includes(Sequence.EightK_OneK)) return Sequence.EightK_OneK; + + // Rule 2c: last resort — first available, or the selection itself if the model + // has no sequences at all (keeps the type non-null; downstream shows empty). + return availableSequences[0] ?? selectedSequence; +} From 1d4b027fee797e2ffe23d700b749c7973d17d4b2 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 18:45:30 -0500 Subject: [PATCH 30/40] fix(datasets): escape LIKE wildcards and cap conversation search length The public conversation search embedded user input in ILIKE unescaped and uncapped: '%' matched every row and long stacked-wildcard patterns could push Neon to statement timeout (500s). escapeLikePattern escapes backslash-first then %/_ so searches are literal substring matches (now agreeing exactly with the dump-mode mirror's .includes semantics); the route trims and rejects >100 chars with 400 before touching the DB. Live: ?search=%25 30 -> 0 rows; 150-char input -> 400; real searches unchanged. Adds 14 tests. 
--- .../[slug]/conversations/route.test.ts | 116 ++++++++++++++++++ .../v1/datasets/[slug]/conversations/route.ts | 16 ++- .../json-provider.agentic-datasets.test.ts | 17 +++ packages/db/src/queries/datasets.test.ts | 32 ++++- packages/db/src/queries/datasets.ts | 25 +++- 5 files changed, 203 insertions(+), 3 deletions(-) create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts new file mode 100644 index 00000000..b582e79c --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts @@ -0,0 +1,116 @@ +import { describe, expect, it, vi, beforeEach } from 'vitest'; + +const { mockListConversations, mockGetDb } = vi.hoisted(() => ({ + mockListConversations: vi.fn(), + mockGetDb: vi.fn(() => 'mock-sql'), +})); + +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: false, +})); + +vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({ + listConversations: mockListConversations, +})); + +vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({ + listConversations: vi.fn(), +})); + +vi.mock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { GET } from './route'; +import { NextRequest } from 'next/server'; + +function req(path: string): NextRequest { + return new NextRequest(new URL(path, 'http://localhost')); +} + +const PARAMS = Promise.resolve({ slug: 'test-dataset' }); + +beforeEach(() => { + vi.clearAllMocks(); +}); + +describe('GET /api/v1/datasets/[slug]/conversations — search input validation', () => { + it('returns 400 when search exceeds 100 characters', async () => { + const longSearch = 'a'.repeat(101); + const res = await 
GET(req(`/api/v1/datasets/test-dataset/conversations?search=${longSearch}`), { + params: PARAMS, + }); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('search too long'); + // DB must not be called. + expect(mockListConversations).not.toHaveBeenCalled(); + }); + + it('accepts a search string exactly at the 100-character limit', async () => { + const exactSearch = 'a'.repeat(100); + mockListConversations.mockResolvedValueOnce({ total: 0, items: [] }); + const res = await GET( + req(`/api/v1/datasets/test-dataset/conversations?search=${exactSearch}`), + { params: PARAMS }, + ); + expect(res.status).toBe(200); + }); + + it('trims whitespace before applying the length check', async () => { + // A 101-char string that is 100 chars of spaces + 1 real char should become + // 1 char after trimming — well under the limit. + const paddedSearch = `${' '.repeat(100)}a`; + mockListConversations.mockResolvedValueOnce({ total: 1, items: [] }); + const res = await GET( + req(`/api/v1/datasets/test-dataset/conversations?search=${paddedSearch}`), + { params: PARAMS }, + ); + expect(res.status).toBe(200); + expect(mockListConversations).toHaveBeenCalledWith( + 'mock-sql', + 'test-dataset', + expect.objectContaining({ search: 'a' }), + ); + }); + + it('returns 404 when the dataset slug is unknown', async () => { + mockListConversations.mockResolvedValueOnce(null); + const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), { + params: PARAMS, + }); + expect(res.status).toBe(404); + const body = await res.json(); + expect(body.error).toBe('Not found'); + }); + + it('returns conversation data for a valid request', async () => { + const mockData = { total: 2, items: [{ conv_id: 'c1' }, { conv_id: 'c2' }] }; + mockListConversations.mockResolvedValueOnce(mockData); + const res = await GET( + req('/api/v1/datasets/test-dataset/conversations?search=agent&sort=turns&limit=10&offset=0'), + { params: PARAMS }, + ); + 
expect(res.status).toBe(200); + const body = await res.json(); + expect(body).toEqual(mockData); + expect(mockListConversations).toHaveBeenCalledWith( + 'mock-sql', + 'test-dataset', + expect.objectContaining({ search: 'agent', sort: 'turns', limit: 10, offset: 0 }), + ); + }); + + it('returns 500 when the query throws', async () => { + mockListConversations.mockRejectedValueOnce(new Error('Neon timeout')); + const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), { + params: PARAMS, + }); + expect(res.status).toBe(500); + const body = await res.json(); + expect(body.error).toBe('Internal server error'); + }); +}); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts index 196c29d6..2dad4ace 100644 --- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts @@ -34,6 +34,13 @@ const getCachedConversations = cachedQuery( 'dataset-conversations', ); +// Maximum search string length accepted. Longer strings are rejected with 400 +// rather than being forwarded to the DB: an ILIKE on an unindexed conv_id column +// with a very long pattern (or many stacked wildcards) can exhaust Neon's +// statement timeout and return a 500. 100 chars is generous for any real +// conversation-id prefix while keeping the attack surface small. +const MAX_SEARCH_LENGTH = 100; + /** * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort= * Paginated conversation list (counts only, no flamegraph structure). @@ -41,7 +48,14 @@ const getCachedConversations = cachedQuery( export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) { const { slug } = await params; const sp = request.nextUrl.searchParams; - const search = sp.get('search') ?? ''; + const rawSearch = sp.get('search') ?? 
''; + const search = rawSearch.trim(); + + // Reject search strings that exceed the length cap before touching the DB. + if (search.length > MAX_SEARCH_LENGTH) { + return NextResponse.json({ error: 'search too long' }, { status: 400 }); + } + const limit = Math.min(200, Math.max(1, Number(sp.get('limit')) || 50)); const offset = Math.max(0, Number(sp.get('offset')) || 0); const sortParam = sp.get('sort') ?? 'tokens'; diff --git a/packages/db/src/json-provider.agentic-datasets.test.ts b/packages/db/src/json-provider.agentic-datasets.test.ts index d6cb6601..e2e97908 100644 --- a/packages/db/src/json-provider.agentic-datasets.test.ts +++ b/packages/db/src/json-provider.agentic-datasets.test.ts @@ -541,6 +541,23 @@ describe('dataset mirrors', () => { expect(jp.listDatasets()[0]?.ingested_at).toBe('2026-06-20 00:00:00+00'); }); + it('listConversations: search for literal "%" matches no rows (wildcard semantics do not apply)', () => { + // The SQL path now escapes LIKE metacharacters via escapeLikePattern before + // embedding into the ILIKE pattern. The json-provider mirror uses + // .toLowerCase().includes() which already treats input literally. Both paths + // must agree: a search for "%" finds only conv_ids that contain a literal + // percent character — none of the fixture conv_ids do. + const result = jp.listConversations('ds-new', { search: '%' }); + expect(result?.total).toBe(0); + expect(result?.items).toHaveLength(0); + }); + + it('listConversations: search for literal "_" matches no rows', () => { + // Similarly, "_" must not act as a single-character wildcard. + const result = jp.listConversations('ds-new', { search: '_' }); + expect(result?.total).toBe(0); + }); + it('listConversations applies case-insensitive search, sort, and pagination', () => { // Default sort = tokens (total_in desc): alpha(300), plain(200), AGENT-beta(100). 
const all = jp.listConversations('ds-new'); diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts index c1676445..d6693536 100644 --- a/packages/db/src/queries/datasets.test.ts +++ b/packages/db/src/queries/datasets.test.ts @@ -1,7 +1,37 @@ import { describe, expect, it } from 'vitest'; import type { DbClient } from '../connection.js'; -import { getConversation, listConversations, listDatasets } from './datasets.js'; +import { escapeLikePattern, getConversation, listConversations, listDatasets } from './datasets.js'; + +describe('escapeLikePattern', () => { + it('leaves plain text unchanged', () => { + expect(escapeLikePattern('agent')).toBe('agent'); + }); + + it('escapes % so it is treated as a literal percent, not a wildcard', () => { + expect(escapeLikePattern('%')).toBe(String.raw`\%`); + expect(escapeLikePattern('50%off')).toBe(String.raw`50\%off`); + }); + + it('escapes _ so it is treated as a literal underscore, not a wildcard', () => { + expect(escapeLikePattern('_')).toBe(String.raw`\_`); + expect(escapeLikePattern('conv_id')).toBe(String.raw`conv\_id`); + }); + + it('escapes backslash first to avoid double-escaping', () => { + expect(escapeLikePattern('\\')).toBe(String.raw`\\`); + // A backslash followed by % must become \\\% in the escaped output. + expect(escapeLikePattern(String.raw`\%`)).toBe(String.raw`\\\%`); + }); + + it('handles mixed metacharacters', () => { + expect(escapeLikePattern('50%_off')).toBe(String.raw`50\%\_off`); + }); + + it('returns empty string unchanged', () => { + expect(escapeLikePattern('')).toBe(''); + }); +}); /** * Mock DbClient: returns canned result sets in call order. 
Each call to the diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts index cfefe391..bbcb2ece 100644 --- a/packages/db/src/queries/datasets.ts +++ b/packages/db/src/queries/datasets.ts @@ -106,6 +106,27 @@ export interface ListConversationsOpts { const MAX_LIMIT = 200; +/** + * Escape Postgres LIKE metacharacters in a user-supplied search string so that + * the pattern performs a literal substring match, not a wildcard match. + * + * Postgres LIKE special characters are: % (any sequence), _ (any single char), + * and \ (the default escape character). We escape \ first so our own escape + * sequences are not double-escaped, then % and _. + * + * postgres.js parameterization already prevents SQL injection; this escaping + * fixes wildcard-semantics only (e.g. searching for literal '%' must not match + * every row). + * + * @example escapeLikePattern('50%_off') === '50\\%\\_off' + */ +export function escapeLikePattern(raw: string): string { + return raw + .replaceAll('\\', String.raw`\\`) + .replaceAll('%', String.raw`\%`) + .replaceAll('_', String.raw`\_`); +} + /** * Paginated conversation list for a dataset (by slug). Returns counts only — * the per-conversation `structure` blob is fetched separately by @@ -125,7 +146,9 @@ export async function listConversations( const limit = Math.min(MAX_LIMIT, Math.max(1, opts.limit ?? 50)); const offset = Math.max(0, opts.offset ?? 0); const search = opts.search?.trim(); - const like = search ? `%${search}%` : null; + // Escape LIKE metacharacters so user input is treated as a literal substring. + // Backslash is escaped first to prevent double-escaping our own escape sequences. + const like = search ? 
`%${escapeLikePattern(search)}%` : null; const totalRows = (await sql` select count(*)::int as n From 20cc135e63de9273a148b0d13ee36f6d6b4268a0 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 18:57:20 -0500 Subject: [PATCH 31/40] fix(analytics): track sibling nav, dataset pagination, chart expand; overlay-mode e2e Adds the AGENTS.md-required track() calls (agentic_siblings_navigated, datasets_conversations_page_changed, agentic_chart_expanded) to the three untracked interaction clusters, and the mandated overlay-path regression coverage: ttft-x-axis-toggle gains three tests loading an ?unofficialrun= overlay, switching to the ttft x-axis mode (overlay points still render), and asserting the normalized-e2e suppression banner. Cypress 8/8. --- .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 111 ++++++++++++++++++ .../components/datasets/dataset-detail.tsx | 12 +- .../agentic-point/expandable-chart.tsx | 6 +- .../inference/agentic-point/sibling-nav.tsx | 21 +++- 4 files changed, 144 insertions(+), 6 deletions(-) diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index c634cd27..dca6cd8e 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -197,3 +197,114 @@ describe('X-Axis Mode Toggle (inference chart)', () => { cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); }); + +// --------------------------------------------------------------------------- +// Overlay path — regression coverage for unofficial-run overlays with agentic +// x-axis modes (finding #8 / AGENTS.md: chart features must have overlay tests). +// The overlay behavior itself is verified correct by prior review; this suite +// guards against regressions only and does NOT change overlay behavior. 
+// --------------------------------------------------------------------------- + +// Build a minimal unofficial-run API response that contains one agentic +// overlay benchmark row so the provider builds overlay chart data. +const OVERLAY_RUN_ID = 99900000001; +const OVERLAY_RUN_URL = `https://github.com/SemiAnalysisAI/InferenceX/actions/runs/${OVERLAY_RUN_ID}`; + +const overlayBenchmarkRow = { + id: 800000, + hardware: 'b200', + framework: 'vllm', + model: DEFAULT_MODEL_DB_KEY, + precision: 'fp4', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 8, + decode_tp: 8, + num_prefill_gpu: 8, + num_decode_gpu: 8, + isl: null, + osl: null, + conc: 32, + offload_mode: 'off', + benchmark_type: 'agentic_traces', + image: 'vllm/vllm-openai:v0.9.0', + metrics: agenticMetrics(32), + workers: null, + date: AGENTIC_DATE, + run_url: OVERLAY_RUN_URL, +}; + +const interceptAgenticDataWithOverlay = () => { + interceptAgenticData(); + cy.intercept('GET', '/api/unofficial-run*', { + body: { + runInfos: [ + { + id: OVERLAY_RUN_ID, + name: 'Overlay regression fixture', + branch: 'test/overlay-regression', + sha: 'abc000', + createdAt: `${AGENTIC_DATE}T00:00:00Z`, + url: OVERLAY_RUN_URL, + conclusion: 'success', + status: 'completed', + isNonMainBranch: true, + }, + ], + benchmarks: [overlayBenchmarkRow], + evaluations: [], + }, + }).as('unofficialRun'); +}; + +describe('X-Axis Mode Toggle — overlay path (finding #8 regression guard)', () => { + before(() => { + interceptAgenticDataWithOverlay(); + cy.visit(`/inference?unofficialrun=${OVERLAY_RUN_ID}`, { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }, + }); + cy.wait('@unofficialRun'); + cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible'); + cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); + }); + + it('shows overlay (unofficial-run) watermark SVG when an overlay is loaded', () => { + // The unofficial-run 
pattern watermark appears when isUnofficialRun is true. + cy.get('[data-testid="inference-chart-display"] svg pattern[id^="unofficial-pattern-"]').should( + 'exist', + ); + }); + + it('switches to ttft x-axis mode and renders SVG with overlay points', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').click(); + cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token'); + // Overlay points render as triangles or circles inside the chart SVG. + cy.get('[data-testid="inference-chart-display"] svg').should('exist'); + cy.get('[data-testid="inference-chart-display"] svg').then(($svgs) => { + let total = 0; + $svgs.each((_i, svg) => { + total += svg.querySelectorAll('circle, polygon, path').length; + }); + expect(total).to.be.greaterThan(0); + }); + }); + + it('normalized-e2e mode shows suppression banner for unofficial-run overlays', () => { + interceptDerivedMetrics(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + // The suppression message appears because isUnofficialRun is true and the + // mode is 'normalized-e2e' (documented in ChartDisplay.tsx ~line 640). + cy.contains( + 'Normalized E2E requires persisted per-request traces, so unofficial-run overlays are unavailable for this experimental view.', + ).should('be.visible'); + }); +}); diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index 051e7457..ccf0a944 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -288,7 +288,11 @@ export function DatasetDetail({ slug }: { slug: string }) { {mobileMenuOpen && (
- {NAV_LINKS.map(({ href, label, event }) => ( + {navLinks.map(({ href, label, event }) => ( Date: Thu, 2 Jul 2026 23:39:41 -0500 Subject: [PATCH 34/40] chore: format packages/app/tsconfig.json (pnpm fmt was failing on it) --- packages/app/tsconfig.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/tsconfig.json b/packages/app/tsconfig.json index 8b658cad..3044b60c 100644 --- a/packages/app/tsconfig.json +++ b/packages/app/tsconfig.json @@ -29,7 +29,9 @@ "**/*.tsx", ".next/types/**/*.ts", "json-custom-types.d.ts", - ".next/dev/types/**/*.ts" + ".next/dev/types/**/*.ts", + ".next-e2e/types/**/*.ts", + ".next-e2e/dev/types/**/*.ts" ], "exclude": ["node_modules"] } From 031a7beb7132dea6e1b5b10cfc9f021b40eba477 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 3 Jul 2026 00:06:15 -0500 Subject: [PATCH 35/40] test(e2e): fix the 4 known-failing agentic specs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit agentic-point-time-series: the point-count assertions predated the time-boundary phase slicing — sliceTimelineByPhase puts every request with start >= the profiling boundary in the profiling window, so the warmup-labeled r5 legitimately lands there and cancelled/null-metric filtering yields 6 interactivity/TTFT points and 8 E2E points (traced to phase-slice.ts + time-series-math.ts, not just observed). No product regression. gpu-compare-agentic-detail: shared fixtures carry no agentic rows, so the flow could never render; spec-scoped intercepts (availability, benchmarks, trace-availability) now exercise the tooltip -> View charts link without touching shared fixtures. Full e2e suite: 449/449. These specs were failing all four CI shards on every push of this branch. 
--- .../e2e/agentic-point-time-series.cy.ts | 19 ++- .../e2e/gpu-compare-agentic-detail.cy.ts | 133 +++++++++++++++++- 2 files changed, 145 insertions(+), 7 deletions(-) diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts index 86d57b5d..1e5286c1 100644 --- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -74,8 +74,11 @@ describe('Agentic point request metric time series', () => { cy.get('[data-testid="interactivity-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') .should('have.text', 'P90'); - cy.get('[data-testid="interactivity-point-count"]').should('have.text', '5 points'); - cy.get('svg circle').should('have.length', 5); + // 6 points: profiling slice includes requests 0-4 (profiling) + request 5 + // (phase='warmup' label but start=5s > profiling boundary=0s, so + // sliceTimelineByPhase keeps it); cancelled r6 and null-metric r7/r8 are dropped. + cy.get('[data-testid="interactivity-point-count"]').should('have.text', '6 points'); + cy.get('svg circle').should('have.length', 6); cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT'); cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); @@ -83,8 +86,9 @@ describe('Agentic point request metric time series', () => { cy.get('[data-testid="ttft-over-time-chart"]').within(() => { cy.contains('h2', 'TTFT over time').should('be.visible'); - cy.get('[data-testid="ttft-point-count"]').should('have.text', '5 points'); - cy.get('svg circle').should('have.length', 5); + // Same 6-point slice as interactivity (warmup r5 included by time-boundary). 
+ cy.get('[data-testid="ttft-point-count"]').should('have.text', '6 points'); + cy.get('svg circle').should('have.length', 6); cy.get('svg').should('contain.text', 'TTFT (s)'); cy.get('svg').should('contain.text', 'Cumulative P90 TTFT'); cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); @@ -109,8 +113,11 @@ describe('Agentic point request metric time series', () => { cy.get('[data-testid="ttft-over-time-chart"]').within(() => { cy.get('[data-testid="latency-metric-e2e"]').click(); cy.contains('h2', 'E2E latency over time').should('be.visible'); - cy.get('[data-testid="e2e-point-count"]').should('have.text', '7 points'); - cy.get('svg circle').should('have.length', 7); + // 8 points: e2e = (end−start)/1e6 > 0 for all non-cancelled requests — + // includes r0-r5 (profiling slice) + r7, r8 (subagent/aux with null ttft/tpot + // but valid start/end). Cancelled r6 is excluded. + cy.get('[data-testid="e2e-point-count"]').should('have.text', '8 points'); + cy.get('svg circle').should('have.length', 8); cy.get('svg').should('contain.text', 'E2E latency (s)'); cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency'); diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts index 83171809..6c832e08 100644 --- a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts +++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts @@ -1,11 +1,142 @@ import { unlockAgenticGate } from '../support/e2e'; +// --------------------------------------------------------------------------- +// Spec-scoped fixture helpers +// +// The shared cypress/fixtures/api/*.json files contain ZERO agentic_traces rows +// (by design — adding them flips the bare /inference default to the agentic +// scenario and regresses other specs). 
This spec therefore injects minimal +// agentic data via spec-scoped cy.intercept overrides that shadow the fixture +// server, following the same pattern used in ttft-x-axis-toggle.cy.ts. +// --------------------------------------------------------------------------- + +const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro +const AGENTIC_DATE = '2026-06-12'; + +// Two GPUs with agentic + single_turn entries so the scenario selector resolves +// to agentic (agentic preferred when both types exist for the same model). +const AGENTIC_HARDWARE = [ + { hardware: 'b200', framework: 'vllm', disagg: false }, + { hardware: 'b300', framework: 'vllm', disagg: false }, +]; + +const agenticAvailability = [ + // Agentic rows (isl/osl null). + ...AGENTIC_HARDWARE.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: null, + osl: null, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'agentic_traces', + date: AGENTIC_DATE, + })), + // Single-turn rows alongside — without these the scenario selector may not + // see the "both exist" signal it needs to confidently pick agentic. + ...AGENTIC_HARDWARE.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: 8192, + osl: 1024, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'single_turn', + date: AGENTIC_DATE, + })), +]; + +// Minimal per-metric percentile ladder matching what the chart expects for +// agentic rows (median/p75/p90/p95/p99 + std for each family). 
+const percentileLadder = (prefix: string, base: number): Record<string, number> => ({ + [`median_${prefix}`]: base, + [`p75_${prefix}`]: base * 1.2, + [`p90_${prefix}`]: base * 1.5, + [`p95_${prefix}`]: base * 1.7, + [`p99_${prefix}`]: base * 2.2, + [`std_${prefix}`]: base * 0.3, +}); + +const agenticMetrics = (conc: number): Record<string, number> => { + const scale = conc / 16; + const itl = 0.011 * scale; + return { + ...percentileLadder('ttft', 0.4 * scale), + ...percentileLadder('tpot', 0.012 * scale), + ...percentileLadder('itl', itl), + ...percentileLadder('e2el', 8 * scale), + median_intvty: 1 / itl, + p75_intvty: 1 / (itl * 1.2), + p90_intvty: 1 / (itl * 1.5), + p99_intvty: 1 / (itl * 2.2), + std_intvty: (1 / itl) * 0.1, + tput_per_gpu: 950 / Math.sqrt(scale), + output_tput_per_gpu: 210, + input_tput_per_gpu: 740, + total_tput_tps: 7600 * conc * 0.05, + }; +}; + +// IDs must be unique numbers — the GPU graph uses them as D3 data keys and +// trace-availability is keyed on them. +let benchIdCursor = 800100; +const agenticBenchmarks = AGENTIC_HARDWARE.flatMap((g) => + [16, 64, 128].map((conc) => ({ + id: benchIdCursor++, + hardware: g.hardware, + framework: g.framework, + model: DEFAULT_MODEL_DB_KEY, + precision: 'fp4', + spec_method: 'none', + disagg: g.disagg, + is_multinode: false, + prefill_tp: 8, + prefill_ep: 1, + prefill_dp_attention: false, + prefill_num_workers: 0, + decode_tp: 8, + decode_ep: 1, + decode_dp_attention: false, + decode_num_workers: 0, + num_prefill_gpu: 8, + num_decode_gpu: 8, + isl: null, + osl: null, + conc, + offload_mode: 'off', + benchmark_type: 'agentic_traces', + image: 'vllm/vllm-openai:v0.9.0', + metrics: agenticMetrics(conc), + workers: null, + date: AGENTIC_DATE, + run_url: null, + })), +); + +// All injected IDs with a stored trace blob — the GPU graph renders the +// "View charts" link only when trace-availability returns true for the id. 
+const agenticIds = new Set(agenticBenchmarks.map((b) => b.id)); + describe('GPU comparison agentic point detail', () => { it('exposes the per-point charts as a normal browser link', () => { + // Shadow the fixture-server availability + benchmarks responses with + // spec-scoped agentic data so the GPU graph renders agentic dots. + cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as( + 'agenticAvailability', + ); + cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('agenticBenchmarks'); + // Return true for all injected ids so the "View charts" link appears. cy.intercept('GET', '/api/v1/trace-availability*', (request) => { const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? []; if (ids.length < 20) request.alias = 'gpuTraceAvailability'; - request.continue(); + const result = Object.fromEntries( + ids.filter((id) => agenticIds.has(Number(id))).map((id) => [id, true]), + ); + request.reply({ body: result }); }); cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', { From 7717471e9e0366478838aff191772c76b69d8ce4 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 3 Jul 2026 01:06:59 -0500 Subject: [PATCH 36/40] fix: address reviewer-bot findings (offload dedup, i_seq default, id guards, conv-id encoding, stale intvty, phase filter) - Chart-layer offload dedup: useChartData's latest-date-per-group key and mergeRunScopedRows' claim key now include offload_mode (?? 'off'), completing the SQL-layer fix - a later-dated offload sweep no longer drops the other variant's series or claims its base rows. - PARAM_DEFAULTS.i_seq '' so an explicit 8K/1K pick survives share URLs instead of stripping and reloading as the agentic default; moved PRE_AVAILABILITY_SEQUENCE below imports (code-quality flag). 
- Shared isPersistedBenchmarkId guard (integer > 0) at every agentic link/id-collection site: no /agentic/NaN or /agentic/0 links, overlay-only views skip the derived-metrics fetch instead of 400ing, and agentic/[id] notFound()s on invalid ids. - Conversation ids: encode once at link producers, decode exactly once (removed double decodeURIComponent in page + route); ids with % / # ? now round-trip. - ETL: missing/zero/invalid ITL now deletes the artifact-provided *_intvty key instead of passing p(1/ITL) values through; same fix on the documented overlay-path mirror in benchmark-transform. - Gantt timeline now phase-slices by time boundary like the per-point charts (sliceTimelineByPhase), so both tabs agree on request sets; spec expectation updated with producing-logic justification. - gitignore **/.next-* (secondary dist dirs from multi-server testing). app 2381 + db 399 unit tests, full e2e 449/449. --- .gitignore | 1 + .../e2e/agentic-point-time-series.cy.ts | 12 ++- .../inference/agentic/[id]/page.tsx | 9 ++- .../conversations/[convId]/route.test.ts | 71 +++++++++++++++++ .../[slug]/conversations/[convId]/route.ts | 6 +- .../[slug]/conversations/[convId]/page.tsx | 11 ++- .../src/components/GlobalFilterContext.tsx | 23 +++--- .../components/datasets/dataset-detail.tsx | 2 +- .../agentic-point/request-timeline.tsx | 15 ++-- .../inference/hooks/useChartData.test.ts | 79 ++++++++++++++++++- .../inference/hooks/useChartData.ts | 58 ++++++++++---- .../components/inference/ui/ChartDisplay.tsx | 7 +- .../utils/legend-points-table.test.ts | 9 +++ .../inference/utils/legend-points-table.ts | 3 +- .../inference/utils/tooltip-utils.test.ts | 11 +++ .../inference/utils/tooltipUtils.ts | 3 +- packages/app/src/lib/benchmark-id.test.ts | 33 ++++++++ packages/app/src/lib/benchmark-id.ts | 20 +++++ .../app/src/lib/benchmark-transform.test.ts | 77 ++++++++++++++++++ packages/app/src/lib/benchmark-transform.ts | 53 ++++++++----- packages/app/src/lib/url-state.test.ts | 30 ++++++- 
packages/app/src/lib/url-state.ts | 8 +- packages/db/src/etl/benchmark-mapper.test.ts | 16 ++++ packages/db/src/etl/benchmark-mapper.ts | 12 ++- 24 files changed, 501 insertions(+), 68 deletions(-) create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts create mode 100644 packages/app/src/lib/benchmark-id.test.ts create mode 100644 packages/app/src/lib/benchmark-id.ts diff --git a/.gitignore b/.gitignore index 41071934..c52b0482 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ # next.js **/.next +**/.next-* **/out # production diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts index 1e5286c1..e8161066 100644 --- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -169,10 +169,18 @@ describe('Agentic point request metric time series', () => { }); }); - it('shows total time with no requests in flight on the request timeline', () => { + it('shows total idle time on the request timeline (time-boundary phase slice, consistent with the charts)', () => { cy.get('[data-testid="detail-view-timeline"]').click(); cy.location('search').should('contain', 'view=timeline'); - cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 1.00s (14.3%)'); + // The Gantt now slices by TIME BOUNDARY (sliceTimelineByPhase), matching the + // per-point charts, instead of the per-request phase LABEL. The earliest + // profiling request starts at t=0, so the boundary is 0 and warmup-labelled + // r5 (start=5s) is counted as profiling here too — exactly as the interactivity + // /TTFT charts already count it (their 6-point slice includes r5). That fills + // the former 5–6s gap that label-based filtering left open, so in-flight + // coverage is now continuous across [0s, 7s]: idle 0ms (0.0%). 
A 1.00s value + // here would mean the Gantt had regressed to label-based filtering. + cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 0ms (0.0%)'); cy.get('[data-timeline-row-kind="aux"]') .should('have.css', 'padding-left', '24px') .and('contain.text', 'aux 011 · parallel'); diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx index 34dd169a..91b769bd 100644 --- a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx +++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx @@ -1,7 +1,9 @@ import type { Metadata } from 'next'; +import { notFound } from 'next/navigation'; import { AgenticGate } from '@/components/agentic-gate'; import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail'; +import { isPersistedBenchmarkId } from '@/lib/benchmark-id'; export const metadata: Metadata = { title: 'Agentic trace detail | InferenceX', @@ -14,9 +16,14 @@ export default async function AgenticPointDetailPage({ params: Promise<{ id: string }>; }) { const { id } = await params; + const numericId = Number(id); + // benchmark_results.id is a positive bigserial — anything else (`/agentic/abc`, + // `/agentic/0`, `/agentic/-1`) can never resolve, so 404 instead of rendering a + // blank detail shell that fires doomed id-keyed fetches. 
+ if (!isPersistedBenchmarkId(numericId)) notFound(); return ( - + ); } diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts new file mode 100644 index 00000000..bc374e72 --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts @@ -0,0 +1,71 @@ +import { describe, expect, it, vi, beforeEach } from 'vitest'; + +const { mockGetConversation, mockGetDb } = vi.hoisted(() => ({ + mockGetConversation: vi.fn(), + mockGetDb: vi.fn(() => 'mock-sql'), +})); + +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: false, +})); + +vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({ + getConversation: mockGetConversation, +})); + +vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({ + getConversation: vi.fn(), +})); + +vi.mock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { GET } from './route'; +import { NextRequest } from 'next/server'; + +function req(): NextRequest { + return new NextRequest(new URL('http://localhost/api/v1/datasets/ds/conversations/x')); +} + +/** + * App Router decodes each dynamic route segment EXACTLY ONCE before handing it to + * the handler, so `params.convId` is already the raw conversation id. These tests + * pin the route's contract: it must pass that value straight to the query with NO + * further decodeURIComponent (which would over-decode, mis-key '%'/'/' ids, or + * throw on a lone '%'). The client (useDatasetConversation) encodeURIComponent's + * the id before the fetch, so the whole pipeline decodes once end-to-end. 
+ */ +beforeEach(() => { + vi.clearAllMocks(); + mockGetConversation.mockResolvedValue({ conv_id: 'x', turns: [] }); +}); + +describe('GET /api/v1/datasets/[slug]/conversations/[convId] — decode exactly once', () => { + it('passes the already-decoded convId straight through (no second decode)', async () => { + const params = Promise.resolve({ slug: 'ds', convId: 'a/b%c' }); + const res = await GET(req(), { params }); + expect(res.status).toBe(200); + // 'a/b%c' contains a lone '%'; a second decodeURIComponent here would THROW + // (→ 500). Passing through means the query sees the raw id verbatim. + expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', 'a/b%c'); + }); + + it('preserves special characters (% / # ?) exactly as decoded by App Router', async () => { + const raw = 'conv/50%_a#b?c'; + const params = Promise.resolve({ slug: 'ds', convId: raw }); + const res = await GET(req(), { params }); + expect(res.status).toBe(200); + expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', raw); + }); + + it('returns 404 when the conversation is not found', async () => { + mockGetConversation.mockResolvedValueOnce(null); + const params = Promise.resolve({ slug: 'ds', convId: 'missing' }); + const res = await GET(req(), { params }); + expect(res.status).toBe(404); + }); +}); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts index 61672759..35f2fddf 100644 --- a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts @@ -26,7 +26,11 @@ export async function GET( ) { const { slug, convId } = await params; try { - const data = await getCachedConversation(slug, decodeURIComponent(convId)); + // App Router has already decoded the `[convId]` segment exactly once, so + // `convId` is the raw conversation id. 
The client (useDatasetConversation) + // encodeURIComponent-encodes it before the fetch; decoding again here would + // over-decode and mis-key ids containing '%' / '/'. Decode exactly once. + const data = await getCachedConversation(slug, convId); if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); return cachedJson(data); } catch (error) { diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx index 732b9ad1..5bc8fea9 100644 --- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx +++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx @@ -11,25 +11,32 @@ interface Props { export async function generateMetadata({ params }: Props): Promise { const { slug, convId } = await params; + // App Router has already decoded the dynamic segment exactly once, so `convId` + // is the raw conversation id here. Re-encode for the canonical URL. const short = convId.slice(0, 12); const title = `Conversation ${short} | ${slug}`; const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`; return { title, description, - alternates: { canonical: `${SITE_URL}/datasets/${slug}/conversations/${convId}` }, + alternates: { + canonical: `${SITE_URL}/datasets/${slug}/conversations/${encodeURIComponent(convId)}`, + }, robots: { index: false }, // per-conversation pages are too numerous to index }; } export default async function ConversationPage({ params }: Props) { const { slug, convId } = await params; + // `convId` is already decoded once by App Router — pass it straight through. + // A second decodeURIComponent here would over-decode (and throw for ids that + // contain a literal '%'). ConversationView re-encodes when it builds the API URL. return (
- +
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index fd6a42ae..8bd10c71 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -21,17 +21,6 @@ function isEnumValue>(e: T, v: string): v is T[ return (Object.values(e) as string[]).includes(v); } -const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; -const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; - -// Placeholder for the public (non-null) `effectiveSequence` during the window -// before availability has loaded. It must be a fixed-seq scenario — never -// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a -// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is -// the pre-agentic default for non-agentic models. Consumers that must not act on -// an unresolved sequence gate on `sequenceResolved` instead. -const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK; - import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; import { useUrlState } from '@/hooks/useUrlState'; @@ -50,6 +39,18 @@ import { resolveEffectiveSequence } from '@/lib/default-sequence'; import { useFeatureGate } from '@/lib/use-feature-gate'; import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api'; +const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; +const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; + +// Placeholder for the public (non-null) `effectiveSequence` during the window +// before availability has loaded. It must be a fixed-seq scenario — never +// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a +// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is +// the pre-agentic default for non-agentic models. Consumers that must not act on +// an unresolved sequence gate on `sequenceResolved` instead. 
+// (Declared after the import block so it never references `Sequence` above its import.) +const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK; + interface RunInfo { runId: string; runDate: string; diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index ccf0a944..609a4c8f 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -250,7 +250,7 @@ export function DatasetDetail({ slug }: { slug: string }) { > track('datasets_conversation_clicked', { slug })} className="font-mono text-xs text-primary hover:underline" > diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 1786c74d..18cb76d5 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -7,7 +7,7 @@ import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-reques import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; import { track } from '@/lib/analytics'; -import { requestsForPhase } from './phase-slice'; +import { sliceTimelineByPhase } from './phase-slice'; import { TimelineBars } from './timeline-bars'; import { formatDuration } from './timeline-format'; import { @@ -158,11 +158,16 @@ export function RequestTimelineView({ [data.requests], ); - // Apply phase filter, then group into rows. With no warmup data the filter - // collapses to "profiling" regardless of the (hidden) toggle state. + // Apply phase filter, then group into rows. 
Uses the SAME time-boundary + // slicing as the per-point charts (sliceTimelineByPhase) rather than the + // per-request phase LABEL, so the Gantt and the charts agree on exactly which + // requests belong to each phase (they diverge only when a warmup-labelled + // request starts after the first profiling request). With no warmup data the + // boundary is null and this is an identity passthrough — the filter collapses + // to "profiling" regardless of the (hidden) toggle state. const filtered = useMemo( - () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'), - [data.requests, phaseFilter, hasWarmup], + () => sliceTimelineByPhase(data, hasWarmup ? phaseFilter : 'profiling').requests, + [data, phaseFilter, hasWarmup], ); // Stable order/color per conversation (or worker), computed over the FULL // request set — NOT the phase-filtered subset — so a row keeps its position diff --git a/packages/app/src/components/inference/hooks/useChartData.test.ts b/packages/app/src/components/inference/hooks/useChartData.test.ts index 73582998..c4998add 100644 --- a/packages/app/src/components/inference/hooks/useChartData.test.ts +++ b/packages/app/src/components/inference/hooks/useChartData.test.ts @@ -1,6 +1,83 @@ import { describe, it, expect } from 'vitest'; -import { buildComparisonDates, filterByGPU, flipRooflineDirection } from './useChartData'; +import { + buildComparisonDates, + dedupeRowsToLatestPerConfig, + filterByGPU, + flipRooflineDirection, +} from './useChartData'; + +interface DedupeInput { + id: number; + hardware: string; + framework: string; + spec_method: string; + disagg: boolean; + precision: string; + offload_mode?: string | null; + date: string; +} + +const drow = (over: Partial = {}): DedupeInput => ({ + id: 1, + hardware: 'b300', + framework: 'vllm', + spec_method: 'none', + disagg: false, + precision: 'fp4', + offload_mode: 'off', + date: '2026-06-01', + ...over, +}); + +describe('dedupeRowsToLatestPerConfig', () => { + it('keeps 
only the latest date within a single series', () => { + const rows = [ + drow({ id: 1, date: '2026-06-01' }), + drow({ id: 2, date: '2026-06-03' }), + drow({ id: 3, date: '2026-06-02' }), + ]; + expect(dedupeRowsToLatestPerConfig(rows).map((r) => r.id)).toEqual([2]); + }); + + it('keeps BOTH offload variants even when they were ingested on different dates', () => { + // The regression: offload=on sweep landed LATER than offload=off. Without + // offload in the key, the on-variant's newer date would win the shared group + // and silently drop the (older) off-variant series entirely. + const rows = [ + drow({ id: 1, offload_mode: 'off', date: '2026-06-01' }), + drow({ id: 2, offload_mode: 'on', date: '2026-06-05' }), + ]; + const kept = dedupeRowsToLatestPerConfig(rows) + .map((r) => r.offload_mode) + .toSorted(); + expect(kept).toEqual(['off', 'on']); + }); + + it('still dedupes each offload variant to its own latest date', () => { + const rows = [ + drow({ id: 1, offload_mode: 'off', date: '2026-06-01' }), + drow({ id: 2, offload_mode: 'off', date: '2026-06-04' }), + drow({ id: 3, offload_mode: 'on', date: '2026-06-02' }), + drow({ id: 4, offload_mode: 'on', date: '2026-06-05' }), + ]; + expect( + dedupeRowsToLatestPerConfig(rows) + .map((r) => r.id) + .toSorted(), + ).toEqual([2, 4]); + }); + + it('normalizes a missing offload_mode to "off" (matches the SQL lineKey)', () => { + // A row with no offload_mode collides with an explicit offload=off row of the + // same config — both are the "off" series, so latest-date dedup applies. 
+ const rows = [ + drow({ id: 1, offload_mode: undefined, date: '2026-06-01' }), + drow({ id: 2, offload_mode: 'off', date: '2026-06-03' }), + ]; + expect(dedupeRowsToLatestPerConfig(rows).map((r) => r.id)).toEqual([2]); + }); +}); describe('buildComparisonDates', () => { it('returns empty when no GPUs selected (comparison disabled)', () => { diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 183641d4..f6596656 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -29,6 +29,7 @@ import { withPercentile, } from '@/lib/benchmark-transform'; import { Sequence, type Model } from '@/lib/data-mappings'; +import { isPersistedBenchmarkId } from '@/lib/benchmark-id'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; import { paretoFrontForDirection, type ParetoDirection } from '@/lib/chart-utils'; import { @@ -116,7 +117,7 @@ function e2eParetoIds( const ids = new Set(); for (const bucket of byGroup.values()) { for (const f of frontierFn(bucket)) { - if (typeof f.id === 'number') ids.add(f.id); + if (isPersistedBenchmarkId(f.id)) ids.add(f.id); } } return ids; @@ -166,6 +167,42 @@ export function flipRooflineDirection(dir: RooflineDirection): RooflineDirection return FLIP_MAP[dir]; } +/** The dedup key fields a chart series is identified by. */ +interface DedupeRow { + hardware: string; + framework: string; + spec_method: string; + disagg: boolean; + precision: string; + offload_mode?: string | null; + date: string; +} + +// offload_mode normalized `?? 'off'` to match the SQL layer's getBenchmarksForRun +// lineKey — agentic offload=on and offload=off are distinct series. +const dedupeSeriesKey = (r: DedupeRow): string => + `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}|${r.offload_mode ?? 
'off'}`; + +/** + * For each series — (hardware, framework, spec_method, disagg, precision, + * offload_mode) — keep only the rows from that series' most recent date. When + * parallelism settings change between runs, old config_ids create stale points + * under the same legend line; dropping all-but-latest removes them. + * + * Without `offload_mode` in the key, an offload=on sweep ingested on a LATER date + * than the offload=off sweep would win the shared group and silently drop the + * (earlier-dated) offload=off variant — a data-loss regression. + */ +export function dedupeRowsToLatestPerConfig(rows: T[]): T[] { + const maxDatePerGroup = new Map(); + for (const r of rows) { + const k = dedupeSeriesKey(r); + const cur = maxDatePerGroup.get(k); + if (!cur || r.date > cur) maxDatePerGroup.set(k, r.date); + } + return rows.filter((r) => r.date === maxDatePerGroup.get(dedupeSeriesKey(r))); +} + export function useChartData( selectedModel: Model, selectedSequence: Sequence, @@ -292,19 +329,10 @@ export function useChartData( rowToSequence(r) === selectedSequence; const seqFiltered = allRows.filter(seqFilter); - // For each (hw, framework, spec_method, disagg, precision) group, keep only - // rows from the most recent date. When parallelism settings change between runs, - // old config_ids create stale data points under the same legend line — drop them. - const maxDatePerGroup = new Map(); - for (const r of seqFiltered) { - const key = `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}`; - const cur = maxDatePerGroup.get(key); - if (!cur || r.date > cur) maxDatePerGroup.set(key, r.date); - } - const deduped = seqFiltered.filter((r) => { - const key = `${r.hardware}|${r.framework}|${r.spec_method}|${r.disagg}|${r.precision}`; - return r.date === maxDatePerGroup.get(key); - }); + // Keep only each series' latest-date rows (drops stale config_ids left behind + // when parallelism settings change between runs). 
Keyed per offload variant so + // an offload=on sweep can't hide a differently-dated offload=off series. + const deduped = dedupeRowsToLatestPerConfig(seqFiltered); const mainRows = deduped.map((r) => selectedRunDate ? { ...r, date: selectedRunDate, actualDate: r.date } : r, @@ -561,7 +589,7 @@ export function useChartData( const isOnE2eFrontier = e2eParetoSet === null ? undefined - : typeof d.id === 'number' && e2eParetoSet.has(d.id); + : isPersistedBenchmarkId(d.id) && e2eParetoSet.has(d.id); return { ...d, x: xValue, diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 6952f439..d6c86529 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -61,6 +61,7 @@ import { type DerivedAgenticMetric, } from '@/hooks/api/use-derived-agentic-metrics'; import { isAgenticOnlyXAxisMode, type XAxisMode } from '@/components/inference/hooks/useChartData'; +import { isPersistedBenchmarkId } from '@/lib/benchmark-id'; import { useTrendData } from '@/components/inference/hooks/useTrendData'; import { getHardwareConfig, hardwareKeyMatchesAnyBase } from '@/lib/constants'; @@ -428,7 +429,9 @@ export default function ChartDisplay() { const ids = new Set(); for (const graph of visibleGraphs) { for (const point of graph.data) { - if (point.benchmark_type === 'agentic_traces' && typeof point.id === 'number') { + // Overlay-only agentic points carry no persisted id — skip them so we + // never request `?ids=0`/`?ids=NaN` (which 400s and errors the chart). 
+ if (point.benchmark_type === 'agentic_traces' && isPersistedBenchmarkId(point.id)) { ids.add(point.id); } } @@ -461,7 +464,7 @@ export default function ChartDisplay() { }; const data = graph.data .map((point) => { - if (typeof point.id !== 'number') return null; + if (!isPersistedBenchmarkId(point.id)) return null; const raw = derivedSpec.value(derivedMetrics[point.id], selectedPercentile); if (raw === null || raw === undefined || !Number.isFinite(raw)) return null; return { ...point, x: derivedSpec.toX(raw) }; diff --git a/packages/app/src/components/inference/utils/legend-points-table.test.ts b/packages/app/src/components/inference/utils/legend-points-table.test.ts index b29cecbb..86d6f8b3 100644 --- a/packages/app/src/components/inference/utils/legend-points-table.test.ts +++ b/packages/app/src/components/inference/utils/legend-points-table.test.ts @@ -75,6 +75,15 @@ describe('pointDetailHref', () => { expect(pointDetailHref(pt(), false)).toEqual({ href: null, isExternal: false }); }); + it('does not build an /agentic/ link for a non-persisted id (0 / NaN)', () => { + // `typeof id === 'number'` accepted these; isPersistedBenchmarkId rejects + // them so we never link to /inference/agentic/0 or /inference/agentic/NaN. 
+ for (const badId of [0, Number.NaN]) { + const d = pt({ benchmark_type: 'agentic_traces', id: badId }); + expect(pointDetailHref(d, false)).toEqual({ href: null, isExternal: false }); + } + }); + it('overlay points never get a link (no DB benchmark id)', () => { const d = pt({ benchmark_type: 'agentic_traces', diff --git a/packages/app/src/components/inference/utils/legend-points-table.ts b/packages/app/src/components/inference/utils/legend-points-table.ts index 0457e7c2..87df2fcf 100644 --- a/packages/app/src/components/inference/utils/legend-points-table.ts +++ b/packages/app/src/components/inference/utils/legend-points-table.ts @@ -1,4 +1,5 @@ import { updateRepoUrl } from '@/lib/utils'; +import { isPersistedBenchmarkId } from '@/lib/benchmark-id'; import type { InferenceData } from '@/components/inference/types'; import { fmt, getPointLabel } from '@/components/inference/utils/tooltipUtils'; @@ -56,7 +57,7 @@ export function pointDetailHref( isOverlay: boolean, ): { href: string | null; isExternal: boolean } { if (isOverlay) return { href: null, isExternal: false }; - if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') { + if (d.benchmark_type === 'agentic_traces' && isPersistedBenchmarkId(d.id)) { return { href: `/inference/agentic/${d.id}`, isExternal: false }; } if (d.run_url) return { href: updateRepoUrl(d.run_url), isExternal: true }; diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts index e4b9d31f..8755fbe7 100644 --- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts +++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts @@ -159,6 +159,17 @@ describe('generateTooltipContent', () => { expect(html).not.toContain('data-action="view-charts" target='); }); + it('omits View charts when the point id is non-persisted (0 / NaN), even if pinned + hasTrace', () => { + // Overlay agentic points arrive with id 0 / NaN 
— the button would otherwise + // link to /inference/agentic/0, a doomed lookup. + for (const badId of [0, Number.NaN]) { + const html = generateTooltipContent( + tooltipConfig({ data: pt({ id: badId }), isPinned: true, hasTrace: true }), + ); + expect(html).not.toContain('data-action="view-charts"'); + } + }); + it('includes hardware display label from config', () => { const html = generateTooltipContent(tooltipConfig()); expect(html).toContain('H100'); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index 8f8ab4df..84398397 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -1,4 +1,5 @@ import { formatNumber, getDisplayLabel } from '@/lib/utils'; +import { isPersistedBenchmarkId } from '@/lib/benchmark-id'; import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types'; import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; @@ -142,7 +143,7 @@ const viewChartsButtonHTML = ( hasTraceData: boolean, pointId: number | undefined, ): string => { - if (!isPinned || !hasTraceData || typeof pointId !== 'number') return ''; + if (!isPinned || !hasTraceData || !isPersistedBenchmarkId(pointId)) return ''; return ` { const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker); expect(result!.metrics.p90_intvty).toBe(999); }); + + it('DELETES a stale artifact *_intvty when the matching *_itl is absent', () => { + // Artifact ships intvty (possibly the drifted p(1/ITL) definition) but no itl + // for that percentile. Passing it through would mix harness semantics into a + // column meant to be 1/p(ITL) everywhere — so the key must be removed, not kept. 
+ const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeAgenticRow({ p90_intvty: 42, p95_itl: 0.2 }), tracker); + expect(result!.metrics).not.toHaveProperty('p90_intvty'); // stale → deleted + expect(result!.metrics.p95_intvty).toBeCloseTo(5, 6); // derived from itl + }); + + it('DELETES a stale artifact *_intvty when the matching *_itl is zero/invalid', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0, p90_intvty: 42 }), tracker); + expect(result!.metrics).not.toHaveProperty('p90_intvty'); + }); }); /** diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index e3fb148e..5b00618a 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -255,10 +255,20 @@ export function mapBenchmarkRow( // keeping every agentic row on one definition. `std` is excluded — the // reciprocal of a standard deviation is meaningless. Mirrored in the frontend // overlay path (agenticAliases). + // + // When `*_itl` is absent/zero/invalid we must DELETE any artifact-supplied + // `*_intvty` rather than let it survive: keeping it would mix the harness's + // (possibly `p(1/ITL)`) definition into a column that's meant to be `1/p(ITL)` + // everywhere else. Downstream reads a missing key as "not recorded" + // (rowToAggDataEntry coerces `?? 0`; the legend table renders a dash). 
if (isAgentic) { for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { const itl = metrics[`${k}_itl`]; - if (typeof itl === 'number' && itl > 0) metrics[`${k}_intvty`] = 1 / itl; + if (typeof itl === 'number' && itl > 0) { + metrics[`${k}_intvty`] = 1 / itl; + } else { + delete metrics[`${k}_intvty`]; + } } } From b30dd21f0a095edc98eec7544f756a27ab2cac22 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 3 Jul 2026 01:30:03 -0500 Subject: [PATCH 37/40] style(inference): x-axis mode buttons use the SegmentedToggle tab recipe The mode row (TTFT / E2E Latency / Normalized E2E / Interactivity / Session Time / Prefill TPS) was hand-rolled chunky pills, visually disconnected from every other control. It now reuses SegmentedToggle - the repo's actual tab idiom (10+ call sites incl. the adjacent View Mode toggle) - sized up via buttonClassName since this is a primary control. Testids, track() events, tablist/tab aria semantics, and agentic-only visibility unchanged; net -12 lines. Cypress ttft-x-axis-toggle 8/8. --- .../components/inference/ui/ChartDisplay.tsx | 50 +++++++------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index d6c86529..37949de9 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -803,41 +803,29 @@ export default function ChartDisplay() { )} -
- {X_AXIS_MODE_BUTTONS.filter(({ value }) => { + { if (!isAgenticOnlyXAxisMode(value)) return true; // Before mount, render all buttons so SSR and first client render match. if (!mounted) return true; return isAgenticSequence; - }).map(({ value, label }) => { - const isActive = selectedXAxisMode === value; - return ( - - ); - })} -
+ }).map(({ value, label }) => ({ + value, + label, + testId: `x-axis-mode-${value}`, + }))} + onValueChange={(value) => { + setSelectedXAxisMode(value); + track('latency_x_axis_mode_selected', { mode: value }); + }} + ariaLabel="Chart x-axis metric" + testId="x-axis-mode-buttons" + className="flex-wrap justify-center gap-1.5 sm:gap-2" + buttonClassName="min-w-[130px] sm:min-w-[140px] flex-1 sm:flex-initial justify-center rounded-md px-4 py-2 text-sm font-semibold" + activeButtonClassName="bg-muted text-foreground shadow-sm" + inactiveButtonClassName="text-muted-foreground hover:bg-muted/50 hover:text-foreground" + />
{displayGraphs}
{/* Performance Over Time — Modal Drill-Down */} From 87f5dce94235eaf9cf9f3c29ffd2c240e0e23b93 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 3 Jul 2026 01:59:26 -0500 Subject: [PATCH 38/40] style(inference): x-axis mode row as browser-style underline tabs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the SegmentedToggle pills with an accent-underline tab strip (revived the unused Radix-backed ui/tabs.tsx primitive). Active tab gets the repo's established nav-tab underline token (border-secondary in light, dark:border-primary), plus a bg-muted/60 active fill so the minecraft theme — whose global 'button { border: 2px !important }' override suppresses the underline — still distinguishes the active tab. Testids, track() events, agentic-only visibility, and Radix a11y (role=tab/aria-selected/keyboard) all preserved; SegmentedToggle stays for the adjacent view-mode toggle. cypress 8/8, vitest 2381/2381. 
--- .../components/inference/ui/ChartDisplay.tsx | 45 +++++++++++-------- packages/app/src/components/ui/tabs.tsx | 34 +++++++------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 37949de9..7bc30ba9 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -31,6 +31,7 @@ import ScatterGraph from '@/components/inference/ui/ScatterGraph'; import { Card } from '@/components/ui/card'; import { ChartButtons } from '@/components/ui/chart-buttons'; import { type SegmentedToggleOption, SegmentedToggle } from '@/components/ui/segmented-toggle'; +import { Tabs, TabsList, TabsTrigger } from '@/components/ui/tabs'; import { ChartShareActions, MetricAssumptionNotes } from '@/components/ui/chart-display-helpers'; import { UnofficialDomainNotice } from '@/components/ui/unofficial-domain-notice'; import { exportToCsv } from '@/lib/csv-export'; @@ -803,29 +804,35 @@ export default function ChartDisplay() { )} - { - if (!isAgenticOnlyXAxisMode(value)) return true; - // Before mount, render all buttons so SSR and first client render match. 
- if (!mounted) return true; - return isAgenticSequence; - }).map(({ value, label }) => ({ - value, - label, - testId: `x-axis-mode-${value}`, - }))} onValueChange={(value) => { - setSelectedXAxisMode(value); + setSelectedXAxisMode(value as XAxisMode); track('latency_x_axis_mode_selected', { mode: value }); }} - ariaLabel="Chart x-axis metric" - testId="x-axis-mode-buttons" - className="flex-wrap justify-center gap-1.5 sm:gap-2" - buttonClassName="min-w-[130px] sm:min-w-[140px] flex-1 sm:flex-initial justify-center rounded-md px-4 py-2 text-sm font-semibold" - activeButtonClassName="bg-muted text-foreground shadow-sm" - inactiveButtonClassName="text-muted-foreground hover:bg-muted/50 hover:text-foreground" - /> + > + + {X_AXIS_MODE_BUTTONS.filter(({ value }) => { + if (!isAgenticOnlyXAxisMode(value)) return true; + // Before mount, render all buttons so SSR and first client render match. + if (!mounted) return true; + return isAgenticSequence; + }).map(({ value, label }) => ( + + {label} + + ))} + +
{displayGraphs}
{/* Performance Over Time — Modal Drill-Down */} diff --git a/packages/app/src/components/ui/tabs.tsx b/packages/app/src/components/ui/tabs.tsx index a54963a8..8b0f7e66 100644 --- a/packages/app/src/components/ui/tabs.tsx +++ b/packages/app/src/components/ui/tabs.tsx @@ -17,14 +17,11 @@ function Tabs({ className, ...props }: React.ComponentProps) { return ( -
-
- -
+ ); } @@ -34,24 +31,27 @@ function TabsTrigger({ className, ...props }: React.ComponentProps Date: Fri, 3 Jul 2026 02:31:01 -0500 Subject: [PATCH 39/40] style(inference): match x-axis tabs to top section-nav recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the card-fill + white active text; adopt tab-nav.tsx's flat underline-strip recipe verbatim — active = accent text (text-secondary dark:text-primary) + matching border-b-2 underline, no background, inactive = muted-foreground with border-only hover. The x-axis mode row now reads identically to the dashboard's top section tabs. cypress 8/8, vitest 2381. --- packages/app/src/components/ui/tabs.tsx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/packages/app/src/components/ui/tabs.tsx b/packages/app/src/components/ui/tabs.tsx index 8b0f7e66..4669e9e1 100644 --- a/packages/app/src/components/ui/tabs.tsx +++ b/packages/app/src/components/ui/tabs.tsx @@ -19,24 +19,27 @@ function TabsList({ className, ...props }: React.ComponentProps ); } +// Active/inactive recipe mirrors the top-of-page section nav +// (data-testid="chart-section-tabs" in src/components/tab-nav.tsx: tabLinkClass + +// currentTabClass) so the two tab rows read as the same flat underline-strip +// component: accent text + accent border-b-2 underline when active, muted text +// with no background fill when inactive, and a faint border highlight on hover. function TabsTrigger({ className, ...props }: React.ComponentProps) { return ( Date: Fri, 3 Jul 2026 04:05:50 -0500 Subject: [PATCH 40/40] style(inference): default high contrast + parallelism labels to OFF Reverts two of this PR's default-flips per product decision: high contrast (i_hc) and parallelism/advanced labels (i_advlabel) now default off. 
InferenceContext drops defaultHighContrast:true and flips the advlabel init to === '1'; both write-backs now encode ON as '1' / OFF as '' (matching i_linelabel), consistent with the unchanged PARAM_DEFAULTS so bare links render both off and i_hc=1/i_advlabel=1 still enable them. Specs updated; 62/62 affected e2e green. --- .../app/cypress/e2e/gradient-labels.cy.ts | 10 ++++--- packages/app/cypress/e2e/url-params.cy.ts | 26 +++++++++++++++---- .../components/inference/InferenceContext.tsx | 14 +++++----- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts index a0753e90..9c3d3274 100644 --- a/packages/app/cypress/e2e/gradient-labels.cy.ts +++ b/packages/app/cypress/e2e/gradient-labels.cy.ts @@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => { cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels'); }); - it('Parallelism Labels toggle is on by default', () => { - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); + it('Parallelism Labels toggle is off by default', () => { + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked'); }); it('per-point labels are visible by default (gradient labels off)', () => { @@ -60,7 +60,7 @@ describe('Gradient Labels Toggle', () => { }); it('both toggles can be enabled simultaneously', () => { - // Parallelism Labels is on by default; ensure it's on, then turn on Gradient. + // Parallelism Labels is off by default; turn it on, then turn on Gradient. 
cy.get('#scatter-parallelism-labels').then(($el) => { if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); }); @@ -71,8 +71,10 @@ describe('Gradient Labels Toggle', () => { cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Reset gradient for next tests (parallelism stays at its default-on). + // Reset both for next tests (each subsequent test does a fresh cy.visit, + // but keep state tidy here too). cy.get('#scatter-gradient-labels').click(); + cy.get('#scatter-parallelism-labels').click(); }); it('URL param i_gradlabel=1 enables gradient labels on load', () => { diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 927aee5f..6c827218 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -236,10 +236,10 @@ describe('URL Parameter Persistence', () => { }); describe('High contrast mode', () => { - it('inference loads with high contrast on by default', () => { + it('inference loads with high contrast off by default', () => { visitWithDismissedModal('/inference'); cy.get('[data-testid="scatter-graph"]').should('exist'); - cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); }); it('i_hc=0 disables high contrast on load', () => { @@ -273,12 +273,12 @@ describe('URL Parameter Persistence', () => { cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); - it('historical trends tab shares the inference high-contrast default (on)', () => { + it('historical trends tab shares the inference high-contrast default (off)', () => { // Historical reads highContrast from the same InferenceContext as the - // scatter chart, so it inherits the default-on behavior. 
+ // scatter chart, so it inherits the default-off behavior. visitWithDismissedModal('/historical'); cy.get('[data-testid="historical-trends-display"]').should('exist'); - cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked'); + cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); }); it('i_hc=1 enables historical trends high contrast', () => { @@ -287,4 +287,20 @@ describe('URL Parameter Persistence', () => { cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); }); + + describe('Default toggle states (share-link correctness)', () => { + it('a bare /inference link with neither param renders high contrast AND parallelism labels off', () => { + visitWithDismissedModal('/inference'); + cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked'); + }); + + it('i_hc=1&i_advlabel=1 enables both high contrast and parallelism labels on load', () => { + visitWithDismissedModal('/inference?i_hc=1&i_advlabel=1'); + cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked'); + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); + }); + }); }); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index b9cbc7ce..6d6ad19d 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -241,8 +241,6 @@ export function InferenceProvider({ const dataQuickFilters = activeTab === 'historical' ? 
EMPTY_QUICK_FILTERS : quickFilters; const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({ urlPrefix: 'i_', - // Inference chart defaults to high contrast (?i_hc=0 overrides off). - defaultHighContrast: true, }); const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0'); @@ -252,14 +250,14 @@ export function InferenceProvider({ if (getUrlParam('i_nolabel') === '1') return false; if (getUrlParam('i_label') === '0') return false; if (getUrlParam('i_label') === '1') return true; - // Default on: parallelism labels (also default on) are point labels and - // are pointless without them shown. + // Default on: point labels (TP + concurrency, or the fuller parallelism + // breakdown when Parallelism Labels is toggled on) are useful either way. return true; }); const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1'); - // Parallelism labels default on (?i_advlabel=0 overrides off). + // Parallelism labels default off (?i_advlabel=1 overrides on). const [useAdvancedLabels, setUseAdvancedLabels] = useState( - () => getUrlParam('i_advlabel') !== '0', + () => getUrlParam('i_advlabel') === '1', ); const [showGradientLabels, setShowGradientLabels] = useState( () => getUrlParam('i_gradlabel') === '1', @@ -1042,14 +1040,14 @@ export function InferenceProvider({ i_dend: selectedDateRange.endDate, i_optimal: hideNonOptimal ? '' : '0', i_label: showPointLabels ? '' : '0', - i_hc: highContrast ? '' : '0', + i_hc: highContrast ? '1' : '', i_log: logScale ? '1' : '', i_xmetric: selectedXAxisMetric || '', i_e2e_xmetric: selectedE2eXAxisMetric || '', i_xmode: selectedXAxisMode, i_scale: scaleType, i_legend: isLegendExpanded ? '' : '0', - i_advlabel: useAdvancedLabels ? '' : '0', + i_advlabel: useAdvancedLabels ? '1' : '', i_gradlabel: showGradientLabels ? '1' : '', i_linelabel: showLineLabels ? '1' : '', i_speed: showSpeedOverlay ? '1' : '',