diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md new file mode 100644 index 00000000..10e37d6c --- /dev/null +++ b/.claude/agents/ingest.md @@ -0,0 +1,196 @@ +--- +name: ingest +description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL. +tools: Bash, Read, Edit, Write +--- + +You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`. + +## Environment + +- **Repo root**: `/Users/quilicic/InferenceX-app` +- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements: + - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname. + - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`. +- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000) +- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app` +- **INVALIDATE_SECRET** lives in repo root `.env` under that key. +- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var. 
+ +## Standard ingest + +```bash +cd /Users/quilicic/InferenceX-app/packages/db +DATABASE_WRITE_URL='' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts --download SemiAnalysisAI/InferenceX +``` + +Then refresh the materialized view (the script's auto-refresh sometimes races): +`REFRESH MATERIALIZED VIEW latest_benchmarks;` + +## Cache purge (always do after any DB mutation) + +```bash +SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2 | tr -d '"') +# Localhost (port 3002, NOT 3000) +curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate +# Preview +mkdir -p /tmp/vp && cd /tmp/vp \ + && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \ + && vercel curl /api/v1/invalidate \ + --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \ + --yes -- -sS -X POST -H "Authorization: Bearer $SECRET" +rm -rf /tmp/vp +``` + +## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision)) + +```sql +BEGIN; +DELETE FROM benchmark_results br USING configs c +WHERE c.id = br.config_id + AND c.model = '' AND c.hardware = '' AND c.framework = '' + AND c.precision = '' AND br.benchmark_type = ''; +DELETE FROM availability +WHERE model = '' AND hardware = '' AND framework = '' + AND precision = '' AND benchmark_type = ''; +COMMIT; +``` + +If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. Don't do this unless asked. + +## AIPerf tagging — DO NOT use by default + +AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. 
Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision). + +Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`. + +
+Explicit-request-only: how to tag a run as `spec_decoding='aiperf'` + +```bash +RID= +TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX) +cd $TMPDIR + +# 1. Logical-name dedup + download +gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \ + --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \ + | python3 -c " +import sys, re, collections +seen = collections.OrderedDict() +for line in sys.stdin: + name, url, created = line.rstrip('\n').split('\t') + key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name) + if key not in seen or seen[key][2] < created: + seen[key] = (name, url, created) +for _, (name, url, _) in seen.items(): + print(f'{name}\t{url}') +" > artifacts.tsv +while IFS=$'\t' read -r name url; do + mkdir -p "$name" + gh api "$url" > "$name/a.zip" 2>/dev/null + unzip -oq "$name/a.zip" -d "$name" 2>/dev/null + rm "$name/a.zip" +done < artifacts.tsv + +# 2. Patch every benchmark JSON to set spec_decoding=aiperf +find $TMPDIR -name "*.json" | python3 -c " +import sys, json +for fn in (l.strip() for l in sys.stdin): + try: + with open(fn) as f: d = json.load(f) + except Exception: continue + rows = d if isinstance(d, list) else [d] + if not rows or not isinstance(rows[0], dict): continue + changed = False + for row in rows: + if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row): + row['spec_decoding'] = 'aiperf' + changed = True + if changed: + with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f) +" + +# 3. Ingest in CI mode (reads INGEST_* env vars) +cd /Users/quilicic/InferenceX-app/packages/db +INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \ +DATABASE_WRITE_URL='' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts +rm -rf $TMPDIR +``` + +The `spec_method` column has a lowercase check constraint — always lowercase. + +
+ +## Don't auto-mention "AIPerf" in changelog entries + +Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`. + +## Adding a perf changelog entry — MANDATORY for every ingest + +**You MUST ALWAYS add a changelog entry for every run you ingest. This is not optional.** Every standard ingest, delete+reingest, and partial ingest gets exactly one changelog entry. Never finish an ingest without one. + +- If the user gave changelog text, use it verbatim (when the text contains a hardware placeholder, substitute the run's hardware SKU for it). +- If the user did NOT specify text, DO NOT skip the changelog — derive a sensible description from the run name (see convention below) and add it anyway, then tell the user what you used so they can adjust. + +Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the dash-separated format `<…>-<precision>-<…>-<…>` (the second component must be the precision; it matches what the user actually sees in the filter chain). + +```sql +INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link) +SELECT id, date, '', '', ARRAY['<config_key>'], '<description>', NULL +FROM latest_workflow_runs WHERE github_run_id = <github_run_id> +RETURNING id, workflow_run_id, date::text, description; +``` + +Description convention from prior entries: `<HW> <Model> Ingest #<N> (<optional note>)` — e.g. + +- `B200 Kimi Ingest #1` +- `MI355X Kimi Ingest #2` +- `H200 Kimi Ingest #1 (mmap cache)` + +If the user doesn't specify a description, DO NOT skip the entry and DO NOT block on asking — derive a description from the run name, add the entry, and report what you used so the user can adjust. 
+ +## Common gotchas + +- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <workflow_run_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`. +- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT. +- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites. +- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the trailing `_<runner>_<N>` suffix. +- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection. +- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with the prior full sweep via `UPDATE benchmark_results SET date = '<prior-sweep-date>'` (scoped with a `WHERE` clause to the re-run's rows — never run it unscoped) so the frontend's max-date-per-group dedup doesn't drop the older sweep. +- **Agentic interactivity normalization (`*_intvty`)**: for `agentic_traces` runs, interactivity MUST be the slow-tail reciprocal of the ITL percentile — `*_intvty = 1/*_itl` (so `p90_intvty = 1/p90_itl`). Some harness versions emit `*_intvty` as `p(1/ITL)` instead (fast-tail — inverts percentile order, e.g. p90 shows ~`1/p10(ITL)`), which silently contaminates cross-run Pareto comparisons. The ingest mapper (`benchmark-mapper.ts`) now **derives `*_intvty` from `*_itl` and discards the artifact's value** for agentic rows, so a normal ingest is self-correcting — no manual step needed. 
The frontend `agenticAliases` does the same for overlay / `?unofficialrun=` rows. If you ever load agentic data through a path that bypasses the mapper, run `pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes` (idempotent; rewrites `*_intvty = 1/*_itl` for mean/p75/p90/p95) then refresh the MV + purge cache. `std_intvty` is intentionally left alone (the reciprocal of a std is meaningless; the API strips it anyway). + +## Process + +1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<run-id> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips). +2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding. +3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line. +4. **Refresh materialized view**. +5. **Add changelog entry — ALWAYS, MANDATORY.** Every ingest gets exactly one changelog entry (see "Adding a perf changelog entry — MANDATORY"). Use the user's text if given (substituting the run's hardware SKU for any hardware placeholder); otherwise derive one from the run name and add it anyway. Never skip this step. +6. **Purge both caches** (localhost 3002 + preview — never port 3000). +7. **Report** the row count, date, hardware, run id, and the changelog id (always present). + +## Related: ingesting agentic _datasets_ (not benchmark runs) + +This agent ingests **benchmark runs**. 
The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow: + +```bash +cd packages/db && DATABASE_WRITE_URL='' \ + pnpm exec tsx src/ingest-weka-dataset.ts \ + [--label "…"] [--variant full|256k] [--description "…"] [--limit N] +``` + +It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). + +New agentic benchmark artifacts preserve AIPerf's `metadata.dataset` provenance as a top-level `dataset` object. Standard benchmark ingest automatically derives the dataset slug from `dataset.hf_dataset_name` and upserts `run_datasets`; do not manually backfill that mapping for new-format runs. Manual mapping is only needed for legacy artifacts that do not contain dataset provenance. + +## Don't + +- Don't push to git unless the user asked. +- Don't ingest without permission if it's a delete+reingest of existing data. +- Don't hit port 3000 for cache purge — it's a different project. +- Don't capitalize `spec_method` values (DB has a lowercase check constraint). diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 00000000..513a873e --- /dev/null +++ b/.eslintignore @@ -0,0 +1,3 @@ +# Stale agent worktrees produced by parallel Claude Code sessions — they +# hold their own branches and are linted as part of their own runs. 
+.claude/worktrees/ diff --git a/.github/workflows/ingest-agentic-results.yml b/.github/workflows/ingest-agentic-results.yml new file mode 100644 index 00000000..fab99f5d --- /dev/null +++ b/.github/workflows/ingest-agentic-results.yml @@ -0,0 +1,233 @@ +name: Ingest Agentic Benchmark Results + +# Dispatched from the main InferenceX repo at the end of an agentic (AgentX +# trace-replay) sweep, mirroring the fixed-seq-len `ingest-results` dispatch: +# +# curl -sSf -X POST \ +# -H "Authorization: Bearer $INFX_FRONTEND_PAT" \ +# -H "Accept: application/vnd.github+json" \ +# https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \ +# -d '{"event_type": "ingest-agentic-results", +# "client_payload": {"run-id": "<run-id>", "run-attempt": "<run-attempt>", +# "database-target": "production"}}' +# +# The ingest script (packages/db/src/ingest-ci-run.ts) auto-detects agentic +# artifacts: benchmark rows land in benchmark_results (benchmark_type= +# 'agentic_traces'), raw profile exports + server metrics land in the +# agentic_trace_replay sidecar with precomputed chart/timeline JSONBs, the +# run is linked to its dataset in run_datasets, and changelog-metadata is +# ingested when present. This is a separate workflow from ingest-results.yml +# because agentic ingests are blob-heavy (100MB+ gzipped profile exports per +# high-concurrency point) and need a much longer timeout, plus +# agentic-specific alerting (missing dataset slug). 
+ +on: + repository_dispatch: + types: [ingest-agentic-results] + workflow_dispatch: + inputs: + run-id: + description: InferenceX Actions run ID to ingest + required: true + type: string + run-attempt: + description: InferenceX Actions run attempt to ingest + required: false + default: '1' + type: string + database-target: + description: Database/cache target for the ingest + required: false + default: production + type: choice + options: + - production + - dev + - agentx-v1 + +jobs: + ingest: + # Blob-heavy: uploading trace-replay sidecars for a ~20-point sweep takes + # far longer than a fixed-seq-len ingest. + timeout-minutes: 60 + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Wait for source run to finish + if: github.event_name != 'workflow_dispatch' + run: sleep 300 + + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8 + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: '24' + cache: pnpm + - name: Install dependencies + run: pnpm install --filter @semianalysisai/inferencex-db... 
+ env: + CYPRESS_INSTALL_BINARY: '0' + + - name: Select ingest target + env: + REQUESTED_DATABASE_TARGET: ${{ github.event.client_payload.database-target || inputs.database-target || 'production' }} + DATABASE_WRITE_URL_PRODUCTION: ${{ secrets.DATABASE_WRITE_URL }} + DATABASE_WRITE_URL_DEV: ${{ secrets.DATABASE_DEV_WRITE_URL }} + DATABASE_WRITE_URL_AGENTX_V1: ${{ secrets.DATABASE_AGENTX_V1_WRITE_URL }} + run: | + case "$REQUESTED_DATABASE_TARGET" in + production) + database_write_url="$DATABASE_WRITE_URL_PRODUCTION" + cache_invalidate_url="https://inferencex.semianalysis.com/api/v1/invalidate" + ;; + dev) + database_write_url="$DATABASE_WRITE_URL_DEV" + cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate" + ;; + agentx-v1) + database_write_url="$DATABASE_WRITE_URL_AGENTX_V1" + cache_invalidate_url="https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app/api/v1/invalidate" + ;; + *) + echo "::error::Unsupported database-target: $REQUESTED_DATABASE_TARGET" + exit 1 + ;; + esac + + if [ -z "$database_write_url" ]; then + echo "::error::Database secret is empty for target: $REQUESTED_DATABASE_TARGET" + exit 1 + fi + + echo "::add-mask::$database_write_url" + echo "DATABASE_WRITE_URL=$database_write_url" >> "$GITHUB_ENV" + echo "INGEST_DATABASE_TARGET=$REQUESTED_DATABASE_TARGET" >> "$GITHUB_ENV" + echo "CACHE_INVALIDATE_URL=$cache_invalidate_url" >> "$GITHUB_ENV" + echo "Selected ingest target: $REQUESTED_DATABASE_TARGET" + echo "Cache invalidate URL: $cache_invalidate_url" + + - name: Run migrations + run: pnpm admin:db:migrate --yes + + - name: Download artifacts from InferenceX run + env: + GH_TOKEN: ${{ secrets.INFX_MAIN_PAT }} + RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} + ARTIFACTS_PATH: ${{ github.workspace }}/artifacts + run: | + mkdir -p "$ARTIFACTS_PATH" + + # Download all artifacts for the run, deduplicated by name (keep latest). 
+ gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/${RUN_ID}/artifacts" --paginate \ + | jq -r ' + [.artifacts[]] + | group_by(.name) | map(sort_by(.created_at) | last)[] + | "\(.name)\t\(.archive_download_url)"' \ + | while IFS=$'\t' read -r name url; do + echo "Downloading artifact: ${name}" + ok=false + for attempt in 1 2 3; do + if gh api "${url}" > artifact.zip; then + ok=true + break + fi + echo " Attempt ${attempt}/3 failed, retrying in ${attempt}s..." + sleep "$attempt" + done + if [ "$ok" = false ]; then + echo "::warning::Failed to download artifact after 3 attempts: ${name} — skipping" + rm -f artifact.zip + echo "$name" >> "$ARTIFACTS_PATH/.failures" + continue + fi + mkdir -p "${ARTIFACTS_PATH}/${name}" + if ! unzip -o artifact.zip -d "${ARTIFACTS_PATH}/${name}"; then + echo "::warning::Failed to extract artifact: ${name} — skipping" + rm -rf "${ARTIFACTS_PATH:?}/${name}" + echo "$name" >> "$ARTIFACTS_PATH/.failures" + fi + rm -f artifact.zip + done + + if [ -f "$ARTIFACTS_PATH/.failures" ]; then + count=$(wc -l < "$ARTIFACTS_PATH/.failures") + rm "$ARTIFACTS_PATH/.failures" + echo "::warning::${count} artifact(s) failed to download; ingesting what's available" + fi + + echo "Downloaded artifacts:" + ls "$ARTIFACTS_PATH/" + + if [ -z "$(ls -A "$ARTIFACTS_PATH")" ]; then + echo "::error::No artifacts could be downloaded from run ${RUN_ID}" + exit 1 + fi + + - name: Ingest results to DB + env: + GITHUB_TOKEN: ${{ secrets.INFX_MAIN_PAT }} + INGEST_RUN_ID: ${{ github.event.client_payload.run-id || inputs.run-id }} + INGEST_RUN_ATTEMPT: ${{ github.event.client_payload.run-attempt || inputs.run-attempt }} + INGEST_ARTIFACTS_PATH: ${{ github.workspace }}/artifacts + INGEST_REPO: SemiAnalysisAI/InferenceX + UNMAPPED_ENTITIES_OUTPUT: ${{ github.workspace }}/unmapped-entities.json + run: pnpm admin:db:ingest:ci + + - name: Apply run overrides + run: pnpm admin:db:apply-overrides --yes + + - name: Verify database + run: pnpm admin:db:verify + + - name: 
Invalidate Vercel cache + env: + VERCEL_INVALIDATE_SECRET: ${{ secrets.VERCEL_INVALIDATE_SECRET }} + run: | + curl -sSf -X POST "$CACHE_INVALIDATE_URL" \ + -H "Authorization: Bearer $VERCEL_INVALIDATE_SECRET" || true + + - name: Check for unmapped entities + if: always() + id: unmapped + run: | + f="${{ github.workspace }}/unmapped-entities.json" + if [ -f "$f" ]; then + echo "found=true" >> "$GITHUB_OUTPUT" + models=$(jq -r '.models // [] | join(", ")' "$f") + hardware=$(jq -r '.hardware // [] | join(", ")' "$f") + precisions=$(jq -r '.precisions // [] | join(", ")' "$f") + datasets=$(jq -r '.datasets // [] | join(", ")' "$f") + msg="" + [ -n "$models" ] && msg="${msg}Models: ${models}\n" + [ -n "$hardware" ] && msg="${msg}Hardware: ${hardware}\n" + [ -n "$precisions" ] && msg="${msg}Precisions: ${precisions}\n" + [ -n "$datasets" ] && msg="${msg}Datasets missing from datasets table (run ingest-weka-dataset): ${datasets}\n" + { + echo 'summary<> "$GITHUB_OUTPUT" + fi + + - name: Notify Slack on unmapped entities + if: steps.unmapped.outputs.found == 'true' + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL }} + webhook-type: incoming-webhook + payload: | + { + "text": ":warning: *Unrecognized entities during agentic ingest*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n```${{ steps.unmapped.outputs.summary }}```\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" + } + + - name: Notify Slack on failure + if: failure() + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL }} + webhook-type: incoming-webhook + payload: | + { + "text": ":rotating_light: *Agentic ingest workflow failed*\nRun ID: ${{ github.event.client_payload.run-id || inputs.run-id }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id 
}}|View run>" + } diff --git a/.gitignore b/.gitignore index a86f6e23..c52b0482 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ # next.js **/.next +**/.next-* **/out # production @@ -71,3 +72,4 @@ C:* # python bytecode (e.g. .claude/skills/*/iso-interactivity.py imports) **/__pycache__/ **/*.pyc +.playwright-mcp/ diff --git a/.oxlintrc.json b/.oxlintrc.json index ff610e51..5a03a5a0 100644 --- a/.oxlintrc.json +++ b/.oxlintrc.json @@ -28,6 +28,7 @@ "no-undef": "off", "no-underscore-dangle": "off", "no-useless-undefined": "off", + "require-unicode-regexp": "off", "no-warning-comments": "off", "prefer-destructuring": "off", "sort-imports": "off", diff --git a/docs/data-pipeline.md b/docs/data-pipeline.md index 38e7d471..bc439e8a 100644 --- a/docs/data-pipeline.md +++ b/docs/data-pipeline.md @@ -62,6 +62,18 @@ Configs are preloaded into an in-memory Map at ingest start. `getOrCreateConfig( Unmapped models/hardware are tracked (not silently dropped) so operators can see what new GPU or model names appeared in CI artifacts. This is how new GPUs get added to the system — the skip tracker acts as a change detection mechanism. +### Server-Metric Orchestrator Adapters + +AIPerf defines the `server_metrics_export.json` envelope, but labels such as worker role and rank belong to the serving orchestrator. The chart-series ETL therefore normalizes raw series through an orchestrator-specific adapter before exposing per-worker metrics. For example, the Dynamo adapter maps `dynamo_component=prefill|backend` to canonical `prefill|decode` roles and uses the endpoint, worker ID, DP rank, and engine together as the source identity. + +Adapters are selected from the benchmark's canonical framework, and per-worker series are only emitted for disaggregated configs with a recognized adapter. Unknown orchestrators and non-disaggregated configs retain their aggregate-only series; roles are never guessed from ports or metric names. 
The frontend only consumes the canonical source identity and never interprets orchestrator-native labels. + +### Agentic Dataset Provenance + +AIPerf exports public-dataset provenance in `metadata.dataset`, including the Hugging Face dataset ID. InferenceX preserves that object as `dataset` on each agentic aggregate benchmark row. During benchmark ingest, `ingest-ci-run.ts` derives the dashboard slug from `hf_dataset_name` (for example, `semianalysisai/cc-traces-weka-062126` becomes `cc-traces-weka-062126`) and upserts `run_datasets` for the workflow run. + +Legacy artifacts without provenance leave any existing mapping untouched. A workflow run can map to only one dataset; conflicting dataset IDs fail ingest rather than silently linking the run to an arbitrary dataset. + ## Frontend Transform Pipeline ### Why transformBenchmarkRows Exists diff --git a/packages/app/cypress/component/chart-legend.cy.tsx b/packages/app/cypress/component/chart-legend.cy.tsx index 4a362c2b..535a0053 100644 --- a/packages/app/cypress/component/chart-legend.cy.tsx +++ b/packages/app/cypress/component/chart-legend.cy.tsx @@ -1,5 +1,8 @@ import { useState } from 'react'; +import LegendPointsDialog from '@/components/inference/ui/LegendPointsDialog'; +import type { InferenceData } from '@/components/inference/types'; +import { buildLegendPointsRows } from '@/components/inference/utils/legend-points-table'; import ChartLegend, { type CommonLegendItemProps } from '@/components/ui/chart-legend'; const MOCK_ITEMS: CommonLegendItemProps[] = [ @@ -119,4 +122,146 @@ describe('ChartLegend (sidebar variant)', () => { .click(); cy.get('.sidebar-legend').should('not.have.class', 'bg-accent'); }); + + it('renders no points-table icon when items have no onShowPoints handler', () => { + cy.get('[data-testid^="legend-points-"]').should('not.exist'); + }); +}); + +// --------------------------------------------------------------------------- +// Per-series points table (inference legend drill-down) +// 
--------------------------------------------------------------------------- + +function mockPoint(overrides: Partial = {}): InferenceData { + return { + date: '2025-06-15', + x: 100, + y: 500, + tp: 8, + conc: 16, + hwKey: 'b300-sxm', + precision: 'fp4', + tput_per_gpu: 1500.5, + median_intvty: 45.2, + p90_intvty: 38.1, + median_ttft: 0.42, + p90_ttft: 0.87, + tpPerGpu: { y: 1500.5, roof: false }, + tpPerMw: { y: 50, roof: false }, + costh: { y: 1, roof: false }, + costn: { y: 1, roof: false }, + costr: { y: 1, roof: false }, + costhi: { y: 1, roof: false }, + costni: { y: 1, roof: false }, + costri: { y: 1, roof: false }, + ...overrides, + } as InferenceData; +} + +const OFFICIAL_POINTS: InferenceData[] = [ + mockPoint({ conc: 32, benchmark_type: 'agentic_traces', id: 206863, offload_mode: 'on' }), + mockPoint({ conc: 4, benchmark_type: 'agentic_traces', id: 206860, offload_mode: 'off' }), +]; + +const OVERLAY_POINTS: InferenceData[] = [ + mockPoint({ conc: 8, run_url: 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/1' }), +]; + +/** Mirrors ScatterGraph's wiring: legend rows with onShowPoints → dialog. */ +function LegendWithPointsTable() { + const [openSeries, setOpenSeries] = useState<'official' | 'overlay' | null>(null); + + const items: CommonLegendItemProps[] = [ + { + name: 'b300-sxm', + hw: 'b300-sxm', + label: 'B300 (vLLM)', + color: '#2b83ba', + isActive: true, + onClick: () => {}, + onShowPoints: () => setOpenSeries('official'), + }, + { + name: '✕ unofficial-run-99', + hw: 'overlay-run-99', + label: '✕ my-branch', + color: '#dc2626', + isActive: true, + onClick: () => {}, + onShowPoints: () => setOpenSeries('overlay'), + }, + ]; + + const isOverlay = openSeries === 'overlay'; + return ( + <> + {}} + variant="sidebar" + /> + {openSeries && ( + { + if (!open) setOpenSeries(null); + }} + title={isOverlay ? '✕ my-branch' : 'B300 (vLLM)'} + subtitle="DeepSeek V4 Pro · Agentic Traces" + accentColor={isOverlay ? 
'#dc2626' : '#2b83ba'} + rows={buildLegendPointsRows(isOverlay ? OVERLAY_POINTS : OFFICIAL_POINTS, isOverlay)} + isOverlay={isOverlay} + /> + )} + + ); +} + +describe('ChartLegend points-table icon + dialog', () => { + beforeEach(() => { + cy.mount(); + }); + + it('renders the icon only for rows with an onShowPoints handler', () => { + cy.get('[data-testid="legend-points-b300-sxm"]').should('exist'); + cy.get('[data-testid="legend-points-overlay-run-99"]').should('exist'); + }); + + it('opens the dialog with the series points sorted by concurrency, with row links', () => { + cy.get('[data-testid="legend-points-b300-sxm"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('be.visible'); + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'B300 (vLLM)'); + cy.get('[data-testid="legend-points-dialog"]').should( + 'contain.text', + 'DeepSeek V4 Pro · Agentic Traces', + ); + // Two rows, conc ascending, linked to the agentic detail pages + cy.get('[data-testid="legend-points-row"]').should('have.length', 2); + cy.get('a[data-testid="legend-points-row"]') + .first() + .should('have.attr', 'href', '/inference/agentic/206860'); + cy.get('a[data-testid="legend-points-row"]').first().should('contain.text', '4'); + // Offload column present for agentic rows + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'Offload'); + }); + + it('overlay series opens a link-free table with the metrics-only caption', () => { + cy.get('[data-testid="legend-points-overlay-run-99"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '✕ my-branch'); + cy.get('a[data-testid="legend-points-row"]').should('not.exist'); + cy.get('div[data-testid="legend-points-row"]').should('have.length', 1); + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', 'metrics only'); + // Metrics still render + cy.get('[data-testid="legend-points-dialog"]').should('contain.text', '1500.5'); + }); + + it('dialog 
closes and can be reopened', () => { + cy.get('[data-testid="legend-points-b300-sxm"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('be.visible'); + cy.get('body').type('{esc}'); + cy.get('[data-testid="legend-points-dialog"]').should('not.exist'); + cy.get('[data-testid="legend-points-overlay-run-99"]').click(); + cy.get('[data-testid="legend-points-dialog"]').should('be.visible'); + }); }); diff --git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx new file mode 100644 index 00000000..f7cfcb9a --- /dev/null +++ b/packages/app/cypress/component/dataset-list.cy.tsx @@ -0,0 +1,93 @@ +import { QueryClient, QueryClientProvider } from '@tanstack/react-query'; +import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime'; + +import { DatasetList } from '@/components/datasets/dataset-list'; +import type { DatasetRecord } from '@/hooks/api/use-datasets'; + +const datasets: DatasetRecord[] = [ + { + id: 'ds-1', + slug: 'cc-traces-weka-full', + label: 'cc-traces-weka (full)', + variant: 'full', + description: 'Every captured request, unmodified.', + hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full', + license: 'apache-2.0', + conversation_count: 1234, + summary: { + totalIn: 5_000_000, + totalOut: 250_000, + cachedPct: 0.82, + mainTurns: 9800, + subagentGroups: 540, + }, + ingested_at: '2026-06-20T00:00:00Z', + }, + { + id: 'ds-2', + slug: 'cc-traces-weka-256k', + label: 'cc-traces-weka (256k)', + variant: '256k', + description: 'Turns trimmed to a 256k context window.', + hf_url: null, + license: 'apache-2.0', + conversation_count: 980, + summary: { + totalIn: 3_200_000, + totalOut: 180_000, + cachedPct: 0.79, + mainTurns: 7600, + subagentGroups: 410, + }, + ingested_at: '2026-06-19T00:00:00Z', + }, +]; + +function createMockRouter() { + return { + push: cy.stub(), + replace: cy.stub(), + refresh: cy.stub(), + back: cy.stub(), + forward: 
cy.stub(), + prefetch: cy.stub().resolves(), + }; +} + +function mountList() { + const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } }); + cy.mount( + + + + + , + ); +} + +describe('DatasetList', () => { + it('renders a card per dataset with its summary stats', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list'); + mountList(); + cy.wait('@list'); + cy.contains('cc-traces-weka (full)').should('be.visible'); + cy.contains('cc-traces-weka (256k)').should('be.visible'); + cy.contains('1,234').should('be.visible'); // conversation_count, localized + cy.contains('82%').should('be.visible'); // cachedPct + cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist'); + }); + + it('shows the empty state when no datasets are ingested', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty'); + mountList(); + cy.wait('@empty'); + cy.contains('No datasets ingested yet.').should('be.visible'); + }); + + it('shows the error state when the request fails', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err'); + mountList(); + cy.wait('@err'); + cy.contains('Failed to load datasets.').should('be.visible'); + }); +}); diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx new file mode 100644 index 00000000..511505b9 --- /dev/null +++ b/packages/app/cypress/component/distribution-card.cy.tsx @@ -0,0 +1,82 @@ +import { DistributionCard } from '@/components/datasets/distribution-card'; +import type { Distribution } from '@/hooks/api/use-datasets'; + +const distribution: Distribution = { + bins: [ + { x0: 0, x1: 100, count: 5 }, + { x0: 100, x1: 200, count: 20 }, + { x0: 200, x1: 300, count: 12 }, + { x0: 300, x1: 400, count: 3 }, + ], + stats: { + count: 40, + min: 10, + max: 390, + mean: 180, + median: 175, + p75: 250, + p90: 320, + p95: 
360, + }, +}; + +describe('DistributionCard', () => { + it('renders the title, summary stats, and one bar per bin', () => { + cy.mount( + , + ); + cy.contains('Input tokens per turn').should('be.visible'); + cy.contains('n=40').should('be.visible'); + cy.contains('p50 175').should('be.visible'); + cy.contains('p75 250').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + cy.contains('p95 360').should('be.visible'); + cy.get( + 'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]', + ).should('have.length', 8); + // One filled bar rect per bin (ChartHover may add a transparent overlay rect). + cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length); + }); + + it('shows a "No data" placeholder when no distribution is provided', () => { + cy.mount(); + cy.contains('Empty metric').should('be.visible'); + cy.contains('No data').should('be.visible'); + cy.get('rect[class*="fill-primary"]').should('not.exist'); + }); + + it('marks the chart as log scale when scale="log"', () => { + cy.mount( + , + ); + cy.contains('log scale').should('be.visible'); + }); + + it('renders older v1 stats without unavailable percentile guides', () => { + cy.mount( + , + ); + cy.contains('p50 175').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + cy.contains('NaN').should('not.exist'); + }); +}); diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx index 03e6a50c..5a6311f4 100644 --- a/packages/app/cypress/component/inference-chart-controls.cy.tsx +++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx @@ -14,8 +14,8 @@ describe('Inference ChartControls', () => { it('renders the sequence selector with the current sequence', () => { // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K" - cy.get('#sequence-select').should('be.visible'); - 
cy.get('#sequence-select').should('contain.text', '8K / 1K'); + cy.get('#scenario-select').should('be.visible'); + cy.get('#scenario-select').should('contain.text', '8K / 1K'); }); it('renders the precision multi-select with the current precision', () => { diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx new file mode 100644 index 00000000..1be90e0c --- /dev/null +++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx @@ -0,0 +1,86 @@ +import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; +import type { ConversationStructure } from '@/hooks/api/use-datasets'; + +// Two main turns followed by one subagent group with two child turns. +// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`). +const structure: ConversationStructure = { + blockSize: 64, + nodes: [ + { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2000, + out: 300, + cached: 1500, + uncached: 500, + }, + { + kind: 'subagent', + label: 'Subagent: search', + agentId: 'agent-1', + durationMs: 12000, + in: 5000, + out: 800, + cached: 3000, + uncached: 2000, + children: [ + { + kind: 'turn', + turnIndex: 0, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + ], + }, + ], + totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 }, +}; + +describe('TraceFlamegraph', () => { + it('renders the legend, main-turn rows, and the subagent group header', () => { + cy.mount(); + cy.contains('Cached prefix').should('be.visible'); + cy.contains('Uncached input').should('be.visible'); + cy.contains('Output').should('be.visible'); + cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1'); + 
cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2'); + cy.contains('Subagent: search').should('be.visible'); + }); + + it('keeps subagent children collapsed until the group is expanded', () => { + cy.mount(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + cy.contains('button', 'Subagent: search').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible'); + }); + + it('expand all / collapse all toggles every subagent group', () => { + cy.mount(); + cy.contains('button', 'Expand all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.contains('button', 'Collapse all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + }); + + it('auto-expands and highlights the target group child for a request-timeline deep link', () => { + cy.mount( + , + ); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary'); + }); +}); diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts new file mode 100644 index 00000000..e8161066 --- /dev/null +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -0,0 +1,337 @@ +import { unlockAgenticGate } from '../support/e2e'; + +const timelineRequest = ( + index: number, + ttftMs: number, + tpotMs: number, + overrides: Record = {}, +) => ({ + cid: 'conversation-1', + ti: index, + wid: 'worker-1', + ad: 0, + phase: 'profiling', + credit: index * 1_000_000_000, + start: index * 1_000_000_000, + ack: null, + end: (index + 1) * 1_000_000_000, + ttftMs, + tpotMs, + isl: 1024, + osl: 128, + cancelled: false, + ...overrides, +}); + +describe('Agentic point request metric time series', () => { + before(() => { + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 
}); + cy.intercept('GET', '/api/v1/request-timeline*', { + body: { + version: 3, + startNs: 0, + endNs: 7_000_000_000, + durationS: 7, + requests: [ + timelineRequest(0, 100, 10), + timelineRequest(1, 200, 20), + timelineRequest(2, 400, 25), + timelineRequest(3, 800, 40), + timelineRequest(4, 1600, 80), + timelineRequest(5, 3200, 160, { phase: 'warmup' }), + timelineRequest(6, 6400, 320, { cancelled: true }), + timelineRequest(7, 0, 0, { + cid: 'conversation-1::sa:subagent_001_abcd', + credit: 1_100_000_000, + start: 1_100_000_000, + end: 1_900_000_000, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + }), + timelineRequest(8, 0, 0, { + cid: 'conversation-1::sa:subagent_001_abcd:aux:011', + credit: 1_200_000_000, + start: 1_200_000_000, + end: 1_800_000_000, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + }), + ], + }, + }); + cy.visit('/inference/agentic/206885', { onBeforeLoad: unlockAgenticGate }); + }); + + it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('h2', 'Interactivity over time').should('be.visible'); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P90'); + // 6 points: profiling slice includes requests 0-4 (profiling) + request 5 + // (phase='warmup' label but start=5s > profiling boundary=0s, so + // sliceTimelineByPhase keeps it); cancelled r6 and null-metric r7/r8 are dropped. 
+ cy.get('[data-testid="interactivity-point-count"]').should('have.text', '6 points'); + cy.get('svg circle').should('have.length', 6); + cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.contains('h2', 'TTFT over time').should('be.visible'); + // Same 6-point slice as interactivity (warmup r5 included by time-boundary). + cy.get('[data-testid="ttft-point-count"]').should('have.text', '6 points'); + cy.get('svg circle').should('have.length', 6); + cy.get('svg').should('contain.text', 'TTFT (s)'); + cy.get('svg').should('contain.text', 'Cumulative P90 TTFT'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + }); + + it('switches ISL and OSL cards from distributions to in-flight averages', () => { + cy.get('[data-testid="isl-metric-chart"]').within(() => { + cy.get('[data-testid="isl-metric-inflight"]').click(); + cy.contains('h2', 'Average ISL in flight').should('be.visible'); + cy.get('svg').should('contain.text', 'Average ISL in flight (30s avg)'); + }); + cy.get('[data-testid="osl-metric-chart"]').within(() => { + cy.get('[data-testid="osl-metric-inflight"]').click(); + cy.contains('h2', 'Average OSL in flight').should('be.visible'); + cy.contains('Retrospective: final observed OSL').should('be.visible'); + cy.get('svg').should('contain.text', 'Average OSL in flight (30s avg)'); + }); + }); + + it('switches the TTFT chart to E2E request latency over time', () => { + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="latency-metric-e2e"]').click(); + cy.contains('h2', 'E2E latency over time').should('be.visible'); + // 8 points: e2e = (end−start)/1e6 > 0 for all non-cancelled requests — + // includes r0-r5 (profiling slice) + r7, r8 (subagent/aux with null ttft/tpot + // but valid 
start/end). Cancelled r6 is excluded. + cy.get('[data-testid="e2e-point-count"]').should('have.text', '8 points'); + cy.get('svg circle').should('have.length', 8); + cy.get('svg').should('contain.text', 'E2E latency (s)'); + cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency'); + + cy.get('[data-testid="latency-metric-ttft"]').click(); + cy.contains('h2', 'TTFT over time').should('be.visible'); + }); + }); + + it('switches each chart independently from P90 to P75', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('svg', 'P90 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .as('p90Path'); + cy.contains('button', 'P75').click(); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P75'); + cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT'); + cy.contains('svg', 'P75 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .then(function (p75Path) { + expect(p75Path).not.to.equal(this.p90Path); + }); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="ttft-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P90'); + cy.contains('button', 'P75').click(); + cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); + cy.get('svg').should('contain.text', 'Cumulative P75 TTFT'); + }); + }); + + it('switches the request activity card from queue depth to cumulative completions', () => { + cy.get('[data-testid="request-activity-chart"]').within(() => { + cy.contains('h2', 'Request queue depth').should('be.visible'); + cy.get('[data-testid="request-activity-completed"]').click(); + cy.contains('h2', 'Cumulative completed requests').should('be.visible'); + cy.get('svg').should('contain.text', 'Completed requests'); + cy.get('svg').should('contain.text', 'Requests'); + 
cy.get('[data-testid="request-activity-queue"]').click(); + cy.contains('h2', 'Request queue depth').should('be.visible'); + }); + }); + + it('shows total idle time on the request timeline (time-boundary phase slice, consistent with the charts)', () => { + cy.get('[data-testid="detail-view-timeline"]').click(); + cy.location('search').should('contain', 'view=timeline'); + // The Gantt now slices by TIME BOUNDARY (sliceTimelineByPhase), matching the + // per-point charts, instead of the per-request phase LABEL. The earliest + // profiling request starts at t=0, so the boundary is 0 and warmup-labelled + // r5 (start=5s) is counted as profiling here too — exactly as the interactivity + // /TTFT charts already count it (their 6-point slice includes r5). That fills + // the former 5–6s gap that label-based filtering left open, so in-flight + // coverage is now continuous across [0s, 7s]: idle 0ms (0.0%). A 1.00s value + // here would mean the Gantt had regressed to label-based filtering. + cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 0ms (0.0%)'); + cy.get('[data-timeline-row-kind="aux"]') + .should('have.css', 'padding-left', '24px') + .and('contain.text', 'aux 011 · parallel'); + }); + + it('restores the request timeline view after browser Back from a dataset route', () => { + cy.window().then((win) => { + win.history.pushState({}, '', '/datasets/test-dataset/conversations/conversation-1'); + }); + cy.go('back'); + cy.location('pathname').should('eq', '/inference/agentic/206885'); + cy.location('search').should('contain', 'view=timeline'); + cy.get('[data-testid="detail-view-timeline"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="timeline-total-idle-time"]').should('be.visible'); + }); + + it('shows a cumulative average for unique input tokens in flight', () => { + cy.get('[data-testid="detail-view-point"]').click(); + cy.get('[data-testid="unique-input-inflight-chart"]').within(() => { + 
cy.get('svg').should('contain.text', 'Cumulative average'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + }); +}); + +const pointMeta = { + id: 206885, + hardware: 'gb200', + framework: 'dynamo-vllm', + model: 'deepseek-r1-0528', + precision: 'fp8', + spec_method: 'none', + disagg: true, + conc: 128, + offload_mode: 'off', + isl: null, + osl: null, + benchmark_type: 'agentic_traces', + date: '2026-06-23', + run_url: null, + server_gpu_cache_hit_rate: 0.5, + server_cpu_cache_hit_rate: null, +}; + +const sourceSeries = (source: Record, prompt: number, generation: number) => ({ + source, + kvCacheUsage: [ + { t: 0, value: 0.25 }, + { t: 1, value: 0.5 }, + ], + prefixCacheHitRate: [{ t: 0, value: 0.5 }], + queueDepth: [{ t: 0, running: 2, waiting: 1, total: 3 }], + promptTokensBySource: { miss: [{ t: 0, value: prompt }] }, + promptTps: [{ t: 0, value: prompt }], + generationTps: [{ t: 0, value: generation }], + prefixCacheHitsTps: [{ t: 0, value: prompt / 2 }], + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], +}); + +describe('Agentic point orchestrator metric sources', () => { + beforeEach(() => { + const prefill = sourceSeries( + { + id: 'dynamo|prefill|10.30.1.56:7500|prefill-a|0|0', + adapter: 'dynamo', + role: 'prefill', + endpointUrl: '10.30.1.56:7500', + nativeRole: 'prefill', + workerId: 'prefill-a', + dpRank: '0', + engine: '0', + }, + 100, + 1, + ); + const decode = sourceSeries( + { + id: 'dynamo|decode|10.30.1.206:7516|decode-a|0|0', + adapter: 'dynamo', + role: 'decode', + endpointUrl: '10.30.1.206:7516', + nativeRole: 'backend', + workerId: 'decode-a', + dpRank: '0', + engine: '0', + }, + 300, + 400, + ); + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/request-timeline*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { + body: { + meta: pointMeta, + startNs: 0, + endNs: 
2_000_000_000, + durationS: 2, + timeslicesCount: 2, + kvCacheUsage: prefill.kvCacheUsage, + prefixCacheHitRate: prefill.prefixCacheHitRate, + queueDepth: prefill.queueDepth, + promptTokensBySource: prefill.promptTokensBySource, + prefillTps: prefill.promptTps, + decodeTps: decode.generationTps, + prefixCacheHitsTps: prefill.prefixCacheHitsTps, + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], + metricSources: [prefill, decode], + }, + }); + cy.visit('/inference/agentic/206885', { onBeforeLoad: unlockAgenticGate }); + }); + + it('switches every server chart to an orchestrator-normalized worker', () => { + cy.get('[data-testid="metric-source-toolbar"]') + .should('have.css', 'position', 'sticky') + .and('have.css', 'top', '64px'); + cy.get('[data-testid="metric-source-select"]').should('contain.text', 'All endpoints').click(); + cy.contains('[role="option"]', 'Decode · decode-a').click(); + + cy.get('[data-testid="metric-source-select"]').should('contain.text', 'Decode · decode-a'); + cy.contains('h2', 'Throughput · Decode · decode-a').should('be.visible'); + cy.contains('svg', 'Decode (avg n=50)').should('be.visible'); + + cy.get('[data-testid="metric-source-select"]').click(); + cy.contains('[role="option"]', 'Prefill · prefill-a').click(); + cy.contains('h2', 'Throughput · Prefill · prefill-a').should('be.visible'); + }); + + it('toggles input and decode independently while keeping one visible', () => { + cy.get('[data-testid="throughput-series-input"]') + .should('have.attr', 'aria-pressed', 'true') + .and('not.be.disabled'); + cy.get('[data-testid="throughput-series-decode"]') + .should('have.attr', 'aria-pressed', 'true') + .and('not.be.disabled'); + cy.contains('svg', 'Input (avg n=50)').should('be.visible'); + cy.contains('svg', 'Decode (avg n=50)').should('be.visible'); + cy.contains('svg', 'Total running avg (60s burn-in)').should('be.visible'); + + cy.get('[data-testid="throughput-series-input"]').click(); + 
cy.get('[data-testid="throughput-series-input"]').should('have.attr', 'aria-pressed', 'false'); + cy.get('[data-testid="throughput-series-decode"]').should('be.disabled'); + cy.contains('svg', 'Input (avg n=50)').should('not.exist'); + cy.contains('svg', 'Total running avg (60s burn-in)').should('not.exist'); + + cy.get('[data-testid="throughput-series-input"]').click(); + cy.get('[data-testid="throughput-series-decode"]').click(); + cy.get('[data-testid="throughput-series-input"]').should('be.disabled'); + cy.get('[data-testid="throughput-series-decode"]').should('have.attr', 'aria-pressed', 'false'); + }); +}); diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts new file mode 100644 index 00000000..0d2a7789 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts @@ -0,0 +1,135 @@ +import { unlockAgenticGate } from '../support/e2e'; + +const distribution = (values: { + median: number; + p75: number; + p90: number; + p95: number; + max: number; +}) => ({ + bins: [ + { x0: 0, x1: 10, count: 5 }, + { x0: 10, x1: 100, count: 15 }, + ], + stats: { + count: 20, + min: 0, + mean: 40, + ...values, + }, +}); + +describe('Dataset distribution percentiles', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset', { + body: { + id: 'test-dataset', + slug: 'test-dataset', + label: 'Test dataset', + variant: 'full', + description: null, + hf_url: null, + license: 'apache-2.0', + conversation_count: 1, + summary: { + mainTurns: 20, + subagentGroups: 0, + subagentTurns: 0, + medianRequestsPerConversation: 12, + meanRequestsPerConversation: 14.6, + medianSubagentsPerTrace: 3, + meanSubagentsPerTrace: 4.8, + cachedPct: 0.5, + totalIn: 1000, + totalOut: 200, + }, + chart_data: { + version: 2, + inputTokensPerTurn: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), + outputTokensPerTurn: distribution({ + median: 10, + p75: 20, + p90: 30, + 
p95: 40, + max: 50, + }), + uncachedInputTokensPerTurn: distribution({ + median: 0, + p75: 64, + p90: 128, + p95: 256, + max: 512, + }), + subagentInputTokensPerRequest: distribution({ + median: 1000, + p75: 2000, + p90: 3000, + p95: 4000, + max: 5000, + }), + subagentOutputTokensPerRequest: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), + }, + ingested_at: '2026-06-23T00:00:00Z', + }, + }); + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', { + body: { total: 0, items: [] }, + }); + cy.visit('/datasets/test-dataset', { onBeforeLoad: unlockAgenticGate }); + }); + + it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => { + const expected = [ + ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']], + ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']], + ] as const; + + for (const [title, percentiles] of expected) { + cy.contains('[data-slot="card"]', title).within(() => { + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + cy.get('svg line[stroke="#3b82f6"]').should('exist'); + cy.get('svg line[stroke="#22c55e"]').should('exist'); + cy.get('svg line[stroke="#f59e0b"]').should('exist'); + cy.get('svg line[stroke="#ef4444"]').should('exist'); + }); + } + }); + + it('shows median and mean model requests per conversation', () => { + cy.contains('dt', 'Median requests / convo').next('dd').should('have.text', '12'); + cy.contains('dt', 'Mean requests / convo').next('dd').should('have.text', '14.6'); + }); + + it('summarizes subagents per trace instead of charting group counts', () => { + cy.contains('dt', 'Median subagents / trace').next('dd').should('have.text', '3'); + cy.contains('dt', 'Mean subagents / trace').next('dd').should('have.text', '4.8'); + cy.contains('Subagent groups per conversation').should('not.exist'); + }); + + it('shows ISL and 
OSL distributions for inner subagent requests only', () => { + const expected = [ + ['Subagent request ISL', ['p50 1.0k', 'p75 2.0k', 'p90 3.0k', 'p95 4.0k']], + ['Subagent request OSL', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ] as const; + + for (const [title, percentiles] of expected) { + cy.contains('[data-slot="card"]', title).within(() => { + cy.contains('Inner subagent requests only').should('be.visible'); + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + }); + } + }); +}); diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts new file mode 100644 index 00000000..bdb1adfc --- /dev/null +++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts @@ -0,0 +1,131 @@ +import { unlockAgenticGate } from '../support/e2e'; + +describe('Dataset conversation flamegraph timing', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', { + body: { + conv_id: 'conversation-1', + models: ['model-a'], + num_turns: 2, + num_subagent_groups: 1, + total_in: 1000, + total_out: 100, + total_cached: 500, + structure: { + blockSize: 64, + totals: { + in: 1000, + out: 100, + cached: 500, + uncached: 500, + numTurns: 2, + numSubagentGroups: 1, + }, + nodes: [ + { + kind: 'turn', + turnIndex: 0, + startS: 0, + endS: 1.2, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + { + kind: 'subagent', + label: 'Explore', + agentId: 'agent-1', + startS: 3661.2, + endS: 3782.6, + durationMs: 121_400, + in: 800, + out: 80, + cached: 500, + uncached: 300, + children: [ + { + kind: 'turn', + turnIndex: 1, + startS: 3661.2, + endS: 3668.2, + model: 'model-a', + in: 300, + out: 30, + cached: 150, + uncached: 150, + }, + { + kind: 'turn', + turnIndex: 2, + startS: 3665.2, + endS: 3671.2, + model: 'model-a', + in: 300, + out: 30, + cached: 200, + uncached: 100, + }, + { + kind: 'turn', + turnIndex: 3, + 
startS: 3670.2, + endS: 3675.2, + model: 'model-a', + in: 200, + out: 20, + cached: 150, + uncached: 50, + }, + ], + }, + { + kind: 'turn', + turnIndex: 2, + startS: 65.4, + endS: 67.4, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + ], + }, + }, + }); + cy.visit('/datasets/test-dataset/conversations/conversation-1', { + onBeforeLoad: unlockAgenticGate, + }); + }); + + it('shows turn offsets and a collapsed subagent time range', () => { + cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00–00:01'); + cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05–01:07'); + cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03'); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist'); + }); + + it('shows subturn offsets when the subagent group is expanded', () => { + cy.contains('button', 'Explore').click(); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01–1:01:08'); + // Parallel groups render as left-gutter brackets; each member row carries + // one bracket segment per group it belongs to (non-transitive chains keep + // their own segments/lanes). 
+ cy.get('[data-testid="flamegraph-overlap-g-1-c-0"]') + .should('have.length', 1) + .and('have.attr', 'data-overlap-group', 'subagent-1-1'); + cy.get('[data-testid="flamegraph-overlap-g-1-c-1"]') + .should('have.length', 2) + .then(($segs) => { + expect([...$segs].map((seg) => seg.dataset.overlapGroup).toSorted()).to.deep.equal([ + 'subagent-1-1', + 'subagent-1-2', + ]); + }); + cy.get('[data-testid="flamegraph-overlap-g-1-c-2"]') + .should('have.length', 1) + .and('have.attr', 'data-overlap-group', 'subagent-1-2'); + }); +}); diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts index 34d95ec3..93658af0 100644 --- a/packages/app/cypress/e2e/dropdown-switching.cy.ts +++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts @@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => { cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false'); - cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true'); + cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); }); diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts new file mode 100644 index 00000000..6c832e08 --- /dev/null +++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts @@ -0,0 +1,188 @@ +import { unlockAgenticGate } from '../support/e2e'; + +// --------------------------------------------------------------------------- +// Spec-scoped fixture helpers +// +// The shared cypress/fixtures/api/*.json files contain ZERO agentic_traces rows +// (by 
design — adding them flips the bare /inference default to the agentic +// scenario and regresses other specs). This spec therefore injects minimal +// agentic data via spec-scoped cy.intercept overrides that shadow the fixture +// server, following the same pattern used in ttft-x-axis-toggle.cy.ts. +// --------------------------------------------------------------------------- + +const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro +const AGENTIC_DATE = '2026-06-12'; + +// Two GPUs with agentic + single_turn entries so the scenario selector resolves +// to agentic (agentic preferred when both types exist for the same model). +const AGENTIC_HARDWARE = [ + { hardware: 'b200', framework: 'vllm', disagg: false }, + { hardware: 'b300', framework: 'vllm', disagg: false }, +]; + +const agenticAvailability = [ + // Agentic rows (isl/osl null). + ...AGENTIC_HARDWARE.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: null, + osl: null, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'agentic_traces', + date: AGENTIC_DATE, + })), + // Single-turn rows alongside — without these the scenario selector may not + // see the "both exist" signal it needs to confidently pick agentic. + ...AGENTIC_HARDWARE.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: 8192, + osl: 1024, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'single_turn', + date: AGENTIC_DATE, + })), +]; + +// Minimal per-metric percentile ladder matching what the chart expects for +// agentic rows (median/p75/p90/p95/p99 + std for each family). 
/**
 * Builds the percentile fields for one metric family.
 *
 * @param prefix - Metric family suffix (e.g. `ttft`), producing keys such as `median_ttft`.
 * @param base - Median value; every other field is a fixed multiple of it.
 * @returns `median_`, `p75_`, `p90_`, `p95_`, `p99_` and `std_` entries for `prefix`.
 */
const percentileLadder = (prefix: string, base: number): Record<string, number> => ({
  [`median_${prefix}`]: base,
  [`p75_${prefix}`]: base * 1.2,
  [`p90_${prefix}`]: base * 1.5,
  [`p95_${prefix}`]: base * 1.7,
  [`p99_${prefix}`]: base * 2.2,
  [`std_${prefix}`]: base * 0.3,
});

/**
 * Synthesizes the `metrics` payload for one agentic benchmark row.
 *
 * Latency families grow linearly with concurrency (scale is 1 at conc = 16, the
 * smallest concurrency injected below); interactivity is the reciprocal of ITL.
 *
 * @param conc - Concurrency level of the benchmark row.
 * @returns Flat metric-name → value map.
 */
const agenticMetrics = (conc: number): Record<string, number> => {
  const scale = conc / 16;
  const itl = 0.011 * scale;
  return {
    ...percentileLadder('ttft', 0.4 * scale),
    ...percentileLadder('tpot', 0.012 * scale),
    ...percentileLadder('itl', itl),
    ...percentileLadder('e2el', 8 * scale),
    // Interactivity = 1 / ITL, so each percentile divides by the same multiplier
    // percentileLadder applies to ITL. p95 was missing from the original ladder
    // even though every other family carries it.
    median_intvty: 1 / itl,
    p75_intvty: 1 / (itl * 1.2),
    p90_intvty: 1 / (itl * 1.5),
    p95_intvty: 1 / (itl * 1.7),
    p99_intvty: 1 / (itl * 2.2),
    std_intvty: (1 / itl) * 0.1,
    tput_per_gpu: 950 / Math.sqrt(scale),
    output_tput_per_gpu: 210,
    input_tput_per_gpu: 740,
    total_tput_tps: 7600 * conc * 0.05,
  };
};

// IDs must be unique numbers — the GPU graph uses them as D3 data keys and
// trace-availability is keyed on them.
let benchIdCursor = 800100;

// One row per (hardware, concurrency) pair: three concurrency levels for each
// entry of AGENTIC_HARDWARE, each with a fresh id taken from benchIdCursor.
const agenticBenchmarks = AGENTIC_HARDWARE.flatMap((g) =>
  [16, 64, 128].map((conc) => ({
    id: benchIdCursor++,
    hardware: g.hardware,
    framework: g.framework,
    model: DEFAULT_MODEL_DB_KEY,
    precision: 'fp4',
    spec_method: 'none',
    disagg: g.disagg,
    is_multinode: false,
    prefill_tp: 8,
    prefill_ep: 1,
    prefill_dp_attention: false,
    prefill_num_workers: 0,
    decode_tp: 8,
    decode_ep: 1,
    decode_dp_attention: false,
    decode_num_workers: 0,
    num_prefill_gpu: 8,
    num_decode_gpu: 8,
    // Agentic rows carry no fixed input/output sequence length.
    isl: null,
    osl: null,
    conc,
    offload_mode: 'off',
    benchmark_type: 'agentic_traces',
    image: 'vllm/vllm-openai:v0.9.0',
    metrics: agenticMetrics(conc),
    workers: null,
    date: AGENTIC_DATE,
    run_url: null,
  })),
);

// All injected IDs with a stored trace blob — the GPU graph renders the
// "View charts" link only when trace-availability returns true for the id.
+const agenticIds = new Set(agenticBenchmarks.map((b) => b.id)); + +describe('GPU comparison agentic point detail', () => { + it('exposes the per-point charts as a normal browser link', () => { + // Shadow the fixture-server availability + benchmarks responses with + // spec-scoped agentic data so the GPU graph renders agentic dots. + cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as( + 'agenticAvailability', + ); + cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('agenticBenchmarks'); + // Return true for all injected ids so the "View charts" link appears. + cy.intercept('GET', '/api/v1/trace-availability*', (request) => { + const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? []; + if (ids.length < 20) request.alias = 'gpuTraceAvailability'; + const result = Object.fromEntries( + ids.filter((id) => agenticIds.has(Number(id))).map((id) => [id, true]), + ); + request.reply({ body: result }); + }); + + cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + unlockAgenticGate(win); + }, + }); + + cy.get('[data-testid="gpu-multiselect"] [role="combobox"]').click({ force: true }); + cy.get('[role="option"]').first().click(); + cy.contains('button', 'Select date range').click(); + cy.get('body').then(($body) => { + if ($body.text().includes('View anyway')) { + cy.contains('button', 'View anyway').click(); + } else { + cy.contains('button', 'Max Range').click(); + cy.contains('button', 'Apply').click(); + } + }); + + cy.get('[data-testid="gpu-graph"]').first().should('be.visible'); + cy.wait('@gpuTraceAvailability'); + cy.wait(100); + cy.get('[data-testid="gpu-graph"]') + .first() + .find('svg .dot-group') + .should('have.length.greaterThan', 0) + .first() + .then(($point) => { + const point = $point[0] as unknown as SVGElement & { + __data__: { benchmark_type?: string; 
id?: number }; + }; + expect(point.__data__.benchmark_type).to.equal('agentic_traces'); + expect(point.__data__.id).to.be.a('number'); + cy.wrap($point).find('.visible-shape').click({ force: true }); + }); + + cy.get('[data-chart-tooltip]:visible').should('have.length', 1); + cy.get('[data-chart-tooltip]:visible [data-action="view-charts"]') + .should('be.visible') + .then(($link) => { + expect($link).to.match('a'); + expect($link).not.to.have.attr('target'); + expect($link.attr('href')).to.match(/^\/inference\/agentic\/\d+$/u); + }); + cy.location('pathname').should('eq', '/inference'); + }); +}); diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts index 333baa6d..9c3d3274 100644 --- a/packages/app/cypress/e2e/gradient-labels.cy.ts +++ b/packages/app/cypress/e2e/gradient-labels.cy.ts @@ -60,19 +60,19 @@ describe('Gradient Labels Toggle', () => { }); it('both toggles can be enabled simultaneously', () => { - // Turn on Gradient Labels (off by default) + // Parallelism Labels is off by default; turn it on, then turn on Gradient. + cy.get('#scatter-parallelism-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('#scatter-gradient-labels').click(); cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); - // Turn on Parallelism Labels - cy.get('#scatter-parallelism-labels').click(); - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Both should be checked cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Reset for next tests + // Reset both for next tests (each subsequent test does a fresh cy.visit, + // but keep state tidy here too). 
cy.get('#scatter-gradient-labels').click(); cy.get('#scatter-parallelism-labels').click(); }); diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts index f0a70a56..55b0e274 100644 --- a/packages/app/cypress/e2e/historical-trends.cy.ts +++ b/packages/app/cypress/e2e/historical-trends.cy.ts @@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => { delete doc.body.dataset.scrollLocked; doc.body.style.removeProperty('pointer-events'); }); - cy.get('[data-testid="sequence-selector"]').should('be.visible'); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[role="option"]').should('have.length.greaterThan', 0); cy.get('body').type('{esc}'); }); diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts index 84e655f8..23b372df 100644 --- a/packages/app/cypress/e2e/line-labels.cy.ts +++ b/packages/app/cypress/e2e/line-labels.cy.ts @@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => { cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels'); }); - it('Line Labels toggle is on by default', () => { - cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); - - // Line labels render without any interaction - cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); - }); - - it('toggling Line Labels off then back on removes and restores label elements', () => { - // On by default — turn it off first. 
- cy.get('#scatter-line-labels').click(); + it('Line Labels toggle is off by default', () => { cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + + // No line labels render without interaction cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); + }); - // Turn it back on — labels return. + it('toggling Line Labels on then back off adds and removes label elements', () => { + // Off by default — turn it on first. cy.get('#scatter-line-labels').click(); cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); + + // Turn it back off — labels disappear. + cy.get('#scatter-line-labels').click(); + cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); }); it('line labels have colored background rects and text', () => { + // Off by default — ensure on (idempotent; prior test left them off). + cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); // Each line label group should contain a background rect and text cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should( 'have.length.greaterThan', @@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => { }); it('line labels render in the foreground, after the scatter points', () => { - // Labels were toggled on in the test above and remain on here. + // Off by default — ensure on (idempotent; previous test leaves them on). 
+ cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); cy.get('[data-testid="scatter-graph"] svg').then(($svg) => { diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index e17a4aff..92b32d33 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -1,46 +1,314 @@ -describe('TTFT X-Axis Toggle (E2E chart)', () => { +import { unlockAgenticGate } from '../support/e2e'; + +const interceptDerivedMetrics = () => { + cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => { + const ids = new URL(request.url).searchParams.get('ids')?.split(',').filter(Boolean) ?? []; + request.reply({ + body: Object.fromEntries( + ids.map((id, index) => [ + id, + { + id: Number(id), + normalized_session_time_s: 60 + index, + p90_prefill_tps_per_user: 100 + index, + p75_normalized_e2e_400_s: 8 + index, + p90_normalized_e2e_400_s: 12 + index, + }, + ]), + ), + }); + }).as('derivedAgenticMetrics'); +}; + +// This spec exercises the agentic x-axis modes, which only exist when the +// selected model resolves to the Agentic Traces scenario. The default e2e +// fixtures (cypress/fixtures/api/*.json) have NO agentic rows for any model, so +// after the availability-gated effectiveSequence fix the bare-/inference default +// correctly resolves to a fixed-seq scenario. We therefore inject agentic +// availability + benchmark rows for the default model VIA SPEC-SCOPED INTERCEPTS +// (not the shared fixtures) so this test — and only this test — sees the agentic +// view. Scoping to intercepts keeps every other spec's default fixed-seq. 
+const DEFAULT_MODEL_DB_KEY = 'dsv4'; // DeepSeek-V4-Pro is the default model +const AGENTIC_DATE = '2026-06-12'; + +// Percentile ladder for one metric family (median/p75/p90/p95/p99/std). +const percentileLadder = (prefix: string, base: number): Record => ({ + [`median_${prefix}`]: base, + [`p75_${prefix}`]: base * 1.2, + [`p90_${prefix}`]: base * 1.5, + [`p95_${prefix}`]: base * 1.7, + [`p99_${prefix}`]: base * 2.2, + [`std_${prefix}`]: base * 0.3, +}); + +const agenticMetrics = (conc: number): Record => { + const scale = conc / 16; + const itl = 0.011 * scale; + return { + ...percentileLadder('ttft', 0.4 * scale), + ...percentileLadder('tpot', 0.012 * scale), + ...percentileLadder('itl', itl), + ...percentileLadder('e2el', 8 * scale), + median_intvty: 1 / itl, + p75_intvty: 1 / (itl * 1.2), + p90_intvty: 1 / (itl * 1.5), + p99_intvty: 1 / (itl * 2.2), + std_intvty: (1 / itl) * 0.1, + tput_per_gpu: 950 / Math.sqrt(scale), + output_tput_per_gpu: 210, + input_tput_per_gpu: 740, + total_tput_tps: 7600 * conc * 0.05, + }; +}; + +const agenticGpus = [ + { hardware: 'b200', framework: 'vllm', disagg: false }, + { hardware: 'b300', framework: 'vllm', disagg: false }, +]; + +// Availability: default model has BOTH agentic and fixed-seq, so the default +// resolves to agentic (the product-intended, agentic-preferred behavior). 
+const agenticAvailability = [ + ...agenticGpus.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: null, + osl: null, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'agentic_traces', + date: AGENTIC_DATE, + })), + ...agenticGpus.map((g) => ({ + model: DEFAULT_MODEL_DB_KEY, + isl: 8192, + osl: 1024, + precision: 'fp4', + hardware: g.hardware, + framework: g.framework, + spec_method: 'none', + disagg: g.disagg, + benchmark_type: 'single_turn', + date: AGENTIC_DATE, + })), +]; + +let benchIdCursor = 900000; +const agenticBenchmarks = agenticGpus.flatMap((g) => + [16, 64, 128].map((conc) => ({ + id: benchIdCursor++, + hardware: g.hardware, + framework: g.framework, + model: DEFAULT_MODEL_DB_KEY, + precision: 'fp4', + spec_method: 'none', + disagg: g.disagg, + is_multinode: false, + prefill_tp: 8, + decode_tp: 8, + num_prefill_gpu: 8, + num_decode_gpu: 8, + isl: null, + osl: null, + conc, + offload_mode: 'off', + benchmark_type: 'agentic_traces', + image: 'vllm/vllm-openai:v0.9.0', + metrics: agenticMetrics(conc), + workers: null, + date: AGENTIC_DATE, + run_url: null, + })), +); + +const interceptAgenticData = () => { + cy.intercept('GET', '/api/v1/availability', { body: agenticAvailability }).as('availability'); + cy.intercept('GET', '/api/v1/benchmarks*', { body: agenticBenchmarks }).as('benchmarks'); +}; + +describe('X-Axis Mode Toggle (inference chart)', () => { before(() => { - cy.window().then((win) => { - win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + interceptAgenticData(); + cy.visit('/inference', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + unlockAgenticGate(win); + }, }); - cy.visit('/inference'); - cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2); + cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible'); + 
cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('shows the x-axis dropdown in the e2e chart heading', () => { - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2 button') - .should('contain.text', 'vs.') - .and('contain.text', 'Latency'); + it('shows Interactivity by default for the agentic view', () => { + cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces'); + cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-interactivity"]') + .should('be.visible') + .and('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); - it('opens popover with three x-axis options', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').within(() => { - cy.contains('End-to-end Latency').should('exist'); - cy.contains('P99 TTFT').should('exist'); - cy.contains('Median TTFT').should('exist'); + it('switches the x-axis to TTFT and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').click(); + cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token'); + }); + + it('switches the x-axis to E2E Latency and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-e2e"]').click(); + cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency'); + }); + + it('switches to request-level normalized E2E at 400 output tokens', () => { + interceptDerivedMetrics(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click(); + 
cy.wait('@derivedAgenticMetrics'); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should( + 'contain.text', + 'P90 Normalized E2E @ 400 output tokens', + ); + cy.get('[data-testid="chart-figure"] svg').should( + 'contain.text', + 'P90 Normalized E2E @ 400 output tokens (s)', + ); + + cy.get('[data-testid="percentile-selector"]').click(); + cy.contains('[role="option"]', 'p75').click(); + cy.get('[data-testid="chart-figure"] h2').should( + 'contain.text', + 'P75 Normalized E2E @ 400 output tokens', + ); + }); + + it('switches back to Interactivity', () => { + cy.get('[data-testid="x-axis-mode-interactivity"]').click(); + cy.get('[data-testid="x-axis-mode-interactivity"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); + }); +}); + +// --------------------------------------------------------------------------- +// Overlay path — regression coverage for unofficial-run overlays with agentic +// x-axis modes (finding #8 / AGENTS.md: chart features must have overlay tests). +// The overlay behavior itself is verified correct by prior review; this suite +// guards against regressions only and does NOT change overlay behavior. +// --------------------------------------------------------------------------- + +// Build a minimal unofficial-run API response that contains one agentic +// overlay benchmark row so the provider builds overlay chart data. 
+const OVERLAY_RUN_ID = 99900000001; +const OVERLAY_RUN_URL = `https://github.com/SemiAnalysisAI/InferenceX/actions/runs/${OVERLAY_RUN_ID}`; + +const overlayBenchmarkRow = { + id: 800000, + hardware: 'b200', + framework: 'vllm', + model: DEFAULT_MODEL_DB_KEY, + precision: 'fp4', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 8, + decode_tp: 8, + num_prefill_gpu: 8, + num_decode_gpu: 8, + isl: null, + osl: null, + conc: 32, + offload_mode: 'off', + benchmark_type: 'agentic_traces', + image: 'vllm/vllm-openai:v0.9.0', + metrics: agenticMetrics(32), + workers: null, + date: AGENTIC_DATE, + run_url: OVERLAY_RUN_URL, +}; + +const interceptAgenticDataWithOverlay = () => { + interceptAgenticData(); + cy.intercept('GET', '/api/unofficial-run*', { + body: { + runInfos: [ + { + id: OVERLAY_RUN_ID, + name: 'Overlay regression fixture', + branch: 'test/overlay-regression', + sha: 'abc000', + createdAt: `${AGENTIC_DATE}T00:00:00Z`, + url: OVERLAY_RUN_URL, + conclusion: 'success', + status: 'completed', + isNonMainBranch: true, + }, + ], + benchmarks: [overlayBenchmarkRow], + evaluations: [], + }, + }).as('unofficialRun'); +}; + +describe('X-Axis Mode Toggle — overlay path (finding #8 regression guard)', () => { + before(() => { + interceptAgenticDataWithOverlay(); + cy.visit(`/inference?unofficialrun=${OVERLAY_RUN_ID}`, { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + unlockAgenticGate(win); + }, }); + cy.wait('@unofficialRun'); + cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible'); + cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('switches x-axis to P99 TTFT and updates the heading', () => { - cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT'); + it('shows overlay (unofficial-run) watermark SVG when an overlay is loaded', () => 
{ + // The unofficial-run pattern watermark appears when isUnofficialRun is true. + cy.get('[data-testid="inference-chart-display"] svg pattern[id^="unofficial-pattern-"]').should( + 'exist', + ); }); - it('switches x-axis to Median TTFT and updates the heading', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('Median TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT'); + it('switches to ttft x-axis mode and renders SVG with overlay points', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').click(); + cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token'); + // Overlay points render as triangles or circles inside the chart SVG. + cy.get('[data-testid="inference-chart-display"] svg').should('exist'); + cy.get('[data-testid="inference-chart-display"] svg').then(($svgs) => { + let total = 0; + $svgs.each((_i, svg) => { + total += svg.querySelectorAll('circle, polygon, path').length; + }); + expect(total).to.be.greaterThan(0); + }); }); - it('switches back to End-to-end Latency', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click(); - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2') - .should('contain.text', 'End-to-end Latency'); + it('normalized-e2e mode shows suppression banner for unofficial-run overlays', () => { + interceptDerivedMetrics(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + // The suppression message appears because isUnofficialRun is true and the + // mode is 'normalized-e2e' (documented in ChartDisplay.tsx ~line 640). 
+ cy.contains( + 'Normalized E2E requires persisted per-request traces, so unofficial-run overlays are unavailable for this experimental view.', + ).should('be.visible'); }); }); diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 33282b9c..6c827218 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => { }; const assertNoHydrationMismatch = () => { - cy.get('[data-testid="sequence-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); cy.get('@consoleError').then((spy) => { const calls = (spy as unknown as { args: unknown[][] }).args; const hydration = calls.filter((args) => @@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => { it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => { visitWithErrorSpy('/inference?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); @@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => { // Visit the canonical model-prefixed slug so the assertion is directly // about the rendered page, not about a bare-slug redirect interleaving. visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => { visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk'); - cy.get('[data-testid="sequence-selector"]') + cy.get('[data-testid="scenario-selector"]') .invoke('text') .should('not.contain', 'junk') .and('match', /[18]K . 
[18]K/u); @@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => { // `effectivePrecisions` intersects the selection with available precisions // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported. visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek'); cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8'); assertNoHydrationMismatch(); @@ -236,12 +236,18 @@ describe('URL Parameter Persistence', () => { }); describe('High contrast mode', () => { - it('page loads without high contrast by default', () => { + it('inference loads with high contrast off by default', () => { visitWithDismissedModal('/inference'); cy.get('[data-testid="scatter-graph"]').should('exist'); cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); }); + it('i_hc=0 disables high contrast on load', () => { + visitWithDismissedModal('/inference?i_hc=0'); + cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); + }); + it('i_hc=1 applies high contrast on load', () => { visitWithDismissedModal('/inference?i_hc=1'); cy.get('[data-testid="scatter-graph"]').should('exist'); @@ -267,7 +273,9 @@ describe('URL Parameter Persistence', () => { cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); - it('historical trends tab has high contrast switch off by default', () => { + it('historical trends tab shares the inference high-contrast default (off)', () => { + // Historical reads highContrast from the same InferenceContext as the + // scatter chart, so it inherits the default-off behavior. 
visitWithDismissedModal('/historical'); cy.get('[data-testid="historical-trends-display"]').should('exist'); cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); @@ -279,4 +287,20 @@ describe('URL Parameter Persistence', () => { cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); }); + + describe('Default toggle states (share-link correctness)', () => { + it('a bare /inference link with neither param renders high contrast AND parallelism labels off', () => { + visitWithDismissedModal('/inference'); + cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked'); + }); + + it('i_hc=1&i_advlabel=1 enables both high contrast and parallelism labels on load', () => { + visitWithDismissedModal('/inference?i_hc=1&i_advlabel=1'); + cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked'); + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); + }); + }); }); diff --git a/packages/app/cypress/support/e2e.ts b/packages/app/cypress/support/e2e.ts index d8209e33..0edb08c0 100644 --- a/packages/app/cypress/support/e2e.ts +++ b/packages/app/cypress/support/e2e.ts @@ -14,3 +14,22 @@ Cypress.on('window:before:load', (win) => { // localStorage unavailable — fine, the test will just see the modal. } }); + +/** + * Unlock the shared feature gate for specs that exercise agentic surfaces + * (the "Agentic Traces" scenario, /datasets, /inference/agentic/[id], and the + * Datasets nav link). The gate is OFF by default so the PR can ship without + * publicly exposing agentic features; agentic specs opt in by seeding the same + * localStorage flag the ↑↑↓↓ konami unlock writes (see use-feature-gate.ts). 
+ * + * Call from a spec's `cy.visit(..., { onBeforeLoad })`: + * cy.visit('/datasets/x', { onBeforeLoad: unlockAgenticGate }); + * or compose inside an existing hook: `unlockAgenticGate(win)`. + */ +export function unlockAgenticGate(win: Window): void { + try { + win.localStorage.setItem('inferencex-feature-gate', '1'); + } catch { + // localStorage unavailable — spec will see the gate locked and likely 404. + } +} diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index bcdfe21b..490fca87 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,10 +189,14 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), + selectedPercentile: 'p90', + setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), selectedE2eXAxisMetric: null, setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'), + selectedXAxisMode: 'interactivity' as const, + setSelectedXAxisMode: namedStub('setSelectedXAxisMode'), scaleType: 'auto', setScaleType: namedStub('setScaleType'), quickFilters: { vendors: [], frameworks: [], disagg: [], spec: [] }, @@ -419,6 +423,9 @@ export function createMockGlobalFilterContext( selectedPrecisions: [Precision.FP4], setSelectedPrecisions: namedStub('setSelectedPrecisions_global'), effectiveSequence: Sequence.EightK_OneK, + // Mocks represent a settled state: availability is known and the sequence is + // resolved. Tests exercising the pre-availability window override this. 
+ sequenceResolved: true, effectivePrecisions: [Precision.FP4], selectedRunDate: '2025-03-01', setSelectedRunDate: namedStub('setSelectedRunDate_global'), diff --git a/packages/app/next.config.ts b/packages/app/next.config.ts index 39ab4487..32988f05 100644 --- a/packages/app/next.config.ts +++ b/packages/app/next.config.ts @@ -3,6 +3,12 @@ import type { NextConfig } from 'next'; import { allowedDevOriginsFromEnv } from './src/lib/allowed-dev-origins'; const nextConfig: NextConfig = { + // Allow a second, isolated dev server (e.g. a dump-mode instance on another + // port) to run from the same project dir by pointing it at a separate build + // dir via NEXT_DIST_DIR. Defaults to '.next' so the primary server and all + // CI/prod builds are unaffected. Next.js's single-dev-server lock lives under + // distDir, so distinct dirs let the two coexist. + distDir: process.env.NEXT_DIST_DIR || '.next', allowedDevOrigins: allowedDevOriginsFromEnv(), transpilePackages: ['@semianalysisai/inferencex-constants'], serverExternalPackages: ['shiki'], diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx new file mode 100644 index 00000000..91b769bd --- /dev/null +++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx @@ -0,0 +1,29 @@ +import type { Metadata } from 'next'; +import { notFound } from 'next/navigation'; + +import { AgenticGate } from '@/components/agentic-gate'; +import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail'; +import { isPersistedBenchmarkId } from '@/lib/benchmark-id'; + +export const metadata: Metadata = { + title: 'Agentic trace detail | InferenceX', + robots: { index: false }, +}; + +export default async function AgenticPointDetailPage({ + params, +}: { + params: Promise<{ id: string }>; +}) { + const { id } = await params; + const numericId = Number(id); + // benchmark_results.id is a positive bigserial — 
anything else (`/agentic/abc`, + // `/agentic/0`, `/agentic/-1`) can never resolve, so 404 instead of rendering a + // blank detail shell that fires doomed id-keyed fetches. + if (!isPersistedBenchmarkId(numericId)) notFound(); + return ( + + + + ); +} diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 072c99f1..304ccb0b 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -33,6 +33,10 @@ export function normalizeArtifactRows( if (!params) continue; const { config } = params; results.push({ + // Synthetic id — overlay rows aren't persisted, so trace_replay lookups + // (keyed on benchmark_results.id) will always miss, which is the + // intended behaviour: overlays never have stored trace_replay blobs. + id: 0, hardware: config.hardware, framework: config.framework, model: config.model, @@ -50,6 +54,8 @@ export function normalizeArtifactRows( decode_num_workers: config.decodeNumWorkers, num_prefill_gpu: config.numPrefillGpu, num_decode_gpu: config.numDecodeGpu, + benchmark_type: params.benchmarkType, + offload_mode: params.offloadMode, isl: params.isl, osl: params.osl, conc: params.conc, diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts new file mode 100644 index 00000000..9cb229d4 --- /dev/null +++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts @@ -0,0 +1,47 @@ +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getAgenticAggregates, + STATS_VERSION, + type AgenticAggregateMap, +} from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: response stays small (a few 
numbers per id), but generating it +// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the +// "Aggregates" toggle stays snappy. +// +// Key derived from STATS_VERSION (governs the `aggregate_stats` payload). The +// blob cache is write-once with no post-backfill purge, so deriving the key +// from the constant is what rolls the namespace on a version bump — a +// hand-written string would pin the route to stale blob hits forever. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). */ +export const CACHE_KEY_PREFIX = `agentic-aggregates-v${STATS_VERSION}`; + +const getCachedAgenticAggregates = cachedQuery( + (ids: number[]): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getAgenticAggregates(ids)); + return getAgenticAggregates(getDb(), ids); + }, + CACHE_KEY_PREFIX, + { blobOnly: true }, +); + +/** + * GET /api/v1/agentic-aggregates?ids=1,2,3 + * + * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization, + * and prefix cache hit rate — computed live from the stored aiperf + * profile_export.jsonl + server_metrics_json blobs. Ids without a + * trace_replay blob (or with no usable samples) get nulls. + */ +export const GET = idsQueryRoute({ + maxIds: 200, + logLabel: 'agentic aggregates', + fetch: getCachedAgenticAggregates, +}); diff --git a/packages/app/src/app/api/v1/agentic-cache-keys.test.ts b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts new file mode 100644 index 00000000..58fa194f --- /dev/null +++ b/packages/app/src/app/api/v1/agentic-cache-keys.test.ts @@ -0,0 +1,70 @@ +/** + * Guards that every agentic blob-cache key is DERIVED from the version constant + * that governs its payload — not a hand-written string. `blobSet` is write-once + * and nothing purges the blob cache after a backfill, so an unversioned (or + * hand-bumped) key would serve stale data forever after a payload-version bump. 
+ * Deriving the key from the constant means a future bump rolls the cache + * namespace automatically; these tests fail loudly if a route drifts back to a + * literal string. + */ + +import { describe, expect, it, vi } from 'vitest'; + +// Route modules call getDb() at import time via cachedQuery's closure and pull +// in the blob cache — stub both so importing the route is side-effect-free. +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: vi.fn(() => 'mock-sql'), + JSON_MODE: false, + FIXTURES_MODE: false, +})); + +vi.mock('@/lib/api-cache', () => ({ + // Passthrough so importing the route doesn't touch blob storage; the key is + // still exported as CACHE_KEY_PREFIX for us to assert on. + cachedQuery: (fn: (...args: unknown[]) => unknown) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; +import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series'; +import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; + +import { CACHE_KEY_PREFIX as derivedAgenticMetricsKey } from './derived-agentic-metrics/route'; +import { CACHE_KEY_PREFIX as agenticAggregatesKey } from './agentic-aggregates/route'; +import { CACHE_KEY_PREFIX as requestTimelineKey } from './request-timeline/route'; +import { CACHE_KEY_PREFIX as traceServerMetricsKey } from './trace-server-metrics/route'; +import { CACHE_KEY_PREFIX as traceHistogramsKey } from './trace-histograms/route'; + +describe('agentic blob-cache keys are version-derived', () => { + it('derived-agentic-metrics key embeds STATS_VERSION', () => { + expect(derivedAgenticMetricsKey).toBe(`derived-agentic-metrics-v${STATS_VERSION}`); + }); + + it('agentic-aggregates key embeds STATS_VERSION', () => { + expect(agenticAggregatesKey).toBe(`agentic-aggregates-v${STATS_VERSION}`); + }); + + it('request-timeline key embeds 
REQUEST_TIMELINE_VERSION', () => { + expect(requestTimelineKey).toBe(`request-timeline-v${REQUEST_TIMELINE_VERSION}`); + }); + + it('trace-server-metrics key embeds CHART_SERIES_VERSION', () => { + expect(traceServerMetricsKey).toBe(`trace-server-metrics-v${CHART_SERIES_VERSION}`); + }); + + it('trace-histograms key embeds REQUEST_TIMELINE_VERSION (its payload is read from request_timeline)', () => { + expect(traceHistogramsKey).toBe(`trace-histograms-v${REQUEST_TIMELINE_VERSION}`); + }); + + it('every key actually contains a version segment (no unversioned literals)', () => { + for (const key of [ + derivedAgenticMetricsKey, + agenticAggregatesKey, + requestTimelineKey, + traceServerMetricsKey, + traceHistogramsKey, + ]) { + expect(key).toMatch(/-v\d+$/u); + } + }); +}); diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts new file mode 100644 index 00000000..0718aae0 --- /dev/null +++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts @@ -0,0 +1,29 @@ +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getBenchmarkSiblings, + type BenchmarkSiblings, +} from '@semianalysisai/inferencex-db/queries/benchmark-siblings'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +const getCachedSiblings = cachedQuery((id: number): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarkSiblings(id)); + return getBenchmarkSiblings(getDb(), id); +}, 'benchmark-siblings'); + +/** + * GET /api/v1/benchmark-siblings?id=N + * + * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the + * benchmark_result + all sibling rows that share that SKU within the same + * workflow_run. Used by the agentic detail page to render a navigator. 
+ */ +export const GET = idQueryRoute({ + logLabel: 'benchmark siblings', + fetch: getCachedSiblings, +}); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts new file mode 100644 index 00000000..bc374e72 --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.test.ts @@ -0,0 +1,71 @@ +import { describe, expect, it, vi, beforeEach } from 'vitest'; + +const { mockGetConversation, mockGetDb } = vi.hoisted(() => ({ + mockGetConversation: vi.fn(), + mockGetDb: vi.fn(() => 'mock-sql'), +})); + +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: false, +})); + +vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({ + getConversation: mockGetConversation, +})); + +vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({ + getConversation: vi.fn(), +})); + +vi.mock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { GET } from './route'; +import { NextRequest } from 'next/server'; + +function req(): NextRequest { + return new NextRequest(new URL('http://localhost/api/v1/datasets/ds/conversations/x')); +} + +/** + * App Router decodes each dynamic route segment EXACTLY ONCE before handing it to + * the handler, so `params.convId` is already the raw conversation id. These tests + * pin the route's contract: it must pass that value straight to the query with NO + * further decodeURIComponent (which would over-decode, mis-key '%'/'/' ids, or + * throw on a lone '%'). The client (useDatasetConversation) encodeURIComponent's + * the id before the fetch, so the whole pipeline decodes once end-to-end. 
+ */ +beforeEach(() => { + vi.clearAllMocks(); + mockGetConversation.mockResolvedValue({ conv_id: 'x', turns: [] }); +}); + +describe('GET /api/v1/datasets/[slug]/conversations/[convId] — decode exactly once', () => { + it('passes the already-decoded convId straight through (no second decode)', async () => { + const params = Promise.resolve({ slug: 'ds', convId: 'a/b%c' }); + const res = await GET(req(), { params }); + expect(res.status).toBe(200); + // 'a/b%c' contains a lone '%'; a second decodeURIComponent here would THROW + // (→ 500). Passing through means the query sees the raw id verbatim. + expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', 'a/b%c'); + }); + + it('preserves special characters (% / # ?) exactly as decoded by App Router', async () => { + const raw = 'conv/50%_a#b?c'; + const params = Promise.resolve({ slug: 'ds', convId: raw }); + const res = await GET(req(), { params }); + expect(res.status).toBe(200); + expect(mockGetConversation).toHaveBeenCalledWith('mock-sql', 'ds', raw); + }); + + it('returns 404 when the conversation is not found', async () => { + mockGetConversation.mockResolvedValueOnce(null); + const params = Promise.resolve({ slug: 'ds', convId: 'missing' }); + const res = await GET(req(), { params }); + expect(res.status).toBe(404); + }); +}); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts new file mode 100644 index 00000000..35f2fddf --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts @@ -0,0 +1,40 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getConversation, + type ConversationDetail, +} from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, 
cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedConversation = cachedQuery( + (slug: string, convId: string): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getConversation(slug, convId)); + return getConversation(getDb(), slug, convId); + }, + 'dataset-conversation', +); + +/** GET /api/v1/datasets/[slug]/conversations/[convId] — flamegraph structure. */ +export async function GET( + _request: NextRequest, + { params }: { params: Promise<{ slug: string; convId: string }> }, +) { + const { slug, convId } = await params; + try { + // App Router has already decoded the `[convId]` segment exactly once, so + // `convId` is the raw conversation id. The client (useDatasetConversation) + // encodeURIComponent-encodes it before the fetch; decoding again here would + // over-decode and mis-key ids containing '%' / '/'. Decode exactly once. + const data = await getCachedConversation(slug, convId); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching dataset conversation:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts new file mode 100644 index 00000000..b582e79c --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.test.ts @@ -0,0 +1,116 @@ +import { describe, expect, it, vi, beforeEach } from 'vitest'; + +const { mockListConversations, mockGetDb } = vi.hoisted(() => ({ + mockListConversations: vi.fn(), + mockGetDb: vi.fn(() => 'mock-sql'), +})); + +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: false, +})); + +vi.mock('@semianalysisai/inferencex-db/queries/datasets', () => ({ + listConversations: 
mockListConversations, +})); + +vi.mock('@semianalysisai/inferencex-db/json-provider', () => ({ + listConversations: vi.fn(), +})); + +vi.mock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { GET } from './route'; +import { NextRequest } from 'next/server'; + +function req(path: string): NextRequest { + return new NextRequest(new URL(path, 'http://localhost')); +} + +const PARAMS = Promise.resolve({ slug: 'test-dataset' }); + +beforeEach(() => { + vi.clearAllMocks(); +}); + +describe('GET /api/v1/datasets/[slug]/conversations — search input validation', () => { + it('returns 400 when search exceeds 100 characters', async () => { + const longSearch = 'a'.repeat(101); + const res = await GET(req(`/api/v1/datasets/test-dataset/conversations?search=${longSearch}`), { + params: PARAMS, + }); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('search too long'); + // DB must not be called. + expect(mockListConversations).not.toHaveBeenCalled(); + }); + + it('accepts a search string exactly at the 100-character limit', async () => { + const exactSearch = 'a'.repeat(100); + mockListConversations.mockResolvedValueOnce({ total: 0, items: [] }); + const res = await GET( + req(`/api/v1/datasets/test-dataset/conversations?search=${exactSearch}`), + { params: PARAMS }, + ); + expect(res.status).toBe(200); + }); + + it('trims whitespace before applying the length check', async () => { + // A 101-char string that is 100 chars of spaces + 1 real char should become + // 1 char after trimming — well under the limit. 
+ const paddedSearch = `${' '.repeat(100)}a`; + mockListConversations.mockResolvedValueOnce({ total: 1, items: [] }); + const res = await GET( + req(`/api/v1/datasets/test-dataset/conversations?search=${paddedSearch}`), + { params: PARAMS }, + ); + expect(res.status).toBe(200); + expect(mockListConversations).toHaveBeenCalledWith( + 'mock-sql', + 'test-dataset', + expect.objectContaining({ search: 'a' }), + ); + }); + + it('returns 404 when the dataset slug is unknown', async () => { + mockListConversations.mockResolvedValueOnce(null); + const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), { + params: PARAMS, + }); + expect(res.status).toBe(404); + const body = await res.json(); + expect(body.error).toBe('Not found'); + }); + + it('returns conversation data for a valid request', async () => { + const mockData = { total: 2, items: [{ conv_id: 'c1' }, { conv_id: 'c2' }] }; + mockListConversations.mockResolvedValueOnce(mockData); + const res = await GET( + req('/api/v1/datasets/test-dataset/conversations?search=agent&sort=turns&limit=10&offset=0'), + { params: PARAMS }, + ); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body).toEqual(mockData); + expect(mockListConversations).toHaveBeenCalledWith( + 'mock-sql', + 'test-dataset', + expect.objectContaining({ search: 'agent', sort: 'turns', limit: 10, offset: 0 }), + ); + }); + + it('returns 500 when the query throws', async () => { + mockListConversations.mockRejectedValueOnce(new Error('Neon timeout')); + const res = await GET(req('/api/v1/datasets/test-dataset/conversations'), { + params: PARAMS, + }); + expect(res.status).toBe(500); + const body = await res.json(); + expect(body.error).toBe('Internal server error'); + }); +}); diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts new file mode 100644 index 00000000..2dad4ace --- /dev/null +++ 
b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts @@ -0,0 +1,71 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + listConversations, + type ConversationList, + type ListConversationsOpts, +} from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const SORTS = new Set(['tokens', 'turns', 'subagents', 'id']); + +const getCachedConversations = cachedQuery( + ( + slug: string, + search: string, + limit: number, + offset: number, + sort: string, + ): Promise => { + const opts: ListConversationsOpts = { + search: search || undefined, + limit, + offset, + sort: sort as ListConversationsOpts['sort'], + }; + if (JSON_MODE) return Promise.resolve(jsonProvider.listConversations(slug, opts)); + return listConversations(getDb(), slug, opts); + }, + 'dataset-conversations', +); + +// Maximum search string length accepted. Longer strings are rejected with 400 +// rather than being forwarded to the DB: an ILIKE on an unindexed conv_id column +// with a very long pattern (or many stacked wildcards) can exhaust Neon's +// statement timeout and return a 500. 100 chars is generous for any real +// conversation-id prefix while keeping the attack surface small. +const MAX_SEARCH_LENGTH = 100; + +/** + * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort= + * Paginated conversation list (counts only, no flamegraph structure). + */ +export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) { + const { slug } = await params; + const sp = request.nextUrl.searchParams; + const rawSearch = sp.get('search') ?? ''; + const search = rawSearch.trim(); + + // Reject search strings that exceed the length cap before touching the DB. 
+ if (search.length > MAX_SEARCH_LENGTH) { + return NextResponse.json({ error: 'search too long' }, { status: 400 }); + } + + const limit = Math.min(200, Math.max(1, Number(sp.get('limit')) || 50)); + const offset = Math.max(0, Number(sp.get('offset')) || 0); + const sortParam = sp.get('sort') ?? 'tokens'; + const sort = SORTS.has(sortParam) ? sortParam : 'tokens'; + try { + const data = await getCachedConversations(slug, search, limit, offset, sort); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching dataset conversations:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts new file mode 100644 index 00000000..e440ff5d --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts @@ -0,0 +1,30 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedDataset = cachedQuery((slug: string): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getDataset(slug)); + return getDataset(getDb(), slug); +}, 'dataset'); + +/** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. 
*/ +export async function GET( + _request: NextRequest, + { params }: { params: Promise<{ slug: string }> }, +) { + const { slug } = await params; + try { + const data = await getCachedDataset(slug); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching dataset:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts new file mode 100644 index 00000000..3ad4c15d --- /dev/null +++ b/packages/app/src/app/api/v1/datasets/route.ts @@ -0,0 +1,25 @@ +import { NextResponse } from 'next/server'; + +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedDatasets = cachedQuery((): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.listDatasets()); + return listDatasets(getDb()); +}, 'datasets'); + +/** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards). 
*/ +export async function GET() { + try { + const data = await getCachedDatasets(); + return cachedJson(data); + } catch (error) { + console.error('Error fetching datasets:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts new file mode 100644 index 00000000..3afa5d41 --- /dev/null +++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts @@ -0,0 +1,54 @@ +import { STATS_VERSION } from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getDerivedAgenticMetrics, + type DerivedAgenticMetricMap, +} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: the response is one entry per id with two numbers, but the +// derivation work parses thousands of JSONL records per blob — cache the +// computed result so a chart-refresh hits the warm path. +// +// The cache key is derived from STATS_VERSION (the payload governs the derived +// metrics read out of `aggregate_stats`). blobSet is write-once and nothing +// purges post-backfill, so a hand-written version string would serve stale +// data forever after a bump — deriving the key from the constant means a +// STATS_VERSION bump automatically rolls the cache namespace. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). 
*/ +export const CACHE_KEY_PREFIX = `derived-agentic-metrics-v${STATS_VERSION}`; + +const getCachedDerivedAgenticMetrics = cachedQuery( + (ids: number[]): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getDerivedAgenticMetrics(ids)); + return getDerivedAgenticMetrics(getDb(), ids); + }, + CACHE_KEY_PREFIX, + { blobOnly: true }, +); + +/** + * GET /api/v1/derived-agentic-metrics?ids=1,2,3 + * + * Returns per-id derived metrics computed live from the stored aiperf + * profile_export.jsonl blobs: + * - normalized_session_time_s: mean across sessions of session e2e time + * (Σ per-turn request_latency) rescaled by mean_load / session_load. + * - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT) + * across every turn in every session. + * - p75/p90_normalized_e2e_400_s: percentile of per-request + * TTFT + 399 × observed ITL. + * + * Ids without a trace_replay blob or with unparseable records are omitted. + */ +export const GET = idsQueryRoute({ + maxIds: 200, + logLabel: 'derived agentic metrics', + fetch: getCachedDerivedAgenticMetrics, +}); diff --git a/packages/app/src/app/api/v1/id-routes.test.ts b/packages/app/src/app/api/v1/id-routes.test.ts new file mode 100644 index 00000000..32499e99 --- /dev/null +++ b/packages/app/src/app/api/v1/id-routes.test.ts @@ -0,0 +1,136 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +vi.mock('@/lib/api-cache', () => ({ + cachedJson: (data: unknown) => Response.json(data), +})); + +import { NextRequest, NextResponse } from 'next/server'; + +import { idQueryRoute, idsQueryRoute, parseIdsParam } from './id-routes'; + +function req(url: string): NextRequest { + return new NextRequest(new URL(url, 'http://localhost')); +} + +beforeEach(() => { + vi.clearAllMocks(); +}); + +describe('parseIdsParam', () => { + it('parses, dedupes, and sorts ids ascending', () => { + const result = parseIdsParam(req('/x?ids=3, 1,2,3'), 200); + expect(result).toEqual([1, 2, 3]); + }); + + 
it('drops non-finite and non-positive ids', () => { + const result = parseIdsParam(req('/x?ids=abc,-1,0,5'), 200); + expect(result).toEqual([5]); + }); + + it('returns 400 when the param is missing', async () => { + const result = parseIdsParam(req('/x'), 200); + expect(result).toBeInstanceOf(NextResponse); + const res = result as NextResponse; + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('ids query param is required'); + }); + + it('returns 400 when no valid ids remain', async () => { + const result = parseIdsParam(req('/x?ids=abc,-2'), 200); + expect(result).toBeInstanceOf(NextResponse); + const res = result as NextResponse; + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('no valid ids provided'); + }); + + it('returns 400 when the id count exceeds maxIds', async () => { + const result = parseIdsParam(req('/x?ids=1,2,3'), 2); + expect(result).toBeInstanceOf(NextResponse); + const res = result as NextResponse; + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('too many ids (max 2)'); + }); +}); + +describe('idsQueryRoute', () => { + it('fetches with sorted deduped ids and returns the payload', async () => { + const fetch = vi.fn().mockResolvedValue({ 1: 'a', 2: 'b' }); + const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch }); + + const res = await GET(req('/x?ids=2,1,2')); + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ 1: 'a', 2: 'b' }); + expect(fetch).toHaveBeenCalledWith([1, 2]); + }); + + it('returns 400 without calling fetch when ids are invalid', async () => { + const fetch = vi.fn(); + const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch }); + + const res = await GET(req('/x')); + expect(res.status).toBe(400); + expect(fetch).not.toHaveBeenCalled(); + }); + + it('returns 500 and logs when the fetch throws', async () => { + const consoleSpy = vi.spyOn(console, 
'error').mockImplementation(() => {}); + const fetch = vi.fn().mockRejectedValue(new Error('boom')); + const GET = idsQueryRoute({ maxIds: 200, logLabel: 'things', fetch }); + + const res = await GET(req('/x?ids=1')); + expect(res.status).toBe(500); + const body = await res.json(); + expect(body.error).toBe('Internal server error'); + expect(consoleSpy).toHaveBeenCalledWith('Error fetching things:', expect.any(Error)); + consoleSpy.mockRestore(); + }); +}); + +describe('idQueryRoute', () => { + it('fetches by id and returns the payload', async () => { + const fetch = vi.fn().mockResolvedValue({ value: 42 }); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req('/x?id=7')); + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ value: 42 }); + expect(fetch).toHaveBeenCalledWith(7); + }); + + it.each(['/x', '/x?id=abc', '/x?id=0'])('returns 400 for %s', async (url) => { + const fetch = vi.fn(); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req(url)); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe('id is required (benchmark_result_id)'); + expect(fetch).not.toHaveBeenCalled(); + }); + + it('returns 404 when the fetch yields null', async () => { + const fetch = vi.fn().mockResolvedValue(null); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req('/x?id=7')); + expect(res.status).toBe(404); + const body = await res.json(); + expect(body.error).toBe('Not found'); + }); + + it('returns 500 and logs when the fetch throws', async () => { + const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + const fetch = vi.fn().mockRejectedValue(new Error('boom')); + const GET = idQueryRoute({ logLabel: 'thing', fetch }); + + const res = await GET(req('/x?id=7')); + expect(res.status).toBe(500); + const body = await res.json(); + expect(body.error).toBe('Internal server error'); + 
expect(consoleSpy).toHaveBeenCalledWith('Error fetching thing:', expect.any(Error)); + consoleSpy.mockRestore(); + }); +}); diff --git a/packages/app/src/app/api/v1/id-routes.ts b/packages/app/src/app/api/v1/id-routes.ts new file mode 100644 index 00000000..fea9221b --- /dev/null +++ b/packages/app/src/app/api/v1/id-routes.ts @@ -0,0 +1,85 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { cachedJson } from '@/lib/api-cache'; + +/** + * Shared GET-handler factories for the agentic benchmark routes, which all + * key off `benchmark_results.id`. Two shapes exist: + * - bulk `?ids=1,2,3` routes returning a map keyed by id + * - single `?id=N` routes returning one payload or 404 + * + * Both preserve the v1 error contract: 400 with `{error}` for bad params, + * 404 `{error: 'Not found'}` when a single-id lookup misses, and 500 + * `{error: 'Internal server error'}` (with a console.error) on query failure. + * Success payloads go through `cachedJson` for CDN caching + gzip. + */ + +/** + * Parse, dedupe, validate, and ascending-sort the `ids` query param. + * Sorted so the same set of ids in any order hits the same cache entry. + * Returns a NextResponse (400) when the param is missing, empty, or too long. + */ +export function parseIdsParam(request: NextRequest, maxIds: number): number[] | NextResponse { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > maxIds) { + return NextResponse.json({ error: `too many ids (max ${maxIds})` }, { status: 400 }); + } + return ids.toSorted((a, b) => a - b); +} + +/** Build a GET handler for a bulk `?ids=…` route. 
*/ +export function idsQueryRoute(options: { + maxIds: number; + /** Human-readable name used in the 500-path console.error. */ + logLabel: string; + fetch: (ids: number[]) => Promise; +}): (request: NextRequest) => Promise { + const { maxIds, logLabel, fetch } = options; + return async (request: NextRequest) => { + const ids = parseIdsParam(request, maxIds); + if (ids instanceof NextResponse) return ids; + try { + return cachedJson(await fetch(ids)); + } catch (error) { + console.error(`Error fetching ${logLabel}:`, error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } + }; +} + +/** Build a GET handler for a single `?id=N` route (404 when the fetch misses). */ +export function idQueryRoute(options: { + logLabel: string; + fetch: (id: number) => Promise; +}): (request: NextRequest) => Promise { + const { logLabel, fetch } = options; + return async (request: NextRequest) => { + const id = Number(request.nextUrl.searchParams.get('id')); + if (!id || !Number.isFinite(id)) { + return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 }); + } + try { + const data = await fetch(id); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error(`Error fetching ${logLabel}:`, error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } + }; +} diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts new file mode 100644 index 00000000..89b599af --- /dev/null +++ b/packages/app/src/app/api/v1/request-timeline/route.ts @@ -0,0 +1,42 @@ +import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getRequestTimeline, + type 
RequestTimeline, +} from '@semianalysisai/inferencex-db/queries/request-timeline'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// Key derived from REQUEST_TIMELINE_VERSION (governs the `request_timeline` +// payload). The blob cache is write-once with no post-backfill purge, so the +// version-derived key is what rolls the namespace on a bump — a hand-written +// string would serve stale blob-cached timelines forever. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). */ +export const CACHE_KEY_PREFIX = `request-timeline-v${REQUEST_TIMELINE_VERSION}`; + +const getCachedRequestTimeline = cachedQuery( + (id: number): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getRequestTimeline(id)); + return getRequestTimeline(getDb(), id); + }, + CACHE_KEY_PREFIX, + { blobOnly: true }, +); + +/** + * GET /api/v1/request-timeline?id=N + * + * Returns the per-request Gantt timeline for one agentic benchmark point. + * Each request entry has ns-from-start offsets for credit/start/ack/end, + * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 404 if the + * point has no stored profile_export.jsonl blob. 
+ */ +export const GET = idQueryRoute({ + logLabel: 'request timeline', + fetch: getCachedRequestTimeline, +}); diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts new file mode 100644 index 00000000..45eafef4 --- /dev/null +++ b/packages/app/src/app/api/v1/trace-availability/route.ts @@ -0,0 +1,29 @@ +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceAvailability, + type TraceAvailabilityMap, +} from '@semianalysisai/inferencex-db/queries/trace-availability'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +const getCachedTraceAvailability = cachedQuery( + (ids: number[]): Promise => getTraceAvailability(getDb(), ids), + 'trace-availability', +); + +/** + * GET /api/v1/trace-availability?ids=1,2,3 + * + * Returns `{[id]: true}` for ids that have a stored trace_replay blob. + * Lightweight presence check used by the scatter tooltip to decide whether + * to render the "View charts" button — see queries/trace-availability.ts. 
+ */ +export const GET = idsQueryRoute({ + maxIds: 500, + logLabel: 'trace availability', + fetch: getCachedTraceAvailability, +}); diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts new file mode 100644 index 00000000..4d3014ab --- /dev/null +++ b/packages/app/src/app/api/v1/trace-histograms/route.ts @@ -0,0 +1,47 @@ +import { REQUEST_TIMELINE_VERSION } from '@semianalysisai/inferencex-db/etl/compute-request-timeline'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getTraceHistograms, + type TraceHistogramMap, +} from '@semianalysisai/inferencex-db/queries/trace-histograms'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idsQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB +// unstable_cache limit (each point carries one int per request, ~500-1000+ +// requests for agentic), which manifests as a 500 from the route. Blob +// storage lets us cache the larger response without losing the warm-cache hit. +// +// Key derived from REQUEST_TIMELINE_VERSION: the histograms are read out of the +// `request_timeline` payload (getTraceHistograms keys its fast path off that +// constant). The blob cache is write-once with no post-backfill purge, so the +// version-derived key is what rolls the namespace on a bump — the previously +// unversioned key would serve stale histograms forever. 
+export const CACHE_KEY_PREFIX = `trace-histograms-v${REQUEST_TIMELINE_VERSION}`; + +const getCachedTraceHistograms = cachedQuery( + (ids: number[]): Promise => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getTraceHistograms(ids)); + return getTraceHistograms(getDb(), ids); + }, + CACHE_KEY_PREFIX, + { blobOnly: true }, +); + +/** + * GET /api/v1/trace-histograms?ids=1,2,3 + * + * Returns per-request ISL/OSL arrays parsed from the stored aiperf + * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`. + * Ids without a trace_replay blob are omitted from the response. + */ +export const GET = idsQueryRoute({ + maxIds: 200, + logLabel: 'trace histograms', + fetch: getCachedTraceHistograms, +}); diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts new file mode 100644 index 00000000..2d3554a4 --- /dev/null +++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts @@ -0,0 +1,42 @@ +import { CHART_SERIES_VERSION } from '@semianalysisai/inferencex-db/etl/compute-chart-series'; +import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { + getTraceServerMetrics, + type TraceServerMetrics, +} from '@semianalysisai/inferencex-db/queries/trace-server-metrics'; + +import { cachedQuery } from '@/lib/api-cache'; + +import { idQueryRoute } from '../id-routes'; + +export const dynamic = 'force-dynamic'; + +// Key derived from CHART_SERIES_VERSION (governs the `chart_series` payload). +// The blob cache is write-once with no post-backfill purge, so the +// version-derived key is what rolls the namespace on a bump — a hand-written +// string would serve stale blob-cached series forever. +/** Version-derived blob-cache key namespace (exported for the key-derivation test). 
*/ +export const CACHE_KEY_PREFIX = `trace-server-metrics-v${CHART_SERIES_VERSION}`; + +const getCachedTraceServerMetrics = cachedQuery( + (id: number): Promise => { + if (JSON_MODE) return jsonProvider.getTraceServerMetrics(id); + return getTraceServerMetrics(getDb(), id); + }, + CACHE_KEY_PREFIX, + { blobOnly: true }, +); + +/** + * GET /api/v1/trace-server-metrics?id=N + * + * Returns parsed time-series for the agentic detail view: KV cache usage, + * prefix cache hit rate per interval, queue depth, and per-source prompt + * token rates. Times are in seconds from benchmark start. 404 if the point + * has no stored server_metrics_export.json blob. + */ +export const GET = idQueryRoute({ + logLabel: 'trace server metrics', + fetch: getCachedTraceServerMetrics, +}); diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx new file mode 100644 index 00000000..5bc8fea9 --- /dev/null +++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx @@ -0,0 +1,45 @@ +import { Suspense } from 'react'; +import type { Metadata } from 'next'; + +import { AgenticGate } from '@/components/agentic-gate'; +import { ConversationView } from '@/components/datasets/conversation-view'; +import { SITE_URL } from '@semianalysisai/inferencex-constants'; + +interface Props { + params: Promise<{ slug: string; convId: string }>; +} + +export async function generateMetadata({ params }: Props): Promise { + const { slug, convId } = await params; + // App Router has already decoded the dynamic segment exactly once, so `convId` + // is the raw conversation id here. Re-encode for the canonical URL. 
+ const short = convId.slice(0, 12); + const title = `Conversation ${short} | ${slug}`; + const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`; + return { + title, + description, + alternates: { + canonical: `${SITE_URL}/datasets/${slug}/conversations/${encodeURIComponent(convId)}`, + }, + robots: { index: false }, // per-conversation pages are too numerous to index + }; +} + +export default async function ConversationPage({ params }: Props) { + const { slug, convId } = await params; + // `convId` is already decoded once by App Router — pass it straight through. + // A second decodeURIComponent here would over-decode (and throw for ids that + // contain a literal '%'). ConversationView re-encodes when it builds the API URL. + return ( + +
+
+ + + +
+
+
+ ); +} diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx new file mode 100644 index 00000000..c853a695 --- /dev/null +++ b/packages/app/src/app/datasets/[slug]/page.tsx @@ -0,0 +1,35 @@ +import type { Metadata } from 'next'; + +import { AgenticGate } from '@/components/agentic-gate'; +import { DatasetDetail } from '@/components/datasets/dataset-detail'; +import { SITE_URL } from '@semianalysisai/inferencex-constants'; + +interface Props { + params: Promise<{ slug: string }>; +} + +export async function generateMetadata({ params }: Props): Promise { + const { slug } = await params; + const title = `${slug} | Agentic Datasets`; + const description = `Distributions, token statistics, and per-conversation flamegraphs for the ${slug} agentic trace dataset.`; + return { + title, + description, + alternates: { canonical: `${SITE_URL}/datasets/${slug}` }, + openGraph: { title: `${title} | InferenceX`, description, url: `${SITE_URL}/datasets/${slug}` }, + twitter: { title: `${title} | InferenceX`, description }, + }; +} + +export default async function DatasetDetailPage({ params }: Props) { + const { slug } = await params; + return ( + +
+
+ +
+
+
+ ); +} diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx new file mode 100644 index 00000000..711e0dbc --- /dev/null +++ b/packages/app/src/app/datasets/page.tsx @@ -0,0 +1,108 @@ +import type { Metadata } from 'next'; + +import { AgenticGate } from '@/components/agentic-gate'; +import { Card } from '@/components/ui/card'; +import { JsonLd } from '@/components/json-ld'; +import { DatasetList } from '@/components/datasets/dataset-list'; +import { SITE_URL } from '@semianalysisai/inferencex-constants'; + +const DESCRIPTION = + 'The real Claude Code agentic conversation traces that the InferenceX agentic benchmark replays — methodology, distributions, and per-conversation flamegraphs.'; + +export const metadata: Metadata = { + title: 'Agentic Datasets', + description: DESCRIPTION, + alternates: { canonical: `${SITE_URL}/datasets` }, + openGraph: { + title: 'Agentic Datasets | InferenceX', + description: DESCRIPTION, + url: `${SITE_URL}/datasets`, + }, + twitter: { title: 'Agentic Datasets | InferenceX', description: DESCRIPTION }, +}; + +const jsonLd = { + '@context': 'https://schema.org', + '@type': 'CollectionPage', + name: 'InferenceX Agentic Datasets', + description: DESCRIPTION, + url: `${SITE_URL}/datasets`, +}; + +export default function DatasetsPage() { + return ( + + + + ); +} + +function DatasetsPageContent() { + return ( +
+ +
+
+ +

+ Agentic Benchmark Datasets +

+

+ InferenceX's agentic benchmark doesn't replay synthetic prompts — it replays + real Claude Code coding sessions captured as conversation traces. + Each trace is a full multi-turn session: the main agent's turns plus any + subagents it spawned, with per-turn input/output token counts and the 64-token + KV-cache block hashes needed to reconstruct prefix-cache reuse. The traces are + published openly on HuggingFace under semianalysisai/cc-traces-weka-*{' '} + (apache-2.0). +

+ +

+ How traces are captured +

+

+ Production Claude Code sessions are recorded through a logging proxy that captures + every API request: its input and output token counts, the model used, timing (TTFT, + inter-token latency), and a list of hash_ids — one per 64-token KV block + of the request's input. Subagent invocations are grouped under their parent turn. + No prompt or completion text is stored; only token counts and block hashes, so the + corpus is shareable while remaining a faithful workload for replay. +

+ +

+ Cached prefix vs uncached suffix +

+

+ Agentic workloads are dominated by prefix reuse: each turn resends the growing + conversation, so most of its input is already in the KV cache from prior turns. We + reconstruct this exactly. Walking a conversation in order under an idealized infinite + cache, a turn's cached prefix is its longest run of leading{' '} + hash_ids already seen; the rest is the uncached suffix{' '} + that must be (re)computed. Blocks are 64 tokens; the split is clamped so cached + + uncached equals the turn's effective input even on a partial final block. + Subagents run against a snapshot of the parent cache at spawn (their context is + separate and is not folded back into the parent). +

+ +

Dataset variants

+
    +
  • + full — every captured request, unmodified. +
  • +
  • + 256k — requests whose input + output exceeds 256,000 tokens are + dropped so every turn fits a 256k context window (used when benchmarking engines + configured for a 256k max context). +
  • +
+
+
+ +
+

Datasets

+ +
+
+
+ ); +} diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 6e7afb0b..8bd10c71 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -12,6 +12,8 @@ import { useState, } from 'react'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; + // useLayoutEffect warns during SSR; alias to useEffect on the server (no-op there anyway). const useIsomorphicLayoutEffect = typeof window === 'undefined' ? useEffect : useLayoutEffect; @@ -19,11 +21,6 @@ function isEnumValue>(e: T, v: string): v is T[ return (Object.values(e) as string[]).includes(v); } -const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; -const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; - -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; - import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; import { useUrlState } from '@/hooks/useUrlState'; @@ -38,8 +35,22 @@ import { } from '@/lib/data-mappings'; import { computeAutoSwitchDecision } from '@/lib/unofficial-run-auto-switch'; import { countCurvesByPrecision, resolveEffectivePrecisions } from '@/lib/default-precisions'; +import { resolveEffectiveSequence } from '@/lib/default-sequence'; +import { useFeatureGate } from '@/lib/use-feature-gate'; import type { AvailabilityRow, WorkflowInfoResponse } from '@/lib/api'; +const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; +const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; + +// Placeholder for the public (non-null) `effectiveSequence` during the window +// before availability has loaded. It must be a fixed-seq scenario — never +// AgenticTraces — so the scenario selector doesn't flash "Agentic Traces" for a +// fixed-seq-only model while the chart shows its loading skeleton. `8k/1k` is +// the pre-agentic default for non-agentic models. 
Consumers that must not act on +// an unresolved sequence gate on `sequenceResolved` instead. +// (Declared after the import block so it never references `Sequence` above its import.) +const PRE_AVAILABILITY_SEQUENCE = Sequence.EightK_OneK; + interface RunInfo { runId: string; runDate: string; @@ -66,6 +77,15 @@ export interface GlobalFilterContextType { // Effective (validated) values effectiveSequence: Sequence; + /** + * Whether `effectiveSequence` reflects the selected model's real availability + * (DB or unofficial run) rather than the pre-load placeholder. False during + * the brief window before availability loads. Consumers that trigger data + * fetches or render sequence-dependent labels should gate on this so a + * fixed-seq-only model never fires an agentic fetch or flashes "Agentic + * Traces" before availability settles. + */ + sequenceResolved: boolean; effectivePrecisions: string[]; // Run date & run ID @@ -100,7 +120,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record { const runs: Record = {}; for (const run of data.runs) { const runId = String(run.github_run_id); - const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id); + const runChangelogs = data.changelogs.filter( + (c) => String(c.workflow_run_id) === String(run.github_run_id), + ); runs[runId] = { runId, runDate: run.created_at, @@ -140,6 +162,14 @@ export function GlobalFilterProvider({ }) { const { hasUrlParam, getUrlParam, setUrlParams } = useUrlState(); + // Agentic surfaces are hidden behind the shared konami-code feature gate + // (default OFF until agentic launches). When locked, agentic sequences are + // filtered out of `availableSequences` below — the single chokepoint that + // cascades: no agentic default (resolveEffectiveSequence falls to 8k/1k), no + // "Agentic Traces" scenario-selector entry, and no agentic x-axis mode / + // percentile selector (those key off effectiveSequence === AgenticTraces). 
+ const agenticGateUnlocked = useFeatureGate(); + // ── Core filter state ───────────────────────────────────────────────────── const [selectedModel, setSelectedModel] = useState( () => initialModel ?? Model.DeepSeek_V4_Pro, @@ -147,7 +177,11 @@ export function GlobalFilterProvider({ const [selectedSequence, setSelectedSequence] = useState(() => { if (initialSequence) return initialSequence; - return Sequence.EightK_OneK; + const urlSeq = getUrlParam('i_seq'); + if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence; + // Prefer Agentic Traces by default when the selected model has it; the + // effectiveSequence fallback below handles models without agentic data. + return Sequence.AgenticTraces; }); const initialValidPrecisions = useMemo( @@ -269,26 +303,61 @@ export function GlobalFilterProvider({ } }, [unofficialAvailable, selectedModel]); - // Sequences available for the selected model (DB ∪ unofficial run for this model) + // Sequences available for the selected model (DB ∪ unofficial run for this model). + // + // When the agentic feature gate is locked (default), agentic sequences are + // dropped from every branch — including the static SEQUENCE_OPTIONS fallback — + // so no agentic scenario is ever selectable or defaulted. This is the single + // gate chokepoint for the main inference chart's agentic surfaces. const availableSequences = useMemo(() => { + const dropAgentic = (seqs: Sequence[]) => + agenticGateUnlocked ? seqs : seqs.filter((s) => s !== Sequence.AgenticTraces); const unofficialSeqs = unofficialAvailable .filter((a) => a.model === selectedModel) .map((a) => a.sequence as Sequence); if (!availabilityRows) { - return unofficialSeqs.length > 0 ? [...new Set(unofficialSeqs)] : SEQUENCE_OPTIONS; + return unofficialSeqs.length > 0 + ? 
dropAgentic([...new Set(unofficialSeqs)]) + : dropAgentic(SEQUENCE_OPTIONS); } - const dbSeqs = modelRows - .map((r) => islOslToSequence(r.isl, r.osl)) - .filter((s): s is Sequence => s !== null); - const merged = [...new Set([...dbSeqs, ...unofficialSeqs])]; - return merged.length > 0 ? merged : SEQUENCE_OPTIONS; - }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]); - - // Synchronously validated sequence - const effectiveSequence = useMemo(() => { - if (availableSequences.includes(selectedSequence)) return selectedSequence; - return availableSequences[0] ?? selectedSequence; - }, [availableSequences, selectedSequence]); + const dbSeqs = modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null); + const merged = dropAgentic([...new Set([...dbSeqs, ...unofficialSeqs])]); + return merged.length > 0 ? merged : dropAgentic(SEQUENCE_OPTIONS); + }, [availabilityRows, modelRows, unofficialAvailable, selectedModel, agenticGateUnlocked]); + + // Whether we actually know the selected model's sequences yet. Availability + // may arrive from the DB (`availabilityRows`) OR from a loaded unofficial run + // (`unofficialAvailable` for this model) — either source lets us resolve a + // trustworthy effectiveSequence. Until then `availableSequences` is the static + // SEQUENCE_OPTIONS fallback (which contains AgenticTraces), so resolving + // eagerly would fetch + label an agentic scenario for fixed-seq-only models, + // then snap once availability lands (flash + wasted request). + const availabilityLoaded = useMemo( + () => + availabilityRows !== undefined || unofficialAvailable.some((a) => a.model === selectedModel), + [availabilityRows, unofficialAvailable, selectedModel], + ); + + // Synchronously validated sequence. 
+ // + // `resolveEffectiveSequence` returns null while availability is still loading + // — we surface that as `sequenceResolved` so InferenceContext can gate the + // benchmark fetch until the real sequence is known (no agentic fetch fires for + // a fixed-seq-only model). For the non-null public `effectiveSequence` value + // we substitute a fixed-seq scenario (never AgenticTraces) during that window + // so the scenario selector never flashes "Agentic Traces"; the chart shows its + // normal loading skeleton until `sequenceResolved` flips true. + const resolvedSequence = useMemo( + () => + resolveEffectiveSequence({ + selectedSequence, + availableSequences, + availabilityLoaded, + }), + [selectedSequence, availableSequences, availabilityLoaded], + ); + const sequenceResolved = resolvedSequence !== null; + const effectiveSequence = resolvedSequence ?? PRE_AVAILABILITY_SEQUENCE; // Precisions available for the selected model + sequence (DB ∪ unofficial run) const availablePrecisions = useMemo(() => { @@ -298,7 +367,7 @@ export function GlobalFilterProvider({ if (!availabilityRows) { return unofficialPrecs.length > 0 ? [...new Set(unofficialPrecs)].toSorted() : ['fp4']; } - const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const dbPrecs = rows.map((r) => r.precision); const merged = [...new Set([...dbPrecs, ...unofficialPrecs])].toSorted(); return merged.length > 0 ? merged : ['fp4']; @@ -307,10 +376,7 @@ export function GlobalFilterProvider({ // Curve count per precision (distinct hw/framework/spec/disagg series) for the // selected model + sequence — drives the auto default toward the densest one. 
const precisionCurveCounts = useMemo( - () => - countCurvesByPrecision( - modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence), - ), + () => countCurvesByPrecision(modelRows.filter((r) => rowToSequence(r) === effectiveSequence)), [modelRows, effectiveSequence], ); @@ -346,7 +412,7 @@ export function GlobalFilterProvider({ // Dates available for selected model + sequence + precisions const availableDates = useMemo(() => { if (!availabilityRows) return []; - const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision)); if (rows.length === 0) { return [...new Set(seqRows.map((r) => r.date))].toSorted(); @@ -438,7 +504,11 @@ export function GlobalFilterProvider({ g_model: selectedModel, g_rundate: selectedRunDate, g_runid: selectedRunId, - i_seq: effectiveSequence, + // Don't pin the sequence to the URL until it's resolved from real + // availability — writing the pre-load placeholder (8k/1k) would clobber a + // shared `?i_seq=agentic-traces` link before the model's availability + // confirms it has agentic data. + i_seq: sequenceResolved ? effectiveSequence : undefined, // Only pin the precision in the URL once chosen explicitly; in auto mode // leave it out so the link keeps following the per-model densest default. i_prec: precisionExplicit ? 
effectivePrecisions.join(',') : undefined, @@ -448,6 +518,7 @@ export function GlobalFilterProvider({ selectedRunDate, selectedRunId, effectiveSequence, + sequenceResolved, effectivePrecisions, precisionExplicit, setUrlParams, @@ -462,6 +533,7 @@ export function GlobalFilterProvider({ selectedPrecisions, setSelectedPrecisions, effectiveSequence, + sequenceResolved, effectivePrecisions, selectedRunDate: effectiveRunDate, setSelectedRunDate: setSelectedRunDateManual, @@ -484,6 +556,7 @@ export function GlobalFilterProvider({ selectedSequence, selectedPrecisions, effectiveSequence, + sequenceResolved, effectivePrecisions, effectiveRunDate, setSelectedRunDateManual, diff --git a/packages/app/src/components/agentic-gate.tsx b/packages/app/src/components/agentic-gate.tsx new file mode 100644 index 00000000..9fa0aa37 --- /dev/null +++ b/packages/app/src/components/agentic-gate.tsx @@ -0,0 +1,41 @@ +'use client'; + +import { notFound } from 'next/navigation'; +import { useEffect, useState } from 'react'; + +import { FEATURE_GATE_KEY, useFeatureGate } from '@/lib/use-feature-gate'; + +/** + * Client gate for the standalone agentic product pages (`/datasets/*`, + * `/inference/agentic/[id]`). These are server-rendered routes with no nav + * entry once the header link is hidden, so a direct URL visit is the only way + * in. When the shared konami-code feature gate (see {@link useFeatureGate}) is + * locked — the default until agentic launches — we `notFound()` so the route + * behaves like a clean 404 instead of publicly exposing agentic surfaces. + * + * The gate lives in localStorage, which the server can't read, so we resolve it + * on the client: read the flag synchronously on mount, and until then render + * nothing (no content flash before a potential 404). QA can unlock at runtime + * with ↑↑↓↓ (the same mechanism as the Hidden tab dropdown) or by seeding + * `localStorage['inferencex-feature-gate'] = '1'`, after which these pages + * render in full. 
+ */ +export function AgenticGate({ children }: { children: React.ReactNode }) { + const unlocked = useFeatureGate(); + // Distinguish "haven't read localStorage yet" from "read it, gate is locked": + // useFeatureGate() returns false on the server and on the very first client + // render before its mount effect runs, so we must not 404 during that window. + const [resolved, setResolved] = useState(false); + useEffect(() => setResolved(true), []); + + if (!resolved) return null; + if (!unlocked) { + // Belt-and-suspenders: re-read the flag directly in case an unlock event + // hasn't propagated yet on this first resolved render. + if (typeof window !== 'undefined' && localStorage.getItem(FEATURE_GATE_KEY) === '1') { + return <>{children}; + } + notFound(); + } + return <>{children}; +} diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx new file mode 100644 index 00000000..415a430d --- /dev/null +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -0,0 +1,109 @@ +'use client'; + +import Link from 'next/link'; +import { useSearchParams } from 'next/navigation'; + +import { Card } from '@/components/ui/card'; +import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; +import { useDatasetConversation } from '@/hooks/api/use-datasets'; +import { compact, formatShare } from './format'; +import { Stat } from './stat'; + +export function ConversationView({ slug, convId }: { slug: string; convId: string }) { + const { data, isLoading, isError } = useDatasetConversation(slug, convId); + + // Deep-link target from a request-timeline click: ?raw= or ?turn=[&sa=]. + // useSearchParams (not a one-shot window.location read) so the params are + // present on the very first client-side navigation, not just after a reload. 
+ const params = useSearchParams(); + const turnRaw = params.get('turn'); + const sourceRaw = params.get('raw'); + const sourceInner = params.get('inner'); + const highlight = { + turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null, + raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null, + inner: sourceInner !== null && /^\d+$/u.test(sourceInner) ? Number(sourceInner) : null, + agent: params.get('sa'), + }; + + if (isLoading) { + return ( +
Loading conversation…
+ ); + } + if (isError || !data) { + return ( +
+ Conversation not found.{' '} + + Back to dataset + +
+ ); + } + + const cachedPct = formatShare(data.total_cached, data.total_in); + + return ( +
+
+
+ + Datasets + + / + + {slug} + + / + conversation +
+

+ {data.conv_id} +

+ {data.models.length > 0 && ( +
+ {data.models.map((m) => ( + + {m} + + ))} +
+ )} +
+ + +
+ + + + + + +
+
+ + +

Flamegraph

+

+ One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default — + click a group to expand it. Each bar splits input into cached prefix and uncached suffix, + plus generated output. Timestamps are elapsed from conversation start; subagent headers + show their full active range. A colored bracket on the left groups requests in the same + main-agent or subagent scope whose original execution intervals overlapped (ran in + parallel). +

+ +
+
+ ); +} diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx new file mode 100644 index 00000000..609a4c8f --- /dev/null +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -0,0 +1,320 @@ +'use client'; + +import { useState } from 'react'; +import Link from 'next/link'; + +import { Card } from '@/components/ui/card'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { DistributionCard } from '@/components/datasets/distribution-card'; +import { + useDataset, + useDatasetConversations, + type ConversationSort, +} from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; +import { compact, formatPct, formatShare, perConversation } from './format'; +import { Stat } from './stat'; + +const PAGE = 50; + +const SORTS: { value: ConversationSort; label: string }[] = [ + { value: 'tokens', label: 'Total input ↓' }, + { value: 'turns', label: 'Turns ↓' }, + { value: 'subagents', label: 'Subagent groups ↓' }, + { value: 'id', label: 'Conversation ID' }, +]; + +export function DatasetDetail({ slug }: { slug: string }) { + const { data: dataset, isLoading, isError } = useDataset(slug); + const [search, setSearch] = useState(''); + const [sort, setSort] = useState('tokens'); + const [page, setPage] = useState(0); + + const { data: convs, isFetching } = useDatasetConversations({ + slug, + search, + sort, + limit: PAGE, + offset: page * PAGE, + }); + + if (isLoading) { + return
Loading dataset…
; + } + if (isError || !dataset) { + return ( +
+ Dataset not found.{' '} + + Back to datasets + +
+ ); + } + + const s = dataset.summary ?? {}; + const cd = dataset.chart_data ?? {}; + const total = convs?.total ?? 0; + const pageCount = Math.ceil(total / PAGE); + + return ( +
+ {/* header */} +
+
+ + ← Datasets + +
+ + {dataset.description && ( +

{dataset.description}

+ )} +
+ + {/* summary stats */} + +
+ + + + + + + + +
+ {s.modelMix && Object.keys(s.modelMix).length > 0 && ( +
+
+ Model mix (turns) +
+
+ {Object.entries(s.modelMix) + .toSorted((a, b) => b[1] - a[1]) + .map(([model, count]) => ( + + {model} {compact(count)} + + ))} +
+
+ )} +
+ + {/* distribution cards */} +
+

Distributions

+
+ + + + + + + +
+
+ + {/* conversation list */} +
+
+

+ Conversations{' '} + ({total}) +

+
+ { + setSearch(e.target.value); + setPage(0); + }} + placeholder="Search by ID…" + className="h-8 w-40 rounded-md border border-border/40 bg-background px-2 text-xs outline-none focus:border-primary" + /> + +
+
+ + + + + + + + + + + + + + + {(convs?.items ?? []).map((c) => { + const cachedPct = formatShare(c.total_cached, c.total_in); + return ( + + + + + + + + + ); + })} + {!isFetching && (convs?.items.length ?? 0) === 0 && ( + + + + )} + +
ConversationTurnsSubagentsInputOutputCached
+ track('datasets_conversation_clicked', { slug })} + className="font-mono text-xs text-primary hover:underline" + > + {c.conv_id.slice(0, 20)}… + + {c.models.length > 0 && ( + + {c.models.length} model{c.models.length === 1 ? '' : 's'} + + )} + {c.num_turns}{c.num_subagent_groups}{compact(c.total_in)}{compact(c.total_out)} + {cachedPct} +
+ No conversations match. +
+
+ + {pageCount > 1 && ( +
+ + + Page {page + 1} of {pageCount} + + +
+ )} +
+
+ ); +} diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx new file mode 100644 index 00000000..d85d7eaa --- /dev/null +++ b/packages/app/src/components/datasets/dataset-list.tsx @@ -0,0 +1,86 @@ +'use client'; + +import Link from 'next/link'; + +import { Card } from '@/components/ui/card'; +import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; +import { compact, formatPct, perConversation } from './format'; + +function DatasetCard({ d }: { d: DatasetRecord }) { + const s = d.summary ?? {}; + const cachedPct = formatPct(s.cachedPct); + return ( + track('datasets_card_clicked', { slug: d.slug })} + className="block transition-colors hover:[&_*]:border-primary/40" + > + +
+

{d.label}

+ + {d.variant} + +
+ {d.description && ( +

{d.description}

+ )} +
+ + + + + + + + +
+
View dataset →
+
+ + ); +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} + +export function DatasetList() { + const { data, isLoading, isError } = useDatasets(); + + if (isLoading) { + return
Loading datasets…
; + } + if (isError || !data) { + return ( +
Failed to load datasets.
+ ); + } + if (data.length === 0) { + return ( +
+ No datasets ingested yet. +
+ ); + } + + return ( +
+ {data.map((d) => ( + + ))} +
+ ); +} diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx new file mode 100644 index 00000000..8adc02ee --- /dev/null +++ b/packages/app/src/components/datasets/distribution-card.tsx @@ -0,0 +1,220 @@ +'use client'; + +import { useMemo } from 'react'; + +import { Card } from '@/components/ui/card'; +import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover'; +import type { Distribution } from '@/hooks/api/use-datasets'; +import { compact } from './format'; + +interface DistributionCardProps { + title: string; + subtitle?: string; + unit: string; + distribution?: Distribution; + scale?: 'log' | 'linear'; + /** Format the x value (defaults to compact). e.g. percent for cached fraction. */ + formatValue?: (v: number) => string; +} + +const W = 720; +const H = 240; +const PAD = { top: 12, right: 16, bottom: 48, left: 52 }; + +/** + * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a + * themeable bar chart with p50/p75/p90/p95 guide lines and a hover tooltip. Bars are + * drawn at equal visual width; for log-scaled bins the edge labels are already + * log-spaced so the shape reads as a log histogram. + */ +export function DistributionCard({ + title, + subtitle, + unit, + distribution, + scale = 'linear', + formatValue = compact, +}: DistributionCardProps) { + const computed = useMemo(() => { + const bins = distribution?.bins ?? []; + if (bins.length === 0) return null; + const maxCount = Math.max(1, ...bins.map((b) => b.count)); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const n = bins.length; + const barW = innerW / n; + // Map a data value to an x pixel by locating its bin (positional — works for + // both linear and log bins since the edges are precomputed at ingest). + // Out-of-range values clamp to the first/last bin. 
+ const valueToX = (v: number): number => { + for (let i = 0; i < n; i++) { + if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) { + return PAD.left + (i + 0.5) * barW; + } + } + if (v <= bins[0].x0) return PAD.left + 0.5 * barW; + return PAD.left + (n - 0.5) * barW; + }; + return { bins, maxCount, innerW, innerH, n, barW, valueToX }; + }, [distribution]); + + if (!computed) { + return ( + +
{title}
+
+ No data +
+
+ ); + } + + const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed; + const stats = distribution?.stats; + + const guides: { label: string; value: number; color: string }[] = stats + ? [ + { label: 'p50', value: stats.median, color: '#3b82f6' }, + ...(typeof stats.p75 === 'number' + ? [{ label: 'p75', value: stats.p75, color: '#22c55e' }] + : []), + { label: 'p90', value: stats.p90, color: '#f59e0b' }, + ...(typeof stats.p95 === 'number' + ? [{ label: 'p95', value: stats.p95, color: '#ef4444' }] + : []), + ] + : []; + + // X tick labels from a few bin edges. + const tickIdxs = [0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1]; + + const resolve = (fraction: number) => { + const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n))); + const b = bins[i]; + const items: HoverItem[] = [ + { + color: 'currentColor', + label: 'Range', + value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`, + }, + { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() }, + ]; + return { items }; + }; + + return ( + +
+ {title} + {scale === 'log' && ( + + log scale + + )} +
+ {subtitle &&
{subtitle}
} + {stats && ( +
+ n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)} + {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}} · p90{' '} + {formatValue(stats.p90)} + {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}} · max{' '} + {formatValue(stats.max)} {unit} +
+ )} +
+ + {/* bars */} + {bins.map((b, i) => { + const h = (b.count / maxCount) * innerH; + const x = PAD.left + i * barW; + const y = PAD.top + (innerH - h); + return ( + + ); + })} + + {/* guide lines */} + {guides.map((g) => { + const x = valueToX(g.value); + return ( + + ); + })} + + {/* x axis */} + + {tickIdxs.map((i, k) => { + const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle'; + const x = PAD.left + (i + 0.5) * barW; + return ( + + {formatValue(bins[i].x0)} + + ); + })} + + {unit} + + + {/* guide legend */} + {guides.map((g, i) => ( + + + + {g.label} {formatValue(g.value)} + + + ))} + +
+
/** Placeholder rendered when a value is absent or not a finite number. */
const EMPTY_VALUE = '—';

/** Magnitude suffixes for `compact`, smallest first. */
const COMPACT_UNITS: readonly { divisor: number; suffix: string }[] = [
  { divisor: 1e3, suffix: 'k' },
  { divisor: 1e6, suffix: 'M' },
  { divisor: 1e9, suffix: 'B' },
];

/**
 * Compact number formatter for dataset token/count displays:
 * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82".
 *
 * The unit is chosen from the *rounded* value, so numbers just below a unit
 * boundary roll over to the next unit instead of printing a four-digit
 * mantissa (999_999 → "1.0M", not "1000.0k"; 999.6 → "1.0k", not "1000").
 *
 * @param n Value to format; may be negative.
 * @returns The compact label, or an em dash when `n` is NaN or ±Infinity.
 */
export function compact(n: number): string {
  if (!Number.isFinite(n)) return EMPTY_VALUE;
  const abs = Math.abs(n);
  // Sub-unit fractions keep two decimals so e.g. a 0.82 ratio stays readable.
  if (abs > 0 && abs < 1) return n.toFixed(2);
  if (abs < 1e3) {
    const rounded = Math.round(n);
    // Only return the plain integer when rounding did not reach 1000.
    if (Math.abs(rounded) < 1e3) return String(rounded);
  }
  for (const [i, unit] of COMPACT_UNITS.entries()) {
    const text = (n / unit.divisor).toFixed(1);
    const isLargestUnit = i === COMPACT_UNITS.length - 1;
    // A mantissa that rounds to >= 1000 belongs to the next unit up; the
    // largest unit has nowhere to roll over to, so it accepts anything.
    if (isLargestUnit || Math.abs(Number(text)) < 1e3) return `${text}${unit.suffix}`;
  }
  // Unreachable (the largest unit always returns); satisfies noImplicitReturns.
  return String(Math.round(n));
}

/** Format a per-conversation count without hiding a meaningful fractional mean. */
export function perConversation(n: number | undefined): string {
  if (typeof n !== 'number' || !Number.isFinite(n)) return EMPTY_VALUE;
  return n.toLocaleString(undefined, { maximumFractionDigits: 1 });
}

/**
 * Format a 0–1 fraction as a whole percent ("42%").
 *
 * @returns An em dash when the fraction is absent or not finite, matching
 *   `perConversation`.
 */
export function formatPct(fraction: number | undefined): string {
  if (typeof fraction !== 'number' || !Number.isFinite(fraction)) return EMPTY_VALUE;
  return `${(fraction * 100).toFixed(0)}%`;
}

/**
 * Percent share of `part` in `total` ("42%").
 *
 * @returns An em dash when `total` is not positive or the ratio is not finite.
 */
export function formatShare(part: number, total: number): string {
  if (!(total > 0)) return EMPTY_VALUE;
  const ratio = part / total;
  return Number.isFinite(ratio) ? `${(ratio * 100).toFixed(0)}%` : EMPTY_VALUE;
}
grids on dataset and conversation pages. */ +export function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph-model.ts b/packages/app/src/components/datasets/trace-flamegraph-model.ts new file mode 100644 index 00000000..2aff9ac3 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph-model.ts @@ -0,0 +1,422 @@ +/** + * Pure logic for the trace flamegraph: overlap detection, deep-link resolution, + * visible-row construction, and bracket-lane layout. No React/DOM — everything + * here is unit-testable directly. Rendering lives in trace-flamegraph.tsx. + */ + +import type { StructureNode } from '@/hooks/api/use-datasets'; + +// Kept distinct from token-segment colors. A row can carry multiple rails when +// it overlaps different requests during different parts of its lifetime. +export const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] as const; + +// Cap on simultaneously-drawn bracket lanes. A pathological conversation (e.g. a +// long-running session whose subagent fans out into hundreds of children with +// 15+ concurrent requests) can require dozens of lanes; left unbounded the +// gutter grows wide enough to push the bars off-screen AND emits one DOM node +// per lane per row (tens of thousands of empty divs). We bound it: lanes beyond +// the cap fold into the last "dense" lane, which stays readable for the common +// case (≤6 concurrent) and degrades gracefully for the outliers. +export const MAX_LANES = 6; + +export interface TimedRequest { + key: string; + startS?: number; + endS?: number; +} + +export interface RequestOverlapGroup { + id: string; + requestKeys: string[]; + startS: number; + endS: number; +} + +/** + * Find maximal sets of requests that were simultaneously in flight. + * Intervals are half-open, so one request ending exactly when another begins + * is serialized rather than parallel. 
/** A request whose start and end are both finite and whose duration is positive. */
type TimedInterval = TimedRequest & { startS: number; endS: number };

/** An overlap group before its display id has been assigned. */
type UnlabelledGroup = Omit<RequestOverlapGroup, 'id'>;

/** True when the request carries finite timestamps with `endS` strictly after `startS`. */
function hasPositiveDuration(request: TimedRequest): request is TimedInterval {
  const { startS, endS } = request;
  return (
    typeof startS === 'number' &&
    typeof endS === 'number' &&
    Number.isFinite(startS) &&
    Number.isFinite(endS) &&
    endS > startS
  );
}

/**
 * Find maximal sets of requests that were simultaneously in flight.
 *
 * The timeline is cut at every distinct start/end timestamp; the requests
 * covering a whole slice form one candidate set. A request that ends exactly
 * where another begins never shares a slice with it, so touching requests count
 * as serialized rather than parallel. Candidates that are strict subsets of a
 * larger candidate are dropped, which keeps a nested A/B pair from duplicating
 * an A/B/C group while still reporting A/B and B/C separately when those
 * overlaps happen at different times.
 *
 * @param requests Requests to inspect; entries without a finite, positive
 *   duration are ignored.
 * @param scopeKey Prefix for the generated group ids (`<scopeKey>-1`, …).
 * @returns Groups ordered by start, then end, then member keys. For a key set
 *   seen in several slices, `startS`/`endS` run from its earliest to its latest slice.
 */
export function findRequestOverlapGroups(
  requests: TimedRequest[],
  scopeKey = 'scope',
): RequestOverlapGroup[] {
  const timed = requests.filter(hasPositiveDuration);

  // Every distinct timestamp is a potential change in the in-flight set.
  const edges = new Set<number>();
  for (const request of timed) {
    edges.add(request.startS);
    edges.add(request.endS);
  }
  const boundaries = [...edges].toSorted((a, b) => a - b);

  // Candidate groups keyed by their sorted member keys, so a set seen in
  // several slices is recorded once with a widened time range.
  const candidates = new Map<string, UnlabelledGroup>();
  for (let i = 1; i < boundaries.length; i++) {
    const sliceStart = boundaries[i - 1]!;
    const sliceEnd = boundaries[i]!;
    if (sliceEnd <= sliceStart) continue;

    const activeKeys = timed
      .filter((request) => request.startS <= sliceStart && request.endS >= sliceEnd)
      .map((request) => request.key)
      .toSorted();
    if (activeKeys.length < 2) continue;

    const signature = activeKeys.join('\u0000');
    const seen = candidates.get(signature);
    candidates.set(
      signature,
      seen
        ? {
            requestKeys: activeKeys,
            startS: Math.min(seen.startS, sliceStart),
            endS: Math.max(seen.endS, sliceEnd),
          }
        : { requestKeys: activeKeys, startS: sliceStart, endS: sliceEnd },
    );
  }

  const everyCandidate = [...candidates.values()];
  const isCoveredByLargerSet = (candidate: UnlabelledGroup): boolean =>
    everyCandidate.some(
      (other) =>
        other.requestKeys.length > candidate.requestKeys.length &&
        candidate.requestKeys.every((key) => other.requestKeys.includes(key)),
    );

  return everyCandidate
    .filter((candidate) => !isCoveredByLargerSet(candidate))
    .toSorted(
      (a, b) =>
        a.startS - b.startS ||
        a.endS - b.endS ||
        a.requestKeys.join(',').localeCompare(b.requestKeys.join(',')),
    )
    .map((group, position) => ({ ...group, id: `${scopeKey}-${position + 1}` }));
}

/** One parallel group a row belongs to, as attached to rows by `buildRowOverlaps`. */
export interface RowOverlap {
  /** Id of the overlap group (`<scope>-<n>`). */
  id: string;
  /** Short display label (`P1`, `P2`, …). */
  label: string;
  /** Color assigned to the group. */
  color: string;
  /** Start of the group's time range, in seconds. */
  startS: number;
  /** End of the group's time range, in seconds. */
  endS: number;
  /** Number of other requests in the group. */
  peerCount: number;
}

/** A single rendered flamegraph row, as produced by `buildVisibleRows`. */
export interface VisibleRow {
  /** Row key: `t-<i>`, `g-<i>` or `g-<i>-c-<ci>`. */
  key: string;
  label: string;
  sublabel?: string;
  timeLabel?: string;
  cached: number;
  uncached: number;
  output: number;
  /** Input plus output tokens. */
  total: number;
  /** 0 for top-level rows, 1 for subagent children. */
  indent: number;
  /** True for a subagent group header row. */
  isGroup: boolean;
  isExpanded: boolean;
  /** Node index of the subagent group; set on group header rows only. */
  groupIndex?: number;
  overlaps: RowOverlap[];
}

/**
 * Format seconds from conversation start as a compact elapsed timestamp:
 * `MM:SS` below one hour, `H:MM:SS` from one hour up. The input is rounded to
 * whole seconds and negative offsets clamp to `00:00`.
 */
export function formatElapsedTime(seconds: number): string {
  const total = Math.max(0, Math.round(seconds));
  const pad2 = (value: number): string => String(value).padStart(2, '0');
  const hours = Math.trunc(total / 3600);
  const clock = `${pad2(Math.trunc((total % 3600) / 60))}:${pad2(total % 60)}`;
  if (hours > 0) return `${hours}:${clock}`;
  return clock;
}
*/ +export function timeLabel(startS?: number, endS?: number): string | undefined { + if (startS === undefined || !Number.isFinite(startS)) return undefined; + const start = formatElapsedTime(startS); + if (endS === undefined || !Number.isFinite(endS) || endS <= startS) return `+${start}`; + return `+${start}–${formatElapsedTime(endS)}`; +} + +export interface DeepLinkHighlight { + turn?: number | null; + raw?: number | null; + inner?: number | null; + agent?: string | null; +} + +export interface DeepLinkTarget { + rowKey: string; + expandGroup: number | null; +} + +/** + * Resolve a request-timeline deep link to a flamegraph row key (+ the subagent + * group that must be expanded to show it). Raw Weka source coordinates are + * exact and take precedence: + * raw= -> top-level Weka request + * raw=&inner= -> subagent child inside that top-level marker + * Otherwise main turns match by main-turn ordinal and subagent turns match the + * group by agentId, then the ti-th child. + * + * `buildConversationStructure` emits exactly one node per raw Weka entry (and + * one child per nested entry), so a node's array position IS its raw index. + * Structures ingested before rawIndex/innerIndex were stored omit the explicit + * fields — fall back to the array position so deep links keep resolving against + * those older rows instead of silently doing nothing. + */ +export function resolveDeepLinkTarget( + nodes: readonly StructureNode[], + highlight: DeepLinkHighlight, +): DeepLinkTarget | null { + const { turn, raw, inner, agent } = highlight; + if (typeof raw === 'number' && raw >= 0) { + if (typeof inner === 'number' && inner >= 0) { + const gi = nodes.findIndex( + (node, i) => node.kind === 'subagent' && (node.rawIndex ?? i) === raw, + ); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + const ci = group.children.findIndex((child, i) => (child.innerIndex ?? 
i) === inner); + if (ci === -1) return null; + return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi }; + } + const i = nodes.findIndex( + (node, idx) => node.kind === 'turn' && (node.rawIndex ?? idx) === raw, + ); + if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null }; + return null; + } + if (typeof turn !== 'number' || turn < 0) return null; + if (agent) { + const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === agent); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + if (turn >= group.children.length) return null; + return { rowKey: `g-${gi}-c-${turn}`, expandGroup: gi }; + } + let ordinal = 0; + for (let i = 0; i < nodes.length; i++) { + if (nodes[i].kind === 'turn') { + if (ordinal === turn) return { rowKey: `t-${i}`, expandGroup: null }; + ordinal += 1; + } + } + return null; +} + +/** + * Overlap groups per row key. Main-agent turns and each subagent's children are + * separate scopes — parallelism is only meaningful within one agent's stream. + */ +export function buildRowOverlaps(nodes: readonly StructureNode[]): Map { + const mainGroups = findRequestOverlapGroups( + nodes.flatMap((node, i) => + node.kind === 'turn' ? [{ key: `t-${i}`, startS: node.startS, endS: node.endS }] : [], + ), + 'main', + ); + const subagentGroups = nodes.flatMap((node, i) => + node.kind === 'subagent' + ? findRequestOverlapGroups( + node.children.map((child, ci) => ({ + key: `g-${i}-c-${ci}`, + startS: child.startS, + endS: child.endS, + })), + `subagent-${i}`, + ) + : [], + ); + const groups: RequestOverlapGroup[] = [...mainGroups, ...subagentGroups]; + + const byRow = new Map(); + groups.forEach((group, groupIndex) => { + const overlap = { + id: group.id, + label: `P${groupIndex + 1}`, + color: OVERLAP_COLORS[groupIndex % OVERLAP_COLORS.length]!, + startS: group.startS, + endS: group.endS, + peerCount: group.requestKeys.length - 1, + }; + group.requestKeys.forEach((key) => byRow.set(key, [...(byRow.get(key) ?? 
[]), overlap])); + }); + return byRow; +} + +/** + * Flatten structure nodes into the rows currently visible: one row per main + * turn, one header per subagent group, plus indented children for expanded + * groups. Row keys (`t-`, `g-`, `g--c-`) index by node position so + * they stay stable across expand/collapse. + */ +export function buildVisibleRows( + nodes: readonly StructureNode[], + expanded: ReadonlySet, + overlapsByRow: ReadonlyMap, +): VisibleRow[] { + const out: VisibleRow[] = []; + let turnNo = 0; + nodes.forEach((node: StructureNode, i) => { + if (node.kind === 'turn') { + turnNo += 1; + out.push({ + key: `t-${i}`, + label: `Turn ${turnNo}`, + sublabel: node.model ?? undefined, + timeLabel: timeLabel(node.startS, node.endS), + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: false, + isExpanded: false, + overlaps: overlapsByRow.get(`t-${i}`) ?? [], + }); + } else { + const isExpanded = expanded.has(i); + out.push({ + key: `g-${i}`, + label: `${node.label}`, + sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${ + node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '' + }`, + timeLabel: timeLabel(node.startS, node.endS), + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: true, + isExpanded, + groupIndex: i, + overlaps: [], + }); + if (isExpanded) { + node.children.forEach((child, ci) => { + out.push({ + key: `g-${i}-c-${ci}`, + label: `↳ subturn ${ci + 1}`, + sublabel: child.model ?? undefined, + timeLabel: timeLabel(child.startS, child.endS), + cached: child.cached, + uncached: child.uncached, + output: child.out, + total: child.in + child.out, + indent: 1, + isGroup: false, + isExpanded: false, + overlaps: overlapsByRow.get(`g-${i}-c-${ci}`) ?? 
[], + }); + }); + } + } + }); + return out; +} + +export interface BraceSeg { + role: 'first' | 'middle' | 'last' | 'through'; + isMember: boolean; + color: string; + groupId: string; + peerCount: number; + startS: number; + endS: number; +} + +export interface BraceLayout { + laneCount: number; + overflowLanes: number; + /** Per visible row: only the lanes that actually carry a bracket segment. */ + rowSegs: { lane: number; seg: BraceSeg }[][]; +} + +/** + * Geometry for the parallel-group brackets drawn in the left gutter. Each + * overlap group becomes a vertical bracket spanning from its first to its last + * visible member row, with a right-pointing tick on the exact member rows. + * Non-transitive chains (a row in two groups) get separate lanes so their + * brackets sit side by side. `through` = a row inside a group's span that is + * NOT itself a member (the aux-stream edge case) — drawn as a faint connector + * with no tick. + */ +export function computeBraceLayout(rows: readonly VisibleRow[]): BraceLayout { + const groupMap = new Map< + string, + { id: string; color: string; peerCount: number; startS: number; endS: number; idxs: number[] } + >(); + rows.forEach((r, idx) => { + for (const ov of r.overlaps) { + const g = groupMap.get(ov.id) ?? { + id: ov.id, + color: ov.color, + peerCount: ov.peerCount, + startS: ov.startS, + endS: ov.endS, + idxs: [], + }; + g.idxs.push(idx); + groupMap.set(ov.id, g); + } + }); + const groups = [...groupMap.values()] + .filter((g) => g.idxs.length >= 2) // need ≥2 visible members to bracket + .map((g) => ({ + ...g, + min: Math.min(...g.idxs), + max: Math.max(...g.idxs), + members: new Set(g.idxs), + })) + .toSorted((a, b) => a.min - b.min || a.max - b.max); + + // Greedy lane assignment: a group reuses a lane whose previous group ended + // before this one starts. 
+ const laneEnd: number[] = []; + const laneOf = new Map(); + for (const g of groups) { + let lane = laneEnd.findIndex((end) => end < g.min); + if (lane === -1) { + lane = laneEnd.length; + laneEnd.push(g.max); + } else { + laneEnd[lane] = g.max; + } + laneOf.set(g.id, lane); + } + const rawLaneCount = laneEnd.length; + // Bound the gutter (see MAX_LANES). Lanes past the cap collapse onto the last + // visible lane, so every parallel row still carries a marker but the gutter + // width and DOM-node count stay bounded regardless of how parallel the + // conversation is. + const laneCount = Math.min(rawLaneCount, MAX_LANES); + const displayLane = (lane: number) => Math.min(lane, laneCount - 1); + + // Sparse per-row segments: only lanes that actually carry a bracket on a row + // are stored (and later rendered). The previous dense matrix emitted one DOM + // node per lane per row — catastrophic at 49 lanes × 2k rows. + const rowSegs: { lane: number; seg: BraceSeg }[][] = rows.map(() => []); + for (const g of groups) { + const lane = displayLane(laneOf.get(g.id)!); + for (let idx = g.min; idx <= g.max; idx++) { + const isMember = g.members.has(idx); + const role = + idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through'; + const seg: BraceSeg = { + role, + isMember, + color: g.color, + groupId: g.id, + peerCount: g.peerCount, + startS: g.startS, + endS: g.endS, + }; + const cell = rowSegs[idx]!; + const existing = cell.find((c) => c.lane === lane); + // Collisions only happen in the folded overflow lane. Prefer a real + // member marker over a faint pass-through connector. 
+ if (!existing) cell.push({ lane, seg }); + else if (seg.isMember && !existing.seg.isMember) existing.seg = seg; + } + } + return { laneCount, overflowLanes: rawLaneCount - laneCount, rowSegs }; +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts new file mode 100644 index 00000000..0af344f1 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts @@ -0,0 +1,246 @@ +import { describe, expect, it } from 'vitest'; + +import type { + StructureNode, + SubagentNode, + TurnNode, +} from '@semianalysisai/inferencex-db/etl/weka-structure'; + +import { + buildRowOverlaps, + buildVisibleRows, + computeBraceLayout, + findRequestOverlapGroups, + formatElapsedTime, + resolveDeepLinkTarget, + timeLabel, +} from './trace-flamegraph-model'; + +describe('formatElapsedTime', () => { + it('formats elapsed seconds below and above one hour', () => { + expect(formatElapsedTime(0)).toBe('00:00'); + expect(formatElapsedTime(65.4)).toBe('01:05'); + expect(formatElapsedTime(3661.6)).toBe('1:01:02'); + expect(formatElapsedTime(86_541.149)).toBe('24:02:21'); + }); + + it('clamps negative offsets to the conversation origin', () => { + expect(formatElapsedTime(-5)).toBe('00:00'); + }); +}); + +describe('timeLabel', () => { + it('renders a range when the end is after the start, a point otherwise', () => { + expect(timeLabel(65, 130)).toBe('+01:05–02:10'); + expect(timeLabel(65)).toBe('+01:05'); + expect(timeLabel(65, 65)).toBe('+01:05'); + expect(timeLabel(undefined, 130)).toBeUndefined(); + expect(timeLabel(Number.NaN, 130)).toBeUndefined(); + }); +}); + +describe('findRequestOverlapGroups', () => { + it('keeps non-transitive overlap chains as separate groups', () => { + const groups = findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 8 }, + { key: 'B', startS: 5, endS: 11 }, + { key: 'C', startS: 9, endS: 15 }, + ]); + + expect(groups.map((group) => 
group.requestKeys)).toEqual([ + ['A', 'B'], + ['B', 'C'], + ]); + expect(groups.map(({ startS, endS }) => [startS, endS])).toEqual([ + [5, 8], + [9, 11], + ]); + }); + + it('does not consider touching or invalid intervals parallel', () => { + expect( + findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 5 }, + { key: 'B', startS: 5, endS: 8 }, + { key: 'missing-end', startS: 3 }, + { key: 'zero-duration', startS: 4, endS: 4 }, + ]), + ).toEqual([]); + }); + + it('returns only the maximal simultaneous set for nested intervals', () => { + const groups = findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 10 }, + { key: 'B', startS: 2, endS: 8 }, + { key: 'C', startS: 3, endS: 7 }, + ]); + expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]); + }); +}); + +const turn = (turnIndex: number, extra: Partial = {}): TurnNode => ({ + kind: 'turn', + turnIndex, + in: 100, + out: 10, + cached: 0, + uncached: 100, + ...extra, +}); +const subagent = (children: TurnNode[], extra: Partial = {}): SubagentNode => ({ + kind: 'subagent', + label: 'Subagent', + in: 100, + out: 10, + cached: 0, + uncached: 100, + children, + ...extra, +}); + +describe('resolveDeepLinkTarget', () => { + // Node layout mirroring a real Weka conversation: raw entries + // 0: turn, 1: subagent (2 children), 2: turn + const withRawIndexes: StructureNode[] = [ + turn(0, { rawIndex: 0 }), + subagent([turn(1, { rawIndex: 1, innerIndex: 0 }), turn(2, { rawIndex: 1, innerIndex: 1 })], { + agentId: 'subagent_001_abcd1234', + rawIndex: 1, + }), + turn(3, { rawIndex: 2 }), + ]; + // The same conversation as stored by the pre-rawIndex ingest (fields absent). 
+ const legacy: StructureNode[] = [ + turn(0), + subagent([turn(1), turn(2)], { agentId: 'subagent_001_abcd1234' }), + turn(3), + ]; + + it('resolves raw source coordinates against explicit rawIndex fields', () => { + expect(resolveDeepLinkTarget(withRawIndexes, { raw: 2 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(withRawIndexes, { raw: 1, inner: 1 })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + }); + + it('falls back to node array position for structures ingested before rawIndex existed', () => { + // One node per raw entry means position === raw index, so the deep link + // must still resolve exactly (regression: it previously returned null and + // the flamegraph neither scrolled nor highlighted anything). + expect(resolveDeepLinkTarget(legacy, { raw: 2, turn: 1 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(legacy, { raw: 0, turn: 0 })).toEqual({ + rowKey: 't-0', + expandGroup: null, + }); + }); + + it('resolves subagent children positionally when innerIndex is absent', () => { + expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 1, turn: 1 })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + }); + + it('returns null for out-of-range raw coordinates instead of guessing', () => { + expect(resolveDeepLinkTarget(legacy, { raw: 9 })).toBeNull(); + expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 5 })).toBeNull(); + // raw pointing at a subagent marker without inner does not match a turn. 
+ expect(resolveDeepLinkTarget(legacy, { raw: 1 })).toBeNull(); + }); + + it('keeps the positional turn/agent fallback for links without raw coordinates', () => { + expect(resolveDeepLinkTarget(legacy, { turn: 1 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(legacy, { turn: 1, agent: 'subagent_001_abcd1234' })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + expect(resolveDeepLinkTarget(legacy, {})).toBeNull(); + }); +}); + +describe('buildVisibleRows', () => { + const nodes: StructureNode[] = [ + turn(0, { startS: 0, endS: 10, model: 'claude' }), + subagent([turn(1), turn(2)], { label: 'Subagent: search', durationMs: 12_000 }), + turn(3), + ]; + + it('hides collapsed subagent children and keys rows by node position', () => { + const rows = buildVisibleRows(nodes, new Set(), new Map()); + expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 't-2']); + expect(rows[0]).toMatchObject({ + label: 'Turn 1', + sublabel: 'claude', + timeLabel: '+00:00–00:10', + total: 110, + isGroup: false, + }); + expect(rows[1]).toMatchObject({ + label: 'Subagent: search', + sublabel: '2 turns · 12s', + isGroup: true, + isExpanded: false, + groupIndex: 1, + }); + }); + + it('inserts indented child rows for expanded groups and attaches overlaps', () => { + const overlap = { + id: 'main-1', + label: 'P1', + color: '#06b6d4', + startS: 0, + endS: 1, + peerCount: 1, + }; + const rows = buildVisibleRows(nodes, new Set([1]), new Map([['g-1-c-0', [overlap]]])); + expect(rows.map((r) => r.key)).toEqual(['t-0', 'g-1', 'g-1-c-0', 'g-1-c-1', 't-2']); + expect(rows[2]).toMatchObject({ label: '↳ subturn 1', indent: 1, overlaps: [overlap] }); + expect(rows[3]!.overlaps).toEqual([]); + }); +}); + +describe('buildRowOverlaps and computeBraceLayout', () => { + it('brackets parallel main turns and spans a non-member row as pass-through', () => { + const nodes: StructureNode[] = [ + turn(0, { startS: 0, endS: 10 }), + turn(1), // untimed — sits inside the 
bracket span without being a member + turn(2, { startS: 5, endS: 30 }), // overlaps turn 0 and turn 3 + turn(3, { startS: 28, endS: 40 }), + ]; + const overlaps = buildRowOverlaps(nodes); + expect([...overlaps.keys()].toSorted()).toEqual(['t-0', 't-2', 't-3']); + + const rows = buildVisibleRows(nodes, new Set(), overlaps); + const layout = computeBraceLayout(rows); + // Two overlap groups sharing rows 0–2 and 2–3 need two side-by-side lanes. + expect(layout.laneCount).toBe(2); + expect(layout.overflowLanes).toBe(0); + const roles = layout.rowSegs.map((segs) => + segs.map(({ lane, seg }) => `${lane}:${seg.role}${seg.isMember ? '' : ':nonmember'}`), + ); + expect(roles[0]).toEqual(['0:first']); + expect(roles[1]).toEqual(['0:through:nonmember']); + expect(roles[2]!.toSorted()).toEqual(['0:last', '1:first']); + expect(roles[3]).toEqual(['1:last']); + }); + + it('reports no lanes for a fully serial conversation', () => { + const nodes: StructureNode[] = [ + turn(0, { startS: 0, endS: 5 }), + turn(1, { startS: 5, endS: 9 }), + ]; + const rows = buildVisibleRows(nodes, new Set(), buildRowOverlaps(nodes)); + expect(computeBraceLayout(rows)).toEqual({ laneCount: 0, overflowLanes: 0, rowSegs: [[], []] }); + }); +}); diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx new file mode 100644 index 00000000..d63cc691 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -0,0 +1,439 @@ +'use client'; + +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { createPortal } from 'react-dom'; + +import type { ConversationStructure } from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; +import { compact, formatShare } from './format'; +import { + buildRowOverlaps, + buildVisibleRows, + computeBraceLayout, + formatElapsedTime, + MAX_LANES, + OVERLAP_COLORS, + resolveDeepLinkTarget, + type VisibleRow, +} from 
'./trace-flamegraph-model'; + +// Pure logic lives in trace-flamegraph-model.ts; re-exported here so this file +// stays the module entry point for the flamegraph's public API. +export { + findRequestOverlapGroups, + formatElapsedTime, + resolveDeepLinkTarget, +} from './trace-flamegraph-model'; +export type { + DeepLinkHighlight, + DeepLinkTarget, + RequestOverlapGroup, + TimedRequest, +} from './trace-flamegraph-model'; + +// Stacked-bar segment colors. Cached prefix vs uncached input vs output — +// fixed hues (theme-independent) so the meaning is stable in light/dark. +const SEG = { + cached: '#10b981', // emerald-500 — input served from prefix cache + uncached: '#f59e0b', // amber-500 — input that must be (re)computed + output: '#8b5cf6', // violet-500 — generated tokens +} as const; + +const LEGEND = [ + { key: 'cached', label: 'Cached prefix', color: SEG.cached }, + { key: 'uncached', label: 'Uncached input', color: SEG.uncached }, + { key: 'output', label: 'Output', color: SEG.output }, +] as const; + +// Width (px) of one parallel-group bracket lane in the left gutter. Overlapping +// groups (non-transitive chains) get their own lane so their brackets sit +// side-by-side instead of stacking visually. +const LANE_W = 14; + +interface TooltipState { + x: number; + y: number; + row: VisibleRow; +} + +/** + * Per-conversation flamegraph driven by the precomputed `structure` JSONB. + * One row per turn; subagent groups render a collapsible header with indented + * children (collapsed by default). Each bar stacks cached-prefix + uncached + * input + output, scaled to the widest visible turn. + */ +export function TraceFlamegraph({ + structure, + highlightTurn, + highlightRawIndex, + highlightInnerIndex, + highlightAgentId, +}: { + structure: ConversationStructure; + /** Turn index to scroll to / highlight (from a request-timeline deep link). */ + highlightTurn?: number | null; + /** Raw Weka top-level request index to scroll to / highlight. 
*/ + highlightRawIndex?: number | null; + /** Raw Weka nested request index under highlightRawIndex, for subagent children. */ + highlightInnerIndex?: number | null; + /** Subagent id when the highlighted turn is inside a subagent group. */ + highlightAgentId?: string | null; +}) { + const nodes = structure.nodes; + + // Resolve the deep-link target to a row key (+ the group that must be open to + // show it). See resolveDeepLinkTarget for the matching rules. + const target = useMemo( + () => + resolveDeepLinkTarget(nodes, { + turn: highlightTurn, + raw: highlightRawIndex, + inner: highlightInnerIndex, + agent: highlightAgentId, + }), + [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId], + ); + + // Subagent groups collapsed by default — except the deep-link target's group. + const [expanded, setExpanded] = useState>(() => + typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(), + ); + const [tooltip, setTooltip] = useState(null); + const scrollRef = useRef(null); + + // Portal target only exists after mount (the tooltip is portaled to body so + // its position:fixed is viewport-relative, immune to ancestor transforms). + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + + // The deep-link target row gets a state-driven highlight (ring + bg flash) + // that fades out — state-driven so a re-render can't clobber it, and so the + // fade is a real CSS transition rather than an abrupt classList removal. + const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null); + + // When the deep-link target resolves/changes: expand its subagent group, then + // (after the row renders) scroll it into view and flash the highlight. Runs on + // first load and on any later target change (e.g. clicking another bar into + // the same conversation). The row query/scroll is deferred to the next frame + // so the just-expanded child row exists in the DOM. 
+ useEffect(() => { + if (!target) return; + if (typeof target.expandGroup === 'number') { + const gi = target.expandGroup; + setExpanded((prev) => (prev.has(gi) ? prev : new Set(prev).add(gi))); + } + setHighlightKey(target.rowKey); + const raf = requestAnimationFrame(() => { + scrollRef.current + ?.querySelector(`[data-rowkey="${target.rowKey}"]`) + ?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + }); + const t = setTimeout(() => setHighlightKey(null), 2200); + return () => { + cancelAnimationFrame(raf); + clearTimeout(t); + }; + }, [target]); + + const groupIndexes = useMemo(() => { + const out: number[] = []; + nodes.forEach((node, i) => { + if (node.kind === 'subagent') out.push(i); + }); + return out; + }, [nodes]); + + const toggle = useCallback((i: number) => { + setExpanded((prev) => { + const next = new Set(prev); + if (next.has(i)) next.delete(i); + else next.add(i); + return next; + }); + }, []); + + const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]); + const collapseAll = useCallback(() => setExpanded(new Set()), []); + + const overlapsByRow = useMemo(() => buildRowOverlaps(nodes), [nodes]); + + const rows = useMemo( + () => buildVisibleRows(nodes, expanded, overlapsByRow), + [nodes, expanded, overlapsByRow], + ); + + // Two scales: leaf turns/subturns share a per-turn axis (the primary signal — + // how cached/uncached evolves), while subagent group headers carry aggregates + // orders of magnitude larger, so they get their own axis to stay comparable to + // each other. Group bars render slim + muted, so the mixed scale reads as a + // distinct "group summary" track rather than a contradiction. 
+ const maxTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)), + [rows], + ); + const maxGroupTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)), + [rows], + ); + + const braces = useMemo(() => computeBraceLayout(rows), [rows]); + + const onMove = (e: React.MouseEvent, row: VisibleRow) => { + setTooltip({ x: e.clientX, y: e.clientY, row }); + }; + + return ( +
+
+
+ {LEGEND.map((l) => ( + + + {l.label} + + ))} + + + Bracketed rows ran in parallel + +
+ {groupIndexes.length > 0 && ( +
+ + +
+ )} +
+ + {braces.overflowLanes > 0 && ( +

+ Dense parallel region — bracket lanes capped at {MAX_LANES}; {braces.overflowLanes}{' '} + further overlapping {braces.overflowLanes === 1 ? 'group is' : 'groups are'} folded into + the last lane. +

+ )} + +
+ {/* gap-0 so the per-row bracket segments connect into a continuous + vertical rail across the rows of a parallel group. */} +
+ {rows.map((row, idx) => { + // Group headers use the group axis; turns/subturns use the per-turn + // axis. Clamp to the track width either way. + const denom = row.isGroup ? maxGroupTotal : maxTotal; + const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100)); + const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; + const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; + const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; + const isHighlighted = row.key === highlightKey; + const segs = braces.rowSegs[idx]!; + return ( +
+ {/* Parallel-group bracket gutter (only rendered when the + conversation has any overlaps, so non-overlap traces keep a + flush-left layout with no dead space). Segments are sparse and + absolutely positioned per lane so a row only pays for the + lanes it actually touches. */} + {braces.laneCount > 0 && ( +
+ {segs.map(({ lane, seg }) => { + const top = seg.role === 'first' ? '50%' : '0'; + const bottom = seg.role === 'last' ? '50%' : '0'; + return ( +
+ {/* vertical rail */} +
+ {/* right-pointing tick marking an actual member row */} + {seg.isMember && ( +
+ )} +
+ ); + })} +
+ )} + + {/* row content (indented for subagent children) */} +
+ {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
+ + {/* Original interval, measured from conversation start. */} +
+ {row.timeLabel ?? '—'} +
+ + {/* stacked bar — group headers render as a slim muted summary + strip so they read as aggregates, not individual turns. */} +
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} + > +
+
+
+
+
+
+ + {/* total */} +
+ {compact(row.total)} +
+
+
+ ); + })} +
+
+ + {tooltip && + mounted && + createPortal( +
+
+ {tooltip.row.label} + {tooltip.row.sublabel ? ( + + {tooltip.row.sublabel} + + ) : null} +
+
+ Cached prefix + + {compact(tooltip.row.cached)} + + Uncached input + + {compact(tooltip.row.uncached)} + + Output + + {compact(tooltip.row.output)} + + Cached % + + {formatShare(tooltip.row.cached, tooltip.row.cached + tooltip.row.uncached)} + + From start + + {tooltip.row.timeLabel ?? '—'} + +
+
, + document.body, + )} +
+ ); +} diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx index 8fbf52ac..1a12057e 100644 --- a/packages/app/src/components/header/header.tsx +++ b/packages/app/src/components/header/header.tsx @@ -9,6 +9,7 @@ import { track } from '@/lib/analytics'; import { ModeToggle } from '@/components/ui/mode-toggle'; import { MinecraftToggles } from '@/components/minecraft/minecraft-toggles'; import { navigateInApp } from '@/lib/client-navigation'; +import { useFeatureGate } from '@/lib/use-feature-gate'; import { cn } from '@/lib/utils'; import { GitHubStars } from './GithubStars'; @@ -46,6 +47,15 @@ const NAV_LINKS = [ testId: 'nav-link-supporters', event: 'header_supporters_clicked', }, + { + href: '/datasets', + label: 'Datasets', + testId: 'nav-link-datasets', + event: 'header_datasets_clicked', + // Agentic surface — hidden behind the konami-code feature gate (default off) + // until agentic launches. Same gate as the Hidden tab dropdown. + gated: true, + }, { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' }, { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' }, ] as const; @@ -62,9 +72,14 @@ function isActive(pathname: string, href: string): boolean { export const Header = ({ starCount }: { starCount?: number | null }) => { const pathname = usePathname() ?? '/'; const router = useRouter(); + const featureGateUnlocked = useFeatureGate(); const [mobileMenuOpen, setMobileMenuOpen] = useState(false); const menuRef = useRef(null); + // Hide gated nav links (e.g. Datasets — an agentic surface) unless the shared + // feature gate is unlocked. Mirrors the tab-nav "Hidden" dropdown gating. 
+ const navLinks = NAV_LINKS.filter((l) => !('gated' in l && l.gated) || featureGateUnlocked); + // Close menu on route change useEffect(() => { setMobileMenuOpen(false); @@ -118,7 +133,7 @@ export const Header = ({ starCount }: { starCount?: number | null }) => { {/* Desktop nav */}