diff --git a/verifiers/v1/cli/dashboard/eval.py b/verifiers/v1/cli/dashboard/eval.py index 8f152c18a..f5ace4a7b 100644 --- a/verifiers/v1/cli/dashboard/eval.py +++ b/verifiers/v1/cli/dashboard/eval.py @@ -1,13 +1,14 @@ """The eval `--rich` dashboard: a config overview, a progress bar, and one line per rollout. Reads each `Rollout.trace`/`phase` every tick — no extra plumbing. Each row carries a bracketed -phase marker that reads at a glance — `[setup]` (yellow), `[rollout]` (cyan), `[finalize]` -(magenta), `[scoring]` (blue), `[success]` (green), `[error]` (red) — padded so the brackets -line up in a column down the left edge. The reward shows only once a rollout is fully scored -(phase DONE), so it never flips as scoring lands. A task's rollouts are grouped adjacently and joined -by a left brace (╭│╰), so an episode (a task's n rollouts) reads as a unit. Every started -rollout stays on screen (finished ones keep their result); the overview + progress sit on top, -above a rule. +phase marker that reads at a glance — `[pending]` (dim), `[setup]` (yellow), `[rollout]` (cyan), +`[finalize]` (magenta), `[scoring]` (blue), `[success]` (green), `[error]` (red) — padded so the +brackets line up in a column down the left edge. The reward shows only once a rollout is fully +scored (phase DONE), so it never flips as scoring lands. A task's rollouts are grouped adjacently +and joined by a left brace (╭│╰), so an episode (a task's n rollouts) reads as a unit. Every +rollout is on screen from the start: one still queued behind the concurrency cap reads `[pending]` +(its task is all that's known yet) until it begins, and finished ones keep their result. The +overview + progress sit on top, above a rule. """ import contextlib @@ -44,6 +45,7 @@ _LABEL_WIDTH = len("timeouts") _STYLE = { + "pending": "dim", "setup": "yellow", "running": "cyan", "finalize": "magenta", @@ -52,6 +54,7 @@ "error": "red", } _MARK_LABEL = { + "pending": "pending", "setup": "setup", "running": "rollout", "finalize": "finalize", @@ -357,17 +360,26 @@ def _tokens(trace: Trace) -> tuple[int, int, int | None, int | None, int]: return prompt, completion, cached, reasoning, nbranches +def _started(rollout: Rollout) -> float: + # Sort key: when a rollout began (its setup start). A still-pending rollout has no trace + # yet, so it sorts last (+inf) — behind everything already in flight, in task order. + return ( + rollout.trace.timing.setup.start if rollout.trace is not None else float("inf") + ) + + def _groups(rollouts: list[Rollout]) -> list[list[Rollout]]: # The n rollouts of each task, grouped together (so they sit adjacent); groups ordered by - # earliest start, rollouts within a group by start. Finished ones stay (never removed). + # earliest start, rollouts within a group by start. Every rollout carries its `task` from + # construction, so ones still queued behind the concurrency cap (no trace yet) are grouped + # and shown too — as `[pending]`. Finished ones stay (never removed). by_task: dict[int, list[Rollout]] = {} for rollout in rollouts: - if rollout.trace is not None: - by_task.setdefault(rollout.trace.task.idx, []).append(rollout) + by_task.setdefault(rollout.task.idx, []).append(rollout) groups = list(by_task.values()) for group in groups: - group.sort(key=lambda r: r.trace.timing.setup.start) - groups.sort(key=lambda g: g[0].trace.timing.setup.start) + group.sort(key=_started) + groups.sort(key=lambda g: _started(g[0])) return groups @@ -385,6 +397,21 @@ def Rows(groups: list[list[Rollout]], now: float, runtime_type: str) -> Table: for group in groups: for i, rollout in enumerate(group): t = rollout.trace + task = rollout.task + label = f"name={task.name[:32]}" if task.name else f"idx={task.idx}" + if ( + t is None + ): # queued behind the concurrency cap — only its task is known yet + rows.append( + ( + _brace(i, len(group)), + "pending", + [f"task {label}", *[""] * 7], + "", + "", + ) + ) + continue if rollout.phase == Phase.DONE: # fully scored — reward is final state = "error" if t.has_error else "success" result = t.error.type if t.has_error else f"reward={t.reward:.2f}" @@ -398,7 +425,6 @@ def Rows(groups: list[list[Rollout]], now: float, runtime_type: str) -> Table: stop = f"{stop} (truncated)".strip() else: state, result, stop = rollout.phase, "", "" - label = f"name={t.task.name[:32]}" if t.task.name else f"idx={t.task.idx}" descriptor = ( rollout.runtime.descriptor if rollout.runtime is not None else None ) diff --git a/verifiers/v1/rollout.py b/verifiers/v1/rollout.py index b2d66891c..25157c2e2 100644 --- a/verifiers/v1/rollout.py +++ b/verifiers/v1/rollout.py @@ -51,9 +51,11 @@ class Phase(StrEnum): - """A rollout's lifecycle phase (for display): provisioning, the harness driving, - post-run finalize, per-rollout + group scoring, then fully scored.""" + """A rollout's lifecycle phase (for display): queued behind the concurrency cap, + provisioning, the harness driving, post-run finalize, per-rollout + group scoring, + then fully scored.""" + PENDING = "pending" SETUP = "setup" RUNNING = "running" FINALIZE = "finalize" @@ -93,9 +95,11 @@ def __init__( `Environment.serving` context — so a rollout always has them and no runner has to thread them in.""" self.interception = interception - self.phase = Phase.SETUP - """Lifecycle phase for display (see `Phase`); advanced through the rollout, and - set to DONE by the Episode once group scoring has run.""" + self.phase = Phase.PENDING + """Lifecycle phase for display (see `Phase`); starts PENDING (queued behind the + concurrency cap) so the --rich dashboard can list it before it begins, advances to + SETUP the moment `run()` starts, and is set to DONE by the Episode once group + scoring has run.""" self.runtime: Runtime | None = None """The runtime, set the moment `run()` creates it (so it's always tearable-down even if setup crashes) and torn down in `run()`'s `finally`; the --rich dashboard @@ -139,6 +143,7 @@ async def run(self) -> Trace: `self.shared_urls` / `self.interception`).""" trace: Trace = Trace(task=self.task, state=state_cls(type(self.taskset))()) self.trace = trace # expose for the --rich dashboard + self.phase = Phase.SETUP # leaving the queue: provisioning starts now trace.timing.setup.start = time.time() self.runtime = make_runtime( self.runtime_config, name=trace.id