PrimeIntellect-ai · mikasenghaas · Jul 2, 2026 · Jul 2, 2026
diff --git a/verifiers/v1/cli/dashboard/eval.py b/verifiers/v1/cli/dashboard/eval.py
@@ -1,13 +1,14 @@
 """The eval `--rich` dashboard: a config overview, a progress bar, and one line per rollout.
 
 Reads each `Rollout.trace`/`phase` every tick — no extra plumbing. Each row carries a bracketed
-phase marker that reads at a glance — `[setup]` (yellow), `[rollout]` (cyan), `[finalize]`
-(magenta), `[scoring]` (blue), `[success]` (green), `[error]` (red) — padded so the brackets
-line up in a column down the left edge. The reward shows only once a rollout is fully scored
-(phase DONE), so it never flips as scoring lands. A task's rollouts are grouped adjacently and joined
-by a left brace (╭│╰), so an episode (a task's n rollouts) reads as a unit. Every started
-rollout stays on screen (finished ones keep their result); the overview + progress sit on top,
-above a rule.
+phase marker that reads at a glance — `[pending]` (dim), `[setup]` (yellow), `[rollout]` (cyan),
+`[finalize]` (magenta), `[scoring]` (blue), `[success]` (green), `[error]` (red) — padded so the
+brackets line up in a column down the left edge. The reward shows only once a rollout is fully
+scored (phase DONE), so it never flips as scoring lands. A task's rollouts are grouped adjacently
+and joined by a left brace (╭│╰), so an episode (a task's n rollouts) reads as a unit. Every
+rollout is on screen from the start: one still queued behind the concurrency cap reads `[pending]`
+(its task is all that's known yet) until it begins, and finished ones keep their result. The
+overview + progress sit on top, above a rule.
 """
 
 import contextlib
@@ -44,6 +45,7 @@
 _LABEL_WIDTH = len("timeouts")
 
 _STYLE = {
+    "pending": "dim",
     "setup": "yellow",
     "running": "cyan",
     "finalize": "magenta",
@@ -52,6 +54,7 @@
     "error": "red",
 }
 _MARK_LABEL = {
+    "pending": "pending",
     "setup": "setup",
     "running": "rollout",
     "finalize": "finalize",
@@ -357,17 +360,26 @@ def _tokens(trace: Trace) -> tuple[int, int, int | None, int | None, int]:
     return prompt, completion, cached, reasoning, nbranches
 
 
+def _started(rollout: Rollout) -> float:
+    # Sort key: when a rollout began (its setup start). A still-pending rollout has no trace
+    # yet, so it sorts last (+inf) — behind everything already in flight, in task order.
+    return (
+        rollout.trace.timing.setup.start if rollout.trace is not None else float("inf")
+    )
+
+
 def _groups(rollouts: list[Rollout]) -> list[list[Rollout]]:
     # The n rollouts of each task, grouped together (so they sit adjacent); groups ordered by
-    # earliest start, rollouts within a group by start. Finished ones stay (never removed).
+    # earliest start, rollouts within a group by start. Every rollout carries its `task` from
+    # construction, so ones still queued behind the concurrency cap (no trace yet) are grouped
+    # and shown too — as `[pending]`. Finished ones stay (never removed).
     by_task: dict[int, list[Rollout]] = {}
     for rollout in rollouts:
-        if rollout.trace is not None:
-            by_task.setdefault(rollout.trace.task.idx, []).append(rollout)
+        by_task.setdefault(rollout.task.idx, []).append(rollout)
     groups = list(by_task.values())
     for group in groups:
-        group.sort(key=lambda r: r.trace.timing.setup.start)
-    groups.sort(key=lambda g: g[0].trace.timing.setup.start)
+        group.sort(key=_started)
+    groups.sort(key=lambda g: _started(g[0]))
     return groups
 
 
@@ -385,6 +397,21 @@ def Rows(groups: list[list[Rollout]], now: float, runtime_type: str) -> Table:
     for group in groups:
         for i, rollout in enumerate(group):
             t = rollout.trace
+            task = rollout.task
+            label = f"name={task.name[:32]}" if task.name else f"idx={task.idx}"
+            if (
+                t is None
+            ):  # queued behind the concurrency cap — only its task is known yet
+                rows.append(
+                    (
+                        _brace(i, len(group)),
+                        "pending",
+                        [f"task {label}", *[""] * 7],
+                        "",
+                        "",
+                    )
+                )
+                continue
             if rollout.phase == Phase.DONE:  # fully scored — reward is final
                 state = "error" if t.has_error else "success"
                 result = t.error.type if t.has_error else f"reward={t.reward:.2f}"
@@ -398,7 +425,6 @@ def Rows(groups: list[list[Rollout]], now: float, runtime_type: str) -> Table:
                         stop = f"{stop} (truncated)".strip()
             else:
                 state, result, stop = rollout.phase, "", ""
-            label = f"name={t.task.name[:32]}" if t.task.name else f"idx={t.task.idx}"
             descriptor = (
                 rollout.runtime.descriptor if rollout.runtime is not None else None
             )

diff --git a/verifiers/v1/rollout.py b/verifiers/v1/rollout.py
@@ -51,9 +51,11 @@
 
 
 class Phase(StrEnum):
-    """A rollout's lifecycle phase (for display): provisioning, the harness driving,
-    post-run finalize, per-rollout + group scoring, then fully scored."""
+    """A rollout's lifecycle phase (for display): queued behind the concurrency cap,
+    provisioning, the harness driving, post-run finalize, per-rollout + group scoring,
+    then fully scored."""
 
+    PENDING = "pending"
     SETUP = "setup"
     RUNNING = "running"
     FINALIZE = "finalize"
@@ -93,9 +95,11 @@ def __init__(
         `Environment.serving` context — so a rollout always has them and no runner has to thread
         them in."""
         self.interception = interception
-        self.phase = Phase.SETUP
-        """Lifecycle phase for display (see `Phase`); advanced through the rollout, and
-        set to DONE by the Episode once group scoring has run."""
+        self.phase = Phase.PENDING
+        """Lifecycle phase for display (see `Phase`); starts PENDING (queued behind the
+        concurrency cap) so the --rich dashboard can list it before it begins, advances to
+        SETUP the moment `run()` starts, and is set to DONE by the Episode once group
+        scoring has run."""
         self.runtime: Runtime | None = None
         """The runtime, set the moment `run()` creates it (so it's always tearable-down
         even if setup crashes) and torn down in `run()`'s `finally`; the --rich dashboard
@@ -139,6 +143,7 @@ async def run(self) -> Trace:
         `self.shared_urls` / `self.interception`)."""
         trace: Trace = Trace(task=self.task, state=state_cls(type(self.taskset))())
         self.trace = trace  # expose for the --rich dashboard
+        self.phase = Phase.SETUP  # leaving the queue: provisioning starts now
         trace.timing.setup.start = time.time()
         self.runtime = make_runtime(
             self.runtime_config, name=trace.id