NVIDIA · ChenhanYu · Jul 4, 2026 · coderabbitai · Jul 4, 2026
@@ -0,0 +1,21 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+echo "MODEL_OPT_HOSTNAME_SMOKE_START"
+hostname
+echo "MODEL_OPT_HOSTNAME_SMOKE_DONE"
@@ -34,6 +34,7 @@
 from __future__ import annotations
 
 import hashlib
+import json
 import os
 import re
 import shutil
@@ -68,6 +69,25 @@
 
 _GIT_SHA_RE = re.compile(r"^[0-9a-fA-F]{7,40}$")
 _SAFE_PATH_TOKEN_RE = re.compile(r"[^A-Za-z0-9_.-]+")
+_SLURM_TERMINAL_STATES: frozenset[str] = frozenset(
+    {
+        "COMPLETED",
+        "FAILED",
+        "CANCELLED",
+        "TIMEOUT",
+        "OUT_OF_MEMORY",
+        "NODE_FAIL",
+        "PREEMPTED",
+        "BOOT_FAIL",
+    }
+)
+_SLURM_FAILURE_STATES: frozenset[str] = frozenset(
+    {"FAILED", "CANCELLED", "TIMEOUT", "OUT_OF_MEMORY", "NODE_FAIL", "PREEMPTED", "BOOT_FAIL"}
+)
+_SLURM_RUNNING_STATES: frozenset[str] = frozenset(
+    {"PENDING", "RUNNING", "CONFIGURING", "COMPLETING", "REQUEUED", "RESIZING", "SUSPENDED"}
+)
+_SLURM_STATUS_META = ".modelopt_mcp_slurm.json"
 
 
 @dataclass
@@ -551,9 +571,7 @@ def _launcher_argv(abs_yaml: Path, checkout: SourceCheckout | None, *flags: str)
     return [
         _uv_binary(),
         "run",
-        "--reinstall-package",
-        "modelopt-launcher",
-        "--project",
+        "--with-editable",
         str(checkout.launcher_dir),
         "modelopt-launcher",
         "--yaml",
@@ -1345,6 +1363,16 @@ def submit_job_impl(
             **_source_result_fields(checkout),
         }
 
+    _write_slurm_status_metadata(
+        experiment_id=experiment_id,
+        slurm_job_id=slurm_job_id,
+        cluster_host=cluster_host,
+        cluster_user=cluster_user,
+        identity=identity,
+        control_socket=control_socket,
+        reconnect_command=reconnect_command,
+    )
+
     return {
         "ok": True,
         "executor": "slurm",
@@ -1593,6 +1621,165 @@ def _experiment_not_found_diagnostic() -> str:
     )
 
 
+def _write_slurm_status_metadata(
+    *,
+    experiment_id: str,
+    slurm_job_id: str | None,
+    cluster_host: str | None,
+    cluster_user: str | None,
+    identity: str | None,
+    control_socket: str | None,
+    reconnect_command: str | None,
+) -> None:
+    """Persist Slurm submit metadata so status polling can query Slurm."""
+    if not slurm_job_id or not cluster_host:
+        return
+    exp_dir = _resolve_experiment_dir(experiment_id)
+    if exp_dir is None:
+        return
+    payload = {
+        "executor": "slurm",
+        "experiment_id": experiment_id,
+        "slurm_job_id": slurm_job_id,
+        "cluster_host": cluster_host,
+        "cluster_user": cluster_user,
+        "identity": identity,
+        "control_socket": os.path.expanduser(control_socket) if control_socket else None,
+        "reconnect_command": reconnect_command,
+    }
+    try:
+        (exp_dir / _SLURM_STATUS_META).write_text(
+            json.dumps(payload, sort_keys=True, indent=2),
+            encoding="utf-8",
+        )
+    except OSError:
+        # Status remains filesystem-based if the sidecar cannot be written.
+        return
+
+
+def _load_slurm_status_metadata(exp_dir: Path) -> dict | None:
+    """Load Slurm status metadata written by submit_job_impl."""
+    try:
+        payload = json.loads((exp_dir / _SLURM_STATUS_META).read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        return None
+    if payload.get("executor") != "slurm":
+        return None
+    if not payload.get("slurm_job_id") or not payload.get("cluster_host"):
+        return None
+    return payload
+
+
+def _ssh_argv_for_slurm(meta: dict, remote_command: str) -> list[str]:
+    """Build a fixed SSH argv for querying Slurm status."""
+    argv = ["ssh", "-o", "BatchMode=yes"]
+    if meta.get("identity"):
+        argv += ["-i", str(meta["identity"])]
+    if meta.get("control_socket"):
+        argv += [
+            "-o",
+            f"ControlPath={os.path.expanduser(str(meta['control_socket']))}",
+            "-o",
+            "ControlMaster=no",
+        ]
+    user = meta.get("cluster_user")
+    host = meta["cluster_host"]
+    target = f"{user}@{host}" if user else host
+    return [*argv, target, remote_command]
+
+
+def _parse_slurm_records(text: str, job_id: str) -> dict[str, str] | None:
+    """Return the parent Slurm job record from sacct -P output."""
+    lines = [line for line in text.splitlines() if line.strip()]
+    if len(lines) < 2:
+        return None
+    headers = lines[0].split("|")
+    for line in lines[1:]:
+        values = line.split("|")
+        record = dict(zip(headers, values))
+        if record.get("JobID") == job_id:
+            return record
+    return None
+
+
+def _query_slurm_status(meta: dict) -> dict:
+    """Query remote Slurm for a detached job's authoritative status."""
+    job_id = str(meta["slurm_job_id"])
+    sacct_cmd = f"sacct -j {job_id} --format=JobID,State,ExitCode,Elapsed,NodeList -P"
+    try:
+        proc = subprocess.run(  # nosec B603 - fixed ssh argv, no shell.
+            _ssh_argv_for_slurm(meta, sacct_cmd),
+            capture_output=True,
+            text=True,
+            timeout=30,
+            check=False,
+        )
+    except (FileNotFoundError, subprocess.TimeoutExpired) as e:
+        return {
+            "ok": False,
+            "reason": "slurm_status_query_failed",
+            "diagnostic": f"Unable to query Slurm status via SSH: {e}.",
+        }
+
+    record = _parse_slurm_records(proc.stdout, job_id) if proc.returncode == 0 else None
+    if record is None:
+        squeue_cmd = f"squeue -j {job_id} -h -o '%T|%M|%R'"
+        try:
+            squeue = subprocess.run(  # nosec B603 - fixed ssh argv, no shell.
+                _ssh_argv_for_slurm(meta, squeue_cmd),
+                capture_output=True,
+                text=True,
+                timeout=30,
+                check=False,
+            )
+        except (FileNotFoundError, subprocess.TimeoutExpired) as e:
+            return {
+                "ok": False,
+                "reason": "slurm_status_query_failed",
+                "diagnostic": f"Unable to query Slurm queue via SSH: {e}.",
+                "sacct_stderr_tail": _tail(proc.stderr),
+            }
+        if squeue.returncode == 0 and squeue.stdout.strip():
+            state = squeue.stdout.strip().split("|", 1)[0].upper()
+            return {
+                "ok": True,
+                "status": "running",
+                "slurm_state": state,
+                "slurm_job_id": job_id,
+                "slurm_source": "squeue",
+            }
+        return {
+            "ok": False,
+            "reason": "slurm_status_not_found",
+            "diagnostic": "Slurm job was not found by sacct or squeue.",
+            "slurm_job_id": job_id,
+            "sacct_stdout_tail": _tail(proc.stdout),
+            "sacct_stderr_tail": _tail(proc.stderr),
+            "squeue_stdout_tail": _tail(squeue.stdout),
+            "squeue_stderr_tail": _tail(squeue.stderr),
+        }
+
+    state = (record.get("State", "").split() or [""])[0].upper()
+    if state in _SLURM_FAILURE_STATES:
+        status = "failed"
+    elif state in _SLURM_TERMINAL_STATES:
+        status = "done"
+    elif state in _SLURM_RUNNING_STATES or state:
+        status = "running"
+    else:
+        status = "unknown"
+    return {
+        "ok": True,
+        "status": status,
+        "slurm_state": state,
+        "slurm_job_id": job_id,
+        "slurm_exit_code": record.get("ExitCode"),
+        "slurm_elapsed": record.get("Elapsed"),
+        "slurm_node_list": record.get("NodeList"),
+        "slurm_source": "sacct",
+    }
+
+
 def job_status_impl(experiment_id: str) -> dict:
     """Read filesystem-based status from a nemo_run experiment dir.
 
@@ -1636,6 +1823,23 @@ def job_status_impl(experiment_id: str) -> dict:
         if first_word in _STATUS_FAILURE_WORDS:
             any_failed = True
 
+    slurm_meta = _load_slurm_status_metadata(exp_dir)
+    if slurm_meta is not None:
+        slurm_status = _query_slurm_status(slurm_meta)
+        if slurm_status.get("ok"):
+            overall = slurm_status.get("status", "unknown")
+        else:
+            overall = "running"
+        return {
+            "ok": True,
+            "experiment_id": experiment_id,
+            "experiment_dir": str(exp_dir),
+            "status": overall,
+            "task_statuses": task_statuses,
+            "has_done_marker": done_marker.exists(),
+            "slurm_status": slurm_status,
+        }
+
     if done_marker.exists():
         overall = "failed" if any_failed else "done"
     else: