Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions tools/launcher/common/smoke/hostname.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -euo pipefail

echo "MODEL_OPT_HOSTNAME_SMOKE_START"
hostname
echo "MODEL_OPT_HOSTNAME_SMOKE_DONE"
210 changes: 207 additions & 3 deletions tools/mcp/modelopt_mcp/bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from __future__ import annotations

import hashlib
import json
import os
import re
import shutil
Expand Down Expand Up @@ -68,6 +69,25 @@

_GIT_SHA_RE = re.compile(r"^[0-9a-fA-F]{7,40}$")
_SAFE_PATH_TOKEN_RE = re.compile(r"[^A-Za-z0-9_.-]+")
_SLURM_TERMINAL_STATES: frozenset[str] = frozenset(
{
"COMPLETED",
"FAILED",
"CANCELLED",
"TIMEOUT",
"OUT_OF_MEMORY",
"NODE_FAIL",
"PREEMPTED",
"BOOT_FAIL",
}
)
_SLURM_FAILURE_STATES: frozenset[str] = frozenset(
{"FAILED", "CANCELLED", "TIMEOUT", "OUT_OF_MEMORY", "NODE_FAIL", "PREEMPTED", "BOOT_FAIL"}
)
_SLURM_RUNNING_STATES: frozenset[str] = frozenset(
{"PENDING", "RUNNING", "CONFIGURING", "COMPLETING", "REQUEUED", "RESIZING", "SUSPENDED"}
)
_SLURM_STATUS_META = ".modelopt_mcp_slurm.json"


@dataclass
Expand Down Expand Up @@ -551,9 +571,7 @@ def _launcher_argv(abs_yaml: Path, checkout: SourceCheckout | None, *flags: str)
return [
_uv_binary(),
"run",
"--reinstall-package",
"modelopt-launcher",
"--project",
"--with-editable",
str(checkout.launcher_dir),
"modelopt-launcher",
"--yaml",
Expand Down Expand Up @@ -1345,6 +1363,16 @@ def submit_job_impl(
**_source_result_fields(checkout),
}

_write_slurm_status_metadata(
experiment_id=experiment_id,
slurm_job_id=slurm_job_id,
cluster_host=cluster_host,
cluster_user=cluster_user,
identity=identity,
control_socket=control_socket,
reconnect_command=reconnect_command,
)

return {
"ok": True,
"executor": "slurm",
Expand Down Expand Up @@ -1593,6 +1621,165 @@ def _experiment_not_found_diagnostic() -> str:
)


def _write_slurm_status_metadata(
*,
experiment_id: str,
slurm_job_id: str | None,
cluster_host: str | None,
cluster_user: str | None,
identity: str | None,
control_socket: str | None,
reconnect_command: str | None,
) -> None:
"""Persist Slurm submit metadata so status polling can query Slurm."""
if not slurm_job_id or not cluster_host:
return
exp_dir = _resolve_experiment_dir(experiment_id)
if exp_dir is None:
return
payload = {
"executor": "slurm",
"experiment_id": experiment_id,
"slurm_job_id": slurm_job_id,
"cluster_host": cluster_host,
"cluster_user": cluster_user,
"identity": identity,
"control_socket": os.path.expanduser(control_socket) if control_socket else None,
"reconnect_command": reconnect_command,
}
try:
(exp_dir / _SLURM_STATUS_META).write_text(
json.dumps(payload, sort_keys=True, indent=2),
encoding="utf-8",
)
except OSError:
# Status remains filesystem-based if the sidecar cannot be written.
return


def _load_slurm_status_metadata(exp_dir: Path) -> dict | None:
"""Load Slurm status metadata written by submit_job_impl."""
try:
payload = json.loads((exp_dir / _SLURM_STATUS_META).read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
return None
if payload.get("executor") != "slurm":
return None
if not payload.get("slurm_job_id") or not payload.get("cluster_host"):
return None
return payload


def _ssh_argv_for_slurm(meta: dict, remote_command: str) -> list[str]:
"""Build a fixed SSH argv for querying Slurm status."""
argv = ["ssh", "-o", "BatchMode=yes"]
if meta.get("identity"):
argv += ["-i", str(meta["identity"])]
if meta.get("control_socket"):
argv += [
"-o",
f"ControlPath={os.path.expanduser(str(meta['control_socket']))}",
"-o",
"ControlMaster=no",
]
user = meta.get("cluster_user")
host = meta["cluster_host"]
target = f"{user}@{host}" if user else host
return [*argv, target, remote_command]


def _parse_slurm_records(text: str, job_id: str) -> dict[str, str] | None:
"""Return the parent Slurm job record from sacct -P output."""
lines = [line for line in text.splitlines() if line.strip()]
if len(lines) < 2:
return None
headers = lines[0].split("|")
for line in lines[1:]:
values = line.split("|")
record = dict(zip(headers, values))
if record.get("JobID") == job_id:
return record
return None


def _query_slurm_status(meta: dict) -> dict:
"""Query remote Slurm for a detached job's authoritative status."""
job_id = str(meta["slurm_job_id"])
sacct_cmd = f"sacct -j {job_id} --format=JobID,State,ExitCode,Elapsed,NodeList -P"
try:
proc = subprocess.run( # nosec B603 - fixed ssh argv, no shell.
_ssh_argv_for_slurm(meta, sacct_cmd),
capture_output=True,
text=True,
timeout=30,
check=False,
)
except (FileNotFoundError, subprocess.TimeoutExpired) as e:
return {
"ok": False,
"reason": "slurm_status_query_failed",
"diagnostic": f"Unable to query Slurm status via SSH: {e}.",
}

record = _parse_slurm_records(proc.stdout, job_id) if proc.returncode == 0 else None
if record is None:
squeue_cmd = f"squeue -j {job_id} -h -o '%T|%M|%R'"
try:
squeue = subprocess.run( # nosec B603 - fixed ssh argv, no shell.
_ssh_argv_for_slurm(meta, squeue_cmd),
capture_output=True,
text=True,
timeout=30,
check=False,
)
except (FileNotFoundError, subprocess.TimeoutExpired) as e:
return {
"ok": False,
"reason": "slurm_status_query_failed",
"diagnostic": f"Unable to query Slurm queue via SSH: {e}.",
"sacct_stderr_tail": _tail(proc.stderr),
}
if squeue.returncode == 0 and squeue.stdout.strip():
state = squeue.stdout.strip().split("|", 1)[0].upper()
return {
"ok": True,
"status": "running",
"slurm_state": state,
"slurm_job_id": job_id,
"slurm_source": "squeue",
}
return {
"ok": False,
"reason": "slurm_status_not_found",
"diagnostic": "Slurm job was not found by sacct or squeue.",
"slurm_job_id": job_id,
"sacct_stdout_tail": _tail(proc.stdout),
"sacct_stderr_tail": _tail(proc.stderr),
"squeue_stdout_tail": _tail(squeue.stdout),
"squeue_stderr_tail": _tail(squeue.stderr),
}

state = (record.get("State", "").split() or [""])[0].upper()
if state in _SLURM_FAILURE_STATES:
status = "failed"
elif state in _SLURM_TERMINAL_STATES:
status = "done"
elif state in _SLURM_RUNNING_STATES or state:
status = "running"
else:
status = "unknown"
return {
"ok": True,
"status": status,
"slurm_state": state,
"slurm_job_id": job_id,
"slurm_exit_code": record.get("ExitCode"),
"slurm_elapsed": record.get("Elapsed"),
"slurm_node_list": record.get("NodeList"),
"slurm_source": "sacct",
}


def job_status_impl(experiment_id: str) -> dict:
"""Read filesystem-based status from a nemo_run experiment dir.

Expand Down Expand Up @@ -1636,6 +1823,23 @@ def job_status_impl(experiment_id: str) -> dict:
if first_word in _STATUS_FAILURE_WORDS:
any_failed = True

slurm_meta = _load_slurm_status_metadata(exp_dir)
if slurm_meta is not None:
slurm_status = _query_slurm_status(slurm_meta)
if slurm_status.get("ok"):
overall = slurm_status.get("status", "unknown")
else:
overall = "running"
return {
"ok": True,
"experiment_id": experiment_id,
"experiment_dir": str(exp_dir),
"status": overall,
"task_statuses": task_statuses,
"has_done_marker": done_marker.exists(),
"slurm_status": slurm_status,
}
Comment on lines +1826 to +1841

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🩺 Stability & Availability | 🟡 Minor | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Inspect relevant symbols and surrounding code in bridge.py
python3 - <<'PY'
from pathlib import Path
path = Path("tools/mcp/modelopt_mcp/bridge.py")
text = path.read_text().splitlines()

symbols = ["def _query_slurm_status", "def _load_slurm_status_metadata", "def wait_for_experiment_impl", "status_*.out", "_DONE", "slurm_status_not_found"]
for sym in symbols:
    for i, line in enumerate(text, 1):
        if sym in line:
            start = max(1, i-25)
            end = min(len(text), i+120)
            print(f"\n=== {sym} @ line {i} ===")
            for j in range(start, end+1):
                print(f"{j:5d}: {text[j-1]}")
            break

# Search for other slurm status handling and wait-timeout behavior
print("\n=== SEARCH RESULTS ===")
for needle in ["wait_timeout", "overall = \"running\"", "has_done_marker", "status_*.out", "slurm_status_not_found", "task_statuses"]:
    print(f"\n-- {needle} --")
    for i, line in enumerate(text, 1):
        if needle in line:
            print(f"{i}: {line}")
PY

Repository: NVIDIA/Model-Optimizer

Length of output: 39024


🏁 Script executed:

#!/bin/bash
set -euo pipefail

sed -n '1800,1955p' tools/mcp/modelopt_mcp/bridge.py | cat -n

Repository: NVIDIA/Model-Optimizer

Length of output: 6877


🏁 Script executed:

#!/bin/bash
set -euo pipefail

rg -n "_save_slurm_status_metadata|_remove_slurm_status_metadata|_load_slurm_status_metadata|slurm_enabled|slurm_meta" tools/mcp/modelopt_mcp/bridge.py

Repository: NVIDIA/Model-Optimizer

Length of output: 383


🏁 Script executed:

#!/bin/bash
set -euo pipefail

sed -n '1978,2175p' tools/mcp/modelopt_mcp/bridge.py | cat -n

Repository: NVIDIA/Model-Optimizer

Length of output: 9278


🏁 Script executed:

#!/bin/bash
set -euo pipefail

sed -n '2060,2170p' tools/mcp/modelopt_mcp/bridge.py | cat -n

Repository: NVIDIA/Model-Optimizer

Length of output: 4902


🏁 Script executed:

#!/bin/bash
set -euo pipefail

rg -n "TERMINAL_STATUSES|class ExperimentStatusResult|class WaitForExperimentResult|class QuerySlurmStatusResult" tools/mcp/modelopt_mcp/bridge.py

Repository: NVIDIA/Model-Optimizer

Length of output: 160


Handle slurm_status_not_found with local completion markers If the Slurm query comes back slurm_status_not_found, this branch still reports running even when _DONE or status_*.out already mark the experiment complete. That can keep wait_for_experiment_impl polling until timeout for an aged-out finished job; fall back to the local markers before defaulting to running.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tools/mcp/modelopt_mcp/bridge.py` around lines 1826 - 1841, In the Slurm
status handling branch of the experiment status lookup, do not always fall back
to running when _query_slurm_status returns a non-ok result. Update the logic
around _load_slurm_status_metadata and _query_slurm_status to treat
slurm_status_not_found as a special case and consult the local completion
indicators (_DONE and status_*.out) before defaulting to running. Keep the
returned slurm_status payload intact, but ensure the overall status can become
completed when the local markers show the experiment has finished so
wait_for_experiment_impl stops polling correctly.


if done_marker.exists():
overall = "failed" if any_failed else "done"
else:
Expand Down
Loading
Loading