Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 109 additions & 24 deletions src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,76 @@ def gen_exec_command(self) -> str:

launcher_py = (mbridge_repo_path / "scripts" / "performance" / "setup_experiment.py").absolute()

parts = self._build_launcher_parts(args, tdef, mbridge_repo_path, launcher_py)
pre_hook_sbatch_path: Optional[Path] = None
base_slurm_params: str = ""
if self.test_run.pre_test:
pre_hook_sbatch_path = self._gen_pre_hook_sbatch()
parts = self._build_launcher_parts(args, tdef, mbridge_repo_path, launcher_py, include_slurm_params=False)
base_slurm_params = ";".join(self._collect_additional_slurm_params())
else:
parts = self._build_launcher_parts(args, tdef, mbridge_repo_path, launcher_py)

launcher_python = str((venv_path / "bin" / "python").absolute())
full_cmd = self._wrap_launcher_for_job_id_and_quiet_output(" ".join(parts), launcher_python)
full_cmd = self._wrap_launcher_for_job_id_and_quiet_output(
" ".join(parts),
launcher_python,
pre_hook_sbatch_path=pre_hook_sbatch_path,
base_slurm_params=base_slurm_params,
)

self._write_command_to_file(full_cmd, self.test_run.output_path)
return full_cmd

def _collect_additional_slurm_params(self) -> list[str]:
"""Return the additional_slurm_params list (without dependency)."""
params: list[str] = []
if self.system.gpus_per_node and self.system.supports_gpu_directives:
params.append(f"gpus-per-node={self.system.gpus_per_node}")
params.append(f"gres=gpu:{self.system.gpus_per_node}")
_, node_list = self.get_cached_nodes_spec()
if node_list:
params.append(f"nodelist={','.join(node_list)}")
elif self.test_run.exclude_nodes:
params.append(f"exclude={','.join(self.test_run.exclude_nodes)}")
for source in (self.system.extra_srun_args, self.test_run.extra_srun_args):
if source:
params.extend(self._parse_srun_args_as_slurm_params(source))
return params

def _gen_pre_hook_sbatch(self) -> Path:
"""Generate a standalone sbatch script for pre-hook tests; return its path."""
pre_hook_output = self.test_run.output_path / "pre_hook"
pre_hook_output.mkdir(parents=True, exist_ok=True)

for tr in self.test_run.pre_test.test_runs:
tr.num_nodes = self.test_run.nnodes

pre_hook_cmds = self.gen_pre_test(self.test_run.pre_test, self.test_run.output_path)

num_nodes, node_list = self.get_cached_nodes_spec()
sbatch_lines = [
"#!/bin/bash",
"# Pre-hook sbatch generated by CloudAI",
f"#SBATCH --job-name=pre_hook_{self.job_name()}",
f"#SBATCH --output={pre_hook_output.absolute() / 'stdout.txt'}",
f"#SBATCH --error={pre_hook_output.absolute() / 'stderr.txt'}",
f"#SBATCH --partition={self.system.default_partition}",
]
if self.system.account:
sbatch_lines.append(f"#SBATCH --account={self.system.account}")
if node_list:
sbatch_lines.append(f"#SBATCH --nodelist={','.join(node_list)}")
elif num_nodes:
sbatch_lines.append(f"#SBATCH --nodes={num_nodes}")
Comment on lines +133 to +136

@coderabbitai coderabbitai Bot Jul 2, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

Preserve excluded nodes in the pre-hook sbatch.

When get_cached_nodes_spec() returns only num_nodes, this script emits --nodes but drops self.test_run.exclude_nodes, so the pre-hook can run on nodes the main job excludes.

🐛 Proposed fix
         if node_list:
             sbatch_lines.append(f"`#SBATCH` --nodelist={','.join(node_list)}")
         elif num_nodes:
             sbatch_lines.append(f"`#SBATCH` --nodes={num_nodes}")
+            if self.test_run.exclude_nodes:
+                sbatch_lines.append(f"`#SBATCH` --exclude={','.join(self.test_run.exclude_nodes)}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if node_list:
sbatch_lines.append(f"#SBATCH --nodelist={','.join(node_list)}")
elif num_nodes:
sbatch_lines.append(f"#SBATCH --nodes={num_nodes}")
if node_list:
sbatch_lines.append(f"`#SBATCH` --nodelist={','.join(node_list)}")
elif num_nodes:
sbatch_lines.append(f"`#SBATCH` --nodes={num_nodes}")
if self.test_run.exclude_nodes:
sbatch_lines.append(f"`#SBATCH` --exclude={','.join(self.test_run.exclude_nodes)}")
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py` around
lines 133 - 136, The pre-hook sbatch generation in build_sbatch_lines is
dropping excluded nodes when get_cached_nodes_spec() only provides num_nodes.
Update the sbatch line assembly so self.test_run.exclude_nodes is preserved even
in the --nodes path, not just when node_list is present, and keep the existing
behavior around num_nodes in SlurmCommandGenStrategy while ensuring excluded
nodes are still passed through.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a broader issue rather than only num nodes. it also doesn't set distribution, gres/gpu-per-node and some other potential directives. what I suggest as a fix, is to update the base class SlurmCommandGenStrategy with extract directives generation into a sub-method. So that you could explicitly call it here to write directives into your buffer

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
fd -a SlurmCommandGenStrategy
fd . -a --type f -g '*slurm_command_gen_strategy.py' src/cloudai/systems/slurm
</command>

Length of output: 810


🏁 Script executed:

#!/bin/bash
fd -a slurm_command_gen_strategy.py
echo "---"
fd -a -e py . src/cloudai/systems/slurm

Length of output: 2675


🏁 Script executed:

#!/bin/bash
ast-grep outline src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Length of output: 995


🏁 Script executed:

#!/bin/bash
ast-grep run --pattern 'def _append_sbatch_directives($$$) {
$$$
}' --lang python src/cloudai/systems/slurm/slurm_command_gen_strategy.py

echo "---"
sed -n '1,400p' src/cloudai/systems/slurm/slurm_command_gen_strategy.py | grep -n "_append_sbatch_directives\|_append_nodes_related_directives\|_write_sbatch_script"

Length of output: 1164


🏁 Script executed:

#!/bin/bash
sed -n '342,470p' src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Length of output: 5433


@podkidyshev, agreed — that's the better fix. I checked src/cloudai/systems/slurm/slurm_command_gen_strategy.py, and it already has most of this logic centralized in _append_sbatch_directives (Line 375) and _append_nodes_related_directives (Line 411), covering: reservation, --distribution, nodelist/hostfile vs -N, exclude, gpus-per-node/gres, ntasks-per-node, --time, and extra_sbatch_args. The _gen_pre_hook_sbatch method in slurm_command_gen_strategy.py currently reimplements only a small subset of that (job-name, output/error, partition, account, nodelist/nodes, time_limit), which is why distribution, gres/gpus-per-node, ntasks-per-node, extra_sbatch_args, and exclude are all missing there.

The one wrinkle is that _append_sbatch_directives hardcodes self.test_run.output_path (for stdout/stderr) and self.job_name(), both of which differ for the pre-hook script (pre_hook subdir, pre_hook_ prefix). So a clean extraction would be something like pulling the node/GPU/misc-directive portion (distribution, nodelist/exclude/hostfile, gpus-per-node/gres, ntasks-per-node, extra_sbatch_args, time_limit) out of _append_sbatch_directives into its own method, e.g. _append_resource_directives(content: List[str]), that both _write_sbatch_script and _gen_pre_hook_sbatch call directly, while each caller still appends its own job-name/output/error/partition/account lines.

Since this touches the shared base class and is broader than the original single-line fix, would you like this done in this PR, or should I open a follow-up issue to track the base-class refactor so both _gen_pre_hook_sbatch and the main sbatch path stay in sync?


✏️ Learnings added
Learnt from: podkidyshev
Repo: NVIDIA/cloudai PR: 957
File: src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py:133-136
Timestamp: 2026-07-03T12:16:35.962Z
Learning: In NVIDIA/cloudai, src/cloudai/systems/slurm/slurm_command_gen_strategy.py already centralizes most SBATCH directive generation (distribution, nodelist/hostfile/-N, exclude_nodes, gpus-per-node/gres, ntasks-per-node, extra_sbatch_args, time_limit) inside `_append_sbatch_directives` and `_append_nodes_related_directives`. When a workload-specific strategy (e.g., src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py's `_gen_pre_hook_sbatch`) needs to generate a standalone sbatch script, it should reuse/extend these base-class methods rather than reimplementing a partial subset of directives, to avoid missing directives like distribution, gres/gpus-per-node, ntasks-per-node, extra_sbatch_args, and exclude_nodes. Maintainer podkidyshev suggested extracting the resource-directive portion of `_append_sbatch_directives` into its own reusable sub-method on the base class for this purpose.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


🧠 Learnings used
Learnt from: amaslenn
Repo: NVIDIA/cloudai PR: 754
File: src/cloudai/_core/registry.py:226-234
Timestamp: 2025-12-16T19:47:41.994Z
Learning: In this repository, prefer expressing behavioral documentation through tests rather than docstrings. Tests act as living, verified documentation. Reserve docstrings for interfaces or high-level descriptions, and avoid duplicating behavior that is already covered by tests.

Learnt from: amaslenn
Repo: NVIDIA/cloudai PR: 799
File: src/cloudai/workloads/vllm/report_generation_strategy.py:47-48
Timestamp: 2026-02-10T13:29:25.671Z
Learning: In src/cloudai/workloads/vllm/report_generation_strategy.py, it is acceptable to annotate parse_vllm_bench_output with a caching decorator (e.g., functools.cache) because benchmark result files are immutable after creation in typical usage. This improves repeated calls by avoiding re-parsing unchanged data. Ensure the function is side-effect free and that cache invalidation is not needed for these inputs. If inputs include non-file-state variability, consider cache keys on file path or content hash.

Learnt from: podkidyshev
Repo: NVIDIA/cloudai PR: 821
File: src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py:87-97
Timestamp: 2026-03-10T11:01:25.158Z
Learning: In NVIDIA/cloudai repository, there is no secrets management infrastructure; environment variable values (including tokens/secrets) are serialized as literal strings into generated commands and run artifacts (e.g., cloudai_generated_command.sh). This is an architectural limitation acknowledged by the maintainer (podkidyshev). During code reviews for this repo, do not flag secret-serialization issues unless a secrets management solution is available. If a future change introduces proper secrets handling, update this guideline accordingly.

Learnt from: podkidyshev
Repo: NVIDIA/cloudai PR: 821
File: src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py:69-70
Timestamp: 2026-03-10T11:09:12.655Z
Learning: In src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py, _build_launcher_parts uses ' '.join(parts) intentionally. The parts list contains raw shell constructs and pre-quoted entries (via shlex.quote) so they survive as single arguments to the Python launcher. Replacing with shlex.join(parts) would double-quote shell syntax and break the generated bash command. Do not suggest replacing ' '.join(parts) with shlex.join(parts) in this file during reviews.

Learnt from: podkidyshev
Repo: NVIDIA/cloudai PR: 829
File: src/cloudai/workloads/common/nixl.py:198-217
Timestamp: 2026-03-13T14:17:44.666Z
Learning: In NVIDIA/cloudai, when reviewing code in src/cloudai/workloads/common/nixl.py, understand that NIXL Pydantic models (e.g., NIXLExtendedCmdArgs) may define fields like device_list and total_buffer_size as str | list[str] to support sweep configuration. However, by the time methods such as _device_list_mounts in NIXLCmdGenBase are invoked, these fields are resolved to a single str or None. Casting them to str | None at the command-generation layer is intentional and correct and should not be flagged as a bug. If reviewing related code, verify that the sweep expansion happens before command generation and that the command-gen layer operates on concrete string values or None rather than lists.

Learnt from: amaslenn
Repo: NVIDIA/cloudai PR: 836
File: tests/workloads/test_llm_serving.py:35-83
Timestamp: 2026-03-16T14:10:11.280Z
Learning: In the NVIDIA/cloudai repository, require meaningful docstrings for non-test Python classes (production/library code) to improve readability and maintainability. Test doubles, test helper classes, and other classes defined inside test files (e.g., Fake*, Plain*, stub classes) are exempt from the docstring requirement. Enforce via automated checks (e.g., a docstring style/quality gate using CI) that validate presence and quality of class docstrings for production code while skipping test-only classes within test files.

Learnt from: amaslenn
Repo: NVIDIA/cloudai PR: 839
File: src/cloudai/workloads/sglang/slurm_command_gen_strategy.py:41-41
Timestamp: 2026-03-17T20:01:53.137Z
Learning: In src/cloudai/workloads/sglang/slurm_command_gen_strategy.py, 0.0.0.0 bindings (default prefill_host/decode_host in get_serve_commands and --host 0.0.0.0 in get_helper_command) are intentional for in-cluster serving. Ruff S104 warnings on these lines should not be flagged or require suppression comments, as the maintainer (amaslenn) has explicitly confirmed this is the intended behavior.

Learnt from: podkidyshev
Repo: NVIDIA/cloudai PR: 846
File: src/cloudai/reporter.py:168-168
Timestamp: 2026-03-24T23:24:17.668Z
Learning: In the NVIDIA/cloudai repository, follow the project-wide logging convention: use root/global logging calls like `logging.info(...)` and `logging.warning(...)` rather than creating module-level loggers via `logging.getLogger(__name__)`. When reviewing Python code, do not flag or reference LOG015 for this reason and do not suggest switching to module-level loggers. Separately, Ruff’s formatting rule (e.g., G004 f-string vs %-style) is independent and may still be raised if applicable.

Learnt from: podkidyshev
Repo: NVIDIA/cloudai PR: 869
File: src/cloudai/_core/test_scenario.py:23-23
Timestamp: 2026-04-14T11:05:59.444Z
Learning: In the NVIDIA/cloudai repository, do not surface or fail code-review findings for Ruff UP035 warnings about deprecated typing aliases (e.g., typing.List, typing.Set, typing.Type) on individual files. This deprecation cleanup is planned for a dedicated whole-repo modernization bump, so UP035 should be treated as an expected/known issue rather than something reviewers should block at the per-file level.

Learnt from: orbalayla-nvidia
Repo: NVIDIA/cloudai PR: 899
File: src/cloudai/_core/grading_strategy.py:2-2
Timestamp: 2026-06-07T12:01:43.285Z
Learning: In the NVIDIA/cloudai repo, copyright headers’ year formatting must match the test logic enforced by tests/test_check_copyright_headers.py (via collect_years_same_file and _format_years_to_ranges): use comma-separated year lists for non-consecutive “touched” years (e.g., "2024, 2026") and use hyphen ranges only for consecutive years (e.g., "2024-2026"). Do not “normalize” "YYYY, YYYY" into "YYYY-YYYY" unless the missing years are actually present in git history; otherwise the copyright header check will fail.

Learnt from: rutayan-nv
Repo: NVIDIA/cloudai PR: 901
File: src/cloudai/cli/handlers.py:147-162
Timestamp: 2026-06-24T01:05:39.500Z
Learning: For NVIDIA/cloudai: Ruff rule G004 (logging-f-string) is not enabled in this repo’s Ruff configuration, and CI Ruff checks pass even with f-string usage in logging calls. During code reviews, do not flag or request changes specifically for Ruff G004 violations; leave logging f-strings as-is in .py files unless CI or other enabled rules fail.

if self.test_run.time_limit:
sbatch_lines.append(f"#SBATCH --time={self.test_run.time_limit}")
sbatch_lines.extend(["", pre_hook_cmds])

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🔴 Critical | ⚡ Quick win

Exit non-zero when the pre-hook success check fails.

gen_pre_test() only sets PRE_TEST_SUCCESS; without an explicit exit here, the pre-hook sbatch can finish successfully and unblock the main training job even after a failed pre-test.

🐛 Proposed fix
-        sbatch_lines.extend(["", pre_hook_cmds])
+        sbatch_lines.extend(
+            [
+                "",
+                pre_hook_cmds,
+                "",
+                'if [ "${PRE_TEST_SUCCESS:-0}" -ne 1 ]; then',
+                '  echo "Pre-hook failed; not releasing dependent training job." >&2',
+                "  exit 1",
+                "fi",
+            ]
+        )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
sbatch_lines.extend(["", pre_hook_cmds])
sbatch_lines.extend(
[
"",
pre_hook_cmds,
"",
'if [ "${PRE_TEST_SUCCESS:-0}" -ne 1 ]; then',
' echo "Pre-hook failed; not releasing dependent training job." >&2',
" exit 1",
"fi",
]
)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py` at line
139, The pre-hook submission in gen_pre_test / SlurmCommandGenStrategy currently
only propagates PRE_TEST_SUCCESS, so a failed pre-test can still let the sbatch
wrapper exit successfully and release the main job. Update the command assembly
around sbatch_lines.extend and pre_hook_cmds to explicitly exit non-zero when
the pre-test check fails, using the gen_pre_test-generated success flag as the
gate so the pre-hook sbatch cannot complete as success on failure.


sbatch_path = self.test_run.output_path / "pre_hook_sbatch_script.sh"
sbatch_path.write_text("\n".join(sbatch_lines))
sbatch_path.chmod(sbatch_path.stat().st_mode | stat.S_IXUSR)
return sbatch_path

def store_test_run(self) -> None:
test_cmd = self.gen_exec_command()
trd = TestRunDetails.from_test_run(self.test_run, test_cmd=test_cmd, full_cmd=test_cmd)
Expand Down Expand Up @@ -166,12 +229,21 @@ def _normalize_cuda_graph_scope_arg(self, val: Any) -> str:
parts = [p.strip().strip("\"'") for p in s.split(",") if p.strip()]
return ",".join(parts)

def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher_python: str) -> str:
def _wrap_launcher_for_job_id_and_quiet_output(
self,
launcher_cmd: str,
launcher_python: str,
pre_hook_sbatch_path: Optional[Path] = None,
base_slurm_params: str = "",
) -> str:
"""
Run the Megatron-Bridge launcher quietly and ensure CloudAI can parse a job ID.

CloudAI's SlurmRunner expects stdout to include "Submitted batch job <id>".
This writes a readable wrapper script (with section breaks) into the test output directory, then runs it.

If pre_hook_sbatch_path is provided, the pre-hook sbatch is submitted first and its job ID is used as
a Slurm dependency (afterok) for the main training job, so training only starts if the pre-hook passed.
"""
output_dir = self.test_run.output_path.absolute()
output_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -181,6 +253,28 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher

container_runtime_exports = self._container_runtime_env_exports()

pre_hook_lines: list[str] = []
launch_line: str
if pre_hook_sbatch_path is not None:
pre_hook_lines = [
f'PRE_HOOK_SBATCH="{pre_hook_sbatch_path.absolute()}"',
'PRE_HOOK_OUTPUT=$(sbatch "$PRE_HOOK_SBATCH" 2>&1)',
'PRE_HOOK_JOB_ID=$(echo "$PRE_HOOK_OUTPUT" | grep -Eo "Submitted batch job [0-9]+" | grep -Eo "[0-9]+" | tail -n1 || true)', # noqa: E501
'if [ -z "$PRE_HOOK_JOB_ID" ]; then',
' echo "Failed to submit pre-hook job: $PRE_HOOK_OUTPUT" >&2',
" exit 1",
"fi",
'echo "Submitted pre-hook batch job $PRE_HOOK_JOB_ID"',
f'ADDITIONAL_SLURM_PARAMS="{base_slurm_params}"',
'if [ -n "$PRE_HOOK_JOB_ID" ]; then',
' ADDITIONAL_SLURM_PARAMS="${ADDITIONAL_SLURM_PARAMS};dependency=afterok:${PRE_HOOK_JOB_ID}"',
Comment on lines +268 to +270

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

Quote base Slurm params and avoid a leading empty parameter.

base_slurm_params is injected into a double-quoted shell assignment, so $, backticks, or quotes from extra Slurm args can be expanded by the wrapper. Also, an empty base value becomes ;dependency=....

🐛 Proposed fix
         if pre_hook_sbatch_path is not None:
+            quoted_base_slurm_params = shlex.quote(base_slurm_params)
             pre_hook_lines = [
                 f'PRE_HOOK_SBATCH="{pre_hook_sbatch_path.absolute()}"',
                 'PRE_HOOK_OUTPUT=$(sbatch "$PRE_HOOK_SBATCH" 2>&1)',
@@
                 "fi",
                 'echo "Submitted pre-hook batch job $PRE_HOOK_JOB_ID"',
-                f'ADDITIONAL_SLURM_PARAMS="{base_slurm_params}"',
-                'if [ -n "$PRE_HOOK_JOB_ID" ]; then',
+                f"ADDITIONAL_SLURM_PARAMS={quoted_base_slurm_params}",
+                'if [ -n "$ADDITIONAL_SLURM_PARAMS" ]; then',
                 '  ADDITIONAL_SLURM_PARAMS="${ADDITIONAL_SLURM_PARAMS};dependency=afterok:${PRE_HOOK_JOB_ID}"',
+                "else",
+                '  ADDITIONAL_SLURM_PARAMS="dependency=afterok:${PRE_HOOK_JOB_ID}"',
                 "fi",
                 "",
             ]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
f'ADDITIONAL_SLURM_PARAMS="{base_slurm_params}"',
'if [ -n "$PRE_HOOK_JOB_ID" ]; then',
' ADDITIONAL_SLURM_PARAMS="${ADDITIONAL_SLURM_PARAMS};dependency=afterok:${PRE_HOOK_JOB_ID}"',
quoted_base_slurm_params = shlex.quote(base_slurm_params)
pre_hook_lines = [
f'PRE_HOOK_SBATCH="{pre_hook_sbatch_path.absolute()}"',
'PRE_HOOK_OUTPUT=$(sbatch "$PRE_HOOK_SBATCH" 2>&1)',
# ...
"fi",
'echo "Submitted pre-hook batch job $PRE_HOOK_JOB_ID"',
f"ADDITIONAL_SLURM_PARAMS={quoted_base_slurm_params}",
'if [ -n "$ADDITIONAL_SLURM_PARAMS" ]; then',
' ADDITIONAL_SLURM_PARAMS="${ADDITIONAL_SLURM_PARAMS};dependency=afterok:${PRE_HOOK_JOB_ID}"',
"else",
' ADDITIONAL_SLURM_PARAMS="dependency=afterok:${PRE_HOOK_JOB_ID}"',
"fi",
"",
]
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py` around
lines 268 - 270, The shell wrapper in the Slurm command generation logic is
inserting base Slurm args directly into the ADDITIONAL_SLURM_PARAMS assignment,
which can allow shell expansion and produce a leading separator when the base
value is empty. Update the Megatron bridge’s slurm_command_gen_strategy around
the ADDITIONAL_SLURM_PARAMS construction to safely quote/escape
base_slurm_params before embedding it in the shell snippet, and adjust the
PRE_HOOK_JOB_ID append logic so it only adds the dependency separator when
ADDITIONAL_SLURM_PARAMS already has content. Use the existing command assembly
in the strategy method that emits these lines to keep the fix localized.

"fi",
"",
]
launch_line = f'{launcher_cmd} --additional_slurm_params "$ADDITIONAL_SLURM_PARAMS" >>"$LOG" 2>&1 || LAUNCH_RC=$?' # noqa: E501
else:
launch_line = f'{launcher_cmd} >>"$LOG" 2>&1 || LAUNCH_RC=$?'

script_lines = [
"#!/usr/bin/env bash",
"set -o pipefail",
Expand All @@ -195,6 +289,7 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher
"",
*container_runtime_exports,
"",
*pre_hook_lines,
': >"$LOG"',
"WANDB_INSTALL_RC=0",
f'{shlex.quote(launcher_python)} -m pip install wandb numpy==1.26.4 >>"$LOG" 2>&1 || WANDB_INSTALL_RC=$?',
Expand All @@ -205,7 +300,7 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher
"fi",
"",
"LAUNCH_RC=0",
f'{launcher_cmd} >>"$LOG" 2>&1 || LAUNCH_RC=$?',
launch_line,
"",
# Parse job id from Megatron-Bridge output (multiple possible formats)
# Patterns: "Submitted batch job 694112", "Job id: 694112", "- Job id: 694112", "Job ID: 694112"
Expand Down Expand Up @@ -247,7 +342,12 @@ def _list_or_comma_str(self, val: str | list[str] | None) -> Optional[str]:
raise RuntimeError("Unexpected sweeps list. At this point code expects scalars only")

def _build_launcher_parts( # noqa: C901
self, args: MegatronBridgeCmdArgs, tdef: MegatronBridgeTestDefinition, repo_path: Path, launcher_py: Path
self,
args: MegatronBridgeCmdArgs,
tdef: MegatronBridgeTestDefinition,
repo_path: Path,
launcher_py: Path,
include_slurm_params: bool = True,
Comment on lines 344 to +350

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

📐 Maintainability & Code Quality | 🔵 Trivial | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
ruff check --select FBT001,FBT002 src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py

Repository: NVIDIA/cloudai

Length of output: 1524


Make include_slurm_params keyword-only. This removes the FBT001/FBT002 warning and avoids boolean positional calls; the existing keyword call site stays unchanged.

♻️ Proposed fix
         args: MegatronBridgeCmdArgs,
         tdef: MegatronBridgeTestDefinition,
         repo_path: Path,
         launcher_py: Path,
+        *,
         include_slurm_params: bool = True,
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def _build_launcher_parts( # noqa: C901
self, args: MegatronBridgeCmdArgs, tdef: MegatronBridgeTestDefinition, repo_path: Path, launcher_py: Path
self,
args: MegatronBridgeCmdArgs,
tdef: MegatronBridgeTestDefinition,
repo_path: Path,
launcher_py: Path,
include_slurm_params: bool = True,
def _build_launcher_parts( # noqa: C901
self,
args: MegatronBridgeCmdArgs,
tdef: MegatronBridgeTestDefinition,
repo_path: Path,
launcher_py: Path,
*,
include_slurm_params: bool = True,
🧰 Tools
🪛 Ruff (0.15.20)

[warning] 344-344: Too many branches (28 > 12)

(PLR0912)


[warning] 344-344: Too many statements (152 > 50)

(PLR0915)


[warning] 350-350: Boolean-typed positional argument in function definition

(FBT001)


[warning] 350-350: Boolean default positional argument in function definition

(FBT002)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py` around
lines 344 - 350, Make the include_slurm_params parameter keyword-only in
_build_launcher_parts so it cannot be passed positionally and the FBT001/FBT002
warning is eliminated. Update the _build_launcher_parts function signature to
force include_slurm_params as a keyword-only argument, and ensure any internal
call sites use the named argument form; the existing keyword-based caller should
continue to work unchanged.

Source: Linters/SAST tools

) -> list[str]:
fields_set = args.model_fields_set
force_fields = {
Expand Down Expand Up @@ -451,25 +551,10 @@ def add_field(field: str, flag: str, value: Any) -> None:
add_field("nsys_trace", "--nsys_trace", self._list_or_comma_str(args.nsys_trace))
add_field("nsys_extra_args", "--nsys_extra_args", self._list_or_comma_str(args.nsys_extra_args))

additional_slurm_params: list[str] = []

if self.system.gpus_per_node and self.system.supports_gpu_directives:
additional_slurm_params.append(f"gpus-per-node={self.system.gpus_per_node}")
additional_slurm_params.append(f"gres=gpu:{self.system.gpus_per_node}")

_, node_list = self.get_cached_nodes_spec()
if node_list:
nodelist_str = ",".join(node_list)
additional_slurm_params.append(f"nodelist={nodelist_str}")
elif self.test_run.exclude_nodes:
additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}")

for source in (self.system.extra_srun_args, self.test_run.extra_srun_args):
if source:
additional_slurm_params.extend(self._parse_srun_args_as_slurm_params(source))

if additional_slurm_params:
parts.extend(["--additional_slurm_params", shlex.quote(";".join(additional_slurm_params))])
if include_slurm_params:
additional_slurm_params = self._collect_additional_slurm_params()
if additional_slurm_params:
parts.extend(["--additional_slurm_params", shlex.quote(";".join(additional_slurm_params))])

# Config variant
add_field("config_variant", "-cv", args.config_variant)
Expand Down
Loading