NVIDIA · denera · Dec 2, 2025 · Dec 2, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/build_tools/jax.py b/build_tools/jax.py
@@ -103,6 +103,9 @@ def setup_jax_extension(
 
     setup_mpi_flags(include_dirs, cxx_flags)
 
+    if bool(int(os.getenv("NVTE_WITH_CUBLASMP", 0))):
+        cxx_flags.append("-DNVTE_WITH_CUBLASMP")
+
     # Define TE/JAX as a Pybind11Extension
     from pybind11.setup_helpers import Pybind11Extension
 

diff --git a/build_tools/pytorch.py b/build_tools/pytorch.py
@@ -6,6 +6,7 @@
 
 import os
 from pathlib import Path
+from importlib import metadata
 
 import setuptools
 
@@ -88,6 +89,9 @@ def setup_pytorch_extension(
         libraries.append("nvshmem_host")
         cxx_flags.append("-DNVTE_ENABLE_NVSHMEM")
 
+    if bool(int(os.getenv("NVTE_WITH_CUBLASMP", 0))):
+        cxx_flags.append("-DNVTE_WITH_CUBLASMP")
+
     # Construct PyTorch CUDA extension
     sources = [str(path) for path in sources]
     include_dirs = [str(path) for path in include_dirs]

diff --git a/examples/jax/collective_gemm/common.py b/examples/jax/collective_gemm/common.py
@@ -4,11 +4,14 @@
 """Shared functions for the collective GEMM tests"""
 
 import argparse
+import glob
+import os
 
 import jax
 import jax.numpy as jnp
 import numpy as np
 from jax.experimental import mesh_utils
+from jax.experimental.multihost_utils import sync_global_devices
 
 from transformer_engine.jax.cpp_extensions.gemm import collective_gemm_bootstrap
 
@@ -56,9 +59,9 @@ def assert_allclose(actual, desired, rtol=None, atol=None, dtype=None, **kwargs)
         tols["atol"] = atol
 
     if not isinstance(actual, float):
-        actual = actual.astype(jnp.float32)
+        actual = np.asarray(actual, dtype=np.float32)
     if not isinstance(desired, float):
-        desired = desired.astype(jnp.float32)
+        desired = np.asarray(desired, dtype=np.float32)
 
     np.testing.assert_allclose(actual, desired, **tols, **kwargs)
 
@@ -96,6 +99,20 @@ def _initialize_distributed(args):
 
     assert args.num_devices_per_process == 1, "Only single process single GPU is supported!"
 
+    # cuBLASMp issues NCCL collectives on its own communication stream
+    # inside the GEMM custom call. Add COLLECTIVES so XLA captures those
+    # ops alongside the custom call instead of invalidating the capture.
+    # Lower the min-graph-size to 1 so single-matmul modules also get
+    # captured -- otherwise small test cases skip the captured path.
+    # Userbuffers does not need either flag.
+    if args.use_cublasmp:
+        xla_flags = os.environ.get("XLA_FLAGS", "")
+        os.environ["XLA_FLAGS"] = (
+            xla_flags
+            + " --xla_gpu_enable_command_buffer=+COLLECTIVES"
+            + " --xla_gpu_graph_min_graph_size=1"
+        )
+
     print(
         f"Initializing JAX distributed with coordinator={args.coordinator_address}, "
         f"num_processes={args.num_processes}, process_id={args.process_id}"
@@ -118,6 +135,20 @@ def _initialize_distributed(args):
     devices_per_process = 1
     num_total_devices = args.num_processes
 
+    # Remove stale NCCL unique ID files left by previous (possibly crashed) runs. The files
+    # are used for one-time bootstrap coordination, so a stale one makes non-leader processes
+    # read an old unique ID and breaks NCCL init. Only process 0 cleans up; the global
+    # barrier below holds every process until that is done, before any TP leader writes a
+    # fresh file. glob.escape() keeps metacharacters in the configured directory literal.
+    nccl_base_path = os.environ.get("NVTE_JAX_NCCL_FILE_PATH", "/tmp")
+    if args.process_id == 0:
+        for f in glob.glob(os.path.join(glob.escape(nccl_base_path), "nccl_*_unique_id_*.bin")):
+            try:
+                os.remove(f)
+            except OSError:  # best-effort cleanup: the file may already be gone
+                pass
+    sync_global_devices("nccl_id_cleanup")
+
     print(
         f"Initializing CGEMM communicator with num_total_devices={num_total_devices},"
         f" devices_per_process={devices_per_process}, process_id={args.process_id}"
@@ -128,6 +159,7 @@ def _initialize_distributed(args):
         num_devices_per_process=devices_per_process,
         process_id=args.process_id,
         tensor_parallel_size=args.tensor_parallel_size,
+        use_cublasmp=args.use_cublasmp,
     )
 
 
@@ -199,6 +231,16 @@ def cgemm_parser(description="Collective GEMM test on multi-GPU with tensor para
     parser.add_argument("--seq-len", type=int, default=8192, help="Sequence length for testing")
     parser.add_argument("--hidden-in", type=int, default=4096, help="Input hidden dimension")
     parser.add_argument("--hidden-out", type=int, default=8192, help="Output hidden dimension")
+    parser.add_argument(
+        "--std",
+        type=float,
+        default=0.023,
+        help=(
+            "Standard deviation for input/weight/bias tensors. Matches TE/PyTorch's"
+            " run_gemm_with_overlap.py default so both frameworks evaluate FP8 noise"
+            " on equal footing."
+        ),
+    )
     parser.add_argument(
         "--collective-type",
         type=str,
@@ -224,5 +266,11 @@ def cgemm_parser(description="Collective GEMM test on multi-GPU with tensor para
     parser.add_argument(
         "--enable-result-check", action="store_true", default=True, help="Enable result checking"
     )
+    parser.add_argument(
+        "--use-cublasmp",
+        action="store_true",
+        default=False,
+        help="Use the cuBLASMp backend for overlapping collective operations with GEMM computation",
+    )
 
     return parser
diff --git a/examples/jax/collective_gemm/conftest.py b/examples/jax/collective_gemm/conftest.py
@@ -5,19 +5,31 @@
 """config for collective_gemm tests"""
 import pytest
 
+import transformer_engine.jax  # noqa: F401 - must load libtransformer_engine.so before transformer_engine_jax
+from transformer_engine_jax import nvte_built_with_cublasmp
+
 
 def pytest_addoption(parser):
     """Pytest hook for collective_gemm tests"""
     parser.addoption("--coordinator-address", action="store", default="localhost:12345")
     parser.addoption("--num-processes", action="store", default=1)
     parser.addoption("--process-id", action="store", default=0)
     parser.addoption("--local-device-ids", action="store", default=None)
+    parser.addoption("--use-cublasmp", action="store_true", default=False)
 
 
 @pytest.fixture(autouse=True)
 def distributed_args(request):
     """Fixture for querying distributed initialization arguments"""
     if request.cls:
+        use_cublasmp = request.config.getoption("--use-cublasmp")
+        if use_cublasmp and not nvte_built_with_cublasmp():
+            pytest.skip(
+                "Collective GEMM cuBLASMp backend tests require Transformer Engine to be built "
+                "with NVTE_WITH_CUBLASMP=1."
+            )
+        if use_cublasmp and "mxfp8" in request.node.name.lower():
+            pytest.skip("MXFP8 is not supported by the cuBLASMp backend wrappers in TE/common.")
         request.cls.coordinator_address = request.config.getoption("--coordinator-address")
         request.cls.num_processes = int(request.config.getoption("--num-processes"))
         request.cls.process_id = int(request.config.getoption("--process-id"))
@@ -27,3 +39,4 @@ def distributed_args(request):
             if request.cls.local_device_ids is None
             else len(request.cls.local_device_ids.split(","))
         )
+        request.cls.use_cublasmp = use_cublasmp
diff --git a/examples/jax/collective_gemm/run_test_cgemm.sh b/examples/jax/collective_gemm/run_test_cgemm.sh
@@ -23,6 +23,30 @@ else
   echo "NVLINK support detected"
 fi
 
+echo "*** Checking cuBLASMp support in TE build ***"
+CUBLASMP_SUPPORT=$(python3 - <<'PY'
+try:
+    import transformer_engine.jax
+    from transformer_engine_jax import nvte_built_with_cublasmp
+except Exception as exc:
+    print(f"error:{exc}")
+    raise SystemExit(0)
+# Report the build flag as a single "1"/"0" token for the shell comparison below.
+print("1" if nvte_built_with_cublasmp() else "0")
+PY
+)
+# "1"/"0" pick the backend list; any other output (e.g. "error:...") aborts the run.
+if [[ "$CUBLASMP_SUPPORT" == "1" ]]; then
+  echo "cuBLASMp backend support detected"
+  BACKENDS=("cublasmp" "userbuffers")
+elif [[ "$CUBLASMP_SUPPORT" == "0" ]]; then
+  echo "cuBLASMp backend support not detected; skipping cuBLASMp backend tests"
+  BACKENDS=("userbuffers")
+else
+  echo "Failed to query cuBLASMp support from transformer_engine_jax: $CUBLASMP_SUPPORT" >&2
+  exit 1
+fi
+
 # Define individual test cases to run (file::class::method)
 # DelayedScalingFP8 and CurrentScalingFP8 use the same GEMM so we don't need to test both cases all
 # the time.
@@ -93,50 +117,62 @@ for TEST_CASE in "${TEST_CASES[@]}"; do
   # Clear PIDs array for this test case
   PIDS=()
 
-  for i in $(seq 0 $(($NUM_GPUS - 1))); do
-    # Define output file for logs
-    LOG_FILE="${TEST_NAME}_gpu_${i}.log"
-
-    if [ $i -eq 0 ]; then
-      # For process 0: show live output AND save to log file using tee
-      echo "=== Live output from process 0 ==="
-      pytest -s -c "$TE_PATH/tests/jax/pytest.ini" \
-        -vs --junitxml=$XML_LOG_DIR/collective_gemm_${TEST_NAME}.xml \
-        "$TE_PATH/examples/jax/collective_gemm/$TEST_CASE" \
-        --num-processes=$NUM_GPUS \
-        --process-id=$i 2>&1 | tee "$LOG_FILE" &
-      PID=$!
-      PIDS+=($PID)
+  for BACKEND in "${BACKENDS[@]}"; do
+    echo "Setting backend to $BACKEND for test $TEST_NAME"
+
+    # Start each backend with an empty list; the previous backend's PIDs are already reaped.
+    PIDS=()
+
+    for i in $(seq 0 $(($NUM_GPUS - 1))); do
+      # Define output file for logs
+      LOG_FILE="${TEST_NAME}_gpu_${i}_${BACKEND}.log"
+
+      test_args=(
+        "--num-processes=$NUM_GPUS"
+        "--process-id=$i"
+      )
+      if [ "$BACKEND" == "cublasmp" ]; then
+        test_args+=("--use-cublasmp")
+      fi
+
+      if [ $i -eq 0 ]; then
+        # For process 0: show live output AND save to log file using tee
+        echo "=== Live output from process 0 ==="
+        pytest -s -c "${TE_PATH}/tests/jax/pytest.ini" -vs \
+          "--junitxml=${XML_LOG_DIR}/${TEST_NAME}_gpu_${i}_${BACKEND}.xml" \
+          "${TE_PATH}/examples/jax/collective_gemm/${TEST_CASE}" \
+          "${test_args[@]}" 2>&1 | tee "$LOG_FILE" &
+        PID=$!
+        PIDS+=($PID)
+      else
+        # For other processes: redirect to log files only
+        pytest -s -c "${TE_PATH}/tests/jax/pytest.ini" -vs \
+          "${TE_PATH}/examples/jax/collective_gemm/${TEST_CASE}" \
+          "${test_args[@]}" > "$LOG_FILE" 2>&1 &
+        PID=$!
+        PIDS+=($PID)
+      fi
+    done
+
+    # Wait for all processes to finish
+    wait
+
+    # Derive the verdict from the pytest status keywords in the log of process 0
+    if grep -q "SKIPPED" "${TEST_NAME}_gpu_0_${BACKEND}.log"; then
+      echo "... $TEST_CASE SKIPPED"
+    elif grep -q "FAILED" "${TEST_NAME}_gpu_0_${BACKEND}.log"; then
+      echo "... $TEST_CASE FAILED"
+      HAS_FAILURE=1
+    elif grep -q "PASSED" "${TEST_NAME}_gpu_0_${BACKEND}.log"; then
+      echo "... $TEST_CASE PASSED"
     else
-      # For other processes: redirect to log files only
-      pytest -s -c "$TE_PATH/tests/jax/pytest.ini" \
-        -vs "$TE_PATH/examples/jax/collective_gemm/$TEST_CASE" \
-        --num-processes=$NUM_GPUS \
-        --process-id=$i > "$LOG_FILE" 2>&1 &
-      PID=$!
-      PIDS+=($PID)
+      echo "... $TEST_CASE INVALID"
+      HAS_FAILURE=1
     fi
-  done
 
-  # Wait for all processes to finish
-  wait
-
-  # Check and print the log content from process 0
-  if grep -q "SKIPPED" "${TEST_NAME}_gpu_0.log"; then
-    echo "... $TEST_CASE SKIPPED"
-  elif grep -q "FAILED" "${TEST_NAME}_gpu_0.log"; then
-    echo "... $TEST_CASE FAILED"
-    HAS_FAILURE=1
-  elif grep -q "PASSED" "${TEST_NAME}_gpu_0.log"; then
-    echo "... $TEST_CASE PASSED"
-  else
-    echo "... $TEST_CASE INVALID"
-    HAS_FAILURE=1
-  fi
-
-  # Remove the log files after processing them
-  wait
-  rm ${TEST_NAME}_gpu_*.log
+    # Remove this backend's log files (all jobs were already reaped by the wait above)
+    rm "${TEST_NAME}"_gpu_*_"${BACKEND}".log
+  done
 done
 
 wait

diff --git a/examples/jax/collective_gemm/test_dense_grad.py b/examples/jax/collective_gemm/test_dense_grad.py
@@ -95,11 +95,14 @@ def run_dense_grad_tests(args, mesh=None):
     # Create test data
     rng = jax.random.PRNGKey(0)
     rng, x_rng, weight_rng, bias_rng = jax.random.split(rng, 4)
-    x = jax.random.normal(
+    std = jnp.asarray(args.std, dtype=jnp.bfloat16)
+    x = std * jax.random.normal(
         x_rng, (args.batch_size, args.seq_len, args.hidden_in), dtype=jnp.bfloat16
     )
-    weight = jax.random.normal(weight_rng, (args.hidden_in, args.hidden_out), dtype=jnp.bfloat16)
-    bias = jax.random.normal(bias_rng, (args.hidden_out,), dtype=jnp.bfloat16)
+    weight = std * jax.random.normal(
+        weight_rng, (args.hidden_in, args.hidden_out), dtype=jnp.bfloat16
+    )
+    bias = std * jax.random.normal(bias_rng, (args.hidden_out,), dtype=jnp.bfloat16)
 
     collective_op = (
         CollectiveOp.ALL_GATHER
@@ -183,6 +186,7 @@ def setUp(self):
         self.args.process_id = self.process_id
         self.args.local_device_ids = self.local_device_ids
         self.args.num_devices_per_process = self.num_devices_per_process
+        self.args.use_cublasmp = self.use_cublasmp
         self.args.enable_data_parallel = True
         self.args.tensor_parallel_size = _get_dp_and_tp_sizes(self.args)[1]
         _initialize_distributed(self.args)