Skip to content

Commit ba7debd

Browse files
committed
Xfail MCDM mempool OOM with older bindings
Keep cuda-core tests using published older cuda-bindings wheels from failing when the shared mempool xfail helper is unavailable.
1 parent d727990 commit ba7debd

2 files changed

Lines changed: 46 additions & 3 deletions

File tree

cuda_bindings/cuda/bindings/_test_helpers/mempool.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
from cuda.bindings import driver, runtime
99

1010

11+
# Keep in sync with the fallback in cuda_core/tests/conftest.py. The cuda_core
12+
# copy is intentionally simpler because it only handles cuda_core CUDAError
13+
# exceptions when this helper is absent from older published bindings.
1114
def is_windows_mcdm_device(device=0):
1215
if sys.platform != "win32":
1316
return False

cuda_core/tests/conftest.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,50 @@
3434
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
3535
except ModuleNotFoundError:
3636
# Older cuda.bindings artifacts (for example 12.9.x backports) do not ship
37-
# this helper yet. In that case, keep the primary failure visible instead of
38-
# xfail-ing the known Windows MCDM mempool setup issue.
37+
# this helper yet. Keep the fallback local so tests against published
38+
# bindings still xfail the known Windows MCDM mempool setup issue.
39+
#
40+
# Keep in sync with cuda_bindings/cuda/bindings/_test_helpers/mempool.py.
41+
# This copy is intentionally simpler because it only handles cuda_core
42+
# CUDAError exceptions when the shared helper is absent.
43+
def _is_windows_mcdm_device(device=0):
    """Return True when *device* is currently using the Windows MCDM driver model.

    ``device`` may be a plain device ordinal or any object exposing a
    ``device_id`` attribute. The function returns False on non-Windows
    platforms, and also when ``cuInit`` or the PCI bus id lookup does not
    report ``CUDA_SUCCESS``. Exceptions raised by NVML are not caught here.
    """
    if sys.platform != "win32":
        return False
    # Imported lazily so that merely defining this fallback never requires NVML.
    import cuda.bindings.nvml as nvml

    ordinal = int(getattr(device, "device_id", device))

    [init_status] = driver.cuInit(0)
    if init_status != driver.CUresult.CUDA_SUCCESS:
        return False

    # 13 is the length argument handed to cuDeviceGetPCIBusId
    # (presumably "dddd:bb:dd.f" plus the terminating NUL -- TODO confirm).
    lookup_status, raw_bus_id = driver.cuDeviceGetPCIBusId(13, ordinal)
    if lookup_status != driver.CUresult.CUDA_SUCCESS:
        return False

    # Keep only the bytes before the first NUL before decoding.
    bus_id_text = raw_bus_id.partition(b"\x00")[0].decode("ascii")

    nvml.init_v2()
    try:
        nvml_device = nvml.device_get_handle_by_pci_bus_id_v2(bus_id_text)
        active_model, _pending_model = nvml.device_get_driver_model_v2(nvml_device)
        uses_mcdm = active_model == nvml.DriverModel.DRIVER_MCDM
    finally:
        # Always balance init_v2(), even if an NVML query raised.
        nvml.shutdown()
    return uses_mcdm
63+
3964
def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
    """Mark the running test as xfail for the known Windows MCDM mempool OOM.

    Args:
        err_or_exc: Error code or exception; only its ``str()`` form is
            inspected, for the substring ``CUDA_ERROR_OUT_OF_MEMORY``.
        api_name: Optional API name used as a prefix in the xfail message.
            A non-string value passed here is treated as ``device``.
        device: Device ordinal (or object) forwarded to the MCDM check.

    Returns:
        None. Returns without xfail-ing when the error is not an OOM, when
        the MCDM check raises, or when the device is not on Windows MCDM.
    """
    # Accept the two-positional form (err_or_exc, device).
    if not (api_name is None or isinstance(api_name, str)):
        api_name, device = None, api_name

    if "CUDA_ERROR_OUT_OF_MEMORY" in str(err_or_exc):
        try:
            on_mcdm = _is_windows_mcdm_device(device)
        except Exception:
            # If MCDM detection fails, leave the primary test failure visible.
            on_mcdm = False

        if on_mcdm:
            prefix = f"{api_name} " if api_name else ""
            pytest.xfail(f"{prefix}could not reserve VA for mempool operations on Windows MCDM")
4181

4282

4383
# Import shared test helpers for tests across subprojects.

0 commit comments

Comments
 (0)