-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.py
More file actions
102 lines (89 loc) · 3.03 KB
/
Copy pathsetup.py
File metadata and controls
102 lines (89 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import re
import pybind11
import torch
from setuptools import Extension, find_packages, setup
from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension
# ``-I`` compiler flags for the pybind11 headers: the default include
# directory first, then the one reported for ``user=True``.
pybind_includes = [
    f"-I{include_dir}"
    for include_dir in (pybind11.get_include(), pybind11.get_include(user=True))
]
def _cuda_is_available() -> bool:
    """Decide whether the CUDA extension should be built.

    The decision uses build-time information only and never calls
    ``torch.cuda.is_available()``:

    * ``SPOPS_FORCE_CPU_ONLY=1`` in the environment always disables the
      CUDA build.
    * Otherwise a CUDA toolkit must have been located (``CUDA_HOME`` is not
      ``None``) and the installed torch must itself be built with CUDA.

    Returns:
        ``True`` when the CUDA extension can be built, ``False`` otherwise.
    """
    cpu_only_requested = os.environ.get("SPOPS_FORCE_CPU_ONLY", "0") == "1"
    if cpu_only_requested:
        return False
    if CUDA_HOME is None:
        # No toolkit location known, so there is nothing to compile .cu files with.
        return False
    return torch.backends.cuda.is_built()
def _cuda_gencode_flags():
    """Translate ``TORCH_CUDA_ARCH_LIST`` into nvcc ``-gencode`` flags.

    The variable holds compute capabilities such as ``"8.0"`` separated by
    semicolons, commas and/or whitespace. Every entry yields one flag that
    emits native code (``code=sm_XY``); an entry carrying a ``+PTX`` suffix
    additionally yields a flag that embeds PTX (``code=compute_XY``).

    When the variable is unset, empty, or contains nothing but separators,
    the default ``"8.0;8.9;9.0+PTX"`` is used: native code for compute
    capabilities 8.0, 8.9 and 9.0, plus PTX for 9.0.

    Returns:
        A list of ``-gencode=...`` strings, in the order the architectures
        were listed.
    """
    default_arch_list = "8.0;8.9;9.0+PTX"
    ptx_suffix = "+PTX"
    separators = r"[;\s,]+"

    def split_entries(raw):
        # Drop the empty strings that leading/trailing/repeated separators produce.
        return [item for item in re.split(separators, raw) if item]

    entries = split_entries(os.environ.get("TORCH_CUDA_ARCH_LIST", ""))
    if not entries:
        # An exported-but-empty variable is treated like an unset one;
        # otherwise the build would silently lose every intended target.
        entries = split_entries(default_arch_list)

    flags = []
    for entry in entries:
        wants_ptx = entry.endswith(ptx_suffix)
        version = entry[: -len(ptx_suffix)] if wants_ptx else entry
        # "8.0" -> "80": nvcc names architectures without the dot.
        arch = version.replace(".", "")
        flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
        if wants_ptx:
            flags.append(f"-gencode=arch=compute_{arch},code=compute_{arch}")
    return flags
# Compiler flags for the CPU backend, assembled in three steps so that the
# pybind11 include flags keep their position in the middle of the list.
# NOTE(review): "-march=native" targets the build machine's CPU and "-shared"
# is normally a link-time flag -- confirm both are intended for shipped wheels.
_cpu_compile_args = ["-O3", "-Wall", "-shared", "-std=c++11", "-fPIC"]
_cpu_compile_args.extend(pybind_includes)
_cpu_compile_args.extend(["-march=native", "-fopenmp", "-ffast-math"])

# CPU backend: a plain setuptools extension built from a single C++ source
# and linked against libgomp.
_cpu_extension = Extension(
    name="spops_backend_cpu",
    sources=["./spops/spops_backend_cpu.cpp"],
    extra_compile_args=_cpu_compile_args,
    extra_link_args=["-lgomp"],
)

# The CPU backend is always part of the build.
ext_modules = [_cpu_extension]
if not _cuda_is_available():
    # No usable CUDA toolchain: keep the CPU extension only and say so.
    print(
        "spops: CUDA toolkit not detected (CUDA_HOME unset or torch built without CUDA). "
        "Building CPU-only wheel; GPU kernels will be unavailable at runtime."
    )
else:
    _gencode = _cuda_gencode_flags()

    # One C++ binding file plus the three CUDA kernel translation units.
    _cuda_sources = [
        "./spops/spops_backend.cpp",
        "./spops/lib/sputnik_spops_kernels.cu",
        "./spops/lib/structure_aware_spops_kernels.cu",
        "./spops/lib/shuffler_spops_kernels.cu",
    ]

    # Per-compiler flag lists; the same -gencode flags lead both CUDA entries.
    # NOTE(review): PyTorch's CUDAExtension docs name the device-link key
    # "nvcc_dlink"; confirm that "nvcclink" is actually consumed by the build.
    _cuda_compile_args = {
        "cxx": [],
        "nvcc": _gencode + ["-lcublas", "-lcudart", "-O3"],
        "nvcclink": _gencode + ["--device-link"],
    }

    # NOTE(review): "dlink_lib" looks like the placeholder name used in the
    # PyTorch documentation example -- confirm such a library really exists.
    _cuda_extension = CUDAExtension(
        "spops_backend",
        _cuda_sources,
        dlink=True,
        dlink_libraries=["dlink_lib"],
        extra_compile_args=_cuda_compile_args,
        libraries=["cusparse", "cublas"],
    )

    # The GPU backend is placed ahead of the CPU backend in the build list.
    ext_modules.insert(0, _cuda_extension)
# Every package discovered by find_packages() except the test suite.
_packages = find_packages(exclude=["tests", "tests.*"])

# torch's BuildExtension takes over the ``build_ext`` command for the
# extensions collected in ``ext_modules``.
setup(
    name="spops",
    packages=_packages,
    ext_modules=ext_modules,
    cmdclass=dict(build_ext=BuildExtension),
)