From 7507dd84fcb620511a67365e80033e1268b0cd70 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 10:09:22 -0700
Subject: [PATCH 01/11] run_cybind_cython_gen 13.3.0 ../ctk-next (NO MANUAL
 CHANGES)

---
 .../cuda/bindings/_bindings/cydriver.pxd.in   |   77 +-
 .../cuda/bindings/_bindings/cydriver.pyx.in   |  494 +++-
 .../cuda/bindings/_bindings/cyruntime.pxd.in  |    7 +-
 .../cuda/bindings/_bindings/cyruntime.pyx.in  |   11 +-
 .../bindings/_bindings/cyruntime_ptds.pxd.in  |    7 +-
 .../bindings/_bindings/cyruntime_ptds.pyx.in  |    8 +-
 .../cuda/bindings/_internal/nvrtc.pxd         |    8 +-
 .../cuda/bindings/_internal/nvrtc_linux.pyx   |   59 +-
 .../cuda/bindings/_internal/nvrtc_windows.pyx |   47 +-
 cuda_bindings/cuda/bindings/cydriver.pxd.in   |  155 +-
 cuda_bindings/cuda/bindings/cydriver.pyx.in   |   92 +-
 cuda_bindings/cuda/bindings/cynvrtc.pxd       |   25 +-
 cuda_bindings/cuda/bindings/cynvrtc.pyx       |   11 +-
 cuda_bindings/cuda/bindings/cyruntime.pxd.in  |   15 +-
 cuda_bindings/cuda/bindings/cyruntime.pyx.in  |    8 +-
 .../cuda/bindings/cyruntime_functions.pxi.in  |    7 +-
 .../cuda/bindings/cyruntime_types.pxi.in      |   87 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |  241 +-
 cuda_bindings/cuda/bindings/driver.pyx.in     | 2017 +++++++++++++++--
 cuda_bindings/cuda/bindings/nvrtc.pxd         |   74 +-
 cuda_bindings/cuda/bindings/nvrtc.pyx         |  355 ++-
 cuda_bindings/cuda/bindings/runtime.pxd.in    |  121 +-
 cuda_bindings/cuda/bindings/runtime.pyx.in    |  563 ++++-
 cuda_bindings/docs/source/module/driver.rst   |  147 +-
 cuda_bindings/docs/source/module/nvrtc.rst    |   99 +-
 cuda_bindings/docs/source/module/runtime.rst  |  226 +-
 26 files changed, 4438 insertions(+), 523 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
index 0b19de67d0a..1ae95c10023 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 from cuda.bindings.cydriver cimport *
 
 {{if 'cuGetErrorString' in found_functions}}
@@ -1024,6 +1024,66 @@ cdef CUresult _cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice
 cdef CUresult _cuMulticastGetGranularity(size_t* granularity, const CUmulticastObjectProp* prop, CUmulticastGranularity_flags option) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
+{{if 'cuLogicalEndpointIdReserve' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointIdReserve(CUlogicalEndpointId* baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointIdRelease' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointIdRelease(CUlogicalEndpointId baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointCreate' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointCreate(CUlogicalEndpointId leId, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointAddDevice' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointAddDevice(CUlogicalEndpointId leId, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointDestroy' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointDestroy(CUlogicalEndpointId leId) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointBindAddr' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointBindAddr(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, void* ptr, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointBindMem' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointBindMem(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, CUmemGenericAllocationHandle memHandle, cuuint64_t memOffset, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointUnbind' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointUnbind(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, cuuint64_t size) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointExport' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointExport(void* handle, CUlogicalEndpointId leId, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointImport' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointImport(CUlogicalEndpointId leId, const void* handle, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointGetLimits' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointGetLimits(cuuint64_t* bindAlignment, cuuint64_t* maxSize, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointQuery' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointQuery(CUlogicalEndpointId leId, cuuint32_t count, int* queryStatus) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
 {{if 'cuPointerGetAttribute' in found_functions}}
 
 cdef CUresult _cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -1139,6 +1199,11 @@ cdef CUresult _cuStreamAddCallback(CUstream hStream, CUstreamCallback callback,
 cdef CUresult _cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
+{{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef CUresult _cuStreamBeginRecaptureToGraph(CUstream hStream, CUstreamCaptureMode mode, CUgraph hGraph, CUgraphRecaptureCallback callbackFunc, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
 
 cdef CUresult _cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUstreamCaptureMode mode) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -1364,11 +1429,6 @@ cdef CUresult _cuLaunchKernelEx(const CUlaunchConfig* config, CUfunction f, void
 cdef CUresult _cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
-{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-
-cdef CUresult _cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil
-{{endif}}
-
 {{if 'cuLaunchHostFunc' in found_functions}}
 
 cdef CUresult _cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -1424,6 +1484,11 @@ cdef CUresult _cuLaunchGrid(CUfunction f, int grid_width, int grid_height) excep
 cdef CUresult _cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
+{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+
+cdef CUresult _cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
 {{if 'cuParamSetTexRef' in found_functions}}
 
 cdef CUresult _cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) except ?CUDA_ERROR_NOT_FOUND nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
index bdf535c70fb..6508500c21a 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 {{if 'Windows' == platform.system()}}
 import os
 cimport cuda.bindings._lib.windll as windll
@@ -220,6 +220,18 @@ cdef bint __cuPythonInit = False
 {{if 'cuMulticastBindAddr_v2' in found_functions}}cdef void *__cuMulticastBindAddr_v2 = NULL{{endif}}
 {{if 'cuMulticastUnbind' in found_functions}}cdef void *__cuMulticastUnbind = NULL{{endif}}
 {{if 'cuMulticastGetGranularity' in found_functions}}cdef void *__cuMulticastGetGranularity = NULL{{endif}}
+{{if 'cuLogicalEndpointIdReserve' in found_functions}}cdef void *__cuLogicalEndpointIdReserve = NULL{{endif}}
+{{if 'cuLogicalEndpointIdRelease' in found_functions}}cdef void *__cuLogicalEndpointIdRelease = NULL{{endif}}
+{{if 'cuLogicalEndpointCreate' in found_functions}}cdef void *__cuLogicalEndpointCreate = NULL{{endif}}
+{{if 'cuLogicalEndpointAddDevice' in found_functions}}cdef void *__cuLogicalEndpointAddDevice = NULL{{endif}}
+{{if 'cuLogicalEndpointDestroy' in found_functions}}cdef void *__cuLogicalEndpointDestroy = NULL{{endif}}
+{{if 'cuLogicalEndpointBindAddr' in found_functions}}cdef void *__cuLogicalEndpointBindAddr = NULL{{endif}}
+{{if 'cuLogicalEndpointBindMem' in found_functions}}cdef void *__cuLogicalEndpointBindMem = NULL{{endif}}
+{{if 'cuLogicalEndpointUnbind' in found_functions}}cdef void *__cuLogicalEndpointUnbind = NULL{{endif}}
+{{if 'cuLogicalEndpointExport' in found_functions}}cdef void *__cuLogicalEndpointExport = NULL{{endif}}
+{{if 'cuLogicalEndpointImport' in found_functions}}cdef void *__cuLogicalEndpointImport = NULL{{endif}}
+{{if 'cuLogicalEndpointGetLimits' in found_functions}}cdef void *__cuLogicalEndpointGetLimits = NULL{{endif}}
+{{if 'cuLogicalEndpointQuery' in found_functions}}cdef void *__cuLogicalEndpointQuery = NULL{{endif}}
 {{if 'cuPointerGetAttribute' in found_functions}}cdef void *__cuPointerGetAttribute = NULL{{endif}}
 {{if 'cuMemPrefetchAsync_v2' in found_functions}}cdef void *__cuMemPrefetchAsync_v2 = NULL{{endif}}
 {{if 'cuMemAdvise_v2' in found_functions}}cdef void *__cuMemAdvise_v2 = NULL{{endif}}
@@ -243,6 +255,7 @@ cdef bint __cuPythonInit = False
 {{if 'cuStreamWaitEvent' in found_functions}}cdef void *__cuStreamWaitEvent = NULL{{endif}}
 {{if 'cuStreamAddCallback' in found_functions}}cdef void *__cuStreamAddCallback = NULL{{endif}}
 {{if 'cuStreamBeginCapture_v2' in found_functions}}cdef void *__cuStreamBeginCapture_v2 = NULL{{endif}}
+{{if 'cuStreamBeginRecaptureToGraph' in found_functions}}cdef void *__cuStreamBeginRecaptureToGraph = NULL{{endif}}
 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}cdef void *__cuStreamBeginCaptureToGraph = NULL{{endif}}
 {{if 'cuThreadExchangeStreamCaptureMode' in found_functions}}cdef void *__cuThreadExchangeStreamCaptureMode = NULL{{endif}}
 {{if 'cuStreamEndCapture' in found_functions}}cdef void *__cuStreamEndCapture = NULL{{endif}}
@@ -288,7 +301,6 @@ cdef bint __cuPythonInit = False
 {{if 'cuLaunchKernel' in found_functions}}cdef void *__cuLaunchKernel = NULL{{endif}}
 {{if 'cuLaunchKernelEx' in found_functions}}cdef void *__cuLaunchKernelEx = NULL{{endif}}
 {{if 'cuLaunchCooperativeKernel' in found_functions}}cdef void *__cuLaunchCooperativeKernel = NULL{{endif}}
-{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}cdef void *__cuLaunchCooperativeKernelMultiDevice = NULL{{endif}}
 {{if 'cuLaunchHostFunc' in found_functions}}cdef void *__cuLaunchHostFunc = NULL{{endif}}
 {{if 'cuLaunchHostFunc_v2' in found_functions}}cdef void *__cuLaunchHostFunc_v2 = NULL{{endif}}
 {{if 'cuFuncSetBlockShape' in found_functions}}cdef void *__cuFuncSetBlockShape = NULL{{endif}}
@@ -300,6 +312,7 @@ cdef bint __cuPythonInit = False
 {{if 'cuLaunch' in found_functions}}cdef void *__cuLaunch = NULL{{endif}}
 {{if 'cuLaunchGrid' in found_functions}}cdef void *__cuLaunchGrid = NULL{{endif}}
 {{if 'cuLaunchGridAsync' in found_functions}}cdef void *__cuLaunchGridAsync = NULL{{endif}}
+{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}cdef void *__cuLaunchCooperativeKernelMultiDevice = NULL{{endif}}
 {{if 'cuParamSetTexRef' in found_functions}}cdef void *__cuParamSetTexRef = NULL{{endif}}
 {{if 'cuFuncSetSharedMemConfig' in found_functions}}cdef void *__cuFuncSetSharedMemConfig = NULL{{endif}}
 {{if 'cuGraphCreate' in found_functions}}cdef void *__cuGraphCreate = NULL{{endif}}
@@ -797,6 +810,10 @@ cdef int _cuPythonInit() except -1 nogil:
                 global __cuStreamBeginCapture_v2
                 _F_cuGetProcAddress_v2('cuStreamBeginCapture', &__cuStreamBeginCapture_v2, 10010, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL)
                 {{endif}}
+                {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+                global __cuStreamBeginRecaptureToGraph
+                _F_cuGetProcAddress_v2('cuStreamBeginRecaptureToGraph', &__cuStreamBeginRecaptureToGraph, 13030, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL)
+                {{endif}}
                 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
                 global __cuStreamBeginCaptureToGraph
                 _F_cuGetProcAddress_v2('cuStreamBeginCaptureToGraph', &__cuStreamBeginCaptureToGraph, 12030, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL)
@@ -1164,6 +1181,10 @@ cdef int _cuPythonInit() except -1 nogil:
                 global __cuStreamBeginCapture_v2
                 _F_cuGetProcAddress_v2('cuStreamBeginCapture', &__cuStreamBeginCapture_v2, 10010, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
                 {{endif}}
+                {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+                global __cuStreamBeginRecaptureToGraph
+                _F_cuGetProcAddress_v2('cuStreamBeginRecaptureToGraph', &__cuStreamBeginRecaptureToGraph, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+                {{endif}}
                 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
                 global __cuStreamBeginCaptureToGraph
                 _F_cuGetProcAddress_v2('cuStreamBeginCaptureToGraph', &__cuStreamBeginCaptureToGraph, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
@@ -1925,6 +1946,54 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuMulticastGetGranularity
             _F_cuGetProcAddress_v2('cuMulticastGetGranularity', &__cuMulticastGetGranularity, 12010, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
             {{endif}}
+            {{if 'cuLogicalEndpointIdReserve' in found_functions}}
+            global __cuLogicalEndpointIdReserve
+            _F_cuGetProcAddress_v2('cuLogicalEndpointIdReserve', &__cuLogicalEndpointIdReserve, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointIdRelease' in found_functions}}
+            global __cuLogicalEndpointIdRelease
+            _F_cuGetProcAddress_v2('cuLogicalEndpointIdRelease', &__cuLogicalEndpointIdRelease, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointCreate' in found_functions}}
+            global __cuLogicalEndpointCreate
+            _F_cuGetProcAddress_v2('cuLogicalEndpointCreate', &__cuLogicalEndpointCreate, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointAddDevice' in found_functions}}
+            global __cuLogicalEndpointAddDevice
+            _F_cuGetProcAddress_v2('cuLogicalEndpointAddDevice', &__cuLogicalEndpointAddDevice, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointDestroy' in found_functions}}
+            global __cuLogicalEndpointDestroy
+            _F_cuGetProcAddress_v2('cuLogicalEndpointDestroy', &__cuLogicalEndpointDestroy, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointBindAddr' in found_functions}}
+            global __cuLogicalEndpointBindAddr
+            _F_cuGetProcAddress_v2('cuLogicalEndpointBindAddr', &__cuLogicalEndpointBindAddr, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointBindMem' in found_functions}}
+            global __cuLogicalEndpointBindMem
+            _F_cuGetProcAddress_v2('cuLogicalEndpointBindMem', &__cuLogicalEndpointBindMem, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointUnbind' in found_functions}}
+            global __cuLogicalEndpointUnbind
+            _F_cuGetProcAddress_v2('cuLogicalEndpointUnbind', &__cuLogicalEndpointUnbind, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointExport' in found_functions}}
+            global __cuLogicalEndpointExport
+            _F_cuGetProcAddress_v2('cuLogicalEndpointExport', &__cuLogicalEndpointExport, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointImport' in found_functions}}
+            global __cuLogicalEndpointImport
+            _F_cuGetProcAddress_v2('cuLogicalEndpointImport', &__cuLogicalEndpointImport, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointGetLimits' in found_functions}}
+            global __cuLogicalEndpointGetLimits
+            _F_cuGetProcAddress_v2('cuLogicalEndpointGetLimits', &__cuLogicalEndpointGetLimits, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
+            {{if 'cuLogicalEndpointQuery' in found_functions}}
+            global __cuLogicalEndpointQuery
+            _F_cuGetProcAddress_v2('cuLogicalEndpointQuery', &__cuLogicalEndpointQuery, 13030, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
             {{if 'cuPointerGetAttribute' in found_functions}}
             global __cuPointerGetAttribute
             _F_cuGetProcAddress_v2('cuPointerGetAttribute', &__cuPointerGetAttribute, 4000, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
@@ -2045,10 +2114,6 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuFuncLoad
             _F_cuGetProcAddress_v2('cuFuncLoad', &__cuFuncLoad, 12040, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
             {{endif}}
-            {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-            global __cuLaunchCooperativeKernelMultiDevice
-            _F_cuGetProcAddress_v2('cuLaunchCooperativeKernelMultiDevice', &__cuLaunchCooperativeKernelMultiDevice, 9000, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
-            {{endif}}
             {{if 'cuFuncSetBlockShape' in found_functions}}
             global __cuFuncSetBlockShape
             _F_cuGetProcAddress_v2('cuFuncSetBlockShape', &__cuFuncSetBlockShape, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
@@ -2085,6 +2150,10 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuLaunchGridAsync
             _F_cuGetProcAddress_v2('cuLaunchGridAsync', &__cuLaunchGridAsync, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
             {{endif}}
+            {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+            global __cuLaunchCooperativeKernelMultiDevice
+            _F_cuGetProcAddress_v2('cuLaunchCooperativeKernelMultiDevice', &__cuLaunchCooperativeKernelMultiDevice, 9000, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
+            {{endif}}
             {{if 'cuParamSetTexRef' in found_functions}}
             global __cuParamSetTexRef
             _F_cuGetProcAddress_v2('cuParamSetTexRef', &__cuParamSetTexRef, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL)
@@ -3137,6 +3206,10 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuStreamBeginCapture_v2
             __cuStreamBeginCapture_v2 = windll.GetProcAddress(handle, 'cuStreamBeginCapture_v2_ptsz')
             {{endif}}
+            {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+            global __cuStreamBeginRecaptureToGraph
+            __cuStreamBeginRecaptureToGraph = windll.GetProcAddress(handle, 'cuStreamBeginRecaptureToGraph_ptsz')
+            {{endif}}
             {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
             global __cuStreamBeginCaptureToGraph
             __cuStreamBeginCaptureToGraph = windll.GetProcAddress(handle, 'cuStreamBeginCaptureToGraph_ptsz')
@@ -3504,6 +3577,10 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuStreamBeginCapture_v2
             __cuStreamBeginCapture_v2 = windll.GetProcAddress(handle, 'cuStreamBeginCapture_v2')
             {{endif}}
+            {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+            global __cuStreamBeginRecaptureToGraph
+            __cuStreamBeginRecaptureToGraph = windll.GetProcAddress(handle, 'cuStreamBeginRecaptureToGraph')
+            {{endif}}
             {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
             global __cuStreamBeginCaptureToGraph
             __cuStreamBeginCaptureToGraph = windll.GetProcAddress(handle, 'cuStreamBeginCaptureToGraph')
@@ -4265,6 +4342,54 @@ cdef int _cuPythonInit() except -1 nogil:
         global __cuMulticastGetGranularity
         __cuMulticastGetGranularity = windll.GetProcAddress(handle, 'cuMulticastGetGranularity')
         {{endif}}
+        {{if 'cuLogicalEndpointIdReserve' in found_functions}}
+        global __cuLogicalEndpointIdReserve
+        __cuLogicalEndpointIdReserve = windll.GetProcAddress(handle, 'cuLogicalEndpointIdReserve')
+        {{endif}}
+        {{if 'cuLogicalEndpointIdRelease' in found_functions}}
+        global __cuLogicalEndpointIdRelease
+        __cuLogicalEndpointIdRelease = windll.GetProcAddress(handle, 'cuLogicalEndpointIdRelease')
+        {{endif}}
+        {{if 'cuLogicalEndpointCreate' in found_functions}}
+        global __cuLogicalEndpointCreate
+        __cuLogicalEndpointCreate = windll.GetProcAddress(handle, 'cuLogicalEndpointCreate')
+        {{endif}}
+        {{if 'cuLogicalEndpointAddDevice' in found_functions}}
+        global __cuLogicalEndpointAddDevice
+        __cuLogicalEndpointAddDevice = windll.GetProcAddress(handle, 'cuLogicalEndpointAddDevice')
+        {{endif}}
+        {{if 'cuLogicalEndpointDestroy' in found_functions}}
+        global __cuLogicalEndpointDestroy
+        __cuLogicalEndpointDestroy = windll.GetProcAddress(handle, 'cuLogicalEndpointDestroy')
+        {{endif}}
+        {{if 'cuLogicalEndpointBindAddr' in found_functions}}
+        global __cuLogicalEndpointBindAddr
+        __cuLogicalEndpointBindAddr = windll.GetProcAddress(handle, 'cuLogicalEndpointBindAddr')
+        {{endif}}
+        {{if 'cuLogicalEndpointBindMem' in found_functions}}
+        global __cuLogicalEndpointBindMem
+        __cuLogicalEndpointBindMem = windll.GetProcAddress(handle, 'cuLogicalEndpointBindMem')
+        {{endif}}
+        {{if 'cuLogicalEndpointUnbind' in found_functions}}
+        global __cuLogicalEndpointUnbind
+        __cuLogicalEndpointUnbind = windll.GetProcAddress(handle, 'cuLogicalEndpointUnbind')
+        {{endif}}
+        {{if 'cuLogicalEndpointExport' in found_functions}}
+        global __cuLogicalEndpointExport
+        __cuLogicalEndpointExport = windll.GetProcAddress(handle, 'cuLogicalEndpointExport')
+        {{endif}}
+        {{if 'cuLogicalEndpointImport' in found_functions}}
+        global __cuLogicalEndpointImport
+        __cuLogicalEndpointImport = windll.GetProcAddress(handle, 'cuLogicalEndpointImport')
+        {{endif}}
+        {{if 'cuLogicalEndpointGetLimits' in found_functions}}
+        global __cuLogicalEndpointGetLimits
+        __cuLogicalEndpointGetLimits = windll.GetProcAddress(handle, 'cuLogicalEndpointGetLimits')
+        {{endif}}
+        {{if 'cuLogicalEndpointQuery' in found_functions}}
+        global __cuLogicalEndpointQuery
+        __cuLogicalEndpointQuery = windll.GetProcAddress(handle, 'cuLogicalEndpointQuery')
+        {{endif}}
         {{if 'cuPointerGetAttribute' in found_functions}}
         global __cuPointerGetAttribute
         __cuPointerGetAttribute = windll.GetProcAddress(handle, 'cuPointerGetAttribute')
@@ -4385,10 +4510,6 @@ cdef int _cuPythonInit() except -1 nogil:
         global __cuFuncLoad
         __cuFuncLoad = windll.GetProcAddress(handle, 'cuFuncLoad')
         {{endif}}
-        {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-        global __cuLaunchCooperativeKernelMultiDevice
-        __cuLaunchCooperativeKernelMultiDevice = windll.GetProcAddress(handle, 'cuLaunchCooperativeKernelMultiDevice')
-        {{endif}}
         {{if 'cuFuncSetBlockShape' in found_functions}}
         global __cuFuncSetBlockShape
         __cuFuncSetBlockShape = windll.GetProcAddress(handle, 'cuFuncSetBlockShape')
@@ -4425,6 +4546,10 @@ cdef int _cuPythonInit() except -1 nogil:
         global __cuLaunchGridAsync
         __cuLaunchGridAsync = windll.GetProcAddress(handle, 'cuLaunchGridAsync')
         {{endif}}
+        {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+        global __cuLaunchCooperativeKernelMultiDevice
+        __cuLaunchCooperativeKernelMultiDevice = windll.GetProcAddress(handle, 'cuLaunchCooperativeKernelMultiDevice')
+        {{endif}}
         {{if 'cuParamSetTexRef' in found_functions}}
         global __cuParamSetTexRef
         __cuParamSetTexRef = windll.GetProcAddress(handle, 'cuParamSetTexRef')
@@ -5474,6 +5599,10 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuStreamBeginCapture_v2
             __cuStreamBeginCapture_v2 = dlfcn.dlsym(handle, 'cuStreamBeginCapture_v2_ptsz')
             {{endif}}
+            {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+            global __cuStreamBeginRecaptureToGraph
+            __cuStreamBeginRecaptureToGraph = dlfcn.dlsym(handle, 'cuStreamBeginRecaptureToGraph_ptsz')
+            {{endif}}
             {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
             global __cuStreamBeginCaptureToGraph
             __cuStreamBeginCaptureToGraph = dlfcn.dlsym(handle, 'cuStreamBeginCaptureToGraph_ptsz')
@@ -5841,6 +5970,10 @@ cdef int _cuPythonInit() except -1 nogil:
             global __cuStreamBeginCapture_v2
             __cuStreamBeginCapture_v2 = dlfcn.dlsym(handle, 'cuStreamBeginCapture_v2')
             {{endif}}
+            {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+            global __cuStreamBeginRecaptureToGraph
+            __cuStreamBeginRecaptureToGraph = dlfcn.dlsym(handle, 'cuStreamBeginRecaptureToGraph')
+            {{endif}}
             {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
             global __cuStreamBeginCaptureToGraph
             __cuStreamBeginCaptureToGraph = dlfcn.dlsym(handle, 'cuStreamBeginCaptureToGraph')
@@ -6602,6 +6735,54 @@ cdef int _cuPythonInit() except -1 nogil:
         global __cuMulticastGetGranularity
         __cuMulticastGetGranularity = dlfcn.dlsym(handle, 'cuMulticastGetGranularity')
         {{endif}}
+        {{if 'cuLogicalEndpointIdReserve' in found_functions}}
+        global __cuLogicalEndpointIdReserve
+        __cuLogicalEndpointIdReserve = dlfcn.dlsym(handle, 'cuLogicalEndpointIdReserve')
+        {{endif}}
+        {{if 'cuLogicalEndpointIdRelease' in found_functions}}
+        global __cuLogicalEndpointIdRelease
+        __cuLogicalEndpointIdRelease = dlfcn.dlsym(handle, 'cuLogicalEndpointIdRelease')
+        {{endif}}
+        {{if 'cuLogicalEndpointCreate' in found_functions}}
+        global __cuLogicalEndpointCreate
+        __cuLogicalEndpointCreate = dlfcn.dlsym(handle, 'cuLogicalEndpointCreate')
+        {{endif}}
+        {{if 'cuLogicalEndpointAddDevice' in found_functions}}
+        global __cuLogicalEndpointAddDevice
+        __cuLogicalEndpointAddDevice = dlfcn.dlsym(handle, 'cuLogicalEndpointAddDevice')
+        {{endif}}
+        {{if 'cuLogicalEndpointDestroy' in found_functions}}
+        global __cuLogicalEndpointDestroy
+        __cuLogicalEndpointDestroy = dlfcn.dlsym(handle, 'cuLogicalEndpointDestroy')
+        {{endif}}
+        {{if 'cuLogicalEndpointBindAddr' in found_functions}}
+        global __cuLogicalEndpointBindAddr
+        __cuLogicalEndpointBindAddr = dlfcn.dlsym(handle, 'cuLogicalEndpointBindAddr')
+        {{endif}}
+        {{if 'cuLogicalEndpointBindMem' in found_functions}}
+        global __cuLogicalEndpointBindMem
+        __cuLogicalEndpointBindMem = dlfcn.dlsym(handle, 'cuLogicalEndpointBindMem')
+        {{endif}}
+        {{if 'cuLogicalEndpointUnbind' in found_functions}}
+        global __cuLogicalEndpointUnbind
+        __cuLogicalEndpointUnbind = dlfcn.dlsym(handle, 'cuLogicalEndpointUnbind')
+        {{endif}}
+        {{if 'cuLogicalEndpointExport' in found_functions}}
+        global __cuLogicalEndpointExport
+        __cuLogicalEndpointExport = dlfcn.dlsym(handle, 'cuLogicalEndpointExport')
+        {{endif}}
+        {{if 'cuLogicalEndpointImport' in found_functions}}
+        global __cuLogicalEndpointImport
+        __cuLogicalEndpointImport = dlfcn.dlsym(handle, 'cuLogicalEndpointImport')
+        {{endif}}
+        {{if 'cuLogicalEndpointGetLimits' in found_functions}}
+        global __cuLogicalEndpointGetLimits
+        __cuLogicalEndpointGetLimits = dlfcn.dlsym(handle, 'cuLogicalEndpointGetLimits')
+        {{endif}}
+        {{if 'cuLogicalEndpointQuery' in found_functions}}
+        global __cuLogicalEndpointQuery
+        __cuLogicalEndpointQuery = dlfcn.dlsym(handle, 'cuLogicalEndpointQuery')
+        {{endif}}
         {{if 'cuPointerGetAttribute' in found_functions}}
         global __cuPointerGetAttribute
         __cuPointerGetAttribute = dlfcn.dlsym(handle, 'cuPointerGetAttribute')
@@ -6722,10 +6903,6 @@ cdef int _cuPythonInit() except -1 nogil:
         global __cuFuncLoad
         __cuFuncLoad = dlfcn.dlsym(handle, 'cuFuncLoad')
         {{endif}}
-        {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-        global __cuLaunchCooperativeKernelMultiDevice
-        __cuLaunchCooperativeKernelMultiDevice = dlfcn.dlsym(handle, 'cuLaunchCooperativeKernelMultiDevice')
-        {{endif}}
         {{if 'cuFuncSetBlockShape' in found_functions}}
         global __cuFuncSetBlockShape
         __cuFuncSetBlockShape = dlfcn.dlsym(handle, 'cuFuncSetBlockShape')
@@ -6762,6 +6939,10 @@ cdef int _cuPythonInit() except -1 nogil:
         global __cuLaunchGridAsync
         __cuLaunchGridAsync = dlfcn.dlsym(handle, 'cuLaunchGridAsync')
         {{endif}}
+        {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+        global __cuLaunchCooperativeKernelMultiDevice
+        __cuLaunchCooperativeKernelMultiDevice = dlfcn.dlsym(handle, 'cuLaunchCooperativeKernelMultiDevice')
+        {{endif}}
         {{if 'cuParamSetTexRef' in found_functions}}
         global __cuParamSetTexRef
         __cuParamSetTexRef = dlfcn.dlsym(handle, 'cuParamSetTexRef')
@@ -10025,6 +10206,150 @@ cdef CUresult _cuMulticastGetGranularity(size_t* granularity, const CUmulticastO
     return err
 {{endif}}
 
+{{if 'cuLogicalEndpointIdReserve' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointIdReserve(CUlogicalEndpointId* baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointIdReserve function pointer
+    global __cuLogicalEndpointIdReserve  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointIdReserve == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointIdReserve" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId*, cuuint32_t) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointIdReserve)(baseLeId, count)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointIdRelease' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointIdRelease(CUlogicalEndpointId baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointIdRelease function pointer
+    global __cuLogicalEndpointIdRelease  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointIdRelease == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointIdRelease" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, cuuint32_t) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointIdRelease)(baseLeId, count)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointCreate' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointCreate(CUlogicalEndpointId leId, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointCreate function pointer
+    global __cuLogicalEndpointCreate  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointCreate == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointCreate" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, const CUlogicalEndpointProp*) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointCreate)(leId, prop)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointAddDevice' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointAddDevice(CUlogicalEndpointId leId, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointAddDevice function pointer
+    global __cuLogicalEndpointAddDevice  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointAddDevice == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointAddDevice" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, CUdevice) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointAddDevice)(leId, dev)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointDestroy' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointDestroy(CUlogicalEndpointId leId) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointDestroy function pointer
+    global __cuLogicalEndpointDestroy  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointDestroy == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointDestroy" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointDestroy)(leId)  # cast the slot to the typed signature and invoke it with the caller's argument
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointBindAddr' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointBindAddr(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, void* ptr, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointBindAddr function pointer
+    global __cuLogicalEndpointBindAddr  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointBindAddr == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointBindAddr" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, CUdevice, cuuint64_t, void*, cuuint64_t, unsigned long long) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointBindAddr)(leId, dev, offset, ptr, size, flags)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointBindMem' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointBindMem(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, CUmemGenericAllocationHandle memHandle, cuuint64_t memOffset, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointBindMem function pointer
+    global __cuLogicalEndpointBindMem  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointBindMem == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointBindMem" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, CUdevice, cuuint64_t, CUmemGenericAllocationHandle, cuuint64_t, cuuint64_t, unsigned long long) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointBindMem)(leId, dev, offset, memHandle, memOffset, size, flags)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointUnbind' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointUnbind(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, cuuint64_t size) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointUnbind function pointer
+    global __cuLogicalEndpointUnbind  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointUnbind == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointUnbind" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, CUdevice, cuuint64_t, cuuint64_t) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointUnbind)(leId, dev, offset, size)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointExport' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointExport(void* handle, CUlogicalEndpointId leId, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointExport function pointer
+    global __cuLogicalEndpointExport  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointExport == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointExport" not found')
+    err = (<CUresult (*)(void*, CUlogicalEndpointId, CUlogicalEndpointIpcHandleType) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointExport)(handle, leId, handleType)  # `handle` is this wrapper's own void* parameter, passed straight through
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointImport' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointImport(CUlogicalEndpointId leId, const void* handle, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointImport function pointer
+    global __cuLogicalEndpointImport  # module-level pointer slot, assigned elsewhere in this file via dlfcn.dlsym(handle, 'cuLogicalEndpointImport')
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointImport == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointImport" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, const void*, CUlogicalEndpointIpcHandleType) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointImport)(leId, handle, handleType)  # `handle` is this wrapper's own const void* parameter, passed straight through
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointGetLimits' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointGetLimits(cuuint64_t* bindAlignment, cuuint64_t* maxSize, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointGetLimits function pointer
+    global __cuLogicalEndpointGetLimits  # module-level pointer slot, assigned elsewhere in this file via dlfcn.dlsym(handle, 'cuLogicalEndpointGetLimits')
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointGetLimits == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointGetLimits" not found')
+    err = (<CUresult (*)(cuuint64_t*, cuuint64_t*, const CUlogicalEndpointProp*) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointGetLimits)(bindAlignment, maxSize, prop)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
+{{if 'cuLogicalEndpointQuery' in found_functions}}
+
+cdef CUresult _cuLogicalEndpointQuery(CUlogicalEndpointId leId, cuuint32_t count, int* queryStatus) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLogicalEndpointQuery function pointer
+    global __cuLogicalEndpointQuery  # module-level pointer slot, assigned elsewhere in this file via dlfcn.dlsym(handle, 'cuLogicalEndpointQuery')
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLogicalEndpointQuery == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLogicalEndpointQuery" not found')
+    err = (<CUresult (*)(CUlogicalEndpointId, cuuint32_t, int*) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLogicalEndpointQuery)(leId, count, queryStatus)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
 {{if 'cuPointerGetAttribute' in found_functions}}
 
 cdef CUresult _cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -10301,6 +10626,18 @@ cdef CUresult _cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mod
     return err
 {{endif}}
 
+{{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef CUresult _cuStreamBeginRecaptureToGraph(CUstream hStream, CUstreamCaptureMode mode, CUgraph hGraph, CUgraphRecaptureCallback callbackFunc, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuStreamBeginRecaptureToGraph function pointer
+    global __cuStreamBeginRecaptureToGraph  # module-level pointer slot; presumably filled by dlsym during driver init like its siblings -- confirm
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuStreamBeginRecaptureToGraph == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuStreamBeginRecaptureToGraph" not found')
+    err = (<CUresult (*)(CUstream, CUstreamCaptureMode, CUgraph, CUgraphRecaptureCallback, void*) except ?CUDA_ERROR_NOT_FOUND nogil> __cuStreamBeginRecaptureToGraph)(hStream, mode, hGraph, callbackFunc, userData)  # callbackFunc and userData are forwarded untouched
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
 
 cdef CUresult _cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUstreamCaptureMode mode) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -10841,18 +11178,6 @@ cdef CUresult _cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, un
     return err
 {{endif}}
 
-{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-
-cdef CUresult _cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil:
-    global __cuLaunchCooperativeKernelMultiDevice
-    cuPythonInit()
-    if __cuLaunchCooperativeKernelMultiDevice == NULL:
-        with gil:
-            raise RuntimeError('Function "cuLaunchCooperativeKernelMultiDevice" not found')
-    err = (<CUresult (*)(CUDA_LAUNCH_PARAMS*, unsigned int, unsigned int) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLaunchCooperativeKernelMultiDevice)(launchParamsList, numDevices, flags)
-    return err
-{{endif}}
-
 {{if 'cuLaunchHostFunc' in found_functions}}
 
 cdef CUresult _cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -10985,6 +11310,18 @@ cdef CUresult _cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height,
     return err
 {{endif}}
 
+{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+
+cdef CUresult _cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil:  # nogil wrapper: calls through the __cuLaunchCooperativeKernelMultiDevice function pointer
+    global __cuLaunchCooperativeKernelMultiDevice  # module-level pointer slot, assigned in _cuPythonInit via dlfcn.dlsym(handle, 'cuLaunchCooperativeKernelMultiDevice')
+    cuPythonInit()  # NOTE(review): presumably makes sure the pointer slots have been populated -- confirm
+    if __cuLaunchCooperativeKernelMultiDevice == NULL:  # no address stored, so there is nothing to call
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise RuntimeError('Function "cuLaunchCooperativeKernelMultiDevice" not found')
+    err = (<CUresult (*)(CUDA_LAUNCH_PARAMS*, unsigned int, unsigned int) except ?CUDA_ERROR_NOT_FOUND nogil> __cuLaunchCooperativeKernelMultiDevice)(launchParamsList, numDevices, flags)  # cast the slot to the typed signature and invoke it with the caller's arguments
+    return err  # CUresult from the call, returned as-is
+{{endif}}
+
 {{if 'cuParamSetTexRef' in found_functions}}
 
 cdef CUresult _cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -14907,6 +15244,90 @@ cpdef dict _inspect_function_pointers():
     data["__cuMulticastGetGranularity"] = <intptr_t>0
     {{endif}}
 
+    {{if 'cuLogicalEndpointIdReserve' in found_functions}}
+    global __cuLogicalEndpointIdReserve
+    data["__cuLogicalEndpointIdReserve"] = <intptr_t>__cuLogicalEndpointIdReserve
+    {{else}}
+    data["__cuLogicalEndpointIdReserve"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointIdRelease' in found_functions}}
+    global __cuLogicalEndpointIdRelease
+    data["__cuLogicalEndpointIdRelease"] = <intptr_t>__cuLogicalEndpointIdRelease
+    {{else}}
+    data["__cuLogicalEndpointIdRelease"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointCreate' in found_functions}}
+    global __cuLogicalEndpointCreate
+    data["__cuLogicalEndpointCreate"] = <intptr_t>__cuLogicalEndpointCreate
+    {{else}}
+    data["__cuLogicalEndpointCreate"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointAddDevice' in found_functions}}
+    global __cuLogicalEndpointAddDevice
+    data["__cuLogicalEndpointAddDevice"] = <intptr_t>__cuLogicalEndpointAddDevice
+    {{else}}
+    data["__cuLogicalEndpointAddDevice"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointDestroy' in found_functions}}
+    global __cuLogicalEndpointDestroy
+    data["__cuLogicalEndpointDestroy"] = <intptr_t>__cuLogicalEndpointDestroy
+    {{else}}
+    data["__cuLogicalEndpointDestroy"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointBindAddr' in found_functions}}
+    global __cuLogicalEndpointBindAddr
+    data["__cuLogicalEndpointBindAddr"] = <intptr_t>__cuLogicalEndpointBindAddr
+    {{else}}
+    data["__cuLogicalEndpointBindAddr"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointBindMem' in found_functions}}
+    global __cuLogicalEndpointBindMem
+    data["__cuLogicalEndpointBindMem"] = <intptr_t>__cuLogicalEndpointBindMem
+    {{else}}
+    data["__cuLogicalEndpointBindMem"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointUnbind' in found_functions}}
+    global __cuLogicalEndpointUnbind
+    data["__cuLogicalEndpointUnbind"] = <intptr_t>__cuLogicalEndpointUnbind
+    {{else}}
+    data["__cuLogicalEndpointUnbind"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointExport' in found_functions}}
+    global __cuLogicalEndpointExport
+    data["__cuLogicalEndpointExport"] = <intptr_t>__cuLogicalEndpointExport
+    {{else}}
+    data["__cuLogicalEndpointExport"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointImport' in found_functions}}
+    global __cuLogicalEndpointImport
+    data["__cuLogicalEndpointImport"] = <intptr_t>__cuLogicalEndpointImport
+    {{else}}
+    data["__cuLogicalEndpointImport"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointGetLimits' in found_functions}}
+    global __cuLogicalEndpointGetLimits
+    data["__cuLogicalEndpointGetLimits"] = <intptr_t>__cuLogicalEndpointGetLimits
+    {{else}}
+    data["__cuLogicalEndpointGetLimits"] = <intptr_t>0
+    {{endif}}
+
+    {{if 'cuLogicalEndpointQuery' in found_functions}}
+    global __cuLogicalEndpointQuery
+    data["__cuLogicalEndpointQuery"] = <intptr_t>__cuLogicalEndpointQuery
+    {{else}}
+    data["__cuLogicalEndpointQuery"] = <intptr_t>0
+    {{endif}}
+
     {{if 'cuPointerGetAttribute' in found_functions}}
     global __cuPointerGetAttribute
     data["__cuPointerGetAttribute"] = <intptr_t>__cuPointerGetAttribute
@@ -15068,6 +15489,13 @@ cpdef dict _inspect_function_pointers():
     data["__cuStreamBeginCapture_v2"] = <intptr_t>0
     {{endif}}
 
+    {{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+    global __cuStreamBeginRecaptureToGraph
+    data["__cuStreamBeginRecaptureToGraph"] = <intptr_t>__cuStreamBeginRecaptureToGraph
+    {{else}}
+    data["__cuStreamBeginRecaptureToGraph"] = <intptr_t>0
+    {{endif}}
+
     {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
     global __cuStreamBeginCaptureToGraph
     data["__cuStreamBeginCaptureToGraph"] = <intptr_t>__cuStreamBeginCaptureToGraph
@@ -15383,13 +15811,6 @@ cpdef dict _inspect_function_pointers():
     data["__cuLaunchCooperativeKernel"] = <intptr_t>0
     {{endif}}
 
-    {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-    global __cuLaunchCooperativeKernelMultiDevice
-    data["__cuLaunchCooperativeKernelMultiDevice"] = <intptr_t>__cuLaunchCooperativeKernelMultiDevice
-    {{else}}
-    data["__cuLaunchCooperativeKernelMultiDevice"] = <intptr_t>0
-    {{endif}}
-
     {{if 'cuLaunchHostFunc' in found_functions}}
     global __cuLaunchHostFunc
     data["__cuLaunchHostFunc"] = <intptr_t>__cuLaunchHostFunc
@@ -15467,6 +15888,13 @@ cpdef dict _inspect_function_pointers():
     data["__cuLaunchGridAsync"] = <intptr_t>0
     {{endif}}
 
+    {{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+    global __cuLaunchCooperativeKernelMultiDevice
+    data["__cuLaunchCooperativeKernelMultiDevice"] = <intptr_t>__cuLaunchCooperativeKernelMultiDevice
+    {{else}}
+    data["__cuLaunchCooperativeKernelMultiDevice"] = <intptr_t>0
+    {{endif}}
+
     {{if 'cuParamSetTexRef' in found_functions}}
     global __cuParamSetTexRef
     data["__cuParamSetTexRef"] = <intptr_t>__cuParamSetTexRef
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
index 01936ee0e42..178ba2022a1 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 include "../cyruntime_types.pxi"
 
 include "../_lib/cyruntime/cyruntime.pxd"
@@ -296,6 +296,11 @@ cdef cudaError_t _cudaStreamAttachMemAsync(cudaStream_t stream, void* devPtr, si
 cdef cudaError_t _cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef cudaError_t _cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) except ?cudaErrorCallRequiresNewerDriver nogil  # declaration only; the body is in cyruntime.pyx.in and must keep this exact signature
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 cdef cudaError_t _cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
index 2f3c4431475..0a7de772210 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 include "../cyruntime_functions.pxi"
 
 import os
@@ -548,6 +548,15 @@ cdef cudaError_t _cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureM
     return cudaStreamBeginCapture(stream, mode)
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef cudaError_t _cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) except ?cudaErrorCallRequiresNewerDriver nogil:  # nogil dispatcher: picks between the ptds variant and the direct runtime call
+    cdef bint usePTDS = cudaPythonInit()  # the init call's return value is used as the PTDS on/off switch
+    if usePTDS:  # PTDS requested: route through the ptds module
+        return ptds._cudaStreamBeginRecaptureToGraph(stream, mode, graph, callbackData)  # NOTE(review): `ptds` is presumably the cyruntime_ptds module -- confirm against this file's cimports
+    return cudaStreamBeginRecaptureToGraph(stream, mode, graph, callbackData)  # otherwise call the runtime function as visible in this file
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 cdef cudaError_t _cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil:
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
index 53b30d026a6..08e14a023de 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
@@ -299,6 +299,11 @@ cdef cudaError_t _cudaStreamAttachMemAsync(cudaStream_t stream, void* devPtr, si
 cdef cudaError_t _cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef cudaError_t _cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) except ?cudaErrorCallRequiresNewerDriver nogil  # declaration only; the body is in cyruntime_ptds.pyx.in and must keep this exact signature
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 cdef cudaError_t _cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
index 679722b3ccc..c771cf89de8 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
@@ -359,6 +359,12 @@ cdef cudaError_t _cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureM
     return cudaStreamBeginCapture(stream, mode)
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef cudaError_t _cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) except ?cudaErrorCallRequiresNewerDriver nogil:  # nogil pass-through to cudaStreamBeginRecaptureToGraph
+    return cudaStreamBeginRecaptureToGraph(stream, mode, graph, callbackData)  # this file's extern block defines CUDA_API_PER_THREAD_DEFAULT_STREAM before the call is compiled
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 cdef cudaError_t _cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil:
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd b/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
index 1964af1f16b..e27ff2c08da 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from ..cynvrtc cimport *
 
@@ -61,3 +61,9 @@ cdef nvrtcResult _nvrtcSetFlowCallback(nvrtcProgram prog, void* callback, void*
 cdef nvrtcResult _nvrtcGetTileIRSize(nvrtcProgram prog, size_t* TileIRSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil
 
 cdef nvrtcResult _nvrtcGetTileIR(nvrtcProgram prog, char* TileIR) except ?NVRTC_ERROR_INVALID_INPUT nogil
+
+cdef nvrtcResult _nvrtcInstallBundledHeaders(const char* installPath, unsigned int flags, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil  # declaration only; defined in nvrtc_linux.pyx and nvrtc_windows.pyx
+
+cdef nvrtcResult _nvrtcGetBundledHeadersInfo(nvrtcBundledHeadersInfo* info, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil  # declaration only; defined in nvrtc_linux.pyx and nvrtc_windows.pyx
+
+cdef nvrtcResult _nvrtcRemoveBundledHeaders(const char* installPath, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil  # declaration only; defined in nvrtc_linux.pyx and nvrtc_windows.pyx
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
index 780042a8cdc..ac9cbca550a 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
@@ -82,6 +82,9 @@ cdef void* __nvrtcGetPCHHeapSizeRequired = NULL
 cdef void* __nvrtcSetFlowCallback = NULL
 cdef void* __nvrtcGetTileIRSize = NULL
 cdef void* __nvrtcGetTileIR = NULL
+cdef void* __nvrtcInstallBundledHeaders = NULL
+cdef void* __nvrtcGetBundledHeadersInfo = NULL
+cdef void* __nvrtcRemoveBundledHeaders = NULL
 
 cdef void* load_library() except* with gil:
     cdef uintptr_t handle = load_nvidia_dynamic_lib("nvrtc")._handle_uint
@@ -280,6 +283,27 @@ cdef int _init_nvrtc() except -1 nogil:
                 handle = load_library()
             __nvrtcGetTileIR = dlsym(handle, 'nvrtcGetTileIR')
 
+        global __nvrtcInstallBundledHeaders
+        __nvrtcInstallBundledHeaders = dlsym(RTLD_DEFAULT, 'nvrtcInstallBundledHeaders')
+        if __nvrtcInstallBundledHeaders == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvrtcInstallBundledHeaders = dlsym(handle, 'nvrtcInstallBundledHeaders')
+
+        global __nvrtcGetBundledHeadersInfo
+        __nvrtcGetBundledHeadersInfo = dlsym(RTLD_DEFAULT, 'nvrtcGetBundledHeadersInfo')
+        if __nvrtcGetBundledHeadersInfo == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvrtcGetBundledHeadersInfo = dlsym(handle, 'nvrtcGetBundledHeadersInfo')
+
+        global __nvrtcRemoveBundledHeaders
+        __nvrtcRemoveBundledHeaders = dlsym(RTLD_DEFAULT, 'nvrtcRemoveBundledHeaders')
+        if __nvrtcRemoveBundledHeaders == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvrtcRemoveBundledHeaders = dlsym(handle, 'nvrtcRemoveBundledHeaders')
+
         __py_nvrtc_init = True
         return 0
 
@@ -377,6 +401,15 @@ cpdef dict _inspect_function_pointers():
     global __nvrtcGetTileIR
     data["__nvrtcGetTileIR"] = <intptr_t>__nvrtcGetTileIR
 
+    global __nvrtcInstallBundledHeaders
+    data["__nvrtcInstallBundledHeaders"] = <intptr_t>__nvrtcInstallBundledHeaders
+
+    global __nvrtcGetBundledHeadersInfo
+    data["__nvrtcGetBundledHeadersInfo"] = <intptr_t>__nvrtcGetBundledHeadersInfo
+
+    global __nvrtcRemoveBundledHeaders
+    data["__nvrtcRemoveBundledHeaders"] = <intptr_t>__nvrtcRemoveBundledHeaders
+
     func_ptrs = data
     return data
 
@@ -597,3 +630,27 @@ cdef nvrtcResult _nvrtcGetTileIR(nvrtcProgram prog, char* TileIR) except ?NVRTC_
         with gil:
             raise FunctionNotFoundError("function nvrtcGetTileIR is not found")
     return (<nvrtcResult (*)(nvrtcProgram, char*) noexcept nogil>__nvrtcGetTileIR)(prog, TileIR)
+
+cdef nvrtcResult _nvrtcInstallBundledHeaders(const char* installPath, unsigned int flags, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:  # nogil wrapper: calls through the __nvrtcInstallBundledHeaders pointer
+    global __nvrtcInstallBundledHeaders  # void* slot, NULL at module load; _init_nvrtc assigns it via dlsym (RTLD_DEFAULT first, then the loaded library handle)
+    _check_or_init_nvrtc()  # NOTE(review): presumably runs _init_nvrtc when it has not run yet -- confirm
+    if __nvrtcInstallBundledHeaders == NULL:  # no address stored for the symbol
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise FunctionNotFoundError("function nvrtcInstallBundledHeaders is not found")
+    return (<nvrtcResult (*)(const char*, unsigned int, const char**) noexcept nogil>__nvrtcInstallBundledHeaders)(installPath, flags, errorLog)  # the cast declares the callee noexcept; its nvrtcResult is returned directly
+
+cdef nvrtcResult _nvrtcGetBundledHeadersInfo(nvrtcBundledHeadersInfo* info, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:  # nogil wrapper: calls through the __nvrtcGetBundledHeadersInfo pointer
+    global __nvrtcGetBundledHeadersInfo  # void* slot, NULL at module load; _init_nvrtc assigns it via dlsym (RTLD_DEFAULT first, then the loaded library handle)
+    _check_or_init_nvrtc()  # NOTE(review): presumably runs _init_nvrtc when it has not run yet -- confirm
+    if __nvrtcGetBundledHeadersInfo == NULL:  # no address stored for the symbol
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise FunctionNotFoundError("function nvrtcGetBundledHeadersInfo is not found")
+    return (<nvrtcResult (*)(nvrtcBundledHeadersInfo*, const char**) noexcept nogil>__nvrtcGetBundledHeadersInfo)(info, errorLog)  # the cast declares the callee noexcept; its nvrtcResult is returned directly
+
+cdef nvrtcResult _nvrtcRemoveBundledHeaders(const char* installPath, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:  # nogil wrapper: calls through the __nvrtcRemoveBundledHeaders pointer
+    global __nvrtcRemoveBundledHeaders  # void* slot, NULL at module load; _init_nvrtc assigns it via dlsym (RTLD_DEFAULT first, then the loaded library handle)
+    _check_or_init_nvrtc()  # NOTE(review): presumably runs _init_nvrtc when it has not run yet -- confirm
+    if __nvrtcRemoveBundledHeaders == NULL:  # no address stored for the symbol
+        with gil:  # this function is declared nogil; take the GIL before raising
+            raise FunctionNotFoundError("function nvrtcRemoveBundledHeaders is not found")
+    return (<nvrtcResult (*)(const char*, const char**) noexcept nogil>__nvrtcRemoveBundledHeaders)(installPath, errorLog)  # the cast declares the callee noexcept; its nvrtcResult is returned directly
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
index 1fb555644e1..03427c3ef50 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -101,6 +101,9 @@ cdef void* __nvrtcGetPCHHeapSizeRequired = NULL
 cdef void* __nvrtcSetFlowCallback = NULL
 cdef void* __nvrtcGetTileIRSize = NULL
 cdef void* __nvrtcGetTileIR = NULL
+cdef void* __nvrtcInstallBundledHeaders = NULL
+cdef void* __nvrtcGetBundledHeadersInfo = NULL
+cdef void* __nvrtcRemoveBundledHeaders = NULL
 
 cdef int _init_nvrtc() except -1 nogil:
     global __py_nvrtc_init
@@ -192,6 +195,15 @@ cdef int _init_nvrtc() except -1 nogil:
         global __nvrtcGetTileIR
         __nvrtcGetTileIR = GetProcAddress(handle, 'nvrtcGetTileIR')
 
+        global __nvrtcInstallBundledHeaders
+        __nvrtcInstallBundledHeaders = GetProcAddress(handle, 'nvrtcInstallBundledHeaders')
+
+        global __nvrtcGetBundledHeadersInfo
+        __nvrtcGetBundledHeadersInfo = GetProcAddress(handle, 'nvrtcGetBundledHeadersInfo')
+
+        global __nvrtcRemoveBundledHeaders
+        __nvrtcRemoveBundledHeaders = GetProcAddress(handle, 'nvrtcRemoveBundledHeaders')
+
         __py_nvrtc_init = True
         return 0
 
@@ -289,6 +301,15 @@ cpdef dict _inspect_function_pointers():
     global __nvrtcGetTileIR
     data["__nvrtcGetTileIR"] = <intptr_t>__nvrtcGetTileIR
 
+    global __nvrtcInstallBundledHeaders
+    data["__nvrtcInstallBundledHeaders"] = <intptr_t>__nvrtcInstallBundledHeaders
+
+    global __nvrtcGetBundledHeadersInfo
+    data["__nvrtcGetBundledHeadersInfo"] = <intptr_t>__nvrtcGetBundledHeadersInfo
+
+    global __nvrtcRemoveBundledHeaders
+    data["__nvrtcRemoveBundledHeaders"] = <intptr_t>__nvrtcRemoveBundledHeaders
+
     func_ptrs = data
     return data
 
@@ -509,3 +530,27 @@ cdef nvrtcResult _nvrtcGetTileIR(nvrtcProgram prog, char* TileIR) except ?NVRTC_
         with gil:
             raise FunctionNotFoundError("function nvrtcGetTileIR is not found")
     return (<nvrtcResult (*)(nvrtcProgram, char*) noexcept nogil>__nvrtcGetTileIR)(prog, TileIR)
+
+cdef nvrtcResult _nvrtcInstallBundledHeaders(const char* installPath, unsigned int flags, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:  # call-through stub for the 'nvrtcInstallBundledHeaders' export
+    global __nvrtcInstallBundledHeaders
+    _check_or_init_nvrtc()  # NOTE(review): presumably runs _init_nvrtc() on first use - confirm
+    if __nvrtcInstallBundledHeaders == NULL:  # NULL pointer: the symbol was not resolved from the loaded library
+        with gil:  # this function is nogil; raising a Python exception requires the GIL
+            raise FunctionNotFoundError("function nvrtcInstallBundledHeaders is not found")
+    return (<nvrtcResult (*)(const char*, unsigned int, const char**) noexcept nogil>__nvrtcInstallBundledHeaders)(installPath, flags, errorLog)  # cast the void* to the C signature, then call it
+
+cdef nvrtcResult _nvrtcGetBundledHeadersInfo(nvrtcBundledHeadersInfo* info, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:  # call-through stub for the 'nvrtcGetBundledHeadersInfo' export
+    global __nvrtcGetBundledHeadersInfo
+    _check_or_init_nvrtc()  # NOTE(review): presumably runs _init_nvrtc() on first use - confirm
+    if __nvrtcGetBundledHeadersInfo == NULL:  # NULL pointer: the symbol was not resolved from the loaded library
+        with gil:  # this function is nogil; raising a Python exception requires the GIL
+            raise FunctionNotFoundError("function nvrtcGetBundledHeadersInfo is not found")
+    return (<nvrtcResult (*)(nvrtcBundledHeadersInfo*, const char**) noexcept nogil>__nvrtcGetBundledHeadersInfo)(info, errorLog)  # cast the void* to the C signature, then call it
+
+cdef nvrtcResult _nvrtcRemoveBundledHeaders(const char* installPath, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:  # call-through stub for the 'nvrtcRemoveBundledHeaders' export
+    global __nvrtcRemoveBundledHeaders
+    _check_or_init_nvrtc()  # NOTE(review): presumably runs _init_nvrtc() on first use - confirm
+    if __nvrtcRemoveBundledHeaders == NULL:  # NULL pointer: the symbol was not resolved from the loaded library
+        with gil:  # this function is nogil; raising a Python exception requires the GIL
+            raise FunctionNotFoundError("function nvrtcRemoveBundledHeaders is not found")
+    return (<nvrtcResult (*)(const char*, const char**) noexcept nogil>__nvrtcRemoveBundledHeaders)(installPath, errorLog)  # cast the void* to the C signature, then call it
diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in
index 6ed16b51bae..10786e12afd 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
@@ -635,7 +635,13 @@ cdef extern from "cuda.h":
         CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED = 146
         CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = 147
         CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = 148
-        CU_DEVICE_ATTRIBUTE_MAX = 149
+        CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED = 151
+        CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED = 152
+        CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED = 153
+        CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED = 154
+        CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED = 155
+        CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED = 156
+        CU_DEVICE_ATTRIBUTE_MAX = 157
 
     ctypedef CUdevice_attribute_enum CUdevice_attribute
 
@@ -697,7 +703,8 @@ cdef extern from "cuda.h":
         CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13
         CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14
         CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15
-        CU_FUNC_ATTRIBUTE_MAX = 16
+        CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED = 16
+        CU_FUNC_ATTRIBUTE_MAX = 17
 
     ctypedef CUfunction_attribute_enum CUfunction_attribute
 
@@ -1051,6 +1058,7 @@ cdef extern from "cuda.h":
         CU_GRAPH_NODE_TYPE_MEM_FREE = 11
         CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12
         CU_GRAPH_NODE_TYPE_CONDITIONAL = 13
+        CU_GRAPH_NODE_TYPE_RESERVED_16 = 16
 
     ctypedef CUgraphNodeType_enum CUgraphNodeType
 
@@ -1430,6 +1438,7 @@ cdef extern from "cuda.h":
         CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915
         CUDA_ERROR_KEY_ROTATION = 916
         CUDA_ERROR_STREAM_DETACHED = 917
+        CUDA_ERROR_GRAPH_RECAPTURE_FAILURE = 918
         CUDA_ERROR_UNKNOWN = 999
 
     ctypedef cudaError_enum CUresult
@@ -2349,6 +2358,7 @@ cdef extern from "cuda.h":
         CUDA_MEM_FREE_NODE_PARAMS free
         CUDA_BATCH_MEM_OP_NODE_PARAMS_v2 memOp
         CUDA_CONDITIONAL_NODE_PARAMS conditional
+        char asBytes[232]
         long long reserved2
 
     ctypedef CUgraphNodeParams_st CUgraphNodeParams
@@ -2450,8 +2460,7 @@ cdef extern from "cuda.h":
     cdef struct CUcheckpointRestoreArgs_st:
         CUcheckpointGpuPair* gpuPairs
         unsigned int gpuPairsCount
-        char reserved[44]
-        cuuint64_t reserved1
+        char reserved[52]  # single pad array; 52 = 44 + 8, the byte sizes of the two reserved fields it replaces
 
     ctypedef CUcheckpointRestoreArgs_st CUcheckpointRestoreArgs
 
@@ -2485,6 +2494,57 @@ cdef extern from "cuda.h":
 
     ctypedef CUmemDecompressParams_st CUmemDecompressParams
 
+    ctypedef cuuint32_t CUlogicalEndpointId  # type of the leId/baseLeId arguments of the cuLogicalEndpoint* declarations in this file
+
+    cdef enum CUlogicalEndpointIpcHandleType_enum:  # type of the handleType argument of cuLogicalEndpointExport/cuLogicalEndpointImport
+        CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE = 0
+        CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC = 1
+
+    ctypedef CUlogicalEndpointIpcHandleType_enum CUlogicalEndpointIpcHandleType
+
+    cdef struct CUlogicalEndpointFabricHandle_st:
+        unsigned char data[64]  # NOTE(review): presumably the buffer exchanged via cuLogicalEndpointExport/Import for the FABRIC handle type - confirm against cuda.h
+
+    ctypedef CUlogicalEndpointFabricHandle_st CUlogicalEndpointFabricHandle
+
+    cdef enum CUlogicalEndpointType_enum:  # value type of the CUlogicalEndpointProp.type field
+        CU_LOGICAL_ENDPOINT_TYPE_INVALID = 0
+        CU_LOGICAL_ENDPOINT_TYPE_UNICAST = 1
+        CU_LOGICAL_ENDPOINT_TYPE_MULTICAST = 2
+
+    ctypedef CUlogicalEndpointType_enum CUlogicalEndpointType
+
+    cdef enum CUlogicalEndpointFlag_enum:  # NOTE(review): presumably bit values for CUlogicalEndpointProp.flags - confirm against cuda.h
+        CU_LOGICAL_ENDPOINT_FLAG_NONE = 0
+        CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS = 1
+
+    ctypedef CUlogicalEndpointFlag_enum CUlogicalEndpointFlag
+
+    cdef struct anon_struct25:  # generator-assigned name; type of CUlogicalEndpointProp.unicast
+        CUdevice device
+
+    cdef struct anon_struct26:  # generator-assigned name; type of CUlogicalEndpointProp.multicast
+        unsigned int numDevices
+
+    cdef struct CUlogicalEndpointProp_struct:  # passed by const pointer to cuLogicalEndpointCreate and cuLogicalEndpointGetLimits
+        CUlogicalEndpointType type
+        anon_struct25 unicast  # NOTE(review): declared as sibling members here; check cuda.h for whether unicast/multicast share storage
+        anon_struct26 multicast
+        unsigned long long size
+        unsigned int ipcHandleTypes  # NOTE(review): presumably a mask built from CUlogicalEndpointIpcHandleType values - confirm
+        unsigned int flags
+
+    ctypedef CUlogicalEndpointProp_struct CUlogicalEndpointProp
+
+    cdef enum CUgraphRecaptureStatus_enum:  # type of the `status` argument of CUgraphRecaptureCallback
+        CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE = 0
+        CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE = 1
+        CU_GRAPH_RECAPTURE_ERROR = 2
+
+    ctypedef CUgraphRecaptureStatus_enum CUgraphRecaptureStatus
+
+    ctypedef CUresult (*CUgraphRecaptureCallback)(void* data, CUgraphNode node, const CUgraphNodeParams* originalParams, const CUgraphNodeParams* recaptureParams, CUgraphRecaptureStatus status)  # callback type taken by cuStreamBeginRecaptureToGraph; NOTE(review): `data` is presumably the userData given there - confirm
+
     cdef enum CUfunctionLoadingState_enum:
         CU_FUNCTION_LOADING_STATE_UNLOADED = 0
         CU_FUNCTION_LOADING_STATE_LOADED = 1
@@ -2514,6 +2574,9 @@ cdef extern from "cuda.h":
         CU_COREDUMP_SKIP_CONSTBANK_MEMORY = 32
         CU_COREDUMP_LIGHTWEIGHT_FLAGS = 47
         CU_COREDUMP_GZIP_COMPRESS = 64
+        CU_COREDUMP_FAULTED_CONTEXTS_ONLY = 128
+        CU_COREDUMP_NO_ERRBAR_AT_EXIT = 1073741824
+        CU_COREDUMP_LOG_ONLY = 2147483648
 
     cdef struct CUcoredumpCallbackEntry_st:
         pass
@@ -2526,6 +2589,7 @@ cdef extern from "cuda.h":
     ctypedef CUdevResourceDesc_st* CUdevResourceDesc
 
     ctypedef enum CUgreenCtxCreate_flags:
+        CU_GREEN_CTX_NONE = 0  # zero value, i.e. CU_GREEN_CTX_DEFAULT_STREAM (1) not set
         CU_GREEN_CTX_DEFAULT_STREAM = 1
 
     ctypedef enum CUdevSmResourceGroup_flags:
@@ -2744,12 +2808,12 @@ cdef enum CUeglColorFormat_enum:
 
 ctypedef CUeglColorFormat_enum CUeglColorFormat
 
-cdef union anon_union16:
+cdef union anon_union17:  # generator-assigned name; type of CUeglFrame_st.frame
     CUarray pArray[3]
     void* pPitch[3]
 
 cdef struct CUeglFrame_st:
-    anon_union16 frame
+    anon_union17 frame
     unsigned int width
     unsigned int height
     unsigned int depth
@@ -3832,6 +3896,66 @@ cdef CUresult cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice
 cdef CUresult cuMulticastGetGranularity(size_t* granularity, const CUmulticastObjectProp* prop, CUmulticastGranularity_flags option) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
+{{if 'cuLogicalEndpointIdReserve' in found_functions}}
+
+cdef CUresult cuLogicalEndpointIdReserve(CUlogicalEndpointId* baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil  # `except ?X`: Cython checks for a raised Python exception only when X is returned
+{{endif}}
+
+{{if 'cuLogicalEndpointIdRelease' in found_functions}}
+
+cdef CUresult cuLogicalEndpointIdRelease(CUlogicalEndpointId baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointCreate' in found_functions}}
+
+cdef CUresult cuLogicalEndpointCreate(CUlogicalEndpointId leId, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointAddDevice' in found_functions}}
+
+cdef CUresult cuLogicalEndpointAddDevice(CUlogicalEndpointId leId, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointDestroy' in found_functions}}
+
+cdef CUresult cuLogicalEndpointDestroy(CUlogicalEndpointId leId) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointBindAddr' in found_functions}}
+
+cdef CUresult cuLogicalEndpointBindAddr(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, void* ptr, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointBindMem' in found_functions}}
+
+cdef CUresult cuLogicalEndpointBindMem(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, CUmemGenericAllocationHandle memHandle, cuuint64_t memOffset, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointUnbind' in found_functions}}
+
+cdef CUresult cuLogicalEndpointUnbind(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, cuuint64_t size) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointExport' in found_functions}}
+
+cdef CUresult cuLogicalEndpointExport(void* handle, CUlogicalEndpointId leId, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil  # NOTE(review): `handle` is untyped; presumably a CUlogicalEndpointFabricHandle for the FABRIC handle type - confirm
+{{endif}}
+
+{{if 'cuLogicalEndpointImport' in found_functions}}
+
+cdef CUresult cuLogicalEndpointImport(CUlogicalEndpointId leId, const void* handle, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointGetLimits' in found_functions}}
+
+cdef CUresult cuLogicalEndpointGetLimits(cuuint64_t* bindAlignment, cuuint64_t* maxSize, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
+{{if 'cuLogicalEndpointQuery' in found_functions}}
+
+cdef CUresult cuLogicalEndpointQuery(CUlogicalEndpointId leId, cuuint32_t count, int* queryStatus) except ?CUDA_ERROR_NOT_FOUND nogil
+{{endif}}
+
 {{if 'cuPointerGetAttribute' in found_functions}}
 
 cdef CUresult cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -3947,6 +4071,11 @@ cdef CUresult cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, v
 cdef CUresult cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
+{{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef CUresult cuStreamBeginRecaptureToGraph(CUstream hStream, CUstreamCaptureMode mode, CUgraph hGraph, CUgraphRecaptureCallback callbackFunc, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil  # callbackFunc uses the CUgraphRecaptureCallback typedef from the cuda.h extern block of this file
+{{endif}}
+
 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
 
 cdef CUresult cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUstreamCaptureMode mode) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -4172,11 +4301,6 @@ cdef CUresult cuLaunchKernelEx(const CUlaunchConfig* config, CUfunction f, void*
 cdef CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
-{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-
-cdef CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil
-{{endif}}
-
 {{if 'cuLaunchHostFunc' in found_functions}}
 
 cdef CUresult cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -4232,6 +4356,11 @@ cdef CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height) except
 cdef CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
+{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+
+cdef CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil  # `except ?X`: Cython checks for a raised Python exception only when X is returned
+{{endif}}
+
 {{if 'cuParamSetTexRef' in found_functions}}
 
 cdef CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) except ?CUDA_ERROR_NOT_FOUND nogil
@@ -5267,7 +5396,7 @@ cdef CUresult cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource* pCudaResou
 cdef CUresult cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource* pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
 
-cdef enum: CUDA_VERSION = 13020
+cdef enum: CUDA_VERSION = 13030  # 13.3 encoded as major*1000 + minor*10; same release as the "version 13.3.0" generator header
 
 cdef enum: CU_IPC_HANDLE_SIZE = 64
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in
index 527fd10d05f..652c16c137c 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cimport cuda.bindings._bindings.cydriver as cydriver
 
 {{if 'cuGetErrorString' in found_functions}}
@@ -1228,6 +1228,78 @@ cdef CUresult cuMulticastGetGranularity(size_t* granularity, const CUmulticastOb
     return cydriver._cuMulticastGetGranularity(granularity, prop, option)
 {{endif}}
 
+{{if 'cuLogicalEndpointIdReserve' in found_functions}}
+
+cdef CUresult cuLogicalEndpointIdReserve(CUlogicalEndpointId* baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointIdReserve(baseLeId, count)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointIdRelease' in found_functions}}
+
+cdef CUresult cuLogicalEndpointIdRelease(CUlogicalEndpointId baseLeId, cuuint32_t count) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointIdRelease(baseLeId, count)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointCreate' in found_functions}}
+
+cdef CUresult cuLogicalEndpointCreate(CUlogicalEndpointId leId, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointCreate(leId, prop)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointAddDevice' in found_functions}}
+
+cdef CUresult cuLogicalEndpointAddDevice(CUlogicalEndpointId leId, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointAddDevice(leId, dev)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointDestroy' in found_functions}}
+
+cdef CUresult cuLogicalEndpointDestroy(CUlogicalEndpointId leId) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointDestroy(leId)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointBindAddr' in found_functions}}
+
+cdef CUresult cuLogicalEndpointBindAddr(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, void* ptr, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, flags)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointBindMem' in found_functions}}
+
+cdef CUresult cuLogicalEndpointBindMem(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, CUmemGenericAllocationHandle memHandle, cuuint64_t memOffset, cuuint64_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, flags)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointUnbind' in found_functions}}
+
+cdef CUresult cuLogicalEndpointUnbind(CUlogicalEndpointId leId, CUdevice dev, cuuint64_t offset, cuuint64_t size) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointUnbind(leId, dev, offset, size)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointExport' in found_functions}}
+
+cdef CUresult cuLogicalEndpointExport(void* handle, CUlogicalEndpointId leId, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointExport(handle, leId, handleType)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointImport' in found_functions}}
+
+cdef CUresult cuLogicalEndpointImport(CUlogicalEndpointId leId, const void* handle, CUlogicalEndpointIpcHandleType handleType) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointImport(leId, handle, handleType)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointGetLimits' in found_functions}}
+
+cdef CUresult cuLogicalEndpointGetLimits(cuuint64_t* bindAlignment, cuuint64_t* maxSize, const CUlogicalEndpointProp* prop) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointGetLimits(bindAlignment, maxSize, prop)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
+{{if 'cuLogicalEndpointQuery' in found_functions}}
+
+cdef CUresult cuLogicalEndpointQuery(CUlogicalEndpointId leId, cuuint32_t count, int* queryStatus) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLogicalEndpointQuery(leId, count, queryStatus)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
 {{if 'cuPointerGetAttribute' in found_functions}}
 
 cdef CUresult cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -1366,6 +1438,12 @@ cdef CUresult cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode) e
     return cydriver._cuStreamBeginCapture_v2(hStream, mode)
 {{endif}}
 
+{{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef CUresult cuStreamBeginRecaptureToGraph(CUstream hStream, CUstreamCaptureMode mode, CUgraph hGraph, CUgraphRecaptureCallback callbackFunc, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuStreamBeginRecaptureToGraph(hStream, mode, hGraph, callbackFunc, userData)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
 
 cdef CUresult cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUstreamCaptureMode mode) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -1636,12 +1714,6 @@ cdef CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, uns
     return cydriver._cuLaunchCooperativeKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams)
 {{endif}}
 
-{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-
-cdef CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil:
-    return cydriver._cuLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags)
-{{endif}}
-
 {{if 'cuLaunchHostFunc' in found_functions}}
 
 cdef CUresult cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void* userData) except ?CUDA_ERROR_NOT_FOUND nogil:
@@ -1708,6 +1780,12 @@ cdef CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, C
     return cydriver._cuLaunchGridAsync(f, grid_width, grid_height, hStream)
 {{endif}}
 
+{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+
+cdef CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil:
+    return cydriver._cuLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags)  # pure pass-through to cuda.bindings._bindings.cydriver
+{{endif}}
+
 {{if 'cuParamSetTexRef' in found_functions}}
 
 cdef CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) except ?CUDA_ERROR_NOT_FOUND nogil:
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd b/cuda_bindings/cuda/bindings/cynvrtc.pxd
index 6bf1bda94e1..9a4476c3ced 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
@@ -27,11 +27,22 @@ cdef extern from "nvrtc.h":
         NVRTC_ERROR_PCH_CREATE = 15
         NVRTC_ERROR_CANCELLED = 16
         NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED = 17
+        NVRTC_ERROR_BUSY = 18
 
     cdef struct _nvrtcProgram:
         pass
     ctypedef _nvrtcProgram* nvrtcProgram
 
+    cdef struct anon_struct0:  # generator-assigned name; exposed below as nvrtcBundledHeadersInfo
+        int available
+        size_t compressedSize
+        size_t uncompressedSize
+        int cudaVersionMajor
+        int cudaVersionMinor
+        unsigned int numFiles
+
+    ctypedef anon_struct0 nvrtcBundledHeadersInfo  # pointee type of the `info` argument of nvrtcGetBundledHeadersInfo
+
 cdef const char* nvrtcGetErrorString(nvrtcResult result) except ?NULL nogil
 
 cdef nvrtcResult nvrtcVersion(int* major, int* minor) except ?NVRTC_ERROR_INVALID_INPUT nogil
@@ -83,3 +94,15 @@ cdef nvrtcResult nvrtcSetFlowCallback(nvrtcProgram prog, void* callback, void* p
 cdef nvrtcResult nvrtcGetTileIRSize(nvrtcProgram prog, size_t* TileIRSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil
 
 cdef nvrtcResult nvrtcGetTileIR(nvrtcProgram prog, char* TileIR) except ?NVRTC_ERROR_INVALID_INPUT nogil
+
+cdef nvrtcResult nvrtcInstallBundledHeaders(const char* installPath, unsigned int flags, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil  # `except ?X`: Cython checks for a raised Python exception only when X is returned
+
+cdef nvrtcResult nvrtcGetBundledHeadersInfo(nvrtcBundledHeadersInfo* info, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil
+
+cdef nvrtcResult nvrtcRemoveBundledHeaders(const char* installPath, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil
+
+cdef enum: NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS = 0  # NOTE(review): presumably values for the `flags` argument of nvrtcInstallBundledHeaders - confirm against nvrtc.h
+
+cdef enum: NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE = 1
+
+cdef enum: NVRTC_INSTALL_HEADERS_NO_WAIT = 2
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx b/cuda_bindings/cuda/bindings/cynvrtc.pyx
index 2bf71611d72..d297aaa2998 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from ._internal cimport nvrtc as _nvrtc
 
@@ -83,3 +83,12 @@ cdef nvrtcResult nvrtcGetTileIRSize(nvrtcProgram prog, size_t* TileIRSizeRet) ex
 
 cdef nvrtcResult nvrtcGetTileIR(nvrtcProgram prog, char* TileIR) except ?NVRTC_ERROR_INVALID_INPUT nogil:
     return _nvrtc._nvrtcGetTileIR(prog, TileIR)
+
+cdef nvrtcResult nvrtcInstallBundledHeaders(const char* installPath, unsigned int flags, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:
+    return _nvrtc._nvrtcInstallBundledHeaders(installPath, flags, errorLog)  # delegates to ._internal.nvrtc (cimported as _nvrtc)
+
+cdef nvrtcResult nvrtcGetBundledHeadersInfo(nvrtcBundledHeadersInfo* info, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:
+    return _nvrtc._nvrtcGetBundledHeadersInfo(info, errorLog)  # delegates to ._internal.nvrtc (cimported as _nvrtc)
+
+cdef nvrtcResult nvrtcRemoveBundledHeaders(const char* installPath, const char** errorLog) except ?NVRTC_ERROR_INVALID_INPUT nogil:
+    return _nvrtc._nvrtcRemoveBundledHeaders(installPath, errorLog)  # delegates to ._internal.nvrtc (cimported as _nvrtc)
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
index 85108b68a9d..453011b2ba3 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
@@ -176,12 +176,12 @@ cdef struct cudaEglPlaneDesc_st:
 
 ctypedef cudaEglPlaneDesc_st cudaEglPlaneDesc
 
-cdef union anon_union11:
+cdef union anon_union12:  # generator-assigned name; type of cudaEglFrame_st.frame
     cudaArray_t pArray[3]
     cudaPitchedPtr pPitch[3]
 
 cdef struct cudaEglFrame_st:
-    anon_union11 frame
+    anon_union12 frame
     cudaEglPlaneDesc planeDesc[3]
     unsigned int planeCount
     cudaEglFrameType frameType
@@ -494,6 +494,11 @@ cdef cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, void* devPtr, siz
 cdef cudaError_t cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef cudaError_t cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) except ?cudaErrorCallRequiresNewerDriver nogil  # `except ?X`: Cython checks for a raised Python exception only when X is returned
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 cdef cudaError_t cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil
@@ -2082,8 +2087,8 @@ cdef enum: cudaTextureType2DLayered = 242
 
 cdef enum: cudaTextureTypeCubemapLayered = 252
 
-cdef enum: CUDART_VERSION = 13020
+cdef enum: CUDART_VERSION = 13030  # 13.3 encoded as major*1000 + minor*10
 
-cdef enum: __CUDART_API_VERSION = 13020
+cdef enum: __CUDART_API_VERSION = 13030  # same value as CUDART_VERSION
 
 cdef enum: CUDA_EGL_MAX_PLANES = 3
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
index 4084ed03ab7..230b5d8f84f 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cimport cuda.bindings._bindings.cyruntime as cyruntime
 cimport cython
 
@@ -353,6 +353,12 @@ cdef cudaError_t cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureMo
     return cyruntime._cudaStreamBeginCapture(stream, mode)
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+cdef cudaError_t cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) except ?cudaErrorCallRequiresNewerDriver nogil:
+    return cyruntime._cudaStreamBeginRecaptureToGraph(stream, mode, graph, callbackData)  # pure pass-through to cuda.bindings._bindings.cyruntime
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 cdef cudaError_t cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaStreamCaptureMode mode) except ?cudaErrorCallRequiresNewerDriver nogil:
diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
index 8cbdb881a73..981b55fb297 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cdef extern from "cuda_runtime_api.h":
 
     {{if 'cudaDeviceReset' in found_functions}}
@@ -293,6 +293,11 @@ cdef extern from "cuda_runtime_api.h":
 
     cudaError_t cudaStreamBeginCapture(cudaStream_t stream, cudaStreamCaptureMode mode) nogil
 
+    {{endif}}
+    {{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+    cudaError_t cudaStreamBeginRecaptureToGraph(cudaStream_t stream, cudaStreamCaptureMode mode, cudaGraph_t graph, cudaGraphRecaptureCallbackData* callbackData) nogil  # extern prototype from cuda_runtime_api.h; declared without an `except` clause, like the neighbouring prototypes
+
     {{endif}}
     {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
index fb4a4aabaac..a7ad5839ac8 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 cdef extern from "vector_types.h":
 
@@ -147,6 +147,7 @@ cdef extern from "driver_types.h":
         cudaErrorInvalidResourceType = 914
         cudaErrorInvalidResourceConfiguration = 915
         cudaErrorStreamDetached = 917
+        cudaErrorGraphRecaptureFailure = 918
         cudaErrorUnknown = 999
         cudaErrorApiFailureBase = 10000
 
@@ -322,6 +323,11 @@ cdef extern from "driver_types.h":
         unsigned int lastLayer
         unsigned int reserved[16]
 
+    cdef enum cudaSharedMemoryMode:  # declared inside the `cdef extern from "driver_types.h"` block; member semantics are defined by that header
+        cudaSharedMemoryModeDefault = 0
+        cudaSharedMemoryModeRequirePortable = 1
+        cudaSharedMemoryModeAllowNonPortable = 2
+
     cdef struct cudaPointerAttributes:
         cudaMemoryType type
         int device
@@ -346,8 +352,9 @@ cdef extern from "driver_types.h":
         int requiredClusterDepth
         int clusterSchedulingPolicyPreference
         int nonPortableClusterSizeAllowed
-        int reserved0
-        int reserved[15]
+        int deviceNodeUpdateStatus
+        int reserved1
+        int reserved[14]
 
     cdef struct cudaMemLocation:
         cudaMemLocationType type
@@ -407,13 +414,13 @@ cdef extern from "driver_types.h":
         cudaArray_t array
         cudaOffset3D offset
 
-    cdef union anon_union1:
+    cdef union anon_union2:
         anon_struct6 ptr
         anon_struct7 array
 
     cdef struct cudaMemcpy3DOperand:
         cudaMemcpy3DOperandType type
-        anon_union1 op
+        anon_union2 op
 
     cdef struct cudaMemcpy3DBatchOp:
         cudaMemcpy3DOperand src
@@ -543,14 +550,14 @@ cdef extern from "driver_types.h":
         void* handle
         const void* name
 
-    cdef union anon_union2:
+    cdef union anon_union3:
         int fd
         anon_struct8 win32
         const void* nvSciBufObject
 
     cdef struct cudaExternalMemoryHandleDesc:
         cudaExternalMemoryHandleType type
-        anon_union2 handle
+        anon_union3 handle
         unsigned long long size
         unsigned int flags
         unsigned int reserved[16]
@@ -573,21 +580,21 @@ cdef extern from "driver_types.h":
         void* handle
         const void* name
 
-    cdef union anon_union3:
+    cdef union anon_union4:
         int fd
         anon_struct9 win32
         const void* nvSciSyncObj
 
     cdef struct cudaExternalSemaphoreHandleDesc:
         cudaExternalSemaphoreHandleType type
-        anon_union3 handle
+        anon_union4 handle
         unsigned int flags
         unsigned int reserved[16]
 
     cdef struct anon_struct10:
         unsigned long long value
 
-    cdef union anon_union4:
+    cdef union anon_union5:
         void* fence
         unsigned long long reserved
 
@@ -596,7 +603,7 @@ cdef extern from "driver_types.h":
 
     cdef struct anon_struct12:
         anon_struct10 fence
-        anon_union4 nvSciSync
+        anon_union5 nvSciSync
         anon_struct11 keyedMutex
         unsigned int reserved[12]
 
@@ -608,7 +615,7 @@ cdef extern from "driver_types.h":
     cdef struct anon_struct13:
         unsigned long long value
 
-    cdef union anon_union5:
+    cdef union anon_union6:
         void* fence
         unsigned long long reserved
 
@@ -618,7 +625,7 @@ cdef extern from "driver_types.h":
 
     cdef struct anon_struct15:
         anon_struct13 fence
-        anon_union5 nvSciSync
+        anon_union6 nvSciSync
         anon_struct14 keyedMutex
         unsigned int reserved[10]
 
@@ -842,7 +849,7 @@ cdef extern from "driver_types.h":
         size_t offset
         size_t size
 
-    cdef union anon_union9:
+    cdef union anon_union10:
         dim3 gridDim
         anon_struct16 param
         unsigned int isEnabled
@@ -850,7 +857,7 @@ cdef extern from "driver_types.h":
     cdef struct cudaGraphKernelNodeUpdate:
         cudaGraphDeviceNode_t node
         cudaGraphKernelNodeField field
-        anon_union9 updateData
+        anon_union10 updateData
 
     cdef enum cudaLaunchMemSyncDomain:
         cudaLaunchMemSyncDomainDefault = 0
@@ -867,11 +874,6 @@ cdef extern from "driver_types.h":
         cudaLaunchPortableClusterModeRequirePortable = 1
         cudaLaunchPortableClusterModeAllowNonPortable = 2
 
-    cdef enum cudaSharedMemoryMode:
-        cudaSharedMemoryModeDefault = 0
-        cudaSharedMemoryModeRequirePortable = 1
-        cudaSharedMemoryModeAllowNonPortable = 2
-
     cdef enum cudaLaunchAttributeID:
         cudaLaunchAttributeIgnore = 0
         cudaLaunchAttributeAccessPolicyWindow = 1
@@ -953,12 +955,12 @@ cdef extern from "driver_types.h":
     cdef struct anon_struct22:
         unsigned long long bytesOverBudget
 
-    cdef union anon_union10:
+    cdef union anon_union11:
         anon_struct22 overBudget
 
     cdef struct cudaAsyncNotificationInfo:
         cudaAsyncNotificationType type
-        anon_union10 info
+        anon_union11 info
 
     ctypedef cudaAsyncNotificationInfo cudaAsyncNotificationInfo_t
 
@@ -1009,6 +1011,20 @@ cdef extern from "driver_types.h":
         cudaChannelFormatKindUnsignedBlockCompressed7 = 29
         cudaChannelFormatKindUnsignedBlockCompressed7SRGB = 30
         cudaChannelFormatKindUnsignedNormalized1010102 = 31
+        cudaChannelFormatKindUnsigned8Packed422 = 32
+        cudaChannelFormatKindUnsigned8Packed444 = 33
+        cudaChannelFormatKindUnsigned8SemiPlanar420 = 34
+        cudaChannelFormatKindUnsigned16SemiPlanar420 = 35
+        cudaChannelFormatKindUnsigned8SemiPlanar422 = 36
+        cudaChannelFormatKindUnsigned16SemiPlanar422 = 37
+        cudaChannelFormatKindUnsigned8SemiPlanar444 = 38
+        cudaChannelFormatKindUnsigned16SemiPlanar444 = 39
+        cudaChannelFormatKindUnsigned8Planar420 = 40
+        cudaChannelFormatKindUnsigned16Planar420 = 41
+        cudaChannelFormatKindUnsigned8Planar422 = 42
+        cudaChannelFormatKindUnsigned16Planar422 = 43
+        cudaChannelFormatKindUnsigned8Planar444 = 44
+        cudaChannelFormatKindUnsigned16Planar444 = 45
 
     cdef enum cudaMemoryType:
         cudaMemoryTypeUnregistered = 0
@@ -1033,6 +1049,11 @@ cdef extern from "driver_types.h":
         cudaStreamCaptureStatusActive = 1
         cudaStreamCaptureStatusInvalidated = 2
 
+    cdef enum cudaGraphRecaptureStatus:  # type of the `status` argument of cudaGraphRecaptureCallback_t
+        cudaGraphRecaptureEligibleForUpdate = 0
+        cudaGraphRecaptureIneligibleForUpdate = 1
+        cudaGraphRecaptureError = 2
+
     cdef enum cudaStreamCaptureMode:
         cudaStreamCaptureModeGlobal = 0
         cudaStreamCaptureModeThreadLocal = 1
@@ -1340,7 +1361,9 @@ cdef extern from "driver_types.h":
         cudaDevAttrHostMemoryPoolsSupported = 144
         cudaDevAttrReserved145 = 145
         cudaDevAttrOnlyPartialHostNativeAtomicSupported = 147
-        cudaDevAttrMax = 148
+        cudaDevAttrAtomicReductionSupported = 148
+        cudaDevAttrCigStreamsSupported = 151
+        cudaDevAttrMax = 152
 
     cdef enum cudaMemPoolAttr:
         cudaMemPoolReuseFollowEventDependencies = 1
@@ -1543,7 +1566,8 @@ cdef extern from "driver_types.h":
         cudaGraphNodeTypeMemAlloc = 10
         cudaGraphNodeTypeMemFree = 11
         cudaGraphNodeTypeConditional = 13
-        cudaGraphNodeTypeCount = 14
+        cudaGraphNodeTypeReserved16 = 16
+        cudaGraphNodeTypeCount = 17
 
     cdef enum cudaGraphChildGraphNodeOwnership:
         cudaGraphChildGraphOwnershipInvalid = -1
@@ -1600,6 +1624,15 @@ cdef extern from "driver_types.h":
         cudaDeviceNumaConfigNone = 0
         cudaDeviceNumaConfigNumaNode = 1
 
+    cdef enum cudaFabricOpStatusSource:  # extern declaration from driver_types.h
+        cudaFabricOpStatusSourceMbarrierV1 = 0
+        cudaFabricOpStatusSourceMax = 2147483647  # 0x7FFFFFFF, largest signed 32-bit value
+
+    cdef enum cudaFabricOpStatusInfo:  # extern declaration from driver_types.h
+        cudaFabricOpStatusInfoSuccess = 0
+        cudaFabricOpStatusInfoLast = 0  # same numeric value as cudaFabricOpStatusInfoSuccess
+        cudaFabricOpStatusInfoMax = 2147483647  # 0x7FFFFFFF, largest signed 32-bit value
+
 cdef extern from "surface_types.h":
 
     ctypedef unsigned long long cudaSurfaceObject_t
@@ -1719,6 +1752,12 @@ cdef extern from "cuda_runtime_api.h":
 
     ctypedef void (*cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void* userData)
 
+    ctypedef cudaError_t (*cudaGraphRecaptureCallback_t)(void* data, cudaGraphNode_t node, const cudaGraphNodeParams* originalParams, const cudaGraphNodeParams* recaptureParams, cudaGraphRecaptureStatus status)
+
+    cdef struct cudaGraphRecaptureCallbackData:  # passed by pointer to cudaStreamBeginRecaptureToGraph
+        cudaGraphRecaptureCallback_t callbackFunc  # function pointer with the cudaGraphRecaptureCallback_t signature
+        void* userData  # NOTE(review): presumably forwarded as the callback's `data` argument - confirm in the CUDA docs
+
     ctypedef void (*cudaLogsCallback_t)(void* data, cudaLogLevel logLevel, char* message, size_t length)
 
 cdef extern from "device_types.h":
diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index a5328c2f5e8..a90c6addff3 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cimport cuda.bindings.cydriver as cydriver
 
 include "_lib/utils.pxd"
@@ -567,6 +567,21 @@ cdef class CUoccupancyB2DSize:
     cdef cydriver.CUoccupancyB2DSize* _pvt_ptr
 {{endif}}
 
+{{if 'CUgraphRecaptureCallback' in found_types}}
+
+cdef class CUgraphRecaptureCallback:
+    """Wrapper around a ``cydriver.CUgraphRecaptureCallback`` value.
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+
+    """
+    cdef cydriver.CUgraphRecaptureCallback  _pvt_val  # value stored inside the instance itself
+    cdef cydriver.CUgraphRecaptureCallback* _pvt_ptr  # active pointer: &_pvt_val, or the address given as _ptr
+{{endif}}
+
 {{if 'CUcoredumpStatusCallback' in found_types}}
 
 cdef class CUcoredumpStatusCallback:
@@ -4348,7 +4363,9 @@ cdef class CUmemLocation_st:
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-
+        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
     {{endif}}
 
     Methods
@@ -5163,6 +5180,10 @@ cdef class CUgraphNodeParams_st:
     conditional : CUDA_CONDITIONAL_NODE_PARAMS
         Conditional node parameters.
     {{endif}}
+    {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
+    asBytes : bytes
+        Padding as bytes
+    {{endif}}
     {{if 'CUgraphNodeParams_st.reserved2' in found_struct}}
     reserved2 : long long
         Reserved bytes. Must be zero.
@@ -5318,10 +5339,6 @@ cdef class CUcheckpointRestoreArgs_st:
     reserved : bytes
         Reserved for future use, must be zeroed
     {{endif}}
-    {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    reserved1 : cuuint64_t
-        Reserved for future use, must be zeroed
-    {{endif}}
 
     Methods
     -------
@@ -5334,9 +5351,6 @@ cdef class CUcheckpointRestoreArgs_st:
     cdef size_t _gpuPairs_length
     cdef cydriver.CUcheckpointGpuPair* _gpuPairs
     {{endif}}
-    {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    cdef cuuint64_t _reserved1
-    {{endif}}
 {{endif}}
 {{if 'CUcheckpointUnlockArgs_st' in found_struct}}
 
@@ -5420,6 +5434,114 @@ cdef class CUmemDecompressParams_st:
     cdef _HelperInputVoidPtr _cydst
     {{endif}}
 {{endif}}
+{{if 'CUlogicalEndpointFabricHandle_st' in found_struct}}
+
+cdef class CUlogicalEndpointFabricHandle_st:
+    """
+    Fabric handle for a logical endpoint
+
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointFabricHandle_st.data' in found_struct}}
+    data : bytes
+
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cydriver.CUlogicalEndpointFabricHandle_st _pvt_val  # struct held by value inside the instance
+    cdef cydriver.CUlogicalEndpointFabricHandle_st* _pvt_ptr  # NOTE(review): presumably &_pvt_val unless an external address is given - confirm in driver.pyx.in
+{{endif}}
+{{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+
+cdef class anon_struct25:  # type of the `unicast` attribute of CUlogicalEndpointProp_struct
+    """
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
+    device : CUdevice
+
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cydriver.CUlogicalEndpointProp_struct* _pvt_ptr  # typed as the parent struct, not as a standalone unicast struct
+    {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
+    cdef CUdevice _device  # NOTE(review): presumably the CUdevice object behind the `device` attribute - confirm in driver.pyx.in
+    {{endif}}
+{{endif}}
+{{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+
+cdef class anon_struct26:  # type of the `multicast` attribute of CUlogicalEndpointProp_struct
+    """
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.multicast.numDevices' in found_struct}}
+    numDevices : unsigned int
+
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cydriver.CUlogicalEndpointProp_struct* _pvt_ptr  # typed as the parent struct, not as a standalone multicast struct
+{{endif}}
+{{if 'CUlogicalEndpointProp_struct' in found_struct}}
+
+cdef class CUlogicalEndpointProp_struct:
+    """
+    Properties of a logical endpoint construction
+
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
+    type : CUlogicalEndpointType
+        Type of the logical endpoint defined in CUlogicalEndpointType
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+    unicast : anon_struct25
+
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+    multicast : anon_struct26
+
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.size' in found_struct}}
+    size : unsigned long long
+        Size of the logical endpoint
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
+    ipcHandleTypes : unsigned int
+        A bitmask of IPC handle types defined in
+        CUlogicalEndpointIpcHandleType
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
+    flags : unsigned int
+        A bitmask of flags defined in CUlogicalEndpointFlag
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cydriver.CUlogicalEndpointProp_struct* _val_ptr  # NOTE(review): second pointer of the same type; presumably instance-owned storage - confirm in driver.pyx.in
+    cdef cydriver.CUlogicalEndpointProp_struct* _pvt_ptr  # pointer to the wrapped struct
+    {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+    cdef anon_struct25 _unicast  # anon_struct25 object for the `unicast` attribute
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+    cdef anon_struct26 _multicast  # anon_struct26 object for the `multicast` attribute
+    {{endif}}
+{{endif}}
 {{if 'CUdevSmResource_st' in found_struct}}
 
 cdef class CUdevSmResource_st:
@@ -5603,7 +5725,7 @@ cdef class CUdevResource_st:
 {{endif}}
 {{if True}}
 
-cdef class anon_union16:
+cdef class anon_union17:
     """
     Attributes
     ----------
@@ -5634,7 +5756,7 @@ cdef class CUeglFrame_st:
     Attributes
     ----------
     {{if True}}
-    frame : anon_union16
+    frame : anon_union17
 
     {{endif}}
     {{if True}}
@@ -5682,7 +5804,7 @@ cdef class CUeglFrame_st:
     cdef cydriver.CUeglFrame_st* _val_ptr
     cdef cydriver.CUeglFrame_st* _pvt_ptr
     {{if True}}
-    cdef anon_union16 _frame
+    cdef anon_union17 _frame
     {{endif}}
 {{endif}}
 {{if 'CUdeviceptr' in found_types}}
@@ -9978,7 +10100,9 @@ cdef class CUmemLocation_v1(CUmemLocation_st):
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-
+        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
     {{endif}}
 
     Methods
@@ -10002,7 +10126,9 @@ cdef class CUmemLocation(CUmemLocation_v1):
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-
+        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
     {{endif}}
 
     Methods
@@ -10994,6 +11120,10 @@ cdef class CUgraphNodeParams(CUgraphNodeParams_st):
     conditional : CUDA_CONDITIONAL_NODE_PARAMS
         Conditional node parameters.
     {{endif}}
+    {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
+    asBytes : bytes
+        Padding as bytes
+    {{endif}}
     {{if 'CUgraphNodeParams_st.reserved2' in found_struct}}
     reserved2 : long long
         Reserved bytes. Must be zero.
@@ -11100,10 +11230,6 @@ cdef class CUcheckpointRestoreArgs(CUcheckpointRestoreArgs_st):
     reserved : bytes
         Reserved for future use, must be zeroed
     {{endif}}
-    {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    reserved1 : cuuint64_t
-        Reserved for future use, must be zeroed
-    {{endif}}
 
     Methods
     -------
@@ -11186,6 +11312,81 @@ cdef class CUmemDecompressParams(CUmemDecompressParams_st):
     """
     pass
 {{endif}}
+{{if 'CUlogicalEndpointId' in found_types}}
+
+cdef class CUlogicalEndpointId:
+    """Wrapper around a ``cydriver.CUlogicalEndpointId`` value.
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+
+    """
+    cdef cydriver.CUlogicalEndpointId  _pvt_val  # value stored inside the instance itself
+    cdef cydriver.CUlogicalEndpointId* _pvt_ptr  # active pointer: &_pvt_val, or the address given as _ptr
+{{endif}}
+{{if 'CUlogicalEndpointFabricHandle' in found_types}}
+
+cdef class CUlogicalEndpointFabricHandle(CUlogicalEndpointFabricHandle_st):
+    """
+    Fabric handle for a logical endpoint
+
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointFabricHandle_st.data' in found_struct}}
+    data : bytes
+
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    pass  # declares no members of its own; everything comes from CUlogicalEndpointFabricHandle_st
+{{endif}}
+{{if 'CUlogicalEndpointProp' in found_types}}
+
+cdef class CUlogicalEndpointProp(CUlogicalEndpointProp_struct):
+    """
+    Properties of a logical endpoint construction
+
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
+    type : CUlogicalEndpointType
+        Type of the logical endpoint defined in CUlogicalEndpointType
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+    unicast : anon_struct25
+
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+    multicast : anon_struct26
+
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.size' in found_struct}}
+    size : unsigned long long
+        Size of the logical endpoint
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
+    ipcHandleTypes : unsigned int
+        A bitmask of IPC handle types defined in
+        CUlogicalEndpointIpcHandleType
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
+    flags : unsigned int
+        A bitmask of flags defined in CUlogicalEndpointFlag
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    pass  # declares no members of its own; everything comes from CUlogicalEndpointProp_struct
+{{endif}}
 {{if 'CUdevSmResource' in found_types}}
 
 cdef class CUdevSmResource(CUdevSmResource_st):
@@ -11401,7 +11602,7 @@ cdef class CUeglFrame_v1(CUeglFrame_st):
     Attributes
     ----------
     {{if True}}
-    frame : anon_union16
+    frame : anon_union17
 
     {{endif}}
     {{if True}}
@@ -11459,7 +11660,7 @@ cdef class CUeglFrame(CUeglFrame_v1):
     Attributes
     ----------
     {{if True}}
-    frame : anon_union16
+    frame : anon_union17
 
     {{endif}}
     {{if True}}
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 46562e7f63b..33fe9cdad77 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1630+gadce055ea.d20260422. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -2050,7 +2050,10 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
-        'Maximum optin shared memory per block\n'
+        'Maximum optin shared memory per block. That is shared memory that is\n'
+        'available for dynamic allocation or static allocation (including\n'
+        'architecture specific static shared memory) on this device but is not\n'
+        'guaranteed to be portable.\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES' in found_values}}
 
@@ -2400,6 +2403,43 @@ class CUdevice_attribute(_FastEnum):
         'Device supports atomic reduction operations in stream batch memory\n'
         'operations\n'
     ){{endif}}
+    {{if 'CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED' in found_values}}
+
+    CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED = (
+        cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED,
+        'Device supports CIG streams with D3D12\n'
+    ){{endif}}
+    {{if 'CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED' in found_values}}
+
+    CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED = (
+        cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED,
+        'Device supports mmap() of dmabuf file descriptors for CUDA device memory\n'
+        'allocations\n'
+    ){{endif}}
+    {{if 'CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED' in found_values}}
+
+    CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED = (
+        cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED,
+        'Device supports unicast logical endpoints\n'
+    ){{endif}}
+    {{if 'CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED' in found_values}}
+
+    CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED = (
+        cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED,
+        'Device supports multicast logical endpoints\n'
+    ){{endif}}
+    {{if 'CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED' in found_values}}
+
+    CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED = (
+        cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED,
+        'Device supports counted operations via logical endpoints\n'
+    ){{endif}}
+    {{if 'CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED' in found_values}}
+
+    CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED = (
+        cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED,
+        'Device supports unicast logical endpoint access on the owner device\n'
+    ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MAX' in found_values}}
     CU_DEVICE_ATTRIBUTE_MAX = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAX{{endif}}
 
@@ -2709,6 +2749,13 @@ class CUfunction_attribute(_FastEnum):
         'CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. See\n'
         ':py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
     ){{endif}}
+    {{if 'CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED' in found_values}}
+
+    CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED = (
+        cydriver.CUfunction_attribute_enum.CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED,
+        'Whether the function can be updated on device. 1 means device node update\n'
+        'is supported, 0 is unsupported. See :py:obj:`~.cuFuncGetAttribute`.\n'
+    ){{endif}}
     {{if 'CU_FUNC_ATTRIBUTE_MAX' in found_values}}
     CU_FUNC_ATTRIBUTE_MAX = cydriver.CUfunction_attribute_enum.CU_FUNC_ATTRIBUTE_MAX{{endif}}
 
@@ -3964,6 +4011,12 @@ class CUgraphNodeType(_FastEnum):
         '                                        call\n'
         ':py:obj:`~.cudaGraphSetConditional` from device code.\n'
     ){{endif}}
+    {{if 'CU_GRAPH_NODE_TYPE_RESERVED_16' in found_values}}
+
+    CU_GRAPH_NODE_TYPE_RESERVED_16 = (
+        cydriver.CUgraphNodeType_enum.CU_GRAPH_NODE_TYPE_RESERVED_16,
+        'Reserved\n'
+    ){{endif}}
 
 {{endif}}
 {{if 'CUgraphDependencyType_enum' in found_types}}
@@ -5388,6 +5441,13 @@ class CUresult(_FastEnum):
         "associated with the stream has been destroyed, limiting the stream's\n"
         'operational capabilities.\n'
     ){{endif}}
+    {{if 'CUDA_ERROR_GRAPH_RECAPTURE_FAILURE' in found_values}}
+
+    CUDA_ERROR_GRAPH_RECAPTURE_FAILURE = (
+        cydriver.cudaError_enum.CUDA_ERROR_GRAPH_RECAPTURE_FAILURE,
+        'This error indicates that a graph recapture failed and had to be\n'
+        'terminated.\n'
+    ){{endif}}
     {{if 'CUDA_ERROR_UNKNOWN' in found_values}}
 
     CUDA_ERROR_UNKNOWN = (
@@ -6910,6 +6970,81 @@ class CUmemDecompressAlgorithm(_FastEnum):
         'LZ4 is supported.\n'
     ){{endif}}
 
+{{endif}}
+{{if 'CUlogicalEndpointIpcHandleType_enum' in found_types}}
+
+class CUlogicalEndpointIpcHandleType(_FastEnum):
+    """
+    IPC handle types that can be requested/queried for a given logical
+    endpoint; used as a bitmask in ``CUlogicalEndpointProp.ipcHandleTypes``
+    """
+    {{if 'CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE' in found_values}}
+    CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE = cydriver.CUlogicalEndpointIpcHandleType_enum.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE{{endif}}
+    {{if 'CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC' in found_values}}
+    CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC = cydriver.CUlogicalEndpointIpcHandleType_enum.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC{{endif}}
+
+{{endif}}
+{{if 'CUlogicalEndpointType_enum' in found_types}}
+
+class CUlogicalEndpointType(_FastEnum):
+    """
+    Logical endpoint type; value type of ``CUlogicalEndpointProp.type``
+    """
+    {{if 'CU_LOGICAL_ENDPOINT_TYPE_INVALID' in found_values}}
+    CU_LOGICAL_ENDPOINT_TYPE_INVALID = cydriver.CUlogicalEndpointType_enum.CU_LOGICAL_ENDPOINT_TYPE_INVALID{{endif}}
+    {{if 'CU_LOGICAL_ENDPOINT_TYPE_UNICAST' in found_values}}
+    CU_LOGICAL_ENDPOINT_TYPE_UNICAST = cydriver.CUlogicalEndpointType_enum.CU_LOGICAL_ENDPOINT_TYPE_UNICAST{{endif}}
+    {{if 'CU_LOGICAL_ENDPOINT_TYPE_MULTICAST' in found_values}}
+    CU_LOGICAL_ENDPOINT_TYPE_MULTICAST = cydriver.CUlogicalEndpointType_enum.CU_LOGICAL_ENDPOINT_TYPE_MULTICAST{{endif}}
+
+{{endif}}
+{{if 'CUlogicalEndpointFlag_enum' in found_types}}
+
+class CUlogicalEndpointFlag(_FastEnum):
+    """
+    Flags for :py:obj:`~.CUlogicalEndpointProp` (bitmask held in its ``flags`` field)
+    """
+    {{if 'CU_LOGICAL_ENDPOINT_FLAG_NONE' in found_values}}
+
+    CU_LOGICAL_ENDPOINT_FLAG_NONE = (
+        cydriver.CUlogicalEndpointFlag_enum.CU_LOGICAL_ENDPOINT_FLAG_NONE,
+        'Default flag for logical endpoint construction\n'
+    ){{endif}}
+    {{if 'CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS' in found_values}}
+
+    CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS = (
+        cydriver.CUlogicalEndpointFlag_enum.CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS,
+        "Indicate the programmer's intention to use counted operations with the\n"
+        'logical endpoint\n'
+    ){{endif}}
+
+{{endif}}
+{{if 'CUgraphRecaptureStatus_enum' in found_types}}
+
+class CUgraphRecaptureStatus(_FastEnum):
+    """
+    Recapture status of a graph node (see the member descriptions)
+    """
+    {{if 'CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE' in found_values}}
+
+    CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE = (
+        cydriver.CUgraphRecaptureStatus_enum.CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE,
+        'Node is eligible for update in an instantiated graph.\n'
+    ){{endif}}
+    {{if 'CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE' in found_values}}
+
+    CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE = (
+        cydriver.CUgraphRecaptureStatus_enum.CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE,
+        'Parameter changes in the node cannot be applied to an instantiated graph.\n'
+    ){{endif}}
+    {{if 'CU_GRAPH_RECAPTURE_ERROR' in found_values}}
+
+    CU_GRAPH_RECAPTURE_ERROR = (
+        cydriver.CUgraphRecaptureStatus_enum.CU_GRAPH_RECAPTURE_ERROR,
+        'Error while attempting to recapture the node. The recapture will be ended\n'
+        'regardless of the return value from the callback.\n'
+    ){{endif}}
+
 {{endif}}
 {{if 'CUfunctionLoadingState_enum' in found_types}}
 
@@ -6973,19 +7108,27 @@ class CUCoredumpGenerationFlags(_FastEnum):
     CU_COREDUMP_LIGHTWEIGHT_FLAGS = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_LIGHTWEIGHT_FLAGS{{endif}}
     {{if 'CU_COREDUMP_GZIP_COMPRESS' in found_values}}
     CU_COREDUMP_GZIP_COMPRESS = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_GZIP_COMPRESS{{endif}}
+    {{if 'CU_COREDUMP_FAULTED_CONTEXTS_ONLY' in found_values}}
+    CU_COREDUMP_FAULTED_CONTEXTS_ONLY = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_FAULTED_CONTEXTS_ONLY{{endif}}
+    {{if 'CU_COREDUMP_NO_ERRBAR_AT_EXIT' in found_values}}
+    CU_COREDUMP_NO_ERRBAR_AT_EXIT = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_NO_ERRBAR_AT_EXIT{{endif}}
+    {{if 'CU_COREDUMP_LOG_ONLY' in found_values}}
+    CU_COREDUMP_LOG_ONLY = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_LOG_ONLY{{endif}}
 
 {{endif}}
 {{if 'CUgreenCtxCreate_flags' in found_types}}
 
 class CUgreenCtxCreate_flags(_FastEnum):
     """
-
+    Flags for green context creation
     """
+    {{if 'CU_GREEN_CTX_NONE' in found_values}}
+    CU_GREEN_CTX_NONE = cydriver.CUgreenCtxCreate_flags.CU_GREEN_CTX_NONE{{endif}}
     {{if 'CU_GREEN_CTX_DEFAULT_STREAM' in found_values}}
 
     CU_GREEN_CTX_DEFAULT_STREAM = (
         cydriver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM,
-        'Required. Creates a default stream to use inside the green context\n'
+        'Creates a default stream to use inside the green context\n'
     ){{endif}}
 
 {{endif}}
@@ -8661,6 +8804,34 @@ cdef class CUmemGenericAllocationHandle:
         return <void_ptr>self._pvt_ptr
 {{endif}}
 
+{{if 'CUlogicalEndpointId' in found_types}}
+
+cdef class CUlogicalEndpointId:
+    """Wrapper around a ``cydriver.CUlogicalEndpointId`` value, convertible with ``int()``.
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+
+    """
+    def __cinit__(self, uint32_t init_value = 0, void_ptr _ptr = 0):
+        if _ptr == 0:  # no external address given: use the value stored in this object
+            self._pvt_ptr = &self._pvt_val
+        else:  # otherwise operate directly on the memory at _ptr
+            self._pvt_ptr = <cydriver.CUlogicalEndpointId *>_ptr
+        if init_value:  # a zero init_value leaves the pointed-to value untouched
+            self._pvt_ptr[0] = init_value
+    def __dealloc__(self):
+        pass
+    def __repr__(self):
+        return '<CUlogicalEndpointId ' + str(self.__int__()) + '>'
+    def __int__(self):
+        return <uint32_t>self._pvt_ptr[0]  # current value, read through the active pointer
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr  # address of the wrapped value, cast to void_ptr
+{{endif}}
+
 {{if 'CUcontext' in found_types}}
 
 cdef class CUcontext:
@@ -9880,6 +10051,35 @@ cdef class CUoccupancyB2DSize:
         return <void_ptr>self._pvt_ptr
 {{endif}}
 
+{{if 'CUgraphRecaptureCallback' in found_types}}
+
+cdef class CUgraphRecaptureCallback:
+    """Wrapper around a ``cydriver.CUgraphRecaptureCallback`` value, convertible with ``int()``.
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+
+    """
+    def __cinit__(self, void_ptr init_value = 0, void_ptr _ptr = 0):
+        if _ptr == 0:  # no external address given: use own storage and store init_value in it
+            self._pvt_ptr = &self._pvt_val
+            self._pvt_ptr[0] = <cydriver.CUgraphRecaptureCallback>init_value
+        else:  # external storage: init_value is not written in this branch
+            self._pvt_ptr = <cydriver.CUgraphRecaptureCallback *>_ptr
+    def __init__(self, *args, **kwargs):  # accepts and ignores all arguments
+        pass
+    def __repr__(self):
+        return '<CUgraphRecaptureCallback ' + str(hex(self.__int__())) + '>'
+    def __index__(self):
+        return self.__int__()
+    def __int__(self):
+        return <void_ptr>self._pvt_ptr[0]  # the stored callback value, cast to void_ptr
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr  # address of the wrapped value, cast to void_ptr
+{{endif}}
+
 {{if 'CUcoredumpStatusCallback' in found_types}}
 
 cdef class CUcoredumpStatusCallback:
@@ -21701,7 +21901,9 @@ cdef class CUmemLocation_st:
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-
+        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
+        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
     {{endif}}
 
     Methods
@@ -23931,6 +24133,10 @@ cdef class CUgraphNodeParams_st:
     conditional : CUDA_CONDITIONAL_NODE_PARAMS
         Conditional node parameters.
     {{endif}}
+    {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
+    asBytes : bytes
+        Padding as bytes
+    {{endif}}
     {{if 'CUgraphNodeParams_st.reserved2' in found_struct}}
     reserved2 : long long
         Reserved bytes. Must be zero.
@@ -24092,6 +24298,12 @@ cdef class CUgraphNodeParams_st:
             except ValueError:
                 str_list += ['conditional : <ValueError>']
             {{endif}}
+            {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
+            try:
+                str_list += ['asBytes : ' + str(self.asBytes)]
+            except ValueError:
+                str_list += ['asBytes : <ValueError>']
+            {{endif}}
             {{if 'CUgraphNodeParams_st.reserved2' in found_struct}}
             try:
                 str_list += ['reserved2 : ' + str(self.reserved2)]
@@ -24229,6 +24441,25 @@ cdef class CUgraphNodeParams_st:
     def conditional(self, conditional not None : CUDA_CONDITIONAL_NODE_PARAMS):
         string.memcpy(&self._pvt_ptr[0].conditional, <cydriver.CUDA_CONDITIONAL_NODE_PARAMS*><void_ptr>conditional.getPtr(), sizeof(self._pvt_ptr[0].conditional))
     {{endif}}
+    {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
+    @property
+    def asBytes(self):
+        return PyBytes_FromStringAndSize(self._pvt_ptr[0].asBytes, 232)  # new bytes object holding a copy of all 232 bytes
+    @asBytes.setter
+    def asBytes(self, asBytes):
+        if len(asBytes) != 232:  # the whole member must be supplied; partial writes are rejected
+            raise ValueError("asBytes length must be 232, is " + str(len(asBytes)))
+        if CHAR_MIN == 0:  # plain C char is unsigned on this platform
+            for i, b in enumerate(asBytes):
+                if b < 0 and b > -129:  # fold -128..-1 into 128..255
+                    b = b + 256
+                self._pvt_ptr[0].asBytes[i] = b
+        else:  # plain C char is signed on this platform
+            for i, b in enumerate(asBytes):
+                if b > 127 and b < 256:  # fold 128..255 into -128..-1
+                    b = b - 256
+                self._pvt_ptr[0].asBytes[i] = b
+    {{endif}}
     {{if 'CUgraphNodeParams_st.reserved2' in found_struct}}
     @property
     def reserved2(self):
@@ -24472,10 +24703,6 @@ cdef class CUcheckpointRestoreArgs_st:
     reserved : bytes
         Reserved for future use, must be zeroed
     {{endif}}
-    {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    reserved1 : cuuint64_t
-        Reserved for future use, must be zeroed
-    {{endif}}
 
     Methods
     -------
@@ -24489,9 +24716,6 @@ cdef class CUcheckpointRestoreArgs_st:
             self._pvt_ptr = <cydriver.CUcheckpointRestoreArgs_st *>_ptr
     def __init__(self, void_ptr _ptr = 0):
         pass
-        {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-        self._reserved1 = cuuint64_t(_ptr=<void_ptr>&self._pvt_ptr[0].reserved1)
-        {{endif}}
     def __dealloc__(self):
         pass
         {{if 'CUcheckpointRestoreArgs_st.gpuPairs' in found_struct}}
@@ -24522,12 +24746,6 @@ cdef class CUcheckpointRestoreArgs_st:
             except ValueError:
                 str_list += ['reserved : <ValueError>']
             {{endif}}
-            {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-            try:
-                str_list += ['reserved1 : ' + str(self.reserved1)]
-            except ValueError:
-                str_list += ['reserved1 : <ValueError>']
-            {{endif}}
             return '\n'.join(str_list)
         else:
             return ''
@@ -24566,11 +24784,11 @@ cdef class CUcheckpointRestoreArgs_st:
     {{if 'CUcheckpointRestoreArgs_st.reserved' in found_struct}}
     @property
     def reserved(self):
-        return PyBytes_FromStringAndSize(self._pvt_ptr[0].reserved, 44)
+        return PyBytes_FromStringAndSize(self._pvt_ptr[0].reserved, 52)
     @reserved.setter
     def reserved(self, reserved):
-        if len(reserved) != 44:
-            raise ValueError("reserved length must be 44, is " + str(len(reserved)))
+        if len(reserved) != 52:
+            raise ValueError("reserved length must be 52, is " + str(len(reserved)))
         if CHAR_MIN == 0:
             for i, b in enumerate(reserved):
                 if b < 0 and b > -129:
@@ -24582,24 +24800,6 @@ cdef class CUcheckpointRestoreArgs_st:
                     b = b - 256
                 self._pvt_ptr[0].reserved[i] = b
     {{endif}}
-    {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    @property
-    def reserved1(self):
-        return self._reserved1
-    @reserved1.setter
-    def reserved1(self, reserved1):
-        cdef cydriver.cuuint64_t cyreserved1
-        if reserved1 is None:
-            cyreserved1 = <cydriver.cuuint64_t><void_ptr>0
-        elif isinstance(reserved1, (cuuint64_t)):
-            preserved1 = int(reserved1)
-            cyreserved1 = <cydriver.cuuint64_t><void_ptr>preserved1
-        else:
-            preserved1 = int(cuuint64_t(reserved1))
-            cyreserved1 = <cydriver.cuuint64_t><void_ptr>preserved1
-        self._reserved1._pvt_ptr[0] = cyreserved1
-
-    {{endif}}
 {{endif}}
 {{if 'CUcheckpointUnlockArgs_st' in found_struct}}
 
@@ -24822,6 +25022,313 @@ cdef class CUmemDecompressParams_st:
             self._pvt_ptr[0].padding[i] = b
     {{endif}}
 {{endif}}
+{{if 'CUlogicalEndpointFabricHandle_st' in found_struct}}
+
+cdef class CUlogicalEndpointFabricHandle_st:
+    """
+    Fabric handle for a logical endpoint
+
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointFabricHandle_st.data' in found_struct}}
+    data : bytes
+        Raw handle contents, exactly 64 bytes long
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr = 0):
+        if _ptr == 0:
+            self._pvt_ptr = &self._pvt_val  # no address given: use the storage embedded in this instance
+        else:
+            self._pvt_ptr = <cydriver.CUlogicalEndpointFabricHandle_st *>_ptr  # view over caller-provided memory; never freed by this class
+    def __init__(self, void_ptr _ptr = 0):
+        pass
+    def __dealloc__(self):
+        pass
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr  # address of the underlying C struct as an integer
+    def __repr__(self):
+        if self._pvt_ptr is not NULL:
+            str_list = []
+            {{if 'CUlogicalEndpointFabricHandle_st.data' in found_struct}}
+            try:
+                str_list += ['data : ' + str(self.data)]
+            except ValueError:
+                str_list += ['data : <ValueError>']
+            {{endif}}
+            return '\n'.join(str_list)
+        else:
+            return ''
+    {{if 'CUlogicalEndpointFabricHandle_st.data' in found_struct}}
+    @property
+    def data(self):
+        return PyBytes_FromStringAndSize(<char*>self._pvt_ptr[0].data, 64)  # new bytes object holding a copy of the 64 bytes
+    @data.setter
+    def data(self, data):
+        if len(data) != 64:  # the whole array must be supplied; partial writes are rejected
+            raise ValueError("data length must be 64, is " + str(len(data)))
+        for i, b in enumerate(data):
+            self._pvt_ptr[0].data[i] = b  # stored as-is; presumably an unsigned char array (getter needs a <char*> cast) - confirm in cydriver.pxd
+    {{endif}}
+{{endif}}
+{{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+
+cdef class anon_struct25:
+    """
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
+    device : CUdevice
+        Owner device of the unicast logical endpoint
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr):
+        self._pvt_ptr = <cydriver.CUlogicalEndpointProp_struct *>_ptr  # address of the enclosing struct, not of its unicast member
+
+    def __init__(self, void_ptr _ptr):
+        pass
+        {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
+        self._device = CUdevice(_ptr=<void_ptr>&self._pvt_ptr[0].unicast.device)
+        {{endif}}
+    def __dealloc__(self):
+        pass
+    def getPtr(self):
+        return <void_ptr>&self._pvt_ptr[0].unicast  # address of the unicast member inside the enclosing struct
+    def __repr__(self):
+        if self._pvt_ptr is not NULL:
+            str_list = []
+            {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
+            try:
+                str_list += ['device : ' + str(self.device)]
+            except ValueError:
+                str_list += ['device : <ValueError>']
+            {{endif}}
+            return '\n'.join(str_list)
+        else:
+            return ''
+    {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
+    @property
+    def device(self):
+        return self._device  # CUdevice wrapper aliasing unicast.device in place
+    @device.setter
+    def device(self, device):
+        cdef cydriver.CUdevice cydevice
+        if device is None:
+            cydevice = <cydriver.CUdevice>0  # None is stored as device ordinal 0
+        elif isinstance(device, (CUdevice)):
+            pdevice = int(device)
+            cydevice = <cydriver.CUdevice>pdevice  # direct cast: going through the unsigned void_ptr would reject negative ordinals
+        else:
+            pdevice = int(CUdevice(device))  # coerce plain values through the CUdevice wrapper first
+            cydevice = <cydriver.CUdevice>pdevice  # same direct cast as cuLogicalEndpointAddDevice uses for its dev argument
+        self._device._pvt_ptr[0] = cydevice
+
+    {{endif}}
+{{endif}}
+{{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+
+cdef class anon_struct26:
+    """
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.multicast.numDevices' in found_struct}}
+    numDevices : unsigned int
+        Number of devices in the multicast logical endpoint team
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr):
+        self._pvt_ptr = <cydriver.CUlogicalEndpointProp_struct *>_ptr  # address of the enclosing struct, not of its multicast member
+
+    def __init__(self, void_ptr _ptr):
+        pass
+    def __dealloc__(self):
+        pass
+    def getPtr(self):
+        return <void_ptr>&self._pvt_ptr[0].multicast  # address of the multicast member inside the enclosing struct
+    def __repr__(self):
+        if self._pvt_ptr is not NULL:
+            str_list = []
+            {{if 'CUlogicalEndpointProp_struct.multicast.numDevices' in found_struct}}
+            try:
+                str_list += ['numDevices : ' + str(self.numDevices)]
+            except ValueError:
+                str_list += ['numDevices : <ValueError>']
+            {{endif}}
+            return '\n'.join(str_list)
+        else:
+            return ''
+    {{if 'CUlogicalEndpointProp_struct.multicast.numDevices' in found_struct}}
+    @property
+    def numDevices(self):
+        return self._pvt_ptr[0].multicast.numDevices  # read straight from the enclosing struct
+    @numDevices.setter
+    def numDevices(self, unsigned int numDevices):
+        self._pvt_ptr[0].multicast.numDevices = numDevices  # written straight into the enclosing struct
+    {{endif}}
+{{endif}}
+{{if 'CUlogicalEndpointProp_struct' in found_struct}}
+
+cdef class CUlogicalEndpointProp_struct:
+    """
+    Properties of a logical endpoint construction
+
+    Attributes
+    ----------
+    {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
+    type : CUlogicalEndpointType
+        Type of the logical endpoint defined in CUlogicalEndpointType
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+    unicast : anon_struct25
+        Unicast settings: the owner device of a unicast logical endpoint
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+    multicast : anon_struct26
+        Multicast settings: the number of devices in the multicast team
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.size' in found_struct}}
+    size : unsigned long long
+        Size of the logical endpoint
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
+    ipcHandleTypes : unsigned int
+        A bitmask of IPC handle types defined in
+        CUlogicalEndpointIpcHandleType
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
+    flags : unsigned int
+        A bitmask of flags defined in CUlogicalEndpointFlag
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr = 0):
+        if _ptr == 0:
+            self._val_ptr = <cydriver.CUlogicalEndpointProp_struct *>calloc(1, sizeof(cydriver.CUlogicalEndpointProp_struct))  # zero-filled heap struct owned by this instance
+            self._pvt_ptr = self._val_ptr
+        else:
+            self._pvt_ptr = <cydriver.CUlogicalEndpointProp_struct *>_ptr  # view over caller-provided memory; _val_ptr is not set on this path
+    def __init__(self, void_ptr _ptr = 0):
+        pass
+        {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+        self._unicast = anon_struct25(_ptr=<void_ptr>self._pvt_ptr)  # nested view receives the address of this struct
+        {{endif}}
+        {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+        self._multicast = anon_struct26(_ptr=<void_ptr>self._pvt_ptr)  # nested view receives the address of this struct
+        {{endif}}
+    def __dealloc__(self):
+        if self._val_ptr is not NULL:  # only the calloc'ed struct from __cinit__ is released
+            free(self._val_ptr)
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr  # address of the underlying C struct as an integer
+    def __repr__(self):
+        if self._pvt_ptr is not NULL:
+            str_list = []
+            {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
+            try:
+                str_list += ['type : ' + str(self.type)]
+            except ValueError:
+                str_list += ['type : <ValueError>']
+            {{endif}}
+            {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+            try:
+                str_list += ['unicast :\n' + '\n'.join(['    ' + line for line in str(self.unicast).splitlines()])]
+            except ValueError:
+                str_list += ['unicast : <ValueError>']
+            {{endif}}
+            {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+            try:
+                str_list += ['multicast :\n' + '\n'.join(['    ' + line for line in str(self.multicast).splitlines()])]
+            except ValueError:
+                str_list += ['multicast : <ValueError>']
+            {{endif}}
+            {{if 'CUlogicalEndpointProp_struct.size' in found_struct}}
+            try:
+                str_list += ['size : ' + str(self.size)]
+            except ValueError:
+                str_list += ['size : <ValueError>']
+            {{endif}}
+            {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
+            try:
+                str_list += ['ipcHandleTypes : ' + str(self.ipcHandleTypes)]
+            except ValueError:
+                str_list += ['ipcHandleTypes : <ValueError>']
+            {{endif}}
+            {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
+            try:
+                str_list += ['flags : ' + str(self.flags)]
+            except ValueError:
+                str_list += ['flags : <ValueError>']
+            {{endif}}
+            return '\n'.join(str_list)
+        else:
+            return ''
+    {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
+    @property
+    def type(self):
+        return CUlogicalEndpointType(self._pvt_ptr[0].type)  # stored integer wrapped back into the enum
+    @type.setter
+    def type(self, type not None : CUlogicalEndpointType):
+        self._pvt_ptr[0].type = int(type)
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
+    @property
+    def unicast(self):
+        return self._unicast  # the nested view created in __init__
+    @unicast.setter
+    def unicast(self, unicast not None : anon_struct25):
+        string.memcpy(&self._pvt_ptr[0].unicast, <cydriver.anon_struct25*><void_ptr>unicast.getPtr(), sizeof(self._pvt_ptr[0].unicast))  # copies the bytes; does not rebind self._unicast
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.multicast' in found_struct}}
+    @property
+    def multicast(self):
+        return self._multicast  # the nested view created in __init__
+    @multicast.setter
+    def multicast(self, multicast not None : anon_struct26):
+        string.memcpy(&self._pvt_ptr[0].multicast, <cydriver.anon_struct26*><void_ptr>multicast.getPtr(), sizeof(self._pvt_ptr[0].multicast))  # copies the bytes; does not rebind self._multicast
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.size' in found_struct}}
+    @property
+    def size(self):
+        return self._pvt_ptr[0].size
+    @size.setter
+    def size(self, unsigned long long size):
+        self._pvt_ptr[0].size = size
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
+    @property
+    def ipcHandleTypes(self):
+        return self._pvt_ptr[0].ipcHandleTypes
+    @ipcHandleTypes.setter
+    def ipcHandleTypes(self, unsigned int ipcHandleTypes):
+        self._pvt_ptr[0].ipcHandleTypes = ipcHandleTypes
+    {{endif}}
+    {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
+    @property
+    def flags(self):
+        return self._pvt_ptr[0].flags
+    @flags.setter
+    def flags(self, unsigned int flags):
+        self._pvt_ptr[0].flags = flags
+    {{endif}}
+{{endif}}
 {{if 'CUdevSmResource_st' in found_struct}}
 
 cdef class CUdevSmResource_st:
@@ -25398,7 +25905,7 @@ cdef class CUdevResource_st:
 {{endif}}
 {{if True}}
 
-cdef class anon_union16:
+cdef class anon_union17:
     """
     Attributes
     ----------
@@ -25480,7 +25987,7 @@ cdef class CUeglFrame_st:
     Attributes
     ----------
     {{if True}}
-    frame : anon_union16
+    frame : anon_union17
 
     {{endif}}
     {{if True}}
@@ -25534,7 +26041,7 @@ cdef class CUeglFrame_st:
     def __init__(self, void_ptr _ptr = 0):
         pass
         {{if True}}
-        self._frame = anon_union16(_ptr=<void_ptr>self._pvt_ptr)
+        self._frame = anon_union17(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -25612,8 +26119,8 @@ cdef class CUeglFrame_st:
     def frame(self):
         return self._frame
     @frame.setter
-    def frame(self, frame not None : anon_union16):
-        string.memcpy(&self._pvt_ptr[0].frame, <cydriver.anon_union16*><void_ptr>frame.getPtr(), sizeof(self._pvt_ptr[0].frame))
+    def frame(self, frame not None : anon_union17):
+        string.memcpy(&self._pvt_ptr[0].frame, <cydriver.anon_union17*><void_ptr>frame.getPtr(), sizeof(self._pvt_ptr[0].frame))
     {{endif}}
     {{if True}}
     @property
@@ -28200,18 +28707,14 @@ def cuCtxGetLimit(limit not None : CUlimit):
     Parameters
     ----------
     limit : :py:obj:`~.CUlimit`
-        Limit to query
+        Limit to query
 
     Returns
     -------
     CUresult
-        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_LIMIT`
-    pvalue : int
-        Returned size of limit
 
-    See Also
-    --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaDeviceGetLimit`
+    pvalue : int
+        Returned size of limit
     """
     cdef size_t pvalue = 0
     cdef cydriver.CUlimit cylimit = int(limit)
@@ -35581,6 +36084,16 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
     on a supported platform, will give a DMA_BUF handle mapped via PCIE
     BAR1 or will return an error otherwise.
 
+    If the device attribute
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED` is set and a
+    CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD is requested
+    for a device memory range then the returned dmabuf file descriptor may
+    be passed as the file descriptor argument to the mmap() system call.
+
+    For device memory on x86 systems the mapping will be a write combined
+    mapping. On coherent ARM platforms these mappings will be regular
+    cached memory. On all other platforms these mappings will be uncached.
+
     Parameters
     ----------
     dptr : :py:obj:`~.CUdeviceptr`
@@ -37046,9 +37559,11 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     of the host memory node. Specifying
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` as the
     :py:obj:`~.CUmemPoolProps`::CUmemLocation::type will result in
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. By default, the pool's memory
-    will be accessible from the device it is allocated on. In the case of
-    pools created with :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` or
+    :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+
+    By default, the pool's memory will be accessible from the device it is
+    allocated on. In the case of pools created with
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` or
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`, their default accessibility will
     be from the host CPU. Applications can control the maximum size of the
     pool by specifying a non-zero value for
@@ -37173,14 +37688,13 @@ def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None
 
     The memory location can be of one of
     :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
-    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`, or
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`. The allocation type can be
     one of :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED` or
     :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`. When the allocation type is
     :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also
     be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred
-    location for the managed memory pool. In all other cases, the call
-    returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+    location for the managed memory pool.
 
     Parameters
     ----------
@@ -37219,6 +37733,7 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme
     The memory location can be of one of
     :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, or
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`. The allocation type can be
     one of :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED` or
     :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`. When the allocation type is
@@ -37271,14 +37786,16 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme
 
     The memory location can be of one of
     :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
-    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`. The allocation type can be
     one of :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED` or
     :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`. When the allocation type is
     :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also
     be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred
-    location for the managed memory pool. In all other cases, the call
-    returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+    location for the managed memory pool.
+    :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED` can not be used with
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE_MEMORY_NODE`. In all other
+    cases, the call returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     When a memory pool is set as the current memory pool, the location
     parameter should be the same as the location of the pool. The location
@@ -38177,6 +38694,785 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not
     return (_CUresult_SUCCESS, granularity)
 {{endif}}
 
+{{if 'cuLogicalEndpointIdReserve' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointIdReserve(count):
+    """ Reserves a range of logical endpoint ids.
+
+    Reserves a range of logical endpoint ids starting at `*baseLeId` and
+    extending for `count`. The reserved ids can be used to create or import
+    logical endpoints via :py:obj:`~.cuLogicalEndpointCreate` or
+    :py:obj:`~.cuLogicalEndpointImport` respectively.
+
+    Parameters
+    ----------
+    count : :py:obj:`~.cuuint32_t`
+        The number of logical endpoint ids to reserve.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
+    baseLeId : :py:obj:`~.CUlogicalEndpointId`
+        If :py:obj:`~.cuLogicalEndpointIdReserve` returns CUDA_SUCCESS,
+        *baseLeId contains the base logical endpoint id of the reserved
+        logical endpoint id range.
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.cuuint32_t cycount
+    if count is None:
+        pcount = 0  # None is passed to the driver as a count of zero
+    elif isinstance(count, (cuuint32_t,)):
+        pcount = int(count)
+    else:
+        pcount = int(cuuint32_t(count))  # coerce plain values through the cuuint32_t wrapper first
+    cycount = <cydriver.cuuint32_t><void_ptr>pcount
+    cdef CUlogicalEndpointId baseLeId = CUlogicalEndpointId()  # output object the driver writes into
+    with nogil:  # the GIL is released for the duration of the driver call
+        err = cydriver.cuLogicalEndpointIdReserve(<cydriver.CUlogicalEndpointId*>baseLeId._pvt_ptr, cycount)
+    if err != cydriver.CUDA_SUCCESS:
+        return (_CUresult(err), None)  # on failure the id slot of the tuple is None
+    return (_CUresult_SUCCESS, baseLeId)
+{{endif}}
+
+{{if 'cuLogicalEndpointIdRelease' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointIdRelease(baseLeId, count):
+    """ Releases a range of logical endpoint ids.
+
+    Releases up to `count` logical endpoint ids starting at `baseLeId`. The
+    range of ids represented by [`baseLeId`, `baseLeId` + `count`) must all
+    be previously reserved. All logical endpoints in the range must be
+    destroyed before they can be released.
+
+    Parameters
+    ----------
+    baseLeId : :py:obj:`~.CUlogicalEndpointId`
+        First logical endpoint id to be released back to the system.
+    count : :py:obj:`~.cuuint32_t`
+        Number of logical endpoint ids to release back to the system.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.cuuint32_t cycount
+    if count is None:
+        pcount = 0  # None is passed to the driver as a count of zero
+    elif isinstance(count, (cuuint32_t,)):
+        pcount = int(count)
+    else:
+        pcount = int(cuuint32_t(count))  # coerce plain values through the cuuint32_t wrapper first
+    cycount = <cydriver.cuuint32_t><void_ptr>pcount
+    cdef cydriver.CUlogicalEndpointId cybaseLeId
+    if baseLeId is None:
+        pbaseLeId = 0  # None is passed to the driver as id 0
+    elif isinstance(baseLeId, (CUlogicalEndpointId,)):
+        pbaseLeId = int(baseLeId)
+    else:
+        pbaseLeId = int(CUlogicalEndpointId(baseLeId))  # coerce plain values through the CUlogicalEndpointId wrapper first
+    cybaseLeId = <cydriver.CUlogicalEndpointId><void_ptr>pbaseLeId
+    with nogil:  # the GIL is released for the duration of the driver call
+        err = cydriver.cuLogicalEndpointIdRelease(cybaseLeId, cycount)
+    return (_CUresult(err),)  # always a 1-tuple holding the status
+{{endif}}
+
+{{if 'cuLogicalEndpointCreate' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointCreate(leId, prop : Optional[CUlogicalEndpointProp]):
+    """ Creates a logical endpoint with the requested properties and associates it with the logical endpoint id.
+
+    This creates a logical endpoint as described by `prop`. The number of
+    participating devices is determined by the
+    :py:obj:`~.CUlogicalEndpointProp.type`. If the type is
+    :py:obj:`~.CU_LOGICAL_ENDPOINT_TYPE_UNICAST` then
+    :py:obj:`~.CUlogicalEndpointProp.unicast.device` specifies the owner
+    device of the unicast logical endpoint. If the type is
+    :py:obj:`~.CU_LOGICAL_ENDPOINT_TYPE_MULTICAST` then
+    :py:obj:`~.CUlogicalEndpointProp.multicast.numDevices` specifies the
+    number of devices in the multicast logical endpoint team.
+
+    Devices can be added to a multicast logical endpoint via
+    :py:obj:`~.cuLogicalEndpointAddDevice`. After all the participating
+    devices have been added, a call to :py:obj:`~.cuLogicalEndpointQuery`
+    must be made to ensure that the logical endpoint is ready for memory
+    binding and access.
+
+    A unicast logical endpoint does not have a notion of adding devices via
+    :py:obj:`~.cuLogicalEndpointAddDevice`. However, a call to
+    :py:obj:`~.cuLogicalEndpointQuery` must still be made to ensure that
+    the logical endpoint is ready for memory binding and access.
+
+    Memory is bound to the logical endpoint via either
+    :py:obj:`~.cuLogicalEndpointBindAddr` or
+    :py:obj:`~.cuLogicalEndpointBindMem`, and can be unbound via
+    :py:obj:`~.cuLogicalEndpointUnbind`. The total amount of memory that
+    can be bound per device is specified by
+    :py:obj:`~.CUlogicalEndpointProp.size`. This size must be a multiple of
+    the value for `bindAlignment` as returned by
+    :py:obj:`~.cuLogicalEndpointGetLimits`. The maximum size for the
+    logical endpoint cannot exceed the value for `maxSize` as returned by
+    :py:obj:`~.cuLogicalEndpointGetLimits`. The bind alignment and maximum
+    size depend on the properties of the logical endpoint.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint id that will be associated with the newly created
+        logical endpoint.
+    prop : :py:obj:`~.CUlogicalEndpointProp`
+        Properties of the logical endpoint to create.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0  # None is passed to the driver as id 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))  # coerce plain values through the CUlogicalEndpointId wrapper first
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    cdef cydriver.CUlogicalEndpointProp* cyprop_ptr = <cydriver.CUlogicalEndpointProp*>prop._pvt_ptr if prop is not None else NULL  # prop=None reaches the driver as a NULL pointer
+    with nogil:  # the GIL is released for the duration of the driver call
+        err = cydriver.cuLogicalEndpointCreate(cyleId, cyprop_ptr)
+    return (_CUresult(err),)  # always a 1-tuple holding the status
+{{endif}}
+
+{{if 'cuLogicalEndpointAddDevice' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointAddDevice(leId, dev):
+    """ Associates a device to a multicast logical endpoint.
+
+    Associates a device to a logical endpoint. The type of the logical
+    endpoint must be :py:obj:`~.CU_LOGICAL_ENDPOINT_TYPE_MULTICAST`. The
+    added device will be a part of the multicast team of size specified by
+    :py:obj:`~.CUlogicalEndpointProp.multicast.numDevices` during
+    :py:obj:`~.cuLogicalEndpointCreate`. The association of the device to
+    the multicast logical endpoint is permanent during the life time of the
+    multicast logical endpoint. All devices must be added to the multicast
+    logical endpoint before any memory can be bound to any device in the
+    team. A multicast logical endpoint will not be ready for use until all
+    devices have been added. User can query whether the logical endpoint is
+    ready for use via :py:obj:`~.cuLogicalEndpointQuery`.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint id representing a multicast logical endpoint.
+    dev : :py:obj:`~.CUdevice`
+        Device that will be associated with the multicast logical endpoint.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`,
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.CUdevice cydev
+    if dev is None:
+        pdev = 0  # None is passed to the driver as device ordinal 0
+    elif isinstance(dev, (CUdevice,)):
+        pdev = int(dev)
+    else:
+        pdev = int(CUdevice(dev))  # coerce plain values through the CUdevice wrapper first
+    cydev = <cydriver.CUdevice>pdev  # cast straight to CUdevice, without the void_ptr step used for leId below
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0  # None is passed to the driver as id 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))  # coerce plain values through the CUlogicalEndpointId wrapper first
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    with nogil:  # the GIL is released for the duration of the driver call
+        err = cydriver.cuLogicalEndpointAddDevice(cyleId, cydev)
+    return (_CUresult(err),)  # always a 1-tuple holding the status
+{{endif}}
+
+{{if 'cuLogicalEndpointDestroy' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointDestroy(leId):
+    """ Removes the association of the logical endpoint from the logical endpoint id.
+
+    Removes the association between the logical endpoint id and the logical
+    endpoint resources. Any memory bound by this process to any device
+    associated with the logical endpoint will be unbound. If this was the
+    last reference to the logical endpoint, all associated resources will
+    be destroyed.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint id of the logical endpoint to be destroyed.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    with nogil:
+        err = cydriver.cuLogicalEndpointDestroy(cyleId)
+    return (_CUresult(err),)
+{{endif}}
+
+{{if 'cuLogicalEndpointBindAddr' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, unsigned long long flags):
+    """ Bind a memory allocation represented by a virtual address to a logical endpoint.
+
+    Binds the memory allocation specified by its mapped address `ptr` to a
+    logical endpoint represented by `leId` at the offset `offset`. The
+    memory must have been allocated via :py:obj:`~.cuMemCreate` or
+    :py:obj:`~.cudaMallocAsync`. The intended `size` of the bind, the
+    `offset` in the logical endpoint range and `ptr` must be multiples of
+    the value for `bindAlignment` as returned by
+    :py:obj:`~.cuLogicalEndpointGetLimits`.
+
+    The `size` cannot be larger than the size of the allocated memory.
+    Similarly the `size` + `offset` cannot be larger than the total size of
+    the logical endpoint.
+
+    For device memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+    the memory allocation must have been created on the device specified by
+    `dev`. For host NUMA memory, i.e., type
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the memory allocation must
+    have been created on the CPU NUMA node closest to `dev`. That is, the
+    value returned when querying
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU
+    NUMA node where the memory was allocated.
+
+    For multicast endpoints, the device named by `dev` must have been added
+    to the multicast team via :py:obj:`~.cuLogicalEndpointAddDevice`.
+
+    For unicast endpoints the device named by `dev` must be the owner
+    device specified during :py:obj:`~.cuLogicalEndpointCreate` via
+    :py:obj:`~.CUlogicalEndpointProp.unicast.device`.
+
+    Externally shareable as well as imported multicast endpoints can be
+    bound only to externally shareable memory. Imported unicast endpoints
+    cannot be bound to any memory.
+
+    This call will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if
+    :py:obj:`~.cuLogicalEndpointQuery` has not been called for the logical
+    endpoint to ensure that the endpoint is ready for memory binding.
+
+    Note that this call will return :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` if
+    there are insufficient resources required to perform the bind. This
+    call may also return :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY` if the
+    necessary system software is not initialized or running. This call may
+    return :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system configuration
+    is in an illegal state. In such cases, to continue using logical
+    endpoints, verify that the system configuration is in a valid state and
+    all required driver daemons are running properly.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint to which memory will be associated.
+    dev : :py:obj:`~.CUdevice`
+        Device on which the memory will be bound to the logical endpoint
+    offset : Any
+        Offset into the logical endpoint space.
+    ptr : Any
+        Virtual address of the memory allocation.
+    size : Any
+        Size of memory that will be bound to the logical endpoint.
+    flags : unsigned long long
+        Flags for future use, must be zero for now.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.cuuint64_t cysize
+    if size is None:
+        psize = 0
+    elif isinstance(size, (cuuint64_t,)):
+        psize = int(size)
+    else:
+        psize = int(cuuint64_t(size))
+    cysize = <cydriver.cuuint64_t><void_ptr>psize
+    cdef cydriver.cuuint64_t cyoffset
+    if offset is None:
+        poffset = 0
+    elif isinstance(offset, (cuuint64_t,)):
+        poffset = int(offset)
+    else:
+        poffset = int(cuuint64_t(offset))
+    cyoffset = <cydriver.cuuint64_t><void_ptr>poffset
+    cdef cydriver.CUdevice cydev
+    if dev is None:
+        pdev = 0
+    elif isinstance(dev, (CUdevice,)):
+        pdev = int(dev)
+    else:
+        pdev = int(CUdevice(dev))
+    cydev = <cydriver.CUdevice>pdev
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    cdef _HelperInputVoidPtrStruct cyptrHelper
+    cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper)
+    with nogil:
+        err = cydriver.cuLogicalEndpointBindAddr(cyleId, cydev, cyoffset, cyptr, cysize, flags)
+    _helper_input_void_ptr_free(&cyptrHelper)
+    return (_CUresult(err),)
+{{endif}}
+
+{{if 'cuLogicalEndpointBindMem' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, unsigned long long flags):
+    """ Binds memory object represented by a handle to the logical endpoint.
+
+    Binds the memory allocation specified by `memHandle` to a logical
+    endpoint represented by `leId` at the offset `offset`. The memory must
+    have been allocated via :py:obj:`~.cuMemCreate`. The intended `size` of
+    the bind, the offset in the logical endpoint range `offset` and the
+    offset in the memory handle `memOffset` must be multiples of the value
+    for `bindAlignment` as returned by
+    :py:obj:`~.cuLogicalEndpointGetLimits`.
+
+    The `size` + `memOffset` cannot be larger than the size of the
+    allocated memory. Similarly the `size` + `offset` cannot be larger than
+    the total size of the logical endpoint.
+
+    For device memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+    the memory allocation must have been created on the device specified by
+    `dev`. For host NUMA memory, i.e., type
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the memory allocation must
+    have been created on the CPU NUMA node closest to `dev`. That is, the
+    value returned when querying
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU
+    NUMA node where the memory was allocated.
+
+    For multicast endpoints, the device named by `dev` must have been added
+    to the multicast team via :py:obj:`~.cuLogicalEndpointAddDevice`.
+
+    For unicast endpoints the device named by `dev` must be the owner
+    device specified during :py:obj:`~.cuLogicalEndpointCreate` via
+    :py:obj:`~.CUlogicalEndpointProp.unicast.device`.
+
+    Externally shareable as well as imported multicast endpoints can be
+    bound only to externally shareable memory. Imported unicast endpoints
+    cannot be bound to any memory.
+
+    This call will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if
+    :py:obj:`~.cuLogicalEndpointQuery` has not been called for the logical
+    endpoint to ensure that the endpoint is ready for memory binding.
+
+    Note that this call will return :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` if
+    there are insufficient resources required to perform the bind. This
+    call may also return :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY` if the
+    necessary system software is not initialized or running. This call may
+    return :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system configuration
+    is in an illegal state. In such cases, to continue using logical
+    endpoints, verify that the system configuration is in a valid state and
+    all required driver daemons are running properly.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint to which memory will be associated.
+    dev : :py:obj:`~.CUdevice`
+        Device on which the memory will be bound to the logical endpoint
+    offset : Any
+        Offset into the logical endpoint space.
+    memHandle : :py:obj:`~.CUmemGenericAllocationHandle`
+        Handle representing a memory allocation.
+    memOffset : Any
+        Offset into the memory for the attachment
+    size : Any
+        Size of memory that will be bound to the logical endpoint.
+    flags : unsigned long long
+        Flags for future use, must be zero for now.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.cuuint64_t cysize
+    if size is None:
+        psize = 0
+    elif isinstance(size, (cuuint64_t,)):
+        psize = int(size)
+    else:
+        psize = int(cuuint64_t(size))
+    cysize = <cydriver.cuuint64_t><void_ptr>psize
+    cdef cydriver.cuuint64_t cymemOffset
+    if memOffset is None:
+        pmemOffset = 0
+    elif isinstance(memOffset, (cuuint64_t,)):
+        pmemOffset = int(memOffset)
+    else:
+        pmemOffset = int(cuuint64_t(memOffset))
+    cymemOffset = <cydriver.cuuint64_t><void_ptr>pmemOffset
+    cdef cydriver.CUmemGenericAllocationHandle cymemHandle
+    if memHandle is None:
+        pmemHandle = 0
+    elif isinstance(memHandle, (CUmemGenericAllocationHandle,)):
+        pmemHandle = int(memHandle)
+    else:
+        pmemHandle = int(CUmemGenericAllocationHandle(memHandle))
+    cymemHandle = <cydriver.CUmemGenericAllocationHandle><void_ptr>pmemHandle
+    cdef cydriver.cuuint64_t cyoffset
+    if offset is None:
+        poffset = 0
+    elif isinstance(offset, (cuuint64_t,)):
+        poffset = int(offset)
+    else:
+        poffset = int(cuuint64_t(offset))
+    cyoffset = <cydriver.cuuint64_t><void_ptr>poffset
+    cdef cydriver.CUdevice cydev
+    if dev is None:
+        pdev = 0
+    elif isinstance(dev, (CUdevice,)):
+        pdev = int(dev)
+    else:
+        pdev = int(CUdevice(dev))
+    cydev = <cydriver.CUdevice>pdev
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    with nogil:
+        err = cydriver.cuLogicalEndpointBindMem(cyleId, cydev, cyoffset, cymemHandle, cymemOffset, cysize, flags)
+    return (_CUresult(err),)
+{{endif}}
+
+{{if 'cuLogicalEndpointUnbind' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointUnbind(leId, dev, offset, size):
+    """ Unbinds any binding at offset from the logical endpoint.
+
+    Unbinds any memory allocations bound to the logical endpoint on `dev`
+    at `offset` and up to the given `size`. The intended `size` of the
+    unbind and the offset in the logical endpoint range `offset` must be
+    multiples of the value for `bindAlignment` as returned by
+    :py:obj:`~.cuLogicalEndpointGetLimits`.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint id representing a logical endpoint.
+    dev : :py:obj:`~.CUdevice`
+        Device on which the memory is bound to the logical endpoint
+    offset : Any
+        Offset into the logical endpoint.
+    size : Any
+        Desired size to unbind.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+
+    Notes
+    -----
+    The `offset` must correspond to a value specified during a bind call. The `size` must either match the bind call of the offset or be the combined `size` of multiple bind calls. The `size` + `offset` must fully enclose all bindings that are covered.
+    """
+    cdef cydriver.cuuint64_t cysize
+    if size is None:
+        psize = 0
+    elif isinstance(size, (cuuint64_t,)):
+        psize = int(size)
+    else:
+        psize = int(cuuint64_t(size))
+    cysize = <cydriver.cuuint64_t><void_ptr>psize
+    cdef cydriver.cuuint64_t cyoffset
+    if offset is None:
+        poffset = 0
+    elif isinstance(offset, (cuuint64_t,)):
+        poffset = int(offset)
+    else:
+        poffset = int(cuuint64_t(offset))
+    cyoffset = <cydriver.cuuint64_t><void_ptr>poffset
+    cdef cydriver.CUdevice cydev
+    if dev is None:
+        pdev = 0
+    elif isinstance(dev, (CUdevice,)):
+        pdev = int(dev)
+    else:
+        pdev = int(CUdevice(dev))
+    cydev = <cydriver.CUdevice>pdev
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    with nogil:
+        err = cydriver.cuLogicalEndpointUnbind(cyleId, cydev, cyoffset, cysize)
+    return (_CUresult(err),)
+{{endif}}
+
+{{if 'cuLogicalEndpointExport' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointExport(leId, handleType not None : CUlogicalEndpointIpcHandleType):
+    """ Exports a logical endpoint associated with leId to an IPC handle.
+
+    Given a logical endpoint id `leId`, create a shareable handle `handle`
+    that can be used to share the logical endpoint with other processes.
+    The recipient process can convert the shareable handle back into a
+    logical endpoint id using :py:obj:`~.cuLogicalEndpointImport`. The
+    implementation of what this `handle` is and how it can be transferred is
+    defined by the requested handle type in `handleType`.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint id of logical endpoint.
+    handleType : :py:obj:`~.CUlogicalEndpointIpcHandleType`
+        Type of shareable handle requested. Defines type and size of the
+        handle output parameter.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
+    handle : :py:obj:`~.CUlogicalEndpointFabricHandle`
+        Exported shareable handle of the requested type; ``None`` when
+        the call does not return :py:obj:`~.CUDA_SUCCESS`.
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    cdef CUlogicalEndpointFabricHandle handle = CUlogicalEndpointFabricHandle()
+    cdef cydriver.CUlogicalEndpointIpcHandleType cyhandleType = int(handleType)
+    with nogil:
+        err = cydriver.cuLogicalEndpointExport(<void*><cydriver.CUlogicalEndpointFabricHandle*>handle._pvt_ptr, cyleId, cyhandleType)
+    if err != cydriver.CUDA_SUCCESS:
+        return (_CUresult(err), None)
+    return (_CUresult_SUCCESS, handle)
+{{endif}}
+
+{{if 'cuLogicalEndpointImport' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointImport(leId, handle, handleType not None : CUlogicalEndpointIpcHandleType):
+    """ Imports a logical endpoint from the given IPC handle and associates it with a logical endpoint id.
+
+    Imports a logical endpoint from the given IPC `handle` and associates
+    it with the logical endpoint id specified by `leId`.
+
+    If the current process cannot support the logical endpoint described by
+    the shareable handle, this API will error as
+    :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`. If `handle` is of type
+    :py:obj:`~.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC` and the importer
+    process does not have access permissions, then
+    :py:obj:`~.CUDA_ERROR_NOT_PERMITTED` will be returned.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        Logical endpoint id that will be used to access the exported
+        logical endpoint.
+    handle : Any
+        Shareable handle representing the logical endpoint that is to be
+        imported.
+    handleType : :py:obj:`~.CUlogicalEndpointIpcHandleType`
+        Handle type of the exported handle
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointGetLimits`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    cdef _HelperInputVoidPtrStruct cyhandleHelper
+    cdef void* cyhandle = _helper_input_void_ptr(handle, &cyhandleHelper)
+    cdef cydriver.CUlogicalEndpointIpcHandleType cyhandleType = int(handleType)
+    with nogil:
+        err = cydriver.cuLogicalEndpointImport(cyleId, cyhandle, cyhandleType)
+    _helper_input_void_ptr_free(&cyhandleHelper)
+    return (_CUresult(err),)
+{{endif}}
+
+{{if 'cuLogicalEndpointGetLimits' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointGetLimits(prop : Optional[CUlogicalEndpointProp]):
+    """ Calculates the minimum alignment and the maximum size for the given logical endpoint properties.
+
+    The `bindAlignment` can be used as a multiple for size and bind offset
+    values. The `maxSize` is the maximum size of the logical endpoint. If
+    `maxSize` is less than :py:obj:`~.CUlogicalEndpointProp.size`, the user
+    must adjust the request to the smaller value.
+
+    Parameters
+    ----------
+    prop : :py:obj:`~.CUlogicalEndpointProp`
+        Properties of the logical endpoint.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
+    bindAlignment : :py:obj:`~.cuuint64_t`
+        Minimum alignment granularity of the proposed logical endpoint.
+    maxSize : :py:obj:`~.cuuint64_t`
+        Maximum size of the logical endpoint.
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointQuery`
+    """
+    cdef cuuint64_t bindAlignment = cuuint64_t()
+    cdef cuuint64_t maxSize = cuuint64_t()
+    cdef cydriver.CUlogicalEndpointProp* cyprop_ptr = <cydriver.CUlogicalEndpointProp*>prop._pvt_ptr if prop is not None else NULL
+    with nogil:
+        err = cydriver.cuLogicalEndpointGetLimits(<cydriver.cuuint64_t*>bindAlignment._pvt_ptr, <cydriver.cuuint64_t*>maxSize._pvt_ptr, cyprop_ptr)
+    if err != cydriver.CUDA_SUCCESS:
+        return (_CUresult(err), None, None)
+    return (_CUresult_SUCCESS, bindAlignment, maxSize)
+{{endif}}
+
+{{if 'cuLogicalEndpointQuery' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLogicalEndpointQuery(leId, count):
+    """ Determines if all logical endpoints in the range have been successfully constructed.
+
+    Queries the driver to determine if all logical endpoints in the given
+    range starting at `leId` and extending for `count` have been
+    successfully constructed.
+
+    Provides a mechanism to ensure that it is safe to begin using a logical
+    endpoint ID. Using a logical endpoint ID before verifying that it is
+    fully constructed can result in undefined behavior.
+
+    This is not a blocking API, it returns immediately with a `queryStatus`
+    of 0 if any logical endpoint ID in the given range is not fully
+    constructed, and a non-zero value otherwise.
+
+    Parameters
+    ----------
+    leId : :py:obj:`~.CUlogicalEndpointId`
+        First logical endpoint ID to be queried.
+    count : Any
+        Number of logical endpoint IDs to be queried.
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
+    queryStatus : int
+        Status of the logical endpoints. Returns 0 if any logical endpoint
+        in the given range is not fully constructed, and non-zero if all
+        logical endpoints in the given range are fully constructed.
+
+    See Also
+    --------
+    :py:obj:`~.cuLogicalEndpointCreate`, :py:obj:`~.cuLogicalEndpointIdReserve`, :py:obj:`~.cuLogicalEndpointIdRelease`, :py:obj:`~.cuLogicalEndpointAddDevice`, :py:obj:`~.cuLogicalEndpointDestroy`, :py:obj:`~.cuLogicalEndpointBindAddr`, :py:obj:`~.cuLogicalEndpointBindMem`, :py:obj:`~.cuLogicalEndpointUnbind`, :py:obj:`~.cuLogicalEndpointExport`, :py:obj:`~.cuLogicalEndpointImport`, :py:obj:`~.cuLogicalEndpointGetLimits`
+    """
+    cdef cydriver.cuuint32_t cycount
+    if count is None:
+        pcount = 0
+    elif isinstance(count, (cuuint32_t,)):
+        pcount = int(count)
+    else:
+        pcount = int(cuuint32_t(count))
+    cycount = <cydriver.cuuint32_t><void_ptr>pcount
+    cdef cydriver.CUlogicalEndpointId cyleId
+    if leId is None:
+        pleId = 0
+    elif isinstance(leId, (CUlogicalEndpointId,)):
+        pleId = int(leId)
+    else:
+        pleId = int(CUlogicalEndpointId(leId))
+    cyleId = <cydriver.CUlogicalEndpointId><void_ptr>pleId
+    cdef int queryStatus = 0
+    with nogil:
+        err = cydriver.cuLogicalEndpointQuery(cyleId, cycount, &queryStatus)
+    if err != cydriver.CUDA_SUCCESS:
+        return (_CUresult(err), None)
+    return (_CUresult_SUCCESS, queryStatus)
+{{endif}}
+
 {{if 'cuPointerGetAttribute' in found_functions}}
 
 @cython.embedsignature(True)
@@ -39549,6 +40845,10 @@ def cuStreamCreateWithPriority(unsigned int flags, int priority):
 def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstreamCigCaptureParams]):
     """ Begins capture to CIG on a stream.
 
+    Support for CIG streams with D3D12 can be determined using
+    :py:obj:`~.cuDeviceGetAttribute()` with
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED`.
+
     Begin CIG (CUDA in Graphics) capture on `hStream` for the graphics API
     as provided in `streamCigCaptureParams`. When a stream is in CIG
     capture mode, all operations pushed into the stream will not be
@@ -40242,6 +41542,95 @@ def cuStreamBeginCapture(hStream, mode not None : CUstreamCaptureMode):
     return (_CUresult(err),)
 {{endif}}
 
+{{if 'cuStreamBeginRecaptureToGraph' in found_functions}}
+
+@cython.embedsignature(True)
+def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode, hGraph, callbackFunc, userData):
+    """ Begin graph capture on a stream to an existing graph.
+
+    Begin graph capture on `hStream` to the existing `hGraph`. The node
+    creation order while recapturing the graph must be identical to the
+    original graph. The recapture will fail immediately for:
+
+    - Topology mismatches between the existing graph and the recaptured
+      graph
+
+    - Parameter mismatches for memory allocation or free nodes
+
+    Any other node parameter mismatches during recapture can be configured
+    to call the function provided in `callbackFunc`. The recapture will
+    fail immediately if the callback returns anything other than
+    :py:obj:`~.CUDA_SUCCESS`.
+
+    If the recapture fails for any reason, `hGraph` will be in an
+    undefined state and should be destroyed.
+
+    See :py:obj:`~.cuStreamBeginCapture` for additional detail on beginning
+    the capture.
+
+    Parameters
+    ----------
+    hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
+        Stream in which to initiate capture
+    mode : :py:obj:`~.CUstreamCaptureMode`
+        Controls the interaction of this capture sequence with other API
+        calls that are potentially unsafe. For more details see
+        :py:obj:`~.cuThreadExchangeStreamCaptureMode`.
+    hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
+        Existing CUDA graph to be captured into
+    callbackFunc : :py:obj:`~.CUgraphRecaptureCallback`
+        Function that will be called for all parameter mismatches from the
+        original graph
+    userData : Any
+        A generic pointer to user data that is passed into the callback
+        function
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
+
+    See Also
+    --------
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamBeginCapture`, :py:obj:`~.cuStreamIsCapturing`, :py:obj:`~.cuStreamEndCapture`, :py:obj:`~.cuThreadExchangeStreamCaptureMode`
+
+    Notes
+    -----
+    Any user objects associated with `hGraph` will be released prior to the recapture.
+    """
+    cdef cydriver.CUgraphRecaptureCallback cycallbackFunc
+    if callbackFunc is None:
+        pcallbackFunc = 0
+    elif isinstance(callbackFunc, (CUgraphRecaptureCallback,)):
+        pcallbackFunc = int(callbackFunc)
+    else:
+        pcallbackFunc = int(CUgraphRecaptureCallback(callbackFunc))
+    cycallbackFunc = <cydriver.CUgraphRecaptureCallback><void_ptr>pcallbackFunc
+    cdef cydriver.CUgraph cyhGraph
+    if hGraph is None:
+        phGraph = 0
+    elif isinstance(hGraph, (CUgraph,)):
+        phGraph = int(hGraph)
+    else:
+        phGraph = int(CUgraph(hGraph))
+    cyhGraph = <cydriver.CUgraph><void_ptr>phGraph
+    cdef cydriver.CUstream cyhStream
+    if hStream is None:
+        phStream = 0
+    elif isinstance(hStream, (CUstream,)):
+        phStream = int(hStream)
+    else:
+        phStream = int(CUstream(hStream))
+    cyhStream = <cydriver.CUstream><void_ptr>phStream
+    cdef cydriver.CUstreamCaptureMode cymode = int(mode)
+    cdef _HelperInputVoidPtrStruct cyuserDataHelper
+    cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper)
+    with nogil:
+        err = cydriver.cuStreamBeginRecaptureToGraph(cyhStream, cymode, cyhGraph, cycallbackFunc, cyuserData)
+    _helper_input_void_ptr_free(&cyuserDataHelper)
+    return (_CUresult(err),)
+{{endif}}
+
 {{if 'cuStreamBeginCaptureToGraph' in found_functions}}
 
 @cython.embedsignature(True)
@@ -43675,180 +45064,6 @@ def cuLaunchCooperativeKernel(f, unsigned int gridDimX, unsigned int gridDimY, u
     return (_CUresult(err),)
 {{endif}}
 
-{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
-
-@cython.embedsignature(True)
-def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_LAUNCH_PARAMS] | list[CUDA_LAUNCH_PARAMS]], unsigned int numDevices, unsigned int flags):
-    """ Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute.
-
-    [Deprecated]
-
-    Invokes kernels as specified in the `launchParamsList` array where each
-    element of the array specifies all the parameters required to perform a
-    single kernel launch. These kernels can cooperate and synchronize as
-    they execute. The size of the array is specified by `numDevices`.
-
-    No two kernels can be launched on the same device. All the devices
-    targeted by this multi-device launch must be identical. All devices
-    must have a non-zero value for the device attribute
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH`.
-
-    All kernels launched must be identical with respect to the compiled
-    code. Note that any device, constant or managed variables present in
-    the module that owns the kernel launched on each device, are
-    independently instantiated on every device. It is the application's
-    responsibility to ensure these variables are initialized and used
-    appropriately.
-
-    The size of the grids as specified in blocks, the size of the blocks
-    themselves and the amount of shared memory used by each thread block
-    must also match across all launched kernels.
-
-    The streams used to launch these kernels must have been created via
-    either :py:obj:`~.cuStreamCreate` or
-    :py:obj:`~.cuStreamCreateWithPriority`. The NULL stream or
-    :py:obj:`~.CU_STREAM_LEGACY` or :py:obj:`~.CU_STREAM_PER_THREAD` cannot
-    be used.
-
-    The total number of blocks launched per kernel cannot exceed the
-    maximum number of blocks per multiprocessor as returned by
-    :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessor` (or
-    :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`) times
-    the number of multiprocessors as specified by the device attribute
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT`. Since the total
-    number of blocks launched per device has to match across all devices,
-    the maximum number of blocks that can be launched per device will be
-    limited by the device with the least number of multiprocessors.
-
-    The kernels cannot make use of CUDA dynamic parallelism.
-
-    The :py:obj:`~.CUDA_LAUNCH_PARAMS` structure is defined as:
-
-    **View CUDA Toolkit Documentation for a C++ code example**
-
-    where:
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.function` specifies the kernel to be
-      launched. All functions must be identical with respect to the
-      compiled code. Note that you can also specify context-less kernel
-      :py:obj:`~.CUkernel` by querying the handle using
-      :py:obj:`~.cuLibraryGetKernel()` and then casting to
-      :py:obj:`~.CUfunction`. In this case, the context to launch the
-      kernel on be taken from the specified stream
-      :py:obj:`~.CUDA_LAUNCH_PARAMS.hStream`.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.gridDimX` is the width of the grid in
-      blocks. This must match across all kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.gridDimY` is the height of the grid in
-      blocks. This must match across all kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.gridDimZ` is the depth of the grid in
-      blocks. This must match across all kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.blockDimX` is the X dimension of each
-      thread block. This must match across all kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.blockDimX` is the Y dimension of each
-      thread block. This must match across all kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.blockDimZ` is the Z dimension of each
-      thread block. This must match across all kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.sharedMemBytes` is the dynamic shared-
-      memory size per thread block in bytes. This must match across all
-      kernels launched.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.hStream` is the handle to the stream to
-      perform the launch in. This cannot be the NULL stream or
-      :py:obj:`~.CU_STREAM_LEGACY` or :py:obj:`~.CU_STREAM_PER_THREAD`. The
-      CUDA context associated with this stream must match that associated
-      with :py:obj:`~.CUDA_LAUNCH_PARAMS.function`.
-
-    - :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` is an array of pointers
-      to kernel parameters. If :py:obj:`~.CUDA_LAUNCH_PARAMS.function` has
-      N parameters, then :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` needs
-      to be an array of N pointers. Each of
-      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams`[0] through
-      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams`[N-1] must point to a
-      region of memory from which the actual kernel parameter will be
-      copied. The number of kernel parameters and their offsets and sizes
-      do not need to be specified as that information is retrieved directly
-      from the kernel's image.
-
-    By default, the kernel won't begin execution on any GPU until all prior
-    work in all the specified streams has completed. This behavior can be
-    overridden by specifying the flag
-    :py:obj:`~.CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC`.
-    When this flag is specified, each kernel will only wait for prior work
-    in the stream corresponding to that GPU to complete before it begins
-    execution.
-
-    Similarly, by default, any subsequent work pushed in any of the
-    specified streams will not begin execution until the kernels on all
-    GPUs have completed. This behavior can be overridden by specifying the
-    flag
-    :py:obj:`~.CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC`.
-    When this flag is specified, any subsequent work pushed in any of the
-    specified streams will only wait for the kernel launched on the GPU
-    corresponding to that stream to complete before it begins execution.
-
-    Calling :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()` sets
-    persistent function state that is the same as function state set
-    through :py:obj:`~.cuLaunchKernel` API when called individually for
-    each element in `launchParamsList`.
-
-    When kernels are launched via
-    :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()`, the previous block
-    shape, shared size and parameter info associated with each
-    :py:obj:`~.CUDA_LAUNCH_PARAMS.function` in `launchParamsList` is
-    overwritten.
-
-    Note that to use :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()`,
-    the kernels must either have been compiled with toolchain version 3.2
-    or later so that it will contain kernel parameter information, or have
-    no kernel parameters. If either of these conditions is not met, then
-    :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()` will return
-    :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`.
-
-    Parameters
-    ----------
-    launchParamsList : list[:py:obj:`~.CUDA_LAUNCH_PARAMS`]
-        List of launch parameters, one per device
-    numDevices : unsigned int
-        Size of the `launchParamsList` array
-    flags : unsigned int
-        Flags to control launch behavior
-
-    Returns
-    -------
-    CUresult
-        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
-
-    See Also
-    --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchCooperativeKernel`, :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice`
-    """
-    launchParamsList = [] if launchParamsList is None else launchParamsList
-    if not all(isinstance(_x, (CUDA_LAUNCH_PARAMS,)) for _x in launchParamsList):
-        raise TypeError("Argument 'launchParamsList' is not instance of type (expected tuple[cydriver.CUDA_LAUNCH_PARAMS,] or list[cydriver.CUDA_LAUNCH_PARAMS,]")
-    cdef cydriver.CUDA_LAUNCH_PARAMS* cylaunchParamsList = NULL
-    if len(launchParamsList) > 1:
-        cylaunchParamsList = <cydriver.CUDA_LAUNCH_PARAMS*> calloc(len(launchParamsList), sizeof(cydriver.CUDA_LAUNCH_PARAMS))
-        if cylaunchParamsList is NULL:
-            raise MemoryError('Failed to allocate length x size memory: ' + str(len(launchParamsList)) + 'x' + str(sizeof(cydriver.CUDA_LAUNCH_PARAMS)))
-        for idx in range(len(launchParamsList)):
-            string.memcpy(&cylaunchParamsList[idx], (<CUDA_LAUNCH_PARAMS>launchParamsList[idx])._pvt_ptr, sizeof(cydriver.CUDA_LAUNCH_PARAMS))
-    elif len(launchParamsList) == 1:
-        cylaunchParamsList = (<CUDA_LAUNCH_PARAMS>launchParamsList[0])._pvt_ptr
-    if numDevices > len(launchParamsList): raise RuntimeError("List is too small: " + str(len(launchParamsList)) + " < " + str(numDevices))
-    with nogil:
-        err = cydriver.cuLaunchCooperativeKernelMultiDevice(cylaunchParamsList, numDevices, flags)
-    if len(launchParamsList) > 1 and cylaunchParamsList is not NULL:
-        free(cylaunchParamsList)
-    return (_CUresult(err),)
-{{endif}}
-
 {{if 'cuLaunchHostFunc' in found_functions}}
 
 ctypedef struct cuHostCallbackData_st:
@@ -44484,6 +45699,180 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
     return (_CUresult(err),)
 {{endif}}
 
+{{if 'cuLaunchCooperativeKernelMultiDevice' in found_functions}}
+
+@cython.embedsignature(True)
+def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_LAUNCH_PARAMS] | list[CUDA_LAUNCH_PARAMS]], unsigned int numDevices, unsigned int flags):
+    """ Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute.
+
+    [Deprecated]
+
+    Invokes kernels as specified in the `launchParamsList` array where each
+    element of the array specifies all the parameters required to perform a
+    single kernel launch. These kernels can cooperate and synchronize as
+    they execute. The size of the array is specified by `numDevices`.
+
+    No two kernels can be launched on the same device. All the devices
+    targeted by this multi-device launch must be identical. All devices
+    must have a non-zero value for the device attribute
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH`.
+
+    All kernels launched must be identical with respect to the compiled
+    code. Note that any device, constant or managed variables present in
+    the module that owns the kernel launched on each device, are
+    independently instantiated on every device. It is the application's
+    responsibility to ensure these variables are initialized and used
+    appropriately.
+
+    The size of the grids as specified in blocks, the size of the blocks
+    themselves and the amount of shared memory used by each thread block
+    must also match across all launched kernels.
+
+    The streams used to launch these kernels must have been created via
+    either :py:obj:`~.cuStreamCreate` or
+    :py:obj:`~.cuStreamCreateWithPriority`. The NULL stream or
+    :py:obj:`~.CU_STREAM_LEGACY` or :py:obj:`~.CU_STREAM_PER_THREAD` cannot
+    be used.
+
+    The total number of blocks launched per kernel cannot exceed the
+    maximum number of blocks per multiprocessor as returned by
+    :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessor` (or
+    :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`) times
+    the number of multiprocessors as specified by the device attribute
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT`. Since the total
+    number of blocks launched per device has to match across all devices,
+    the maximum number of blocks that can be launched per device will be
+    limited by the device with the least number of multiprocessors.
+
+    The kernels cannot make use of CUDA dynamic parallelism.
+
+    The :py:obj:`~.CUDA_LAUNCH_PARAMS` structure is defined as:
+
+    **View CUDA Toolkit Documentation for a C++ code example**
+
+    where:
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.function` specifies the kernel to be
+      launched. All functions must be identical with respect to the
+      compiled code. Note that you can also specify context-less kernel
+      :py:obj:`~.CUkernel` by querying the handle using
+      :py:obj:`~.cuLibraryGetKernel()` and then casting to
+      :py:obj:`~.CUfunction`. In this case, the context to launch the
+      kernel on will be taken from the specified stream
+      :py:obj:`~.CUDA_LAUNCH_PARAMS.hStream`.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.gridDimX` is the width of the grid in
+      blocks. This must match across all kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.gridDimY` is the height of the grid in
+      blocks. This must match across all kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.gridDimZ` is the depth of the grid in
+      blocks. This must match across all kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.blockDimX` is the X dimension of each
+      thread block. This must match across all kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.blockDimY` is the Y dimension of each
+      thread block. This must match across all kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.blockDimZ` is the Z dimension of each
+      thread block. This must match across all kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.sharedMemBytes` is the dynamic shared-
+      memory size per thread block in bytes. This must match across all
+      kernels launched.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.hStream` is the handle to the stream to
+      perform the launch in. This cannot be the NULL stream or
+      :py:obj:`~.CU_STREAM_LEGACY` or :py:obj:`~.CU_STREAM_PER_THREAD`. The
+      CUDA context associated with this stream must match that associated
+      with :py:obj:`~.CUDA_LAUNCH_PARAMS.function`.
+
+    - :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` is an array of pointers
+      to kernel parameters. If :py:obj:`~.CUDA_LAUNCH_PARAMS.function` has
+      N parameters, then :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` needs
+      to be an array of N pointers. Each of
+      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams`[0] through
+      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams`[N-1] must point to a
+      region of memory from which the actual kernel parameter will be
+      copied. The number of kernel parameters and their offsets and sizes
+      do not need to be specified as that information is retrieved directly
+      from the kernel's image.
+
+    By default, the kernel won't begin execution on any GPU until all prior
+    work in all the specified streams has completed. This behavior can be
+    overridden by specifying the flag
+    :py:obj:`~.CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC`.
+    When this flag is specified, each kernel will only wait for prior work
+    in the stream corresponding to that GPU to complete before it begins
+    execution.
+
+    Similarly, by default, any subsequent work pushed in any of the
+    specified streams will not begin execution until the kernels on all
+    GPUs have completed. This behavior can be overridden by specifying the
+    flag
+    :py:obj:`~.CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC`.
+    When this flag is specified, any subsequent work pushed in any of the
+    specified streams will only wait for the kernel launched on the GPU
+    corresponding to that stream to complete before it begins execution.
+
+    Calling :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()` sets
+    persistent function state that is the same as function state set
+    through :py:obj:`~.cuLaunchKernel` API when called individually for
+    each element in `launchParamsList`.
+
+    When kernels are launched via
+    :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()`, the previous block
+    shape, shared size and parameter info associated with each
+    :py:obj:`~.CUDA_LAUNCH_PARAMS.function` in `launchParamsList` is
+    overwritten.
+
+    Note that to use :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()`,
+    the kernels must either have been compiled with toolchain version 3.2
+    or later so that it will contain kernel parameter information, or have
+    no kernel parameters. If either of these conditions is not met, then
+    :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()` will return
+    :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`.
+
+    Parameters
+    ----------
+    launchParamsList : list[:py:obj:`~.CUDA_LAUNCH_PARAMS`]
+        List of launch parameters, one per device
+    numDevices : unsigned int
+        Size of the `launchParamsList` array
+    flags : unsigned int
+        Flags to control launch behavior
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
+
+    See Also
+    --------
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchCooperativeKernel`, :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice`
+    """
+    launchParamsList = [] if launchParamsList is None else launchParamsList
+    if not all(isinstance(_x, (CUDA_LAUNCH_PARAMS,)) for _x in launchParamsList):
+        raise TypeError("Argument 'launchParamsList' is not instance of type (expected tuple[cydriver.CUDA_LAUNCH_PARAMS,] or list[cydriver.CUDA_LAUNCH_PARAMS,]")
+    if numDevices > len(launchParamsList): raise RuntimeError("List is too small: " + str(len(launchParamsList)) + " < " + str(numDevices))  # checked before calloc() so this raise cannot leak the buffer
+    cdef cydriver.CUDA_LAUNCH_PARAMS* cylaunchParamsList = NULL
+    if len(launchParamsList) > 1:
+        cylaunchParamsList = <cydriver.CUDA_LAUNCH_PARAMS*> calloc(len(launchParamsList), sizeof(cydriver.CUDA_LAUNCH_PARAMS))
+        if cylaunchParamsList is NULL:
+            raise MemoryError('Failed to allocate length x size memory: ' + str(len(launchParamsList)) + 'x' + str(sizeof(cydriver.CUDA_LAUNCH_PARAMS)))
+        for idx in range(len(launchParamsList)):
+            string.memcpy(&cylaunchParamsList[idx], (<CUDA_LAUNCH_PARAMS>launchParamsList[idx])._pvt_ptr, sizeof(cydriver.CUDA_LAUNCH_PARAMS))
+    elif len(launchParamsList) == 1:
+        cylaunchParamsList = (<CUDA_LAUNCH_PARAMS>launchParamsList[0])._pvt_ptr  # single element: borrow the wrapper's own struct, nothing to free
+    with nogil:
+        err = cydriver.cuLaunchCooperativeKernelMultiDevice(cylaunchParamsList, numDevices, flags)
+    if len(launchParamsList) > 1 and cylaunchParamsList is not NULL:  # only the calloc'd copy is owned here
+        free(cylaunchParamsList)
+    return (_CUresult(err),)
+{{endif}}
+
 {{if 'cuParamSetTexRef' in found_functions}}
 
 @cython.embedsignature(True)
@@ -54222,6 +55611,28 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
         as :py:obj:`~.CU_COREDUMP_TRIGGER_HOST` but better reflects the
         default behavior.
 
+      - :py:obj:`~.CU_COREDUMP_SKIP_CONSTBANK_MEMORY` - Coredump will not
+        include constbank memory.
+
+      - :py:obj:`~.CU_COREDUMP_GZIP_COMPRESS` - The generated coredump will
+        be compressed with gzip, and .gz suffix will be appended to the
+        filename, if it's not a part of it already.
+
+      - :py:obj:`~.CU_COREDUMP_FAULTED_CONTEXTS_ONLY` - The coredump will
+        only include contexts that have encountered an exception or a trap.
+
+      - :py:obj:`~.CU_COREDUMP_NO_ERRBAR_AT_EXIT` - By default, when
+        coredumps are requested, the GPU will ensure memory faults and
+        other errors prevent warps from exiting, if possible. This can
+        potentially affect the performance of the application. Setting this
+        flag will disable this functionality, making it possible for
+        faulted warps to exit, but also avoiding the potential performance
+        hit.
+
+      - :py:obj:`~.CU_COREDUMP_LOG_ONLY` - Setting this flag will disable
+        actual generation of the coredump file, but exception details will
+        still be logged.
+
     Parameters
     ----------
     attrib : :py:obj:`~.CUcoredumpSettings`
@@ -54338,6 +55749,28 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
         as :py:obj:`~.CU_COREDUMP_TRIGGER_HOST` but better reflects the
         default behavior.
 
+      - :py:obj:`~.CU_COREDUMP_SKIP_CONSTBANK_MEMORY` - Coredump will not
+        include constbank memory.
+
+      - :py:obj:`~.CU_COREDUMP_GZIP_COMPRESS` - The generated coredump will
+        be compressed with gzip, and .gz suffix will be appended to the
+        filename, if it's not a part of it already.
+
+      - :py:obj:`~.CU_COREDUMP_FAULTED_CONTEXTS_ONLY` - The coredump will
+        only include contexts that have encountered an exception or a trap.
+
+      - :py:obj:`~.CU_COREDUMP_NO_ERRBAR_AT_EXIT` - By default, when
+        coredumps are requested, the GPU will ensure memory faults and
+        other errors prevent warps from exiting, if possible. This can
+        potentially affect the performance of the application. Setting this
+        flag will disable this functionality, making it possible for
+        faulted warps to exit, but also avoiding the potential performance
+        hit.
+
+      - :py:obj:`~.CU_COREDUMP_LOG_ONLY` - Setting this flag will disable
+        actual generation of the coredump file, but exception details will
+        still be logged.
+
     Parameters
     ----------
     attrib : :py:obj:`~.CUcoredumpSettings`
@@ -54461,6 +55894,28 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
         as :py:obj:`~.CU_COREDUMP_TRIGGER_HOST` but better reflects the
         default behavior.
 
+      - :py:obj:`~.CU_COREDUMP_SKIP_CONSTBANK_MEMORY` - Coredump will not
+        include constbank memory.
+
+      - :py:obj:`~.CU_COREDUMP_GZIP_COMPRESS` - The generated coredump will
+        be compressed with gzip, and .gz suffix will be appended to the
+        filename, if it's not a part of it already.
+
+      - :py:obj:`~.CU_COREDUMP_FAULTED_CONTEXTS_ONLY` - The coredump will
+        only include contexts that have encountered an exception or a trap.
+
+      - :py:obj:`~.CU_COREDUMP_NO_ERRBAR_AT_EXIT` - By default, when
+        coredumps are requested, the GPU will ensure memory faults and
+        other errors prevent warps from exiting, if possible. This can
+        potentially affect the performance of the application. Setting this
+        flag will disable this functionality, making it possible for
+        faulted warps to exit, but also avoiding the potential performance
+        hit.
+
+      - :py:obj:`~.CU_COREDUMP_LOG_ONLY` - Setting this flag will disable
+        actual generation of the coredump file, but exception details will
+        still be logged.
+
     Parameters
     ----------
     attrib : :py:obj:`~.CUcoredumpSettings`
@@ -54480,6 +55935,10 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
     See Also
     --------
     :py:obj:`~.cuCoredumpGetAttributeGlobal`, :py:obj:`~.cuCoredumpGetAttribute`, :py:obj:`~.cuCoredumpSetAttributeGlobal`
+
+    Notes
+    -----
+    :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` replaces all previously set coredump flags. Mixing :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` with the deprecated boolean attributes (:py:obj:`~.CU_COREDUMP_TRIGGER_HOST`, :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`) can result in undefined behavior. To avoid issues, either use only :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` or combine all desired flag bits (including :py:obj:`~.CU_COREDUMP_SKIP_ABORT`) in a single call.
     """
     cdef cydriver.CUcoredumpSettings cyattrib = int(attrib)
     cdef _HelperCUcoredumpSettings cyvalue = _HelperCUcoredumpSettings(attrib, value, is_getter=False)
@@ -54587,6 +56046,28 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
         as :py:obj:`~.CU_COREDUMP_TRIGGER_HOST` but better reflects the
         default behavior.
 
+      - :py:obj:`~.CU_COREDUMP_SKIP_CONSTBANK_MEMORY` - Coredump will not
+        include constbank memory.
+
+      - :py:obj:`~.CU_COREDUMP_GZIP_COMPRESS` - The generated coredump will
+        be compressed with gzip, and .gz suffix will be appended to the
+        filename, if it's not a part of it already.
+
+      - :py:obj:`~.CU_COREDUMP_FAULTED_CONTEXTS_ONLY` - The coredump will
+        only include contexts that have encountered an exception or a trap.
+
+      - :py:obj:`~.CU_COREDUMP_NO_ERRBAR_AT_EXIT` - By default, when
+        coredumps are requested, the GPU will ensure memory faults and
+        other errors prevent warps from exiting, if possible. This can
+        potentially affect the performance of the application. Setting this
+        flag will disable this functionality, making it possible for
+        faulted warps to exit, but also avoiding the potential performance
+        hit.
+
+      - :py:obj:`~.CU_COREDUMP_LOG_ONLY` - Setting this flag will disable
+        actual generation of the coredump file, but exception details will
+        still be logged.
+
     Parameters
     ----------
     attrib : :py:obj:`~.CUcoredumpSettings`
@@ -54606,6 +56087,10 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
     See Also
     --------
     :py:obj:`~.cuCoredumpGetAttribute`, :py:obj:`~.cuCoredumpGetAttributeGlobal`, :py:obj:`~.cuCoredumpSetAttribute`
+
+    Notes
+    -----
+    :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` replaces all previously set coredump flags. Mixing :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` with the deprecated boolean attributes (:py:obj:`~.CU_COREDUMP_TRIGGER_HOST`, :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`) can result in undefined behavior. To avoid issues, either use only :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` or combine all desired flag bits (including :py:obj:`~.CU_COREDUMP_SKIP_ABORT`) in a single call.
     """
     cdef cydriver.CUcoredumpSettings cyattrib = int(attrib)
     cdef _HelperCUcoredumpSettings cyvalue = _HelperCUcoredumpSettings(attrib, value, is_getter=False)
@@ -54858,8 +56343,10 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
 
     The supported flags are:
 
+    - `CU_GREEN_CTX_NONE` : Default behavior.
+
     - `CU_GREEN_CTX_DEFAULT_STREAM` : Creates a default stream to use
-      inside the green context. Required.
+      inside the green context.
 
     Parameters
     ----------
@@ -54870,7 +56357,6 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
         Device on which to create the green context.
     flags : unsigned int
         One of the supported green context creation flags.
-        `CU_GREEN_CTX_DEFAULT_STREAM` is required.
 
     Returns
     -------
@@ -54961,18 +56447,37 @@ def cuGreenCtxDestroy(hCtx):
 
 @cython.embedsignature(True)
 def cuCtxFromGreenCtx(hCtx):
-    """ Converts a green context into the primary context.
+    """ Returns a :py:obj:`~.CUcontext` handle for a green context.
+
+    This API returns in `pContext` a :py:obj:`~.CUcontext` handle that
+    represents the specified green context `hCtx`. The returned handle can
+    be passed to CUDA APIs that accept a :py:obj:`~.CUcontext` and will be
+    treated as if it were a primary context, while still honoring the
+    resources and configuration associated with `hCtx` as applicable.
+
+    Applications that wish to use a green context with CUDA APIs that
+    require a :py:obj:`~.CUcontext` must use this API to obtain a handle to
+    a :py:obj:`~.CUcontext` representing the green context. Otherwise,
+    passing a green context to such APIs will fail with
+    :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`.
 
-    The API converts a green context into the primary context returned in
-    `pContext`. It is important to note that the converted context
-    `pContext` is a normal primary context but with the resources of the
-    specified green context `hCtx`. Once converted, it can then be used to
-    set the context current with :py:obj:`~.cuCtxSetCurrent` or with any of
-    the CUDA APIs that accept a CUcontext parameter.
+    The :py:obj:`~.CUcontext` returned by :py:obj:`~.cuCtxFromGreenCtx` may
+    be passed to CUDA Driver APIs that accept a :py:obj:`~.CUcontext`.
 
-    Users are expected to call this API before calling any CUDA APIs that
-    accept a CUcontext. Failing to do so will result in the APIs returning
-    :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`.
+    - For APIs whose semantics are independent of green context resources,
+      the operation is performed identically to how it would perform with a
+      primary context.
+
+    - For APIs whose behavior depends on green context resources (for
+      example, kernel launch), the operation is performed using the
+      resources and configuration of the specified green context `hCtx`.
+
+    This call does not create a new independent context and does not change
+    the underlying context lifetime. The validity of the returned
+    `pContext` is tied to `hCtx`, and no additional destruction or release
+    is required beyond correctly managing `hCtx` with the green context
+    APIs. Destroying `pContext` via :py:obj:`~.cuCtxDestroy` is undefined
+    behavior.
 
     Parameters
     ----------
@@ -54984,7 +56489,7 @@ def cuCtxFromGreenCtx(hCtx):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pContext : :py:obj:`~.CUcontext`
-        Returned primary context with green context resources
+        Returned :py:obj:`~.CUcontext` with green context resources
 
     See Also
     --------
@@ -58239,6 +59744,24 @@ def sizeof(objType):
     {{if 'CUmemDecompressParams' in found_types}}
     if objType == CUmemDecompressParams:
         return sizeof(cydriver.CUmemDecompressParams){{endif}}
+    {{if 'CUlogicalEndpointId' in found_types}}
+    if objType == CUlogicalEndpointId:
+        return sizeof(cydriver.CUlogicalEndpointId){{endif}}
+    {{if 'CUlogicalEndpointFabricHandle_st' in found_struct}}
+    if objType == CUlogicalEndpointFabricHandle_st:
+        return sizeof(cydriver.CUlogicalEndpointFabricHandle_st){{endif}}
+    {{if 'CUlogicalEndpointFabricHandle' in found_types}}
+    if objType == CUlogicalEndpointFabricHandle:
+        return sizeof(cydriver.CUlogicalEndpointFabricHandle){{endif}}
+    {{if 'CUlogicalEndpointProp_struct' in found_struct}}
+    if objType == CUlogicalEndpointProp_struct:
+        return sizeof(cydriver.CUlogicalEndpointProp_struct){{endif}}
+    {{if 'CUlogicalEndpointProp' in found_types}}
+    if objType == CUlogicalEndpointProp:
+        return sizeof(cydriver.CUlogicalEndpointProp){{endif}}
+    {{if 'CUgraphRecaptureCallback' in found_types}}
+    if objType == CUgraphRecaptureCallback:
+        return sizeof(cydriver.CUgraphRecaptureCallback){{endif}}
     {{if 'CUcoredumpCallbackHandle' in found_types}}
     if objType == CUcoredumpCallbackHandle:
         return sizeof(cydriver.CUcoredumpCallbackHandle){{endif}}
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd b/cuda_bindings/cuda/bindings/nvrtc.pxd
index 5cb372430f9..743c75f8837 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/nvrtc.pxd
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cimport cuda.bindings.cynvrtc as cynvrtc
 
 include "_lib/utils.pxd"
@@ -19,3 +19,75 @@ cdef class nvrtcProgram:
     """
     cdef cynvrtc.nvrtcProgram  _pvt_val
     cdef cynvrtc.nvrtcProgram* _pvt_ptr
+
+cdef class anon_struct0:
+    """
+    Attributes
+    ----------
+
+    available : int
+
+
+
+    compressedSize : size_t
+
+
+
+    uncompressedSize : size_t
+
+
+
+    cudaVersionMajor : int
+
+
+
+    cudaVersionMinor : int
+
+
+
+    numFiles : unsigned int
+
+
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cynvrtc.nvrtcBundledHeadersInfo* _pvt_ptr
+
+cdef class nvrtcBundledHeadersInfo(anon_struct0):
+    """
+    Attributes
+    ----------
+
+    available : int
+
+
+
+    compressedSize : size_t
+
+
+
+    uncompressedSize : size_t
+
+
+
+    cudaVersionMajor : int
+
+
+
+    cudaVersionMinor : int
+
+
+
+    numFiles : unsigned int
+
+
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cynvrtc.nvrtcBundledHeadersInfo _pvt_val
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx b/cuda_bindings/cuda/bindings/nvrtc.pyx
index aca1ead365f..e6d1dc6ad02 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/nvrtc.pyx
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -43,6 +43,19 @@ ctypedef unsigned long long float_ptr
 ctypedef unsigned long long double_ptr
 ctypedef unsigned long long void_ptr
 
+#: Flags for nvrtcInstallBundledHeaders. Skip installation if version marker
+#: exists and version matches. This is the default behavior when flags=0.
+NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS = cynvrtc.NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS
+
+#: Clear existing directory contents before installation. Guarantees
+#: consistency by removing any existing files first.
+NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE = cynvrtc.NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE
+
+#: Return NVRTC_ERROR_BUSY immediately if installation is in progress by
+#: another process, instead of waiting for the lock. Can be combined with
+#: FORCE_OVERWRITE using bitwise OR.
+NVRTC_INSTALL_HEADERS_NO_WAIT = cynvrtc.NVRTC_INSTALL_HEADERS_NO_WAIT
+
 class nvrtcResult(_FastEnum):
     """
     The enumerated type nvrtcResult defines API call result codes.
@@ -85,6 +98,8 @@ class nvrtcResult(_FastEnum):
 
     NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED = cynvrtc.nvrtcResult.NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED
 
+    NVRTC_ERROR_BUSY = cynvrtc.nvrtcResult.NVRTC_ERROR_BUSY
+
 cdef object _nvrtcResult = nvrtcResult
 cdef object _nvrtcResult_SUCCESS = nvrtcResult.NVRTC_SUCCESS
 
@@ -122,6 +137,183 @@ cdef class nvrtcProgram:
     def getPtr(self):
         return <void_ptr>self._pvt_ptr
 
+cdef class anon_struct0:
+    """
+    Attributes
+    ----------
+
+    available : int
+
+
+
+    compressedSize : size_t
+
+
+
+    uncompressedSize : size_t
+
+
+
+    cudaVersionMajor : int
+
+
+
+    cudaVersionMinor : int
+
+
+
+    numFiles : unsigned int
+
+
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr):
+        self._pvt_ptr = <cynvrtc.nvrtcBundledHeadersInfo *>_ptr
+
+    def __init__(self, void_ptr _ptr):
+        pass
+    def __dealloc__(self):
+        pass
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr
+    def __repr__(self):
+        if self._pvt_ptr is not NULL:
+            str_list = []
+
+            try:
+                str_list += ['available : ' + str(self.available)]
+            except ValueError:
+                str_list += ['available : <ValueError>']
+
+
+            try:
+                str_list += ['compressedSize : ' + str(self.compressedSize)]
+            except ValueError:
+                str_list += ['compressedSize : <ValueError>']
+
+
+            try:
+                str_list += ['uncompressedSize : ' + str(self.uncompressedSize)]
+            except ValueError:
+                str_list += ['uncompressedSize : <ValueError>']
+
+
+            try:
+                str_list += ['cudaVersionMajor : ' + str(self.cudaVersionMajor)]
+            except ValueError:
+                str_list += ['cudaVersionMajor : <ValueError>']
+
+
+            try:
+                str_list += ['cudaVersionMinor : ' + str(self.cudaVersionMinor)]
+            except ValueError:
+                str_list += ['cudaVersionMinor : <ValueError>']
+
+
+            try:
+                str_list += ['numFiles : ' + str(self.numFiles)]
+            except ValueError:
+                str_list += ['numFiles : <ValueError>']
+
+            return '\n'.join(str_list)
+        else:
+            return ''
+
+    @property
+    def available(self):
+        return self._pvt_ptr[0].available
+    @available.setter
+    def available(self, int available):
+        self._pvt_ptr[0].available = available
+
+
+    @property
+    def compressedSize(self):
+        return self._pvt_ptr[0].compressedSize
+    @compressedSize.setter
+    def compressedSize(self, size_t compressedSize):
+        self._pvt_ptr[0].compressedSize = compressedSize
+
+
+    @property
+    def uncompressedSize(self):
+        return self._pvt_ptr[0].uncompressedSize
+    @uncompressedSize.setter
+    def uncompressedSize(self, size_t uncompressedSize):
+        self._pvt_ptr[0].uncompressedSize = uncompressedSize
+
+
+    @property
+    def cudaVersionMajor(self):
+        return self._pvt_ptr[0].cudaVersionMajor
+    @cudaVersionMajor.setter
+    def cudaVersionMajor(self, int cudaVersionMajor):
+        self._pvt_ptr[0].cudaVersionMajor = cudaVersionMajor
+
+
+    @property
+    def cudaVersionMinor(self):
+        return self._pvt_ptr[0].cudaVersionMinor
+    @cudaVersionMinor.setter
+    def cudaVersionMinor(self, int cudaVersionMinor):
+        self._pvt_ptr[0].cudaVersionMinor = cudaVersionMinor
+
+
+    @property
+    def numFiles(self):
+        return self._pvt_ptr[0].numFiles
+    @numFiles.setter
+    def numFiles(self, unsigned int numFiles):
+        self._pvt_ptr[0].numFiles = numFiles
+
+
+cdef class nvrtcBundledHeadersInfo(anon_struct0):
+    """
+    Attributes
+    ----------
+
+    available : int
+
+
+
+    compressedSize : size_t
+
+
+
+    uncompressedSize : size_t
+
+
+
+    cudaVersionMajor : int
+
+
+
+    cudaVersionMinor : int
+
+
+
+    numFiles : unsigned int
+
+
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr = 0):
+        if _ptr == 0:
+            self._pvt_ptr = <cynvrtc.nvrtcBundledHeadersInfo *>&self._pvt_val
+        else:
+            self._pvt_ptr = <cynvrtc.nvrtcBundledHeadersInfo *>_ptr
+
+    def __init__(self, void_ptr _ptr = 0):
+        pass
+
 @cython.embedsignature(True)
 def nvrtcGetErrorString(result not None : nvrtcResult):
     """ nvrtcGetErrorString is a helper function that returns a string describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to `"NVRTC_SUCCESS"`. For unrecognized enumeration values, it returns `"NVRTC_ERROR unknown"`.
@@ -455,7 +647,7 @@ def nvrtcGetCUBINSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetCUBIN(prog, char* cubin):
-    """ nvrtcGetCUBIN stores the cubin generated by the previous compilation of `prog` in the memory pointed by `cubin`. No cubin is available if the value specified to `-arch` is a virtual architecture instead of an actual architecture.
+    """ nvrtcGetCUBIN stores the cubin generated by the previous compilation of `prog` in the memory pointed by `cubin`. No cubin is available if the value specified to `-arch` is a virtual architecture instead of an actual architecture. The cubin does not contain code for the Tile functions (`__tile__` / `__tile_global__`) or variables (`__tile__`); use `nvrtcGetTileIR()` to extract the cuda_tile IR generated for Tile code.
 
     Parameters
     ----------
@@ -966,19 +1158,25 @@ def nvrtcSetFlowCallback(prog, callback, payload):
 
 @cython.embedsignature(True)
 def nvrtcGetTileIRSize(prog):
-    """
+    """ nvrtcGetTileIRSize sets the value of `TileIRSizeRet` with the size of the cuda_tile IR generated by the previous compilation of `prog`.
 
     Parameters
     ----------
     prog : :py:obj:`~.nvrtcProgram`
-        None
+        CUDA Runtime Compilation program.
 
     Returns
     -------
     nvrtcResult
-
+        - :py:obj:`~.NVRTC_SUCCESS`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
     TileIRSizeRet : int
-        None
+        Size of the generated cuda_tile IR.
+
+    See Also
+    --------
+    :py:obj:`~.nvrtcGetTileIR`
     """
     cdef cynvrtc.nvrtcProgram cyprog
     if prog is None:
@@ -997,19 +1195,25 @@ def nvrtcGetTileIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetTileIR(prog, char* TileIR):
-    """
+    """ nvrtcGetTileIR stores the cuda_tile IR generated by the previous compilation of `prog` in the memory pointed by `TileIR`.
 
     Parameters
     ----------
     prog : :py:obj:`~.nvrtcProgram`
-        None
+        CUDA Runtime Compilation program.
     TileIR : bytes
-        None
+        Generated cuda_tile IR.
 
     Returns
     -------
     nvrtcResult
+        - :py:obj:`~.NVRTC_SUCCESS`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
 
+    See Also
+    --------
+    :py:obj:`~.nvrtcGetTileIRSize`
     """
     cdef cynvrtc.nvrtcProgram cyprog
     if prog is None:
@@ -1023,6 +1227,136 @@ def nvrtcGetTileIR(prog, char* TileIR):
         err = cynvrtc.nvrtcGetTileIR(cyprog, TileIR)
     return (_nvrtcResult(err),)
 
+@cython.embedsignature(True)
+def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
+    """ nvrtcInstallBundledHeaders extracts CUDA headers bundled with NVRTC to a specified directory for use during compilation.
+
+    NVRTC bundles a set of CUDA Toolkit headers and CUDA C++ Core Libraries
+    (CCCL) within libnvrtc-builtins. This function extracts these headers
+    to the specified directory, allowing NVRTC programs to compile without
+    requiring a separate CUDA Toolkit installation. The bundled headers
+    match those available in the CUDA Toolkit plus CCCL libraries.
+
+    After extraction, users can compile kernels by passing appropriate
+    include paths (such as "-I<installPath>" and "-I<installPath>/cccl") to
+    nvrtcCompileProgram.
+
+    A version marker file (.nvrtc_headers_version) is created in the
+    installation directory to track the installed version.
+
+    This function is thread-safe and process-safe. Concurrent calls from
+    multiple threads or processes will be serialized using file locking. By
+    default, the function waits for the lock; use
+    NVRTC_INSTALL_HEADERS_NO_WAIT to return immediately with
+    NVRTC_ERROR_BUSY if another process holds the lock.
+
+    Parameters
+    ----------
+    installPath : bytes
+        Path where headers should be extracted (UTF-8 encoded). The
+        directory will be created if it doesn't exist.
+    flags : unsigned int
+        Bitmask of NVRTC_INSTALL_HEADERS_* flags; 0 selects the default.
+
+    Returns
+    -------
+    nvrtcResult
+        - :py:obj:`~.NVRTC_SUCCESS`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` (invalid path or conflicting flags like SKIP_IF_EXISTS | FORCE_OVERWRITE)
+        - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (extraction failed or version mismatch)
+        - :py:obj:`~.NVRTC_ERROR_BUSY` (lock held by another process and NVRTC_INSTALL_HEADERS_NO_WAIT was specified)
+    errorLog : bytes
+        Detailed error message describing the cause of a failure, or
+        `None` when NVRTC did not report one (e.g. on success). The
+        message is copied into a Python `bytes` object before returning,
+        so subsequent API calls from the same thread cannot overwrite
+        the returned value.
+
+    See Also
+    --------
+    :py:obj:`~.nvrtcCompileProgram`
+
+    Notes
+    -----
+    Use NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS to avoid reinstalling if headers already exist. Use NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE to guarantee consistency by clearing the directory first.
+    """
+    cdef const char* errorLog = NULL
+    with nogil:
+        err = cynvrtc.nvrtcInstallBundledHeaders(installPath, flags, &errorLog)
+    if err != cynvrtc.NVRTC_SUCCESS:  # the log exists to explain failures, so hand it back instead of None
+        return (_nvrtcResult(err), <bytes>errorLog if errorLog != NULL else None)
+    return (_nvrtcResult_SUCCESS, <bytes>errorLog if errorLog != NULL else None)
+
+@cython.embedsignature(True)
+def nvrtcGetBundledHeadersInfo():
+    """ nvrtcGetBundledHeadersInfo queries information about the bundled headers without extracting them.
+
+    This function allows users to determine if bundled headers are
+    available and get size estimates before calling
+    nvrtcInstallBundledHeaders.
+
+    Returns
+    -------
+    nvrtcResult
+        - :py:obj:`~.NVRTC_SUCCESS`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` (info is NULL)
+        - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (failed to query bundled headers)
+    info : :py:obj:`~.nvrtcBundledHeadersInfo`
+        Structure filled with the header information; `None` on failure.
+    errorLog : bytes
+        Detailed error message describing the cause of a failure, or
+        `None` when NVRTC did not report one (e.g. on success). The
+        message is copied into a Python `bytes` object before returning,
+        so subsequent API calls from the same thread cannot overwrite
+        the returned value.
+    """
+    cdef nvrtcBundledHeadersInfo info = nvrtcBundledHeadersInfo()
+    cdef const char* errorLog = NULL
+    with nogil:
+        err = cynvrtc.nvrtcGetBundledHeadersInfo(<cynvrtc.nvrtcBundledHeadersInfo*>info._pvt_ptr, &errorLog)
+    if err != cynvrtc.NVRTC_SUCCESS:  # info is not returned on failure, but the log explaining the failure is
+        return (_nvrtcResult(err), None, <bytes>errorLog if errorLog != NULL else None)
+    return (_nvrtcResult_SUCCESS, info, <bytes>errorLog if errorLog != NULL else None)
+
+@cython.embedsignature(True)
+def nvrtcRemoveBundledHeaders(char* installPath):
+    """ nvrtcRemoveBundledHeaders removes previously installed bundled headers.
+
+    This function removes the headers installed by
+    nvrtcInstallBundledHeaders, helping users manage disk space. It
+    recursively removes all files and subdirectories within the
+    installation directory.
+
+    Parameters
+    ----------
+    installPath : bytes
+        Path where headers were previously installed. Must be the same path
+        used with nvrtcInstallBundledHeaders.
+
+    Returns
+    -------
+    nvrtcResult
+        - :py:obj:`~.NVRTC_SUCCESS`
+        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` (invalid path)
+        - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (removal failed)
+    errorLog : bytes
+        Detailed error message describing the cause of a failure, or
+        `None` when NVRTC did not report one (e.g. on success). The
+        message is copied into a Python `bytes` object before returning,
+        so subsequent API calls from the same thread cannot overwrite
+        the returned value.
+
+    Notes
+    -----
+    This function will remove ALL contents of the specified directory, not just files installed by NVRTC. Use with caution.
+    """
+    cdef const char* errorLog = NULL
+    with nogil:
+        err = cynvrtc.nvrtcRemoveBundledHeaders(installPath, &errorLog)
+    if err != cynvrtc.NVRTC_SUCCESS:  # the log exists to explain failures, so hand it back instead of None
+        return (_nvrtcResult(err), <bytes>errorLog if errorLog != NULL else None)
+    return (_nvrtcResult_SUCCESS, <bytes>errorLog if errorLog != NULL else None)
+
 @cython.embedsignature(True)
 def sizeof(objType):
     """ Returns the size of provided CUDA Python structure in bytes
@@ -1040,4 +1374,7 @@ def sizeof(objType):
 
     if objType == nvrtcProgram:
         return sizeof(cynvrtc.nvrtcProgram)
+
+    if objType == nvrtcBundledHeadersInfo:
+        return sizeof(cynvrtc.nvrtcBundledHeadersInfo)
     raise TypeError("Unknown type: " + str(objType))
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index 3043bcdddbb..323fc99e465 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1630+gadce055ea.d20260422. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 cimport cuda.bindings.cyruntime as cyruntime
 
 include "_lib/utils.pxd"
@@ -333,6 +333,21 @@ cdef class cudaStreamCallback_t:
     cdef cyruntime.cudaStreamCallback_t* _pvt_ptr
 {{endif}}
 
+{{if 'cudaGraphRecaptureCallback_t' in found_types}}
+
+cdef class cudaGraphRecaptureCallback_t:
+    """
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+
+    """
+    cdef cyruntime.cudaGraphRecaptureCallback_t  _pvt_val
+    cdef cyruntime.cudaGraphRecaptureCallback_t* _pvt_ptr
+{{endif}}
+
 {{if 'cudaLogsCallback_t' in found_types}}
 
 cdef class cudaLogsCallback_t:
@@ -1428,8 +1443,14 @@ cdef class cudaFuncAttributes:
         higher cluster sizes that’s not guaranteed to be portable. See
         cudaFuncSetAttribute
     {{endif}}
-    {{if 'cudaFuncAttributes.reserved0' in found_struct}}
-    reserved0 : int
+    {{if 'cudaFuncAttributes.deviceNodeUpdateStatus' in found_struct}}
+    deviceNodeUpdateStatus : int
+        Whether the function can be updated on device. 1 means device node
+        update is supported, 0 is unsupported or driver is too old to check
+        the value.
+    {{endif}}
+    {{if 'cudaFuncAttributes.reserved1' in found_struct}}
+    reserved1 : int
 
     {{endif}}
     {{if 'cudaFuncAttributes.reserved' in found_struct}}
@@ -1462,7 +1483,9 @@ cdef class cudaMemLocation:
     {{endif}}
     {{if 'cudaMemLocation.id' in found_struct}}
     id : int
-        identifier for a given this location's ::CUmemLocationType.
+        Identifier for cudaMemLocationType::cudaMemLocationTypeDevice,
+        cudaMemLocationType::cudaMemLocationTypeHost, or
+        cudaMemLocationType::cudaMemLocationTypeHostNuma.
     {{endif}}
 
     Methods
@@ -1470,7 +1493,7 @@ cdef class cudaMemLocation:
     getPtr()
         Get memory address of class instance
     """
-    cdef cyruntime.cudaMemLocation _pvt_val
+    cdef cyruntime.cudaMemLocation* _val_ptr
     cdef cyruntime.cudaMemLocation* _pvt_ptr
 {{endif}}
 {{if 'cudaMemAccessDesc' in found_struct}}
@@ -1844,7 +1867,7 @@ cdef class anon_struct7:
 {{endif}}
 {{if 'cudaMemcpy3DOperand.op' in found_struct}}
 
-cdef class anon_union1:
+cdef class anon_union2:
     """
     Attributes
     ----------
@@ -1883,7 +1906,7 @@ cdef class cudaMemcpy3DOperand:
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op' in found_struct}}
-    op : anon_union1
+    op : anon_union2
 
     {{endif}}
 
@@ -1895,7 +1918,7 @@ cdef class cudaMemcpy3DOperand:
     cdef cyruntime.cudaMemcpy3DOperand* _val_ptr
     cdef cyruntime.cudaMemcpy3DOperand* _pvt_ptr
     {{if 'cudaMemcpy3DOperand.op' in found_struct}}
-    cdef anon_union1 _op
+    cdef anon_union2 _op
     {{endif}}
 {{endif}}
 {{if 'cudaMemcpy3DBatchOp' in found_struct}}
@@ -2468,7 +2491,7 @@ cdef class anon_struct8:
 {{endif}}
 {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
 
-cdef class anon_union2:
+cdef class anon_union3:
     """
     Attributes
     ----------
@@ -2511,7 +2534,7 @@ cdef class cudaExternalMemoryHandleDesc:
         Type of the handle
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
-    handle : anon_union2
+    handle : anon_union3
 
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.size' in found_struct}}
@@ -2535,7 +2558,7 @@ cdef class cudaExternalMemoryHandleDesc:
     cdef cyruntime.cudaExternalMemoryHandleDesc* _val_ptr
     cdef cyruntime.cudaExternalMemoryHandleDesc* _pvt_ptr
     {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
-    cdef anon_union2 _handle
+    cdef anon_union3 _handle
     {{endif}}
 {{endif}}
 {{if 'cudaExternalMemoryBufferDesc' in found_struct}}
@@ -2650,7 +2673,7 @@ cdef class anon_struct9:
 {{endif}}
 {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
 
-cdef class anon_union3:
+cdef class anon_union4:
     """
     Attributes
     ----------
@@ -2693,7 +2716,7 @@ cdef class cudaExternalSemaphoreHandleDesc:
         Type of the handle
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
-    handle : anon_union3
+    handle : anon_union4
 
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.flags' in found_struct}}
@@ -2713,7 +2736,7 @@ cdef class cudaExternalSemaphoreHandleDesc:
     cdef cyruntime.cudaExternalSemaphoreHandleDesc* _val_ptr
     cdef cyruntime.cudaExternalSemaphoreHandleDesc* _pvt_ptr
     {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
-    cdef anon_union3 _handle
+    cdef anon_union4 _handle
     {{endif}}
 {{endif}}
 {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}}
@@ -2736,7 +2759,7 @@ cdef class anon_struct10:
 {{endif}}
 {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}}
 
-cdef class anon_union4:
+cdef class anon_union5:
     """
     Attributes
     ----------
@@ -2788,7 +2811,7 @@ cdef class anon_struct12:
 
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}}
-    nvSciSync : anon_union4
+    nvSciSync : anon_union5
 
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}}
@@ -2810,7 +2833,7 @@ cdef class anon_struct12:
     cdef anon_struct10 _fence
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}}
-    cdef anon_union4 _nvSciSync
+    cdef anon_union5 _nvSciSync
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}}
     cdef anon_struct11 _keyedMutex
@@ -2875,7 +2898,7 @@ cdef class anon_struct13:
 {{endif}}
 {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}}
 
-cdef class anon_union5:
+cdef class anon_union6:
     """
     Attributes
     ----------
@@ -2931,7 +2954,7 @@ cdef class anon_struct15:
 
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}}
-    nvSciSync : anon_union5
+    nvSciSync : anon_union6
 
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}}
@@ -2953,7 +2976,7 @@ cdef class anon_struct15:
     cdef anon_struct13 _fence
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}}
-    cdef anon_union5 _nvSciSync
+    cdef anon_union6 _nvSciSync
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}}
     cdef anon_struct14 _keyedMutex
@@ -3116,7 +3139,7 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
-        Reserved for future use - ensure this is is zero initialized.
+        Reserved for future use - ensure this is zero initialized.
     {{endif}}
 
     Methods
@@ -3943,7 +3966,7 @@ cdef class anon_struct16:
 {{endif}}
 {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}}
 
-cdef class anon_union9:
+cdef class anon_union10:
     """
     Attributes
     ----------
@@ -3992,7 +4015,7 @@ cdef class cudaGraphKernelNodeUpdate:
         interpreted
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}}
-    updateData : anon_union9
+    updateData : anon_union10
         Update data to apply. Which field is used depends on field's value
     {{endif}}
 
@@ -4007,7 +4030,7 @@ cdef class cudaGraphKernelNodeUpdate:
     cdef cudaGraphDeviceNode_t _node
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}}
-    cdef anon_union9 _updateData
+    cdef anon_union10 _updateData
     {{endif}}
 {{endif}}
 {{if 'cudaLaunchMemSyncDomainMap_st' in found_struct}}
@@ -4373,7 +4396,7 @@ cdef class anon_struct22:
 {{endif}}
 {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
 
-cdef class anon_union10:
+cdef class anon_union11:
     """
     Attributes
     ----------
@@ -4405,7 +4428,7 @@ cdef class cudaAsyncNotificationInfo:
         The type of notification being sent
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
-    info : anon_union10
+    info : anon_union11
         Information about the notification. `typename` must be checked in
         order to interpret this field.
     {{endif}}
@@ -4418,7 +4441,7 @@ cdef class cudaAsyncNotificationInfo:
     cdef cyruntime.cudaAsyncNotificationInfo* _val_ptr
     cdef cyruntime.cudaAsyncNotificationInfo* _pvt_ptr
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
-    cdef anon_union10 _info
+    cdef anon_union11 _info
     {{endif}}
 {{endif}}
 {{if 'cudaTextureDesc' in found_struct}}
@@ -4490,6 +4513,38 @@ cdef class cudaTextureDesc:
     cdef cyruntime.cudaTextureDesc _pvt_val
     cdef cyruntime.cudaTextureDesc* _pvt_ptr
 {{endif}}
+{{if 'cudaGraphRecaptureCallbackData' in found_struct}}
+
+cdef class cudaGraphRecaptureCallbackData:
+    """
+    Struct of user callback data that is invoked when node parameter
+    mismatches are detected while recapturing to an existing graph
+
+    Attributes
+    ----------
+    {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
+    callbackFunc : cudaGraphRecaptureCallback_t
+        Callback function that will be invoked
+    {{endif}}
+    {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
+    userData : Any
+        Generic pointer that is passed to the callback function
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    cdef cyruntime.cudaGraphRecaptureCallbackData _pvt_val
+    cdef cyruntime.cudaGraphRecaptureCallbackData* _pvt_ptr
+    {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
+    cdef cudaGraphRecaptureCallback_t _callbackFunc
+    {{endif}}
+    {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
+    cdef _HelperInputVoidPtr _cyuserData
+    {{endif}}
+{{endif}}
 {{if True}}
 
 cdef class cudaEglPlaneDesc_st:
@@ -4541,7 +4596,7 @@ cdef class cudaEglPlaneDesc_st:
 {{endif}}
 {{if True}}
 
-cdef class anon_union11:
+cdef class anon_union12:
     """
     Attributes
     ----------
@@ -4577,7 +4632,7 @@ cdef class cudaEglFrame_st:
     Attributes
     ----------
     {{if True}}
-    frame : anon_union11
+    frame : anon_union12
 
     {{endif}}
     {{if True}}
@@ -4605,7 +4660,7 @@ cdef class cudaEglFrame_st:
     cdef cyruntime.cudaEglFrame_st* _val_ptr
     cdef cyruntime.cudaEglFrame_st* _pvt_ptr
     {{if True}}
-    cdef anon_union11 _frame
+    cdef anon_union12 _frame
     {{endif}}
 {{endif}}
 {{if 'CUuuid' in found_types}}
@@ -4731,7 +4786,7 @@ cdef class cudaDevSmResourceGroupParams(cudaDevSmResourceGroupParams_st):
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
-        Reserved for future use - ensure this is is zero initialized.
+        Reserved for future use - ensure this is zero initialized.
     {{endif}}
 
     Methods
@@ -4981,7 +5036,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo):
         The type of notification being sent
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
-    info : anon_union10
+    info : anon_union11
         Information about the notification. `typename` must be checked in
         order to interpret this field.
     {{endif}}
@@ -5316,7 +5371,7 @@ cdef class cudaEglFrame(cudaEglFrame_st):
     Attributes
     ----------
     {{if True}}
-    frame : anon_union11
+    frame : anon_union12
 
     {{endif}}
     {{if True}}
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 8b2ae97b419..5c38d5c0a2f 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1630+gadce055ea.d20260422. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -1402,6 +1402,13 @@ class cudaError_t(_FastEnum):
         "associated with the stream has been destroyed, limiting the stream's\n"
         'operational capabilities.\n'
     ){{endif}}
+    {{if 'cudaErrorGraphRecaptureFailure' in found_values}}
+
+    cudaErrorGraphRecaptureFailure = (
+        cyruntime.cudaError.cudaErrorGraphRecaptureFailure,
+        'This error indicates that a graph recapture failed and had to be\n'
+        'terminated.\n'
+    ){{endif}}
     {{if 'cudaErrorUnknown' in found_values}}
 
     cudaErrorUnknown = (
@@ -1411,6 +1418,37 @@ class cudaError_t(_FastEnum):
     {{if 'cudaErrorApiFailureBase' in found_values}}
     cudaErrorApiFailureBase = cyruntime.cudaError.cudaErrorApiFailureBase{{endif}}
 
+{{endif}}
+{{if 'cudaSharedMemoryMode' in found_types}}
+
+class cudaSharedMemoryMode(_FastEnum):
+    """
+    Shared memory related attributes for use with
+    :py:obj:`~.cuLaunchKernelEx`
+    """
+    {{if 'cudaSharedMemoryModeDefault' in found_values}}
+
+    cudaSharedMemoryModeDefault = (
+        cyruntime.cudaSharedMemoryMode.cudaSharedMemoryModeDefault,
+        'The default to use for allowing non-portable shared memory size on launch -\n'
+        'uses current function attributes for\n'
+        ':py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize`\n'
+    ){{endif}}
+    {{if 'cudaSharedMemoryModeRequirePortable' in found_values}}
+
+    cudaSharedMemoryModeRequirePortable = (
+        cyruntime.cudaSharedMemoryMode.cudaSharedMemoryModeRequirePortable,
+        'Specifies that the shared memory size requested must be a portable size\n'
+        'within :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock`\n'
+    ){{endif}}
+    {{if 'cudaSharedMemoryModeAllowNonPortable' in found_values}}
+
+    cudaSharedMemoryModeAllowNonPortable = (
+        cyruntime.cudaSharedMemoryMode.cudaSharedMemoryModeAllowNonPortable,
+        'Specifies that the shared memory size requested may be a non-portable size\n'
+        'up to :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`\n'
+    ){{endif}}
+
 {{endif}}
 {{if 'cudaGraphDependencyType_enum' in found_types}}
 
@@ -1549,37 +1587,6 @@ class cudaLaunchAttributePortableClusterMode(_FastEnum):
         'Specifies that the cluster size requested may be a non-portable size\n'
     ){{endif}}
 
-{{endif}}
-{{if 'cudaSharedMemoryMode' in found_types}}
-
-class cudaSharedMemoryMode(_FastEnum):
-    """
-    Shared memory related attributes for use with
-    :py:obj:`~.cuLaunchKernelEx`
-    """
-    {{if 'cudaSharedMemoryModeDefault' in found_values}}
-
-    cudaSharedMemoryModeDefault = (
-        cyruntime.cudaSharedMemoryMode.cudaSharedMemoryModeDefault,
-        'The default to use for allowing non-portable shared memory size on launch -\n'
-        'uses current function attributes for\n'
-        ':py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize`\n'
-    ){{endif}}
-    {{if 'cudaSharedMemoryModeRequirePortable' in found_values}}
-
-    cudaSharedMemoryModeRequirePortable = (
-        cyruntime.cudaSharedMemoryMode.cudaSharedMemoryModeRequirePortable,
-        'Specifies that the shared memory size requested must be a portable size\n'
-        'within :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock`\n'
-    ){{endif}}
-    {{if 'cudaSharedMemoryModeAllowNonPortable' in found_values}}
-
-    cudaSharedMemoryModeAllowNonPortable = (
-        cyruntime.cudaSharedMemoryMode.cudaSharedMemoryModeAllowNonPortable,
-        'Specifies that the shared memory size requested may be a non-portable size\n'
-        'up to :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`\n'
-    ){{endif}}
-
 {{endif}}
 {{if 'cudaLaunchAttributeID' in found_types}}
 
@@ -3052,6 +3059,90 @@ class cudaChannelFormatKind(_FastEnum):
         cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsignedNormalized1010102,
         '4 channel unsigned normalized (10-bit, 10-bit, 10-bit, 2-bit) format\n'
     ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8Packed422' in found_values}}
+
+    cudaChannelFormatKindUnsigned8Packed422 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Packed422,
+        '4 channel unsigned 8-bit packed format, with 4:2:2 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8Packed444' in found_values}}
+
+    cudaChannelFormatKindUnsigned8Packed444 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Packed444,
+        '4 channel unsigned 8-bit packed format, with 4:4:4 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8SemiPlanar420' in found_values}}
+
+    cudaChannelFormatKindUnsigned8SemiPlanar420 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8SemiPlanar420,
+        '3 channel unsigned 8-bit semi-planar format, with 4:2:0 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned16SemiPlanar420' in found_values}}
+
+    cudaChannelFormatKindUnsigned16SemiPlanar420 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16SemiPlanar420,
+        '3 channel unsigned 16-bit semi-planar format, with 4:2:0 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8SemiPlanar422' in found_values}}
+
+    cudaChannelFormatKindUnsigned8SemiPlanar422 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8SemiPlanar422,
+        '3 channel unsigned 8-bit semi-planar format, with 4:2:2 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned16SemiPlanar422' in found_values}}
+
+    cudaChannelFormatKindUnsigned16SemiPlanar422 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16SemiPlanar422,
+        '3 channel unsigned 16-bit semi-planar format, with 4:2:2 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8SemiPlanar444' in found_values}}
+
+    cudaChannelFormatKindUnsigned8SemiPlanar444 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8SemiPlanar444,
+        '3 channel unsigned 8-bit semi-planar format, with 4:4:4 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned16SemiPlanar444' in found_values}}
+
+    cudaChannelFormatKindUnsigned16SemiPlanar444 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16SemiPlanar444,
+        '3 channel unsigned 16-bit semi-planar format, with 4:4:4 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8Planar420' in found_values}}
+
+    cudaChannelFormatKindUnsigned8Planar420 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Planar420,
+        '3 channel unsigned 8-bit planar format, with 4:2:0 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned16Planar420' in found_values}}
+
+    cudaChannelFormatKindUnsigned16Planar420 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16Planar420,
+        '3 channel unsigned 16-bit planar format, with 4:2:0 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8Planar422' in found_values}}
+
+    cudaChannelFormatKindUnsigned8Planar422 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Planar422,
+        '3 channel unsigned 8-bit planar format, with 4:2:2 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned16Planar422' in found_values}}
+
+    cudaChannelFormatKindUnsigned16Planar422 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16Planar422,
+        '3 channel unsigned 16-bit planar format, with 4:2:2 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned8Planar444' in found_values}}
+
+    cudaChannelFormatKindUnsigned8Planar444 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Planar444,
+        '3 channel unsigned 8-bit planar format, with 4:4:4 sampling\n'
+    ){{endif}}
+    {{if 'cudaChannelFormatKindUnsigned16Planar444' in found_values}}
+
+    cudaChannelFormatKindUnsigned16Planar444 = (
+        cyruntime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16Planar444,
+        '3 channel unsigned 16-bit planar format, with 4:4:4 sampling\n'
+    ){{endif}}
 
 {{endif}}
 {{if 'cudaMemoryType' in found_types}}
@@ -3179,6 +3270,34 @@ class cudaStreamCaptureStatus(_FastEnum):
         'terminated\n'
     ){{endif}}
 
+{{endif}}
+{{if 'cudaGraphRecaptureStatus' in found_types}}
+
+class cudaGraphRecaptureStatus(_FastEnum):  # each member is emitted only when the template's found_values lists it
+    """
+    Possible recapture statuses that can be returned to the user
+    callback
+    """
+    {{if 'cudaGraphRecaptureEligibleForUpdate' in found_values}}
+
+    cudaGraphRecaptureEligibleForUpdate = (  # pair of (cyruntime enum value, description text)
+        cyruntime.cudaGraphRecaptureStatus.cudaGraphRecaptureEligibleForUpdate,
+        'Node is eligible for update in an instantiated graph.\n'
+    ){{endif}}
+    {{if 'cudaGraphRecaptureIneligibleForUpdate' in found_values}}
+
+    cudaGraphRecaptureIneligibleForUpdate = (
+        cyruntime.cudaGraphRecaptureStatus.cudaGraphRecaptureIneligibleForUpdate,
+        'Parameter changes in the node cannot be applied to an instantiated graph.\n'
+    ){{endif}}
+    {{if 'cudaGraphRecaptureError' in found_values}}
+
+    cudaGraphRecaptureError = (
+        cyruntime.cudaGraphRecaptureStatus.cudaGraphRecaptureError,
+        'Error while attempting to recapture the node. The recapture will be ended\n'
+        'regardless of the return value from the callback.\n'
+    ){{endif}}
+
 {{endif}}
 {{if 'cudaStreamCaptureMode' in found_types}}
 
@@ -4863,6 +4982,19 @@ class cudaDeviceAttr(_FastEnum):
         'Link between the device and the host supports only some native atomic\n'
         'operations\n'
     ){{endif}}
+    {{if 'cudaDevAttrAtomicReductionSupported' in found_values}}
+
+    cudaDevAttrAtomicReductionSupported = (
+        cyruntime.cudaDeviceAttr.cudaDevAttrAtomicReductionSupported,
+        'Device supports atomic reduction operations in stream batch memory\n'
+        'operations\n'
+    ){{endif}}
+    {{if 'cudaDevAttrCigStreamsSupported' in found_values}}
+
+    cudaDevAttrCigStreamsSupported = (
+        cyruntime.cudaDeviceAttr.cudaDevAttrCigStreamsSupported,
+        'Device supports CIG streams\n'
+    ){{endif}}
     {{if 'cudaDevAttrMax' in found_values}}
     cudaDevAttrMax = cyruntime.cudaDeviceAttr.cudaDevAttrMax{{endif}}
 
@@ -6004,6 +6136,12 @@ class cudaGraphNodeType(_FastEnum):
         '                                   call :py:obj:`~.cudaGraphSetConditional`\n'
         'from device code.\n'
     ){{endif}}
+    {{if 'cudaGraphNodeTypeReserved16' in found_values}}
+
+    cudaGraphNodeTypeReserved16 = (
+        cyruntime.cudaGraphNodeType.cudaGraphNodeTypeReserved16,
+        'Reserved.\n'
+    ){{endif}}
     {{if 'cudaGraphNodeTypeCount' in found_values}}
     cudaGraphNodeTypeCount = cyruntime.cudaGraphNodeType.cudaGraphNodeTypeCount{{endif}}
 
@@ -6329,6 +6467,36 @@ class cudaDeviceNumaConfig(_FastEnum):
         'The GPU is a NUMA node, cudaDevAttrNumaId contains its NUMA ID\n'
     ){{endif}}
 
+{{endif}}
+{{if 'cudaFabricOpStatusSource' in found_types}}
+
+class cudaFabricOpStatusSource(_FastEnum):  # each member is emitted only when the template's found_values lists it
+    """
+    Fabric operation status source
+    """
+    {{if 'cudaFabricOpStatusSourceMbarrierV1' in found_values}}
+
+    cudaFabricOpStatusSourceMbarrierV1 = (  # pair of (cyruntime enum value, description text)
+        cyruntime.cudaFabricOpStatusSource.cudaFabricOpStatusSourceMbarrierV1,
+        '1B-aligned 1B-wide status from an mbarrier.layout::v1\n'
+    ){{endif}}
+    {{if 'cudaFabricOpStatusSourceMax' in found_values}}
+    cudaFabricOpStatusSourceMax = cyruntime.cudaFabricOpStatusSource.cudaFabricOpStatusSourceMax{{endif}}
+
+{{endif}}
+{{if 'cudaFabricOpStatusInfo' in found_types}}
+
+class cudaFabricOpStatusInfo(_FastEnum):  # members below are bare cyruntime values; none carries a description string
+    """
+    Fabric operation status info
+    """
+    {{if 'cudaFabricOpStatusInfoSuccess' in found_values}}
+    cudaFabricOpStatusInfoSuccess = cyruntime.cudaFabricOpStatusInfo.cudaFabricOpStatusInfoSuccess{{endif}}
+    {{if 'cudaFabricOpStatusInfoLast' in found_values}}
+    cudaFabricOpStatusInfoLast = cyruntime.cudaFabricOpStatusInfo.cudaFabricOpStatusInfoLast{{endif}}
+    {{if 'cudaFabricOpStatusInfoMax' in found_values}}
+    cudaFabricOpStatusInfoMax = cyruntime.cudaFabricOpStatusInfo.cudaFabricOpStatusInfoMax{{endif}}
+
 {{endif}}
 {{if 'cudaSurfaceBoundaryMode' in found_types}}
 
@@ -7728,6 +7896,35 @@ cdef class cudaStreamCallback_t:
         return <void_ptr>self._pvt_ptr
 {{endif}}
 
+{{if 'cudaGraphRecaptureCallback_t' in found_types}}
+
+cdef class cudaGraphRecaptureCallback_t:
+    """Holder for a single ``cyruntime.cudaGraphRecaptureCallback_t`` value.
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+
+    """
+    def __cinit__(self, void_ptr init_value = 0, void_ptr _ptr = 0):
+        if _ptr == 0:  # no external storage given: use the value embedded in this object
+            self._pvt_ptr = &self._pvt_val
+            self._pvt_ptr[0] = <cyruntime.cudaGraphRecaptureCallback_t>init_value
+        else:  # alias the caller-provided storage; init_value is not applied on this path
+            self._pvt_ptr = <cyruntime.cudaGraphRecaptureCallback_t *>_ptr
+    def __init__(self, *args, **kwargs):
+        pass
+    def __repr__(self):
+        return '<cudaGraphRecaptureCallback_t ' + str(hex(self.__int__())) + '>'
+    def __index__(self):
+        return self.__int__()
+    def __int__(self):
+        return <void_ptr>self._pvt_ptr[0]  # the stored callback value itself, as an integer
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr  # address of the storage that holds the value
+{{endif}}
+
 {{if 'cudaLogsCallback_t' in found_types}}
 
 cdef class cudaLogsCallback_t:
@@ -10761,8 +10958,14 @@ cdef class cudaFuncAttributes:
         higher cluster sizes that’s not guaranteed to be portable. See
         cudaFuncSetAttribute
     {{endif}}
-    {{if 'cudaFuncAttributes.reserved0' in found_struct}}
-    reserved0 : int
+    {{if 'cudaFuncAttributes.deviceNodeUpdateStatus' in found_struct}}
+    deviceNodeUpdateStatus : int
+        Whether the function can be updated on device. 1 means device node
+        update is supported, 0 is unsupported or driver is too old to check
+        the value.
+    {{endif}}
+    {{if 'cudaFuncAttributes.reserved1' in found_struct}}
+    reserved1 : int
 
     {{endif}}
     {{if 'cudaFuncAttributes.reserved' in found_struct}}
@@ -10885,11 +11088,17 @@ cdef class cudaFuncAttributes:
             except ValueError:
                 str_list += ['nonPortableClusterSizeAllowed : <ValueError>']
             {{endif}}
-            {{if 'cudaFuncAttributes.reserved0' in found_struct}}
+            {{if 'cudaFuncAttributes.deviceNodeUpdateStatus' in found_struct}}
             try:
-                str_list += ['reserved0 : ' + str(self.reserved0)]
+                str_list += ['deviceNodeUpdateStatus : ' + str(self.deviceNodeUpdateStatus)]
             except ValueError:
-                str_list += ['reserved0 : <ValueError>']
+                str_list += ['deviceNodeUpdateStatus : <ValueError>']
+            {{endif}}
+            {{if 'cudaFuncAttributes.reserved1' in found_struct}}
+            try:
+                str_list += ['reserved1 : ' + str(self.reserved1)]
+            except ValueError:
+                str_list += ['reserved1 : <ValueError>']
             {{endif}}
             {{if 'cudaFuncAttributes.reserved' in found_struct}}
             try:
@@ -11028,13 +11237,21 @@ cdef class cudaFuncAttributes:
     def nonPortableClusterSizeAllowed(self, int nonPortableClusterSizeAllowed):
         self._pvt_ptr[0].nonPortableClusterSizeAllowed = nonPortableClusterSizeAllowed
     {{endif}}
-    {{if 'cudaFuncAttributes.reserved0' in found_struct}}
+    {{if 'cudaFuncAttributes.deviceNodeUpdateStatus' in found_struct}}
     @property
-    def reserved0(self):
-        return self._pvt_ptr[0].reserved0
-    @reserved0.setter
-    def reserved0(self, int reserved0):
-        self._pvt_ptr[0].reserved0 = reserved0
+    def deviceNodeUpdateStatus(self):
+        return self._pvt_ptr[0].deviceNodeUpdateStatus
+    @deviceNodeUpdateStatus.setter
+    def deviceNodeUpdateStatus(self, int deviceNodeUpdateStatus):
+        self._pvt_ptr[0].deviceNodeUpdateStatus = deviceNodeUpdateStatus
+    {{endif}}
+    {{if 'cudaFuncAttributes.reserved1' in found_struct}}
+    @property
+    def reserved1(self):
+        return self._pvt_ptr[0].reserved1
+    @reserved1.setter
+    def reserved1(self, int reserved1):
+        self._pvt_ptr[0].reserved1 = reserved1
     {{endif}}
     {{if 'cudaFuncAttributes.reserved' in found_struct}}
     @property
@@ -11062,7 +11279,9 @@ cdef class cudaMemLocation:
     {{endif}}
     {{if 'cudaMemLocation.id' in found_struct}}
     id : int
-        identifier for a given this location's ::CUmemLocationType.
+        Identifier for cudaMemLocationType::cudaMemLocationTypeDevice,
+        cudaMemLocationType::cudaMemLocationTypeHost, or
+        cudaMemLocationType::cudaMemLocationTypeHostNuma.
     {{endif}}
 
     Methods
@@ -11072,13 +11291,15 @@ cdef class cudaMemLocation:
     """
     def __cinit__(self, void_ptr _ptr = 0):
         if _ptr == 0:
-            self._pvt_ptr = &self._pvt_val
+            self._val_ptr = <cyruntime.cudaMemLocation *>calloc(1, sizeof(cyruntime.cudaMemLocation))
+            self._pvt_ptr = self._val_ptr
         else:
             self._pvt_ptr = <cyruntime.cudaMemLocation *>_ptr
     def __init__(self, void_ptr _ptr = 0):
         pass
     def __dealloc__(self):
-        pass
+        if self._val_ptr is not NULL:
+            free(self._val_ptr)
     def getPtr(self):
         return <void_ptr>self._pvt_ptr
     def __repr__(self):
@@ -12150,7 +12371,7 @@ cdef class anon_struct7:
 {{endif}}
 {{if 'cudaMemcpy3DOperand.op' in found_struct}}
 
-cdef class anon_union1:
+cdef class anon_union2:
     """
     Attributes
     ----------
@@ -12231,7 +12452,7 @@ cdef class cudaMemcpy3DOperand:
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op' in found_struct}}
-    op : anon_union1
+    op : anon_union2
 
     {{endif}}
 
@@ -12249,7 +12470,7 @@ cdef class cudaMemcpy3DOperand:
     def __init__(self, void_ptr _ptr = 0):
         pass
         {{if 'cudaMemcpy3DOperand.op' in found_struct}}
-        self._op = anon_union1(_ptr=<void_ptr>self._pvt_ptr)
+        self._op = anon_union2(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -12287,8 +12508,8 @@ cdef class cudaMemcpy3DOperand:
     def op(self):
         return self._op
     @op.setter
-    def op(self, op not None : anon_union1):
-        string.memcpy(&self._pvt_ptr[0].op, <cyruntime.anon_union1*><void_ptr>op.getPtr(), sizeof(self._pvt_ptr[0].op))
+    def op(self, op not None : anon_union2):
+        string.memcpy(&self._pvt_ptr[0].op, <cyruntime.anon_union2*><void_ptr>op.getPtr(), sizeof(self._pvt_ptr[0].op))
     {{endif}}
 {{endif}}
 {{if 'cudaMemcpy3DBatchOp' in found_struct}}
@@ -14459,7 +14680,7 @@ cdef class anon_struct8:
 {{endif}}
 {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
 
-cdef class anon_union2:
+cdef class anon_union3:
     """
     Attributes
     ----------
@@ -14556,7 +14777,7 @@ cdef class cudaExternalMemoryHandleDesc:
         Type of the handle
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
-    handle : anon_union2
+    handle : anon_union3
 
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.size' in found_struct}}
@@ -14586,7 +14807,7 @@ cdef class cudaExternalMemoryHandleDesc:
     def __init__(self, void_ptr _ptr = 0):
         pass
         {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
-        self._handle = anon_union2(_ptr=<void_ptr>self._pvt_ptr)
+        self._handle = anon_union3(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -14642,8 +14863,8 @@ cdef class cudaExternalMemoryHandleDesc:
     def handle(self):
         return self._handle
     @handle.setter
-    def handle(self, handle not None : anon_union2):
-        string.memcpy(&self._pvt_ptr[0].handle, <cyruntime.anon_union2*><void_ptr>handle.getPtr(), sizeof(self._pvt_ptr[0].handle))
+    def handle(self, handle not None : anon_union3):
+        string.memcpy(&self._pvt_ptr[0].handle, <cyruntime.anon_union3*><void_ptr>handle.getPtr(), sizeof(self._pvt_ptr[0].handle))
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.size' in found_struct}}
     @property
@@ -14990,7 +15211,7 @@ cdef class anon_struct9:
 {{endif}}
 {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
 
-cdef class anon_union3:
+cdef class anon_union4:
     """
     Attributes
     ----------
@@ -15087,7 +15308,7 @@ cdef class cudaExternalSemaphoreHandleDesc:
         Type of the handle
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
-    handle : anon_union3
+    handle : anon_union4
 
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.flags' in found_struct}}
@@ -15113,7 +15334,7 @@ cdef class cudaExternalSemaphoreHandleDesc:
     def __init__(self, void_ptr _ptr = 0):
         pass
         {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
-        self._handle = anon_union3(_ptr=<void_ptr>self._pvt_ptr)
+        self._handle = anon_union4(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -15163,8 +15384,8 @@ cdef class cudaExternalSemaphoreHandleDesc:
     def handle(self):
         return self._handle
     @handle.setter
-    def handle(self, handle not None : anon_union3):
-        string.memcpy(&self._pvt_ptr[0].handle, <cyruntime.anon_union3*><void_ptr>handle.getPtr(), sizeof(self._pvt_ptr[0].handle))
+    def handle(self, handle not None : anon_union4):
+        string.memcpy(&self._pvt_ptr[0].handle, <cyruntime.anon_union4*><void_ptr>handle.getPtr(), sizeof(self._pvt_ptr[0].handle))
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.flags' in found_struct}}
     @property
@@ -15231,7 +15452,7 @@ cdef class anon_struct10:
 {{endif}}
 {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}}
 
-cdef class anon_union4:
+cdef class anon_union5:
     """
     Attributes
     ----------
@@ -15351,7 +15572,7 @@ cdef class anon_struct12:
 
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}}
-    nvSciSync : anon_union4
+    nvSciSync : anon_union5
 
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}}
@@ -15377,7 +15598,7 @@ cdef class anon_struct12:
         self._fence = anon_struct10(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
         {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}}
-        self._nvSciSync = anon_union4(_ptr=<void_ptr>self._pvt_ptr)
+        self._nvSciSync = anon_union5(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
         {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}}
         self._keyedMutex = anon_struct11(_ptr=<void_ptr>self._pvt_ptr)
@@ -15429,8 +15650,8 @@ cdef class anon_struct12:
     def nvSciSync(self):
         return self._nvSciSync
     @nvSciSync.setter
-    def nvSciSync(self, nvSciSync not None : anon_union4):
-        string.memcpy(&self._pvt_ptr[0].params.nvSciSync, <cyruntime.anon_union4*><void_ptr>nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync))
+    def nvSciSync(self, nvSciSync not None : anon_union5):
+        string.memcpy(&self._pvt_ptr[0].params.nvSciSync, <cyruntime.anon_union5*><void_ptr>nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync))
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}}
     @property
@@ -15593,7 +15814,7 @@ cdef class anon_struct13:
 {{endif}}
 {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}}
 
-cdef class anon_union5:
+cdef class anon_union6:
     """
     Attributes
     ----------
@@ -15731,7 +15952,7 @@ cdef class anon_struct15:
 
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}}
-    nvSciSync : anon_union5
+    nvSciSync : anon_union6
 
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}}
@@ -15757,7 +15978,7 @@ cdef class anon_struct15:
         self._fence = anon_struct13(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
         {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}}
-        self._nvSciSync = anon_union5(_ptr=<void_ptr>self._pvt_ptr)
+        self._nvSciSync = anon_union6(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
         {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}}
         self._keyedMutex = anon_struct14(_ptr=<void_ptr>self._pvt_ptr)
@@ -15809,8 +16030,8 @@ cdef class anon_struct15:
     def nvSciSync(self):
         return self._nvSciSync
     @nvSciSync.setter
-    def nvSciSync(self, nvSciSync not None : anon_union5):
-        string.memcpy(&self._pvt_ptr[0].params.nvSciSync, <cyruntime.anon_union5*><void_ptr>nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync))
+    def nvSciSync(self, nvSciSync not None : anon_union6):
+        string.memcpy(&self._pvt_ptr[0].params.nvSciSync, <cyruntime.anon_union6*><void_ptr>nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync))
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}}
     @property
@@ -16203,7 +16424,7 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
-        Reserved for future use - ensure this is is zero initialized.
+        Reserved for future use - ensure this is zero initialized.
     {{endif}}
 
     Methods
@@ -18718,7 +18939,7 @@ cdef class anon_struct16:
 {{endif}}
 {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}}
 
-cdef class anon_union9:
+cdef class anon_union10:
     """
     Attributes
     ----------
@@ -18823,7 +19044,7 @@ cdef class cudaGraphKernelNodeUpdate:
         interpreted
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}}
-    updateData : anon_union9
+    updateData : anon_union10
         Update data to apply. Which field is used depends on field's value
     {{endif}}
 
@@ -18844,7 +19065,7 @@ cdef class cudaGraphKernelNodeUpdate:
         self._node = cudaGraphDeviceNode_t(_ptr=<void_ptr>&self._pvt_ptr[0].node)
         {{endif}}
         {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}}
-        self._updateData = anon_union9(_ptr=<void_ptr>self._pvt_ptr)
+        self._updateData = anon_union10(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -18905,8 +19126,8 @@ cdef class cudaGraphKernelNodeUpdate:
     def updateData(self):
         return self._updateData
     @updateData.setter
-    def updateData(self, updateData not None : anon_union9):
-        string.memcpy(&self._pvt_ptr[0].updateData, <cyruntime.anon_union9*><void_ptr>updateData.getPtr(), sizeof(self._pvt_ptr[0].updateData))
+    def updateData(self, updateData not None : anon_union10):
+        string.memcpy(&self._pvt_ptr[0].updateData, <cyruntime.anon_union10*><void_ptr>updateData.getPtr(), sizeof(self._pvt_ptr[0].updateData))
     {{endif}}
 {{endif}}
 {{if 'cudaLaunchMemSyncDomainMap_st' in found_struct}}
@@ -19943,7 +20164,7 @@ cdef class anon_struct22:
 {{endif}}
 {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
 
-cdef class anon_union10:
+cdef class anon_union11:
     """
     Attributes
     ----------
@@ -20003,7 +20224,7 @@ cdef class cudaAsyncNotificationInfo:
         The type of notification being sent
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
-    info : anon_union10
+    info : anon_union11
         Information about the notification. `typename` must be checked in
         order to interpret this field.
     {{endif}}
@@ -20022,7 +20243,7 @@ cdef class cudaAsyncNotificationInfo:
     def __init__(self, void_ptr _ptr = 0):
         pass
         {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
-        self._info = anon_union10(_ptr=<void_ptr>self._pvt_ptr)
+        self._info = anon_union11(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -20060,8 +20281,8 @@ cdef class cudaAsyncNotificationInfo:
     def info(self):
         return self._info
     @info.setter
-    def info(self, info not None : anon_union10):
-        string.memcpy(&self._pvt_ptr[0].info, <cyruntime.anon_union10*><void_ptr>info.getPtr(), sizeof(self._pvt_ptr[0].info))
+    def info(self, info not None : anon_union11):
+        string.memcpy(&self._pvt_ptr[0].info, <cyruntime.anon_union11*><void_ptr>info.getPtr(), sizeof(self._pvt_ptr[0].info))
     {{endif}}
 {{endif}}
 {{if 'cudaTextureDesc' in found_struct}}
@@ -20330,6 +20551,88 @@ cdef class cudaTextureDesc:
         self._pvt_ptr[0].seamlessCubemap = seamlessCubemap
     {{endif}}
 {{endif}}
+{{if 'cudaGraphRecaptureCallbackData' in found_struct}}
+
+cdef class cudaGraphRecaptureCallbackData:
+    """
+    Struct of user callback data that is invoked when node parameter
+    mismatches are detected while recapturing to an existing graph
+
+    Attributes
+    ----------
+    {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
+    callbackFunc : cudaGraphRecaptureCallback_t
+        Callback function that will be invoked
+    {{endif}}
+    {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
+    userData : Any
+        Generic pointer that is passed to the callback function (read back as an integer address)
+    {{endif}}
+
+    Methods
+    -------
+    getPtr()
+        Get memory address of class instance
+    """
+    def __cinit__(self, void_ptr _ptr = 0):
+        if _ptr == 0:  # no external struct given: use the one embedded in this object
+            self._pvt_ptr = &self._pvt_val
+        else:  # alias a struct owned elsewhere
+            self._pvt_ptr = <cyruntime.cudaGraphRecaptureCallbackData *>_ptr
+    def __init__(self, void_ptr _ptr = 0):
+        pass
+        {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
+        self._callbackFunc = cudaGraphRecaptureCallback_t(_ptr=<void_ptr>&self._pvt_ptr[0].callbackFunc)  # wrapper points at the struct field itself
+        {{endif}}
+    def __dealloc__(self):
+        pass  # nothing to release: __cinit__ performs no allocation
+    def getPtr(self):
+        return <void_ptr>self._pvt_ptr
+    def __repr__(self):
+        if self._pvt_ptr is not NULL:
+            str_list = []  # one "name : value" entry per field present in the template
+            {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
+            try:
+                str_list += ['callbackFunc : ' + str(self.callbackFunc)]
+            except ValueError:
+                str_list += ['callbackFunc : <ValueError>']
+            {{endif}}
+            {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
+            try:
+                str_list += ['userData : ' + hex(self.userData)]  # getter yields an integer, shown in hex
+            except ValueError:
+                str_list += ['userData : <ValueError>']
+            {{endif}}
+            return '\n'.join(str_list)
+        else:
+            return ''
+    {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
+    @property
+    def callbackFunc(self):
+        return self._callbackFunc
+    @callbackFunc.setter
+    def callbackFunc(self, callbackFunc):
+        cdef cyruntime.cudaGraphRecaptureCallback_t cycallbackFunc
+        if callbackFunc is None:  # None clears the callback (stored as 0)
+            cycallbackFunc = <cyruntime.cudaGraphRecaptureCallback_t><void_ptr>0
+        elif isinstance(callbackFunc, (cudaGraphRecaptureCallback_t)):  # already a wrapper: take its integer value
+            pcallbackFunc = int(callbackFunc)
+            cycallbackFunc = <cyruntime.cudaGraphRecaptureCallback_t><void_ptr>pcallbackFunc
+        else:  # anything else is handed to the wrapper constructor as init_value, then read back as an integer
+            pcallbackFunc = int(cudaGraphRecaptureCallback_t(callbackFunc))
+            cycallbackFunc = <cyruntime.cudaGraphRecaptureCallback_t><void_ptr>pcallbackFunc
+        self._callbackFunc._pvt_ptr[0] = cycallbackFunc  # write through the wrapper created in __init__
+    {{endif}}
+    {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
+    @property
+    def userData(self):
+        return <void_ptr>self._pvt_ptr[0].userData
+    @userData.setter
+    def userData(self, userData):
+        self._cyuserData = _HelperInputVoidPtr(userData)  # kept on self; presumably so the memory behind cptr stays alive - confirm
+        self._pvt_ptr[0].userData = <void*><void_ptr>self._cyuserData.cptr
+    {{endif}}
+{{endif}}
 {{if True}}
 
 cdef class cudaEglPlaneDesc_st:
@@ -20494,7 +20797,7 @@ cdef class cudaEglPlaneDesc_st:
 {{endif}}
 {{if True}}
 
-cdef class anon_union11:
+cdef class anon_union12:
     """
     Attributes
     ----------
@@ -20584,7 +20887,7 @@ cdef class cudaEglFrame_st:
     Attributes
     ----------
     {{if True}}
-    frame : anon_union11
+    frame : anon_union12
 
     {{endif}}
     {{if True}}
@@ -20618,7 +20921,7 @@ cdef class cudaEglFrame_st:
     def __init__(self, void_ptr _ptr = 0):
         pass
         {{if True}}
-        self._frame = anon_union11(_ptr=<void_ptr>self._pvt_ptr)
+        self._frame = anon_union12(_ptr=<void_ptr>self._pvt_ptr)
         {{endif}}
     def __dealloc__(self):
         if self._val_ptr is not NULL:
@@ -20666,8 +20969,8 @@ cdef class cudaEglFrame_st:
     def frame(self):
         return self._frame
     @frame.setter
-    def frame(self, frame not None : anon_union11):
-        string.memcpy(&self._pvt_ptr[0].frame, <cyruntime.anon_union11*><void_ptr>frame.getPtr(), sizeof(self._pvt_ptr[0].frame))
+    def frame(self, frame not None : anon_union12):
+        string.memcpy(&self._pvt_ptr[0].frame, <cyruntime.anon_union12*><void_ptr>frame.getPtr(), sizeof(self._pvt_ptr[0].frame))
     {{endif}}
     {{if True}}
     @property
@@ -23931,6 +24234,82 @@ def cudaStreamBeginCapture(stream, mode not None : cudaStreamCaptureMode):
     return (_cudaError_t(err),)
 {{endif}}
 
+{{if 'cudaStreamBeginRecaptureToGraph' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMode, graph, callbackData : Optional[cudaGraphRecaptureCallbackData]):
+    """ Begin graph capture on a stream to an existing graph.
+
+    Begin graph capture on `stream` to the existing `graph`. The node
+    creation order while recapturing the graph must be identical to the
+    original graph. The recapture will fail immediately for:
+
+    - Topology mismatches between the existing graph and the recaptured
+      graph
+
+    - Parameter mismatches for memory allocation or free nodes
+
+    Any other node parameter mismatches during recapture can be configured
+    to call the `callbackFunc` member of `callbackData`. The recapture will
+    fail immediately if the callback returns anything other than
+    cudaSuccess.
+
+    If the recapture fails for any reason, the `graph` will be in an
+    undefined state and should be destroyed.
+
+    See cudaStreamBeginCapture for additional detail on beginning the
+    capture.
+
+    Parameters
+    ----------
+    stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
+        Stream in which to initiate capture
+    mode : :py:obj:`~.cudaStreamCaptureMode`
+        Controls the interaction of this capture sequence with other API
+        calls that are potentially unsafe. For more details see
+        :py:obj:`~.cudaThreadExchangeStreamCaptureMode`.
+    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
+        Existing CUDA graph to be captured into
+    callbackData : :py:obj:`~.cudaGraphRecaptureCallbackData`
+        Optional struct of callback data that will be invoked for all
+        parameter mismatches from the original graph. `None` passes NULL.
+
+    Returns
+    -------
+    cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorDeinitialized`, :py:obj:`~.cudaErrorNotInitialized`, :py:obj:`~.cudaErrorInvalidValue`,
+
+    See Also
+    --------
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamBeginCapture`, :py:obj:`~.cudaStreamIsCapturing`, :py:obj:`~.cudaStreamEndCapture`, :py:obj:`~.cudaThreadExchangeStreamCaptureMode`
+
+    Notes
+    -----
+    Any user objects associated with `graph` will be released prior to the recapture.
+    """
+    cdef cyruntime.cudaGraph_t cygraph
+    if graph is None:
+        pgraph = 0  # None becomes a zero handle value
+    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
+        pgraph = int(graph)  # runtime/driver wrapper objects are converted with int()
+    else:
+        pgraph = int(cudaGraph_t(graph))  # any other input is first passed through cudaGraph_t
+    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
+    cdef cyruntime.cudaStream_t cystream
+    if stream is None:
+        pstream = 0  # None becomes a zero handle value
+    elif isinstance(stream, (cudaStream_t,driver.CUstream)):
+        pstream = int(stream)  # runtime/driver wrapper objects are converted with int()
+    else:
+        pstream = int(cudaStream_t(stream))  # any other input is first passed through cudaStream_t
+    cystream = <cyruntime.cudaStream_t><void_ptr>pstream
+    cdef cyruntime.cudaStreamCaptureMode cymode = int(mode)
+    cdef cyruntime.cudaGraphRecaptureCallbackData* cycallbackData_ptr = <cyruntime.cudaGraphRecaptureCallbackData*>callbackData._pvt_ptr if callbackData is not None else NULL
+    with nogil:  # the runtime call is made with the GIL released
+        err = cyruntime.cudaStreamBeginRecaptureToGraph(cystream, cymode, cygraph, cycallbackData_ptr)
+    return (_cudaError_t(err),)  # 1-tuple holding the wrapped error code
+{{endif}}
+
 {{if 'cudaStreamBeginCaptureToGraph' in found_functions}}
 
 @cython.embedsignature(True)
@@ -31347,7 +31726,7 @@ def cudaMemGetDefaultMemPool(location : Optional[cudaMemLocation], typename not
 
     The memory location can be of one of
     :py:obj:`~.cudaMemLocationTypeDevice`,
-    :py:obj:`~.cudaMemLocationTypeHost` or
+    :py:obj:`~.cudaMemLocationTypeHost`, or
     :py:obj:`~.cudaMemLocationTypeHostNuma`. The allocation type can be one
     of :py:obj:`~.cudaMemAllocationTypePinned` or
     :py:obj:`~.cudaMemAllocationTypeManaged`. When the allocation type is
@@ -31392,7 +31771,7 @@ def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None :
 
     The memory location can be of one of
     :py:obj:`~.cudaMemLocationTypeDevice`,
-    :py:obj:`~.cudaMemLocationTypeHost` or
+    :py:obj:`~.cudaMemLocationTypeHost`, or
     :py:obj:`~.cudaMemLocationTypeHostNuma`. The allocation type can be one
     of :py:obj:`~.cudaMemAllocationTypePinned` or
     :py:obj:`~.cudaMemAllocationTypeManaged`. When the allocation type is
@@ -42119,6 +42498,12 @@ def sizeof(objType):
     {{if 'cudaStreamCallback_t' in found_types}}
     if objType == cudaStreamCallback_t:
         return sizeof(cyruntime.cudaStreamCallback_t){{endif}}
+    {{if 'cudaGraphRecaptureCallback_t' in found_types}}
+    if objType == cudaGraphRecaptureCallback_t:
+        return sizeof(cyruntime.cudaGraphRecaptureCallback_t){{endif}}
+    {{if 'cudaGraphRecaptureCallbackData' in found_struct}}
+    if objType == cudaGraphRecaptureCallbackData:
+        return sizeof(cyruntime.cudaGraphRecaptureCallbackData){{endif}}
     {{if 'cudaLogsCallback_t' in found_types}}
     if objType == cudaLogsCallback_t:
         return sizeof(cyruntime.cudaLogsCallback_t){{endif}}
diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst
index dd305dfce0a..04fdbaba6ed 100644
--- a/cuda_bindings/docs/source/module/driver.rst
+++ b/cuda_bindings/docs/source/module/driver.rst
@@ -1,4 +1,4 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 ------
@@ -1603,7 +1603,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN
 
 
-        Maximum optin shared memory per block
+        Maximum optin shared memory per block. That is shared memory that is available for dynamic allocation or static allocation (including architecture specific static shared memory) on this device but is not guaranteed to be portable.
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES
@@ -1918,6 +1918,42 @@ Data types used by CUDA driver
         Device supports atomic reduction operations in stream batch memory operations
 
 
+    .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED
+
+
+        Device supports CIG streams with D3D12
+
+
+    .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED
+
+
+        Device supports mmap() of dmabuf file descriptors for CUDA device memory allocations
+
+
+    .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED
+
+
+        Device supports unicast logical endpoints
+
+
+    .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED
+
+
+        Device supports multicast logical endpoints
+
+
+    .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED
+
+
+        Device supports counted operations via logical endpoints
+
+
+    .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED
+
+
+        Device supports unicast logical endpoint access on the owner device
+
+
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX
 
 .. autoclass:: cuda.bindings.driver.CUpointer_attribute
@@ -2173,6 +2209,12 @@ Data types used by CUDA driver
         The block scheduling policy of a function. The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
 
 
+    .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED
+
+
+        Whether the function can be updated on device. 1 means device node update is supported, 0 is unsupported. See :py:obj:`~.cuFuncGetAttribute`.
+
+
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX
 
 .. autoclass:: cuda.bindings.driver.CUfunc_cache
@@ -3325,6 +3367,12 @@ Data types used by CUDA driver
 
                                                 call :py:obj:`~.cudaGraphSetConditional` from device code.
 
+
+    .. autoattribute:: cuda.bindings.driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_RESERVED_16
+
+
+        Reserved
+
 .. autoclass:: cuda.bindings.driver.CUgraphDependencyType
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDependencyType.CU_GRAPH_DEPENDENCY_TYPE_DEFAULT
@@ -4311,6 +4359,12 @@ Data types used by CUDA driver
         This error indicates that the requested operation is not permitted because the stream is in a detached state. This can occur if the green context associated with the stream has been destroyed, limiting the stream's operational capabilities.
 
 
+    .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_GRAPH_RECAPTURE_FAILURE
+
+
+        This error indicates that a graph recapture failed and had to be terminated.
+
+
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_UNKNOWN
 
 
@@ -6986,6 +7040,59 @@ Support for multicast on a specific device can be queried using the device attri
 .. autofunction:: cuda.bindings.driver.cuMulticastUnbind
 .. autofunction:: cuda.bindings.driver.cuMulticastGetGranularity
 
+Logical Endpoint
+----------------
+
+This section describes the logical endpoint functions of the low-level CUDA driver application programming interface.
+
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointFabricHandle_st
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointProp_struct
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointIpcHandleType
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointIpcHandleType.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE
+
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointIpcHandleType.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC
+
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointType
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointType.CU_LOGICAL_ENDPOINT_TYPE_INVALID
+
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointType.CU_LOGICAL_ENDPOINT_TYPE_UNICAST
+
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointType.CU_LOGICAL_ENDPOINT_TYPE_MULTICAST
+
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointFlag
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointFlag.CU_LOGICAL_ENDPOINT_FLAG_NONE
+
+
+        Default flag for logical endpoint construction
+
+
+    .. autoattribute:: cuda.bindings.driver.CUlogicalEndpointFlag.CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS
+
+
+        Indicate the programmer's intention to use counted operations with the logical endpoint
+
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointId
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointFabricHandle
+.. autoclass:: cuda.bindings.driver.CUlogicalEndpointProp
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointIdReserve
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointIdRelease
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointCreate
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointAddDevice
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointDestroy
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointBindAddr
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointBindMem
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointUnbind
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointExport
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointImport
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointGetLimits
+.. autofunction:: cuda.bindings.driver.cuLogicalEndpointQuery
+
 Unified Addressing
 ------------------
 
@@ -7077,6 +7184,26 @@ Stream Management
 
 This section describes the stream management functions of the low-level CUDA driver application programming interface.
 
+.. autoclass:: cuda.bindings.driver.CUgraphRecaptureStatus
+
+    .. autoattribute:: cuda.bindings.driver.CUgraphRecaptureStatus.CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE
+
+
+        Node is eligible for update in an instantiated graph.
+
+
+    .. autoattribute:: cuda.bindings.driver.CUgraphRecaptureStatus.CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE
+
+
+        Parameter changes in the node cannot be applied to an instantiated graph.
+
+
+    .. autoattribute:: cuda.bindings.driver.CUgraphRecaptureStatus.CU_GRAPH_RECAPTURE_ERROR
+
+
+        Error while attempting to recapture the node. The recapture will be ended regardless of the return value from the callback.
+
+.. autoclass:: cuda.bindings.driver.CUgraphRecaptureCallback
 .. autofunction:: cuda.bindings.driver.cuStreamCreate
 .. autofunction:: cuda.bindings.driver.cuStreamCreateWithPriority
 .. autofunction:: cuda.bindings.driver.cuStreamBeginCaptureToCig
@@ -7090,6 +7217,7 @@ This section describes the stream management functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuStreamWaitEvent
 .. autofunction:: cuda.bindings.driver.cuStreamAddCallback
 .. autofunction:: cuda.bindings.driver.cuStreamBeginCapture
+.. autofunction:: cuda.bindings.driver.cuStreamBeginRecaptureToGraph
 .. autofunction:: cuda.bindings.driver.cuStreamBeginCaptureToGraph
 .. autofunction:: cuda.bindings.driver.cuThreadExchangeStreamCaptureMode
 .. autofunction:: cuda.bindings.driver.cuStreamEndCapture
@@ -7193,7 +7321,6 @@ This section describes the execution control functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuLaunchKernel
 .. autofunction:: cuda.bindings.driver.cuLaunchKernelEx
 .. autofunction:: cuda.bindings.driver.cuLaunchCooperativeKernel
-.. autofunction:: cuda.bindings.driver.cuLaunchCooperativeKernelMultiDevice
 .. autofunction:: cuda.bindings.driver.cuLaunchHostFunc
 .. autofunction:: cuda.bindings.driver.cuLaunchHostFunc_v2
 
@@ -7421,6 +7548,15 @@ This section describes the coredump attribute control functions of the low-level
     .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_GZIP_COMPRESS
 
 
+    .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_FAULTED_CONTEXTS_ONLY
+
+
+    .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_NO_ERRBAR_AT_EXIT
+
+
+    .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_LOG_ONLY
+
+
     .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_LIGHTWEIGHT_FLAGS
 
 .. autoclass:: cuda.bindings.driver.CUcoredumpCallbackHandle
@@ -7623,10 +7759,13 @@ Additionally, there are two known scenarios, where its possible for the workload
 .. autoclass:: cuda.bindings.driver.CUdevResource
 .. autoclass:: cuda.bindings.driver.CUgreenCtxCreate_flags
 
+    .. autoattribute:: cuda.bindings.driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_NONE
+
+
     .. autoattribute:: cuda.bindings.driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM
 
 
-        Required. Creates a default stream to use inside the green context
+        Creates a default stream to use inside the green context
 
 .. autoclass:: cuda.bindings.driver.CUdevSmResourceGroup_flags
 
diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst
index d747ae0deb8..7c0da681418 100644
--- a/cuda_bindings/docs/source/module/nvrtc.rst
+++ b/cuda_bindings/docs/source/module/nvrtc.rst
@@ -1,4 +1,4 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 -----
@@ -65,6 +65,9 @@ NVRTC defines the following enumeration type and function for API call error han
 
     .. autoattribute:: cuda.bindings.nvrtc.nvrtcResult.NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED
 
+
+    .. autoattribute:: cuda.bindings.nvrtc.nvrtcResult.NVRTC_ERROR_BUSY
+
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetErrorString
 
 General Information Query
@@ -98,6 +101,8 @@ NVRTC defines the following type and functions for actual compilation.
 .. autofunction:: cuda.bindings.nvrtc.nvrtcAddNameExpression
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetLoweredName
 .. autofunction:: cuda.bindings.nvrtc.nvrtcSetFlowCallback
+.. autofunction:: cuda.bindings.nvrtc.nvrtcGetTileIRSize
+.. autofunction:: cuda.bindings.nvrtc.nvrtcGetTileIR
 
 Precompiled header (PCH) (CUDA 12.8+)
 -------------------------------------
@@ -109,6 +114,28 @@ NVRTC defines the following function related to PCH. Also see PCH related flags
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetPCHCreateStatus
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetPCHHeapSizeRequired
 
+Bundled Headers Installation
+----------------------------
+
+NVRTC defines the following types and functions for bundled headers installation and management.
+
+.. autoclass:: cuda.bindings.nvrtc.nvrtcBundledHeadersInfo
+.. autofunction:: cuda.bindings.nvrtc.nvrtcInstallBundledHeaders
+.. autofunction:: cuda.bindings.nvrtc.nvrtcGetBundledHeadersInfo
+.. autofunction:: cuda.bindings.nvrtc.nvrtcRemoveBundledHeaders
+.. autoattribute:: cuda.bindings.nvrtc.NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS
+
+    Flags for nvrtcInstallBundledHeaders. Skip installation if version marker exists and version matches. This is the default behavior when flags=0.
+
+.. autoattribute:: cuda.bindings.nvrtc.NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE
+
+    Clear existing directory contents before installation. Guarantees consistency by removing any existing files first.
+
+.. autoattribute:: cuda.bindings.nvrtc.NVRTC_INSTALL_HEADERS_NO_WAIT
+
+    Return NVRTC_ERROR_BUSY immediately if installation is in progress by another process, instead of waiting for the lock. Can be combined with FORCE_OVERWRITE using bitwise OR.
+
+
 Supported Compile Options
 -------------------------
 
@@ -190,6 +217,66 @@ Do extensible whole program compilation of device code.
 
 
 
+- Tile compilation
+
+
+
+
+
+  - ``--enable-tile``\  (``-enable-tile``\ )
+
+Enable support for Tile constructs (e.g. ``__tile__``\  ) and define the macro ``__CUDACC_TILE__``\  .
+
+
+
+
+
+
+
+  - ``--tile-only``\  (``-tile-only``\ )
+
+Enable support for parsing Tile constructs and define the macro ``__CUDACC_TILE__``\ , but omit code generation for non-Tile code (e.g. ``__global__``\  function).
+
+
+
+
+
+
+
+  - ``--simt-only``\  (``-simt-only``\ )
+
+Enable support for parsing Tile constructs and define the macro ``__CUDACC_TILE__``\ , but omit code generation for Tile code (e.g. ``__tile_global__``\  function).
+
+
+
+
+
+
+
+  - ``--default-tile``\  (``-default-tile``\ )
+
+Consider an unannotated function or static storage duration variable as having an implicit ``__tile__``\  annotation. If the static storage duration variable violates Tile restrictions (e.g. cannot be of pointer type), then the implicit annotation is silently omitted by default; use ``-diagnose-implicit-tile-var``\  to enable compiler diagnostics for such cases.
+
+``-default-tile``\  can be used in conjunction with ``-default-device``\ . If both flags are specified, the unannotated function and/or static storage duration variable will be considered to have both ``__tile__``\  and ``__device__``\  implicit annotations. There will be a separate copy of the function/variable in the generated SIMT and cuda_tile programs, at present.
+
+
+
+
+
+
+
+  - ``--diagnose-implicit-tile-var``\  (``-diagnose-implicit-tile-var``\ )
+
+When combined with ``-default-tile``\ , diagnose static storage duration variables that could not be implicitly annotated with the ``__tile__``\  memory space specifier (e.g., due to pointer type restrictions).
+
+
+
+
+
+
+
+
+
 - Debugging support
 
 
@@ -468,6 +555,16 @@ Add the directory ``<dir>``\  to the list of directories to be searched for head
 
 
 
+  - ``--use-bundled-headers=<dir>``\  
+
+Install bundled CUDA headers to ``<dir>``\  and add include paths. This is a convenience flag that combines calling nvrtcInstallBundledHeaders and adding ``-I<dir>``\  and ``-I<dir>/cccl``\  to the include search path. Headers are installed only if they don't already exist at the specified location.
+
+
+
+
+
+
+
   - ``--pre-include=<header>``\  (``-include``\ )
 
 Preinclude ``<header>``\  during preprocessing.
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst
index 0da84a3922c..5f7b5177560 100644
--- a/cuda_bindings/docs/source/module/runtime.rst
+++ b/cuda_bindings/docs/source/module/runtime.rst
@@ -1,4 +1,4 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 -------
@@ -884,6 +884,12 @@ Data types used by CUDA Runtime
         This error indicates that the requested operation is not permitted because the stream is in a detached state. This can occur if the green context associated with the stream has been destroyed, limiting the stream's operational capabilities.
 
 
+    .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorGraphRecaptureFailure
+
+
+        This error indicates that a graph recapture failed and had to be terminated.
+
+
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnknown
 
 
@@ -1085,6 +1091,90 @@ Data types used by CUDA Runtime
 
         4 channel unsigned normalized (10-bit, 10-bit, 10-bit, 2-bit) format
 
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Packed422
+
+
+        4 channel unsigned 8-bit packed format, with 4:2:2 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Packed444
+
+
+        4 channel unsigned 8-bit packed format, with 4:4:4 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8SemiPlanar420
+
+
+        3 channel unsigned 8-bit semi-planar format, with 4:2:0 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16SemiPlanar420
+
+
+        3 channel unsigned 16-bit semi-planar format, with 4:2:0 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8SemiPlanar422
+
+
+        3 channel unsigned 8-bit semi-planar format, with 4:2:2 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16SemiPlanar422
+
+
+        3 channel unsigned 16-bit semi-planar format, with 4:2:2 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8SemiPlanar444
+
+
+        3 channel unsigned 8-bit semi-planar format, with 4:4:4 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16SemiPlanar444
+
+
+        3 channel unsigned 16-bit semi-planar format, with 4:4:4 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Planar420
+
+
+        3 channel unsigned 8-bit planar format, with 4:2:0 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16Planar420
+
+
+        3 channel unsigned 16-bit planar format, with 4:2:0 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Planar422
+
+
+        3 channel unsigned 8-bit planar format, with 4:2:2 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16Planar422
+
+
+        3 channel unsigned 16-bit planar format, with 4:2:2 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned8Planar444
+
+
+        3 channel unsigned 8-bit planar format, with 4:4:4 sampling
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaChannelFormatKind.cudaChannelFormatKindUnsigned16Planar444
+
+
+        3 channel unsigned 16-bit planar format, with 4:4:4 sampling
+
 .. autoclass:: cuda.bindings.runtime.cudaMemoryType
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemoryType.cudaMemoryTypeUnregistered
@@ -1179,6 +1269,25 @@ Data types used by CUDA Runtime
 
         Stream is part of a capture sequence that has been invalidated, but not terminated
 
+.. autoclass:: cuda.bindings.runtime.cudaGraphRecaptureStatus
+
+    .. autoattribute:: cuda.bindings.runtime.cudaGraphRecaptureStatus.cudaGraphRecaptureEligibleForUpdate
+
+
+        Node is eligible for update in an instantiated graph.
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaGraphRecaptureStatus.cudaGraphRecaptureIneligibleForUpdate
+
+
+        Parameter changes in the node cannot be applied to an instantiated graph.
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaGraphRecaptureStatus.cudaGraphRecaptureError
+
+
+        Error while attempting to recapture the node. The recapture will be ended regardless of the return value from the callback.
+
 .. autoclass:: cuda.bindings.runtime.cudaStreamCaptureMode
 
     .. autoattribute:: cuda.bindings.runtime.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal
@@ -1578,6 +1687,25 @@ Data types used by CUDA Runtime
 
         Block compressed 7
 
+.. autoclass:: cuda.bindings.runtime.cudaSharedMemoryMode
+
+    .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeDefault
+
+
+        The default to use for allowing non-portable shared memory size on launch - uses current function attributes for :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize`
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeRequirePortable
+
+
+        Specifies that the shared memory size requested must be a portable size within :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock`
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeAllowNonPortable
+
+
+        Specifies that the shared memory size requested may be a non-portable size up to :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`
+
 .. autoclass:: cuda.bindings.runtime.cudaFuncAttribute
 
     .. autoattribute:: cuda.bindings.runtime.cudaFuncAttribute.cudaFuncAttributeMaxDynamicSharedMemorySize
@@ -2675,6 +2803,18 @@ Data types used by CUDA Runtime
         Link between the device and the host supports only some native atomic operations
 
 
+    .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrAtomicReductionSupported
+
+
+        Device supports atomic reduction operations in stream batch memory operations
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCigStreamsSupported
+
+
+        Device supports CIG streams
+
+
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMax
 
 .. autoclass:: cuda.bindings.runtime.cudaMemPoolAttr
@@ -3596,6 +3736,12 @@ Data types used by CUDA Runtime
                                            call :py:obj:`~.cudaGraphSetConditional` from device code.
 
 
+    .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeReserved16
+
+
+        Reserved.
+
+
     .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeCount
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership
@@ -3921,25 +4067,6 @@ Data types used by CUDA Runtime
 
         Specifies that the cluster size requested may be a non-portable size
 
-.. autoclass:: cuda.bindings.runtime.cudaSharedMemoryMode
-
-    .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeDefault
-
-
-        The default to use for allowing non-portable shared memory size on launch - uses current function attributes for :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize`
-
-
-    .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeRequirePortable
-
-
-        Specifies that the shared memory size requested must be a portable size within :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock`
-
-
-    .. autoattribute:: cuda.bindings.runtime.cudaSharedMemoryMode.cudaSharedMemoryModeAllowNonPortable
-
-
-        Specifies that the shared memory size requested may be a non-portable size up to :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`
-
 .. autoclass:: cuda.bindings.runtime.cudaLaunchAttributeID
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeIgnore
@@ -4102,6 +4229,26 @@ Data types used by CUDA Runtime
 
     .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelWarning
 
+.. autoclass:: cuda.bindings.runtime.cudaFabricOpStatusSource
+
+    .. autoattribute:: cuda.bindings.runtime.cudaFabricOpStatusSource.cudaFabricOpStatusSourceMbarrierV1
+
+
+        1B-aligned 1B-wide status from an mbarrier.layout::v1
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaFabricOpStatusSource.cudaFabricOpStatusSourceMax
+
+.. autoclass:: cuda.bindings.runtime.cudaFabricOpStatusInfo
+
+    .. autoattribute:: cuda.bindings.runtime.cudaFabricOpStatusInfo.cudaFabricOpStatusInfoSuccess
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaFabricOpStatusInfo.cudaFabricOpStatusInfoLast
+
+
+    .. autoattribute:: cuda.bindings.runtime.cudaFabricOpStatusInfo.cudaFabricOpStatusInfoMax
+
 .. autoclass:: cuda.bindings.runtime.cudaSurfaceBoundaryMode
 
     .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeZero
@@ -5252,6 +5399,7 @@ This section describes the device management functions of the CUDA runtime appli
 .. autofunction:: cuda.bindings.runtime.cudaDeviceSynchronize
 .. autofunction:: cuda.bindings.runtime.cudaDeviceSetLimit
 .. autofunction:: cuda.bindings.runtime.cudaDeviceGetLimit
+.. autofunction:: cuda.bindings.runtime.cudaDeviceGetTexture1DLinearMaxWidth
 .. autofunction:: cuda.bindings.runtime.cudaDeviceGetCacheConfig
 .. autofunction:: cuda.bindings.runtime.cudaDeviceGetStreamPriorityRange
 .. autofunction:: cuda.bindings.runtime.cudaDeviceSetCacheConfig
@@ -5262,6 +5410,7 @@ This section describes the device management functions of the CUDA runtime appli
 .. autofunction:: cuda.bindings.runtime.cudaIpcGetMemHandle
 .. autofunction:: cuda.bindings.runtime.cudaIpcOpenMemHandle
 .. autofunction:: cuda.bindings.runtime.cudaIpcCloseMemHandle
+.. autofunction:: cuda.bindings.runtime.cudaDeviceFlushGPUDirectRDMAWrites
 .. autofunction:: cuda.bindings.runtime.cudaDeviceRegisterAsyncNotification
 .. autofunction:: cuda.bindings.runtime.cudaDeviceUnregisterAsyncNotification
 .. autofunction:: cuda.bindings.runtime.cudaGetDeviceCount
@@ -5296,7 +5445,9 @@ Stream Management
 
 This section describes the stream management functions of the CUDA runtime application programming interface.
 
+.. autoclass:: cuda.bindings.runtime.cudaGraphRecaptureCallbackData
 .. autoclass:: cuda.bindings.runtime.cudaStreamCallback_t
+.. autoclass:: cuda.bindings.runtime.cudaGraphRecaptureCallback_t
 .. autofunction:: cuda.bindings.runtime.cudaStreamCreate
 .. autofunction:: cuda.bindings.runtime.cudaStreamCreateWithFlags
 .. autofunction:: cuda.bindings.runtime.cudaStreamCreateWithPriority
@@ -5315,6 +5466,7 @@ This section describes the stream management functions of the CUDA runtime appli
 .. autofunction:: cuda.bindings.runtime.cudaStreamQuery
 .. autofunction:: cuda.bindings.runtime.cudaStreamAttachMemAsync
 .. autofunction:: cuda.bindings.runtime.cudaStreamBeginCapture
+.. autofunction:: cuda.bindings.runtime.cudaStreamBeginRecaptureToGraph
 .. autofunction:: cuda.bindings.runtime.cudaStreamBeginCaptureToGraph
 .. autofunction:: cuda.bindings.runtime.cudaThreadExchangeStreamCaptureMode
 .. autofunction:: cuda.bindings.runtime.cudaStreamEndCapture
@@ -5330,6 +5482,7 @@ This section describes the event management functions of the CUDA runtime applic
 .. autofunction:: cuda.bindings.runtime.cudaEventCreate
 .. autofunction:: cuda.bindings.runtime.cudaEventCreateWithFlags
 .. autofunction:: cuda.bindings.runtime.cudaEventRecord
+.. autofunction:: cuda.bindings.runtime.cudaEventRecordWithFlags
 .. autofunction:: cuda.bindings.runtime.cudaEventQuery
 .. autofunction:: cuda.bindings.runtime.cudaEventSynchronize
 .. autofunction:: cuda.bindings.runtime.cudaEventDestroy
@@ -5418,6 +5571,8 @@ Some functions have overloaded C++ API template versions documented separately i
 .. autofunction:: cuda.bindings.runtime.cudaArrayGetPlane
 .. autofunction:: cuda.bindings.runtime.cudaArrayGetMemoryRequirements
 .. autofunction:: cuda.bindings.runtime.cudaMipmappedArrayGetMemoryRequirements
+.. autofunction:: cuda.bindings.runtime.cudaArrayGetSparseProperties
+.. autofunction:: cuda.bindings.runtime.cudaMipmappedArrayGetSparseProperties
 .. autofunction:: cuda.bindings.runtime.cudaMemcpy
 .. autofunction:: cuda.bindings.runtime.cudaMemcpyPeer
 .. autofunction:: cuda.bindings.runtime.cudaMemcpy2D
@@ -5730,8 +5885,10 @@ This section describes the graph management functions of CUDA runtime applicatio
 .. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeGetAttribute
 .. autofunction:: cuda.bindings.runtime.cudaGraphKernelNodeSetAttribute
 .. autofunction:: cuda.bindings.runtime.cudaGraphAddMemcpyNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemcpyNode1D
 .. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeGetParams
 .. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphMemcpyNodeSetParams1D
 .. autofunction:: cuda.bindings.runtime.cudaGraphAddMemsetNode
 .. autofunction:: cuda.bindings.runtime.cudaGraphMemsetNodeGetParams
 .. autofunction:: cuda.bindings.runtime.cudaGraphMemsetNodeSetParams
@@ -5741,6 +5898,25 @@ This section describes the graph management functions of CUDA runtime applicatio
 .. autofunction:: cuda.bindings.runtime.cudaGraphAddChildGraphNode
 .. autofunction:: cuda.bindings.runtime.cudaGraphChildGraphNodeGetGraph
 .. autofunction:: cuda.bindings.runtime.cudaGraphAddEmptyNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddEventRecordNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphEventRecordNodeGetEvent
+.. autofunction:: cuda.bindings.runtime.cudaGraphEventRecordNodeSetEvent
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddEventWaitNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphEventWaitNodeGetEvent
+.. autofunction:: cuda.bindings.runtime.cudaGraphEventWaitNodeSetEvent
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddExternalSemaphoresSignalNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresSignalNodeGetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresSignalNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddExternalSemaphoresWaitNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresWaitNodeGetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphExternalSemaphoresWaitNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemAllocNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphMemAllocNodeGetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphAddMemFreeNode
+.. autofunction:: cuda.bindings.runtime.cudaGraphMemFreeNodeGetParams
+.. autofunction:: cuda.bindings.runtime.cudaDeviceGraphMemTrim
+.. autofunction:: cuda.bindings.runtime.cudaDeviceGetGraphMemAttribute
+.. autofunction:: cuda.bindings.runtime.cudaDeviceSetGraphMemAttribute
 .. autofunction:: cuda.bindings.runtime.cudaGraphClone
 .. autofunction:: cuda.bindings.runtime.cudaGraphNodeFindInClone
 .. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetType
@@ -5758,13 +5934,23 @@ This section describes the graph management functions of CUDA runtime applicatio
 .. autofunction:: cuda.bindings.runtime.cudaGraphRemoveDependencies
 .. autofunction:: cuda.bindings.runtime.cudaGraphDestroyNode
 .. autofunction:: cuda.bindings.runtime.cudaGraphInstantiate
+.. autofunction:: cuda.bindings.runtime.cudaGraphInstantiateWithFlags
 .. autofunction:: cuda.bindings.runtime.cudaGraphInstantiateWithParams
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecGetFlags
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecKernelNodeSetParams
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecMemcpyNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphExecMemcpyNodeSetParams1D
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecMemsetNodeSetParams
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecHostNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphExecChildGraphNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphExecEventRecordNodeSetEvent
+.. autofunction:: cuda.bindings.runtime.cudaGraphExecEventWaitNodeSetEvent
+.. autofunction:: cuda.bindings.runtime.cudaGraphExecExternalSemaphoresSignalNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphExecExternalSemaphoresWaitNodeSetParams
+.. autofunction:: cuda.bindings.runtime.cudaGraphNodeSetEnabled
+.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetEnabled
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecUpdate
+.. autofunction:: cuda.bindings.runtime.cudaGraphUpload
 .. autofunction:: cuda.bindings.runtime.cudaGraphLaunch
 .. autofunction:: cuda.bindings.runtime.cudaGraphExecDestroy
 .. autofunction:: cuda.bindings.runtime.cudaGraphDestroy

From 17708e06cb1f4d6bb2a2f5f32bccca3a3a265cf9 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 10:09:41 -0700
Subject: [PATCH 02/11] run_cybind_native 13.3.0 ../ctk-next (NO MANUAL
 CHANGES)

---
 .../cuda/bindings/_internal/_fast_enum.py     |   5 +-
 .../cuda/bindings/_internal/cudla.pxd         |   2 +-
 .../cuda/bindings/_internal/cudla_linux.pyx   |   2 +-
 .../cuda/bindings/_internal/nvfatbin.pxd      |   3 +-
 .../bindings/_internal/nvfatbin_linux.pyx     |  23 ++-
 .../bindings/_internal/nvfatbin_windows.pyx   |  19 ++-
 .../bindings/_internal/nvjitlink_linux.pyx    |  46 +++++-
 .../bindings/_internal/nvjitlink_windows.pyx  |  38 ++++-
 cuda_bindings/cuda/bindings/cudla.pxd         |   2 +-
 cuda_bindings/cuda/bindings/cudla.pyx         |   3 +-
 cuda_bindings/cuda/bindings/cycudla.pxd       |   3 +-
 cuda_bindings/cuda/bindings/cycudla.pyx       |   2 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pxd    |   3 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pyx    |   6 +-
 cuda_bindings/cuda/bindings/cynvml.pxd        | 121 ++++++++++++++-
 cuda_bindings/cuda/bindings/nvfatbin.pxd      |   3 +-
 cuda_bindings/cuda/bindings/nvfatbin.pyx      |  13 +-
 cuda_bindings/cuda/bindings/nvml.pxd          |   9 +-
 cuda_bindings/cuda/bindings/nvml.pyx          | 146 +++++++++++++-----
 19 files changed, 386 insertions(+), 63 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/_fast_enum.py b/cuda_bindings/cuda/bindings/_internal/_fast_enum.py
index 556cd33459e..cd0a8e5610e 100644
--- a/cuda_bindings/cuda/bindings/_internal/_fast_enum.py
+++ b/cuda_bindings/cuda/bindings/_internal/_fast_enum.py
@@ -1,8 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+#
+# This code was automatically generated across versions from 12.9.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 
 """
diff --git a/cuda_bindings/cuda/bindings/_internal/cudla.pxd b/cuda_bindings/cuda/bindings/_internal/cudla.pxd
index beca59f3e6b..e4c479673de 100644
--- a/cuda_bindings/cuda/bindings/_internal/cudla.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/cudla.pxd
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly.
+# This code was automatically generated across versions from 1.5.0 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from ..cycudla cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx
index ccc53f32ca8..2d760477718 100644
--- a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly.
+# This code was automatically generated across versions from 1.5.0 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
index c82cc8efb76..2358e1220c9 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from ..cynvfatbin cimport *
 
@@ -20,5 +20,6 @@ cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, s
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddIndex(nvFatbinHandle handle, const void* code, size_t size, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
index 89e5015bc38..448e51240ef 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
@@ -68,6 +68,7 @@ cdef void* __nvFatbinAddLTOIR = NULL
 cdef void* __nvFatbinSize = NULL
 cdef void* __nvFatbinGet = NULL
 cdef void* __nvFatbinVersion = NULL
+cdef void* __nvFatbinAddIndex = NULL
 cdef void* __nvFatbinAddReloc = NULL
 cdef void* __nvFatbinAddTileIR = NULL
 
@@ -151,6 +152,13 @@ cdef int _init_nvfatbin() except -1 nogil:
                 handle = load_library()
             __nvFatbinVersion = dlsym(handle, 'nvFatbinVersion')
 
+        global __nvFatbinAddIndex
+        __nvFatbinAddIndex = dlsym(RTLD_DEFAULT, 'nvFatbinAddIndex')
+        if __nvFatbinAddIndex == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddIndex = dlsym(handle, 'nvFatbinAddIndex')
+
         global __nvFatbinAddReloc
         __nvFatbinAddReloc = dlsym(RTLD_DEFAULT, 'nvFatbinAddReloc')
         if __nvFatbinAddReloc == NULL:
@@ -213,6 +221,9 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinVersion
     data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
 
+    global __nvFatbinAddIndex
+    data["__nvFatbinAddIndex"] = <intptr_t>__nvFatbinAddIndex
+
     global __nvFatbinAddReloc
     data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
 
@@ -324,6 +335,16 @@ cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) e
         major, minor)
 
 
+cdef nvFatbinResult _nvFatbinAddIndex(nvFatbinHandle handle, const void* code, size_t size, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddIndex
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddIndex == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddIndex is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*) noexcept nogil>__nvFatbinAddIndex)(
+        handle, code, size, identifier)
+
+
 cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinAddReloc
     _check_or_init_nvfatbin()
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
index 576a2ca9a6f..4a6b6148fd8 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -86,6 +86,7 @@ cdef void* __nvFatbinAddLTOIR = NULL
 cdef void* __nvFatbinSize = NULL
 cdef void* __nvFatbinGet = NULL
 cdef void* __nvFatbinVersion = NULL
+cdef void* __nvFatbinAddIndex = NULL
 cdef void* __nvFatbinAddReloc = NULL
 cdef void* __nvFatbinAddTileIR = NULL
 
@@ -129,6 +130,9 @@ cdef int _init_nvfatbin() except -1 nogil:
         global __nvFatbinVersion
         __nvFatbinVersion = GetProcAddress(handle, 'nvFatbinVersion')
 
+        global __nvFatbinAddIndex
+        __nvFatbinAddIndex = GetProcAddress(handle, 'nvFatbinAddIndex')
+
         global __nvFatbinAddReloc
         __nvFatbinAddReloc = GetProcAddress(handle, 'nvFatbinAddReloc')
 
@@ -184,6 +188,9 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinVersion
     data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
 
+    global __nvFatbinAddIndex
+    data["__nvFatbinAddIndex"] = <intptr_t>__nvFatbinAddIndex
+
     global __nvFatbinAddReloc
     data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
 
@@ -295,6 +302,16 @@ cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) e
         major, minor)
 
 
+cdef nvFatbinResult _nvFatbinAddIndex(nvFatbinHandle handle, const void* code, size_t size, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddIndex
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddIndex == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddIndex is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*) noexcept nogil>__nvFatbinAddIndex)(
+        handle, code, size, identifier)
+
+
 cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinAddReloc
     _check_or_init_nvfatbin()
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 8c823494622..34ad6b5f0c5 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -1,8 +1,8 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
@@ -73,6 +73,8 @@ cdef void* __nvJitLinkGetErrorLog = NULL
 cdef void* __nvJitLinkGetInfoLogSize = NULL
 cdef void* __nvJitLinkGetInfoLog = NULL
 cdef void* __nvJitLinkVersion = NULL
+cdef void* __nvJitLinkGetLinkedLTOIRSize = NULL
+cdef void* __nvJitLinkGetLinkedLTOIR = NULL
 
 
 cdef void* load_library() except* with gil:
@@ -189,6 +191,20 @@ cdef int _init_nvjitlink() except -1 nogil:
                 handle = load_library()
             __nvJitLinkVersion = dlsym(handle, 'nvJitLinkVersion')
 
+        global __nvJitLinkGetLinkedLTOIRSize
+        __nvJitLinkGetLinkedLTOIRSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedLTOIRSize')
+        if __nvJitLinkGetLinkedLTOIRSize == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvJitLinkGetLinkedLTOIRSize = dlsym(handle, 'nvJitLinkGetLinkedLTOIRSize')
+
+        global __nvJitLinkGetLinkedLTOIR
+        __nvJitLinkGetLinkedLTOIR = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedLTOIR')
+        if __nvJitLinkGetLinkedLTOIR == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvJitLinkGetLinkedLTOIR = dlsym(handle, 'nvJitLinkGetLinkedLTOIR')
+
         __py_nvjitlink_init = True
         return 0
 
@@ -252,6 +268,12 @@ cpdef dict _inspect_function_pointers():
     global __nvJitLinkVersion
     data["__nvJitLinkVersion"] = <intptr_t>__nvJitLinkVersion
 
+    global __nvJitLinkGetLinkedLTOIRSize
+    data["__nvJitLinkGetLinkedLTOIRSize"] = <intptr_t>__nvJitLinkGetLinkedLTOIRSize
+
+    global __nvJitLinkGetLinkedLTOIR
+    data["__nvJitLinkGetLinkedLTOIR"] = <intptr_t>__nvJitLinkGetLinkedLTOIR
+
     func_ptrs = data
     return data
 
@@ -405,3 +427,23 @@ cdef nvJitLinkResult _nvJitLinkVersion(unsigned int* major, unsigned int* minor)
             raise FunctionNotFoundError("function nvJitLinkVersion is not found")
     return (<nvJitLinkResult (*)(unsigned int*, unsigned int*) noexcept nogil>__nvJitLinkVersion)(
         major, minor)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedLTOIRSize(nvJitLinkHandle handle, size_t* size) except?_NVJITLINKRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvJitLinkGetLinkedLTOIRSize
+    _check_or_init_nvjitlink()
+    if __nvJitLinkGetLinkedLTOIRSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedLTOIRSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) noexcept nogil>__nvJitLinkGetLinkedLTOIRSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedLTOIR(nvJitLinkHandle handle, void* ltoir) except?_NVJITLINKRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvJitLinkGetLinkedLTOIR
+    _check_or_init_nvjitlink()
+    if __nvJitLinkGetLinkedLTOIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedLTOIR is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) noexcept nogil>__nvJitLinkGetLinkedLTOIR)(
+        handle, ltoir)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 8a5a7661b42..8dcfd8b3e4d 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -1,8 +1,8 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -91,6 +91,8 @@ cdef void* __nvJitLinkGetErrorLog = NULL
 cdef void* __nvJitLinkGetInfoLogSize = NULL
 cdef void* __nvJitLinkGetInfoLog = NULL
 cdef void* __nvJitLinkVersion = NULL
+cdef void* __nvJitLinkGetLinkedLTOIRSize = NULL
+cdef void* __nvJitLinkGetLinkedLTOIR = NULL
 
 
 cdef int _init_nvjitlink() except -1 nogil:
@@ -147,6 +149,12 @@ cdef int _init_nvjitlink() except -1 nogil:
         global __nvJitLinkVersion
         __nvJitLinkVersion = GetProcAddress(handle, 'nvJitLinkVersion')
 
+        global __nvJitLinkGetLinkedLTOIRSize
+        __nvJitLinkGetLinkedLTOIRSize = GetProcAddress(handle, 'nvJitLinkGetLinkedLTOIRSize')
+
+        global __nvJitLinkGetLinkedLTOIR
+        __nvJitLinkGetLinkedLTOIR = GetProcAddress(handle, 'nvJitLinkGetLinkedLTOIR')
+
         __py_nvjitlink_init = True
         return 0
 
@@ -211,6 +219,12 @@ cpdef dict _inspect_function_pointers():
     global __nvJitLinkVersion
     data["__nvJitLinkVersion"] = <intptr_t>__nvJitLinkVersion
 
+    global __nvJitLinkGetLinkedLTOIRSize
+    data["__nvJitLinkGetLinkedLTOIRSize"] = <intptr_t>__nvJitLinkGetLinkedLTOIRSize
+
+    global __nvJitLinkGetLinkedLTOIR
+    data["__nvJitLinkGetLinkedLTOIR"] = <intptr_t>__nvJitLinkGetLinkedLTOIR
+
     func_ptrs = data
     return data
 
@@ -364,3 +378,23 @@ cdef nvJitLinkResult _nvJitLinkVersion(unsigned int* major, unsigned int* minor)
             raise FunctionNotFoundError("function nvJitLinkVersion is not found")
     return (<nvJitLinkResult (*)(unsigned int*, unsigned int*) noexcept nogil>__nvJitLinkVersion)(
         major, minor)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedLTOIRSize(nvJitLinkHandle handle, size_t* size) except?_NVJITLINKRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvJitLinkGetLinkedLTOIRSize
+    _check_or_init_nvjitlink()
+    if __nvJitLinkGetLinkedLTOIRSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedLTOIRSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) noexcept nogil>__nvJitLinkGetLinkedLTOIRSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedLTOIR(nvJitLinkHandle handle, void* ltoir) except?_NVJITLINKRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvJitLinkGetLinkedLTOIR
+    _check_or_init_nvjitlink()
+    if __nvJitLinkGetLinkedLTOIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedLTOIR is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) noexcept nogil>__nvJitLinkGetLinkedLTOIR)(
+        handle, ltoir)
diff --git a/cuda_bindings/cuda/bindings/cudla.pxd b/cuda_bindings/cuda/bindings/cudla.pxd
index 786622ac9a9..894aacb7c54 100644
--- a/cuda_bindings/cuda/bindings/cudla.pxd
+++ b/cuda_bindings/cuda/bindings/cudla.pxd
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly.
+# This code was automatically generated across versions from 1.5.0 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/cudla.pyx b/cuda_bindings/cuda/bindings/cudla.pyx
index ff7569b9b7a..6474990fefd 100644
--- a/cuda_bindings/cuda/bindings/cudla.pyx
+++ b/cuda_bindings/cuda/bindings/cudla.pyx
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly.
+# This code was automatically generated across versions from 1.5.0 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 cimport cython  # NOQA
 from libc.stdint cimport intptr_t, uintptr_t
@@ -1595,6 +1595,7 @@ class Status(_IntEnum):
     ErrorInvalidModule = cudlaErrorInvalidModule
     ErrorUnsupportedOperation = cudlaErrorUnsupportedOperation
     ErrorNvSci = cudlaErrorNvSci
+    ErrorDriverNotFound = cudlaErrorDriverNotFound
     ErrorDlaErrInvalidInput = cudlaErrorDlaErrInvalidInput
     ErrorDlaErrInvalidPreAction = cudlaErrorDlaErrInvalidPreAction
     ErrorDlaErrNoMem = cudlaErrorDlaErrNoMem
diff --git a/cuda_bindings/cuda/bindings/cycudla.pxd b/cuda_bindings/cuda/bindings/cycudla.pxd
index 5bcbe623469..b8fdec0a2f5 100644
--- a/cuda_bindings/cuda/bindings/cycudla.pxd
+++ b/cuda_bindings/cuda/bindings/cycudla.pxd
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly.
+# This code was automatically generated across versions from 1.5.0 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 # This layer exposes the C header to Cython as-is.
 
 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
@@ -33,6 +33,7 @@ ctypedef enum cudlaStatus "cudlaStatus":
     cudlaErrorInvalidModule "cudlaErrorInvalidModule" = 12
     cudlaErrorUnsupportedOperation "cudlaErrorUnsupportedOperation" = 13
     cudlaErrorNvSci "cudlaErrorNvSci" = 14
+    cudlaErrorDriverNotFound "cudlaErrorDriverNotFound" = 15
     cudlaErrorDlaErrInvalidInput "cudlaErrorDlaErrInvalidInput" = 0x40000001
     cudlaErrorDlaErrInvalidPreAction "cudlaErrorDlaErrInvalidPreAction" = 0x40000002
     cudlaErrorDlaErrNoMem "cudlaErrorDlaErrNoMem" = 0x40000003
diff --git a/cuda_bindings/cuda/bindings/cycudla.pyx b/cuda_bindings/cuda/bindings/cycudla.pyx
index 8d0cbdc5111..543acf56fc3 100644
--- a/cuda_bindings/cuda/bindings/cycudla.pyx
+++ b/cuda_bindings/cuda/bindings/cycudla.pyx
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly.
+# This code was automatically generated across versions from 1.5.0 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from ._internal cimport cudla as _cudla
 
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
index 197e0bb67cf..5aab8de073e 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
@@ -51,5 +51,6 @@ cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, si
 cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddIndex(nvFatbinHandle handle, const void* code, size_t size, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
index d382045a2b2..d1fca6e8025 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from ._internal cimport nvfatbin as _nvfatbin
 
@@ -47,6 +47,10 @@ cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) ex
     return _nvfatbin._nvFatbinVersion(major, minor)
 
 
+cdef nvFatbinResult nvFatbinAddIndex(nvFatbinHandle handle, const void* code, size_t size, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddIndex(handle, code, size, identifier)
+
+
 cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     return _nvfatbin._nvFatbinAddReloc(handle, code, size)
 
diff --git a/cuda_bindings/cuda/bindings/cynvml.pxd b/cuda_bindings/cuda/bindings/cynvml.pxd
index 1f59e6d522a..2bd67517e41 100644
--- a/cuda_bindings/cuda/bindings/cynvml.pxd
+++ b/cuda_bindings/cuda/bindings/cynvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport int64_t
 
@@ -235,6 +235,7 @@ ctypedef enum nvmlNvlinkVersion_t "nvmlNvlinkVersion_t":
     NVML_NVLINK_VERSION_3_1 "NVML_NVLINK_VERSION_3_1" = 5
     NVML_NVLINK_VERSION_4_0 "NVML_NVLINK_VERSION_4_0" = 6
     NVML_NVLINK_VERSION_5_0 "NVML_NVLINK_VERSION_5_0" = 7
+    NVML_NVLINK_VERSION_6_0 "NVML_NVLINK_VERSION_6_0" = 8
 
 ctypedef enum nvmlEccCounterType_t "nvmlEccCounterType_t":
     NVML_VOLATILE_ECC "NVML_VOLATILE_ECC" = 0
@@ -415,6 +416,7 @@ ctypedef enum nvmlDeviceGpuRecoveryAction_t "nvmlDeviceGpuRecoveryAction_t":
     NVML_GPU_RECOVERY_ACTION_NODE_REBOOT "NVML_GPU_RECOVERY_ACTION_NODE_REBOOT" = 2
     NVML_GPU_RECOVERY_ACTION_DRAIN_P2P "NVML_GPU_RECOVERY_ACTION_DRAIN_P2P" = 3
     NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET "NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET" = 4
+    NVML_GPU_RECOVERY_ACTION_RECOVER_IMEX_DOMAIN "NVML_GPU_RECOVERY_ACTION_RECOVER_IMEX_DOMAIN" = 5
 
 ctypedef enum nvmlFanState_t "nvmlFanState_t":
     NVML_FAN_NORMAL "NVML_FAN_NORMAL" = 0
@@ -646,6 +648,42 @@ ctypedef enum nvmlGpmMetricId_t "nvmlGpmMetricId_t":
     NVML_GPM_METRIC_GR7_CTXSW_REQUESTS "NVML_GPM_METRIC_GR7_CTXSW_REQUESTS" = 207
     NVML_GPM_METRIC_GR7_CTXSW_CYCLES_PER_REQ "NVML_GPM_METRIC_GR7_CTXSW_CYCLES_PER_REQ" = 208
     NVML_GPM_METRIC_GR7_CTXSW_ACTIVE_PCT "NVML_GPM_METRIC_GR7_CTXSW_ACTIVE_PCT" = 209
+    NVML_GPM_METRIC_NVLINK_L18_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L18_RX_PER_SEC" = 212
+    NVML_GPM_METRIC_NVLINK_L18_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L18_TX_PER_SEC" = 213
+    NVML_GPM_METRIC_NVLINK_L19_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L19_RX_PER_SEC" = 214
+    NVML_GPM_METRIC_NVLINK_L19_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L19_TX_PER_SEC" = 215
+    NVML_GPM_METRIC_NVLINK_L20_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L20_RX_PER_SEC" = 216
+    NVML_GPM_METRIC_NVLINK_L20_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L20_TX_PER_SEC" = 217
+    NVML_GPM_METRIC_NVLINK_L21_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L21_RX_PER_SEC" = 218
+    NVML_GPM_METRIC_NVLINK_L21_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L21_TX_PER_SEC" = 219
+    NVML_GPM_METRIC_NVLINK_L22_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L22_RX_PER_SEC" = 220
+    NVML_GPM_METRIC_NVLINK_L22_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L22_TX_PER_SEC" = 221
+    NVML_GPM_METRIC_NVLINK_L23_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L23_RX_PER_SEC" = 222
+    NVML_GPM_METRIC_NVLINK_L23_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L23_TX_PER_SEC" = 223
+    NVML_GPM_METRIC_NVLINK_L24_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L24_RX_PER_SEC" = 224
+    NVML_GPM_METRIC_NVLINK_L24_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L24_TX_PER_SEC" = 225
+    NVML_GPM_METRIC_NVLINK_L25_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L25_RX_PER_SEC" = 226
+    NVML_GPM_METRIC_NVLINK_L25_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L25_TX_PER_SEC" = 227
+    NVML_GPM_METRIC_NVLINK_L26_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L26_RX_PER_SEC" = 228
+    NVML_GPM_METRIC_NVLINK_L26_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L26_TX_PER_SEC" = 229
+    NVML_GPM_METRIC_NVLINK_L27_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L27_RX_PER_SEC" = 230
+    NVML_GPM_METRIC_NVLINK_L27_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L27_TX_PER_SEC" = 231
+    NVML_GPM_METRIC_NVLINK_L28_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L28_RX_PER_SEC" = 232
+    NVML_GPM_METRIC_NVLINK_L28_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L28_TX_PER_SEC" = 233
+    NVML_GPM_METRIC_NVLINK_L29_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L29_RX_PER_SEC" = 234
+    NVML_GPM_METRIC_NVLINK_L29_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L29_TX_PER_SEC" = 235
+    NVML_GPM_METRIC_NVLINK_L30_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L30_RX_PER_SEC" = 236
+    NVML_GPM_METRIC_NVLINK_L30_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L30_TX_PER_SEC" = 237
+    NVML_GPM_METRIC_NVLINK_L31_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L31_RX_PER_SEC" = 238
+    NVML_GPM_METRIC_NVLINK_L31_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L31_TX_PER_SEC" = 239
+    NVML_GPM_METRIC_NVLINK_L32_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L32_RX_PER_SEC" = 240
+    NVML_GPM_METRIC_NVLINK_L32_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L32_TX_PER_SEC" = 241
+    NVML_GPM_METRIC_NVLINK_L33_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L33_RX_PER_SEC" = 242
+    NVML_GPM_METRIC_NVLINK_L33_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L33_TX_PER_SEC" = 243
+    NVML_GPM_METRIC_NVLINK_L34_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L34_RX_PER_SEC" = 244
+    NVML_GPM_METRIC_NVLINK_L34_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L34_TX_PER_SEC" = 245
+    NVML_GPM_METRIC_NVLINK_L35_RX_PER_SEC "NVML_GPM_METRIC_NVLINK_L35_RX_PER_SEC" = 246
+    NVML_GPM_METRIC_NVLINK_L35_TX_PER_SEC "NVML_GPM_METRIC_NVLINK_L35_TX_PER_SEC" = 247
     NVML_GPM_METRIC_SM_CYCLES_ELAPSED "NVML_GPM_METRIC_SM_CYCLES_ELAPSED" = 248
     NVML_GPM_METRIC_SM_CYCLES_ACTIVE "NVML_GPM_METRIC_SM_CYCLES_ACTIVE" = 249
     NVML_GPM_METRIC_MMA_CYCLES_ACTIVE "NVML_GPM_METRIC_MMA_CYCLES_ACTIVE" = 250
@@ -695,6 +733,42 @@ ctypedef enum nvmlGpmMetricId_t "nvmlGpmMetricId_t":
     NVML_GPM_METRIC_NVLINK_L16_TX "NVML_GPM_METRIC_NVLINK_L16_TX" = 294
     NVML_GPM_METRIC_NVLINK_L17_RX "NVML_GPM_METRIC_NVLINK_L17_RX" = 295
     NVML_GPM_METRIC_NVLINK_L17_TX "NVML_GPM_METRIC_NVLINK_L17_TX" = 296
+    NVML_GPM_METRIC_NVLINK_L18_RX "NVML_GPM_METRIC_NVLINK_L18_RX" = 297
+    NVML_GPM_METRIC_NVLINK_L18_TX "NVML_GPM_METRIC_NVLINK_L18_TX" = 298
+    NVML_GPM_METRIC_NVLINK_L19_RX "NVML_GPM_METRIC_NVLINK_L19_RX" = 299
+    NVML_GPM_METRIC_NVLINK_L19_TX "NVML_GPM_METRIC_NVLINK_L19_TX" = 300
+    NVML_GPM_METRIC_NVLINK_L20_RX "NVML_GPM_METRIC_NVLINK_L20_RX" = 301
+    NVML_GPM_METRIC_NVLINK_L20_TX "NVML_GPM_METRIC_NVLINK_L20_TX" = 302
+    NVML_GPM_METRIC_NVLINK_L21_RX "NVML_GPM_METRIC_NVLINK_L21_RX" = 303
+    NVML_GPM_METRIC_NVLINK_L21_TX "NVML_GPM_METRIC_NVLINK_L21_TX" = 304
+    NVML_GPM_METRIC_NVLINK_L22_RX "NVML_GPM_METRIC_NVLINK_L22_RX" = 305
+    NVML_GPM_METRIC_NVLINK_L22_TX "NVML_GPM_METRIC_NVLINK_L22_TX" = 306
+    NVML_GPM_METRIC_NVLINK_L23_RX "NVML_GPM_METRIC_NVLINK_L23_RX" = 307
+    NVML_GPM_METRIC_NVLINK_L23_TX "NVML_GPM_METRIC_NVLINK_L23_TX" = 308
+    NVML_GPM_METRIC_NVLINK_L24_RX "NVML_GPM_METRIC_NVLINK_L24_RX" = 309
+    NVML_GPM_METRIC_NVLINK_L24_TX "NVML_GPM_METRIC_NVLINK_L24_TX" = 310
+    NVML_GPM_METRIC_NVLINK_L25_RX "NVML_GPM_METRIC_NVLINK_L25_RX" = 311
+    NVML_GPM_METRIC_NVLINK_L25_TX "NVML_GPM_METRIC_NVLINK_L25_TX" = 312
+    NVML_GPM_METRIC_NVLINK_L26_RX "NVML_GPM_METRIC_NVLINK_L26_RX" = 313
+    NVML_GPM_METRIC_NVLINK_L26_TX "NVML_GPM_METRIC_NVLINK_L26_TX" = 314
+    NVML_GPM_METRIC_NVLINK_L27_RX "NVML_GPM_METRIC_NVLINK_L27_RX" = 315
+    NVML_GPM_METRIC_NVLINK_L27_TX "NVML_GPM_METRIC_NVLINK_L27_TX" = 316
+    NVML_GPM_METRIC_NVLINK_L28_RX "NVML_GPM_METRIC_NVLINK_L28_RX" = 317
+    NVML_GPM_METRIC_NVLINK_L28_TX "NVML_GPM_METRIC_NVLINK_L28_TX" = 318
+    NVML_GPM_METRIC_NVLINK_L29_RX "NVML_GPM_METRIC_NVLINK_L29_RX" = 319
+    NVML_GPM_METRIC_NVLINK_L29_TX "NVML_GPM_METRIC_NVLINK_L29_TX" = 320
+    NVML_GPM_METRIC_NVLINK_L30_RX "NVML_GPM_METRIC_NVLINK_L30_RX" = 321
+    NVML_GPM_METRIC_NVLINK_L30_TX "NVML_GPM_METRIC_NVLINK_L30_TX" = 322
+    NVML_GPM_METRIC_NVLINK_L31_RX "NVML_GPM_METRIC_NVLINK_L31_RX" = 323
+    NVML_GPM_METRIC_NVLINK_L31_TX "NVML_GPM_METRIC_NVLINK_L31_TX" = 324
+    NVML_GPM_METRIC_NVLINK_L32_RX "NVML_GPM_METRIC_NVLINK_L32_RX" = 325
+    NVML_GPM_METRIC_NVLINK_L32_TX "NVML_GPM_METRIC_NVLINK_L32_TX" = 326
+    NVML_GPM_METRIC_NVLINK_L33_RX "NVML_GPM_METRIC_NVLINK_L33_RX" = 327
+    NVML_GPM_METRIC_NVLINK_L33_TX "NVML_GPM_METRIC_NVLINK_L33_TX" = 328
+    NVML_GPM_METRIC_NVLINK_L34_RX "NVML_GPM_METRIC_NVLINK_L34_RX" = 329
+    NVML_GPM_METRIC_NVLINK_L34_TX "NVML_GPM_METRIC_NVLINK_L34_TX" = 330
+    NVML_GPM_METRIC_NVLINK_L35_RX "NVML_GPM_METRIC_NVLINK_L35_RX" = 331
+    NVML_GPM_METRIC_NVLINK_L35_TX "NVML_GPM_METRIC_NVLINK_L35_TX" = 332
     NVML_GPM_METRIC_MAX "NVML_GPM_METRIC_MAX" = 333
 
 ctypedef enum nvmlPowerProfileType_t "nvmlPowerProfileType_t":
@@ -743,6 +817,16 @@ ctypedef enum nvmlPowerProfileOperation_t "nvmlPowerProfileOperation_t":
     NVML_POWER_PROFILE_OPERATION_SET_AND_OVERWRITE "NVML_POWER_PROFILE_OPERATION_SET_AND_OVERWRITE" = 2
     NVML_POWER_PROFILE_OPERATION_MAX "NVML_POWER_PROFILE_OPERATION_MAX" = 3
 
+ctypedef enum nvmlProcessMode_t "nvmlProcessMode_t":  # enum declaration; the quoted string is the name used in the generated C code
+    NVML_PROCESS_MODE_COMPUTE "NVML_PROCESS_MODE_COMPUTE" = 0
+    NVML_PROCESS_MODE_GRAPHICS "NVML_PROCESS_MODE_GRAPHICS" = 1
+    NVML_PROCESS_MODE_MPS "NVML_PROCESS_MODE_MPS" = 2
+    NVML_PROCESS_MODE_ALL "NVML_PROCESS_MODE_ALL" = 3
+    NVML_PROCESS_MODE_MAX "NVML_PROCESS_MODE_MAX" = (NVML_PROCESS_MODE_ALL + 1)  # one past NVML_PROCESS_MODE_ALL, i.e. 4
+
+ctypedef enum nvmlCPERType_t "nvmlCPERType_t":  # the single enumerator below is written as a bit shift, i.e. a flag value
+    NVML_CPER_ACCESS_TYPE_GPU "NVML_CPER_ACCESS_TYPE_GPU" = (1 << 0)  # bit 0 (value 1)
+
 
 # types
 ctypedef struct nvmlPciInfoExt_v1_t 'nvmlPciInfoExt_v1_t':
@@ -923,6 +1007,8 @@ ctypedef struct nvmlPdi_v1_t 'nvmlPdi_v1_t':
     unsigned int version
     unsigned long long value
 
+ctypedef unsigned long long nvmlCPERCursorHandle_t 'nvmlCPERCursorHandle_t'  # unsigned integer handle type; used as the `handle` field of nvmlCPERCursor_v1_t
+
 ctypedef void* nvmlDevice_t 'nvmlDevice_t'
 
 ctypedef void* nvmlGpuInstance_t 'nvmlGpuInstance_t'
@@ -1398,6 +1484,29 @@ ctypedef struct nvmlVgpuSchedulerState_v2_t 'nvmlVgpuSchedulerState_v2_t':
     unsigned int avgFactor
     unsigned int frequency
 
+ctypedef struct nvmlBBXTimeData_v1_t 'nvmlBBXTimeData_v1_t':  # single-field struct declaration mirroring the C type of the same name
+    unsigned int timeRun  # NOTE(review): units are not visible in this file — confirm against nvml.h
+
+ctypedef struct nvmlRemappedRowsInfo_v2_t 'nvmlRemappedRowsInfo_v2_t':  # struct declaration mirroring the C type; all six fields are unsigned int
+    unsigned int corrActiveRemaps  # presumably a count of active remaps for correctable errors — TODO confirm against nvml.h
+    unsigned int corrInactiveRemaps
+    unsigned int uncActiveRemaps  # presumably the uncorrectable-error counterpart — TODO confirm
+    unsigned int uncInactiveRemaps
+    unsigned int bPending  # NOTE(review): `b` prefix suggests a boolean stored as unsigned int — confirm
+    unsigned int bFailureOccurred  # NOTE(review): same boolean-as-unsigned-int assumption as bPending — confirm
+
+ctypedef struct nvmlAccountingStats_v2_t 'nvmlAccountingStats_v2_t':  # struct declaration mirroring the C type of the same name
+    unsigned int pid
+    unsigned int isRunning  # NOTE(review): looks like a boolean stored as unsigned int — confirm against nvml.h
+    unsigned int gpuUtilization  # NOTE(review): units/range not visible here — confirm
+    unsigned int memoryUtilization  # NOTE(review): units/range not visible here — confirm
+    unsigned long long maxMemoryUsage  # NOTE(review): presumably bytes — confirm
+    unsigned int sampleCount
+    unsigned long long sumGpuUtil  # 64-bit field, unlike the 32-bit gpuUtilization above
+    unsigned long long sumFbUtil
+    unsigned long long time  # NOTE(review): time base/units not visible here — confirm
+    unsigned long long startTime  # NOTE(review): epoch/units not visible here — confirm
+
 ctypedef nvmlPciInfoExt_v1_t nvmlPciInfoExt_t 'nvmlPciInfoExt_t'
 
 ctypedef nvmlCoolerInfo_v1_t nvmlCoolerInfo_t 'nvmlCoolerInfo_t'
@@ -1552,6 +1661,11 @@ ctypedef nvmlRepairStatus_v1_t nvmlRepairStatus_t 'nvmlRepairStatus_t'
 
 ctypedef nvmlPdi_v1_t nvmlPdi_t 'nvmlPdi_t'
 
+ctypedef struct nvmlCPERCursor_v1_t 'nvmlCPERCursor_v1_t':  # struct declaration; embedded by value as the `cursor` field of nvmlGetCPER_v1_t
+    unsigned int cperTypeMask  # presumably an OR of nvmlCPERType_t flag values — TODO confirm against nvml.h
+    char uuid[80]  # fixed-size 80-byte character array
+    nvmlCPERCursorHandle_t handle  # unsigned long long per the nvmlCPERCursorHandle_t typedef
+
 ctypedef struct nvmlEventData_t 'nvmlEventData_t':
     nvmlDevice_t device
     unsigned long long eventType
@@ -1766,6 +1880,11 @@ ctypedef nvmlActiveVgpuInstanceInfo_v1_t nvmlActiveVgpuInstanceInfo_t 'nvmlActiv
 
 ctypedef nvmlGpuFabricInfo_v3_t nvmlGpuFabricInfoV_t 'nvmlGpuFabricInfoV_t'
 
+ctypedef struct nvmlGetCPER_v1_t 'nvmlGetCPER_v1_t':  # struct declaration mirroring the C type of the same name
+    nvmlCPERCursor_v1_t cursor  # embedded by value, not a pointer
+    unsigned char* buffer  # raw byte pointer; NOTE(review): ownership/lifetime is not visible here — confirm
+    unsigned int bufferSize  # presumably the size of `buffer` in bytes — TODO confirm
+
 ctypedef nvmlSystemEventSetCreateRequest_v1_t nvmlSystemEventSetCreateRequest_t 'nvmlSystemEventSetCreateRequest_t'
 
 ctypedef nvmlSystemEventSetFreeRequest_v1_t nvmlSystemEventSetFreeRequest_t 'nvmlSystemEventSetFreeRequest_t'
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
index b9836e831e7..2a080b09869 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
@@ -34,5 +34,6 @@ cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cm
 cpdef size_t size(intptr_t handle) except? 0
 cpdef get(intptr_t handle, buffer)
 cpdef tuple version()
+cpdef add_index(intptr_t handle, code, size_t size, identifier)  # declaration only; the body is defined in nvfatbin.pyx
 cpdef add_reloc(intptr_t handle, code, size_t size)
 cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_line)
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 6e02502dbb2..9b44d0f58ac 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.4.1 to 13.2.0, generator version 0.3.1.dev1422+gf4812259e.d20260318. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 cimport cython  # NOQA
 
@@ -272,6 +272,17 @@ cpdef tuple version():
     return (major, minor)
 
 
+cpdef add_index(intptr_t handle, code, size_t size, identifier):  # Python-visible wrapper around nvFatbinAddIndex; has no return statement, so it returns None
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)  # raw pointer for `code`, requested as read-only
+    if not isinstance(identifier, str):  # reject anything but str before the <str> cast below
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()  # str.encode() with no argument uses UTF-8
+    cdef char* _identifier_ = _temp_identifier_  # borrows the bytes object's buffer; _temp_identifier_ keeps it alive for the call
+    with nogil:  # the C call runs with the GIL released
+        __status__ = nvFatbinAddIndex(<Handle>handle, <const void*>_code_, size, <const char*>_identifier_)
+    check_status(__status__)  # NOTE(review): presumably raises on a non-success status — see check_status
+
+
 cpdef add_reloc(intptr_t handle, code, size_t size):
     """nvFatbinAddReloc adds relocatable PTX entries from a host object to the fatbinary.
 
diff --git a/cuda_bindings/cuda/bindings/nvml.pxd b/cuda_bindings/cuda/bindings/nvml.pxd
index 1822e272f39..45f1d1ac084 100644
--- a/cuda_bindings/cuda/bindings/nvml.pxd
+++ b/cuda_bindings/cuda/bindings/nvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -52,10 +52,14 @@ ctypedef nvmlMask255_t Mask255
 ctypedef nvmlHostname_v1_t Hostname_v1
 ctypedef nvmlUnrepairableMemoryStatus_v1_t UnrepairableMemoryStatus_v1
 ctypedef nvmlRusdSettings_v1_t RusdSettings_v1
+ctypedef nvmlBBXTimeData_v1_t BBXTimeData_v1
+ctypedef nvmlRemappedRowsInfo_v2_t RemappedRowsInfo_v2
+ctypedef nvmlAccountingStats_v2_t AccountingStats_v2
 ctypedef nvmlPowerValue_v2_t PowerValue_v2
 ctypedef nvmlVgpuTypeMaxInstance_v1_t VgpuTypeMaxInstance_v1
 ctypedef nvmlVgpuProcessUtilizationSample_t VgpuProcessUtilizationSample
 ctypedef nvmlGpuFabricInfo_t GpuFabricInfo
+ctypedef nvmlCPERCursor_v1_t CPERCursor_v1
 ctypedef nvmlSystemEventSetCreateRequest_v1_t SystemEventSetCreateRequest_v1
 ctypedef nvmlSystemEventSetFreeRequest_v1_t SystemEventSetFreeRequest_v1
 ctypedef nvmlSystemRegisterEventRequest_v1_t SystemRegisterEventRequest_v1
@@ -67,6 +71,7 @@ ctypedef nvmlWorkloadPowerProfileCurrentProfiles_v1_t WorkloadPowerProfileCurren
 ctypedef nvmlWorkloadPowerProfileRequestedProfiles_v1_t WorkloadPowerProfileRequestedProfiles_v1
 ctypedef nvmlWorkloadPowerProfileUpdateProfiles_v1_t WorkloadPowerProfileUpdateProfiles_v1
 ctypedef nvmlPRMTLV_v1_t PRMTLV_v1
+ctypedef nvmlGetCPER_v1_t GetCPER_v1
 ctypedef nvmlVgpuSchedulerSetState_t VgpuSchedulerSetState
 ctypedef nvmlGpmMetricsGet_t GpmMetricsGet
 ctypedef nvmlPRMCounterList_v1_t PRMCounterList_v1
@@ -137,6 +142,8 @@ ctypedef nvmlPowerProfileType_t _PowerProfileType
 ctypedef nvmlDeviceAddressingModeType_t _DeviceAddressingModeType
 ctypedef nvmlPRMCounterId_t _PRMCounterId
 ctypedef nvmlPowerProfileOperation_t _PowerProfileOperation
+ctypedef nvmlProcessMode_t _ProcessMode
+ctypedef nvmlCPERType_t _CPERType
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/nvml.pyx b/cuda_bindings/cuda/bindings/nvml.pyx
index d2596d72ebf..1b57f8165f2 100644
--- a/cuda_bindings/cuda/bindings/nvml.pyx
+++ b/cuda_bindings/cuda/bindings/nvml.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
 
 cimport cython  # NOQA
 
@@ -422,6 +422,7 @@ class NvlinkVersion(_FastEnum):
     VERSION_3_1 = (NVML_NVLINK_VERSION_3_1, 'NVLink Version 3.1.')
     VERSION_4_0 = (NVML_NVLINK_VERSION_4_0, 'NVLink Version 4.0.')
     VERSION_5_0 = (NVML_NVLINK_VERSION_5_0, 'NVLink Version 5.0.')
+    VERSION_6_0 = (NVML_NVLINK_VERSION_6_0, 'NVLink Version 6.0.')
 
 class EccCounterType(_FastEnum):
     """
@@ -714,6 +715,7 @@ class DeviceGpuRecoveryAction(_FastEnum):
     GPU_RECOVERY_ACTION_NODE_REBOOT = (NVML_GPU_RECOVERY_ACTION_NODE_REBOOT, 'Reboot Node.')
     GPU_RECOVERY_ACTION_DRAIN_P2P = (NVML_GPU_RECOVERY_ACTION_DRAIN_P2P, 'Drain P2P.')
     GPU_RECOVERY_ACTION_DRAIN_AND_RESET = (NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET, 'Drain P2P and Reset Gpu.')
+    GPU_RECOVERY_ACTION_RECOVER_IMEX_DOMAIN = (NVML_GPU_RECOVERY_ACTION_RECOVER_IMEX_DOMAIN, 'Recover IMEX Domain.')
 
 class FanState(_FastEnum):
     """
@@ -994,6 +996,42 @@ class GpmMetricId(_FastEnum):
     GPM_METRIC_GR7_CTXSW_REQUESTS = NVML_GPM_METRIC_GR7_CTXSW_REQUESTS
     GPM_METRIC_GR7_CTXSW_CYCLES_PER_REQ = NVML_GPM_METRIC_GR7_CTXSW_CYCLES_PER_REQ
     GPM_METRIC_GR7_CTXSW_ACTIVE_PCT = NVML_GPM_METRIC_GR7_CTXSW_ACTIVE_PCT
+    GPM_METRIC_NVLINK_L18_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L18_RX_PER_SEC
+    GPM_METRIC_NVLINK_L18_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L18_TX_PER_SEC
+    GPM_METRIC_NVLINK_L19_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L19_RX_PER_SEC
+    GPM_METRIC_NVLINK_L19_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L19_TX_PER_SEC
+    GPM_METRIC_NVLINK_L20_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L20_RX_PER_SEC
+    GPM_METRIC_NVLINK_L20_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L20_TX_PER_SEC
+    GPM_METRIC_NVLINK_L21_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L21_RX_PER_SEC
+    GPM_METRIC_NVLINK_L21_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L21_TX_PER_SEC
+    GPM_METRIC_NVLINK_L22_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L22_RX_PER_SEC
+    GPM_METRIC_NVLINK_L22_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L22_TX_PER_SEC
+    GPM_METRIC_NVLINK_L23_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L23_RX_PER_SEC
+    GPM_METRIC_NVLINK_L23_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L23_TX_PER_SEC
+    GPM_METRIC_NVLINK_L24_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L24_RX_PER_SEC
+    GPM_METRIC_NVLINK_L24_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L24_TX_PER_SEC
+    GPM_METRIC_NVLINK_L25_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L25_RX_PER_SEC
+    GPM_METRIC_NVLINK_L25_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L25_TX_PER_SEC
+    GPM_METRIC_NVLINK_L26_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L26_RX_PER_SEC
+    GPM_METRIC_NVLINK_L26_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L26_TX_PER_SEC
+    GPM_METRIC_NVLINK_L27_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L27_RX_PER_SEC
+    GPM_METRIC_NVLINK_L27_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L27_TX_PER_SEC
+    GPM_METRIC_NVLINK_L28_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L28_RX_PER_SEC
+    GPM_METRIC_NVLINK_L28_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L28_TX_PER_SEC
+    GPM_METRIC_NVLINK_L29_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L29_RX_PER_SEC
+    GPM_METRIC_NVLINK_L29_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L29_TX_PER_SEC
+    GPM_METRIC_NVLINK_L30_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L30_RX_PER_SEC
+    GPM_METRIC_NVLINK_L30_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L30_TX_PER_SEC
+    GPM_METRIC_NVLINK_L31_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L31_RX_PER_SEC
+    GPM_METRIC_NVLINK_L31_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L31_TX_PER_SEC
+    GPM_METRIC_NVLINK_L32_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L32_RX_PER_SEC
+    GPM_METRIC_NVLINK_L32_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L32_TX_PER_SEC
+    GPM_METRIC_NVLINK_L33_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L33_RX_PER_SEC
+    GPM_METRIC_NVLINK_L33_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L33_TX_PER_SEC
+    GPM_METRIC_NVLINK_L34_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L34_RX_PER_SEC
+    GPM_METRIC_NVLINK_L34_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L34_TX_PER_SEC
+    GPM_METRIC_NVLINK_L35_RX_PER_SEC = NVML_GPM_METRIC_NVLINK_L35_RX_PER_SEC
+    GPM_METRIC_NVLINK_L35_TX_PER_SEC = NVML_GPM_METRIC_NVLINK_L35_TX_PER_SEC
     GPM_METRIC_SM_CYCLES_ELAPSED = (NVML_GPM_METRIC_SM_CYCLES_ELAPSED, "The GPU's SM cycles elapsed since reboot.")
     GPM_METRIC_SM_CYCLES_ACTIVE = (NVML_GPM_METRIC_SM_CYCLES_ACTIVE, "The GPU's SM activity since reboot.")
     GPM_METRIC_MMA_CYCLES_ACTIVE = (NVML_GPM_METRIC_MMA_CYCLES_ACTIVE, "The GPU's SM MMA tensor activity since reboot.")
@@ -1043,6 +1081,42 @@ class GpmMetricId(_FastEnum):
     GPM_METRIC_NVLINK_L16_TX = (NVML_GPM_METRIC_NVLINK_L16_TX, 'NvLink write for link 16 in bytes since reboot.')
     GPM_METRIC_NVLINK_L17_RX = (NVML_GPM_METRIC_NVLINK_L17_RX, 'NvLink read for link 17 in bytes since reboot.')
     GPM_METRIC_NVLINK_L17_TX = (NVML_GPM_METRIC_NVLINK_L17_TX, 'NvLink write for link 17 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L18_RX = (NVML_GPM_METRIC_NVLINK_L18_RX, 'NvLink read for link 18 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L18_TX = (NVML_GPM_METRIC_NVLINK_L18_TX, 'NvLink write for link 18 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L19_RX = (NVML_GPM_METRIC_NVLINK_L19_RX, 'NvLink read for link 19 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L19_TX = (NVML_GPM_METRIC_NVLINK_L19_TX, 'NvLink write for link 19 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L20_RX = (NVML_GPM_METRIC_NVLINK_L20_RX, 'NvLink read for link 20 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L20_TX = (NVML_GPM_METRIC_NVLINK_L20_TX, 'NvLink write for link 20 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L21_RX = (NVML_GPM_METRIC_NVLINK_L21_RX, 'NvLink read for link 21 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L21_TX = (NVML_GPM_METRIC_NVLINK_L21_TX, 'NvLink write for link 21 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L22_RX = (NVML_GPM_METRIC_NVLINK_L22_RX, 'NvLink read for link 22 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L22_TX = (NVML_GPM_METRIC_NVLINK_L22_TX, 'NvLink write for link 22 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L23_RX = (NVML_GPM_METRIC_NVLINK_L23_RX, 'NvLink read for link 23 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L23_TX = (NVML_GPM_METRIC_NVLINK_L23_TX, 'NvLink write for link 23 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L24_RX = (NVML_GPM_METRIC_NVLINK_L24_RX, 'NvLink read for link 24 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L24_TX = (NVML_GPM_METRIC_NVLINK_L24_TX, 'NvLink write for link 24 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L25_RX = (NVML_GPM_METRIC_NVLINK_L25_RX, 'NvLink read for link 25 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L25_TX = (NVML_GPM_METRIC_NVLINK_L25_TX, 'NvLink write for link 25 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L26_RX = (NVML_GPM_METRIC_NVLINK_L26_RX, 'NvLink read for link 26 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L26_TX = (NVML_GPM_METRIC_NVLINK_L26_TX, 'NvLink write for link 26 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L27_RX = (NVML_GPM_METRIC_NVLINK_L27_RX, 'NvLink read for link 27 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L27_TX = (NVML_GPM_METRIC_NVLINK_L27_TX, 'NvLink write for link 27 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L28_RX = (NVML_GPM_METRIC_NVLINK_L28_RX, 'NvLink read for link 28 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L28_TX = (NVML_GPM_METRIC_NVLINK_L28_TX, 'NvLink write for link 28 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L29_RX = (NVML_GPM_METRIC_NVLINK_L29_RX, 'NvLink read for link 29 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L29_TX = (NVML_GPM_METRIC_NVLINK_L29_TX, 'NvLink write for link 29 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L30_RX = (NVML_GPM_METRIC_NVLINK_L30_RX, 'NvLink read for link 30 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L30_TX = (NVML_GPM_METRIC_NVLINK_L30_TX, 'NvLink write for link 30 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L31_RX = (NVML_GPM_METRIC_NVLINK_L31_RX, 'NvLink read for link 31 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L31_TX = (NVML_GPM_METRIC_NVLINK_L31_TX, 'NvLink write for link 31 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L32_RX = (NVML_GPM_METRIC_NVLINK_L32_RX, 'NvLink read for link 32 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L32_TX = (NVML_GPM_METRIC_NVLINK_L32_TX, 'NvLink write for link 32 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L33_RX = (NVML_GPM_METRIC_NVLINK_L33_RX, 'NvLink read for link 33 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L33_TX = (NVML_GPM_METRIC_NVLINK_L33_TX, 'NvLink write for link 33 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L34_RX = (NVML_GPM_METRIC_NVLINK_L34_RX, 'NvLink read for link 34 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L34_TX = (NVML_GPM_METRIC_NVLINK_L34_TX, 'NvLink write for link 34 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L35_RX = (NVML_GPM_METRIC_NVLINK_L35_RX, 'NvLink read for link 35 in bytes since reboot.')
+    GPM_METRIC_NVLINK_L35_TX = (NVML_GPM_METRIC_NVLINK_L35_TX, 'NvLink write for link 35 in bytes since reboot.')
     GPM_METRIC_MAX = (NVML_GPM_METRIC_MAX, 'Maximum value above +1.')
 
 class PowerProfileType(_FastEnum):
@@ -1109,6 +1183,27 @@ class PowerProfileOperation(_FastEnum):
     SET_AND_OVERWRITE = (NVML_POWER_PROFILE_OPERATION_SET_AND_OVERWRITE, 'Overwrite the existing list of requested profiles with just the requested profiles.')
     MAX = (NVML_POWER_PROFILE_OPERATION_MAX, 'Max value above +1.')
 
+class ProcessMode(_FastEnum):  # each member below is written as a (C enum value, description) tuple
+    """
+    Enum to represent process mode.
+
+    See `nvmlProcessMode_t`.
+    """
+    COMPUTE = (NVML_PROCESS_MODE_COMPUTE, 'Processes with a compute context.')
+    GRAPHICS = (NVML_PROCESS_MODE_GRAPHICS, 'Processes with a graphics context.')
+    MPS = (NVML_PROCESS_MODE_MPS, 'Processes with a MPS (Multi-Process Service) compute context.')
+    ALL = (NVML_PROCESS_MODE_ALL, 'All processes running on the GPU (compute, graphics, MPS, and other types)')
+    MAX = (NVML_PROCESS_MODE_MAX, 'Maximum value for bounds checking.')  # declared as NVML_PROCESS_MODE_ALL + 1 in cynvml.pxd
+
+class CPERType(_FastEnum):  # member is written as a (C enum value, description) tuple
+    """
+    Bitmask of CPER record types. Multiple values may be combined to
+    request records from several sources in one call.
+
+    See `nvmlCPERType_t`.
+    """
+    CPER_ACCESS_TYPE_GPU = (NVML_CPER_ACCESS_TYPE_GPU, 'Access GPU CPER records.')  # the only flag defined here; declared as (1 << 0) in cynvml.pxd
+
 
 class AffinityScope(_FastEnum):
     NODE = (0, "Scope of NUMA node for affinity queries")
@@ -1903,97 +1998,66 @@ class NvmlError(Exception):
 
 class UninitializedError(NvmlError):
     pass
-
 class InvalidArgumentError(NvmlError):
     pass
-
 class NotSupportedError(NvmlError):
     pass
-
 class NoPermissionError(NvmlError):
     pass
-
 class AlreadyInitializedError(NvmlError):
     pass
-
 class NotFoundError(NvmlError):
     pass
-
 class InsufficientSizeError(NvmlError):
     pass
-
 class InsufficientPowerError(NvmlError):
     pass
-
 class DriverNotLoadedError(NvmlError):
     pass
-
 class TimeoutError(NvmlError):
     pass
-
 class IrqIssueError(NvmlError):
     pass
-
 class LibraryNotFoundError(NvmlError):
     pass
-
 class FunctionNotFoundError(NvmlError):
     pass
-
 class CorruptedInforomError(NvmlError):
     pass
-
 class GpuIsLostError(NvmlError):
     pass
-
 class ResetRequiredError(NvmlError):
     pass
-
 class OperatingSystemError(NvmlError):
     pass
-
 class LibRmVersionMismatchError(NvmlError):
     pass
-
 class InUseError(NvmlError):
     pass
-
 class MemoryError(NvmlError):
     pass
-
 class NoDataError(NvmlError):
     pass
-
 class VgpuEccNotSupportedError(NvmlError):
     pass
-
 class InsufficientResourcesError(NvmlError):
     pass
-
 class FreqNotSupportedError(NvmlError):
     pass
-
 class ArgumentVersionMismatchError(NvmlError):
     pass
-
 class DeprecatedError(NvmlError):
     pass
-
 class NotReadyError(NvmlError):
     pass
-
 class GpuNotFoundError(NvmlError):
     pass
-
 class InvalidStateError(NvmlError):
     pass
-
 class ResetTypeNotSupportedError(NvmlError):
     pass
-
 class UnknownError(NvmlError):
     pass
-
 cdef object _nvml_error_factory(int status):
     cdef object pystatus = status
     if status == 1:
@@ -15537,7 +15601,7 @@ cdef class VgpuSchedulerStateInfo_v2:
 
     @property
     def avg_factor(self):
-        """int: OUT: Average factor in compensating the timeslice for Adaptive Round Robin mode."""
+        """int: OUT: Average factor in compensating the timeslice for Adaptive Round Robin mode. 0 when there is no active scheduling."""
         return self._ptr[0].avgFactor
 
     @avg_factor.setter
@@ -15548,7 +15612,7 @@ cdef class VgpuSchedulerStateInfo_v2:
 
     @property
     def timeslice(self):
-        """int: OUT: The timeslice in ns for each software run list as configured, or the default value otherwise."""
+        """int: OUT: The timeslice in ns for each software run list as configured, or the default value otherwise. 0 when there is no active scheduling."""
         return self._ptr[0].timeslice
 
     @timeslice.setter
@@ -16223,7 +16287,7 @@ cdef class ProcessDetailList_v1:
 
     @property
     def mode(self):
-        """int: Process mode(Compute/Graphics/MPSCompute)"""
+        """int: Process mode, One of `nvmlProcessMode_t`."""
         return self._ptr[0].mode
 
     @mode.setter
@@ -19878,7 +19942,7 @@ cdef class VgpuSchedulerLogInfo_v2:
 
     @property
     def avg_factor(self):
-        """int: OUT: Average factor in compensating the timeslice for Adaptive Round Robin mode."""
+        """int: OUT: Average factor in compensating the timeslice for Adaptive Round Robin mode. 0 when there is no active scheduling."""
         return self._ptr[0].avgFactor
 
     @avg_factor.setter
@@ -19889,7 +19953,7 @@ cdef class VgpuSchedulerLogInfo_v2:
 
     @property
     def timeslice(self):
-        """int: OUT: The timeslice in ns for each software run list as configured, or the default value otherwise."""
+        """int: OUT: The timeslice in ns for each software run list as configured, or the default value otherwise. 0 when there is no active scheduling."""
         return self._ptr[0].timeslice
 
     @timeslice.setter
@@ -23224,10 +23288,10 @@ cpdef object device_get_memory_info_v2(intptr_t device):
 
 
 cpdef int device_get_compute_mode(intptr_t device) except? -1:
-    """Retrieves the current compute mode for the device.
+    """Retrieves the current compute mode for the device or MIG device.
 
     Args:
-        device (intptr_t): The identifier of the target device.
+        device (intptr_t): The identifier of the target device handle or MIG device handle.
 
     Returns:
         int: Reference in which to return the current compute mode.
@@ -24405,10 +24469,10 @@ cpdef device_set_persistence_mode(intptr_t device, int mode):
 
 
 cpdef device_set_compute_mode(intptr_t device, int mode):
-    """Set the compute mode for the device.
+    """Set the compute mode for the device or MIG device.
 
     Args:
-        device (intptr_t): The identifier of the target device.
+        device (intptr_t): The identifier of the target device handle or MIG device handle.
         mode (ComputeMode): The target compute mode.
 
     .. seealso:: `nvmlDeviceSetComputeMode`

From 1bc62abbf48a30f66988e9cb3e0542b90d81a1c4 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 10:09:45 -0700
Subject: [PATCH 03/11] git apply --index
 /home/rgrossekunst/stash/squash_merge_into_public_main_preview_2026-05-26+0012_non_gen_transfer.patch
 (NO MANUAL CHANGES)

---
 cuda_bindings/cuda/bindings/_lib/utils.pxd.in | 12 ++++++
 cuda_bindings/cuda/bindings/_lib/utils.pxi.in | 40 +++++++++++++++++++
 .../cuda/bindings/utils/_ptx_utils.py         |  1 +
 .../tests/cudla/test_cudla_bindings.py        |  2 +-
 cuda_core/cuda/core/system/_nvlink.pxi        |  1 +
 cuda_core/tests/test_memory.py                | 30 +++++++++-----
 .../_dynamic_libs/descriptor_catalog.py       |  2 +
 toolshed/check_spdx.py                        |  1 +
 8 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxd.in b/cuda_bindings/cuda/bindings/_lib/utils.pxd.in
index 353a07a09c4..79ae1404c89 100644
--- a/cuda_bindings/cuda/bindings/_lib/utils.pxd.in
+++ b/cuda_bindings/cuda/bindings/_lib/utils.pxd.in
@@ -142,6 +142,18 @@ cdef class _HelperCUmemAllocationHandleType:
     cdef driver.CUmemFabricHandle _mem_fabric_handle
     {{endif}}
 {{endif}}
+{{if 'CUlogicalEndpointIpcHandleType_enum' in found_types}}
+
+cdef class _HelperCUlogicalEndpointIpcHandleType:
+    cdef void* _cptr
+    cdef cydriver.CUlogicalEndpointIpcHandleType_enum _type
+
+    # Return values
+    cdef int _int
+    {{if 'CUlogicalEndpointFabricHandle' in found_types}}
+    cdef driver.CUlogicalEndpointFabricHandle _fabric_handle
+    {{endif}}
+{{endif}}
 
 cdef class _InputVoidPtrPtrHelper:
     cdef object _references
diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in
index c88ec497215..16c2e7685f6 100644
--- a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in
+++ b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in
@@ -594,6 +594,46 @@ cdef class _HelperCUmemAllocationHandleType:
         else:
             raise TypeError('Unsupported attribute: {}'.format(self._type))
 {{endif}}
+{{if 'CUlogicalEndpointIpcHandleType_enum' in found_types}}
+
+cdef class _HelperCUlogicalEndpointIpcHandleType:
+    def __cinit__(self, attr):
+        self._type = attr.value
+        if False:
+            pass
+        {{if 'CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE' in found_values}}
+        elif self._type in (cydriver.CUlogicalEndpointIpcHandleType_enum.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE,):
+            self._cptr = <void*>&self._int
+        {{endif}}
+        {{if 'CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC' in found_values}}
+        elif self._type in (cydriver.CUlogicalEndpointIpcHandleType_enum.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC,):
+            self._fabric_handle = _driver["CUlogicalEndpointFabricHandle"]()
+            self._cptr = <void*><void_ptr>self._fabric_handle.getPtr()
+        {{endif}}
+        else:
+            raise TypeError('Unsupported attribute: {}'.format(attr.name))
+
+    def __dealloc__(self):
+        pass
+
+    @property
+    def cptr(self):
+        return <void_ptr>self._cptr
+
+    def pyObj(self):
+        if False:
+            pass
+        {{if 'CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE' in found_values}}
+        elif self._type in (cydriver.CUlogicalEndpointIpcHandleType_enum.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE,):
+            return self._int
+        {{endif}}
+        {{if 'CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC' in found_values}}
+        elif self._type in (cydriver.CUlogicalEndpointIpcHandleType_enum.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC,):
+            return self._fabric_handle
+        {{endif}}
+        else:
+            raise TypeError('Unsupported attribute: {}'.format(self._type))
+{{endif}}
 
 cdef class _InputVoidPtrPtrHelper:
     def __cinit__(self, lst):
diff --git a/cuda_bindings/cuda/bindings/utils/_ptx_utils.py b/cuda_bindings/cuda/bindings/utils/_ptx_utils.py
index b9df6503120..5ba21c398c2 100644
--- a/cuda_bindings/cuda/bindings/utils/_ptx_utils.py
+++ b/cuda_bindings/cuda/bindings/utils/_ptx_utils.py
@@ -50,6 +50,7 @@
     "9.0": (13, 0),
     "9.1": (13, 1),
     "9.2": (13, 2),
+    "9.3": (13, 3),
 }
 
 
diff --git a/cuda_bindings/tests/cudla/test_cudla_bindings.py b/cuda_bindings/tests/cudla/test_cudla_bindings.py
index 529f9f6d58d..c70b3f692d3 100644
--- a/cuda_bindings/tests/cudla/test_cudla_bindings.py
+++ b/cuda_bindings/tests/cudla/test_cudla_bindings.py
@@ -38,7 +38,7 @@ def test_status_values(self):
         assert cudla.Status.ErrorUnknown == 0x7FFFFFFF
 
     def test_status_member_count(self):
-        assert len(cudla.Status) == 23
+        assert len(cudla.Status) == 24
 
     def test_mode_values(self):
         assert cudla.Mode.CUDA_DLA == 0
diff --git a/cuda_core/cuda/core/system/_nvlink.pxi b/cuda_core/cuda/core/system/_nvlink.pxi
index ad246b8364f..9b9ae61cfef 100644
--- a/cuda_core/cuda/core/system/_nvlink.pxi
+++ b/cuda_core/cuda/core/system/_nvlink.pxi
@@ -11,6 +11,7 @@ _NVLINK_VERSION_MAPPING = {
     nvml.NvlinkVersion.VERSION_3_1: (3, 1),
     nvml.NvlinkVersion.VERSION_4_0: (4, 0),
     nvml.NvlinkVersion.VERSION_5_0: (5, 0),
+    nvml.NvlinkVersion.VERSION_6_0: (6, 0),
 }
 
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 219e8f0a56b..cc143e322a2 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -60,6 +60,19 @@
 POOL_SIZE = 2097152  # 2MB size
 
 
+def _allocate_pinned_buffer_or_xfail(mr, size, *, device):
+    try:
+        return mr.allocate(size, stream=device.default_stream)
+    except CUDAError as exc:
+        if "CUDA_ERROR_OUT_OF_MEMORY" in str(exc):
+            pytest.xfail("TODO(#9999): Resolve CUDA_ERROR_OUT_OF_MEMORY")
+        raise
+    except RuntimeError as exc:
+        if "Failed to allocate memory from pool" in str(exc):
+            pytest.xfail("TODO(#9999): Resolve Failed to allocate memory from pool")
+        raise
+
+
 class DummyHostMemoryResource(MemoryResource):
     # Pure-host ctypes allocation; stream is accepted for interface
     # conformance but ignored.
@@ -682,7 +695,11 @@ def test_non_managed_resources_report_not_managed(mr_kind):
         skip_if_pinned_memory_unsupported(device)
         mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_managed is False
-    buf = mr.allocate(1024, stream=device.default_stream)
+    buf = (
+        _allocate_pinned_buffer_or_xfail(mr, 1024, device=device)
+        if mr_kind == "pinned"
+        else mr.allocate(1024, stream=device.default_stream)
+    )
     assert buf.is_managed is False
     buf.close()
 
@@ -730,16 +747,7 @@ def test_pinned_memory_resource_initialization(init_cuda):
     assert mr.is_host_accessible
 
     # Test allocation/deallocation works
-    try:
-        buffer = mr.allocate(1024, stream=device.default_stream)
-    except CUDAError as exc:
-        msg = str(exc)
-        if "CUDA_ERROR_OUT_OF_MEMORY" in msg:
-            pytest.xfail("TODO(#9999): Resolve CUDA_ERROR_OUT_OF_MEMORY")
-    except RuntimeError as exc:
-        msg = str(exc)
-        if "Failed to allocate memory from pool" in msg:
-            pytest.xfail("TODO(#9999): Resolve Failed to allocate memory from pool")
+    buffer = _allocate_pinned_buffer_or_xfail(mr, 1024, device=device)
     assert buffer.size == 1024
     assert buffer.device_id == -1  # Not bound to any GPU
     assert buffer.is_host_accessible
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py
index e334e04ddf2..ba8ed82cdbb 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py
@@ -271,6 +271,8 @@ class DescriptorSpec:
         packaged_with="ctk",
         linux_sonames=("libcupti.so.12", "libcupti.so.13"),
         windows_dlls=(
+            "cupti64_2026.3.0.dll",
+            "cupti64_2026.2.0.dll",
             "cupti64_2026.1.1.dll",
             "cupti64_2026.1.0.dll",
             "cupti64_2025.4.1.dll",
diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py
index 3d521425540..e119eaa4795 100644
--- a/toolshed/check_spdx.py
+++ b/toolshed/check_spdx.py
@@ -31,6 +31,7 @@
     "cuda_pathfinder": "Apache-2.0",
     "cuda_python": "LicenseRef-NVIDIA-SOFTWARE-LICENSE",
     "cuda_python_test_helpers": "Apache-2.0",
+    "qa": "LicenseRef-NVIDIA-SOFTWARE-LICENSE",
     "scripts": "Apache-2.0",
     "toolshed": "Apache-2.0",
 }

From 84b21cdb0ef4e540ae02b54410f1cd2365bd61ce Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 10:23:30 -0700
Subject: [PATCH 04/11] Update ci/versions.yml: build with 13.3.0

---
 ci/versions.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/versions.yml b/ci/versions.yml
index fb749c0718d..0f0ab251e50 100644
--- a/ci/versions.yml
+++ b/ci/versions.yml
@@ -5,6 +5,6 @@ backport_branch: "12.9.x"  # keep in sync with target-branch in .github/dependab
 
 cuda:
   build:
-    version: "13.2.1"
+    version: "13.3.0"
   prev_build:
     version: "12.9.1"

From 10f05239624d9a3f34b812af9b83f1250087aec0 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 13:10:50 -0700
Subject: [PATCH 05/11] Guard NVLink 6 mapping for older bindings

---
 cuda_core/cuda/core/system/_nvlink.pxi | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/system/_nvlink.pxi b/cuda_core/cuda/core/system/_nvlink.pxi
index 9b9ae61cfef..62ab4e716be 100644
--- a/cuda_core/cuda/core/system/_nvlink.pxi
+++ b/cuda_core/cuda/core/system/_nvlink.pxi
@@ -11,9 +11,12 @@ _NVLINK_VERSION_MAPPING = {
     nvml.NvlinkVersion.VERSION_3_1: (3, 1),
     nvml.NvlinkVersion.VERSION_4_0: (4, 0),
     nvml.NvlinkVersion.VERSION_5_0: (5, 0),
-    nvml.NvlinkVersion.VERSION_6_0: (6, 0),
 }
 
+_NVLINK_VERSION_6_0 = getattr(nvml.NvlinkVersion, "VERSION_6_0", None)
+if _NVLINK_VERSION_6_0 is not None:
+    _NVLINK_VERSION_MAPPING[_NVLINK_VERSION_6_0] = (6, 0)
+
 
 cdef class NvlinkInfo:
     """

From 89245f013307fc1a2cd7650e8065ec8e6b46c1a2 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 11:19:26 -0700
Subject: [PATCH 06/11] Add cuda-pathfinder 1.5.5 release notes

---
 cuda_pathfinder/docs/nv-versions.json         |  4 +++
 .../docs/source/release/1.5.5-notes.rst       | 25 +++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 cuda_pathfinder/docs/source/release/1.5.5-notes.rst

diff --git a/cuda_pathfinder/docs/nv-versions.json b/cuda_pathfinder/docs/nv-versions.json
index 379c772ebee..1d427da34be 100644
--- a/cuda_pathfinder/docs/nv-versions.json
+++ b/cuda_pathfinder/docs/nv-versions.json
@@ -3,6 +3,10 @@
         "version": "latest",
         "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/latest/"
     },
+    {
+        "version": "1.5.5",
+        "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.5.5/"
+    },
     {
         "version": "1.5.4",
         "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.5.4/"
diff --git a/cuda_pathfinder/docs/source/release/1.5.5-notes.rst b/cuda_pathfinder/docs/source/release/1.5.5-notes.rst
new file mode 100644
index 00000000000..1de21378f91
--- /dev/null
+++ b/cuda_pathfinder/docs/source/release/1.5.5-notes.rst
@@ -0,0 +1,25 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. py:currentmodule:: cuda.pathfinder
+
+``cuda-pathfinder`` 1.5.5 Release notes
+=======================================
+
+Highlights
+----------
+
+* Add support for the Windows CUDA 13.3 CUPTI DLL name
+  ``cupti64_2026.2.0.dll`` so
+  ``load_nvidia_dynamic_lib("cupti")`` can recognize CTK 13.3 installations.
+  (`PR #2139 <https://github.com/NVIDIA/cuda-python/pull/2139>`_)
+
+Bugfixes
+--------
+
+* On Windows, find ``cudadevrt`` in both CUDA 13 and CUDA 12 conda static-library
+  layouts. This lets ``locate_static_lib("cudadevrt")`` and
+  ``find_static_lib("cudadevrt")`` resolve CUDA 12 conda environments under
+  ``Library/lib`` instead of falling through to ``CUDA_PATH`` and reporting a
+  misleading missing-library error.
+  (`PR #2015 <https://github.com/NVIDIA/cuda-python/pull/2015>`_)

From 667d7bec79b9f5c8f302390da0c24a2ad0e41124 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 11:38:32 -0700
Subject: [PATCH 07/11] Consolidate cuda-bindings 13.3.0 release notes

---
 cuda_bindings/docs/nv-versions.json           |  8 +++
 .../docs/source/release/13.2.1-notes.rst      | 14 -----
 .../docs/source/release/13.3.0-notes.rst      | 56 ++++++++++++++++++-
 3 files changed, 61 insertions(+), 17 deletions(-)
 delete mode 100644 cuda_bindings/docs/source/release/13.2.1-notes.rst

diff --git a/cuda_bindings/docs/nv-versions.json b/cuda_bindings/docs/nv-versions.json
index 3b37062e6f5..5aeaf7bdfee 100644
--- a/cuda_bindings/docs/nv-versions.json
+++ b/cuda_bindings/docs/nv-versions.json
@@ -3,6 +3,14 @@
         "version": "latest",
         "url": "https://nvidia.github.io/cuda-python/cuda-bindings/latest/"
     },
+    {
+        "version": "13.3.0",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/13.3.0/"
+    },
+    {
+        "version": "13.2.0",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/13.2.0/"
+    },
     {
         "version": "13.1.1",
         "url": "https://nvidia.github.io/cuda-python/cuda-bindings/13.1.1/"
diff --git a/cuda_bindings/docs/source/release/13.2.1-notes.rst b/cuda_bindings/docs/source/release/13.2.1-notes.rst
deleted file mode 100644
index a1566e028d9..00000000000
--- a/cuda_bindings/docs/source/release/13.2.1-notes.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-.. module:: cuda.bindings
-
-``cuda-bindings`` 13.2.1 Release notes
-======================================
-
-Bugfixes
---------
-
-* Per-thread default stream mode would be used whenever the
-  ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` environment variable was set,
-  even if it was set to ``0``.
diff --git a/cuda_bindings/docs/source/release/13.3.0-notes.rst b/cuda_bindings/docs/source/release/13.3.0-notes.rst
index ffa53dc4d30..409e2a5a411 100644
--- a/cuda_bindings/docs/source/release/13.3.0-notes.rst
+++ b/cuda_bindings/docs/source/release/13.3.0-notes.rst
@@ -1,4 +1,4 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 .. module:: cuda.bindings
@@ -9,10 +9,40 @@
 Highlights
 ----------
 
+* Support for new APIs introduced in CUDA 13.3, including driver logical
+  endpoint APIs, graph recapture APIs, NVRTC Tile IR and bundled-header APIs,
+  and related runtime graph/event APIs.
+  (`PR #2139 <https://github.com/NVIDIA/cuda-python/pull/2139>`_)
+
+* Add ``cuda.bindings.cudla`` bindings.
+  (`PR #2034 <https://github.com/NVIDIA/cuda-python/pull/2034>`_)
+
+* Add the ``nvvmLLVMVersion`` binding.
+  (`PR #1774 <https://github.com/NVIDIA/cuda-python/pull/1774>`_)
+
+* Add additional NVML APIs introduced in CUDA 13.2.
+  (`PR #1830 <https://github.com/NVIDIA/cuda-python/pull/1830>`_)
 
 Bugfixes
 --------
 
+* Fixed the ``cuDevSmResourceSplit`` and ``cudaDevSmResourceSplit`` binding
+  signatures so ``groupParams`` is accepted as a sequence matching the CUDA API.
+  (`PR #1766 <https://github.com/NVIDIA/cuda-python/pull/1766>`_)
+
+* Fixed nested resource pointer handling to accept both ``str`` and ``bytes``
+  inputs.
+  (`PR #1698 <https://github.com/NVIDIA/cuda-python/pull/1698>`_)
+
+* Fixed ``nvmlDeviceGetFieldValues`` and ``nvmlDeviceClearFieldValues`` handling
+  of empty field lists so they return empty results instead of raising
+  ``NVML_ERROR_INVALID_ARGUMENT``.
+  (`PR #1982 <https://github.com/NVIDIA/cuda-python/pull/1982>`_)
+
+* Fixed ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=0`` incorrectly enabling
+  per-thread default stream mode.
+  (`PR #2076 <https://github.com/NVIDIA/cuda-python/pull/2076>`_)
+
 * Fixed a use-after-free in ``cudaGraphGetEdges``, ``cudaGraphNodeGetDependencies``,
   ``cudaGraphNodeGetDependentNodes``, ``cudaStreamGetCaptureInfo``, and their
   driver-API counterparts (``cuGraphGetEdges``, ``cuGraphNodeGetDependencies``,
@@ -21,7 +51,8 @@ Bugfixes
   buffer that was freed before the call returned, leaving every wrapper holding
   a dangling pointer. The returned wrappers now own deep copies of the edge
   data.
-  (`Issue #1804 <https://github.com/NVIDIA/cuda-python/issues/1804>`_)
+  (`Issue #1804 <https://github.com/NVIDIA/cuda-python/issues/1804>`_,
+  `PR #2083 <https://github.com/NVIDIA/cuda-python/pull/2083>`_)
 
 * Fixed a double-free in the generated setters for list-valued struct members
   (e.g. ``CUlaunchConfig.attrs``, ``CUDA_MEM_ALLOC_NODE_PARAMS.accessDescs``,
@@ -29,11 +60,30 @@ Bugfixes
   counterparts). Assigning an empty list freed the internal buffer but left
   the cached pointer non-NULL, so a subsequent assignment or ``__dealloc__``
   would call ``free()`` again on the dangling pointer.
-
+  (`PR #2112 <https://github.com/NVIDIA/cuda-python/pull/2112>`_)
 
 Miscellaneous
 -------------
 
+* Add ``cuda.bindings.utils.check_nvvm_compiler_options()`` to check whether a
+  set of NVVM compiler options is supported by the installed NVVM library.
+  (`PR #1837 <https://github.com/NVIDIA/cuda-python/pull/1837>`_)
+
+* NVRTC bindings now use pre-generated Cython files and no longer require
+  pyclibrary header parsing at build time.
+  (`PR #1900 <https://github.com/NVIDIA/cuda-python/pull/1900>`_)
+
+* Improved generated documentation and argument names, including a fix for the
+  ``ind_ex`` argument naming bug.
+  (`PR #1927 <https://github.com/NVIDIA/cuda-python/pull/1927>`_,
+  `PR #2082 <https://github.com/NVIDIA/cuda-python/pull/2082>`_)
+
+* Fixed ``cuda-bindings`` debug builds.
+  (`PR #1890 <https://github.com/NVIDIA/cuda-python/pull/1890>`_)
+
+* Declare ``cuda-pathfinder`` as a host dependency for pixi path-dependency
+  builds of ``cuda-bindings``.
+  (`PR #1926 <https://github.com/NVIDIA/cuda-python/pull/1926>`_)
 
 Known issues
 ------------

From a50542da32b30f215035d7fd7c4a7563b4d0b170 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 12:04:49 -0700
Subject: [PATCH 08/11] Add CUDA Python 13.3.0 release notes

---
 cuda_python/docs/nv-versions.json                |  8 ++++++++
 cuda_python/docs/source/release/13.3.0-notes.rst | 16 ++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 cuda_python/docs/source/release/13.3.0-notes.rst

diff --git a/cuda_python/docs/nv-versions.json b/cuda_python/docs/nv-versions.json
index 64d47c99ffa..22b95f5e182 100644
--- a/cuda_python/docs/nv-versions.json
+++ b/cuda_python/docs/nv-versions.json
@@ -3,6 +3,14 @@
         "version": "latest",
         "url": "https://nvidia.github.io/cuda-python/latest/"
     },
+    {
+        "version": "13.3.0",
+        "url": "https://nvidia.github.io/cuda-python/13.3.0/"
+    },
+    {
+        "version": "13.2.0",
+        "url": "https://nvidia.github.io/cuda-python/13.2.0/"
+    },
     {
         "version": "13.1.1",
         "url": "https://nvidia.github.io/cuda-python/13.1.1/"
diff --git a/cuda_python/docs/source/release/13.3.0-notes.rst b/cuda_python/docs/source/release/13.3.0-notes.rst
new file mode 100644
index 00000000000..8861d331a12
--- /dev/null
+++ b/cuda_python/docs/source/release/13.3.0-notes.rst
@@ -0,0 +1,16 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+CUDA Python 13.3.0 Release notes
+================================
+
+Included components
+-------------------
+
+* `cuda.bindings 13.3.0 <https://nvidia.github.io/cuda-python/cuda-bindings/13.3.0/release/13.3.0-notes.html>`_
+* `cuda.pathfinder 1.5.5 <https://nvidia.github.io/cuda-python/cuda-pathfinder/1.5.5/release/1.5.5-notes.html>`_
+
+Known issues
+------------
+
+* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by running ``pip uninstall -y cuda-python`` followed by ``pip install cuda-python``.

From 446ea7562156d2332112df4cf5dccc80cefcce5c Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 12:34:23 -0700
Subject: [PATCH 09/11] Add 12.9.7 release notes

---
 cuda_bindings/docs/nv-versions.json           | 28 ++++++++++
 .../docs/source/release/12.9.7-notes.rst      | 53 +++++++++++++++++++
 cuda_python/docs/nv-versions.json             |  8 +++
 .../docs/source/release/12.9.7-notes.rst      | 16 ++++++
 4 files changed, 105 insertions(+)
 create mode 100644 cuda_bindings/docs/source/release/12.9.7-notes.rst
 create mode 100644 cuda_python/docs/source/release/12.9.7-notes.rst

diff --git a/cuda_bindings/docs/nv-versions.json b/cuda_bindings/docs/nv-versions.json
index 5aeaf7bdfee..9ce7781b84e 100644
--- a/cuda_bindings/docs/nv-versions.json
+++ b/cuda_bindings/docs/nv-versions.json
@@ -35,6 +35,34 @@
         "version": "13.0.0",
         "url": "https://nvidia.github.io/cuda-python/cuda-bindings/13.0.0/"
     },
+    {
+        "version": "12.9.7",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.7/"
+    },
+    {
+        "version": "12.9.6",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.6/"
+    },
+    {
+        "version": "12.9.5",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.5/"
+    },
+    {
+        "version": "12.9.4",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.4/"
+    },
+    {
+        "version": "12.9.3",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.3/"
+    },
+    {
+        "version": "12.9.2",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.2/"
+    },
+    {
+        "version": "12.9.1",
+        "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.1/"
+    },
     {
         "version": "12.9.0",
         "url": "https://nvidia.github.io/cuda-python/cuda-bindings/12.9.0/"
diff --git a/cuda_bindings/docs/source/release/12.9.7-notes.rst b/cuda_bindings/docs/source/release/12.9.7-notes.rst
new file mode 100644
index 00000000000..7670290acf5
--- /dev/null
+++ b/cuda_bindings/docs/source/release/12.9.7-notes.rst
@@ -0,0 +1,53 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+.. module:: cuda.bindings
+
+``cuda-bindings`` 12.9.7 Release notes
+======================================
+
+Bugfixes
+--------
+
+* Fixed ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=0`` incorrectly enabling
+  per-thread default stream mode.
+  (`PR #2110 <https://github.com/NVIDIA/cuda-python/pull/2110>`_)
+
+* Fixed a use-after-free in ``cudaGraphGetEdges``, ``cudaGraphNodeGetDependencies``,
+  ``cudaGraphNodeGetDependentNodes``, ``cudaStreamGetCaptureInfo``, and their
+  driver-API counterparts (``cuGraphGetEdges``, ``cuGraphNodeGetDependencies``,
+  ``cuGraphNodeGetDependentNodes``, ``cuStreamGetCaptureInfo``). The returned
+  ``cudaGraphEdgeData``/``CUgraphEdgeData`` wrappers were backed by a scratch
+  buffer that was freed before the call returned, leaving every wrapper holding
+  a dangling pointer. The returned wrappers now own deep copies of the edge
+  data.
+  (`Issue #1804 <https://github.com/NVIDIA/cuda-python/issues/1804>`_,
+  `PR #2110 <https://github.com/NVIDIA/cuda-python/pull/2110>`_)
+
+* Fixed a double-free in the generated setters for list-valued struct members
+  (e.g. ``CUlaunchConfig.attrs``, ``CUDA_MEM_ALLOC_NODE_PARAMS.accessDescs``,
+  external-semaphore and batch-mem-op node parameter arrays, and their runtime
+  counterparts). Assigning an empty list freed the internal buffer but left
+  the cached pointer non-NULL, so a subsequent assignment or ``__dealloc__``
+  would call ``free()`` again on the dangling pointer.
+  (`PR #2115 <https://github.com/NVIDIA/cuda-python/pull/2115>`_)
+
+Miscellaneous
+-------------
+
+* NVRTC bindings now use pre-generated Cython files and no longer require
+  pyclibrary header parsing at build time.
+  (`PR #1957 <https://github.com/NVIDIA/cuda-python/pull/1957>`_)
+
+* Improved generated documentation and argument names, including a fix for the
+  ``ind_ex`` argument naming bug.
+  (`PR #1928 <https://github.com/NVIDIA/cuda-python/pull/1928>`_,
+  `PR #2110 <https://github.com/NVIDIA/cuda-python/pull/2110>`_)
+
+* Source archives now include git archival metadata for setuptools-scm.
+  (`PR #1756 <https://github.com/NVIDIA/cuda-python/pull/1756>`_)
+
+Known issues
+------------
+
+* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by running ``pip uninstall -y cuda-python`` followed by ``pip install cuda-python``.
diff --git a/cuda_python/docs/nv-versions.json b/cuda_python/docs/nv-versions.json
index 22b95f5e182..1b35c844a1c 100644
--- a/cuda_python/docs/nv-versions.json
+++ b/cuda_python/docs/nv-versions.json
@@ -35,6 +35,14 @@
         "version": "13.0.0",
         "url": "https://nvidia.github.io/cuda-python/13.0.0/"
     },
+    {
+        "version": "12.9.7",
+        "url": "https://nvidia.github.io/cuda-python/12.9.7/"
+    },
+    {
+        "version": "12.9.6",
+        "url": "https://nvidia.github.io/cuda-python/12.9.6/"
+    },
     {
         "version": "12.9.5",
         "url": "https://nvidia.github.io/cuda-python/12.9.5/"
diff --git a/cuda_python/docs/source/release/12.9.7-notes.rst b/cuda_python/docs/source/release/12.9.7-notes.rst
new file mode 100644
index 00000000000..71e0d2630ef
--- /dev/null
+++ b/cuda_python/docs/source/release/12.9.7-notes.rst
@@ -0,0 +1,16 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+CUDA Python 12.9.7 Release notes
+================================
+
+Included components
+-------------------
+
+* `cuda.bindings 12.9.7 <https://nvidia.github.io/cuda-python/cuda-bindings/12.9.7/release/12.9.7-notes.html>`_
+* `cuda.pathfinder 1.5.5 <https://nvidia.github.io/cuda-python/cuda-pathfinder/1.5.5/release/1.5.5-notes.html>`_
+
+Known issues
+------------
+
+* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by running ``pip uninstall -y cuda-python`` followed by ``pip install cuda-python``.

From d72799005cddecd9724728366052d47ea3d5cfde Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Tue, 26 May 2026 13:19:18 -0700
Subject: [PATCH 10/11] Enable security scans on ctk-next

Run Bandit and CodeQL on ctk-next pushes and grant the scanner jobs the read permissions needed to check out private repository contents and inspect workflow runs.
---
 .github/workflows/bandit.yml | 5 ++++-
 .github/workflows/codeql.yml | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml
index 4abcf260336..bb785e30907 100644
--- a/.github/workflows/bandit.yml
+++ b/.github/workflows/bandit.yml
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -8,6 +8,7 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
+      - "ctk-next"
       - "main"
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
@@ -17,6 +18,8 @@ jobs:
   analyze:
     runs-on: ubuntu-latest
     permissions:
+      actions: read
+      contents: read
       security-events: write
     steps:
       - name: Checkout
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 1f36bd0d694..4019054f6b9 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -8,6 +8,7 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
+      - "ctk-next"
       - "main"
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
@@ -18,6 +19,8 @@ jobs:
     name: Analyze (${{ matrix.language }})
     runs-on: ubuntu-latest
     permissions:
+      actions: read
+      contents: read
       security-events: write
 
     strategy:

From ba7debd90a51135eeb149d764b53e00a7cd44ca4 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 26 May 2026 14:14:49 -0700
Subject: [PATCH 11/11] Xfail MCDM mempool OOM with older bindings

Keep cuda-core tests using published older cuda-bindings wheels from failing when the shared mempool xfail helper is unavailable.
---
 .../cuda/bindings/_test_helpers/mempool.py    |  3 ++
 cuda_core/tests/conftest.py                   | 46 +++++++++++++++++--
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_test_helpers/mempool.py b/cuda_bindings/cuda/bindings/_test_helpers/mempool.py
index deee79f1aff..3113113d251 100644
--- a/cuda_bindings/cuda/bindings/_test_helpers/mempool.py
+++ b/cuda_bindings/cuda/bindings/_test_helpers/mempool.py
@@ -8,6 +8,9 @@
 from cuda.bindings import driver, runtime
 
 
+# Keep in sync with the fallback in cuda_core/tests/conftest.py. The cuda_core
+# copy is intentionally simpler because it only handles cuda_core CUDAError
+# exceptions when this helper is absent from older published bindings.
 def is_windows_mcdm_device(device=0):
     if sys.platform != "win32":
         return False
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 19992015825..44e8a747d25 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -34,10 +34,50 @@
     from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 except ModuleNotFoundError:
     # Older cuda.bindings artifacts (for example 12.9.x backports) do not ship
-    # this helper yet. In that case, keep the primary failure visible instead of
-    # xfail-ing the known Windows MCDM mempool setup issue.
+    # this helper yet. Keep the fallback local so tests against published
+    # bindings still xfail the known Windows MCDM mempool setup issue.
+    #
+    # Keep in sync with cuda_bindings/cuda/bindings/_test_helpers/mempool.py.
+    # This copy is intentionally simpler because it only handles cuda_core
+    # CUDAError exceptions when the shared helper is absent.
+    def _is_windows_mcdm_device(device=0):  # True only on win32 when NVML's current driver model is DRIVER_MCDM
+        if sys.platform != "win32":
+            return False
+        import cuda.bindings.nvml as nvml  # deferred: only reached past the win32 guard
+
+        device_id = int(getattr(device, "device_id", device))  # .device_id if present, else device itself
+        (err,) = driver.cuInit(0)
+        if err != driver.CUresult.CUDA_SUCCESS:
+            return False
+        err, pci_bus_id = driver.cuDeviceGetPCIBusId(13, device_id)  # 13: presumably "0000:00:00.0" + NUL (confirm)
+        if err != driver.CUresult.CUDA_SUCCESS:
+            return False
+        pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")  # keep only the bytes before the first NUL
+        nvml.init_v2()
+        try:
+            handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
+            current, _ = nvml.device_get_driver_model_v2(handle)  # second element of the pair is unused here
+            return current == nvml.DriverModel.DRIVER_MCDM
+        finally:
+            nvml.shutdown()  # always runs once init_v2() above has succeeded
+
     def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
-        return
+        if api_name is not None and not isinstance(api_name, str):  # a non-str api_name is taken to be the device
+            device = api_name
+            api_name = None
+
+        if "CUDA_ERROR_OUT_OF_MEMORY" not in str(err_or_exc):  # only an OOM error is a candidate for xfail
+            return
+        try:
+            is_windows_mcdm = _is_windows_mcdm_device(device)
+        except Exception:
+            # If MCDM detection fails, leave the primary test failure visible.
+            return
+        if not is_windows_mcdm:
+            return
+
+        api_context = f"{api_name} " if api_name else ""  # optional "<api_name> " prefix for the xfail reason
+        pytest.xfail(f"{api_context}could not reserve VA for mempool operations on Windows MCDM")
 
 
 # Import shared test helpers for tests across subprojects.