Skip to content

Commit 37e9958

Browse files
nimlgenmitkotakgfokkema
committed
Init cudagraph
Co-authored-by: Mit Kotak <53411468+mitkotak@users.noreply.github.com> Co-authored-by: Gerlof Fokkema <gerlof.fokkema@gmail.com>
1 parent 44bff55 commit 37e9958

File tree

6 files changed

+747
-68
lines changed

6 files changed

+747
-68
lines changed

doc/driver.rst

+125
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,22 @@ Constants
605605

606606
.. attribute:: LAZY_ENABLE_PEER_ACCESS
607607

608+
.. class:: capture_mode
609+
610+
CUDA 10 and newer.
611+
612+
.. attribute:: GLOBAL
613+
.. attribute:: THREAD_LOCAL
614+
.. attribute:: RELAXED
615+
616+
.. class:: capture_status
617+
618+
CUDA 10 and newer.
619+
620+
.. attribute:: NONE
621+
.. attribute:: ACTIVE
622+
.. attribute:: INVALIDATED
623+
608624

609625
Graphics-related constants
610626
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -845,6 +861,43 @@ Concurrency and Streams
845861

846862
.. versionadded:: 2011.1
847863

864+
.. method:: begin_capture(capture_mode=capture_mode.GLOBAL)
865+
866+
Begins graph stream capture on a stream.
867+
868+
When a stream is in capture mode, all operations pushed into the stream
869+
will not be executed, but will instead be captured into a graph.
870+
871+
:arg capture_mode: A :class:`capture_mode` specifying mode for capturing graph.
872+
873+
CUDA 10 and above.
874+
875+
.. method:: end_capture()
876+
877+
Ends stream capture and returns a :class:`Graph` object.
878+
879+
CUDA 10 and above.
880+
881+
.. method:: get_capture_info_v2()
882+
883+
Query a stream's capture state.
884+
885+
Return a :class:`tuple` of (:class:`capture_status` capture status, :class:`int` id for the capture sequence,
886+
:class:`Graph` the graph being captured into, a :class:`list` of :class:`GraphNode` specifying set of nodes the
887+
next node to be captured in the stream will depend on)
888+
889+
CUDA 10 and above.
890+
891+
.. method:: update_capture_dependencies(dependencies, flags)
892+
893+
Modifies the dependency set of a capturing stream.
894+
The dependency set is the set of nodes that the next captured node in the stream will depend on.
895+
896+
:arg dependencies: A :class:`list` of :class:`GraphNode` specifying the new list of dependencies.
897+
:arg flags: A :class:`int` controlling whether the set passed to the API is added to the existing set or replaces it.
898+
899+
CUDA 11.3 and above.
900+
848901
.. class:: Event(flags=0)
849902

850903
An event is a temporal 'marker' in a :class:`Stream` that allows taking the time
@@ -895,6 +948,78 @@ Concurrency and Streams
895948

896949
.. versionadded: 2011.2
897950
951+
CUDAGraphs
952+
----------
953+
954+
CUDA 10.0 and above
955+
956+
Launching a simple kernel using CUDAGraphs
957+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
958+
959+
.. literalinclude:: ../examples/cudagraph_kernel.py
960+
961+
.. class:: GraphNode
962+
963+
An object representing a node on :class:`Graph`.
964+
965+
Wraps `cuGraphNode <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc72514a94dacc85ed0617f979211079c>`_
966+
967+
.. class:: GraphExec
968+
969+
An executable graph to be launched on a stream.
970+
971+
Wraps `cuGraphExec <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gf0abeceeaa9f0a39592fe36a538ea1f0>`_
972+
973+
.. method:: launch(stream_py=None)
974+
975+
Launches an executable graph in a stream.
976+
977+
:arg stream_py: :class:`Stream` object specifying device stream.
978+
Will use default stream if *stream_py* is None.
979+
980+
.. method:: kernel_node_set_params(*args, kernel_node, func=None, block=(), grid=(), shared_mem_bytes=0)
981+
982+
Sets a kernel node's parameters. Refer to :meth:`add_kernel_node` for argument specifications.
983+
984+
.. class:: Graph()
985+
986+
A cudagraph is a data dependency graph meant to
987+
serve as an alternative to :class:`Stream`.
988+
989+
Wraps `cuGraph <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g69f555c38df5b3fa1ed25efef794739a>`_
990+
991+
.. method:: add_kernel_node(*args, func, block, grid=(1, ), dependencies=[], shared_mem_bytes=0)
992+
993+
Returns and adds a :class:`GraphNode` object specifying
994+
kernel node to the graph.
995+
996+
Will be placed at the root of the graph if dependencies
997+
are not specified.
998+
999+
:arg args: *arg1* through *argn* are the positional C arguments to the kernel.
1000+
See :meth:`Function.__call__` for more argument details.
1001+
1002+
:arg func: a :class:`Function` object specifying the kernel function.
1003+
1004+
:arg block: a :class:`tuple` of up to three integer entries specifying the number
1005+
of thread blocks to launch, as a multi-dimensional grid.
1006+
1007+
:arg grid: a :class:`tuple` of up to three integer entries specifying the grid configuration.
1008+
1009+
:arg dependencies: A :class:`list` of :class:`GraphNode` objects specifying dependency nodes.
1010+
1011+
:arg shared_mem_bytes: A :class:`int` specifying size of shared memory.
1012+
1013+
.. method:: instantiate()
1014+
1015+
Returns and instantiates a :class:`GraphExec` object.
1016+
1017+
.. method:: debug_dot_print(path)
1018+
1019+
Writes a DOT file describing the graph structure at the specified path.
1020+
1021+
:arg path: String specifying path for saving DOT file.
1022+
8981023
Memory
8991024
------
9001025

examples/demo_graph.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Sample source code from the Tutorial Introduction in the documentation.
# Demonstrates CUDA graph capture: kernels are recorded into a graph via
# stream capture, extra nodes are spliced in manually, and the instantiated
# graph is re-launched with updated kernel parameters.
import pycuda.driver as cuda
import pycuda.autoinit  # noqa
from pycuda.compiler import SourceModule

# Two toy kernels over a 4x4 float array, indexed by a flat thread id.
mod = SourceModule("""
__global__ void plus(float *a, int num)
{
int idx = threadIdx.x + threadIdx.y*4;
a[idx] += num;
}
__global__ void times(float *a, float *b)
{
int idx = threadIdx.x + threadIdx.y*4;
a[idx] *= b[idx];
}
""")
func_plus = mod.get_function("plus")
func_times = mod.get_function("times")

import numpy
a = numpy.zeros((4, 4)).astype(numpy.float32)
a_gpu = cuda.mem_alloc_like(a)
b = numpy.zeros((4, 4)).astype(numpy.float32)
b_gpu = cuda.mem_alloc_like(b)
result = numpy.zeros_like(b)
b2_gpu = cuda.mem_alloc_like(b)

# While the stream is in capture mode, the operations below are recorded
# into a graph instead of being executed.
stream_1 = cuda.Stream()
stream_1.begin_capture()
cuda.memcpy_htod_async(a_gpu, a, stream_1)
cuda.memcpy_htod_async(b_gpu, b, stream_1)
cuda.memcpy_htod_async(b2_gpu, b, stream_1)
func_plus(a_gpu, numpy.int32(2), block=(4, 4, 1), stream=stream_1)

# Splice an extra kernel node into the graph by hand: query the capture
# state for the graph-under-construction and current dependency frontier,
# add a node depending on that frontier, then make the new node the
# frontier (flag 1 = replace the dependency set).
_, _, graph, deps = stream_1.get_capture_info_v2()
first_node = graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps)
stream_1.update_capture_dependencies([first_node], 1)

_, _, graph, deps = stream_1.get_capture_info_v2()
second_node = graph.add_kernel_node(a_gpu, b_gpu, block=(4, 4, 1), func=func_times, dependencies=deps)
stream_1.update_capture_dependencies([second_node], 1)
cuda.memcpy_dtoh_async(result, a_gpu, stream_1)

# Finish capture; the returned Graph contains everything recorded above.
graph = stream_1.end_capture()
graph.debug_dot_print("test.dot")  # print dotfile of graph
instance = graph.instantiate()

# Setting dynamic parameters: retarget the two hand-added nodes in the
# executable graph (b2 += 100, then a *= b2) before launching.
instance.kernel_node_set_params(b2_gpu, numpy.int32(100), block=(4, 4, 1), func=func_plus, kernel_node=first_node)
instance.kernel_node_set_params(a_gpu, b2_gpu, block=(4, 4, 1), func=func_times, kernel_node=second_node)
# NOTE(review): no explicit synchronization after launch before reading
# *result*; presumably the captured pageable dtoh copy blocks — confirm.
instance.launch()

print("original arrays:")
print(a)
print(b)
print("(0+2)x(0+100) = 200, using a kernel graph of 3 kernels:")
print(result)

pycuda/driver.py

+75-46
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,54 @@ def post_call(self, stream):
159159
class InOut(In, Out):
160160
pass
161161

162+
from functools import lru_cache


def _build_arg_buf_uncached(args):
    """Pack kernel arguments into a flat byte buffer.

    :arg args: a tuple of kernel arguments (numpy scalars, device
        allocations, :class:`ArgumentHandler` instances, numpy arrays,
        numpy structs, or objects exposing ``__cuda_array_interface__``
        or a ``gpudata`` attribute).
    :returns: a tuple ``(handlers, buf)`` where *handlers* is the list of
        :class:`ArgumentHandler` arguments (their pre/post-call hooks must
        still be run by the caller) and *buf* is the packed argument buffer.
    :raises TypeError: if an argument is of an unsupported type.
    """
    handlers = []

    arg_data = []
    format = ""
    for i, arg in enumerate(args):
        if isinstance(arg, np.number):
            # Numpy scalar: packed by value using its own dtype code.
            arg_data.append(arg)
            format += arg.dtype.char
        elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
            arg_data.append(int(arg))
            format += "P"
        elif isinstance(arg, ArgumentHandler):
            handlers.append(arg)
            arg_data.append(int(arg.get_device_alloc()))
            format += "P"
        elif isinstance(arg, np.ndarray):
            if isinstance(arg.base, ManagedAllocationOrStub):
                # Managed-memory-backed array: pass the base pointer.
                arg_data.append(int(arg.base))
                format += "P"
            else:
                # Plain host array: pass by value, byte for byte.
                arg_data.append(arg)
                format += "%ds" % arg.nbytes
        elif isinstance(arg, np.void):
            # Numpy struct scalar: pass its raw bytes by value.
            arg_data.append(_my_bytes(_memoryview(arg)))
            format += "%ds" % arg.itemsize
        else:
            cai = getattr(arg, "__cuda_array_interface__", None)
            if cai:
                arg_data.append(cai["data"][0])
                format += "P"
                continue

            try:
                gpudata = np.uintp(arg.gpudata)
            except AttributeError:
                raise TypeError("invalid type on parameter #%d (0-based)" % i)
            else:
                # for gpuarrays
                arg_data.append(int(gpudata))
                format += "P"

    from pycuda._pvt_struct import pack

    return handlers, pack(format, *arg_data)


@lru_cache(maxsize=None)
def _build_arg_buf_from_key(key):
    # *key* is a tuple of (type, value) pairs; drop the types before packing.
    return _build_arg_buf_uncached(tuple(arg for _, arg in key))


def _build_arg_buf(args):
    """Memoizing front end for :func:`_build_arg_buf_uncached`.

    The cache keys on ``(type, value)`` pairs rather than the raw values:
    ``np.int32(2)`` and ``np.float32(2.0)`` compare equal and hash alike,
    so keying on values alone would let them share a cache slot and return
    a buffer packed with the wrong format code.

    Unhashable arguments (e.g. plain numpy arrays) cannot be cache keys;
    those calls fall back to building the buffer directly instead of
    raising ``TypeError`` from the cache machinery.
    """
    try:
        key = tuple((type(arg), arg) for arg in args)
        hash(key)  # probe hashability without touching the cache
    except TypeError:
        return _build_arg_buf_uncached(args)
    return _build_arg_buf_from_key(key)
162210

163211
def _add_functionality():
164212
def device_get_attributes(dev):
@@ -187,52 +235,6 @@ def device_get_attributes(dev):
187235
def device___getattr__(dev, name):
188236
return dev.get_attribute(getattr(device_attribute, name.upper()))
189237

190-
def _build_arg_buf(args):
191-
handlers = []
192-
193-
arg_data = []
194-
format = ""
195-
for i, arg in enumerate(args):
196-
if isinstance(arg, np.number):
197-
arg_data.append(arg)
198-
format += arg.dtype.char
199-
elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
200-
arg_data.append(int(arg))
201-
format += "P"
202-
elif isinstance(arg, ArgumentHandler):
203-
handlers.append(arg)
204-
arg_data.append(int(arg.get_device_alloc()))
205-
format += "P"
206-
elif isinstance(arg, np.ndarray):
207-
if isinstance(arg.base, ManagedAllocationOrStub):
208-
arg_data.append(int(arg.base))
209-
format += "P"
210-
else:
211-
arg_data.append(arg)
212-
format += "%ds" % arg.nbytes
213-
elif isinstance(arg, np.void):
214-
arg_data.append(_my_bytes(_memoryview(arg)))
215-
format += "%ds" % arg.itemsize
216-
else:
217-
cai = getattr(arg, "__cuda_array_interface__", None)
218-
if cai:
219-
arg_data.append(cai["data"][0])
220-
format += "P"
221-
continue
222-
223-
try:
224-
gpudata = np.uintp(arg.gpudata)
225-
except AttributeError:
226-
raise TypeError("invalid type on parameter #%d (0-based)" % i)
227-
else:
228-
# for gpuarrays
229-
arg_data.append(int(gpudata))
230-
format += "P"
231-
232-
from pycuda._pvt_struct import pack
233-
234-
return handlers, pack(format, *arg_data)
235-
236238
# {{{ pre-CUDA 4 call interface (stateful)
237239

238240
def function_param_set_pre_v4(func, *args):
@@ -710,6 +712,33 @@ def new_func(*args, **kwargs):
710712

711713
_add_functionality()
712714

715+
# {{{ cudagraph

def patch_cudagraph():
    """Attach Pythonic kernel-node wrappers to :class:`Graph`/:class:`GraphExec`."""

    def graph_add_kernel_node_call(graph, *args, func=None, block=None,
                                   grid=(1, ), dependencies=None,
                                   shared_mem_bytes=0):
        """Add a kernel node to *graph* and return the new :class:`GraphNode`.

        :arg args: positional C arguments to the kernel.
        :arg func: the :class:`Function` to launch (required).
        :arg block: thread-block dimensions, a tuple of up to 3 ints (required).
        :arg grid: grid dimensions, a tuple of up to 3 ints.
        :arg dependencies: list of :class:`GraphNode` the new node depends on;
            ``None``/empty places the node at the root of the graph.
        :arg shared_mem_bytes: dynamic shared memory size in bytes.
        """
        if func is None:
            raise ValueError("must specify func")
        if block is None:
            raise ValueError("must specify block size")
        if dependencies is None:
            # Fresh list per call -- a mutable default would be shared.
            dependencies = []
        _, arg_buf = _build_arg_buf(args)
        return graph._add_kernel_node(
            dependencies, func, grid, block, arg_buf, shared_mem_bytes)

    def exec_graph_set_kernel_node_call(exec_graph, *args, kernel_node=None,
                                        func=None, block=None, grid=(1, ),
                                        shared_mem_bytes=0):
        """Set the parameters of *kernel_node* in an instantiated graph.

        Argument meanings match :meth:`Graph.add_kernel_node`; *kernel_node*
        identifies the node (in the graph *exec_graph* was instantiated
        from) whose parameters are replaced.
        """
        if kernel_node is None:
            raise ValueError("must specify kernel_node")
        if func is None:
            raise ValueError("must specify func")
        if block is None:
            raise ValueError("must specify block size")
        _, arg_buf = _build_arg_buf(args)
        return exec_graph._kernel_node_set_params(
            kernel_node, func, grid, block, arg_buf, shared_mem_bytes)

    Graph.add_kernel_node = graph_add_kernel_node_call
    GraphExec.kernel_node_set_params = exec_graph_set_kernel_node_call


# CUDA graph support requires CUDA 10.0 or newer.
if get_version() >= (10,):
    patch_cudagraph()

# }}}
713742

714743
# {{{ pagelocked numpy arrays
715744

0 commit comments

Comments
 (0)