Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
554 changes: 554 additions & 0 deletions dace/codegen/targets/gpu_helpers/copy_strategies.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions dace/codegen/targets/gpu_helpers/gpu_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from dace import Config
from dace.codegen import common


def generate_sync_debug_call() -> str:
    """
    Generate backend sync and error-check calls as a string if
    synchronous debugging is enabled.

    The backend API prefix (e.g., 'cuda', 'hip') is read from
    :func:`dace.codegen.common.get_gpu_backend`, and the feature is gated
    by the ``compiler.cuda.syncdebug`` configuration entry.

    Returns
    -------
    str
        The generated debug call code, or an empty string if debugging is disabled.
    """
    backend: str = common.get_gpu_backend()
    sync_call: str = ""
    # Only emit the check/synchronize pair when synchronous debugging is on;
    # otherwise kernels launch asynchronously with no extra overhead.
    if Config.get_bool('compiler', 'cuda', 'syncdebug'):
        sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n"
                     f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n")

    return sync_call
31 changes: 31 additions & 0 deletions dace/transformation/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,6 +1550,37 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio
cursdfg = cursdfg.parent_sdfg
return None

def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool:
    """
    Checks if the given node is enclosed within a Map whose schedule type
    matches any in the `schedules` set.

    Note that if ``node`` itself is a MapEntry with a matching schedule, this
    function returns True as well.

    Parameters
    ----------
    state : SDFGState
        The state where the node resides.
    node : nodes.Node
        The node to check.
    schedules : set[dtypes.ScheduleType]
        A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}).

    Returns
    -------
    bool
        True if the node is enclosed by a Map with a schedule type in `schedules`,
        False otherwise.
    """
    current = node

    # Walk up through enclosing maps (possibly across nested SDFG scopes, since
    # get_parent_map may return a (MapEntry, state) pair from an outer SDFG).
    while current is not None:
        if isinstance(current, nodes.MapEntry) and current.map.schedule in schedules:
            return True

        parent = get_parent_map(state, current)
        if parent is None:
            return False
        current, state = parent

    # Defensive: ensure a bool is always returned (the original implementation
    # fell off the loop and implicitly returned None here).
    return False


def redirect_edge(state: SDFGState,
edge: graph.MultiConnectorEdge[Memlet],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.codegen import common
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToKernels(ppl.Pass):
    """
    Attaches GPU streams to kernels (maps scheduled as dtypes.ScheduleType.GPU_Device).

    For every kernel, a GPU stream AccessNode is wired to both the entry and the
    exit node, recording which GPU stream the kernel was assigned. These
    connections are later consumed, e.g., when generating kernel launches.
    """

    def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
        return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs}

    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
        # Config stores "<array name>,<per-stream variable prefix>" as one entry.
        array_name, var_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')

        # Stream assignments computed by the scheduler pass.
        assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

        for nested_sdfg in sdfg.all_sdfgs_recursive():
            for state in nested_sdfg.states():
                # Snapshot the kernel entries first, since wiring streams adds
                # new AccessNodes to the state.
                kernel_entries = [
                    n for n in state.nodes()
                    if isinstance(n, nodes.MapEntry) and n.map.schedule == dtypes.ScheduleType.GPU_Device
                ]

                for kernel_entry in kernel_entries:
                    stream_id = assignments[kernel_entry]
                    conn_name = f"{var_prefix}{stream_id}"
                    stream_subset = f"{array_name}[{stream_id}]"

                    # Feed the assigned stream into the kernel entry.
                    kernel_entry.add_in_connector(conn_name, dtypes.gpuStream_t)
                    state.add_edge(state.add_access(array_name), None, kernel_entry, conn_name,
                                   dace.Memlet(stream_subset))

                    # And thread it back out through the kernel exit.
                    kernel_exit = state.exit_node(kernel_entry)
                    kernel_exit.add_out_connector(conn_name, dtypes.gpuStream_t)
                    state.add_edge(kernel_exit, conn_name, state.add_access(array_name), None,
                                   dace.Memlet(stream_subset))

        return {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels

# Placeholder name that expanded-LibraryNode tasklets use to refer to their
# assigned GPU stream; this pass replaces the hidden definition mechanism
# with an explicit connector of the same name.
STREAM_PLACEHOLDER = "__dace_current_stream"


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToTasklets(ppl.Pass):
    """
    This pass ensures that tasklets which require access to their assigned GPU stream
    are provided with it explicitly.

    Such tasklets typically originate from expanded LibraryNodes targeting GPUs.
    These nodes may reference the special placeholder variable `__dace_current_stream`,
    which is expected to be defined during unparsing in `cpp.py`.

    To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use
    the GPU stream AccessNode directly.

    Note that this pass is similar to `ConnectGPUStreamsToKernels`.
    """

    def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
        return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels}

    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
        # Retrieve the GPU stream's array name (config entry is
        # "<array name>,<per-stream variable prefix>"; only the array name is needed here)
        stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0]

        # Retrieve GPU stream assignments for nodes
        stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

        # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code
        # and provide them the needed GPU stream explicitly
        for sub_sdfg in sdfg.all_sdfgs_recursive():

            for state in sub_sdfg.states():
                for node in state.nodes():

                    # Not a tasklet - continue
                    if not isinstance(node, nodes.Tasklet):
                        continue

                    # Tasklet does not use its assigned GPU stream - continue
                    if STREAM_PLACEHOLDER not in node.code.as_string:
                        continue

                    # Stream connector name and the GPU stream assigned to this tasklet
                    assigned_gpustream = stream_assignments[node]
                    gpu_stream_conn = STREAM_PLACEHOLDER
                    accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

                    # Provide the GPU stream explicitly to the tasklet
                    stream_array_in = state.add_access(stream_array_name)
                    stream_array_out = state.add_access(stream_array_name)

                    node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t)
                    # force=True: the placeholder may already exist as a connector
                    node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True)

                    state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream))
                    state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream))

        return {}
Loading
Loading