Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
554 changes: 554 additions & 0 deletions dace/codegen/targets/gpu_helpers/copy_strategies.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions dace/codegen/targets/gpu_helpers/gpu_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from dace import Config
from dace.codegen import common


def generate_sync_debug_call() -> str:
    """
    Generate backend sync and error-check calls as a string if
    synchronous debugging is enabled.

    The backend API prefix (e.g., 'cuda', 'hip') is read from
    :func:`dace.codegen.common.get_gpu_backend`, and the feature is gated
    by the ``compiler.cuda.syncdebug`` configuration entry.

    Returns
    -------
    str
        The generated debug call code, or an empty string if debugging is disabled.
    """
    backend: str = common.get_gpu_backend()
    sync_call: str = ""
    # Only emit the check/synchronize pair when synchronous debugging is on;
    # otherwise kernels launch asynchronously with no extra overhead.
    if Config.get_bool('compiler', 'cuda', 'syncdebug'):
        sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n"
                     f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n")

    return sync_call
31 changes: 31 additions & 0 deletions dace/transformation/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,6 +1550,37 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio
cursdfg = cursdfg.parent_sdfg
return None

def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool:
    """
    Checks if the given node is enclosed within a Map whose schedule type
    matches any in the `schedules` set.

    Note that if ``node`` itself is a MapEntry with a matching schedule, this
    function returns True as well.

    Parameters
    ----------
    state : SDFGState
        The state where the node resides.
    node : nodes.Node
        The node to check.
    schedules : set[dtypes.ScheduleType]
        A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}).

    Returns
    -------
    bool
        True if the node is enclosed by a Map with a schedule type in `schedules`,
        False otherwise.
    """
    current = node

    # Walk up through enclosing maps (possibly across nested SDFG scopes, since
    # get_parent_map may return a (MapEntry, state) pair from an outer SDFG).
    while current is not None:
        if isinstance(current, nodes.MapEntry) and current.map.schedule in schedules:
            return True

        parent = get_parent_map(state, current)
        if parent is None:
            return False
        current, state = parent

    # Defensive: ensure a bool is always returned (the original implementation
    # fell off the loop and implicitly returned None here).
    return False


def redirect_edge(state: SDFGState,
edge: graph.MultiConnectorEdge[Memlet],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.codegen import common
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToKernels(ppl.Pass):
    """
    Attaches GPU streams to kernels (maps scheduled as dtypes.ScheduleType.GPU_Device).

    For every kernel, a GPU stream AccessNode is wired to both the entry and the
    exit node, recording which GPU stream the kernel was assigned. These
    connections are later consumed, e.g., when generating kernel launches.
    """

    def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
        return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs}

    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
        # Config stores "<array name>,<per-stream variable prefix>" as one entry.
        array_name, var_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')

        # Stream assignments computed by the scheduler pass.
        assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

        for nested_sdfg in sdfg.all_sdfgs_recursive():
            for state in nested_sdfg.states():
                # Snapshot the kernel entries first, since wiring streams adds
                # new AccessNodes to the state.
                kernel_entries = [
                    n for n in state.nodes()
                    if isinstance(n, nodes.MapEntry) and n.map.schedule == dtypes.ScheduleType.GPU_Device
                ]

                for kernel_entry in kernel_entries:
                    stream_id = assignments[kernel_entry]
                    conn_name = f"{var_prefix}{stream_id}"
                    stream_subset = f"{array_name}[{stream_id}]"

                    # Feed the assigned stream into the kernel entry.
                    kernel_entry.add_in_connector(conn_name, dtypes.gpuStream_t)
                    state.add_edge(state.add_access(array_name), None, kernel_entry, conn_name,
                                   dace.Memlet(stream_subset))

                    # And thread it back out through the kernel exit.
                    kernel_exit = state.exit_node(kernel_entry)
                    kernel_exit.add_out_connector(conn_name, dtypes.gpuStream_t)
                    state.add_edge(kernel_exit, conn_name, state.add_access(array_name), None,
                                   dace.Memlet(stream_subset))

        return {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels

# Placeholder name that expanded-LibraryNode tasklets use to refer to their
# assigned GPU stream; this pass replaces the hidden definition mechanism
# with an explicit connector of the same name.
STREAM_PLACEHOLDER = "__dace_current_stream"


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToTasklets(ppl.Pass):
    """
    This pass ensures that tasklets which require access to their assigned GPU stream
    are provided with it explicitly.

    Such tasklets typically originate from expanded LibraryNodes targeting GPUs.
    These nodes may reference the special placeholder variable `__dace_current_stream`,
    which is expected to be defined during unparsing in `cpp.py`.

    To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use
    the GPU stream AccessNode directly.

    Note that this pass is similar to `ConnectGPUStreamsToKernels`.
    """

    def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
        return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels}

    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
        # Retrieve the GPU stream's array name (config entry is
        # "<array name>,<per-stream variable prefix>"; only the array name is needed here)
        stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0]

        # Retrieve GPU stream assignments for nodes
        stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

        # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code
        # and provide them the needed GPU stream explicitly
        for sub_sdfg in sdfg.all_sdfgs_recursive():

            for state in sub_sdfg.states():
                for node in state.nodes():

                    # Not a tasklet - continue
                    if not isinstance(node, nodes.Tasklet):
                        continue

                    # Tasklet does not use its assigned GPU stream - continue
                    if STREAM_PLACEHOLDER not in node.code.as_string:
                        continue

                    # Stream connector name and the GPU stream assigned to this tasklet
                    assigned_gpustream = stream_assignments[node]
                    gpu_stream_conn = STREAM_PLACEHOLDER
                    accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

                    # Provide the GPU stream explicitly to the tasklet
                    stream_array_in = state.add_access(stream_array_name)
                    stream_array_out = state.add_access(stream_array_name)

                    node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t)
                    # force=True: the placeholder may already exist as a connector
                    node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True)

                    state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream))
                    state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream))

        return {}
Loading
Loading