diff --git a/dace/data.py b/dace/data.py
index ac793717b7..037ba99752 100644
--- a/dace/data.py
+++ b/dace/data.py
@@ -1,4 +1,4 @@
-# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
 import aenum
 import copy as cp
 import ctypes
@@ -1680,6 +1680,35 @@ def set_shape(
         self._set_shape_dependent_properties(new_shape, strides, total_size, offset)
         self.validate()
 
+    def _get_packed_fortran_strides(self) -> Tuple[int, ...]:
+        """Compute packed strides, if the array is stored Fortran-style (column-major)."""
+        accum = 1
+        strides = []
+        for shape in self.shape:
+            strides.append(accum)
+            accum *= shape
+        return tuple(strides)
+
+    def _get_packed_c_strides(self) -> Tuple[int, ...]:
+        """Compute packed strides, if the array is stored C-style (row-major)."""
+        accum = 1
+        strides = []
+        # Same as Fortran order if the shape is reversed
+        for shape in reversed(self.shape):
+            strides.append(accum)
+            accum *= shape
+        return tuple(reversed(strides))
+
+    def is_packed_fortran_strides(self) -> bool:
+        """Return True if strides match a Fortran-contiguous (column-major) layout."""
+        strides = self._get_packed_fortran_strides()
+        return tuple(strides) == tuple(self.strides)
+
+    def is_packed_c_strides(self) -> bool:
+        """Return True if strides match a C-contiguous (row-major) layout."""
+        strides = self._get_packed_c_strides()
+        return tuple(strides) == tuple(self.strides)
+
 
 @make_properties
 class Stream(Data):
diff --git a/dace/libraries/standard/environments/__init__.py b/dace/libraries/standard/environments/__init__.py
index d8f585ebd9..9fab42cdeb 100644
--- a/dace/libraries/standard/environments/__init__.py
+++ b/dace/libraries/standard/environments/__init__.py
@@ -1,3 +1,4 @@
 # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
 from .cuda import CUDA
 from .hptt import HPTT
+from .cpu import CPU
diff --git a/dace/libraries/standard/environments/cpu.py b/dace/libraries/standard/environments/cpu.py
new file mode 100644
index 0000000000..32885a4755
--- /dev/null
+++ b/dace/libraries/standard/environments/cpu.py
@@ -0,0 +1,21 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+import dace.library
+
+
+@dace.library.environment
+class CPU:
+
+    cmake_minimum_version = None
+    cmake_packages = []
+    cmake_variables = {}
+    cmake_includes = []
+    cmake_libraries = []
+    cmake_compile_flags = []
+    cmake_link_flags = []
+    cmake_files = []
+
+    headers = []
+    state_fields = []
+    init_code = ""
+    finalize_code = ""
+    dependencies = []
diff --git a/dace/libraries/standard/environments/cuda.py b/dace/libraries/standard/environments/cuda.py
index 4054786150..a88182af42 100644
--- a/dace/libraries/standard/environments/cuda.py
+++ b/dace/libraries/standard/environments/cuda.py
@@ -14,7 +14,7 @@ class CUDA:
     cmake_link_flags = []
     cmake_files = []
 
-    headers = []
+    headers = {'frame': ["cuda_runtime.h"]}
     state_fields = []
     init_code = ""
     finalize_code = ""
diff --git a/dace/libraries/standard/nodes/copy_node.py b/dace/libraries/standard/nodes/copy_node.py
new file mode 100644
index 0000000000..03939c35eb
--- /dev/null
+++ b/dace/libraries/standard/nodes/copy_node.py
@@ -0,0 +1,247 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+import dace
+from dace import library, nodes
+from dace.transformation.transformation import ExpandTransformation
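+
+# Illustrative usage sketch (mirrors tests/library/copy_node_test.py; the
+# access nodes a1/b1 and the array names are from that test, not this module):
+#
+#   libnode = CopyLibraryNode(name="cp1", inputs={"_in"}, outputs={"_out"})
+#   state.add_edge(a1, None, libnode, "_in", dace.memlet.Memlet("A[150:200]"))
+#   state.add_edge(libnode, "_out", b1, None, dace.memlet.Memlet("B[50:100]"))
+#
+# Expansion lowers the node to a copy map ("pure"), std::memcpy ("CPU"), or
+# cudaMemcpyAsync ("CUDA"); the CPU/CUDA expansions assume the collapsed
+# subsets are contiguous.
+from .. 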
import environments +from functools import reduce +import operator +from dace.codegen.common import sym2cpp +import copy + + +# Compute collapsed shapes and strides, removing singleton dimensions (length == 1) +def collapse_shape_and_strides(subset, strides): + collapsed_shape = [] + collapsed_strides = [] + for (b, e, s), stride in zip(subset, strides): + length = (e + 1 - b) // s + if length != 1: + collapsed_shape.append(length) + collapsed_strides.append(stride) + return collapsed_shape, collapsed_strides + + +def add_dynamic_inputs(dynamic_inputs, sdfg: dace.SDFG, in_subset: dace.subsets.Range, state: dace.SDFGState): + # Add dynamic inputs + pre_assignments = dict() + map_lengths = [dace.symbolic.SymExpr((e + 1 - b) // s) for (b, e, s) in in_subset] + + for dynamic_input_name, datadesc in dynamic_inputs.items(): + if dynamic_input_name in sdfg.arrays: + continue + + if dynamic_input_name in sdfg.symbols: + sdfg.replace(str(dynamic_input_name), "sym_" + str(dynamic_input_name)) + ndesc = copy.deepcopy(datadesc) + ndesc.transient = False + sdfg.add_datadesc(dynamic_input_name, ndesc) + # Should be scalar + if isinstance(ndesc, dace.data.Scalar): + pre_assignments["sym_" + dynamic_input_name] = f"{dynamic_input_name}" + else: + assert ndesc.shape == (1, ) or ndesc.shape == [ + 1, + ] + pre_assignments["sym_" + dynamic_input_name] = f"{dynamic_input_name}[0]" + + new_map_lengths = [] + for ml in map_lengths: + nml = ml.subs({str(dynamic_input_name): "sym_" + str(dynamic_input_name)}) + new_map_lengths.append(nml) + map_lengths = new_map_lengths + + if pre_assignments != dict(): + # Add a state for assignments in the beginning + sdfg.add_state_before(state=state, label="pre_assign", is_start_block=True, assignments=pre_assignments) + + return map_lengths + + +@library.expansion +class ExpandPure(ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node, parent_state, parent_sdfg): + inp_name, inp, in_subset, out_name, out, out_subset, dynamic_inputs = node.validate(parent_sdfg, parent_state) + map_lengths = [(e + 1 - b) // s for (b, e, s) in in_subset] + + in_shape_collapsed, in_strides_collapsed = collapse_shape_and_strides(in_subset, inp.strides) + out_shape_collapsed, out_strides_collapsed = collapse_shape_and_strides(out_subset, out.strides) + + sdfg = dace.SDFG(f"{node.label}_sdfg") + sdfg.add_array(inp_name, in_shape_collapsed, inp.dtype, inp.storage, strides=in_strides_collapsed) + sdfg.add_array(out_name, out_shape_collapsed, out.dtype, out.storage, strides=out_strides_collapsed) + + state = sdfg.add_state(f"{node.label}_state", is_start_block=True) + + map_lengths = add_dynamic_inputs(dynamic_inputs, sdfg, in_subset, state) + + sdfg.schedule = dace.dtypes.ScheduleType.Default + + map_params = [f"__i{i}" for i in range(len(map_lengths))] + map_rng = {i: f"0:{s}" for i, s in zip(map_params, map_lengths)} + in_access_expr = ','.join(map_params) + out_access_expr = ','.join(map_params) + inputs = {"_memcpy_inp": dace.memlet.Memlet(f"{inp_name}[{in_access_expr}]")} + outputs = {"_memcpy_out": dace.memlet.Memlet(f"{out_name}[{out_access_expr}]")} + code = "_memcpy_out = _memcpy_inp" + if inp.storage == dace.dtypes.StorageType.GPU_Global: + schedule = dace.dtypes.ScheduleType.GPU_Device + else: + schedule = dace.dtypes.ScheduleType.Default + state.add_mapped_tasklet(f"{node.label}_tasklet", + map_rng, + inputs, + code, + outputs, + schedule=schedule, + external_edges=True) + + return sdfg + + +@library.expansion +class ExpandCUDA(ExpandTransformation): + 
environments = [environments.CUDA]

+    @staticmethod
+    def expansion(node, parent_state: dace.SDFGState, parent_sdfg: dace.SDFG):
+        inp_name, inp, in_subset, out_name, out, out_subset, dynamic_inputs = node.validate(parent_sdfg, parent_state)
+
+        map_lengths = [(e + 1 - b) // s for (b, e, s) in in_subset]
+        cp_size = reduce(operator.mul, map_lengths, 1)
+
+        in_shape_collapsed, in_strides_collapsed = collapse_shape_and_strides(in_subset, inp.strides)
+        out_shape_collapsed, out_strides_collapsed = collapse_shape_and_strides(out_subset, out.strides)
+
+        sdfg = dace.SDFG(f"{node.label}_sdfg")
+        sdfg.add_array(inp_name, in_shape_collapsed, inp.dtype, inp.storage, strides=in_strides_collapsed)
+        sdfg.add_array(out_name, out_shape_collapsed, out.dtype, out.storage, strides=out_strides_collapsed)
+
+        state = sdfg.add_state(f"{node.label}_state")
+
+        # Add dynamic inputs (the state must exist before this call)
+        map_lengths = add_dynamic_inputs(dynamic_inputs, sdfg, in_subset, state)
+
+        in_access = state.add_access(inp_name)
+        out_access = state.add_access(out_name)
+        tasklet = state.add_tasklet(
+            name="memcpy_tasklet",
+            inputs={"_memcpy_in"},
+            outputs={"_memcpy_out"},
+            code=
+            f"cudaMemcpyAsync(_memcpy_out, _memcpy_in, {sym2cpp(cp_size)} * sizeof({inp.dtype.ctype}), cudaMemcpyDeviceToDevice, __dace_current_stream);",
+            language=dace.Language.CPP,
+            code_global="#include <cuda_runtime.h>\n")
+
+        tasklet.schedule = dace.dtypes.ScheduleType.GPU_Device
+
+        state.add_edge(
+            in_access, None, tasklet, "_memcpy_in",
+            dace.memlet.Memlet(data=inp_name, subset=dace.subsets.Range([(0, e - 1, 1) for e in map_lengths])))
+        state.add_edge(
+            tasklet, "_memcpy_out", out_access, None,
+            dace.memlet.Memlet(data=out_name, subset=dace.subsets.Range([(0, e - 1, 1) for e in map_lengths])))
+
+        return sdfg
+
+
+@library.expansion
+class ExpandCPU(ExpandTransformation):
+    environments = [environments.CPU]
+
+    @staticmethod
+    def expansion(node, parent_state: dace.SDFGState, parent_sdfg: dace.SDFG):
+        inp_name, inp, in_subset, out_name, out, out_subset, dynamic_inputs = node.validate(parent_sdfg, parent_state)
+        map_lengths = [(e + 1 - b) // s for (b, e, s) in in_subset]
+        cp_size = reduce(operator.mul, map_lengths, 1)
+
+        in_shape_collapsed, in_strides_collapsed = collapse_shape_and_strides(in_subset, inp.strides)
+        out_shape_collapsed, out_strides_collapsed = collapse_shape_and_strides(out_subset, out.strides)
+
+        sdfg = dace.SDFG(f"{node.label}_sdfg")
+        sdfg.add_array(inp_name, in_shape_collapsed, inp.dtype, inp.storage, strides=in_strides_collapsed)
+        sdfg.add_array(out_name, out_shape_collapsed, out.dtype, out.storage, strides=out_strides_collapsed)
+
+        state = sdfg.add_state(f"{node.label}_state")
+
+        # Add dynamic inputs
+        map_lengths = add_dynamic_inputs(dynamic_inputs, sdfg, in_subset, state)
+
+        # Add CPU access nodes
+        in_access = state.add_access(inp_name)
+        out_access = state.add_access(out_name)
+
+        # Tasklet performing a standard CPU memcpy
+        tasklet = state.add_tasklet(
+            name="memcpy_tasklet",
+            inputs={"_memcpy_in"},
+            outputs={"_memcpy_out"},
+            code=f"memcpy(_memcpy_out, _memcpy_in, {sym2cpp(cp_size)} * sizeof({inp.dtype.ctype}));",
+            language=dace.Language.CPP,
+            code_global="#include <cstring>")
+
+        # Connect input and output to the tasklet
+        state.add_edge(
+            in_access, None, tasklet, "_memcpy_in",
+            dace.memlet.Memlet(data=inp_name, subset=dace.subsets.Range([(0, e - 1, 1) for e in map_lengths])))
+        state.add_edge(
+            tasklet, "_memcpy_out", out_access, None,
+            dace.memlet.Memlet(data=out_name, subset=dace.subsets.Range([(0, e - 1, 1) for e in map_lengths])))
+
+        return sdfg
+
+
+@library.node
+class CopyLibraryNode(nodes.LibraryNode):
+    implementations = {"pure": ExpandPure, "CUDA": ExpandCUDA, "CPU": ExpandCPU}
+    default_implementation = 'pure'
+
+    def __init__(self, name, *args, **kwargs):
+        super().__init__(name, *args, **kwargs)
+
+    def validate(self, sdfg, state):
+        """
+        Validates the copy operation.
+        :return: A tuple (inp_name, inp, in_subset, out_name, out, out_subset, dynamic_inputs)
+                 referring to the data descriptors in the parent SDFG.
+        """
+
+        if len(state.out_edges(self)) != 1:
+            raise ValueError("Number of out edges unequal to one")
+
+        oe = next(iter(state.out_edges(self)))
+        out = sdfg.arrays[oe.data.data]
+        out_subset = oe.data.subset
+        out_name = oe.src_conn
+
+        # Collect dynamic connectors
+        dynamic_ies = {ie for ie in state.in_edges(self) if ie.dst_conn != "_in"}
+        dynamic_inputs = dict()
+        for ie in dynamic_ies:
+            dataname = ie.data.data
+            datadesc = state.sdfg.arrays[dataname]
+            if not isinstance(datadesc, dace.data.Scalar):
+                raise ValueError("Dynamic inputs (not connected to `_in`) need to be all scalars")
+            dynamic_inputs[ie.dst_conn] = datadesc
+
+        data_ies = {ie for ie in state.in_edges(self) if ie.dst_conn == "_in"}
+        if len(data_ies) != 1:
+            raise ValueError("Exactly one edge must be connected to the dst connector `_in`")
+        ie = data_ies.pop()
+        inp = sdfg.arrays[ie.data.data]
+
+        in_subset = ie.data.subset
+        inp_name = ie.dst_conn
+        if not inp:
+            raise ValueError("Missing the input tensor.")
+        if not out:
+            raise ValueError("Missing the output tensor.")
+
+        if inp.dtype != out.dtype:
+            raise ValueError("The datatype of the input and output tensors must match.")
+
+        if inp.storage != out.storage:
+            raise ValueError("The storage of the input and output tensors must match.")
+
+        return inp_name, inp, in_subset, out_name, out, out_subset, dynamic_inputs
diff --git a/dace/libraries/standard/nodes/memset_node.py b/dace/libraries/standard/nodes/memset_node.py
new file mode 100644
index 0000000000..b6cb753881
--- /dev/null
+++ b/dace/libraries/standard/nodes/memset_node.py
@@ -0,0 +1,160 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+import dace
+from dace import library, nodes
+from dace.transformation.transformation import ExpandTransformation
+from .. import environments
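+
+# Illustrative usage sketch (mirrors tests/library/memset_node_test.py; the
+# access node b1 and the array name are from that test, not this module):
+#
+#   libnode = MemsetLibraryNode(name="memset1", inputs={}, outputs={"_out"})
+#   state.add_edge(libnode, "_out", b1, None, dace.memlet.Memlet("B[50:100]"))
+#
+# Expansion lowers the node to an assignment map ("pure"), std::memset
+# ("CPU"), or cudaMemsetAsync ("CUDA"); only zero-filling is supported.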
+from functools import reduce
+import operator
+from dace.codegen.common import sym2cpp
+import copy
+
+
+@library.expansion
+class ExpandPure(ExpandTransformation):
+    environments = []
+
+    @staticmethod
+    def expansion(node, parent_state, parent_sdfg):
+        out_name, out, out_subset = node.validate(parent_sdfg, parent_state)
+        map_lengths = [(e + 1 - b) // s for (b, e, s) in out_subset if (e + 1 - b) // s != 1]
+        cp_size = reduce(operator.mul, map_lengths, 1)
+
+        out_shape_collapsed = map_lengths
+        out_strides_collapsed = [
+            stride for (b, e, s), stride in zip(out_subset, out.strides) if ((e + 1 - b) // s) != 1
+        ]
+
+        sdfg = dace.SDFG(f"{node.label}_sdfg")
+        sdfg.add_array(out_name, out_shape_collapsed, out.dtype, out.storage, strides=out_strides_collapsed)
+
+        state = sdfg.add_state(f"{node.label}_state")
+        map_params = [f"__i{i}" for i in range(len(map_lengths))]
+        map_rng = {i: f"0:{s}" for i, s in zip(map_params, map_lengths)}
+        access_expr = ','.join(map_params)
+        outputs = {"_memset_out": dace.memlet.Memlet(f"{out_name}[{access_expr}]")}
+        code = "_memset_out = 0"
+        if out.storage == dace.dtypes.StorageType.GPU_Global:
+            schedule = dace.dtypes.ScheduleType.GPU_Device
+        else:
+            schedule = dace.dtypes.ScheduleType.Default
+        state.add_mapped_tasklet(f"{node.label}_tasklet",
+                                 map_rng,
+                                 dict(),
+                                 code,
+                                 outputs,
+                                 schedule=schedule,
+                                 external_edges=True)
+
+        return sdfg
+
+
+@library.expansion
+class ExpandCUDA(ExpandTransformation):
+    environments = [environments.CUDA]
+
+    @staticmethod
+    def expansion(node, parent_state: dace.SDFGState, parent_sdfg: dace.SDFG):
+        out_name, out, out_subset = node.validate(parent_sdfg, parent_state)
+        map_lengths = [(e + 1 - b) // s for (b, e, s) in out_subset]
+        cp_size = reduce(operator.mul, map_lengths, 1)
+
+        out_shape_collapsed = [ml for ml in map_lengths if ml != 1]
+        out_strides_collapsed = [
+            stride for (b, e, s), stride in zip(out_subset, out.strides) if ((e + 1 - b) // s) != 1
+        ]
+
+        sdfg = dace.SDFG(f"{node.label}_sdfg")
+        sdfg.add_array(out_name, out_shape_collapsed, out.dtype, out.storage, strides=out_strides_collapsed)
+
+        state = sdfg.add_state(f"{node.label}_state")
+
+        out_access = state.add_access(out_name)
+        tasklet = state.add_tasklet(
+            name="memset_tasklet",
+            inputs={},
+            outputs={"_memset_out"},
+            code=
+            f"cudaMemsetAsync(_memset_out, 0, {sym2cpp(cp_size)} * sizeof({out.dtype.ctype}), __dace_current_stream);",
+            language=dace.Language.CPP,
+            code_global="#include <cuda_runtime.h>\n")
+
+        state.add_edge(
+            tasklet, "_memset_out", out_access, None,
+            dace.memlet.Memlet(data=out_name, subset=dace.subsets.Range([(0, e - 1, 1) for e in out_shape_collapsed])))
+
+        return sdfg
+
+
+@library.expansion
+class ExpandCPU(ExpandTransformation):
+    environments = [environments.CPU]
+
+    @staticmethod
+    def expansion(node, parent_state: dace.SDFGState, parent_sdfg: dace.SDFG):
+        out_name, out, out_subset = node.validate(parent_sdfg, parent_state)
+        map_lengths = [(e + 1 - b) // s for (b, e, s) in out_subset if (e + 1 - b) // s != 1]
+        cp_size = reduce(operator.mul, map_lengths, 1)
+
+        out_shape_collapsed = map_lengths
+        out_strides_collapsed = [
+            stride for (b, e, s), stride in zip(out_subset, out.strides) if ((e + 1 - b) // s) != 1
+        ]
+
+        sdfg = dace.SDFG(f"{node.label}_sdfg")
+        sdfg.add_array(out_name, out_shape_collapsed, out.dtype, out.storage, strides=out_strides_collapsed)
+
+        state = sdfg.add_state(f"{node.label}_state")
+
+        # Access the original output
+        out_access = state.add_access(out_name)
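+
+        # Note (assumption): byte-wise zeroing via memset matches assigning 0
+        # only because the all-zero byte pattern represents 0 for integers and
+        # for IEEE-754 floats such as float64.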
+        # Add a tasklet that performs a standard CPU memset
+        tasklet = state.add_tasklet(name="memset_tasklet",
+                                    inputs={},
+                                    outputs={"_memset_out"},
+                                    code=f"memset(_memset_out, 0, {sym2cpp(cp_size)} * sizeof({out.dtype.ctype}));",
+                                    language=dace.Language.CPP,
+                                    code_global="#include <cstring>")  # C++ header providing std::memset
+
+        # Connect tasklet to the output
+        state.add_edge(
+            tasklet, "_memset_out", out_access, None,
+            dace.memlet.Memlet(data=out_name, subset=dace.subsets.Range([(0, e - 1, 1) for e in out_shape_collapsed])))
+
+        return sdfg
+
+
+@library.node
+class MemsetLibraryNode(nodes.LibraryNode):
+    implementations = {"pure": ExpandPure, "CUDA": ExpandCUDA, "CPU": ExpandCPU}
+    default_implementation = 'pure'
+
+    def __init__(self, name, *args, **kwargs):
+        super().__init__(name, *args, **kwargs)
+
+    def validate(self, sdfg, state):
+        """
+        Validates the memset operation.
+        :return: A tuple (out_name, out, out_subset) for the output data descriptor in the parent SDFG.
+        """
+
+        out_name, out, out_subset = None, None, None
+        if len(state.out_edges(self)) != 1:
+            raise ValueError("Number of out edges unequal to one")
+
+        oe = next(iter(state.out_edges(self)))
+        out = sdfg.arrays[oe.data.data]
+        out_subset = oe.data.subset
+        out_name = oe.src_conn
+
+        if not out:
+            raise ValueError("Missing the output tensor.")
+
+        # Replace incoming data edges with connector-less dependency edges;
+        # drop sources that would become isolated.
+        for ie in state.in_edges(self):
+            state.remove_edge(ie)
+            if ie.dst_conn is not None:
+                self.remove_in_connector(ie.dst_conn)
+            if state.degree(ie.src) == 0:
+                state.remove_node(ie.src)
+            else:
+                state.add_edge(ie.src, None, self, None, dace.memlet.Memlet(None))
+
+        return out_name, out, out_subset
diff --git a/dace/subsets.py b/dace/subsets.py
index b19571a107..a35b798c19 100644
--- a/dace/subsets.py
+++ b/dace/subsets.py
@@ -1,4 +1,4 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
 import dace.serialize
 from dace import data, symbolic, dtypes
 import re
@@ -931,6 +931,62 @@ def intersects(self, other: 'Range'):
 
         return True
 
+    def is_contiguous_subset(self, array: 'dace.data.Array') -> bool:
+        """
+        Check if this subset represents a contiguous subset of the provided array descriptor.
+
+        For a subset to be contiguous:
+        - In Fortran layout: once a dimension is partial, all subsequent dimensions must have length 1
+        - In C layout: the same rule applies after reversing the dimensions
+
+        Args:
+            array: array descriptor to check against
+
+        Returns:
+            True if the subset is contiguous, False otherwise.
+            Returns False for all arrays that do not have a packed layout,
+            meaning that the complete array is contiguously stored in 1D memory.
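+
+        Example (illustrative): for a C-ordered array of shape (4, 5), the
+        subset [2, 0:5] (one full row) is contiguous, while [0:4, 1:3] is
+        not, because consecutive rows lie strided apart in memory.
+        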
+ """ + # Any step size != 1 -> not contiguous + for (_, _, s) in self: + if s != 1: + return False + + # Determine array layout and calculate expression lengths accordingly + if array.is_packed_fortran_strides(): + # Fortran layout: first dimension varies fastest + expr_lens = [((e + 1) - b) for (b, e, s) in self] + shape_dims = array.shape + elif array.is_packed_c_strides(): + # C layout: last dimension varies fastest, so reverse the order + expr_lens = list(reversed([((e + 1) - b) for (b, e, s) in self])) + shape_dims = list(reversed(array.shape)) + else: + return False + + # Check contiguity: once we find a partial dimension, all remaining must be length 1 + for i, (expr_len, dim) in enumerate(zip(expr_lens, shape_dims)): + try: + # Check if this dimension is partial (less than full shape) + if expr_len < dim: + # This dimension is partial - all remaining dimensions must be length 1 + for j in range(i + 1, len(expr_lens)): + if expr_lens[j] != 1: + return False + # All remaining dimensions are 1, so this is contiguous + return True + except TypeError: + # Handle symbolic expressions that can't be compared + # Assume it might be partial, so check remaining dimensions + for j in range(i + 1, len(expr_lens)): + if expr_lens[j] != 1: + return False + # All remaining dimensions are 1 + return True + + # All dimensions are full size - this is contiguous + return True + @dace.serialize.serializable class Indices(Subset): diff --git a/dace/transformation/passes/assignment_and_copy_kernel_to_memset_and_memcpy.py b/dace/transformation/passes/assignment_and_copy_kernel_to_memset_and_memcpy.py new file mode 100644 index 0000000000..95ed871fae --- /dev/null +++ b/dace/transformation/passes/assignment_and_copy_kernel_to_memset_and_memcpy.py @@ -0,0 +1,530 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. + +import warnings +import dace +import copy +from dace import Tuple, properties +from dace.memlet import Memlet +from dace.sdfg.graph import Edge, MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.libraries.standard.nodes.copy_node import CopyLibraryNode +from dace.libraries.standard.nodes.memset_node import MemsetLibraryNode +from typing import Dict, Iterable, List, Set + + +@properties.make_properties +@transformation.explicit_cf_compatible +class AssignmentAndCopyKernelToMemsetAndMemcpy(ppl.Pass): + overapproximate_first_dimension = properties.Property( + dtype=bool, + default=False, + desc= + "If True, the first dimension of the map is overapproximated to be contiguous, even if it is not. 
This is useful in cases where the first dimension is stored contiguously but the map range does not span it fully.",
+    )
+    apply_only_on_labels = properties.ListProperty(element_type=str, default=[], allow_none=False)
+
+    rmid = 0
+
+    def __init__(self, overapproximate_first_dimensions=False, apply_only_on_labels=list()):
+        self.overapproximate_first_dimension = overapproximate_first_dimensions
+        self.apply_only_on_labels = apply_only_on_labels
+
+    def modifies(self) -> ppl.Modifies:
+        return ppl.Modifies.Everything
+
+    def should_reapply(self, modified: ppl.Modifies) -> bool:
+        return False
+
+    def depends_on(self):
+        return set()
+
+    def _get_edges_from_path(self, state: dace.SDFGState, node_path: List[dace.nodes.Node]) -> List[MultiConnectorEdge]:
+        if len(node_path) == 1:
+            return []
+        edges = []
+        for i in range(len(node_path) - 1):
+            src = node_path[i]
+            dst = node_path[i + 1]
+            oes = {oe for oe in state.out_edges(src) if oe.dst == dst}
+            if len(oes) != 1:
+                # Fail
+                return []
+            oe = oes.pop()
+            edges.append(oe)
+        return edges
+
+    def _detect_contiguous_memcpy_paths(self, state: dace.SDFGState, node: dace.nodes.MapEntry):
+        paths = list()
+
+        # If the map range is not contiguous, we can't do contiguous copy detection
+        step_equal_one = True
+        for (b, e, s) in node.map.range:
+            if s != 1:
+                step_equal_one = False
+                break
+
+        # Non-unit step in map range
+        if not step_equal_one:
+            return paths
+
+        assert node in state.nodes()
+        assert state.exit_node(node) in state.nodes()
+        path_candidates = [
+            self._get_edges_from_path(state, p)
+            for p in state.all_simple_paths(node, state.exit_node(node), as_edges=False)
+        ]
+        # AN1 -> MapEntry -> Tasklet -> MapExit -> AN2
+        # Need to get AN1 and AN2
+        for path_candidate in path_candidates:
+            if len(path_candidate) != 2:
+                continue
+            # Get AN1 by replacing the name of the OUT connector
+            if path_candidate[0].dst_conn is None or (not path_candidate[0].src_conn.startswith("OUT_")):
+                continue
+            ie = next(
+                state.in_edges_by_connector(path_candidate[0].src, path_candidate[0].src_conn.replace("OUT_", "IN_")))
+            oe = next(
+                state.out_edges_by_connector(path_candidate[-1].dst, path_candidate[-1].dst_conn.replace("IN_",
+                                                                                                         "OUT_")))
+            tasklet = path_candidate[1].src
+
+            # Tasklet in the middle
+            if not isinstance(tasklet, dace.nodes.Tasklet):
+                continue
+            if len(tasklet.in_connectors) != 1 or len(tasklet.out_connectors) != 1:
+                continue
+            # Output Access Node
+            if not isinstance(oe.dst, dace.nodes.AccessNode):
+                continue
+            # Input Access Node
+            if not isinstance(ie.src, dace.nodes.AccessNode):
+                continue
+
+            in_conn = next(iter(tasklet.in_connectors))
+            out_conn = next(iter(tasklet.out_connectors))
+            if tasklet.language == dace.Language.Python:
+                tasklet_code_str = tasklet.code.as_string
+                if f"{out_conn} = {in_conn}" != tasklet_code_str:
+                    continue
+            elif tasklet.language == dace.Language.CPP:
+                tasklet_code_str = tasklet.code.as_string
+                if f"{out_conn} = {in_conn};" != tasklet_code_str:
+                    continue
+            else:
+                continue
+
+            paths.append([ie] + path_candidate + [oe])
+
+        return paths
+
+    def _detect_contiguous_memset_paths(self, state: dace.SDFGState, node: dace.nodes.MapEntry):
+        # All tasklets within the map
+        paths = list()
+
+        # If the map range is not contiguous, we can't do contiguous memset detection
+        step_equal_one = True
+        for (b, e, s) in node.map.range:
+            if s != 1:
+                step_equal_one = False
+                break
+
+        # Non-unit step in map range
+        if not step_equal_one:
+            return paths
+
+        assert node in state.nodes()
+        assert state.exit_node(node) in state.nodes()
+        
path_candidates = [ + self._get_edges_from_path(state, p) + for p in state.all_simple_paths(node, state.exit_node(node), as_edges=False) + ] + # MapEntry -> Tasklet -> MapExit -> AN2 + # Need to get AN2 only + for path_candidate in path_candidates: + if len(path_candidate) != 2: + continue + + ie = path_candidate[0] + if ie.src_conn is not None or ie.dst_conn is not None or ie.data.data is not None: + continue + + oe = next( + state.out_edges_by_connector(path_candidate[-1].dst, path_candidate[-1].dst_conn.replace("IN_", + "OUT_"))) + tasklet = path_candidate[1].src + + # Tasklet in the middle + if not isinstance(tasklet, dace.nodes.Tasklet): + continue + if len(tasklet.in_connectors) != 0 or len(tasklet.out_connectors) != 1: + continue + # Output Access Node + if not isinstance(oe.dst, dace.nodes.AccessNode): + continue + + out_conn = next(iter(tasklet.out_connectors)) + if tasklet.language == dace.Language.Python: + tasklet_code_str = tasklet.code.as_string + if f"{out_conn} = 0" != tasklet_code_str and f"{out_conn} = 0.0" != tasklet_code_str: + continue + elif tasklet.language == dace.Language.CPP: + tasklet_code_str = tasklet.code.as_string + if f"{out_conn} = 0;" != tasklet_code_str and f"{out_conn} = 0.0;" != tasklet_code_str: + continue + else: + continue + + paths.append(path_candidate + [oe]) + + return paths + + def _get_num_tasklets_within_map(self, state: dace.SDFGState, node: dace.nodes.MapEntry): + assert node in state.nodes(), f"Map entry {node} not in state {state}" + assert isinstance(node, dace.nodes.MapEntry), f"Node {node} is not a MapEntry" + assert state.exit_node(node) in state.nodes(), f"Map exit {state.exit_node(node)} not in state {state}" + n = {n for n in state.all_nodes_between(node, state.exit_node(node)) if isinstance(n, dace.nodes.Tasklet)} + return len(n) + + def _get_write_begin_and_length(self, + state: dace.SDFGState, + map_entry: dace.nodes.MapEntry, + tasklet: dace.nodes.Tasklet, + verbose=True): + range_list = { + dace.symbolic.symbol(p): (b, e, s) + for (p, (b, e, s)) in zip(map_entry.map.params, map_entry.map.range) + } + + in_edge = state.in_edges(tasklet)[0] + out_edge = state.out_edges(tasklet)[0] + + if in_edge.data.data is not None: + in_data_range = [(b, e, s) for (b, e, s) in in_edge.data.subset] + out_data_range = [(b, e, s) for (b, e, s) in out_edge.data.subset] + + new_in_data_range = [] + new_out_data_range = [] + + if in_edge.data.data is not None: + for (b, e, s) in in_data_range: + nb: dace.symbolic.SymExpr = b + ne: dace.symbolic.SymExpr = e + ns: dace.symbolic.SymExpr = s + for (p, (b2, e2, s2)) in range_list.items(): + nb = nb.subs(p, b2) + ne = ne.subs(p, e2) + assert ns == 1 and s2 == 1, "Only step of 1 is supported for memcpy detection" + new_in_data_range.append((nb, ne, ns)) + + # If we overapproximate the first dimension, we assume it is contiguous + if self.overapproximate_first_dimension: + arr = state.sdfg.arrays[in_edge.data.data] + stride_one_dimension = {(i, d) for i, (d, s) in enumerate(zip(arr.shape, arr.strides)) if s == 1} + assert len(stride_one_dimension) <= 1 # If a view inside a nested SDFG it can be 0 too + # If no stride-one-dimension then we can't remove this + if len(stride_one_dimension) == 0: + return None, None, None + dim_offset, stride_one_dimension = stride_one_dimension.pop() + new_in_data_range[dim_offset] = ((0, stride_one_dimension - 1, 1)) + + for (b, e, s) in out_data_range: + nb: dace.symbolic.SymExpr = b + ne: dace.symbolic.SymExpr = e + ns: dace.symbolic.SymExpr = s + for (p, (b2, e2, s2)) in 
range_list.items():
+                nb = nb.subs(p, b2)
+                ne = ne.subs(p, e2)
+                assert ns == 1 and s2 == 1, "Only step of 1 is supported for memcpy/memset detection"
+            new_out_data_range.append((nb, ne, ns))
+
+        # If we overapproximate the first dimension, we assume it is contiguous
+        if self.overapproximate_first_dimension:
+            arr = state.sdfg.arrays[out_edge.data.data]
+            stride_one_dimension = {(i, d) for i, (d, s) in enumerate(zip(arr.shape, arr.strides)) if s == 1}
+            assert len(stride_one_dimension) <= 1  # If a view inside a nested SDFG it can be 0 too
+            # If there is no stride-one dimension then we can't remove this
+            if len(stride_one_dimension) == 0:
+                return None, None, None
+            dim_offset, stride_one_dimension = stride_one_dimension.pop()
+            new_out_data_range[dim_offset] = (0, stride_one_dimension - 1, 1)
+
+        new_in_data_subset = dace.subsets.Range(new_in_data_range) if in_edge.data.data is not None else None
+        new_out_data_subset = dace.subsets.Range(new_out_data_range) if out_edge.data.data is not None else None
+
+        if in_edge.data.data is not None:
+            contig_subset = new_in_data_subset.is_contiguous_subset(state.sdfg.arrays[in_edge.data.data])
+            if not contig_subset:
+                warnings.warn(f"Input array {in_edge.data.data} is not contiguous, cannot remove memcpy/memset.",
+                              UserWarning)
+                return None, None, None
+
+        if out_edge.data.data is not None:
+            contig_subset = new_out_data_subset.is_contiguous_subset(state.sdfg.arrays[out_edge.data.data])
+            if not contig_subset:
+                warnings.warn(
+                    f"Output array {out_edge.data.data} is not contiguous, cannot remove memcpy/memset {new_out_data_range} of ({state.sdfg.arrays[out_edge.data.data]})",
+                    UserWarning)
+                return None, None, None
+
+        if in_edge.data.data is not None:
+            in_begin_exprs = [b for (b, e, s) in new_in_data_range]
+            in_length_exprs = [(e + 1) - b for (b, e, s) in new_in_data_range]
+        out_begin_exprs = [b for (b, e, s) in new_out_data_range]
+        out_length_exprs = [(e + 1) - b for (b, e, s) in new_out_data_range]
+
+        if in_edge.data.data is not None:
+            in_begin_collapsed = dace.symbolic.SymExpr(1)
+            in_length_collapsed = dace.symbolic.SymExpr(1)
+        out_begin_collapsed = dace.symbolic.SymExpr(1)
+        out_length_collapsed = dace.symbolic.SymExpr(1)
+
+        # We ensured the subset is contiguous, so we can get the length by multiplying each dimension's length
+        if in_edge.data.data is not None:
+            for i, b in enumerate(in_begin_exprs):
+                in_begin_collapsed *= b
+
+            for i, l in enumerate(in_length_exprs):
+                in_length_collapsed *= l
+
+        for i, b in enumerate(out_begin_exprs):
+            out_begin_collapsed *= b
+
+        for i, l in enumerate(out_length_exprs):
+            out_length_collapsed *= l
+
+        if in_edge.data.data is None:
+            in_begin_collapsed = None
+            in_length_collapsed = None
+
+        if in_length_collapsed is not None:
+            # If the collapsed lengths differ, the access is over a non-unit
+            # stride dimension and this is not a plain contiguous copy
+            if in_length_collapsed != out_length_collapsed:
+                return None, None, None
+
+        return new_in_data_range, new_out_data_range, out_length_collapsed
+
+    def remove_memcpy_from_kernel(self, state: dace.SDFGState, node: dace.nodes.MapEntry, verbose=True):
+        memcpy_paths = self._detect_contiguous_memcpy_paths(state, node)
+        rmed_count = 0
+
+        joined_edges = set()
+
+        for memcpy_path in memcpy_paths:
+            src_access_node = memcpy_path[0].src
+            map_entry = memcpy_path[0].dst
+            tasklet = memcpy_path[1].dst
+            map_exit = memcpy_path[2].dst
+            dst_access_node = memcpy_path[3].dst
+            if src_access_node not in state.nodes() or map_entry not in state.nodes() or tasklet not in state.nodes(
+            ) or map_exit not in state.nodes() or dst_access_node not in state.nodes():
+                raise Exception(
+                    f"Map entry, exit or tasklet not in state: {map_entry} ({map_entry in state.nodes()}), "
+                    f"{map_exit} ({map_exit in state.nodes()}), {tasklet} ({tasklet in state.nodes()}).", )
+
+            # If src and dst types are not the same, we can't do a memcpy
+            src_desc = state.sdfg.arrays[src_access_node.data]
+            dst_desc = state.sdfg.arrays[dst_access_node.data]
+            if src_desc.dtype != dst_desc.dtype:
+                if verbose:
+                    warnings.warn(
+                        f"Source and destination types do not match for memcpy removal: {src_desc.dtype} != {dst_desc.dtype}. Skipping.",
+                        UserWarning)
+                continue
+            if src_desc.storage != dst_desc.storage:
+                if verbose:
+                    warnings.warn(
+                        f"Source and destination storage types do not match for memcpy removal: {src_desc.storage} != {dst_desc.storage}. Skipping.",
+                        UserWarning)
+                continue
+
+            # To calculate the total range, take the tasklet's input subset and
+            # substitute the map parameters with the map range.
+            # Needs to happen before removing the path because it requires the tasklet's edges
+            begin_subset, exit_subset, copy_length = self._get_write_begin_and_length(
+                state, map_entry, tasklet, verbose)
+
+            if begin_subset is None and exit_subset is None and copy_length is None:
+                continue
+
+            # We can now remove the memcpy path
+            in_edges = state.in_edges(map_entry)
+
+            # If src / dst are not in the graph anymore, add new ones
+            if src_access_node not in state.nodes():
+                new_src_access_node = state.add_access(src_access_node.data)
+            else:
+                new_src_access_node = src_access_node
+            if dst_access_node not in state.nodes():
+                new_dst_access_node = state.add_access(dst_access_node.data)
+            else:
+                new_dst_access_node = dst_access_node
+
+            # Add a new memcpy library node
+            tasklet = CopyLibraryNode(
+                name=f"copyLib_{new_src_access_node.data}_{new_dst_access_node.data}_{self.rmid}", )
+            state.add_node(tasklet)
+            self.rmid += 1
+            state.add_edge(new_src_access_node, None, tasklet, "_in",
+                           dace.memlet.Memlet(subset=dace.subsets.Range(begin_subset), data=new_src_access_node.data))
+            state.add_edge(tasklet, "_out", new_dst_access_node, None,
+                           dace.memlet.Memlet(subset=dace.subsets.Range(exit_subset), data=new_dst_access_node.data))
+            tasklet.add_in_connector("_in")
+            tasklet.add_out_connector("_out")
+            for ie in in_edges:
+                if not ie.dst_conn.startswith("IN_"):
+                    _an = state.add_access(ie.data.data)
+                    state.add_edge(_an, None, tasklet, ie.dst_conn, copy.deepcopy(ie.data))
+                    tasklet.add_in_connector(ie.dst_conn)
+
+            rmed_count += 1
+
+        for memcpy_path in memcpy_paths:
+            for e in memcpy_path:
+                joined_edges.add(e)
+
+        self.rm_edges(state, joined_edges)
+
+        return rmed_count
+
+    def remove_memset_from_kernel(self, state: dace.SDFGState, node: dace.nodes.MapEntry, verbose=True):
+        memset_paths = self._detect_contiguous_memset_paths(state, node)
+
+        joined_edges = set()
+
+        rmed_count = 0
+        for memset_path in memset_paths:
+            map_entry = memset_path[0].src
+            tasklet = memset_path[0].dst
+            map_exit = memset_path[1].dst
+            dst_access_node = memset_path[2].dst
+            assert isinstance(map_entry, dace.nodes.MapEntry), f"Map entry {map_entry} is not a MapEntry"
+            assert isinstance(tasklet, dace.nodes.Tasklet), f"Tasklet {tasklet} is not a Tasklet"
+            assert isinstance(map_exit, dace.nodes.MapExit), f"Map exit {map_exit} is not a MapExit"
+            assert isinstance(dst_access_node,
+                              dace.nodes.AccessNode), f"Destination access node {dst_access_node} is not an AccessNode"
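+
+            # Worked example (illustrative, matching the tests below): for a
+            # map over j in kidia-1:kfdia that writes zsinksum[j], substituting
+            # the map range into the write subset yields
+            # zsinksum[kidia-1:kfdia], i.e. one memset of length
+            # kfdia - kidia + 1.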
+
+            # To calculate the total range, take the tasklet's output subset and
+            # substitute the map parameters with the map range.
+            # Needs to be done before removing the memset path
+            if map_entry not in state.nodes() or map_exit not in state.nodes() or tasklet not in state.nodes():
+                raise Exception(
+                    f"Map entry, exit or tasklet not in state: {map_entry} ({map_entry in state.nodes()}),"
+                    f"{map_exit} ({map_exit in state.nodes()}), {tasklet} ({tasklet in state.nodes()}).", )
+
+            begin_subset, exit_subset, copy_length = self._get_write_begin_and_length(state, map_entry, tasklet)
+
+            if begin_subset is None or exit_subset is None or copy_length is None:
+                if verbose:
+                    warnings.warn(
+                        f"Could not determine begin or exit subset or copy length for memset removal (or they are not contiguous) in map {map_entry.map}({map_entry.map.label}). Skipping.",
+                        UserWarning)
+                continue
+
+            # We can now remove the memset path
+            in_edges = state.in_edges(map_entry)
+
+            # Add a new memset library node
+            tasklet = MemsetLibraryNode(name=f"memsetLib_{dst_access_node.data}_{self.rmid}", )
+            tasklet.add_out_connector("_out")
+            state.add_node(tasklet)
+            self.rmid += 1
+            state.add_edge(tasklet, "_out", dst_access_node, None,
+                           dace.memlet.Memlet(subset=dace.subsets.Range(exit_subset), data=dst_access_node.data))
+            # Redirect all dynamic input connectors
+            for ie in in_edges:
+                if not ie.dst_conn.startswith("IN_"):
+                    _an1 = state.add_access(ie.data.data)
+                    state.add_edge(_an1, None, tasklet, ie.dst_conn, copy.deepcopy(ie.data))
+                    tasklet.add_in_connector(ie.dst_conn)
+
+            rmed_count += 1
+
+        for memset_path in memset_paths:
+            for e in memset_path:
+                joined_edges.add(e)
+
+        self.rm_edges(state, joined_edges)
+
+        return rmed_count
+
+    def _has_passthrough_connectors(self, n: dace.nodes.Node):
+        in_conns = n.in_connectors
+        out_conns = n.out_connectors
+
+        has_passthrough = any({c.startswith("IN_") for c in in_conns})
+        has_passthrough |= any({c.startswith("OUT_") for c in out_conns})
+
+        return has_passthrough
+
+    def rm_edges(self, state: dace.SDFGState, edges: Iterable[Edge[Memlet]]):
+        nodes_to_check = set()
+        for i, e in enumerate(edges):
+            assert e in state.edges(), f"{e} not in {state.edges()}"
+            state.remove_edge(e)
+            if e.src_conn is not None:
+                e.src.remove_out_connector(e.src_conn)
+            if e.dst_conn is not None:
+                e.dst.remove_in_connector(e.dst_conn)
+            nodes_to_check.add(e.src)
+            nodes_to_check.add(e.dst)
+
+        for n in nodes_to_check:
+            if isinstance(n, dace.nodes.MapEntry):
+                # If it has passthrough connectors then data is left,
+                # otherwise only dynamic connectors remain and we should remove the node
+                if (not self._has_passthrough_connectors(n)) and state.out_degree(n) == 0:
+                    state.remove_node(n)
+            if isinstance(n, dace.nodes.MapExit):
+                if not self._has_passthrough_connectors(n) and state.in_degree(n) == 0:
+                    state.remove_node(n)
+
+        for n in state.nodes():
+            if state.degree(n) == 0:
+                state.remove_node(n)
+
+    def apply_pass(self, sdfg: dace.SDFG, pipeline_res: Dict) -> int:
+        map_entries = set()
+
+        for n, g in sdfg.all_nodes_recursive():
+            if isinstance(n, dace.nodes.MapEntry):
+                map_entries.add((n, g))
+
+        rmed_memcpies = dict()
+        rmed_memsets = dict()
+
+        for (node, state) in map_entries:
+            sdfg.validate()
+            assert node in state.nodes(), f"Map entry {node} not in state {state}"
+            assert state.exit_node(node) in state.nodes(), f"Map exit {state.exit_node(node)} not in state {state}"
+
+            if self.apply_only_on_labels != [] and self.apply_only_on_labels is not None and node.label not in self.apply_only_on_labels:
+                continue
+
+            if self._get_num_tasklets_within_map(state, node) == 0:
+                continue
+
+            rmed_memcpy = self.remove_memcpy_from_kernel(state, node)
+            if rmed_memcpy > 0:
+                print(f"Removed {rmed_memcpy} memcpy from {node.label}")
+            sdfg.validate()
+
+            # If the map is only used for 1 memcpy, it might have been removed already
+            if node in state.nodes():
+                rmed_memset = self.remove_memset_from_kernel(state, node)
+                if rmed_memset > 0:
+                    print(f"Removed {rmed_memset} memset from {node.label}")
+            else:
+                rmed_memset = 0
+            sdfg.validate()
+
+            assert node not in rmed_memsets
+            assert node not in rmed_memcpies
+            rmed_memcpies[node] = rmed_memcpy
+            rmed_memsets[node] = rmed_memset
+
+        num_rmed_memcpies = sum(rmed_memcpies.values())
+        num_rmed_memsets = sum(rmed_memsets.values())
+
+        return num_rmed_memcpies + num_rmed_memsets
diff --git a/tests/library/copy_node_test.py b/tests/library/copy_node_test.py
new file mode 100644
index 0000000000..2a4ea5b2ef
--- /dev/null
+++ b/tests/library/copy_node_test.py
@@ -0,0 +1,132 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+import dace
+from dace.libraries.standard.nodes.copy_node import CopyLibraryNode
+
+import pytest
+import numpy as np
+
+# More tests in the AssignmentAndCopyKernelToMemsetAndMemcpy tests
+
+
+def _get_sdfg(implementation: str, gpu: bool) -> dace.SDFG:
+    sdfg = dace.SDFG("copy_sdfg")
+    a_name = "gpuA" if gpu else "A"
+    b_name = "gpuB" if gpu else "B"
+    sdfg.add_array(name=a_name,
+                   shape=[
+                       200,
+                   ],
+                   dtype=dace.dtypes.float64,
+                   storage=dace.dtypes.StorageType.GPU_Global if gpu else dace.dtypes.StorageType.CPU_Heap,
+                   transient=False)
+    sdfg.add_array(name=b_name,
+                   shape=[
+                       200,
+                   ],
+                   dtype=dace.dtypes.float64,
+                   storage=dace.dtypes.StorageType.GPU_Global if gpu else dace.dtypes.StorageType.CPU_Heap,
+                   transient=False)
+
+    state = sdfg.add_state("main")
+
+    a1 = state.add_access(a_name)
+    b1 = state.add_access(b_name)
+
+    libnode = CopyLibraryNode(name="cp1", inputs={"_in"}, outputs={"_out"})
+    if implementation is not None:
+        libnode.implementation = implementation
+
+    state.add_edge(a1, None, libnode, "_in", dace.memlet.Memlet(f"{a_name}[150:200]"))
+    state.add_edge(libnode, "_out", b1, None, dace.memlet.Memlet(f"{b_name}[50:100]"))
+
+    return sdfg
+
+
+def test_copy_pure_cpu():
+    sdfg = _get_sdfg("pure", gpu=False)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    exe = sdfg.compile()
+
+    A = np.ones((200, ), dtype=np.float64)
+    B = np.zeros((200, ), dtype=np.float64)
+    exe(A=A, B=B)
+
+    # Check that the copied slice matches
+    np.testing.assert_array_equal(B[50:100], A[150:200])
+    # Other parts of B should remain zeros
+    assert np.all(B[:50] == 0)
+    assert np.all(B[100:] == 0)
+
+
+@pytest.mark.gpu
+def test_copy_pure_gpu():
+    import cupy as cp
+
+    sdfg = _get_sdfg("pure", gpu=True)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    exe = sdfg.compile()
+
+    A = cp.ones((200, ), dtype=cp.float64)
+    B = cp.zeros((200, ), dtype=cp.float64)
+
+    exe(gpuA=A, gpuB=B)
+
+    # Check that the copied slice matches
+    cp.testing.assert_array_equal(B[50:100], A[150:200])
+    # Other parts of B should remain zeros
+    assert cp.all(B[:50] == 0)
+    assert cp.all(B[100:] == 0)
+
+
+@pytest.mark.gpu
+def test_copy_cuda_gpu():
+    import cupy as cp
+
+    sdfg = _get_sdfg("CUDA", gpu=True)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    exe = sdfg.compile()
+
+    A = cp.arange(200, dtype=cp.float64)
+    B = cp.zeros((200, ), dtype=cp.float64)
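+
+    # Note (assumption about DaCe's stream handling): the CUDA expansion
+    # issues an asynchronous copy, but the generated code is expected to
+    # synchronize before returning, so the assertions below are safe.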
+    exe(gpuA=A, gpuB=B)
+
+    # Check slice copy
+    cp.testing.assert_array_equal(B[50:100], A[150:200])
+
+
+@pytest.mark.gpu
+def test_copy_cuda_cpu():
+    import numpy as np
+
+    # Using the CUDA implementation on CPU arrays should fail validation
+    sdfg = _get_sdfg("CUDA", gpu=False)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    with pytest.raises(Exception):
+        sdfg.validate()
+        sdfg.compile()
+
+
+@pytest.mark.gpu
+def test_copy_memcpy_cpu():
+    import numpy as np
+
+    sdfg = _get_sdfg("CPU", gpu=False)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    sdfg.compile()
+
+
+if __name__ == "__main__":
+    test_copy_pure_cpu()
+    test_copy_pure_gpu()
+    test_copy_memcpy_cpu()
+    test_copy_cuda_gpu()
+    test_copy_cuda_cpu()
diff --git a/tests/library/memset_node_test.py b/tests/library/memset_node_test.py
new file mode 100644
index 0000000000..322d64cd7c
--- /dev/null
+++ b/tests/library/memset_node_test.py
@@ -0,0 +1,176 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+
+import dace
+from dace.libraries.standard.nodes.memset_node import MemsetLibraryNode
+
+import pytest
+import numpy as np
+
+
+def _get_sdfg(implementation, gpu=True) -> dace.SDFG:
+    sdfg = dace.SDFG("memset_sdfg")
+    name = "gpuB" if gpu else "B"
+    sdfg.add_array(name=name,
+                   shape=[
+                       200,
+                   ],
+                   dtype=dace.dtypes.float64,
+                   storage=dace.dtypes.StorageType.GPU_Global if gpu else dace.dtypes.StorageType.CPU_Heap,
+                   transient=False)
+
+    state = sdfg.add_state("main")
+
+    b1 = state.add_access(name)
+
+    libnode = MemsetLibraryNode(name="memset1", inputs={}, outputs={"_out"})
+    if implementation is not None:
+        libnode.implementation = implementation
+
+    # Only set a slice
+    state.add_edge(libnode, "_out", b1, None, dace.memlet.Memlet(f"{name}[50:100]"))
+
+    return sdfg
+
+
+def _get_multi_dim_sdfg(implementation, gpu=True) -> dace.SDFG:
+    sdfg = dace.SDFG("memset_sdfg2")
+    name = "gpuB" if gpu else "B"
+    sdfg.add_array(name=name,
+                   shape=[50, 2, 2],
+                   dtype=dace.dtypes.float64,
+                   storage=dace.dtypes.StorageType.GPU_Global if gpu else dace.dtypes.StorageType.CPU_Heap,
+                   transient=False)
+
+    state = sdfg.add_state("main")
+
+    b1 = state.add_access(name)
+
+    libnode = MemsetLibraryNode(name="memset2", inputs={}, outputs={name})
+    if implementation is not None:
+        libnode.implementation = implementation
+
+    # Only set a slice
+    state.add_edge(libnode, name, b1, None, dace.memlet.Memlet(f"{name}[40:50, 0:2, 0:2]"))
+
+    return sdfg
+
+
+def test_memset_pure_cpu():
+    sdfg = _get_sdfg("pure", gpu=False)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    exe = sdfg.compile()
+
+    B = np.ones((200, ), dtype=np.float64)
+    exe(B=B)
+
+    assert np.all(B[:50] == 1)
+    assert np.all(B[100:] == 1)
+    assert np.all(B[50:100] == 0)
+
+
+def test_memset_pure_cpu_multi_dim():
+    sdfg = _get_multi_dim_sdfg("pure", gpu=False)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    exe = sdfg.compile()
+
+    B = np.ones((50, 2, 2), dtype=np.float64)
+    exe(B=B)
+
+    assert np.all(B[0:40, :, :] == 1)
+    assert np.all(B[40:50, :, :] == 0)
+
+
+@pytest.mark.gpu
+def test_memset_pure_gpu():
+    import cupy as cp
+
+    sdfg = _get_sdfg("pure", gpu=True)
+    sdfg.validate()
+    sdfg.expand_library_nodes()
+    sdfg.validate()
+    exe = sdfg.compile()
+
+    B = cp.ones((200, ), dtype=cp.float64)
+    exe(gpuB=B)
+
+    assert cp.all(B[:50] == 1)
+    assert cp.all(B[100:] == 1)
+    assert cp.all(B[50:100] == 0)
+
+
+@pytest.mark.gpu
+def test_memset_pure_gpu_multi_dim():
+    import cupy as cp
+
+    sdfg = _get_multi_dim_sdfg("pure", 
gpu=True) + sdfg.validate() + sdfg.expand_library_nodes() + sdfg.validate() + exe = sdfg.compile() + + B = cp.ones((50, 2, 2), dtype=np.float64) + exe(gpuB=B) + + assert cp.all(B[0:40, :, :] == 1) + assert cp.all(B[40:50, :, :] == 0) + + +@pytest.mark.gpu +def test_memset_cuda_gpu(): + import cupy as cp + + sdfg = _get_sdfg("CUDA", gpu=True) + sdfg.validate() + sdfg.expand_library_nodes() + sdfg.validate() + exe = sdfg.compile() + + B = cp.ones((200, ), dtype=cp.float64) + exe(gpuB=B) + + assert cp.all(B[:50] == 1) + assert cp.all(B[100:] == 1) + assert cp.all(B[50:100] == 0) + + +@pytest.mark.gpu +def test_memset_cuda_gpu_multi_dim(): + import cupy as cp + + sdfg = _get_multi_dim_sdfg("CUDA", gpu=True) + sdfg.validate() + sdfg.expand_library_nodes() + sdfg.validate() + exe = sdfg.compile() + + B = cp.ones((50, 2, 2), dtype=np.float64) + exe(gpuB=B) + + assert cp.all(B[0:40, :, :] == 1) + assert cp.all(B[40:50, :, :] == 0) + + +@pytest.mark.gpu +def test_memset_cuda_cpu(): + # Test CUDA implementation on CPU arrays + # should fail at validation or compilation + sdfg = _get_sdfg("CUDA", gpu=False) + sdfg.validate() + sdfg.expand_library_nodes() + with pytest.raises(Exception): + sdfg.validate() + sdfg.compile() + + +if __name__ == "__main__": + test_memset_pure_cpu() + test_memset_pure_gpu() + test_memset_cuda_gpu() + test_memset_cuda_cpu() + test_memset_pure_cpu_multi_dim() + test_memset_pure_gpu_multi_dim() + test_memset_cuda_gpu_multi_dim() diff --git a/tests/passes/assignment_and_copy_kernel_to_memset_and_memcpy_test.py b/tests/passes/assignment_and_copy_kernel_to_memset_and_memcpy_test.py new file mode 100644 index 0000000000..6b3cee4f1c --- /dev/null +++ b/tests/passes/assignment_and_copy_kernel_to_memset_and_memcpy_test.py @@ -0,0 +1,935 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. + +import functools +import dace +import numpy +import pytest +from dace.libraries.standard.nodes.copy_node import CopyLibraryNode +from dace.libraries.standard.nodes.memset_node import MemsetLibraryNode +from dace.properties import CodeBlock +from dace.sdfg.state import LoopRegion +from dace.transformation.passes.assignment_and_copy_kernel_to_memset_and_memcpy import AssignmentAndCopyKernelToMemsetAndMemcpy + +# Global dimension size for all test arrays +DIM_SIZE = 10 +D = dace.symbol("D") +EXPANSION_TYPES = ["pure", "CPU", pytest.param("CUDA", marks=pytest.mark.gpu)] + + +def _get_sdfg( + num_memcpies: int, + num_memsets: int, + extra_computation: bool, + non_zero: bool, + subset_in_first_dim: bool, +) -> dace.SDFG: + """ + Construct an SDFG that performs a configurable number of memcpy and memset + operations, possibly with extra computation or non-zero memsets. + """ + + sdfg = dace.SDFG("main") + state = sdfg.add_state("memset_memcpy_maps") + + # Define the iteration space of the map (controls which indices are touched) + map_entry, map_exit = state.add_map( + name="memcpy_memset_map", + ndrange={ + "i": + dace.subsets.Range([(0, DIM_SIZE - 1, + 1)]) if not subset_in_first_dim else dace.subsets.Range([(2, DIM_SIZE - 1, 1)]), + "j": + dace.subsets.Range([(0, DIM_SIZE - 1, 1)]), + }, + ) + + # Select memset value: 0.0 or 1.0 depending on `non_zero` + assign_value = "0" if not non_zero else "1" + + # Create each memcpy or memset node + for i in range(num_memcpies + num_memsets): + is_memcpy = i < num_memcpies + ch = chr(ord("A") + i) # Name arrays alphabetically: A, B, C, ... 
+ + in_name, out_name = f"{ch}_IN", f"{ch}_OUT" + + # Add 2D arrays for input and output + for name in (in_name, out_name): + sdfg.add_array( + name=name, + shape=(DIM_SIZE, DIM_SIZE), + dtype=dace.float64, + transient=False, + ) + + # Build the tasklet: memcpy = pass-through, memset = constant assignment + tasklet_name = f"{'memcpy' if is_memcpy else 'memset'}_{i}" + tasklet_code = "_out = _in" if is_memcpy else f"_out = {assign_value}" + + tasklet = state.add_tasklet( + name=tasklet_name, + inputs={"_in"} if is_memcpy else set(), + outputs={"_out"}, + code=tasklet_code, + ) + tasklet.add_out_connector("_out") + + # Handle input connection for memcpy + if is_memcpy: + # Connect array → map → tasklet + state.add_edge( + state.add_access(in_name), + None, + map_entry, + f"IN_{in_name}", + dace.memlet.Memlet(f"{in_name}[2:{DIM_SIZE}, 0:{DIM_SIZE}]" + if subset_in_first_dim else f"{in_name}[0:{DIM_SIZE}, 0:{DIM_SIZE}]"), + ) + map_entry.add_in_connector(f"IN_{in_name}") + map_entry.add_out_connector(f"OUT_{in_name}") + tasklet.add_in_connector("_in") + state.add_edge( + map_entry, + f"OUT_{in_name}", + tasklet, + "_in", + dace.memlet.Memlet(f"{in_name}[i, j]"), + ) + else: + # Memset has no input, only output dependency + state.add_edge( + map_entry, + None, + tasklet, + None, + dace.memlet.Memlet(None), + ) + + # If enabled, add extra computation: double every other result + if extra_computation and i % 2 == 0: + sdfg.add_scalar( + f"tmp_{i}", + dace.float64, + storage=dace.dtypes.StorageType.Register, + transient=True, + ) + tmp_access = state.add_access(f"tmp_{i}") + + # Store tasklet result in temporary + state.add_edge(tasklet, "_out", tmp_access, None, dace.memlet.Memlet(f"tmp_{i}[0]")) + + # Add extra tasklet that doubles the value + extra_tasklet = state.add_tasklet( + name=f"{tasklet_name}_extra_work", + inputs={"_in"}, + outputs={"_out"}, + code="_out = 2 * _in", + ) + extra_tasklet.add_in_connector("_in") + extra_tasklet.add_out_connector("_out") + + state.add_edge( + tmp_access, + None, + extra_tasklet, + "_in", + dace.memlet.Memlet(f"tmp_{i}[0]"), + ) + state.add_edge( + extra_tasklet, + "_out", + map_exit, + f"IN_{out_name}", + dace.memlet.Memlet(f"{out_name}[i, j]"), + ) + else: + # Normal write path: tasklet → map_exit + state.add_edge( + tasklet, + "_out", + map_exit, + f"IN_{out_name}", + dace.memlet.Memlet(f"{out_name}[i, j]"), + ) + + # Final output: map_exit → output array + state.add_edge( + map_exit, + f"OUT_{out_name}", + state.add_access(out_name), + None, + dace.memlet.Memlet(f"{out_name}[2:{DIM_SIZE}, 0:{DIM_SIZE}]" + if subset_in_first_dim else f"{out_name}[0:{DIM_SIZE}, 0:{DIM_SIZE}]"), + ) + map_exit.add_in_connector(f"IN_{out_name}") + map_exit.add_out_connector(f"OUT_{out_name}") + + # Save for debugging and validate SDFG correctness + sdfg.validate() + return sdfg + + +def _get_num_memcpy_library_nodes(sdfg: dace.SDFG) -> int: + """Return number of memcpy library nodes in an SDFG.""" + return sum(isinstance(node, CopyLibraryNode) for node, state in sdfg.all_nodes_recursive()) + + +def _get_num_memset_library_nodes(sdfg: dace.SDFG) -> int: + """Return number of memset library nodes in an SDFG.""" + return sum(isinstance(node, MemsetLibraryNode) for node, state in sdfg.all_nodes_recursive()) + + +def _set_lib_node_type(sdfg: dace.SDFG, expansion_type: str): + for n, g in sdfg.all_nodes_recursive(): + if isinstance(n, (CopyLibraryNode, MemsetLibraryNode)): + n.implementation = expansion_type + + +def temporarily_disable_autoopt_and_serialization(func): + + 
@functools.wraps(func) + def wrapper(*args, **kwargs): + # Save original values + orig_autoopt = dace.config.Config.get("optimizer", "autooptimize") + orig_serialization = dace.config.Config.get("testing", "serialization") + try: + # Set both to False + dace.config.Config.set("optimizer", "autooptimize", value=False) + dace.config.Config.set("testing", "serialization", value=False) + return func(*args, **kwargs) + finally: + # Restore original values + dace.config.Config.set("optimizer", "autooptimize", value=orig_autoopt) + dace.config.Config.set("testing", "serialization", value=orig_serialization) + + return wrapper + + +@dace.program +def double_memset_with_dynamic_connectors(kfdia: dace.int32, kidia: dace.int32, llindex3: dace.float64[D, D], + zsinksum: dace.float64[D]): + for i, j in dace.map[0:D:1, kidia - 1:kfdia:]: + llindex3[i, j] = 0.0 + for j in dace.map[kidia - 1:kfdia:1]: + zsinksum[j] = 0.0 + + +@dace.program +def double_memcpy_with_dynamic_connectors(kfdia: dace.int32, kidia: dace.int32, llindex3_in: dace.float64[D, D], + zsinksum_in: dace.float64[D], llindex3_out: dace.float64[D, D], + zsinksum_out: dace.float64[D]): + for i, j in dace.map[0:D:1, kidia - 1:kfdia:]: + llindex3_out[i, j] = llindex3_in[i, j] + for j in dace.map[kidia - 1:kfdia:1]: + zsinksum_out[j] = zsinksum_in[j] + + +@dace.program +def nested_memset_maps_with_dynamic_connectors(kidia: dace.int64, kfdia: dace.int64, llindex: dace.float64[5, 5, D], + zsinksum: dace.float64[5, D]): + for i in dace.map[0:5]: + sym_kidia = kidia + sym_kfdia = kfdia + for j, k in dace.map[0:5, sym_kidia:sym_kfdia:1]: + llindex[i, j, k] = 0.0 + for k in dace.map[sym_kidia:sym_kfdia:1]: + zsinksum[i, k] = 0.0 + + +@dace.program +def nested_memcpy_maps_with_dynamic_connectors(kidia: dace.int64, kfdia: dace.int64, llindex_in: dace.float64[5, 5, D], + zsinksum_in: dace.float64[5, D], llindex_out: dace.float64[5, 5, D], + zsinksum_out: dace.float64[5, D]): + for i in dace.map[0:5]: + sym_kidia = kidia + sym_kfdia = kfdia + for j, k in dace.map[0:5, sym_kidia:sym_kfdia:1]: + llindex_out[i, j, k] = llindex_in[i, j, k] + for k in dace.map[sym_kidia:sym_kfdia:1]: + zsinksum_out[i, k] = zsinksum_in[i, k] + + +@dace.program +def nested_memcpy_maps_with_dimension_change(kidia: dace.int64, kfdia: dace.int64, zcovptot: dace.float64[D], + pcovptot: dace.float64[D, D]): + for i in range(D): + sym_kidia = kidia + sym_kfdia = kfdia + for j in dace.map[sym_kidia:sym_kfdia]: + pcovptot[i, j] = zcovptot[j] + + +@dace.program +def nested_memset_maps_with_dimension_change(kidia: dace.int64, kfdia: dace.int64, pcovptot: dace.float64[D, D]): + for i in range(D): + sym_kidia = kidia + sym_kfdia = kfdia + for j in dace.map[sym_kidia:sym_kfdia]: + pcovptot[i, j] = 0.0 + + +def set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg: dace.SDFG, expansion_type: str): + if expansion_type != "CUDA": + return + + for arr_name, arr in sdfg.arrays.items(): + if not isinstance(arr, dace.data.Scalar): + arr.storage = dace.dtypes.StorageType.GPU_Global + for state in sdfg.all_states(): + for node in state.nodes(): + if isinstance(node, dace.nodes.NestedSDFG): + set_dtype_to_gpu_if_expansion_type_is_cuda(node.sdfg, expansion_type) + + +@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES) +@temporarily_disable_autoopt_and_serialization +def test_nested_memcpy_maps_with_dimension_change(expansion_type: str): + if expansion_type == "CUDA": + import cupy + xp = cupy if expansion_type == "CUDA" else numpy + + sdfg = nested_memcpy_maps_with_dimension_change.to_sdfg() + 
sdfg.name = sdfg.name + f"_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 1, f"Expected 1 memcpy library node, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 0, f"Expected 0 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.random.rand(DIM_SIZE)
+    B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(zcovptot=A_IN, pcovptot=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    assert xp.allclose(A_IN, B_IN)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memset_maps_with_dimension_change(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = nested_memset_maps_with_dimension_change.to_sdfg()
+    sdfg.name = sdfg.name + f"_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    assert _get_num_memset_library_nodes(
+        sdfg) == 1, f"Expected 1 memset library node, got {_get_num_memset_library_nodes(sdfg)}"
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 0, f"Expected 0 memcpy library nodes, got {_get_num_memcpy_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(pcovptot=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    assert xp.allclose(B_IN, 0.0)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memset_maps_with_dynamic_connectors(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = nested_memset_maps_with_dynamic_connectors.to_sdfg()
+    sdfg.name = sdfg.name + f"_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=False).apply_pass(sdfg, {})
+    # Without overapproximation, only one of the two kernels becomes a memset libnode
+    assert _get_num_memset_library_nodes(
+        sdfg) == 1, f"Expected 1 memset library node, got {_get_num_memset_library_nodes(sdfg)}"
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    # With overapproximation, both kernels become memset libnodes
+    assert _get_num_memset_library_nodes(
+        sdfg) == 2, f"Expected 2 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.random.rand(5, 5, DIM_SIZE)
+    B_IN = xp.random.rand(5, DIM_SIZE)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(llindex=A_IN, zsinksum=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    assert xp.allclose(A_IN, 0.0)
+    assert xp.allclose(B_IN, 0.0)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memcpy_maps_with_dynamic_connectors(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = nested_memcpy_maps_with_dynamic_connectors.to_sdfg()
+    sdfg.name = sdfg.name + f"_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=False).apply_pass(sdfg, {})
+    # Without overapproximation, only one of the two kernels becomes a memcpy libnode
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 1, f"Expected 1 memcpy library node, got {_get_num_memcpy_library_nodes(sdfg)}"
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    # With overapproximation, both kernels become memcpy libnodes
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 2, f"Expected 2 memcpy library nodes, got {_get_num_memcpy_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.random.rand(5, 5, DIM_SIZE)
+    A_OUT = xp.random.rand(5, 5, DIM_SIZE)
+    B_IN = xp.random.rand(5, DIM_SIZE)
+    B_OUT = xp.random.rand(5, DIM_SIZE)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(llindex_in=A_IN, zsinksum_in=B_IN, llindex_out=A_OUT, zsinksum_out=B_OUT, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    assert xp.allclose(A_IN, A_OUT)
+    assert xp.allclose(B_IN, B_OUT)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_double_memset_with_dynamic_connectors(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = double_memset_with_dynamic_connectors.to_sdfg()
+    sdfg.name = sdfg.name + f"_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    B_IN = xp.ones(DIM_SIZE)
+
+    sdfg.validate()
+    p = AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True)
+    p.apply_pass(sdfg, {})
+    for n, g in sdfg.all_nodes_recursive():
+        if isinstance(n, dace.nodes.NestedSDFG):
+            p.apply_pass(n.sdfg, {})
+    sdfg.validate()
+
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 0, f"Expected 0 memcpy library nodes, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 2, f"Expected 2 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(llindex3=A_IN, zsinksum=B_IN, D=DIM_SIZE, kfdia=1, kidia=DIM_SIZE)
+
+    assert xp.all(B_IN == 0.0), f"zsinksum should be fully zeroed {B_IN}"
+    assert xp.all(A_IN == 0.0), f"llindex3 should be fully zeroed {A_IN}"
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_double_memcpy_with_dynamic_connectors(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = double_memcpy_with_dynamic_connectors.to_sdfg()
+    sdfg.name = sdfg.name + f"_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    B_IN = xp.random.rand(DIM_SIZE)
+    A_OUT = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    B_OUT = xp.random.rand(DIM_SIZE)
+
+    sdfg.validate()
+    p = AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True)
+    p.apply_pass(sdfg, {})
+    for n, g in sdfg.all_nodes_recursive():
+        if isinstance(n, dace.nodes.NestedSDFG):
+            p.apply_pass(n.sdfg, {})
+    sdfg.validate()
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 2, f"Expected 2 memcpy library nodes, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 0, f"Expected 0 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(llindex3_in=A_IN,
+         zsinksum_in=B_IN,
+         llindex3_out=A_OUT,
+         zsinksum_out=B_OUT,
+         D=DIM_SIZE,
+         kfdia=1,
+         kidia=DIM_SIZE)
+
+    assert xp.all(B_IN == B_OUT)
+    assert xp.all(A_IN == A_OUT)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_simple_memcpy(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_sdfg(1, 0, False, False, False)
+    sdfg.validate()
+    sdfg.name = sdfg.name + f"_simple_memcpy_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {})
+    sdfg.validate()
+    assert _get_num_memcpy_library_nodes(sdfg) == 1
+    assert _get_num_memset_library_nodes(sdfg) == 0
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    A_OUT = xp.zeros_like(A_IN)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(A_IN=A_IN, A_OUT=A_OUT)
+
+    assert xp.allclose(A_IN, A_OUT)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_simple_memset(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_sdfg(0, 1, False, False, False)
+    sdfg.name = sdfg.name + f"_simple_memset_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(sdfg) == 0
+    assert _get_num_memset_library_nodes(sdfg) == 1
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    A_OUT = xp.zeros_like(A_IN)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(A_IN=A_IN, A_OUT=A_OUT)
+
+    assert xp.allclose(A_OUT, 0.0)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_multi_memcpy(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_sdfg(2, 0, False, False, False)
+    sdfg.validate()
+    sdfg.name = sdfg.name + f"_multi_memcpy_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(sdfg) == 2
+    assert _get_num_memset_library_nodes(sdfg) == 0
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    A_OUT = xp.zeros_like(A_IN)
+    B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    B_OUT = xp.zeros_like(B_IN)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(A_IN=A_IN, A_OUT=A_OUT, B_IN=B_IN, B_OUT=B_OUT)
+
+    assert xp.allclose(A_IN, A_OUT)
+    assert xp.allclose(B_IN, B_OUT)
+
+
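+# For reference: the five positional arguments of _get_sdfg control, in order, the number
+# of memcpy kernels, the number of memset kernels, whether extra doubling computation is
+# attached to every other kernel, whether a non-zero value is assigned instead of 0, and
+# whether only a subset of the first dimension (starting at 2) is read and written.
+
+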
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES) +@temporarily_disable_autoopt_and_serialization +def test_multi_memset(expansion_type: str): + if expansion_type == "CUDA": + import cupy + xp = cupy if expansion_type == "CUDA" else numpy + + sdfg = _get_sdfg(0, 2, False, False, False) + sdfg.validate() + sdfg.name = sdfg.name + f"_multi_memset_expansion_type_{expansion_type}" + set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type) + + AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {}) + assert _get_num_memcpy_library_nodes(sdfg) == 0 + assert _get_num_memset_library_nodes(sdfg) == 2 + + A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + A_OUT = xp.zeros_like(A_IN) + B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + B_OUT = xp.zeros_like(B_IN) + _set_lib_node_type(sdfg, expansion_type) + sdfg.expand_library_nodes(recursive=True) + sdfg.validate() + sdfg(A_IN=A_IN, A_OUT=A_OUT, B_IN=B_IN, B_OUT=B_OUT) + + assert xp.allclose(A_OUT, 0.0) + assert xp.allclose(B_OUT, 0.0) + + +@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES) +@temporarily_disable_autoopt_and_serialization +def test_multi_mixed(expansion_type: str): + if expansion_type == "CUDA": + import cupy + xp = cupy if expansion_type == "CUDA" else numpy + + sdfg = _get_sdfg(1, 1, False, False, False) + sdfg.validate() + sdfg.name = sdfg.name + f"_multi_mixed_expansion_type_{expansion_type}" + set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type) + + AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {}) + assert _get_num_memcpy_library_nodes(sdfg) == 1 + assert _get_num_memset_library_nodes(sdfg) == 1 + + A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + A_OUT = xp.zeros_like(A_IN) + B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + B_OUT = xp.zeros_like(B_IN) + _set_lib_node_type(sdfg, expansion_type) + sdfg.expand_library_nodes(recursive=True) + sdfg.validate() + sdfg(A_IN=A_IN, A_OUT=A_OUT, B_IN=B_IN, B_OUT=B_OUT) + + assert xp.allclose(A_IN, A_OUT) + assert xp.allclose(B_OUT, 0.0) + + +@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES) +@temporarily_disable_autoopt_and_serialization +def test_simple_with_extra_computation(expansion_type: str): + if expansion_type == "CUDA": + import cupy + xp = cupy if expansion_type == "CUDA" else numpy + + sdfg = _get_sdfg(2, 2, True, False, False) + sdfg.validate() + sdfg.name = sdfg.name + f"_simple_with_extra_computation_expansion_type_{expansion_type}" + set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type) + + AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {}) + + A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + A_OUT = xp.zeros_like(A_IN) + B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + B_OUT = xp.zeros_like(B_IN) + C_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + C_OUT = xp.zeros_like(C_IN) + D_IN = xp.random.rand(DIM_SIZE, DIM_SIZE) + D_OUT = xp.zeros_like(D_IN) + _set_lib_node_type(sdfg, expansion_type) + sdfg.expand_library_nodes(recursive=True) + sdfg.validate() + sdfg(A_IN=A_IN, A_OUT=A_OUT, B_IN=B_IN, B_OUT=B_OUT, C_IN=C_IN, C_OUT=C_OUT, D_IN=D_IN, D_OUT=D_OUT) + + assert xp.allclose(A_OUT, 2 * A_IN) + assert xp.allclose(B_OUT, B_IN) + assert xp.allclose(C_OUT, 0.0) + assert xp.allclose(D_OUT, 0.0) + + +@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES) +@temporarily_disable_autoopt_and_serialization +def test_simple_non_zero(expansion_type: str): + if expansion_type == "CUDA": + import cupy + xp = cupy if expansion_type == "CUDA" else numpy + + sdfg = _get_sdfg(0, 1, False, True, False) + sdfg.validate() + sdfg.name 
= sdfg.name + f"_simple_nonzero_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {})
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    A_OUT = xp.zeros_like(A_IN)
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(A_IN=A_IN, A_OUT=A_OUT)
+
+    assert xp.allclose(A_OUT, 1.0)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_mixed_overapprox(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_sdfg(2, 2, False, False, True)
+    sdfg.name = sdfg.name + f"_mixed_overapprox_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy().apply_pass(sdfg, {})
+    sdfg.validate()
+
+    A_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    A_OUT = xp.zeros_like(A_IN)
+    B_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    B_OUT = xp.zeros_like(B_IN)
+    C_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    C_OUT = xp.zeros_like(C_IN)
+    D_IN = xp.random.rand(DIM_SIZE, DIM_SIZE)
+    D_OUT = xp.zeros_like(D_IN)
+
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(A_IN=A_IN, A_OUT=A_OUT, B_IN=B_IN, B_OUT=B_OUT, C_IN=C_IN, C_OUT=C_OUT, D_IN=D_IN, D_OUT=D_OUT)
+
+    assert xp.allclose(C_OUT, 0.0)
+    assert xp.allclose(D_OUT, 0.0)
+    assert xp.allclose(B_OUT[2:10, 0:10], B_IN[2:10, 0:10])
+    assert xp.allclose(A_IN[2:10, 0:10], A_OUT[2:10, 0:10])
+
+
+def _get_nested_memcpy_with_dimension_change_and_fortran_strides(full_inner_range: bool = True,
+                                                                 fortran_strides: bool = True):
+    sdfg = dace.SDFG("nested_memcpy_with_dimension_change_and_fortran_strides")
+    inner_sdfg = dace.SDFG(name="inner_sdfg")
+
+    for sd in [sdfg, inner_sdfg]:
+        sd.add_symbol("_for_it_0", dace.int64)
+        sd.add_symbol("D", dace.int64)
+
+    scl_names = ["kfdia", "kidia"]
+
+    for sd in [sdfg, inner_sdfg]:
+        for scl_name in scl_names:
+            sd.add_scalar(name=scl_name, dtype=dace.int64)
+        for arr_name, shape, strides in [("zcovptot", (D, ), (1, )),
+                                         ("pcovptot", (D, D), (1, D) if fortran_strides else (D, 1))]:
+            if not full_inner_range and arr_name == "pcovptot" and sd == inner_sdfg:
+                sd.add_array(
+                    name=arr_name,
+                    shape=(D, ),
+                    dtype=dace.float64,
+                    transient=False,
+                    strides=(1, ) if fortran_strides else (D, ),
+                )
+            else:
+                sd.add_array(
+                    name=arr_name,
+                    shape=shape,
+                    dtype=dace.float64,
+                    transient=False,
+                    strides=strides,
+                )
+
+    for_cfg = LoopRegion(label="for1",
+                         condition_expr=CodeBlock("_for_it_0 < D"),
+                         loop_var="_for_it_0",
+                         initialize_expr=CodeBlock("_for_it_0 = 0"),
+                         update_expr=CodeBlock("_for_it_0 = _for_it_0 + 1"))
+    sdfg.add_node(for_cfg, True)
+    inner_state = for_cfg.add_state(label="s1", is_start_block=True)
+    nsdfg_node = inner_state.add_nested_sdfg(
+        sdfg=inner_sdfg,
+        inputs={"kfdia", "kidia", "zcovptot"},
+        outputs={"pcovptot"},
+        symbol_mapping={
+            "_for_it_0": "_for_it_0",
+            "D": "D"
+        },
+        name="inner_sdfg_node",
+    )
+    assert "_for_it_0" in inner_sdfg.symbols
+    assert "_for_it_0" in sdfg.symbols
+    assert "_for_it_0" not in sdfg.free_symbols
+    assert "_for_it_0" in inner_sdfg.free_symbols
+
+    inner_inner_state = inner_sdfg.add_state(label="s2", is_start_block=True)
+
+    for in_name in {"kfdia", "kidia", "zcovptot"}:
+        inner_state.add_edge(inner_state.add_access(in_name), None, nsdfg_node, in_name,
+                             dace.memlet.Memlet.from_array(in_name, sdfg.arrays[in_name]))
+
+    for out_name in {"pcovptot"}:
+        inner_state.add_edge(
+            nsdfg_node, out_name, inner_state.add_access(out_name), None,
+            dace.memlet.Memlet("pcovptot[0:D, _for_it_0]" if not full_inner_range else "pcovptot[0:D, 0:D]"))
+
+    inner_inner_state.add_mapped_tasklet(
+        name="cpy",
+        map_ranges={"i": dace.subsets.Range([(0, D - 1, 1)])},
+        input_nodes={"zcovptot": inner_inner_state.add_access("zcovptot")},
+        output_nodes={"pcovptot": inner_inner_state.add_access("pcovptot")},
+        external_edges=True,
+        code="_out = _in",
+        inputs={"_in": dace.memlet.Memlet("zcovptot[i]")},
+        outputs={"_out": dace.memlet.Memlet("pcovptot[i, _for_it_0]" if full_inner_range else "pcovptot[i]")},
+    )
+    sdfg.validate()
+    return sdfg
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memcpy_with_dimension_change_and_fortran_strides(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_nested_memcpy_with_dimension_change_and_fortran_strides(full_inner_range=True, fortran_strides=True)
+    sdfg.name = sdfg.name + f"_full_inner_range_true_fortran_strides_true_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 1, f"Expected 1 memcpy library node, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 0, f"Expected 0 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.fromfunction(lambda x: x, (DIM_SIZE, ), dtype=xp.float64).copy()
+    B_IN = xp.fromfunction(lambda x, y: x * DIM_SIZE + y, (DIM_SIZE, DIM_SIZE), dtype=xp.float64).copy()
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(zcovptot=A_IN, pcovptot=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    assert xp.allclose(A_IN, B_IN)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memcpy_with_dimension_change_and_fortran_strides_with_subset(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_nested_memcpy_with_dimension_change_and_fortran_strides(full_inner_range=False, fortran_strides=True)
+    sdfg.name = sdfg.name + f"_full_inner_range_false_fortran_strides_true_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 1, f"Expected 1 memcpy library node, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 0, f"Expected 0 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.fromfunction(lambda x: x, (DIM_SIZE, ), dtype=xp.float64).copy()
+    B_IN = xp.fromfunction(lambda x, y: x * DIM_SIZE + y, (DIM_SIZE, DIM_SIZE), dtype=xp.float64).copy()
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(zcovptot=A_IN, pcovptot=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    assert xp.allclose(A_IN, B_IN)
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memcpy_with_dimension_change_and_c_strides(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_nested_memcpy_with_dimension_change_and_fortran_strides(full_inner_range=True, fortran_strides=False)
+    sdfg.name = sdfg.name + f"_full_inner_range_true_fortran_strides_false_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 0, f"Expected 0 memcpy library nodes, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 0, f"Expected 0 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.fromfunction(lambda x: x, (DIM_SIZE, ), dtype=xp.float64).copy()
+    B_IN = xp.fromfunction(lambda x, y: x * DIM_SIZE + y, (DIM_SIZE, DIM_SIZE), dtype=xp.float64).copy()
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(zcovptot=A_IN, pcovptot=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    for j in range(DIM_SIZE):
+        assert xp.allclose(B_IN[0:DIM_SIZE, j], A_IN), f"{j}: {B_IN[0:DIM_SIZE, j] - A_IN}"
+
+
+@pytest.mark.parametrize("expansion_type", EXPANSION_TYPES)
+@temporarily_disable_autoopt_and_serialization
+def test_nested_memcpy_with_dimension_change_and_c_strides_with_subset(expansion_type: str):
+    if expansion_type == "CUDA":
+        import cupy
+    xp = cupy if expansion_type == "CUDA" else numpy
+
+    sdfg = _get_nested_memcpy_with_dimension_change_and_fortran_strides(full_inner_range=False, fortran_strides=False)
+    sdfg.name = sdfg.name + f"_full_inner_range_false_fortran_strides_false_expansion_type_{expansion_type}"
+    set_dtype_to_gpu_if_expansion_type_is_cuda(sdfg, expansion_type)
+
+    AssignmentAndCopyKernelToMemsetAndMemcpy(overapproximate_first_dimensions=True).apply_pass(sdfg, {})
+    assert _get_num_memcpy_library_nodes(
+        sdfg) == 0, f"Expected 0 memcpy library nodes, got {_get_num_memcpy_library_nodes(sdfg)}"
+    assert _get_num_memset_library_nodes(
+        sdfg) == 0, f"Expected 0 memset library nodes, got {_get_num_memset_library_nodes(sdfg)}"
+
+    kidia = 0
+    kfdia = DIM_SIZE
+    A_IN = xp.fromfunction(lambda x: x, (DIM_SIZE, ), dtype=xp.float64).copy()
+    B_IN = xp.fromfunction(lambda x, y: x * DIM_SIZE + y, (DIM_SIZE, DIM_SIZE), dtype=xp.float64).copy()
+    _set_lib_node_type(sdfg, expansion_type)
+    sdfg.expand_library_nodes(recursive=True)
+    sdfg.validate()
+    sdfg(zcovptot=A_IN, pcovptot=B_IN, kidia=kidia, kfdia=kfdia, D=DIM_SIZE)
+    for j in range(DIM_SIZE):
+        assert xp.allclose(B_IN[0:DIM_SIZE, j], A_IN), f"{j}: {B_IN[0:DIM_SIZE, j] - A_IN}"
+
+
+if __name__ == "__main__":
+    for expansion_type in EXPANSION_TYPES:
+        test_simple_memcpy(expansion_type)
+        test_simple_memset(expansion_type)
+        test_multi_memcpy(expansion_type)
+        test_multi_memset(expansion_type)
+        test_multi_mixed(expansion_type)
+        test_simple_with_extra_computation(expansion_type)
+        test_simple_non_zero(expansion_type)
+        test_mixed_overapprox(expansion_type)
+        test_nested_memset_maps_with_dynamic_connectors(expansion_type)
+        test_nested_memcpy_maps_with_dynamic_connectors(expansion_type)
+        
test_double_memset_with_dynamic_connectors(expansion_type) + test_double_memcpy_with_dynamic_connectors(expansion_type) + test_nested_memset_maps_with_dimension_change(expansion_type) + test_nested_memcpy_maps_with_dimension_change(expansion_type) + test_nested_memcpy_with_dimension_change_and_fortran_strides(expansion_type) + test_nested_memcpy_with_dimension_change_and_fortran_strides_with_subset(expansion_type) + test_nested_memcpy_with_dimension_change_and_c_strides(expansion_type) + test_nested_memcpy_with_dimension_change_and_c_strides_with_subset(expansion_type) diff --git a/tests/utils/array_dimension_utils.py b/tests/utils/array_dimension_utils.py new file mode 100644 index 0000000000..98a32857ae --- /dev/null +++ b/tests/utils/array_dimension_utils.py @@ -0,0 +1,202 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. + +import dace +import pytest + + +# Helper function +def create_array(shape, strides): + sdfg = dace.SDFG('test_sdfg') + _, array = sdfg.add_array(name='test_array', shape=shape, dtype=dace.float64, strides=strides) + return array + + +def test_get_packed_fortran_strides_1d(): + """Test Fortran strides for 1D array""" + array = create_array((10, ), None) + expected = (1, ) + result = array._get_packed_fortran_strides() + assert result == expected + + +def test_get_packed_fortran_strides_2d(): + """Test Fortran strides for 2D array""" + array = create_array((10, 20), None) + expected = (1, 10) + result = array._get_packed_fortran_strides() + assert result == expected + + +def test_get_packed_fortran_strides_3d(): + """Test Fortran strides for 3D array""" + array = create_array((10, 20, 30), None) + expected = (1, 10, 200) + result = array._get_packed_fortran_strides() + assert result == expected + + +def test_get_packed_fortran_strides_4d(): + """Test Fortran strides for 4D array""" + array = create_array((5, 10, 15, 20), None) + expected = (1, 5, 50, 750) + result = array._get_packed_fortran_strides() + assert result == expected + + +def test_get_packed_c_strides_1d(): + """Test C strides for 1D array""" + array = create_array((10, ), None) + expected = (1, ) + result = array._get_packed_c_strides() + assert result == expected + + +def test_get_packed_c_strides_2d(): + """Test C strides for 2D array""" + array = create_array((10, 20), None) + expected = (20, 1) + result = array._get_packed_c_strides() + assert result == expected + + +def test_get_packed_c_strides_3d(): + """Test C strides for 3D array""" + array = create_array((10, 20, 30), None) + expected = (600, 30, 1) + result = array._get_packed_c_strides() + assert result == expected + + +def test_get_packed_c_strides_4d(): + """Test C strides for 4D array""" + array = create_array((5, 10, 15, 20), None) + expected = (3000, 300, 20, 1) + result = array._get_packed_c_strides() + assert result == expected + + +def test_is_packed_fortran_strides_true_3d(): + """Test 3D array with Fortran strides""" + array = create_array((10, 20, 30), (1, 10, 200)) + result = array.is_packed_fortran_strides() + assert result is True + + +def test_is_packed_fortran_strides_false_c_layout(): + """Test array with C strides returns False""" + array = create_array((10, 20, 30), (600, 30, 1)) + result = array.is_packed_fortran_strides() + assert result is False + + +def test_is_packed_fortran_strides_false_custom_strides(): + """Test array with custom strides returns False""" + array = create_array((10, 20, 30), (2, 20, 400)) + result = array.is_packed_fortran_strides() + assert result is False + + +def 
test_is_packed_fortran_strides_false_wrong_order():
+    """Test array with incorrect stride ordering"""
+    array = create_array((10, 20, 30), (10, 1, 200))
+    result = array.is_packed_fortran_strides()
+    assert result is False
+
+
+def test_is_packed_c_strides_true_1d():
+    """Test 1D array with C strides"""
+    array = create_array((10, ), (1, ))
+    result = array.is_packed_c_strides()
+    assert result is True
+
+
+def test_is_packed_c_strides_true_2d():
+    """Test 2D array with C strides"""
+    array = create_array((10, 20), (20, 1))
+    result = array.is_packed_c_strides()
+    assert result is True
+
+
+def test_is_packed_c_strides_true_3d():
+    """Test 3D array with C strides"""
+    array = create_array((10, 20, 30), (600, 30, 1))
+    result = array.is_packed_c_strides()
+    assert result is True
+
+
+def test_is_packed_c_strides_false_fortran_layout():
+    """Test array with Fortran strides returns False"""
+    array = create_array((10, 20, 30), (1, 10, 200))
+    result = array.is_packed_c_strides()
+    assert result is False
+
+
+def test_is_packed_c_strides_false_custom_strides():
+    """Test array with custom strides returns False"""
+    array = create_array((10, 20, 30), (1200, 60, 2))
+    result = array.is_packed_c_strides()
+    assert result is False
+
+
+def test_is_packed_c_strides_false_wrong_order():
+    """Test array with incorrect stride ordering"""
+    array = create_array((10, 20, 30), (1, 30, 600))
+    result = array.is_packed_c_strides()
+    assert result is False
+
+
+def test_empty_shape():
+    """Test with empty shape (scalar)"""
+    array = create_array((), ())
+    fortran_result = array._get_packed_fortran_strides()
+    c_result = array._get_packed_c_strides()
+    assert fortran_result == ()
+    assert c_result == ()
+
+
+def test_fortran_and_c_equivalent_for_1d():
+    """Test that 1D arrays have the same strides for both layouts"""
+    array = create_array((100, ), (1, ))
+    assert array.is_packed_fortran_strides() is True
+    assert array.is_packed_c_strides() is True
+
+
+def test_c_strides_calculation_accumulation():
+    """Test stride accumulation for C layout"""
+    array = create_array((2, 3, 4), None)
+    result = array._get_packed_c_strides()
+    assert result == (12, 4, 1)
+
+
+def test_explicit_fortran_strides():
+    """Test explicitly set packed Fortran strides"""
+    array = create_array((5, 7, 9), (1, 5, 35))
+    assert array.is_packed_fortran_strides() is True
+    assert array.is_packed_c_strides() is False
+
+
+def test_explicit_fortran_strides_not_packed():
+    """Test explicitly set Fortran-ordered strides that are not packed"""
+    array = create_array((5, 7, 9), (10, 5 * 10, 35 * 10))
+    assert array.is_packed_fortran_strides() is False
+    assert array.is_packed_c_strides() is False
+
+
+def test_explicit_c_strides():
+    """Test explicitly set packed C strides"""
+    array = create_array((5, 7, 9), (63, 9, 1))
+    assert array.is_packed_c_strides() is True
+    assert array.is_packed_fortran_strides() is False
+
+
+def test_explicit_c_strides_not_packed():
+    """Test explicitly set C-ordered strides that are not packed"""
+    array = create_array((5, 7, 9), (63 * 5, 9 * 5, 1))
+    assert array.is_packed_c_strides() is False
+    assert array.is_packed_fortran_strides() is False
+
+
+if __name__ == "__main__":
+    tests = [obj for name, obj in globals().items() if callable(obj) and name.startswith("test_")]
+    for test_function in tests:
+        test_function()
diff --git a/tests/utils/is_contiguous_subset_test.py b/tests/utils/is_contiguous_subset_test.py
new file mode 100644
index 0000000000..e76e8135c8
--- /dev/null
+++ b/tests/utils/is_contiguous_subset_test.py
@@ -0,0 +1,241 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe 
authors. All rights reserved. + +import dace +import pytest +import dace.subsets + + +# Helper function +def create_array(shape, strides): + """Helper to create a dace.data.Array with specific strides""" + sdfg = dace.SDFG('test_sdfg') + name, array = sdfg.add_array(name='test_array', shape=shape, dtype=dace.float64, strides=strides) + return array + + +def create_subset(ranges): + """Helper to create a dace.subsets.Range from list of (begin, end, step) tuples""" + return dace.subsets.Range(ranges) + + +def test_fortran_full_range_all_dimensions(): + """Test Fortran array with full range in all dimensions - should be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(0, 9, 1), (0, 19, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_partial_first_dim_rest_size_one(): + """Test Fortran array with partial first dim, rest size 1 - should be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(0, 4, 1), (0, 0, 1), (0, 0, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_partial_first_dim_rest_full(): + """Test Fortran array with partial first dim, rest full - should NOT be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(0, 4, 1), (0, 19, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is False + + +def test_fortran_full_first_partial_second_rest_size_one(): + """Test Fortran array with full first, partial second, rest size 1 - should be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(0, 9, 1), (0, 9, 1), (8, 8, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_full_first_partial_second_rest_full(): + """Test Fortran array with full first, partial second, rest full - should NOT be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(0, 9, 1), (0, 9, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is False + + +def test_fortran_partial_last_dim_only(): + """Test Fortran array with only last dim partial - should be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(0, 9, 1), (0, 19, 1), (0, 5, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_single_element_all_dims(): + """Test Fortran array with single element (all dims size 1) - should be contiguous""" + array = create_array((10, 20, 30), (1, 10, 200)) + subset = create_subset([(5, 5, 1), (10, 10, 1), (15, 15, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_c_full_range_all_dimensions(): + """Test C array with full range in all dimensions - should be contiguous""" + array = create_array((10, 20, 30), (600, 30, 1)) + subset = create_subset([(0, 9, 1), (0, 19, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_c_partial_last_dim_rest_size_one(): + """Test C array with partial last dim, rest size 1 - should be contiguous""" + array = create_array((10, 20, 30), (600, 30, 1)) + subset = create_subset([(0, 0, 1), (0, 0, 1), (6, 14, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_c_partial_last_dim_rest_full(): + """Test C array with partial last dim, rest full - should NOT be contiguous""" + array = create_array((10, 
20, 30), (600, 30, 1)) + subset = create_subset([(0, 9, 1), (0, 19, 1), (0, 14, 1)]) + result = subset.is_contiguous_subset(array) + assert result is False + + +def test_c_full_last_partial_second_rest_size_one(): + """Test C array with full last, partial second, rest size 1 - should be contiguous""" + array = create_array((10, 20, 30), (600, 30, 1)) + subset = create_subset([(0, 0, 1), (0, 9, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_c_full_last_partial_second_rest_full(): + """Test C array with full last, partial second, rest full - should NOT be contiguous""" + array = create_array((10, 20, 30), (600, 30, 1)) + subset = create_subset([(0, 9, 1), (0, 9, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is False + + +def test_c_partial_first_dim_only(): + """Test C array with only first dim partial - should be contiguous""" + array = create_array((10, 20, 30), (600, 30, 1)) + subset = create_subset([(0, 4, 1), (0, 19, 1), (0, 29, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_c_single_element_all_dims(): + """Test C array with single element (all dims size 1) - should be contiguous""" + array = create_array((10, 20, 30), (600, 30, 1)) + subset = create_subset([(5, 5, 1), (10, 10, 1), (15, 15, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_2d_full_first_partial_second(): + """Test Fortran 2D array with full first, partial second - should be contiguous""" + array = create_array((50, 100), (1, 50)) + subset = create_subset([(0, 49, 1), (10, 89, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_2d_partial_first_full_second(): + """Test Fortran 2D array with partial first, full second - should NOT be contiguous""" + array = create_array((50, 100), (1, 50)) + subset = create_subset([(10, 39, 1), (0, 99, 1)]) + result = subset.is_contiguous_subset(array) + assert result is False + + +def test_c_2d_full_first_partial_second(): + """Test C 2D array with full first, partial second - should NOT be contiguous""" + array = create_array((50, 100), (100, 1)) + subset = create_subset([(0, 49, 1), (10, 89, 1)]) + result = subset.is_contiguous_subset(array) + assert result is False + + +def test_c_2d_partial_first_full_second(): + """Test C 2D array with partial first, full second - should be contiguous""" + array = create_array((50, 100), (100, 1)) + subset = create_subset([(10, 39, 1), (0, 99, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_1d_full_range(): + """Test 1D array with full range - should be contiguous""" + array = create_array((100, ), (1, )) + subset = create_subset([(0, 99, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_1d_partial_range(): + """Test 1D array with partial range - should be contiguous""" + array = create_array((100, ), (1, )) + subset = create_subset([(25, 74, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_fortran_4d_complex_contiguous(): + """Test Fortran 4D array - full first two dims, partial third, size 1 last""" + array = create_array((5, 10, 15, 20), (1, 5, 50, 750)) + subset = create_subset([(0, 4, 1), (0, 9, 1), (0, 7, 1), (0, 0, 1)]) + result = subset.is_contiguous_subset(array) + assert result is True + + +def test_c_4d_complex_contiguous(): + """Test C 4D array - full last two dims, partial second, size 1 first""" + 
array = create_array((5, 10, 15, 20), (3000, 300, 20, 1))
+    subset = create_subset([(0, 0, 1), (0, 4, 1), (0, 14, 1), (0, 19, 1)])
+    result = subset.is_contiguous_subset(array)
+    assert result is True
+
+
+def test_fortran_4d_non_contiguous():
+    """Test Fortran 4D array - partial middle dims without trailing size 1"""
+    array = create_array((5, 10, 15, 20), (1, 5, 50, 750))
+    subset = create_subset([(0, 4, 1), (0, 4, 1), (0, 7, 1), (0, 10, 1)])
+    result = subset.is_contiguous_subset(array)
+    assert result is False
+
+
+def test_fortran_4d_contiguous_first_partial_second_rest_one():
+    """Test Fortran 4D array - full first dim, partial second, trailing dims size 1"""
+    array = create_array((5, 10, 15, 20), (1, 5, 50, 750))
+    subset = create_subset([(0, 4, 1), (0, 4, 1), (1, 1, 1), (1, 1, 1)])
+    result = subset.is_contiguous_subset(array)
+    assert result is True
+
+
+def test_c_4d_non_contiguous():
+    """Test C 4D array - partial middle dims without leading size 1"""
+    array = create_array((5, 10, 15, 20), (3000, 300, 20, 1))
+    subset = create_subset([(0, 2, 1), (0, 4, 1), (0, 7, 1), (0, 19, 1)])
+    result = subset.is_contiguous_subset(array)
+    assert result is False
+
+
+def test_c_4d_contiguous_first_partial_second_rest_one():
+    """Test C 4D array - full last dim, partial third, leading dims size 1"""
+    array = create_array((5, 10, 15, 20), (3000, 300, 20, 1))
+    subset = create_subset([(1, 1, 1), (1, 1, 1), (0, 5, 1), (0, 19, 1)])
+    result = subset.is_contiguous_subset(array)
+    assert result is True
+
+
+def test_non_standard_strides_returns_false():
+    """Test array with non-standard strides - should return False"""
+    array = create_array((10, 20, 30), (2, 20, 400))  # Custom strides
+    subset = create_subset([(0, 9, 1), (0, 19, 1), (0, 29, 1)])
+
+    result = subset.is_contiguous_subset(array)
+    assert result is False
+
+
+if __name__ == "__main__":
+    tests = [obj for name, obj in globals().items() if callable(obj) and name.startswith("test_")]
+    for test_function in tests:
+        test_function()