Skip to content

Commit 5ffe768

Browse files
committed
Add 3D-4D broadcasting implementation and tests
1 parent d71d388 commit 5ffe768

File tree

3 files changed

+252
-35
lines changed

3 files changed

+252
-35
lines changed

dace/libraries/blas/nodes/batched_matmul.py

Lines changed: 93 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -86,15 +86,22 @@ def make_sdfg(node, parent_state, parent_sdfg):
8686
if len(array_a.shape) == 2:
8787
memlet_a = '__im, __ik'
8888
else:
89-
# Use output batch indices
90-
a_batch_indices = ', '.join(['__i%d' % i for i in range(len(array_a.shape) - 2)])
89+
# Align input batch dims to output batch dims
90+
num_a_batch = len(array_a.shape) - 2
91+
# Start from the rightmost batch dimension of output and work backwards
92+
offset = num_batch_dims - num_a_batch
93+
a_batch_indices = ', '.join(['__i%d' % (offset + i) for i in range(num_a_batch)])
9194
memlet_a = f'{a_batch_indices}, __im, __ik'
9295

9396
# For B: if 2D, use [K, N]; if 3D+, use [batch_indices..., K, N]
9497
if len(array_b.shape) == 2:
9598
memlet_b = '__ik, __in'
9699
else:
97-
b_batch_indices = ', '.join(['__i%d' % i for i in range(len(array_b.shape) - 2)])
100+
# Align input batch dims to output batch dims
101+
num_b_batch = len(array_b.shape) - 2
102+
# Start from the rightmost batch dimension of output and work backwards
103+
offset = num_batch_dims - num_b_batch
104+
b_batch_indices = ', '.join(['__i%d' % (offset + i) for i in range(num_b_batch)])
98105
memlet_b = f'{b_batch_indices}, __ik, __in'
99106

100107
# For C: always has batch dimensions
@@ -172,8 +179,11 @@ def expansion(node, state, sdfg):
172179
const {dtype}** __mkl_BMM_B = new const {dtype}*[{BATCH}];
173180
{dtype}** __mkl_BMM_C = new {dtype}*[{BATCH}];
174181
for (int __ib = 0; __ib < {BATCH}; __ib++) {{
175-
__mkl_BMM_A[__ib] = (({dtype}*){x}) + __ib*{stride_a};
176-
__mkl_BMM_B[__ib] = (({dtype}*){y}) + __ib*{stride_b};
182+
// Handle broadcasting - compute correct index for inputs with fewer batch dimensions
183+
int __a_idx = ({stride_a} > 0) ? (({a_batch_size} < {BATCH}) ? (__ib % {a_batch_size}) : __ib) : 0;
184+
int __b_idx = ({stride_b} > 0) ? (({b_batch_size} < {BATCH}) ? (__ib % {b_batch_size}) : __ib) : 0;
185+
__mkl_BMM_A[__ib] = (({dtype}*){x}) + __a_idx*{stride_a};
186+
__mkl_BMM_B[__ib] = (({dtype}*){y}) + __b_idx*{stride_b};
177187
__mkl_BMM_C[__ib] = (({dtype}*)_c) + __ib*{stride_c};
178188
}}
179189
@@ -227,9 +237,12 @@ def expansion(node, state, sdfg):
227237

228238
code = '''
229239
for (int __ib = 0; __ib < {BATCH}; ++__ib) {{
240+
// Handle broadcasting - compute correct index for inputs with fewer batch dimensions
241+
int __a_idx = ({stride_a} > 0) ? (({a_batch_size} < {BATCH}) ? (__ib % {a_batch_size}) : __ib) : 0;
242+
int __b_idx = ({stride_b} > 0) ? (({b_batch_size} < {BATCH}) ? (__ib % {b_batch_size}) : __ib) : 0;
230243
cblas_{func}(CblasColMajor, {ta}, {tb}, {M}, {N}, {K}, {alpha},
231-
(({dtype}*){x}) + __ib*{stride_a}, {lda},
232-
(({dtype}*){y}) + __ib*{stride_b}, {ldb},
244+
(({dtype}*){x}) + __a_idx*{stride_a}, {lda},
245+
(({dtype}*){y}) + __b_idx*{stride_b}, {ldb},
233246
{beta},
234247
(({dtype}*)_c) + __ib*{stride_c}, {ldc});
235248
}}'''.format_map(opt)
@@ -325,17 +338,38 @@ def expansion(node, state, sdfg):
325338
opt = _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta, cdtype, func)
326339
opt['array_prefix'] = '_' if needs_copy else ''
327340

341+
# Check if we need broadcasting (non-uniform strides)
342+
needs_broadcasting = (opt.get('a_batch_size') and opt.get('b_batch_size')
343+
and (opt['a_batch_size'] != opt['BATCH'] or opt['b_batch_size'] != opt['BATCH']))
344+
328345
# Matrix multiplication
329346
if (node.compute_type is None and node.accumulator_type is None and node.algorithm is None):
330-
call = '''cublas{func}StridedBatched(__dace_cublas_handle,
331-
CUBLAS_OP_{ta}, CUBLAS_OP_{tb},
332-
{M}, {N}, {K},
333-
{alpha},
334-
({dtype}*){array_prefix}{x}, {lda}, {stride_a},
335-
({dtype}*){array_prefix}{y}, {ldb}, {stride_b},
336-
{beta},
337-
({dtype}*){array_prefix}_c, {ldc}, {stride_c},
338-
{BATCH});'''.format_map(opt)
347+
if needs_broadcasting:
348+
# Use manual loop for broadcasting cases
349+
call = '''
350+
for (int __ib = 0; __ib < {BATCH}; ++__ib) {{
351+
int __a_idx = ({stride_a} > 0) ? (({a_batch_size} < {BATCH}) ? (__ib % {a_batch_size}) : __ib) : 0;
352+
int __b_idx = ({stride_b} > 0) ? (({b_batch_size} < {BATCH}) ? (__ib % {b_batch_size}) : __ib) : 0;
353+
cublas{func}(__dace_cublas_handle,
354+
CUBLAS_OP_{ta}, CUBLAS_OP_{tb},
355+
{M}, {N}, {K},
356+
{alpha},
357+
({dtype}*){array_prefix}{x} + __a_idx*{stride_a}, {lda},
358+
({dtype}*){array_prefix}{y} + __b_idx*{stride_b}, {ldb},
359+
{beta},
360+
({dtype}*){array_prefix}_c + __ib*{stride_c}, {ldc});
361+
}}'''.format_map(opt)
362+
else:
363+
# Use StridedBatched for uniform case
364+
call = '''cublas{func}StridedBatched(__dace_cublas_handle,
365+
CUBLAS_OP_{ta}, CUBLAS_OP_{tb},
366+
{M}, {N}, {K},
367+
{alpha},
368+
({dtype}*){array_prefix}{x}, {lda}, {stride_a},
369+
({dtype}*){array_prefix}{y}, {ldb}, {stride_b},
370+
{beta},
371+
({dtype}*){array_prefix}_c, {ldc}, {stride_c},
372+
{BATCH});'''.format_map(opt)
339373
else:
340374
if node.compute_type is not None:
341375
acctype = node.compute_type
@@ -349,24 +383,49 @@ def expansion(node, state, sdfg):
349383
if node.algorithm is not None:
350384
algorithm = node.algorithm
351385

352-
call = f'''
353-
cublasGemmStridedBatchedEx(__dace_cublas_handle,
354-
CUBLAS_OP_{opt['ta']}, CUBLAS_OP_{opt['tb']},
355-
{opt['M']}, {opt['N']}, {opt['K']},
356-
{alpha},
357-
{opt['array_prefix']}{opt['x']},
358-
{dtype_to_cudadatatype(opt['xdtype'])},
359-
{opt['lda']}, {opt['stride_a']},
360-
{opt['array_prefix']}{opt['y']},
361-
{dtype_to_cudadatatype(opt['ydtype'])},
362-
{opt['ldb']}, {opt['stride_b']},
363-
{beta},
364-
{opt['array_prefix']}_c,
365-
{dtype_to_cudadatatype(opt['cdtype'])},
366-
{opt['ldc']}, {opt['stride_c']},
367-
{opt['BATCH']},
368-
{acctype}, {algorithm});
369-
'''
386+
if needs_broadcasting:
387+
# Use manual loop for broadcasting cases with GemmEx
388+
call = f'''
389+
for (int __ib = 0; __ib < {opt['BATCH']}; ++__ib) {{{{
390+
int __a_idx = ({opt['stride_a']} > 0) ? (({opt['a_batch_size']} < {opt['BATCH']}) ? (__ib % {opt['a_batch_size']}) : __ib) : 0;
391+
int __b_idx = ({opt['stride_b']} > 0) ? (({opt['b_batch_size']} < {opt['BATCH']}) ? (__ib % {opt['b_batch_size']}) : __ib) : 0;
392+
cublasGemmEx(__dace_cublas_handle,
393+
CUBLAS_OP_{opt['ta']}, CUBLAS_OP_{opt['tb']},
394+
{opt['M']}, {opt['N']}, {opt['K']},
395+
{alpha},
396+
{opt['array_prefix']}{opt['x']} + __a_idx*{opt['stride_a']},
397+
{dtype_to_cudadatatype(opt['xdtype'])},
398+
{opt['lda']},
399+
{opt['array_prefix']}{opt['y']} + __b_idx*{opt['stride_b']},
400+
{dtype_to_cudadatatype(opt['ydtype'])},
401+
{opt['ldb']},
402+
{beta},
403+
{opt['array_prefix']}_c + __ib*{opt['stride_c']},
404+
{dtype_to_cudadatatype(opt['cdtype'])},
405+
{opt['ldc']},
406+
{acctype}, {algorithm});
407+
}}}}
408+
'''
409+
else:
410+
# Use StridedBatchedEx for uniform case
411+
call = f'''
412+
cublasGemmStridedBatchedEx(__dace_cublas_handle,
413+
CUBLAS_OP_{opt['ta']}, CUBLAS_OP_{opt['tb']},
414+
{opt['M']}, {opt['N']}, {opt['K']},
415+
{alpha},
416+
{opt['array_prefix']}{opt['x']},
417+
{dtype_to_cudadatatype(opt['xdtype'])},
418+
{opt['lda']}, {opt['stride_a']},
419+
{opt['array_prefix']}{opt['y']},
420+
{dtype_to_cudadatatype(opt['ydtype'])},
421+
{opt['ldb']}, {opt['stride_b']},
422+
{beta},
423+
{opt['array_prefix']}_c,
424+
{dtype_to_cudadatatype(opt['cdtype'])},
425+
{opt['ldc']}, {opt['stride_c']},
426+
{opt['BATCH']},
427+
{acctype}, {algorithm});
428+
'''
370429

371430
code = call_prefix + call + call_suffix
372431
tasklet = dace.sdfg.nodes.Tasklet(node.name,

dace/libraries/blas/nodes/matmul.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,13 @@ def _get_batchmm_opts(a_shape, a_strides, b_shape, b_strides, c_shape, c_strides
104104
# Calculate strides for batched operations
105105
# For a tensor with shape [B1, B2, ..., M, K], the stride for batched operations
106106
# should be M*K (the size of each matrix) to iterate through all matrices in the flattened batch
107+
#
108+
# For broadcasting cases (e.g., A - [b1, b2, m, k] @ B - [b2, k, n]):
109+
# - The flattened batch is b1*b2
110+
# - B needs special handling: we need to compute which of the b2 matrices to use
111+
# For batch index i in [0, b1*b2), the B matrix index is (i % b2)
112+
# This can be expressed as: if A has more batch dims than B, use modulo arithmetic
113+
107114
stride_a = 0
108115
stride_b = 0
109116
stride_c = 0
@@ -125,10 +132,35 @@ def _get_batchmm_opts(a_shape, a_strides, b_shape, b_strides, c_shape, c_strides
125132
if res is False:
126133
raise ValueError(f'Output batch dimension mismatch: {c_dim} vs {r_dim} at position {i}')
127134

135+
# For partial broadcasting (3D-4D cases), we need to track additional information
136+
# to properly index into the smaller batch dimension tensor
137+
a_batch_multiplier = 1 # How many times to cycle through A's batch
138+
b_batch_multiplier = 1 # How many times to cycle through B's batch
139+
140+
if len(a_batch_dims) < len(result_batch_dims):
141+
# A has fewer batch dimensions, so it will be broadcast
142+
# Calculate the size of the leading dimensions that A doesn't have
143+
a_batch_multiplier = prod(result_batch_dims[:len(result_batch_dims) - len(a_batch_dims)])
144+
145+
if len(b_batch_dims) < len(result_batch_dims):
146+
# B has fewer batch dimensions, so it will be broadcast
147+
# Calculate the size of the leading dimensions that B doesn't have
148+
b_batch_multiplier = prod(result_batch_dims[:len(result_batch_dims) - len(b_batch_dims)])
149+
128150
if batch_size == 1 and not result_batch_dims:
129151
return {}
130152

131-
return {'sa': stride_a, 'sb': stride_b, 'sc': stride_c, 'b': batch_size, 'batch_dims': result_batch_dims}
153+
return {
154+
'sa': stride_a,
155+
'sb': stride_b,
156+
'sc': stride_c,
157+
'b': batch_size,
158+
'batch_dims': result_batch_dims,
159+
'a_batch_size': prod(a_batch_dims) if a_batch_dims else 1,
160+
'b_batch_size': prod(b_batch_dims) if b_batch_dims else 1,
161+
'a_batch_multiplier': a_batch_multiplier,
162+
'b_batch_multiplier': b_batch_multiplier
163+
}
132164

133165

134166
def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta, cdtype, func) -> Dict[str, Any]:
@@ -165,6 +197,7 @@ def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta,
165197
if opt['swap']:
166198
if bopt:
167199
bopt['sa'], bopt['sb'] = bopt['sb'], bopt['sa']
200+
bopt['a_batch_size'], bopt['b_batch_size'] = bopt['b_batch_size'], bopt['a_batch_size']
168201
opt['lda'], opt['ldb'] = opt['ldb'], opt['lda']
169202
opt['x'], opt['y'] = opt['y'], opt['x']
170203
opt['xdtype'], opt['ydtype'] = opt['ydtype'], opt['xdtype']
@@ -180,6 +213,8 @@ def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta,
180213
opt['stride_b'] = sym2cpp(bopt['sb'])
181214
opt['stride_c'] = sym2cpp(bopt['sc'])
182215
opt['BATCH'] = sym2cpp(bopt['b'])
216+
opt['a_batch_size'] = sym2cpp(bopt['a_batch_size'])
217+
opt['b_batch_size'] = sym2cpp(bopt['b_batch_size'])
183218
else:
184219
opt['BATCH'] = None
185220

tests/library/batched_matmul_test.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,111 @@ def bmm_4d_broadcast(A: dtype[m, k], B: dtype[b1, b2, k, n], C: dtype[b1, b2, m,
224224
assert np.allclose(ref, z)
225225

226226

227+
@pytest.mark.parametrize("implementation, dtype", [
228+
pytest.param("pure", dace.float32),
229+
pytest.param("pure", dace.float64),
230+
pytest.param("MKL", dace.float32, marks=pytest.mark.mkl),
231+
pytest.param("MKL", dace.float64, marks=pytest.mark.mkl),
232+
pytest.param("cuBLAS", dace.float32, marks=pytest.mark.gpu),
233+
pytest.param("cuBLAS", dace.float64, marks=pytest.mark.gpu),
234+
pytest.param("OpenBLAS", dace.float32, marks=pytest.mark.lapack),
235+
pytest.param("OpenBLAS", dace.float64, marks=pytest.mark.lapack)
236+
])
237+
def test_batchmm_3d_4d_broadcast(implementation: str, dtype):
238+
"""Test 4D batched matmul with broadcast on LHS: [b2, m, k] @ [b1, b2, k, n]"""
239+
b1, b2, m, n, k = 4, 2, 64, 128, 64
240+
241+
@dace.program
242+
def bmm_3d_4d_broadcast(A: dtype[b2, m, k], B: dtype[b1, b2, k, n], C: dtype[b1, b2, m, n]):
243+
C[:] = A @ B
244+
245+
with change_default(blas, implementation):
246+
sdfg = bmm_3d_4d_broadcast.to_sdfg()
247+
sdfg.simplify()
248+
sdfg.expand_library_nodes()
249+
250+
x = np.random.rand(b2, m, k).astype(dtype.as_numpy_dtype())
251+
y = np.random.rand(b1, b2, k, n).astype(dtype.as_numpy_dtype())
252+
z = np.zeros([b1, b2, m, n]).astype(dtype.as_numpy_dtype())
253+
254+
csdfg = sdfg.compile()
255+
csdfg(A=x, B=y, C=z)
256+
257+
ref = x @ y
258+
259+
assert np.allclose(ref, z)
260+
261+
262+
@pytest.mark.parametrize("implementation, dtype", [
263+
pytest.param("pure", dace.float32),
264+
pytest.param("pure", dace.float64),
265+
pytest.param("MKL", dace.float32, marks=pytest.mark.mkl),
266+
pytest.param("MKL", dace.float64, marks=pytest.mark.mkl),
267+
pytest.param("cuBLAS", dace.float32, marks=pytest.mark.gpu),
268+
pytest.param("cuBLAS", dace.float64, marks=pytest.mark.gpu),
269+
pytest.param("OpenBLAS", dace.float32, marks=pytest.mark.lapack),
270+
pytest.param("OpenBLAS", dace.float64, marks=pytest.mark.lapack)
271+
])
272+
def test_batchmm_4d_3d_broadcast(implementation: str, dtype):
273+
"""Test 4D batched matmul with broadcast on RHS: [b1, b2, m, k] @ [b2, k, n]"""
274+
b1, b2, m, n, k = 4, 2, 64, 128, 64
275+
276+
@dace.program
277+
def bmm_4d_3d_broadcast(A: dtype[b1, b2, m, k], B: dtype[b2, k, n], C: dtype[b1, b2, m, n]):
278+
C[:] = A @ B
279+
280+
with change_default(blas, implementation):
281+
sdfg = bmm_4d_3d_broadcast.to_sdfg()
282+
sdfg.simplify()
283+
sdfg.expand_library_nodes()
284+
285+
x = np.random.rand(b1, b2, m, k).astype(dtype.as_numpy_dtype())
286+
y = np.random.rand(b2, k, n).astype(dtype.as_numpy_dtype())
287+
z = np.zeros([b1, b2, m, n]).astype(dtype.as_numpy_dtype())
288+
289+
csdfg = sdfg.compile()
290+
csdfg(A=x, B=y, C=z)
291+
292+
ref = x @ y
293+
294+
assert np.allclose(ref, z)
295+
296+
297+
@pytest.mark.parametrize("implementation, dtype", [
298+
pytest.param("pure", dace.float32),
299+
pytest.param("pure", dace.float64),
300+
pytest.param("MKL", dace.float32, marks=pytest.mark.mkl),
301+
pytest.param("MKL", dace.float64, marks=pytest.mark.mkl),
302+
pytest.param("cuBLAS", dace.float32, marks=pytest.mark.gpu),
303+
pytest.param("cuBLAS", dace.float64, marks=pytest.mark.gpu),
304+
pytest.param("OpenBLAS", dace.float32, marks=pytest.mark.lapack),
305+
pytest.param("OpenBLAS", dace.float64, marks=pytest.mark.lapack)
306+
])
307+
def test_batchmm_5d_3d_broadcast(implementation: str, dtype):
308+
"""Test 5D batched matmul with broadcast on RHS: [b1, b2, b3, m, k] @ [b3, k, n]"""
309+
b1, b2, b3, m, n, k = 4, 2, 3, 64, 128, 64
310+
311+
@dace.program
312+
def bmm_5d_3d_broadcast(A: dtype[b1, b2, b3, m, k], B: dtype[b3, k, n], C: dtype[b1, b2, b3, m, n]):
313+
C[:] = A @ B
314+
315+
with change_default(blas, implementation):
316+
sdfg = bmm_5d_3d_broadcast.to_sdfg()
317+
sdfg.simplify()
318+
sdfg.expand_library_nodes()
319+
320+
x = np.random.rand(b1, b2, b3, m, k).astype(dtype.as_numpy_dtype())
321+
y = np.random.rand(b3, k, n).astype(dtype.as_numpy_dtype())
322+
z = np.zeros([b1, b2, b3, m, n]).astype(dtype.as_numpy_dtype())
323+
324+
csdfg = sdfg.compile()
325+
csdfg(A=x, B=y, C=z)
326+
327+
ref = x @ y
328+
329+
assert np.allclose(ref, z)
330+
331+
227332
if __name__ == "__main__":
228333
test_batchmm("pure", dace.float32)
229334
test_batchmm("pure", dace.float64)
@@ -261,3 +366,21 @@ def bmm_4d_broadcast(A: dtype[m, k], B: dtype[b1, b2, k, n], C: dtype[b1, b2, m,
261366
test_batchmm_4d_broadcast_lhs("MKL", dace.float64)
262367
test_batchmm_4d_broadcast_lhs("cuBLAS", dace.float32)
263368
test_batchmm_4d_broadcast_lhs("cuBLAS", dace.float64)
369+
test_batchmm_3d_4d_broadcast("pure", dace.float32)
370+
test_batchmm_3d_4d_broadcast("pure", dace.float64)
371+
test_batchmm_3d_4d_broadcast("MKL", dace.float32)
372+
test_batchmm_3d_4d_broadcast("MKL", dace.float64)
373+
test_batchmm_3d_4d_broadcast("cuBLAS", dace.float32)
374+
test_batchmm_3d_4d_broadcast("cuBLAS", dace.float64)
375+
test_batchmm_4d_3d_broadcast("pure", dace.float32)
376+
test_batchmm_4d_3d_broadcast("pure", dace.float64)
377+
test_batchmm_4d_3d_broadcast("MKL", dace.float32)
378+
test_batchmm_4d_3d_broadcast("MKL", dace.float64)
379+
test_batchmm_4d_3d_broadcast("cuBLAS", dace.float32)
380+
test_batchmm_4d_3d_broadcast("cuBLAS", dace.float64)
381+
test_batchmm_5d_3d_broadcast("pure", dace.float32)
382+
test_batchmm_5d_3d_broadcast("pure", dace.float64)
383+
test_batchmm_5d_3d_broadcast("MKL", dace.float32)
384+
test_batchmm_5d_3d_broadcast("MKL", dace.float64)
385+
test_batchmm_5d_3d_broadcast("cuBLAS", dace.float32)
386+
test_batchmm_5d_3d_broadcast("cuBLAS", dace.float64)

0 commit comments

Comments (0)