Skip to content

Commit 563fc7c

Browse files
committed
build for sm120a
1 parent 1239842 commit 563fc7c

File tree

2 files changed

+7
-4
lines changed

2 files changed

+7
-4
lines changed

setup.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ def get_extensions():
515515
"-DCUTE_USE_PACKED_TUPLE=1",
516516
"-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
517517
"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
518-
"-DCUTLASS_DEBUG_TRACE_LEVEL=0",
518+
"-DCUTLASS_DEBUG_TRACE_LEVEL=1",
519519
"--ftemplate-backtrace-limit=0",
520520
# "--keep",
521521
# "--ptxas-options=--verbose,--register-usage-level=5,--warn-on-local-memory-usage",
@@ -526,6 +526,7 @@ def get_extensions():
526526
)
527527

528528
build_for_sm90a, build_for_sm100a = get_cutlass_build_flags()
529+
build_for_sm100a = True
529530
# Define sm90a sources
530531
cutlass_90a_sources = [
531532
os.path.join(
@@ -623,7 +624,8 @@ def get_extensions():
623624
cutlass_100a_extra_compile_args = copy.deepcopy(extra_compile_args)
624625
# Only use sm100a architecture for these sources, ignoring cuda_arch_flags
625626
cutlass_100a_extra_compile_args["nvcc"].append(
626-
"-gencode=arch=compute_100a,code=sm_100a"
627+
# "-gencode=arch=compute_100a,code=sm_100a"
628+
"-gencode=arch=compute_120a,code=sm_120a",
627629
)
628630
ext_modules.append(
629631
extension(

torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ void run_gemm(at::Tensor& a, at::Tensor& b, at::Tensor& a_scale,
6969
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
7070
// Kernel functional config
7171
using ElementAccumulator = float; // Element type for internal accumulation
72-
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
72+
using ArchTag = cutlass::arch::Sm120; // Tag indicating the minimum SM that supports the intended feature
7373
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
7474

7575

@@ -241,7 +241,8 @@ at::Tensor mx_fp4_bf16(at::Tensor a, at::Tensor b, at::Tensor a_scale,
241241
using ElementD = cutlass::bfloat16_t;
242242

243243
using MmaTileShape = Shape<_128,_128,_128>;
244-
using ClusterShape = Shape<_2,_1,_1>;
244+
// using ClusterShape = Shape<_2,_1,_1>;
245+
using ClusterShape = Shape<_1,_1,_1>;
245246
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
246247

247248
run_gemm<ElementA, ElementB, ElementD, MmaTileShape, ClusterShape, PerSmTileShape_MNK>(a, b, a_scale, b_scale, out, M, K, N);

0 commit comments

Comments (0)