Fix elements_per_thread propagation to ignore memory operands

tyb0807 · tyb0807 · commit 712f22e839a4 · 2025-12-23T01:41:26.000+01:00
Changes: - ReadOp: Only propagate attribute to result (register), ignore memory - WriteOp: Only validate/propagate with register operand, ignore memory This fixes false positives where memory resharding was incorrectly flagged as propagation errors. Fixes #622. Signed-off-by: tyb0807 <sontuan.vu@amd.com>
diff --git a/water/include/water/Dialect/Wave/IR/WaveOps.td b/water/include/water/Dialect/Wave/IR/WaveOps.td
@@ -280,7 +280,7 @@ def ExtractSliceOp : WaveOp<"extract_slice", [WaveInferTypeOpInterface, Identity
 
 def ReadOp : WaveOp<"read", [
     WaveInferTypeOpInterface, IdentityTypeInferenceOpTrait,
-    WaveElementsPerThreadOpInterface, AttrBasedElementsPerThreadOpTrait,
+    DeclareOpInterfaceMethods<WaveElementsPerThreadOpInterface>,
     CompatibleOperandsAndResultsIgnoreSpaceOpTrait,
     WaveInferIndexExprsOpInterface, IdentityIndexExprsOpTrait]> {
   let summary = "Reads from memory";
@@ -334,7 +334,7 @@ def RegisterOp : WaveOp<"register", [
 
 def WriteOp : WaveOp<"write", [
     WaveInferTypeOpInterface, NoOpTypeInferenceOpTrait,
-    WaveElementsPerThreadOpInterface, AttrBasedElementsPerThreadOpTrait,
+    DeclareOpInterfaceMethods<WaveElementsPerThreadOpInterface>,
     CompatibleOperandsAndResultsIgnoreSpaceOpTrait,
     DeclareOpInterfaceMethods<WaveInferIndexExprsOpInterface>]> {
   let summary = "Writes into memory";
diff --git a/water/lib/Dialect/Wave/IR/WaveOps.cpp b/water/lib/Dialect/Wave/IR/WaveOps.cpp
@@ -1321,6 +1321,32 @@ LogicalResult ReadOp::verify() {
                                bounds.getMapping());
 }
 
+llvm::FailureOr<mlir::ChangeResult> wave::ReadOp::propagateElementsPerThreadForward(
+    llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>,
+    llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> resultElements,
+    llvm::raw_ostream &errs) {
+  // Forward propagation uses only the `elements_per_thread` attribute as the
+  // expected value for the (register) result. The operand lattice is left
+  // unnamed and unused: reads need not match how the memory was written.
+  std::optional<int64_t> elementsPerThread = getElementsPerThread();
+  if (!elementsPerThread)
+    return mlir::ChangeResult::NoChange;
+
+  wave::ElementsPerThreadLatticeValue expectedResult(*elementsPerThread);
+  return wave::detail::checkAndPropagateElementsPerThreadFromConstant(
+      expectedResult, llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(),
+      resultElements, "elements_per_thread attribute", "", "result", errs);
+}
+
+llvm::FailureOr<mlir::ChangeResult> wave::ReadOp::propagateElementsPerThreadBackward(
+    llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue>,
+    llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>,
+    llvm::raw_ostream &) {
+  // Intentionally a no-op: nothing is propagated backward to the operands.
+  // Memory is decoupled from register dataflow for elements_per_thread.
+  return mlir::ChangeResult::NoChange;
+}
+
 //-----------------------------------------------------------------------------
 // RegisterOp
 //-----------------------------------------------------------------------------
@@ -1402,6 +1428,46 @@ LogicalResult WriteOp::verify() {
                                bounds.getMapping());
 }
 
+llvm::FailureOr<mlir::ChangeResult> wave::WriteOp::propagateElementsPerThreadForward(
+    llvm::ArrayRef<wave::ElementsPerThreadLatticeValue> operandElements,
+    llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue>,
+    llvm::raw_ostream &errs) {
+  // Forward direction: only the first operand (value_to_store) is checked
+  // against the `elements_per_thread` attribute; the memory operand is skipped.
+  std::optional<int64_t> elementsPerThread = getElementsPerThread();
+  if (!elementsPerThread)
+    return mlir::ChangeResult::NoChange;
+
+  // Check that the register operand (value_to_store) matches the attribute.
+  wave::ElementsPerThreadLatticeValue expectedValue(*elementsPerThread);
+  llvm::ArrayRef<wave::ElementsPerThreadLatticeValue> valueOnly =
+      operandElements.slice(0, 1); // Only the first operand (value_to_store).
+
+  return wave::detail::checkAndPropagateElementsPerThreadFromConstant(
+      expectedValue, valueOnly, llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue>(),
+      "elements_per_thread attribute", "register operand", "", errs);
+}
+
+llvm::FailureOr<mlir::ChangeResult> wave::WriteOp::propagateElementsPerThreadBackward(
+    llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> operandElements,
+    llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>,
+    llvm::raw_ostream &errs) {
+  // Backward direction: the `elements_per_thread` attribute is propagated to
+  // the first operand (value_to_store) only; the memory operand is skipped.
+  std::optional<int64_t> elementsPerThread = getElementsPerThread();
+  if (!elementsPerThread)
+    return mlir::ChangeResult::NoChange;
+
+  // Propagate to the register operand only.
+  wave::ElementsPerThreadLatticeValue expectedValue(*elementsPerThread);
+  llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> valueOnly =
+      operandElements.slice(0, 1); // Only the first operand (value_to_store).
+
+  return wave::detail::checkAndPropagateElementsPerThreadFromConstant(
+      expectedValue, llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(),
+      valueOnly, "elements_per_thread attribute", "", "register operand", errs);
+}
+
 // Propagate index expressions forward from the operands to the result of the
 // WriteOp. Since WriteOp has no results, this is a no-op.
 llvm::FailureOr<mlir::ChangeResult> wave::WriteOp::propagateIndexExprsForward(
diff --git a/water/test/Dialect/Wave/propagate-elements-per-thread.mlir b/water/test/Dialect/Wave/propagate-elements-per-thread.mlir
@@ -100,7 +100,7 @@ func.func @missing_elements_per_thread(%mem: !wave.tensor<[@M] of f16, <global>>
 module attributes {wave.normal_form = #wave.normal_form<full_types>} {
 func.func @read_write_conflict(%mem: !wave.tensor<[@M] of f16, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 128}>}  {
   %reg = wave.read %mem {elements_per_thread = 4} : (!wave.tensor<[@M] of f16, <global>>) -> !wave.tensor<[@M] of f16, <register>>
-  // expected-error @below {{failed to propagate elements per thread backward: mismatch between elements_per_thread attribute (8) and operand #0 (4)}}
+  // expected-error @below {{failed to propagate elements per thread backward: mismatch between elements_per_thread attribute (8) and register operand #0 (4)}}
   wave.write %reg, %mem {elements_per_thread = 8} : !wave.tensor<[@M] of f16, <register>>, !wave.tensor<[@M] of f16, <global>>
   return
 }
@@ -112,7 +112,7 @@ module attributes {wave.normal_form = #wave.normal_form<full_types>} {
 func.func @read_write_conflict_indirect(%mem: !wave.tensor<[@M] of f16, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 128}>}  {
   %reg = wave.read %mem {elements_per_thread = 4} : (!wave.tensor<[@M] of f16, <global>>) -> !wave.tensor<[@M] of f16, <register>>
   %val = wave.exp2 %reg : (!wave.tensor<[@M] of f16, <register>>) -> !wave.tensor<[@M] of f16, <register>>
-  // expected-error @below {{failed to propagate elements per thread backward: mismatch between elements_per_thread attribute (8) and operand #0 (4)}}
+  // expected-error @below {{failed to propagate elements per thread backward: mismatch between elements_per_thread attribute (8) and register operand #0 (4)}}
   wave.write %reg, %mem {elements_per_thread = 8} : !wave.tensor<[@M] of f16, <register>>, !wave.tensor<[@M] of f16, <global>>
   return
 }
@@ -162,6 +162,66 @@ module {
 
 // -----
 
+// CHECK: #wave.normal_form<full_types,memory_only_types>
+module attributes {wave.normal_form = #wave.normal_form<full_types>} {
+// CHECK-LABEL: @memory_resharding_allowed
+func.func @memory_resharding_allowed(%mem: !wave.tensor<[@M] of f16, <shared>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 128}>} {
+  %cst = arith.constant 0.0 : f16
+  // The register carries no attribute itself; it gets 8 elements per thread via backward propagation from the write below.
+  // CHECK: wave.register {{.*}} : vector<8xf16>
+  %reg8 = wave.register %cst : !wave.tensor<[@M] of f16, <register>>
+
+  // Write 8 elements per thread to memory.
+  // CHECK: wave.write {{.*}} : vector<8xf16>, !wave.tensor<[@M] of f16, <shared>>
+  wave.write %reg8, %mem {elements_per_thread = 8} : !wave.tensor<[@M] of f16, <register>>, !wave.tensor<[@M] of f16, <shared>>
+
+  // Read 4 elements per thread from the same memory. The mismatch with the write above must be accepted (memory resharding).
+  // CHECK: wave.read {{.*}} : (!wave.tensor<[@M] of f16, <shared>>) -> vector<4xf16>
+  %reg4 = wave.read %mem {elements_per_thread = 4} : (!wave.tensor<[@M] of f16, <shared>>) -> !wave.tensor<[@M] of f16, <register>>
+
+  return
+}
+}
+
+// -----
+
+// CHECK: #wave.normal_form<full_types,memory_only_types>
+module attributes {wave.normal_form = #wave.normal_form<full_types>} {
+// CHECK-LABEL: @write_backward_propagation
+func.func @write_backward_propagation(%mem: !wave.tensor<[@M] of f16, <shared>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 128}>} {
+  %cst = arith.constant 0.0 : f16
+  // The register op has no explicit elements_per_thread; it should receive 4 via backward propagation from the write.
+  // CHECK: wave.register {{.*}} : vector<4xf16>
+  %reg = wave.register %cst : !wave.tensor<[@M] of f16, <register>>
+
+  // The write should propagate its elements_per_thread backward to the register operand.
+  // CHECK: wave.write {{.*}} : vector<4xf16>, !wave.tensor<[@M] of f16, <shared>>
+  wave.write %reg, %mem {elements_per_thread = 4} : !wave.tensor<[@M] of f16, <register>>, !wave.tensor<[@M] of f16, <shared>>
+
+  return
+}
+}
+
+// -----
+
+// CHECK: #wave.normal_form<full_types,memory_only_types>
+module attributes {wave.normal_form = #wave.normal_form<full_types>} {
+// CHECK-LABEL: @read_register_propagation
+func.func @read_register_propagation(%mem: !wave.tensor<[@M] of f16, <shared>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 128}>} {
+  // The read should propagate only to its register result; the memory operand is not validated.
+  // CHECK: wave.read {{.*}} : (!wave.tensor<[@M] of f16, <shared>>) -> vector<6xf16>
+  %reg = wave.read %mem {elements_per_thread = 6} : (!wave.tensor<[@M] of f16, <shared>>) -> !wave.tensor<[@M] of f16, <register>>
+
+  // The downstream operation should also get 6 elements per thread.
+  // CHECK: wave.exp2 {{.*}} : (vector<6xf16>) -> vector<6xf16>
+  %result = wave.exp2 %reg : (!wave.tensor<[@M] of f16, <register>>) -> !wave.tensor<[@M] of f16, <register>>
+
+  return
+}
+}
+
+// -----
+
 module attributes {wave.normal_form = #wave.normal_form<full_types>} {
 func.func @mma_uninitialized_lhs(%mem1: !wave.tensor<[@N, @K] of f16, <global>>, %mem2: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
   // LHS without elements_per_thread - this will remain uninitialized.