[RISCV][VLOPT] Remove unnecessary passthru restriction #124549
Conversation
We currently check for passthrus in two places: on the instruction to reduce in isCandidate, and on its users in checkUsers. We cannot reduce the VL of an instruction if one of its users takes it as a passthru operand, because that user reads the elements past VL in the tail. However, it is fine to reduce an instruction that itself has a non-undef passthru: since the VL can only be reduced, never increased, the previous tail always remains the same.
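To illustrate, here is a minimal IR sketch of the two cases, lifted from the new tests in vl-opt.ll below (different_imm_vl_with_tu and different_vl_as_passthru); each pair of calls comes from a separate test function, and the intrinsic, operands, and VL values are taken from those tests rather than invented here. In the first pair, %v has a non-undef passthru (%a) but its only user reads just 4 elements, so the VL of %v can be shrunk from 5 to 4. In the second pair, %w consumes %v as its passthru, which demands the elements of %v past VL, so its VL cannot be reduced.
; VL of %v can be reduced to 4: %v has a non-undef passthru (%a), but no user reads past VL.
  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5)
  %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
; VL of %v cannot be reduced: %w uses %v as its passthru, so the elements of %v past VL are demanded.
  %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
  %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl2)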
@llvm/pr-subscribers-backend-risc-v Author: Luke Lau (lukel97). Full diff: https://github.com/llvm/llvm-project/pull/124549.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 976c65e51c2059..be897bb2577961 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -1143,27 +1143,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
if (MI.getNumDefs() != 1)
return false;
- // If we're not using VLMAX, then we need to be careful whether we are using
- // TA/TU when there is a non-undef Passthru. But when we are using VLMAX, it
- // does not matter whether we are using TA/TU with a non-undef Passthru, since
- // there are no tail elements to be preserved.
unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
const MachineOperand &VLOp = MI.getOperand(VLOpNum);
- if (VLOp.isReg() || VLOp.getImm() != RISCV::VLMaxSentinel) {
- // If MI has a non-undef passthru, we will not try to optimize it since
- // that requires us to preserve tail elements according to TA/TU.
- // Otherwise, The MI has an undef Passthru, so it doesn't matter whether we
- // are using TA/TU.
- bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
- unsigned PassthruOpIdx = MI.getNumExplicitDefs();
- if (HasPassthru &&
- MI.getOperand(PassthruOpIdx).getReg() != RISCV::NoRegister) {
- LLVM_DEBUG(
- dbgs() << " Not a candidate because it uses non-undef passthru"
- " with non-VLMAX VL\n");
- return false;
- }
- }
// If the VL is 1, then there is no need to reduce it. This is an
// optimization, not needed to preserve correctness.
@@ -1247,7 +1228,7 @@ std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(MachineInstr &MI) {
return std::nullopt;
}
- // Tied operands might pass through.
+ // If used as a passthru, elements past VL will be read.
if (UserOp.isTied()) {
LLVM_DEBUG(dbgs() << " Abort because user used as tied operand\n");
return std::nullopt;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 49db94e1a02df1..9dbe261b7cd054 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -3919,11 +3919,12 @@ define void @trunc_v6bf16(ptr %x) {
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vfabs.v v8, v10
; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
@@ -4002,11 +4003,12 @@ define void @trunc_v6f16(ptr %x) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfabs.v v8, v10
; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t
; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: vse16.v v8, (a0)
; ZVFHMIN-NEXT: ret
@@ -4098,12 +4100,13 @@ define void @ceil_v6bf16(ptr %x) {
; CHECK-NEXT: vfabs.v v8, v10
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a1, 3
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
; CHECK-NEXT: fsrm a1
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
@@ -4189,12 +4192,13 @@ define void @ceil_v6f16(ptr %x) {
; ZVFHMIN-NEXT: vfabs.v v8, v10
; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
; ZVFHMIN-NEXT: fsrmi a1, 3
+; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
; ZVFHMIN-NEXT: fsrm a1
; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: vse16.v v8, (a0)
; ZVFHMIN-NEXT: ret
@@ -4290,12 +4294,13 @@ define void @floor_v6bf16(ptr %x) {
; CHECK-NEXT: vfabs.v v8, v10
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a1, 2
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
; CHECK-NEXT: fsrm a1
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
@@ -4381,12 +4386,13 @@ define void @floor_v6f16(ptr %x) {
; ZVFHMIN-NEXT: vfabs.v v8, v10
; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
; ZVFHMIN-NEXT: fsrmi a1, 2
+; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
; ZVFHMIN-NEXT: fsrm a1
; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: vse16.v v8, (a0)
; ZVFHMIN-NEXT: ret
@@ -4482,12 +4488,13 @@ define void @round_v6bf16(ptr %x) {
; CHECK-NEXT: vfabs.v v8, v10
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a1, 4
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
; CHECK-NEXT: fsrm a1
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
@@ -4573,12 +4580,13 @@ define void @round_v6f16(ptr %x) {
; ZVFHMIN-NEXT: vfabs.v v8, v10
; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
; ZVFHMIN-NEXT: fsrmi a1, 4
+; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
; ZVFHMIN-NEXT: fsrm a1
; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: vse16.v v8, (a0)
; ZVFHMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
index 1cc30f077feb4a..3e49da014d56fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
@@ -107,7 +107,8 @@ define <vscale x 4 x i32> @different_vl_with_ta(<vscale x 4 x i32> %a, <vscale x
ret <vscale x 4 x i32> %w
}
-; Test case to make sure VL won't propgate if using tail-undisturbed policy.
+; We can propagate VL to a tail-undisturbed policy, provided none of its users
+; are passthrus (i.e. read past VL).
define <vscale x 4 x i32> @different_vl_with_tu(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
; CHECK-LABEL: different_vl_with_tu:
; CHECK: # %bb.0:
@@ -118,22 +119,65 @@ define <vscale x 4 x i32> @different_vl_with_tu(<vscale x 4 x i32> %passthru, <v
; CHECK-NEXT: vadd.vv v8, v14, v10
; CHECK-NEXT: ret
%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
- %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen %vl2)
+ %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen %vl2)
ret <vscale x 4 x i32> %w
}
-; Test case to make sure VL won't propgate if using tail-undisturbed policy.
+; We can propagate VL to a tail-undisturbed policy, provided none of its users
+; are passthrus (i.e. read past VL).
define <vscale x 4 x i32> @different_imm_vl_with_tu(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
-; CHECK-LABEL: different_imm_vl_with_tu:
+; NOVLOPT-LABEL: different_imm_vl_with_tu:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetivli zero, 5, e32, m2, tu, ma
+; NOVLOPT-NEXT: vmv2r.v v14, v10
+; NOVLOPT-NEXT: vadd.vv v14, v10, v12
+; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; NOVLOPT-NEXT: vadd.vv v8, v14, v10
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: different_imm_vl_with_tu:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; VLOPT-NEXT: vmv2r.v v14, v10
+; VLOPT-NEXT: vadd.vv v14, v10, v12
+; VLOPT-NEXT: vadd.vv v8, v14, v10
+; VLOPT-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5)
+ %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
+ ret <vscale x 4 x i32> %w
+}
+
+; We can't reduce the VL as %v is used as a passthru, i.e. the elements past VL
+; are demanded.
+define <vscale x 4 x i32> @different_vl_as_passthru(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+; CHECK-LABEL: different_vl_as_passthru:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vadd.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma
+; CHECK-NEXT: vadd.vv v12, v8, v10
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
+ %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl2)
+ ret <vscale x 4 x i32> %w
+}
+
+; We can't reduce the VL as %v is used as a passthru, i.e. the elements past VL
+; are demanded.
+define <vscale x 4 x i32> @different_imm_vl_as_passthru(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+; CHECK-LABEL: different_imm_vl_as_passthru:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; CHECK-NEXT: vmv2r.v v14, v10
-; CHECK-NEXT: vadd.vv v14, v10, v12
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vadd.vv v12, v8, v10
; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vadd.vv v8, v14, v10
+; CHECK-NEXT: vadd.vv v12, v8, v10
+; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5)
- %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen 4)
+ %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 4)
ret <vscale x 4 x i32> %w
}
LGTM
LGTM