[VPlan] Extract reverse operation for reverse accesses #146525
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v

Author: Mel Chen (Mel-Chen)

Changes

This patch introduces VPInstruction::Reverse and extracts the reverse operations of loaded/stored values from reverse memory accesses. This extraction facilitates future support for permutation elimination within VPlan.

Patch is 69.62 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/146525.diff

18 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 67a51c12b508e..d5aeb4feb19ba 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1541,6 +1541,12 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
0, cast<VectorType>(ICA.getReturnType()));
}
+ case Intrinsic::experimental_vp_reverse: {
+ return getShuffleCost(TTI::SK_Reverse,
+ cast<VectorType>(ICA.getReturnType()),
+ cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
+ 0, cast<VectorType>(ICA.getReturnType()));
+ }
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
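A note on how this new RISC-V case is exercised: the cost of @llvm.experimental.vp.reverse is modeled as an ordinary reverse shuffle. A minimal sketch of the equivalent direct TTI query, assuming a TargetTransformInfo TTI, an LLVMContext Ctx, and a CostKind in scope (the concrete types here are illustrative, not from the patch):

// Costing a reverse of <vscale x 4 x i32> as an SK_Reverse shuffle.
auto *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 4, /*Scalable=*/true);
InstructionCost Cost =
    TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy, VecTy,
                       /*Mask=*/{}, CostKind, /*Index=*/0, /*SubTp=*/VecTy);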
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b01c8b02ec66a..94782c33f5bda 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8880,6 +8880,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
+ // Adjust the result of reverse memory accesses.
+ VPlanTransforms::runPass(VPlanTransforms::adjustRecipesForReverseAccesses,
+ *Plan);
+
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 61b5ccd85bc6e..55175a889d0e0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -970,6 +970,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
// It produces the lane index across all unrolled iterations. Unrolling will
// add all copies of its original operand as additional operands.
FirstActiveLane,
+ // Returns a reversed vector for the operand.
+ Reverse,
// The opcodes below are used for VPInstructionWithType.
//
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f3b99fe34c069..f87b6de42c8b8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -126,6 +126,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return IntegerType::get(Ctx, 1);
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
+ case VPInstruction::Reverse:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a38932ef99fe..b4ed4ef3147c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -444,6 +444,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
+ case VPInstruction::Reverse:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -873,6 +874,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Res;
}
+ case VPInstruction::Reverse: {
+ return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -948,6 +952,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::Reverse: {
+ assert(VF.isVector() && "Reverse operation must be vector type");
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
+ return Ctx.TTI.getShuffleCost(
+ TargetTransformInfo::SK_Reverse, cast<VectorType>(VectorTy),
+ cast<VectorType>(VectorTy), {}, Ctx.CostKind, 0);
+ }
case VPInstruction::ExtractPenultimateElement:
if (VF == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
@@ -1033,6 +1044,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::WideIVStep:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::Reverse:
return false;
default:
return true;
@@ -1179,6 +1191,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ReductionStartVector:
O << "reduction-start-vector";
break;
+ case VPInstruction::Reverse:
+ O << "reverse";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -2967,12 +2982,7 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
OpInfo, &Ingredient);
}
- if (!Reverse)
- return Cost;
-
- return Cost += Ctx.TTI.getShuffleCost(
- TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
- cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
+ return Cost;
}
void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -3004,8 +3014,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
applyMetadata(*cast<Instruction>(NewLI));
- if (Reverse)
- NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
State.set(this, NewLI);
}
@@ -3061,8 +3069,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
- if (isReverse())
- Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}
@@ -3083,12 +3089,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
- if (!Reverse)
- return Cost;
- return Cost + Ctx.TTI.getShuffleCost(
- TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
- cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
+ return Cost;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3118,13 +3120,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *StoredVal = State.get(StoredVPValue);
- if (isReverse()) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
if (CreateScatter)
@@ -3154,8 +3149,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
- if (isReverse())
- StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
@@ -3196,12 +3189,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
- if (!Reverse)
- return Cost;
- return Cost + Ctx.TTI.getShuffleCost(
- TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
- cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
+ return Cost;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
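For reference, the new VPInstruction::Reverse codegen above leans entirely on IRBuilder::CreateVectorReverse, which selects the lowering from the vector type: a shufflevector with a reversed mask for fixed-width vectors, and the @llvm.vector.reverse intrinsic for scalable ones. A minimal sketch of that behavior (standard IRBuilder usage with the usual llvm/IR headers assumed, not patch code):

// Fixed-width input, e.g. <4 x i32>:
//   %reverse = shufflevector <4 x i32> %v, <4 x i32> poison,
//              <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// Scalable input, e.g. <vscale x 4 x i32>:
//   %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v)
Value *emitReverse(IRBuilder<> &B, Value *V) {
  return B.CreateVectorReverse(V, "reverse");
}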
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 730deb0686b2a..cf41b6d00f285 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2172,6 +2172,14 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
VPI->getDebugLoc());
}
+ if (VPI->getOpcode() == VPInstruction::Reverse) {
+ SmallVector<VPValue *> Ops(VPI->operands());
+ Ops.append({&AllOneMask, &EVL});
+ return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_reverse,
+ Ops, TypeInfo.inferScalarType(VPI),
+ VPI->getDebugLoc());
+ }
+
VPValue *LHS, *RHS;
// Transform select with a header mask condition
// select(header_mask, LHS, RHS)
@@ -3347,3 +3355,34 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(VPlan &Plan,
MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
+
+void VPlanTransforms::adjustRecipesForReverseAccesses(VPlan &Plan) {
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+ if (!MemR || !MemR->isReverse())
+ continue;
+
+ if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
+ auto *Reverse =
+ new VPInstruction(VPInstruction::Reverse, {L}, L->getDebugLoc());
+ Reverse->insertAfter(L);
+ L->replaceAllUsesWith(Reverse);
+ Reverse->setOperand(0, L);
+ continue;
+ }
+
+ if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
+ VPValue *StoredVal = S->getStoredValue();
+ auto *Reverse = new VPInstruction(VPInstruction::Reverse, {StoredVal},
+ S->getDebugLoc());
+ Reverse->insertBefore(S);
+ S->setOperand(1, Reverse);
+ }
+ }
+ }
+}
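In the EVL tail-folding path (createEVLRecipe above), a Reverse VPInstruction is rewritten to the length-aware @llvm.experimental.vp.reverse intrinsic. A hedged sketch of the IR this ultimately produces, with illustrative names (emitVPReverse is not a helper in this patch):

// %vp.reverse = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
//                   <vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true), i32 %evl)
Value *emitVPReverse(IRBuilder<> &B, Value *Vec, Value *AllOnesMask, Value *EVL) {
  return B.CreateIntrinsic(Intrinsic::experimental_vp_reverse, {Vec->getType()},
                           {Vec, AllOnesMask, EVL}, /*FMFSource=*/nullptr,
                           "vp.reverse");
}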
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 40885cd52a127..abe592247e2de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -239,6 +239,20 @@ struct VPlanTransforms {
/// Add branch weight metadata, if the \p Plan's middle block is terminated by
/// a BranchOnCond recipe.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF);
+
+ /// Add reverse recipes for reverse memory accesses.
+ /// For reverse loads, transform
+ /// WIDEN ir<%L> = load vp<%addr>
+ /// into
+ /// WIDEN ir<%L> = load vp<%addr>
+ /// EMIT vp<%RevL> = reverse ir<%L>
+ ///
+ /// For reverse stores, transform
+ /// WIDEN store vp<%addr>, ir<%SVal>
+ /// into
+ /// EMIT vp<%RevS> = reverse ir<%SVal>
+ /// WIDEN store vp<%addr>, vp<%RevS>
+ static void adjustRecipesForReverseAccesses(VPlan &Plan);
};
} // namespace llvm
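This is where the stated motivation pays off: with reverses modeled as explicit recipes rather than folded into memory accesses, a later VPlan simplification can cancel adjacent permutations. A rough sketch of such a fold, written against the VPlan API purely as an illustration (no such fold is part of this patch):

// reverse(reverse(x)) -> x: drop a Reverse whose operand is itself a Reverse.
if (auto *VPI = dyn_cast<VPInstruction>(&R))
  if (VPI->getOpcode() == VPInstruction::Reverse)
    if (auto *Inner = dyn_cast<VPInstruction>(VPI->getOperand(0)))
      if (Inner->getOpcode() == VPInstruction::Reverse) {
        VPI->replaceAllUsesWith(Inner->getOperand(0));
        VPI->eraseFromParent();
      }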
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index 9485d827ced40..c838c63545341 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -22,8 +22,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
-; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 1dd49ecf85b81..d6f619cce54a0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -37,8 +37,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 09b274de30214..6d55f7369f01e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -165,8 +165,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
@@ -180,8 +180,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -371,8 +371,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
@@ -386,8 +386,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index dd8b7d6ea7e42..6d49a7fc16ad5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -105,10 +105,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1>
+; CHECK-NEXT: EMIT vp<%10> = reverse ir<%1>
+; CHECK-NEXT: WIDEN ir<%add9> = add vp<%10>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
-; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9>
+; CHECK-NEXT: vp<%11> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
+; CHECK-NEXT: EMIT vp<%12> = reverse ir<%add9>
+; CHECK-NEXT: WIDEN store vp<%11>, vp<%12>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
; CHECK-NEXT: No successors
@@ -167,8 +169,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): At #9 Interval # 3
; CHECK-NEXT: LV(REG): At #10 Interval # 3
; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 2
-; CHECK-NEXT: LV(REG): At #13 Interval # 2
+; CHECK-NEXT: LV(REG): At #12 Interval # 3
+; CHECK-NEXT: LV(REG): At #13 Interval # 3
+; CHECK-NEXT: LV(REG): At #14 Interval # 2
+; CHECK-NEXT: LV(REG): At #15 Interval # 2
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max...
[truncated]