From 009053d6b98d52273cf08505fbaf7e973fb14d2a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 19 Jun 2025 01:44:28 -0700 Subject: [PATCH 01/10] first step, Interleave accesses for EVL tail folding. POC --- .../Transforms/Vectorize/LoopVectorize.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlan.h | 49 +++++++++++++++++++ .../Transforms/Vectorize/VPlanAnalysis.cpp | 7 +-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 +++++++++++------- .../Transforms/Vectorize/VPlanTransforms.cpp | 15 ++++++ llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../AArch64/sve-interleaved-accesses.ll | 16 +++--- 7 files changed, 106 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 67df7a8af098d..325adc8e08df1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4257,6 +4257,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPDerivedIVSC: case VPDef::VPScalarIVStepsSC: case VPDef::VPReplicateSC: + case VPDef::VPReverseInterleavePtrSC: case VPDef::VPInstructionSC: case VPDef::VPCanonicalIVPHISC: case VPDef::VPVectorPointerSC: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cc9434e9b3b8b..4a18e2699c66c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -532,6 +532,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: + case VPRecipeBase::VPReverseInterleavePtrSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: @@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { R->getVPDefID() == VPRecipeBase::VPReductionSC || R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || + R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC || R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC || R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; } @@ -1805,6 +1807,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, #endif }; +class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags { + Type *IndexedTy; + unsigned Factor; + +public: + VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy, + unsigned Factor, GEPNoWrapFlags GEPFlags, + DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC, + ArrayRef({Ptr, VF}), GEPFlags, DL), + IndexedTy(IndexedTy), Factor(Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + } + + VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC) + + VPValue *getPtr() const { return getOperand(0); } + + VPValue *getVFValue() const { return getOperand(1); } + + void execute(VPTransformState &State) override; + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + // TODO: Compute accurate cost after retiring the legacy cost model. 
+ return 0; + } + + VPReverseInterleavePtrRecipe *clone() override { + return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy, + Factor, getGEPNoWrapFlags(), + getDebugLoc()); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A pure virtual base class for all recipes modeling header phis, including /// phis for first order recurrences, pointer inductions and reductions. The /// start value is the first operand of the recipe and the incoming value from diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..5d72ef0a067be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -283,9 +283,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { .Case([this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>( + [this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) // VPInstructionWithType must be handled before VPInstruction. .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f64bd2a0cb6a2..b392dd1d0a8c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -152,6 +152,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPDerivedIVSC: case VPFirstOrderRecurrencePHISC: case VPPredInstPHISC: + case VPReverseInterleavePtrSC: case VPVectorEndPointerSC: return false; case VPInstructionSC: @@ -2379,6 +2380,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) { + auto &Builder = State.Builder; + Value *Ptr = State.get(getPtr(), /*IsScalar*/ true); + Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true); + Type *IndexTy = Builder.getInt32Ty(); + if (RuntimeVF->getType() != IndexTy) + RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy); + Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); + Index = Builder.CreateMul(Index, Builder.getInt32(Factor)); + Index = Builder.CreateNeg(Index); + Value *ReversePtr = + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); + + State.set(this, ReversePtr, /*IsScalar*/ true); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent; + printAsOperand(O, SlotTracker); + O << " = reverse-interleave-ptr"; + printFlags(O); + printOperands(O, SlotTracker); +} +#endif + void VPBlendRecipe::execute(VPTransformState &State) { assert(isNormalized() && "Expected blend to be normalized!"); // We know that all PHIs in non-header blocks are converted into @@ -3424,25 +3452,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { if (auto *I = dyn_cast(ResAddr)) State.setDebugLocFrom(I->getDebugLoc()); - // If the group is reverse, adjust the index to refer to the last vector lane - // instead of the first. We adjust the index from the first vector lane, - // rather than directly getting the pointer for lane VF - 1, because the - // pointer operand of the interleaved access is supposed to be uniform. 
- if (Group->isReverse()) { - Value *RuntimeVF = - getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); - Value *Index = - State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); - Index = State.Builder.CreateMul(Index, - State.Builder.getInt32(Group->getFactor())); - Index = State.Builder.CreateNeg(Index); - - bool InBounds = false; - if (auto *Gep = dyn_cast(ResAddr->stripPointerCasts())) - InBounds = Gep->isInBounds(); - ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); - } - State.setDebugLocFrom(getDebugLoc()); Value *PoisonVec = PoisonValue::get(VecTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 418a2ccbd6b40..d4347847824a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2512,6 +2512,21 @@ void VPlanTransforms::createInterleaveGroups( Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV) : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV); } + // If the group is reverse, adjust the index to refer to the last vector + // lane instead of the first. We adjust the index from the first vector + // lane, rather than directly getting the pointer for lane VF - 1, because + // the pointer operand of the interleaved access is supposed to be uniform. + if (IG->isReverse()) { + auto *GEP = dyn_cast( + getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()); + auto *ReversePtr = new VPReverseInterleavePtrRecipe( + Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(), + GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() + : GEPNoWrapFlags::none(), + InsertPos->getDebugLoc()); + ReversePtr->insertBefore(InsertPos); + Addr = ReversePtr; + } auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc()); VPIG->insertBefore(InsertPos); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 279cdac92d2d1..4879aeee6e684 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -337,6 +337,7 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPReverseInterleavePtrSC, VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 8c2958769a615..0031b7579cb60 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]] @@ -381,8 +381,8 @@ 
define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1 ; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]] @@ -1577,8 +1577,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] @@ -1597,8 +1597,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] ; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2 ; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] From 41a334dbf28122c66442c7773e5aa773d295b9b6 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 20 Jun 2025 01:44:32 -0700 Subject: [PATCH 02/10] Stride VectorEndPointer for reverse interleaved access --- .../Transforms/Vectorize/LoopVectorize.cpp | 5 +- llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 5 +- .../AArch64/sve-interleaved-accesses.ll | 24 +++---- .../RISCV/riscv-vector-reverse-output.ll | 48 ++++++++----- .../RISCV/riscv-vector-reverse.ll | 68 ++++++++++--------- ...-force-tail-with-evl-reverse-load-store.ll | 21 ++++-- ...orize-force-tail-with-evl-uniform-store.ll | 3 +- 9 files changed, 117 insertions(+), 87 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 325adc8e08df1..f569fad84d238 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7770,8 +7770,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) ? 
GEPNoWrapFlags::none() : GEPNoWrapFlags::inBounds(); - VectorPtr = new VPVectorEndPointerRecipe( - Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); + VectorPtr = + new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), + /*Stride*/ -1, Flags, I->getDebugLoc()); } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? GEP->getNoWrapFlags() diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4a18e2699c66c..327110c493d49 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1711,12 +1711,16 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<2> { Type *IndexedTy; + int64_t Stride; + public: VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy, - GEPNoWrapFlags GEPFlags, DebugLoc DL) + int64_t Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC, ArrayRef({Ptr, VF}), GEPFlags, DL), - IndexedTy(IndexedTy) {} + IndexedTy(IndexedTy), Stride(Stride) { + assert(Stride != 0 && "Unexpected stride"); + } VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) @@ -1748,7 +1752,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, VPVectorEndPointerRecipe *clone() override { return new VPVectorEndPointerRecipe(getOperand(0), getVFValue(), IndexedTy, - getGEPNoWrapFlags(), getDebugLoc()); + Stride, getGEPNoWrapFlags(), + getDebugLoc()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b392dd1d0a8c6..24e5fa11f84a2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2310,12 +2310,12 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, +static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, unsigned CurrentPart, IRBuilderBase &Builder) { // Use i32 for the gep index type when the value is constant, // or query DataLayout for a more suitable index type otherwise. const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); - return IsScalable && (IsReverse || CurrentPart > 0) + return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) ? DL.getIndexType(Builder.getPtrTy(0)) : Builder.getInt32Ty(); } @@ -2323,18 +2323,21 @@ static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, void VPVectorEndPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; unsigned CurrentPart = getUnrollPart(*this); + bool IsUnitStride = Stride == 1 || Stride == -1; Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, - CurrentPart, Builder); + IsUnitStride, CurrentPart, Builder); // The wide store needs to start at the last vector element. 
Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); if (IndexTy != RunTimeVF->getType()) RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); - // NumElt = -CurrentPart * RunTimeVF + // NumElt = Stride * CurrentPart * RunTimeVF Value *NumElt = Builder.CreateMul( - ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); + ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF); + // LastLane = Stride * (RunTimeVF - 1) + Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1)); + if (Stride != 1) + LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane); Value *Ptr = State.get(getOperand(0), VPLane(0)); Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); @@ -2359,7 +2362,7 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; unsigned CurrentPart = getUnrollPart(*this); Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, - CurrentPart, Builder); + /*IsUnitStride*/ true, CurrentPart, Builder); Value *Ptr = State.get(getOperand(0), VPLane(0)); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d4347847824a1..df4e485d331b2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2519,8 +2519,9 @@ void VPlanTransforms::createInterleaveGroups( if (IG->isReverse()) { auto *GEP = dyn_cast( getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()); - auto *ReversePtr = new VPReverseInterleavePtrRecipe( - Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(), + auto *ReversePtr = new VPVectorEndPointerRecipe( + Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), + -(int64_t)IG->getFactor(), GEP && GEP->isInBounds() ? 
GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(), InsertPos->getDebugLoc()); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 0031b7579cb60..b349c55d3e09a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -367,10 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i64 2, [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) @@ -381,10 +379,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw i64 2, [[TMP15]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]] ; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP12]]) ; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) @@ -1577,10 +1573,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = sub nsw i64 4, [[TMP6]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC]]) @@ -1597,10 +1591,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] ; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], 
[[VEC_IND]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2 -; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = sub nsw i64 4, [[TMP22]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] ; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) ; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index 09b274de30214..29b27cdb7556d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -40,7 +40,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] ; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] ; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] ; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] ; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 @@ -48,7 +49,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) ; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] ; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] ; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] ; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) @@ -98,7 +100,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] ; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 ; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] +; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 +; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] ; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] ; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 @@ -107,7 +110,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] ; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 ; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] +; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] ; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, 
ptr [[TMP16]], i32 [[TMP18]] ; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] ; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) @@ -157,11 +161,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] ; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] ; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] ; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]] ; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] ; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] ; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] ; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 @@ -172,11 +178,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP20:%.*]] = add [[REVERSE2]], splat (i32 1) ; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] ; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] ; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] ; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] ; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] ; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] ; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) @@ -246,7 +254,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] ; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] ; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] ; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] ; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 @@ -254,7 +263,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] ; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] ; RV64-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] ; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) @@ -304,7 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] ; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 ; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] +; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 +; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] ; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] ; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 @@ -313,7 +324,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] ; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 ; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] +; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] ; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] ; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] ; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) @@ -363,11 +375,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] ; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] ; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] ; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] ; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] ; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] ; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] ; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 @@ -378,11 +392,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd [[REVERSE2]], splat (float 1.000000e+00) ; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] ; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] ; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] ; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] ; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] ; 
RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] ; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP19]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index dd8b7d6ea7e42..b4e49a60e0887 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -334,22 +334,24 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %22 = zext i32 %21 to i64 ; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 ; CHECK-NEXT: %24 = mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 1, %18 -; CHECK-NEXT: %26 = getelementptr inbounds i32, ptr %23, i64 %24 -; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %26, i64 %25 -; CHECK-NEXT: %wide.load = load , ptr %27, align 4 +; CHECK-NEXT: %25 = sub i64 %18, 1 +; CHECK-NEXT: %26 = mul i64 -1, %25 +; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24 +; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26 +; CHECK-NEXT: %wide.load = load , ptr %28, align 4 ; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %wide.load) -; CHECK-NEXT: %28 = add %reverse, splat (i32 1) -; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %A, i64 %22 -; CHECK-NEXT: %30 = mul i64 0, %18 -; CHECK-NEXT: %31 = sub i64 1, %18 -; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %29, i64 %30 -; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %32, i64 %31 -; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4i32( %28) -; CHECK-NEXT: store %reverse4, ptr %33, align 4 +; CHECK-NEXT: %29 = add %reverse, splat (i32 1) +; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22 +; CHECK-NEXT: %31 = mul i64 0, %18 +; CHECK-NEXT: %32 = sub i64 %18, 1 +; CHECK-NEXT: %33 = mul i64 -1, %32 +; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31 +; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33 +; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4i32( %29) +; CHECK-NEXT: store %reverse4, ptr %35, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %34, , label %vector.body +; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %36, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -380,8 +382,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: %35 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: %add9 = add i32 %35, 1 +; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: %add9 = add i32 %37, 1 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 @@ -743,22 +745,24 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %22 = zext i32 %21 to i64 ; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22 ; CHECK-NEXT: %24 = 
mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 1, %18 -; CHECK-NEXT: %26 = getelementptr inbounds float, ptr %23, i64 %24 -; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %26, i64 %25 -; CHECK-NEXT: %wide.load = load , ptr %27, align 4 +; CHECK-NEXT: %25 = sub i64 %18, 1 +; CHECK-NEXT: %26 = mul i64 -1, %25 +; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24 +; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26 +; CHECK-NEXT: %wide.load = load , ptr %28, align 4 ; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %wide.load) -; CHECK-NEXT: %28 = fadd %reverse, splat (float 1.000000e+00) -; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %A, i64 %22 -; CHECK-NEXT: %30 = mul i64 0, %18 -; CHECK-NEXT: %31 = sub i64 1, %18 -; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %29, i64 %30 -; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %32, i64 %31 -; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4f32( %28) -; CHECK-NEXT: store %reverse4, ptr %33, align 4 +; CHECK-NEXT: %29 = fadd %reverse, splat (float 1.000000e+00) +; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22 +; CHECK-NEXT: %31 = mul i64 0, %18 +; CHECK-NEXT: %32 = sub i64 %18, 1 +; CHECK-NEXT: %33 = mul i64 -1, %32 +; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31 +; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33 +; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4f32( %29) +; CHECK-NEXT: store %reverse4, ptr %35, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %34, , label %vector.body +; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %36, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -789,8 +793,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: %35 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: %conv1 = fadd float %35, 1.000000e+00 +; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 96db5bf4e9acc..91d94e52d0990 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -33,7 +33,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP18]], 1 +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 -1, [[TMP11]] ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], 
i64 [[TMP9]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) @@ -41,7 +42,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] +; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1 +; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] ; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) @@ -136,7 +138,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]] ; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]] -; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]] +; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP26]], 1 +; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP15]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] ; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) @@ -145,7 +148,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] ; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]] +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1 +; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]] ; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] ; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) @@ -261,7 +265,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]] -; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]] +; IF-EVL-NEXT: [[TMP29:%.*]] = sub i64 [[TMP9]], 1 +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP29]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], splat (i1 true), i32 [[TMP6]]) @@ -271,7 +276,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]] ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]] -; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]] +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP16]], 1 +; 
IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP30]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]] ; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) @@ -279,7 +285,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]] ; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]] -; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]] +; IF-EVL-NEXT: [[TMP31:%.*]] = sub i64 [[TMP22]], 1 +; IF-EVL-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP31]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]] ; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll index 5c94ce180578f..984b64c55ce16 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll @@ -34,7 +34,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]] +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]] ; CHECK-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( zeroinitializer, splat (i1 true), i32 [[TMP11]]) From 9722265a94fe132a5ca31e5bd5d0bcda5d529944 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 24 Jun 2025 02:20:10 -0700 Subject: [PATCH 03/10] Remove VPReverseInterleavePtrRecipe --- .../Transforms/Vectorize/LoopVectorize.cpp | 1 - llvm/lib/Transforms/Vectorize/VPlan.h | 49 ------------------- .../Transforms/Vectorize/VPlanAnalysis.cpp | 7 ++- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 ----------- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 - 5 files changed, 3 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f569fad84d238..1afaaf7dd1ccc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4257,7 +4257,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPDerivedIVSC: case VPDef::VPScalarIVStepsSC: case VPDef::VPReplicateSC: - case VPDef::VPReverseInterleavePtrSC: case VPDef::VPInstructionSC: case VPDef::VPCanonicalIVPHISC: case VPDef::VPVectorPointerSC: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 327110c493d49..ed4eb97656e51 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -532,7 +532,6 @@ class 
VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: - case VPRecipeBase::VPReverseInterleavePtrSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: @@ -852,7 +851,6 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { R->getVPDefID() == VPRecipeBase::VPReductionSC || R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || - R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC || R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC || R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; } @@ -1812,53 +1810,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, #endif }; -class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags { - Type *IndexedTy; - unsigned Factor; - -public: - VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy, - unsigned Factor, GEPNoWrapFlags GEPFlags, - DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC, - ArrayRef({Ptr, VF}), GEPFlags, DL), - IndexedTy(IndexedTy), Factor(Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - } - - VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC) - - VPValue *getPtr() const { return getOperand(0); } - - VPValue *getVFValue() const { return getOperand(1); } - - void execute(VPTransformState &State) override; - - bool onlyFirstLaneUsed(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - return true; - } - - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override { - // TODO: Compute accurate cost after retiring the legacy cost model. - return 0; - } - - VPReverseInterleavePtrRecipe *clone() override { - return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy, - Factor, getGEPNoWrapFlags(), - getDebugLoc()); - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif -}; - /// A pure virtual base class for all recipes modeling header phis, including /// phis for first order recurrences, pointer inductions and reductions. The /// start value is the first operand of the recipe and the incoming value from diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 5d72ef0a067be..92db9674ef42b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -283,10 +283,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { .Case( - [this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPPartialReductionRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) // VPInstructionWithType must be handled before VPInstruction. 
.Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 24e5fa11f84a2..13aa66502b2bb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -152,7 +152,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPDerivedIVSC: case VPFirstOrderRecurrencePHISC: case VPPredInstPHISC: - case VPReverseInterleavePtrSC: case VPVectorEndPointerSC: return false; case VPInstructionSC: @@ -2383,33 +2382,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - Value *Ptr = State.get(getPtr(), /*IsScalar*/ true); - Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true); - Type *IndexTy = Builder.getInt32Ty(); - if (RuntimeVF->getType() != IndexTy) - RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy); - Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); - Index = Builder.CreateMul(Index, Builder.getInt32(Factor)); - Index = Builder.CreateNeg(Index); - Value *ReversePtr = - Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); - - State.set(this, ReversePtr, /*IsScalar*/ true); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = reverse-interleave-ptr"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - void VPBlendRecipe::execute(VPTransformState &State) { assert(isNormalized() && "Expected blend to be normalized!"); // We know that all PHIs in non-header blocks are converted into diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 4879aeee6e684..279cdac92d2d1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -337,7 +337,6 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, - VPReverseInterleavePtrSC, VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, From 2587bc26cbe1d33ddd52b3b5567e35252bef8a5d Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 24 Jun 2025 02:22:38 -0700 Subject: [PATCH 04/10] comment --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index ed4eb97656e51..272a936860b30 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1704,7 +1704,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags { /// A recipe to compute a pointer to the last element of each part of a widened /// memory access for widened memory accesses of IndexedTy. Used for -/// VPWidenMemoryRecipes that are reversed. +/// VPWidenMemoryRecipes or VPInterleaveRecipes that are reversed. 
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<2> { Type *IndexedTy; From d763b4405bddec5586b23ca0710373f8585f3560 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 00:45:54 -0700 Subject: [PATCH 05/10] Refine assertion --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 272a936860b30..76b1a7c5bd6f8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1717,7 +1717,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, : VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC, ArrayRef({Ptr, VF}), GEPFlags, DL), IndexedTy(IndexedTy), Stride(Stride) { - assert(Stride != 0 && "Unexpected stride"); + assert(Stride != 0 && "Stride cannot be zero"); } VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) From 513809f643acbeb5b2a287a057131d5412e4e009 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 01:34:37 -0700 Subject: [PATCH 06/10] Add comment for stride --- llvm/lib/Transforms/Vectorize/VPlan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 76b1a7c5bd6f8..f0e570ebefd24 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1709,6 +1709,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<2> { Type *IndexedTy; + /// The constant stride of the pointer computed by this recipe. int64_t Stride; public: From d8b703bfd4ead9f78cfd3c9e9fd7d772ff93340e Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 01:57:15 -0700 Subject: [PATCH 07/10] Reuse InBounds --- .../Transforms/Vectorize/VPlanTransforms.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index df4e485d331b2..ffb2970118346 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2482,23 +2482,23 @@ void VPlanTransforms::createInterleaveGroups( auto *InsertPos = cast(RecipeBuilder.getRecipe(IRInsertPos)); + bool InBounds = false; + if (auto *Gep = dyn_cast( + getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts())) + InBounds = Gep->isInBounds(); + // Get or create the start address for the interleave group. auto *Start = cast(RecipeBuilder.getRecipe(IG->getMember(0))); VPValue *Addr = Start->getAddr(); VPRecipeBase *AddrDef = Addr->getDefiningRecipe(); if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) { - // TODO: Hoist Addr's defining recipe (and any operands as needed) to - // InsertPos or sink loads above zero members to join it. - bool InBounds = false; - if (auto *Gep = dyn_cast( - getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts())) - InBounds = Gep->isInBounds(); - // We cannot re-use the address of member zero because it does not // dominate the insert position. Instead, use the address of the insert // position and create a PtrAdd adjusting it to the address of member // zero. + // TODO: Hoist Addr's defining recipe (and any operands as needed) to + // InsertPos or sink loads above zero members to join it. 
assert(IG->getIndex(IRInsertPos) != 0 && "index of insert position shouldn't be zero"); auto &DL = IRInsertPos->getDataLayout(); @@ -2522,8 +2522,7 @@ void VPlanTransforms::createInterleaveGroups( auto *ReversePtr = new VPVectorEndPointerRecipe( Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), -(int64_t)IG->getFactor(), - GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none(), + InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(), InsertPos->getDebugLoc()); ReversePtr->insertBefore(InsertPos); Addr = ReversePtr; From 67382a4a1d9a1b5b09f2d4806ae78c92ac6b7115 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 23:23:31 -0700 Subject: [PATCH 08/10] assert negative stride --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index f0e570ebefd24..fcc5fbec3b49b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1718,7 +1718,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, : VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC, ArrayRef({Ptr, VF}), GEPFlags, DL), IndexedTy(IndexedTy), Stride(Stride) { - assert(Stride != 0 && "Stride cannot be zero"); + assert(Stride < 0 && "Stride must be negative"); } VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) From 80399d90f96973da1dfca4868f169277936c8e7e Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 23:26:43 -0700 Subject: [PATCH 09/10] Remove dead GEP --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ffb2970118346..931d4d42f56e4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2517,8 +2517,6 @@ void VPlanTransforms::createInterleaveGroups( // lane, rather than directly getting the pointer for lane VF - 1, because // the pointer operand of the interleaved access is supposed to be uniform. if (IG->isReverse()) { - auto *GEP = dyn_cast( - getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()); auto *ReversePtr = new VPVectorEndPointerRecipe( Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), -(int64_t)IG->getFactor(), From a3eb766bc56deec53c655a5e663f8ac378aa2f3d Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 23:52:03 -0700 Subject: [PATCH 10/10] Refine comment --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fcc5fbec3b49b..d460573f5bec6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1709,7 +1709,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<2> { Type *IndexedTy; - /// The constant stride of the pointer computed by this recipe. + /// The constant stride of the pointer computed by this recipe, expressed in + /// units of IndexedTy. int64_t Stride; public:
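
Editor's note (not part of the patch series): after these changes a reversed interleave group no longer adjusts its pointer inside VPInterleaveRecipe::execute; createInterleaveGroups instead emits a VPVectorEndPointerRecipe with Stride = -Factor, whose execute() computes the same offset the removed code built by hand. A minimal C++ sketch of that scalar arithmetic, assuming unroll part 0 and using illustrative names (not LLVM API):

  #include <cstdint>

  // Scalar model of the address produced for a reversed interleave group.
  // NumElt   = Stride * CurrentPart * VF   (0 for part 0)
  // LastLane = Stride * (VF - 1)           (Stride == -Factor here)
  int *reverseInterleaveAddr(int *Ptr, int64_t VF, int64_t Factor) {
    int64_t NumElt = 0;
    int64_t LastLane = -Factor * (VF - 1);
    return Ptr + NumElt + LastLane; // lowest address touched by the wide reverse access
  }

For example, with Factor = 2 and VF = 4 x vscale this gives an offset of 2 - 8 * vscale elements, which is the "shl nuw nsw i64 ..., 3" / "sub nsw i64 2, ..." pattern in the updated AArch64 sve-interleaved-accesses.ll checks above.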