From 00e256ca638b50e1c318d9dcf33d319f134e50f8 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 19 Feb 2025 01:37:53 -0800 Subject: [PATCH 01/16] Init: New Recipe VPWidenStridedLoadRecipe --- .../Transforms/Vectorize/LoopVectorize.cpp | 112 ++++++- llvm/lib/Transforms/Vectorize/VPlan.h | 67 ++++- .../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 64 +++- .../Transforms/Vectorize/VPlanTransforms.cpp | 19 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../Transforms/Vectorize/VPlanVerifier.cpp | 2 +- .../RISCV/riscv-vector-reverse-output.ll | 280 ++++++++---------- .../RISCV/riscv-vector-reverse.ll | 224 +++++++------- ...-force-tail-with-evl-reverse-load-store.ll | 97 +++--- 10 files changed, 526 insertions(+), 346 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 907839711a39c..5bf80940617a5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1085,6 +1085,7 @@ class LoopVectorizationCostModel { CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, + CM_Strided, CM_Scalarize, CM_VectorCall, CM_IntrinsicCall @@ -1315,6 +1316,20 @@ class LoopVectorizationCostModel { return InterleaveInfo.getInterleaveGroup(Instr); } + /// Returns true if \p I is a memory instruction with strided memory access + /// that can be vectorized. + bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const; + + /// Get the stride of the strided memory access instruction \p Instr. Return 0 + /// if the instruction \p Instr is not considered for vectorization as a + /// strided memory access. + int64_t getStride(Instruction *Instr) const { + auto It = StrideInfo.find(Instr); + if (It != StrideInfo.end()) + return It->second; + return 0; + } + /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. bool requiresScalarEpilogue(bool IsVectorizing) const { @@ -1562,6 +1577,10 @@ class LoopVectorizationCostModel { /// element) InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); + /// The cost computation for strided load/store instruction. + InstructionCost getStridedLoadStoreCost(Instruction *I, + ElementCount VF) const; + /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. InstructionCost getScalarizationOverhead(Instruction *I, @@ -1701,6 +1720,9 @@ class LoopVectorizationCostModel { Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } + /// The mapping of memory access instructions to their stride values. + DenseMap StrideInfo; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -3276,6 +3298,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( return true; } +bool LoopVectorizationCostModel::stridedAccessCanBeWidened( + Instruction *I, ElementCount VF) const { + // Get and ensure we have a valid memory instruction. + assert((isa(I)) && "Invalid memory instruction"); + + // Only support strided access for vector VF. + if (!VF.isVector()) + return false; + + // FIXME: Remove this check for StoreInst after strided store is supported. + if (isa(I)) + return false; + + [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I); + auto *ScalarTy = getLoadStoreType(I); + // TODO: Support non-unit-reverse strided accesses. 
Add stride analysis here + // to ensure that the accessed addresses are evenly spaced apart by a fixed + // stride. + assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 && + "Only supports strided accesses with a stride of -1"); + + const Align Alignment = getLoadStoreAlignment(I); + return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment); +} + void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which @@ -3366,9 +3413,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (IsUniformMemOpUse(I)) return true; - return (WideningDecision == CM_Widen || - WideningDecision == CM_Widen_Reverse || - WideningDecision == CM_Interleave); + return ( + WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || + WideningDecision == CM_Strided || WideningDecision == CM_Interleave); }; // Returns true if Ptr is the pointer operand of a memory access instruction @@ -4184,7 +4231,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( [](const auto *R) { return Instruction::Select; }) .Case( [](const auto *R) { return Instruction::Store; }) - .Case( + .Case( [](const auto *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) @@ -4283,6 +4330,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenStridedLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: @@ -5430,6 +5478,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } +InstructionCost +LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I, + ElementCount VF) const { + Type *ValTy = getLoadStoreType(I); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); + const Value *Ptr = getLoadStorePointerOperand(I); + + return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment, + CostKind, I); +} + std::optional LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, ElementCount VF, @@ -5749,6 +5810,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; + // Consider using strided load/store for consecutive reverse accesses to + // achieve more efficient memory operations. + if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) { + const InstructionCost StridedLoadStoreCost = + getStridedLoadStoreCost(&I, VF); + if (StridedLoadStoreCost < Cost) { + Decision = CM_Strided; + Cost = StridedLoadStoreCost; + StrideInfo[&I] = ConsecutiveStride; + } + } setWideningDecision(&I, VF, Decision, Cost); continue; } @@ -6395,6 +6467,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { + // TODO: New CastContextHint for strided accesses. + case LoopVectorizationCostModel::CM_Strided: case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: @@ -7752,16 +7826,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, // reverse consecutive. 
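For context, a minimal sketch of what the CM_Strided decision above buys for a reverse i32 access at VF = vscale x 4, compared with the existing CM_Widen_Reverse lowering. It assumes the -1 element stride asserted in stridedAccessCanBeWidened; the function wrapper and the names %base.ptr and %runtime.vf are illustrative only, not taken from the patch or its tests.

```llvm
; Previously (CM_Widen_ReverSE is spelled CM_Widen_Reverse): a contiguous load at the
; adjusted end pointer followed by a reverse shuffle:
;   %wide = load <vscale x 4 x i32>, ptr %end.ptr, align 4
;   %val  = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide)
; With CM_Strided: a single strided load with a byte stride of -4 (i32 element size),
; so no reverse shuffle is needed.
define <vscale x 4 x i32> @strided_reverse_load_sketch(ptr %base.ptr, i64 %runtime.vf) {
  ; Runtime VF truncated to the i32 VL operand of the VP intrinsic.
  %vl = trunc i64 %runtime.vf to i32
  %val = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(
             ptr align 4 %base.ptr, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %vl)
  ret <vscale x 4 x i32> %val
}

declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr, i64, <vscale x 4 x i1>, i32)
```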
LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); + + auto SameWiden = [&](ElementCount VF) -> bool { + return Decision == CM.getWideningDecision(I, VF); + }; + bool ContainsWidenVF = + LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range); + assert(ContainsWidenVF && + "At least widen the memory accesses by the Start VF."); + bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; + bool Strided = Decision == LoopVectorizationCostModel::CM_Strided; VPValue *Ptr = isa(I) ? Operands[0] : Operands[1]; - if (Consecutive) { + if (Consecutive || Strided) { auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; if (Reverse) { + assert(!Strided && "Reverse and Strided are mutually exclusive."); // When folding the tail, we may compute an address that we don't in the // original scalar loop and it may not be inbounds. Drop Inbounds in that // case. @@ -7773,7 +7858,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), + VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided, GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); @@ -7781,9 +7866,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast(I)) + if (LoadInst *Load = dyn_cast(I)) { + if (Strided) { + const DataLayout &DL = Load->getDataLayout(); + auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType()); + int64_t Stride = CM.getStride(Load); + assert(Stride == -1 && + "Only stride memory access with a stride of -1 is supported."); + VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( + StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load)))); + return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(), + Mask, VPIRMetadata(*Load, LVer), + I->getDebugLoc()); + } return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, VPIRMetadata(*Load, LVer), I->getDebugLoc()); + } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c5b214b355545..569869e8e4bd4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenStridedLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -1764,16 +1765,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, }; /// A recipe to compute the pointers for widened memory accesses of IndexTy. +/// Supports both consecutive and reverse consecutive accesses. +/// TODO: Support non-unit strided accesses . class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<1> { Type *IndexedTy; + /// Indicate whether to compute the pointer for strided memory accesses. 
+ bool Strided; + public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, - DebugLoc DL) + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided, + GEPNoWrapFlags GEPFlags, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), GEPFlags, DL), - IndexedTy(IndexedTy) {} + IndexedTy(IndexedTy), Strided(Strided) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -1794,7 +1800,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided, getGEPNoWrapFlags(), getDebugLoc()); } @@ -2931,7 +2937,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata { return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC; } static inline bool classof(const VPUser *U) { @@ -3050,6 +3057,56 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } }; +/// A recipe for strided load operations, using the base address, stride, and an +/// optional mask. This recipe will generate an vp.strided.load intrinsic call +/// to represent memory accesses with a fixed stride. +struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride, + VPValue *VF, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe( + VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF}, + /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VPWidenStridedLoadRecipe *clone() override { + return new VPWidenStridedLoadRecipe(cast(Ingredient), getAddr(), + getStride(), getVF(), getMask(), *this, + getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC); + + /// Return the stride operand. + VPValue *getStride() const { return getOperand(1); } + + /// Return the VF operand. + VPValue *getVF() const { return getOperand(2); } + + /// Generate a strided load. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPWidenStridedLoadRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() || Op == getStride() || Op == getVF(); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask. 
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..714fef032c9b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -184,8 +184,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && - "Store recipes should not define any values"); + assert( + (isa( + R)) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 06511b61a67c3..f05c5b178a3e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: @@ -103,6 +104,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return cast(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -184,6 +186,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenStoreEVLSC: @@ -2386,8 +2389,13 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) { Value *Ptr = State.get(getOperand(0), VPLane(0)); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); + // TODO: Support non-unit-reverse strided accesses. + Value *Index = + Strided + ? 
Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1)) + : Increment; Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -3226,6 +3234,60 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + Value *Addr = State.get(getAddr(), /*IsScalar*/ true); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)), + Builder.getInt32Ty()); + + auto *PtrTy = Addr->getType(); + auto *StrideTy = Stride->getType(); + CallInst *NewLI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, + {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + State.set(this, NewLI); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", stride = "; + getStride()->printAsOperand(O, SlotTracker); + O << ", runtimeVF = "; + getVF()->printAsOperand(O, SlotTracker); +} +#endif + +InstructionCost +VPWidenStridedLoadRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + + return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr, + IsMasked, Alignment, Ctx.CostKind, + &Ingredient); +} + void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 931d4d42f56e4..75113706df420 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2144,6 +2144,12 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(L->getMask()); return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); }) + .Case([&](VPWidenStridedLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenStridedLoadRecipe( + *cast(&L->getIngredient()), L->getAddr(), L->getStride(), + &EVL, NewMask, *L, L->getDebugLoc()); + }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); @@ -2198,10 +2204,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - assert(all_of(Plan.getVF().users(), - 
IsaPred) && - "User of VF that we can't transform to EVL."); + assert( + all_of( + Plan.getVF().users(), + IsaPred) && + "User of VF that we can't transform to EVL."); Plan.getVF().replaceAllUsesWith(&EVL); // Create a scalar phi to track the previous EVL if fixed-order recurrence is @@ -2240,7 +2248,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { NumDefVal <= 1 && "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { + if (isa(EVLRecipe)) { VPValue *CurVPV = CurRecipe->getVPSingleValue(); CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 279cdac92d2d1..d9b1f7d4f5d53 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -340,6 +340,7 @@ class VPDef { VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, + VPWidenStridedLoadSC, VPVectorPointerSC, VPVectorEndPointerSC, VPWidenCallSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 38ada33d7ee19..bc9d40834c185 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) .Case( + VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case([&](auto *R) { if (R->getNumOperands() != 3) { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index 29b27cdb7556d..d53fb60e7c7c9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -37,27 +37,23 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64: [[VECTOR_BODY]]: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], 
-1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; RV64-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, splat (i1 true), i32 [[TMP10]]) +; RV64-NEXT: [[TMP11:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP14]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP13]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -68,8 +64,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1 +; RV64-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -96,29 +92,24 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32: [[VECTOR_BODY]]: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 ; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 -; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat 
(i32 1) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP9]], i32 -4, splat (i1 true), i32 [[TMP10]]) +; RV32-NEXT: [[TMP11:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], 1 +; RV32-NEXT: [[TMP16:%.*]] = mul i32 -1, [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -129,8 +120,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1 +; RV32-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], 1 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -158,39 +149,34 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr 
[[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP20:%.*]] = add [[REVERSE2]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP18:%.*]] = add [[WIDE_STRIDED_LOAD1]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP25]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: 
[[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP24]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP29]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -251,27 +237,23 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64: [[VECTOR_BODY]]: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; RV64-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, splat (i1 true), i32 [[TMP10]]) +; RV64-NEXT: [[TMP11:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP14]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP11]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -282,8 +264,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00 +; RV64-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -310,29 +292,24 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32: [[VECTOR_BODY]]: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 -; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP9]], i32 -4, splat (i1 true), i32 [[TMP10]]) +; RV32-NEXT: [[TMP11:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], 1 +; RV32-NEXT: [[TMP16:%.*]] = mul i32 -1, [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr 
[[TMP12]], i32 [[TMP14]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP11]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -343,8 +320,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00 +; RV32-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP20]], 1.000000e+00 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -372,39 +349,34 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd [[REVERSE2]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr 
inbounds float, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP25]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP24]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP29]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b4e49a60e0887..416eba01440b1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -38,7 +38,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly 
%A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -80,9 +80,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -97,20 +97,20 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> -; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> -; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> -; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> +; 
CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -147,7 +147,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -178,7 +178,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 24 +; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. 
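For readability of the VPlan and cost dumps above and below, this is roughly the scalar loop the riscv-vector-reverse.ll test feeds the vectorizer, reconstructed from the scalarized instructions quoted in the CHECK lines. It is simplified: the test's actual signature, attributes, and runtime-check blocks are omitted, and the function and argument names are illustrative.

```llvm
; Simplified reconstruction, assuming %n >= 1; computes A[i-1] = B[i-1] + 1
; for i = n down to 1, i.e. a reverse (element stride -1) access pattern.
define void @vector_reverse_i64_sketch(ptr noalias %A, ptr noalias %B, i32 %n) {
entry:
  %wide.n = zext i32 %n to i64
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ %wide.n, %entry ], [ %indvars.iv.next, %for.body ]
  %i.0.in8 = phi i32 [ %n, %entry ], [ %i.0, %for.body ]
  %i.0 = add nsw i32 %i.0.in8, -1
  %idxprom = zext i32 %i.0 to i64
  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
  %ld = load i32, ptr %arrayidx, align 4
  %add9 = add i32 %ld, 1
  %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
  store i32 %add9, ptr %arrayidx3, align 4
  %cmp = icmp ugt i64 %indvars.iv, 1
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}
```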
@@ -189,9 +189,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<%18> = VF -; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF -; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count +; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF +; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -232,19 +232,19 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> -; CHECK-NEXT: WIDEN ir<%19> = load vp<%4> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> -; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> +; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: @@ -333,25 +333,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 ; CHECK-NEXT: %22 = zext i32 %21 to i64 ; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 -; CHECK-NEXT: %24 = mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 %18, 1 -; CHECK-NEXT: %26 = mul i64 -1, %25 -; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24 -; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26 -; CHECK-NEXT: %wide.load = load , ptr %28, align 4 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %wide.load) -; CHECK-NEXT: %29 = add %reverse, splat (i32 1) -; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22 -; 
CHECK-NEXT: %31 = mul i64 0, %18 -; CHECK-NEXT: %32 = sub i64 %18, 1 -; CHECK-NEXT: %33 = mul i64 -1, %32 -; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31 -; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33 -; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4i32( %29) -; CHECK-NEXT: store %reverse4, ptr %35, align 4 +; CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0 +; CHECK-NEXT: %25 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) +; CHECK-NEXT: %26 = add %wide.strided.load, splat (i32 1) +; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22 +; CHECK-NEXT: %28 = mul i64 0, %18 +; CHECK-NEXT: %29 = sub i64 %18, 1 +; CHECK-NEXT: %30 = mul i64 -1, %29 +; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28 +; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %31, i64 %30 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %26) +; CHECK-NEXT: store %reverse, ptr %32, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %36, , label %vector.body +; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %33, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -368,7 +364,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body ; CHECK-NEXT: LV: draw edge from middle.block ; CHECK-NEXT: LV: draw edge from for.body.preheader @@ -378,12 +374,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ] ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: %add9 = add i32 %37, 1 +; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: %add9 = add i32 %34, 1 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 @@ -449,7 +445,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For 
instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -491,9 +487,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -508,20 +504,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> -; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> -; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> -; CHECK-NEXT: WIDEN store vp<%10>, ir<%conv1> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; 
CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -558,7 +554,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -589,7 +585,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 26 +; CHECK-NEXT: LV: Loop cost is 25 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -600,9 +596,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<%18> = VF -; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF -; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count +; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF +; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -643,19 +639,19 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> -; CHECK-NEXT: WIDEN ir<%19> = load vp<%4> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> -; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> +; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] +; 
CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: @@ -744,25 +740,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 ; CHECK-NEXT: %22 = zext i32 %21 to i64 ; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22 -; CHECK-NEXT: %24 = mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 %18, 1 -; CHECK-NEXT: %26 = mul i64 -1, %25 -; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24 -; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26 -; CHECK-NEXT: %wide.load = load , ptr %28, align 4 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %wide.load) -; CHECK-NEXT: %29 = fadd %reverse, splat (float 1.000000e+00) -; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22 -; CHECK-NEXT: %31 = mul i64 0, %18 -; CHECK-NEXT: %32 = sub i64 %18, 1 -; CHECK-NEXT: %33 = mul i64 -1, %32 -; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31 -; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33 -; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4f32( %29) -; CHECK-NEXT: store %reverse4, ptr %35, align 4 +; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0 +; CHECK-NEXT: %25 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) +; CHECK-NEXT: %26 = fadd %wide.strided.load, splat (float 1.000000e+00) +; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22 +; CHECK-NEXT: %28 = mul i64 0, %18 +; CHECK-NEXT: %29 = sub i64 %18, 1 +; CHECK-NEXT: %30 = mul i64 -1, %29 +; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28 +; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %26) +; CHECK-NEXT: store %reverse, ptr %32, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %36, , label %vector.body +; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %33, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -779,7 +771,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ 
%19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body ; CHECK-NEXT: LV: draw edge from middle.block ; CHECK-NEXT: LV: draw edge from for.body.preheader @@ -789,12 +781,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ] ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00 +; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 91d94e52d0990..c156fc14a2300 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -29,39 +29,33 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1 -; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]] -; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP18]], 1 -; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 -1, [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1 -; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr 
i32, ptr [[TMP22]], i64 [[TMP15]] -; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1 +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP8]], i64 -4, splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], 1 +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 -1, [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i64 [[TMP13]] +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[WIDE_STRIDED_LOAD]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOPEND:%.*]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1 ; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]] ; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4 @@ -129,37 +123,30 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]] -; 
IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100) -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP26]], 1 -; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP15]] -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] -; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1 -; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] -; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 +; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1 +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100) +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, [[TMP9]], i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP13]], 1 +; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP15]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP16]] +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[WIDE_STRIDED_LOAD]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = 
call @llvm.experimental.vp.reverse.nxv4i1( [[TMP9]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE]], ptr align 4 [[TMP18]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOPEND:%.*]] ; IF-EVL: scalar.ph: From f81211c5cd7aa0357ffb6f8e82771991b09f7dc8 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 14 May 2025 02:02:08 -0700 Subject: [PATCH 02/16] [WIP][VPlan Based] Try to remove CM_Strided from uniform analysis Also cherry-pick the branch Mel-Chen:legalizeAndOptimizeInductions However, still not work well as collectLoopUniforms if the use-chain is too compilicated. :( --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 22 +- .../RISCV/riscv-vector-reverse.ll | 240 ++++++++++-------- 3 files changed, 146 insertions(+), 122 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5bf80940617a5..f2c742cf62927 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3413,9 +3413,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (IsUniformMemOpUse(I)) return true; - return ( - WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || - WideningDecision == CM_Strided || WideningDecision == CM_Interleave); + return (WideningDecision == CM_Widen || + WideningDecision == CM_Widen_Reverse || + WideningDecision == CM_Interleave); }; // Returns true if Ptr is the pointer operand of a memory access instruction diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 75113706df420..8b359d53e3afb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -627,13 +627,14 @@ static SmallVector collectUsersRecursively(VPValue *V) { static void legalizeAndOptimizeInductions(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); - VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *PhiR = dyn_cast(&Phi); - if (!PhiR) - continue; + SmallVector InductionPhis; + for (VPRecipeBase &R : HeaderVPBB->phis()) + if (auto *IV = dyn_cast(&R)) + InductionPhis.push_back(IV); + bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); + VPBuilder Builder; + for (VPWidenInductionRecipe *PhiR : reverse(InductionPhis)) { // Try to narrow wide and replicating recipes to uniform recipes, based on // VPlan analysis. 
// TODO: Apply to all recipes in the future, to replace legacy uniformity @@ -643,7 +644,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { auto *Def = dyn_cast(U); auto *RepR = dyn_cast(U); // Skip recipes that shouldn't be narrowed. - if (!Def || !isa(Def) || + if (!Def || + !isa(Def) || Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))) continue; @@ -656,11 +658,13 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Def->operands(), /*IsUniform*/ true); Clone->insertAfter(Def); Def->replaceAllUsesWith(Clone); + Def->eraseFromParent(); } + Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). - if (auto *PtrIV = dyn_cast(&Phi)) { + if (auto *PtrIV = dyn_cast(PhiR)) { if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; @@ -681,7 +685,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace widened induction with scalar steps for users that only use // scalars. - auto *WideIV = cast(&Phi); + auto *WideIV = cast(PhiR); if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) { return U->usesScalars(WideIV); })) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 416eba01440b1..61c380ca079b9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -25,18 +25,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
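A reading aid for the updated debug output in this hunk: because this patch widens the reverse induction chain instead of scalarizing it, %i.0, %idxprom and the load address drop out of the "Found uniform instruction" and "Scalarizing" lists, and their estimated costs rise (1 to 2 for the add, 1 to 4 for the zext) since they are now vector operations. The widened start value checked further down expands to roughly the sequence sketched here; this is a hand-written illustration assuming VF = vscale x 4 and an i32 induction, with made-up value names rather than the patch's own temporaries:

declare <vscale x 4 x i32> @llvm.stepvector.nxv4i32()

; Widened start value of the reverse IV: lane i holds n - i.
define <vscale x 4 x i32> @reverse_iv_start(i32 %n) {
  %step = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()                                              ; 0, 1, 2, ...
  %n.ins = insertelement <vscale x 4 x i32> poison, i32 %n, i64 0
  %n.splat = shufflevector <vscale x 4 x i32> %n.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
  %neg.step = mul <vscale x 4 x i32> %step, splat (i32 -1)                                                ; 0, -1, -2, ...
  %induction = add <vscale x 4 x i32> %n.splat, %neg.step                                                 ; n, n-1, n-2, ...
  ret <vscale x 4 x i32> %induction
}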
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 @@ -73,9 +68,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -98,10 +90,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: 
WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -144,8 +135,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 @@ -158,27 +149,26 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 2 -; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 2 -; CHECK-NEXT: LV(REG): At #6 Interval # 3 -; CHECK-NEXT: LV(REG): At #7 Interval # 3 -; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 3 -; CHECK-NEXT: LV(REG): At #10 Interval # 3 +; CHECK-NEXT: LV(REG): At #3 Interval # 3 +; CHECK-NEXT: LV(REG): At #4 Interval # 3 +; CHECK-NEXT: LV(REG): At #5 Interval # 4 +; CHECK-NEXT: LV(REG): At #6 Interval # 4 +; CHECK-NEXT: LV(REG): At #7 Interval # 4 +; CHECK-NEXT: LV(REG): At #8 Interval # 4 +; CHECK-NEXT: LV(REG): At #9 Interval # 4 +; CHECK-NEXT: LV(REG): At #10 Interval # 4 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 2 -; CHECK-NEXT: LV(REG): At #13 Interval # 2 +; CHECK-NEXT: LV(REG): At #12 Interval # 3 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 23 +; CHECK-NEXT: LV: Loop cost is 27 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. 
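Before the final-VPlan checks below, a quick sketch of what the "WIDEN ... = load ..., stride = ir<-4>" recipe stands for: with 4-byte elements and an element stride of -1, the byte stride is 4 * -1 = -4, and the access lowers to a single strided VP load instead of the old contiguous load followed by a vector reverse. The IR below is a hand-written illustration assuming an nxv4i32 access with an all-true mask; the function and value names are made up, and only the intrinsics and the -4 stride come from the expected output:

declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr, i64, <vscale x 4 x i1>, i32)

; Old lowering: contiguous load of the reversed window, then an in-register reverse.
define <vscale x 4 x i32> @reverse_via_shuffle(ptr %window.base) {
  %wide = load <vscale x 4 x i32>, ptr %window.base, align 4
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide)
  ret <vscale x 4 x i32> %rev
}

; New lowering: one strided VP load starting at the current element with a byte stride
; of -4, which already yields the lanes in the reversed (original program) order.
define <vscale x 4 x i32> @reverse_via_strided_load(ptr %cur.elem, i32 %evl) {
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %cur.elem, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 4 x i32> %v
}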
@@ -229,13 +219,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4 ; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> ; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> +; CHECK-NEXT: EMIT vp<%3> = step-vector i32 +; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n> +; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1> +; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5> +; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6> +; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32 +; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7> +; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb ], [ vp<[[IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> @@ -244,6 +242,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> ; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: @@ -320,6 +319,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %19 = sub i64 %0, %n.vec ; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 ; CHECK-NEXT: %20 = sub i32 %n, %.cast +; CHECK-NEXT: %21 = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: %broadcast.splatinsert = insertelement poison, i32 %n, i64 0 +; CHECK-NEXT: %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer +; CHECK-NEXT: %22 = mul %21, splat (i32 -1) +; CHECK-NEXT: %induction = add %broadcast.splat, %22 +; CHECK-NEXT: %23 = trunc i64 %18 to i32 +; CHECK-NEXT: %24 = mul i32 -1, %23 +; CHECK-NEXT: %broadcast.splatinsert3 = insertelement poison, i32 %24, i64 0 +; CHECK-NEXT: %broadcast.splat4 = shufflevector %broadcast.splatinsert3, poison, zeroinitializer ; CHECK-NEXT: br ; CHECK-NEXT: LV: draw edge from vector.memcheck ; CHECK-NEXT: LV: created vector.body @@ -328,26 +336,28 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] -; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 -; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 -; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 -; CHECK-NEXT: %22 = zext i32 %21 to i64 -; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 -; 
CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0 -; CHECK-NEXT: %25 = trunc i64 %18 to i32 -; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) -; CHECK-NEXT: %26 = add %wide.strided.load, splat (i32 1) -; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22 -; CHECK-NEXT: %28 = mul i64 0, %18 -; CHECK-NEXT: %29 = sub i64 %18, 1 -; CHECK-NEXT: %30 = mul i64 -1, %29 -; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28 -; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %31, i64 %30 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %26) -; CHECK-NEXT: store %reverse, ptr %32, align 4 +; CHECK-NEXT: %vec.ind = phi +; CHECK-NEXT: %25 = add nsw %vec.ind, splat (i32 -1) +; CHECK-NEXT: %26 = zext %25 to +; CHECK-NEXT: %27 = extractelement %26, i32 0 +; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %B, i64 %27 +; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %28, i32 0 +; CHECK-NEXT: %30 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %29, i64 -4, splat (i1 true), i32 %30) +; CHECK-NEXT: %31 = add %wide.strided.load, splat (i32 1) +; CHECK-NEXT: %32 = extractelement %26, i32 0 +; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %A, i64 %32 +; CHECK-NEXT: %34 = mul i64 0, %18 +; CHECK-NEXT: %35 = sub i64 %18, 1 +; CHECK-NEXT: %36 = mul i64 -1, %35 +; CHECK-NEXT: %37 = getelementptr inbounds i32, ptr %33, i64 %34 +; CHECK-NEXT: %38 = getelementptr inbounds i32, ptr %37, i64 %36 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %31) +; CHECK-NEXT: store %reverse, ptr %38, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %33, , label %vector.body +; CHECK-NEXT: %vec.ind.next = add %vec.ind, %broadcast.splat4 +; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %39, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -364,7 +374,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body ; CHECK-NEXT: LV: draw edge from middle.block ; CHECK-NEXT: LV: draw edge from for.body.preheader @@ -374,12 +384,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: 
%arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: %add9 = add i32 %34, 1 +; CHECK-NEXT: %40 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: %add9 = add i32 %40, 1 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 @@ -432,18 +442,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. ; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 @@ -480,9 +485,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -505,10 +507,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = 
CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -551,8 +552,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 @@ -565,27 +566,26 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 2 -; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 2 -; CHECK-NEXT: LV(REG): At #6 Interval # 3 -; CHECK-NEXT: LV(REG): At #7 Interval # 3 -; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 3 -; CHECK-NEXT: LV(REG): At #10 Interval # 3 +; CHECK-NEXT: LV(REG): At #3 Interval # 3 +; CHECK-NEXT: LV(REG): At #4 Interval # 3 +; CHECK-NEXT: LV(REG): At #5 Interval # 4 +; CHECK-NEXT: LV(REG): At #6 Interval # 4 +; CHECK-NEXT: LV(REG): At #7 Interval # 4 +; CHECK-NEXT: LV(REG): At #8 Interval # 4 +; CHECK-NEXT: LV(REG): At #9 Interval # 4 +; CHECK-NEXT: LV(REG): At #10 Interval # 4 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 2 -; CHECK-NEXT: LV(REG): At #13 Interval # 2 +; CHECK-NEXT: LV(REG): At #12 Interval # 3 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; 
CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 25 +; CHECK-NEXT: LV: Loop cost is 29 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -636,13 +636,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4 ; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> ; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> +; CHECK-NEXT: EMIT vp<%3> = step-vector i32 +; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n> +; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1> +; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5> +; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6> +; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32 +; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7> +; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb ], [ vp<[[IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> @@ -651,6 +659,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> ; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: @@ -727,6 +736,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %19 = sub i64 %0, %n.vec ; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 ; CHECK-NEXT: %20 = sub i32 %n, %.cast +; CHECK-NEXT: %21 = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: %broadcast.splatinsert = insertelement poison, i32 %n, i64 0 +; CHECK-NEXT: %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer +; CHECK-NEXT: %22 = mul %21, splat (i32 -1) +; CHECK-NEXT: %induction = add %broadcast.splat, %22 +; CHECK-NEXT: %23 = trunc i64 %18 to i32 +; CHECK-NEXT: %24 = mul i32 -1, %23 +; CHECK-NEXT: %broadcast.splatinsert3 = insertelement poison, i32 %24, i64 0 +; CHECK-NEXT: %broadcast.splat4 = shufflevector %broadcast.splatinsert3, poison, zeroinitializer ; CHECK-NEXT: br ; CHECK-NEXT: LV: draw edge from vector.memcheck ; CHECK-NEXT: LV: created vector.body @@ -735,26 +753,28 @@ define void @vector_reverse_f32(ptr nocapture 
noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] -; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 -; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 -; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 -; CHECK-NEXT: %22 = zext i32 %21 to i64 -; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22 -; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0 -; CHECK-NEXT: %25 = trunc i64 %18 to i32 -; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) -; CHECK-NEXT: %26 = fadd %wide.strided.load, splat (float 1.000000e+00) -; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22 -; CHECK-NEXT: %28 = mul i64 0, %18 -; CHECK-NEXT: %29 = sub i64 %18, 1 -; CHECK-NEXT: %30 = mul i64 -1, %29 -; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28 -; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %26) -; CHECK-NEXT: store %reverse, ptr %32, align 4 +; CHECK-NEXT: %vec.ind = phi +; CHECK-NEXT: %25 = add nsw %vec.ind, splat (i32 -1) +; CHECK-NEXT: %26 = zext %25 to +; CHECK-NEXT: %27 = extractelement %26, i32 0 +; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %B, i64 %27 +; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %28, i32 0 +; CHECK-NEXT: %30 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %29, i64 -4, splat (i1 true), i32 %30) +; CHECK-NEXT: %31 = fadd %wide.strided.load, splat (float 1.000000e+00) +; CHECK-NEXT: %32 = extractelement %26, i32 0 +; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %A, i64 %32 +; CHECK-NEXT: %34 = mul i64 0, %18 +; CHECK-NEXT: %35 = sub i64 %18, 1 +; CHECK-NEXT: %36 = mul i64 -1, %35 +; CHECK-NEXT: %37 = getelementptr inbounds float, ptr %33, i64 %34 +; CHECK-NEXT: %38 = getelementptr inbounds float, ptr %37, i64 %36 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %31) +; CHECK-NEXT: store %reverse, ptr %38, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %33, , label %vector.body +; CHECK-NEXT: %vec.ind.next = add %vec.ind, %broadcast.splat4 +; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %39, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -771,7 +791,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body ; CHECK-NEXT: LV: draw edge from middle.block ; CHECK-NEXT: LV: draw edge from for.body.preheader @@ -781,12 +801,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, 
ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00 +; CHECK-NEXT: %40 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: %conv1 = fadd float %40, 1.000000e+00 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 From 806d586681c7ebe9f33ca89b8e28124bd47c9893 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 20 May 2025 00:42:15 -0700 Subject: [PATCH 03/16] [WIP][VPlan Based] Generate VPWidenStrideLoadRecipe in VPlanTransform Still rely on CM_Strided to known legal and cost. --- .../Transforms/Vectorize/LoopVectorize.cpp | 52 +++++-------------- .../Transforms/Vectorize/VPlanTransforms.cpp | 49 +++++++++++++++++ .../Transforms/Vectorize/VPlanTransforms.h | 4 ++ 3 files changed, 66 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2c742cf62927..e635256e96951 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1320,14 +1320,9 @@ class LoopVectorizationCostModel { /// that can be vectorized. bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const; - /// Get the stride of the strided memory access instruction \p Instr. Return 0 - /// if the instruction \p Instr is not considered for vectorization as a - /// strided memory access. - int64_t getStride(Instruction *Instr) const { - auto It = StrideInfo.find(Instr); - if (It != StrideInfo.end()) - return It->second; - return 0; + /// Get the stride information of the strided memory accesses. + SmallDenseMap getStrideInfo() const { + return StrideInfo; } /// Returns true if we're required to use a scalar epilogue for at least @@ -1721,7 +1716,7 @@ class LoopVectorizationCostModel { } /// The mapping of memory access instructions to their stride values. - DenseMap StrideInfo; + SmallDenseMap StrideInfo; public: /// The loop that we evaluate. @@ -7826,27 +7821,16 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, // reverse consecutive. LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); - - auto SameWiden = [&](ElementCount VF) -> bool { - return Decision == CM.getWideningDecision(I, VF); - }; - bool ContainsWidenVF = - LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range); - assert(ContainsWidenVF && - "At least widen the memory accesses by the Start VF."); - bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; - bool Strided = Decision == LoopVectorizationCostModel::CM_Strided; VPValue *Ptr = isa(I) ? 
Operands[0] : Operands[1]; - if (Consecutive || Strided) { + if (Consecutive) { auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; if (Reverse) { - assert(!Strided && "Reverse and Strided are mutually exclusive."); // When folding the tail, we may compute an address that we don't in the // original scalar loop and it may not be inbounds. Drop Inbounds in that // case. @@ -7858,30 +7842,17 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided, - GEP ? GEP->getNoWrapFlags() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); + VectorPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(I), /*Strided*/ false, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + I->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast(I)) { - if (Strided) { - const DataLayout &DL = Load->getDataLayout(); - auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType()); - int64_t Stride = CM.getStride(Load); - assert(Stride == -1 && - "Only stride memory access with a stride of -1 is supported."); - VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( - StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load)))); - return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(), - Mask, VPIRMetadata(*Load, LVer), - I->getDebugLoc()); - } + if (LoadInst *Load = dyn_cast(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, VPIRMetadata(*Load, LVer), I->getDebugLoc()); - } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, @@ -9032,6 +9003,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); + // !!! NEED COMMENT + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CM.getStrideInfo()); // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8b359d53e3afb..d83ec28e64f78 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2691,6 +2691,55 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } +void VPlanTransforms::convertToStridedAccesses( + VPlan &Plan, const SmallDenseMap &StrideInfo) { + // !!! FIXME: Should remove StrideInfo for next step. + if (Plan.hasScalarVFOnly() || StrideInfo.empty()) + return; + + // !!! FIXME: Should clamp VF for legal and cost in next step + SmallVector ToErase; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + // !!! FIXME: Should use LoadR->isReverse() for next step + if (auto *LoadR = dyn_cast(&R); + LoadR && !LoadR->isConsecutive()) { + auto *LI = cast(&LoadR->getIngredient()); + auto It = StrideInfo.find(LI); + if (It == StrideInfo.end()) + continue; + int64_t Stride = It->second; + assert(Stride == -1 && + "Only stride memory access with a stride of -1 is supported."); + // !!! 
FIXME: Should get VPVectorEndPointerRecipe for reverse + VPValue *Ptr = LoadR->getAddr(); + auto *GEP = dyn_cast( + Ptr->getUnderlyingValue()->stripPointerCasts()); + auto *NewPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(LI), /*Stride*/ true, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + LoadR->getDebugLoc()); + NewPtr->insertBefore(LoadR); + + const DataLayout &DL = LI->getDataLayout(); + auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); + VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( + StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI)))); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, + LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + ToErase.push_back(LoadR); + } + } + } + + for (VPRecipeBase *R : ToErase) + R->eraseFromParent(); +} + void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 8d2eded45da22..b863eb18a95da 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -175,6 +175,10 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); + // !!! NEED COMMENT + static void convertToStridedAccesses( + VPlan &Plan, const SmallDenseMap &StrideInfo); + /// Remove dead recipes from \p Plan. static void removeDeadRecipes(VPlan &Plan); From 831c7826cd54cb37ce48b0be9e9c98678093f01a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 00:10:24 -0700 Subject: [PATCH 04/16] [WIP][VPlan based] Clamp VF range in VPlan transformation --- .../Transforms/Vectorize/LoopVectorize.cpp | 79 +----- llvm/lib/Transforms/Vectorize/VPlan.h | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 25 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 92 +++--- .../Transforms/Vectorize/VPlanTransforms.h | 4 +- .../RISCV/riscv-vector-reverse.ll | 268 ++++++++---------- 6 files changed, 201 insertions(+), 273 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e635256e96951..5d51ead847e2d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1316,15 +1316,6 @@ class LoopVectorizationCostModel { return InterleaveInfo.getInterleaveGroup(Instr); } - /// Returns true if \p I is a memory instruction with strided memory access - /// that can be vectorized. - bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const; - - /// Get the stride information of the strided memory accesses. - SmallDenseMap getStrideInfo() const { - return StrideInfo; - } - /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. bool requiresScalarEpilogue(bool IsVectorizing) const { @@ -1572,10 +1563,6 @@ class LoopVectorizationCostModel { /// element) InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); - /// The cost computation for strided load/store instruction. - InstructionCost getStridedLoadStoreCost(Instruction *I, - ElementCount VF) const; - /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. 
InstructionCost getScalarizationOverhead(Instruction *I, @@ -1715,9 +1702,6 @@ class LoopVectorizationCostModel { Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// The mapping of memory access instructions to their stride values. - SmallDenseMap StrideInfo; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -3293,31 +3277,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( return true; } -bool LoopVectorizationCostModel::stridedAccessCanBeWidened( - Instruction *I, ElementCount VF) const { - // Get and ensure we have a valid memory instruction. - assert((isa(I)) && "Invalid memory instruction"); - - // Only support strided access for vector VF. - if (!VF.isVector()) - return false; - - // FIXME: Remove this check for StoreInst after strided store is supported. - if (isa(I)) - return false; - - [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I); - auto *ScalarTy = getLoadStoreType(I); - // TODO: Support non-unit-reverse strided accesses. Add stride analysis here - // to ensure that the accessed addresses are evenly spaced apart by a fixed - // stride. - assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 && - "Only supports strided accesses with a stride of -1"); - - const Align Alignment = getLoadStoreAlignment(I); - return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment); -} - void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which @@ -5473,19 +5432,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } -InstructionCost -LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I, - ElementCount VF) const { - Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast(toVectorTy(ValTy, VF)); - const Align Alignment = getLoadStoreAlignment(I); - const Value *Ptr = getLoadStorePointerOperand(I); - - return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment, - CostKind, I); -} - std::optional LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, ElementCount VF, @@ -5805,17 +5751,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; - // Consider using strided load/store for consecutive reverse accesses to - // achieve more efficient memory operations. - if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) { - const InstructionCost StridedLoadStoreCost = - getStridedLoadStoreCost(&I, VF); - if (StridedLoadStoreCost < Cost) { - Decision = CM_Strided; - Cost = StridedLoadStoreCost; - StrideInfo[&I] = ConsecutiveStride; - } - } setWideningDecision(&I, VF, Decision, Cost); continue; } @@ -8986,12 +8921,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. - if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); + if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - } + + // !!! 
NEED COMMENT + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CostCtx, Range); for (ElementCount VF : Range) Plan->addVF(VF); @@ -9003,9 +8941,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); - // !!! NEED COMMENT - VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, - CM.getStrideInfo()); // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 569869e8e4bd4..c9e51d9abaf90 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1725,6 +1725,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) + VPValue *getPtr() const { return getOperand(0); } + VPValue *getVFValue() { return getOperand(1); } const VPValue *getVFValue() const { return getOperand(1); } @@ -3089,10 +3091,6 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, /// Generate a strided load. void execute(VPTransformState &State) override; - /// Return the cost of this VPWidenStridedLoadRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f05c5b178a3e5..8886bc8765b2e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3073,9 +3073,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); - unsigned Opcode = isa(this) - ? Instruction::Load - : Instruction::Store; + unsigned Opcode = + isa( + this) + ? Instruction::Load + : Instruction::Store; if (!Consecutive) { // TODO: Using the original IR may not be accurate. 
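For context on the cost hunks above and below: once the strided-load cost query is folded into VPWidenMemoryRecipe::computeCost, the per-VF profitability test that the IsProfitable lambda performs in convertToStridedAccesses reduces to one legality check plus one TTI cost comparison. A minimal standalone sketch of that comparison follows; the helper name stridedIsCheaper and the CurrentCost parameter are illustrative only and not part of the patch, while the TTI calls are the ones the patch itself uses.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch: a strided load is only chosen when it is legal for the target and
// strictly cheaper than the current (reverse) widening cost, which in the
// patch comes from VPWidenMemoryRecipe::computeCost.
static bool stridedIsCheaper(const TargetTransformInfo &TTI,
                             VectorType *DataTy, const Value *Ptr,
                             bool IsMasked, Align Alignment,
                             TargetTransformInfo::TargetCostKind CostKind,
                             const Instruction *Ingredient,
                             InstructionCost CurrentCost) {
  if (!TTI.isLegalStridedLoadStore(DataTy, Alignment))
    return false;
  const InstructionCost StridedCost = TTI.getStridedMemoryOpCost(
      Instruction::Load, DataTy, Ptr, IsMasked, Alignment, CostKind,
      Ingredient);
  return StridedCost < CurrentCost;
}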
@@ -3084,6 +3086,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, const Value *Ptr = getLoadStorePointerOperand(&Ingredient); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + if (isa(this)) + return Ctx.TTI.getStridedMemoryOpCost( + Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); + return Ctx.TTI.getAddressComputationCost(Ty) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); @@ -3276,18 +3283,6 @@ void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost -VPWidenStridedLoadRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); - const Align Alignment = getLoadStoreAlignment(&Ingredient); - const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - - return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr, - IsMasked, Alignment, Ctx.CostKind, - &Ingredient); -} - void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d83ec28e64f78..7a67df4cd7b6e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2691,48 +2691,68 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } -void VPlanTransforms::convertToStridedAccesses( - VPlan &Plan, const SmallDenseMap &StrideInfo) { - // !!! FIXME: Should remove StrideInfo for next step. - if (Plan.hasScalarVFOnly() || StrideInfo.empty()) +void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + if (Plan.hasScalarVFOnly()) return; - // !!! FIXME: Should clamp VF for legal and cost in next step SmallVector ToErase; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - // !!! FIXME: Should use LoadR->isReverse() for next step - if (auto *LoadR = dyn_cast(&R); - LoadR && !LoadR->isConsecutive()) { - auto *LI = cast(&LoadR->getIngredient()); - auto It = StrideInfo.find(LI); - if (It == StrideInfo.end()) - continue; - int64_t Stride = It->second; - assert(Stride == -1 && - "Only stride memory access with a stride of -1 is supported."); - // !!! FIXME: Should get VPVectorEndPointerRecipe for reverse - VPValue *Ptr = LoadR->getAddr(); - auto *GEP = dyn_cast( - Ptr->getUnderlyingValue()->stripPointerCasts()); - auto *NewPtr = new VPVectorPointerRecipe( - Ptr, getLoadStoreType(LI), /*Stride*/ true, - GEP ? 
GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - LoadR->getDebugLoc()); - NewPtr->insertBefore(LoadR); - - const DataLayout &DL = LI->getDataLayout(); - auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); - VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( - StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI)))); - auto *StridedLoad = new VPWidenStridedLoadRecipe( - *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, - LoadR->getDebugLoc()); - StridedLoad->insertBefore(LoadR); - LoadR->replaceAllUsesWith(StridedLoad); - ToErase.push_back(LoadR); - } + auto *MemR = dyn_cast(&R); + // TODO: support strided store + // TODO: support strided accesses with stride not equal to -1 + if (!MemR || !isa(MemR) || !MemR->isReverse()) + continue; + + Instruction &Ingredient = MemR->getIngredient(); + Type *ElementTy = getLoadStoreType(&Ingredient); + + auto IsProfitable = [&](ElementCount VF) -> bool { + Type *DataTy = toVectorTy(ElementTy, VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) + return false; + const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); + const InstructionCost StridedLoadStoreCost = + Ctx.TTI.getStridedMemoryOpCost( + Ingredient.getOpcode(), DataTy, + getLoadStorePointerOperand(&Ingredient), MemR->isMasked(), + Alignment, Ctx.CostKind, &Ingredient); + return StridedLoadStoreCost < CurrentCost; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable, + Range)) + continue; + + // The stride of consecutive reverse access must be -1. + int64_t Stride = -1; + auto *VecEndPtr = cast(MemR->getAddr()); + VPValue *Ptr = VecEndPtr->getPtr(); + auto *GEP = dyn_cast( + Ptr->getUnderlyingValue()->stripPointerCasts()); + // Create a new vector pointer for strided access. + auto *NewPtr = new VPVectorPointerRecipe( + Ptr, ElementTy, /*Stride=*/ true, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + VecEndPtr->getDebugLoc()); + NewPtr->insertBefore(MemR); + + auto *LoadR = cast(MemR); + auto *LI = cast(&Ingredient); + const DataLayout &DL = LI->getDataLayout(); + auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); + VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( + StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, + LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + + ToErase.append({LoadR, VecEndPtr}); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b863eb18a95da..e4feb14275d2a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -176,8 +176,8 @@ struct VPlanTransforms { VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); // !!! NEED COMMENT - static void convertToStridedAccesses( - VPlan &Plan, const SmallDenseMap &StrideInfo); + static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 61c380ca079b9..97afa9f87ac24 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -25,15 +25,20 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. ; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -68,6 +73,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 
%indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -90,9 +98,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -135,10 +144,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -149,26 +158,27 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 3 -; CHECK-NEXT: LV(REG): At #4 Interval # 3 -; CHECK-NEXT: LV(REG): At #5 Interval # 4 -; CHECK-NEXT: LV(REG): At #6 Interval # 4 -; CHECK-NEXT: LV(REG): At #7 Interval # 4 -; CHECK-NEXT: LV(REG): At #8 Interval # 4 -; CHECK-NEXT: LV(REG): At #9 Interval # 4 -; CHECK-NEXT: LV(REG): At #10 Interval # 4 +; CHECK-NEXT: LV(REG): At #3 Interval # 2 +; CHECK-NEXT: LV(REG): At #4 Interval # 2 +; CHECK-NEXT: LV(REG): At #5 Interval # 2 +; CHECK-NEXT: LV(REG): At #6 Interval # 3 +; CHECK-NEXT: LV(REG): At #7 Interval # 3 +; CHECK-NEXT: LV(REG): At #8 Interval # 3 +; 
CHECK-NEXT: LV(REG): At #9 Interval # 3 +; CHECK-NEXT: LV(REG): At #10 Interval # 3 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 3 +; CHECK-NEXT: LV(REG): At #12 Interval # 2 +; CHECK-NEXT: LV(REG): At #13 Interval # 2 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 27 +; CHECK-NEXT: LV: Loop cost is 24 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -219,31 +229,22 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4 ; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> ; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> -; CHECK-NEXT: EMIT vp<%3> = step-vector i32 -; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n> -; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1> -; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5> -; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6> -; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32 -; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7> -; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb ], [ vp<[[IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 -; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> -; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> -; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> -; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> -; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 -; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride 
= ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: @@ -319,15 +320,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %19 = sub i64 %0, %n.vec ; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 ; CHECK-NEXT: %20 = sub i32 %n, %.cast -; CHECK-NEXT: %21 = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: %broadcast.splatinsert = insertelement poison, i32 %n, i64 0 -; CHECK-NEXT: %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer -; CHECK-NEXT: %22 = mul %21, splat (i32 -1) -; CHECK-NEXT: %induction = add %broadcast.splat, %22 -; CHECK-NEXT: %23 = trunc i64 %18 to i32 -; CHECK-NEXT: %24 = mul i32 -1, %23 -; CHECK-NEXT: %broadcast.splatinsert3 = insertelement poison, i32 %24, i64 0 -; CHECK-NEXT: %broadcast.splat4 = shufflevector %broadcast.splatinsert3, poison, zeroinitializer ; CHECK-NEXT: br ; CHECK-NEXT: LV: draw edge from vector.memcheck ; CHECK-NEXT: LV: created vector.body @@ -336,28 +328,26 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] -; CHECK-NEXT: %vec.ind = phi -; CHECK-NEXT: %25 = add nsw %vec.ind, splat (i32 -1) -; CHECK-NEXT: %26 = zext %25 to -; CHECK-NEXT: %27 = extractelement %26, i32 0 -; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %B, i64 %27 -; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %28, i32 0 -; CHECK-NEXT: %30 = trunc i64 %18 to i32 -; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %29, i64 -4, splat (i1 true), i32 %30) -; CHECK-NEXT: %31 = add %wide.strided.load, splat (i32 1) -; CHECK-NEXT: %32 = extractelement %26, i32 0 -; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %A, i64 %32 -; CHECK-NEXT: %34 = mul i64 0, %18 -; CHECK-NEXT: %35 = sub i64 %18, 1 -; CHECK-NEXT: %36 = mul i64 -1, %35 -; CHECK-NEXT: %37 = getelementptr inbounds i32, ptr %33, i64 %34 -; CHECK-NEXT: %38 = getelementptr inbounds i32, ptr %37, i64 %36 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %31) -; CHECK-NEXT: store %reverse, ptr %38, align 4 +; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 +; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 +; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 +; CHECK-NEXT: %22 = zext i32 %21 to i64 +; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 +; CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0 +; CHECK-NEXT: %25 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) +; CHECK-NEXT: %26 = add %wide.strided.load, splat (i32 1) +; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22 +; CHECK-NEXT: %28 = mul i64 0, %18 +; CHECK-NEXT: %29 = sub i64 %18, 1 +; CHECK-NEXT: %30 = mul i64 -1, %29 +; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28 +; CHECK-NEXT: %32 = 
getelementptr inbounds i32, ptr %31, i64 %30 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %26) +; CHECK-NEXT: store %reverse, ptr %32, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %vec.ind.next = add %vec.ind, %broadcast.splat4 -; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %39, , label %vector.body +; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %33, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -374,7 +364,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body ; CHECK-NEXT: LV: draw edge from middle.block ; CHECK-NEXT: LV: draw edge from for.body.preheader @@ -384,12 +374,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ] ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: %40 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: %add9 = add i32 %40, 1 +; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: %add9 = add i32 %34, 1 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 @@ -442,15 +432,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -485,6 +480,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -507,9 +505,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: 
vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -552,10 +551,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -566,26 +565,27 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 3 -; CHECK-NEXT: LV(REG): At #4 Interval # 3 -; CHECK-NEXT: LV(REG): At #5 Interval # 4 -; CHECK-NEXT: LV(REG): At #6 Interval # 4 -; CHECK-NEXT: LV(REG): At #7 Interval # 4 -; CHECK-NEXT: LV(REG): At #8 Interval # 4 -; CHECK-NEXT: LV(REG): At #9 Interval # 4 -; CHECK-NEXT: LV(REG): At #10 Interval # 4 +; CHECK-NEXT: LV(REG): At #3 Interval # 2 +; CHECK-NEXT: LV(REG): At #4 Interval # 2 +; CHECK-NEXT: LV(REG): At #5 Interval # 2 +; CHECK-NEXT: LV(REG): At #6 Interval # 3 +; CHECK-NEXT: LV(REG): At #7 Interval # 3 +; CHECK-NEXT: LV(REG): At #8 Interval # 3 +; CHECK-NEXT: LV(REG): At #9 Interval # 3 +; CHECK-NEXT: LV(REG): At #10 Interval # 3 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 3 +; CHECK-NEXT: LV(REG): At #12 Interval # 2 +; CHECK-NEXT: LV(REG): At #13 Interval # 2 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-NEXT: LV(REG): RegisterClass: 
RISCV::GPRRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 29 +; CHECK-NEXT: LV: Loop cost is 26 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -636,21 +636,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4 ; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> ; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> -; CHECK-NEXT: EMIT vp<%3> = step-vector i32 -; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n> -; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1> -; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5> -; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6> -; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32 -; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7> -; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb ], [ vp<[[IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> @@ -659,7 +651,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> ; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 -; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: @@ -736,15 +727,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %19 = sub i64 %0, %n.vec ; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 ; CHECK-NEXT: %20 = sub i32 %n, %.cast -; CHECK-NEXT: %21 = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: %broadcast.splatinsert = insertelement poison, i32 %n, i64 0 -; CHECK-NEXT: %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer -; CHECK-NEXT: %22 = mul %21, splat (i32 -1) -; CHECK-NEXT: %induction = add %broadcast.splat, %22 -; CHECK-NEXT: %23 = trunc i64 %18 to i32 -; CHECK-NEXT: %24 = mul i32 -1, %23 -; CHECK-NEXT: %broadcast.splatinsert3 = insertelement poison, i32 %24, i64 0 -; CHECK-NEXT: %broadcast.splat4 = shufflevector %broadcast.splatinsert3, poison, zeroinitializer ; CHECK-NEXT: br ; 
CHECK-NEXT: LV: draw edge from vector.memcheck ; CHECK-NEXT: LV: created vector.body @@ -753,28 +735,26 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] -; CHECK-NEXT: %vec.ind = phi -; CHECK-NEXT: %25 = add nsw %vec.ind, splat (i32 -1) -; CHECK-NEXT: %26 = zext %25 to -; CHECK-NEXT: %27 = extractelement %26, i32 0 -; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %B, i64 %27 -; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %28, i32 0 -; CHECK-NEXT: %30 = trunc i64 %18 to i32 -; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %29, i64 -4, splat (i1 true), i32 %30) -; CHECK-NEXT: %31 = fadd %wide.strided.load, splat (float 1.000000e+00) -; CHECK-NEXT: %32 = extractelement %26, i32 0 -; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %A, i64 %32 -; CHECK-NEXT: %34 = mul i64 0, %18 -; CHECK-NEXT: %35 = sub i64 %18, 1 -; CHECK-NEXT: %36 = mul i64 -1, %35 -; CHECK-NEXT: %37 = getelementptr inbounds float, ptr %33, i64 %34 -; CHECK-NEXT: %38 = getelementptr inbounds float, ptr %37, i64 %36 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %31) -; CHECK-NEXT: store %reverse, ptr %38, align 4 +; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 +; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 +; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 +; CHECK-NEXT: %22 = zext i32 %21 to i64 +; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22 +; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0 +; CHECK-NEXT: %25 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) +; CHECK-NEXT: %26 = fadd %wide.strided.load, splat (float 1.000000e+00) +; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22 +; CHECK-NEXT: %28 = mul i64 0, %18 +; CHECK-NEXT: %29 = sub i64 %18, 1 +; CHECK-NEXT: %30 = mul i64 -1, %29 +; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28 +; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %26) +; CHECK-NEXT: store %reverse, ptr %32, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %vec.ind.next = add %vec.ind, %broadcast.splat4 -; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %39, , label %vector.body +; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %33, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge from vector.body ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block @@ -791,7 +771,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body ; CHECK-NEXT: LV: draw edge from 
middle.block ; CHECK-NEXT: LV: draw edge from for.body.preheader @@ -801,12 +781,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ] ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: %40 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: %conv1 = fadd float %40, 1.000000e+00 +; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 From 32ebb119043e573f083d7dd4d82d6e60c2ad98de Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 00:15:28 -0700 Subject: [PATCH 05/16] [WIP][VPlan based] Time to remove CM_Strided --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5d51ead847e2d..8a3a84431c6bf 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1085,7 +1085,6 @@ class LoopVectorizationCostModel { CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, - CM_Strided, CM_Scalarize, CM_VectorCall, CM_IntrinsicCall @@ -6397,8 +6396,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { - // TODO: New CastContextHint for strided accesses. - case LoopVectorizationCostModel::CM_Strided: case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: From 96da92eb865e79b701e695341a3df2fef607ff19 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 00:58:56 -0700 Subject: [PATCH 06/16] [VPlan based] Patch comments, nfc --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++++--- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8a3a84431c6bf..d2d44e170fde3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8914,17 +8914,18 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. 
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - // !!! NEED COMMENT + // Convert reverse memory recipes to strided access recipes if the strided + // access is legal and profitable. VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, CostCtx, Range); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index e4feb14275d2a..1f0404b63248d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -175,7 +175,9 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); - // !!! NEED COMMENT + /// Transform reverse memory recipes into strided access recipes when legal + /// and profitable. Clamps \p Range to maintain consistency with widen + /// decisions of \p Plan, and uses \p Ctx to evaluate the cost. static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, VFRange &Range); From d340701cb7c0c0d21b3dcc84be206a6b345f2a50 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 01:21:22 -0700 Subject: [PATCH 07/16] Format --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7a67df4cd7b6e..a452e65d34b0f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2734,18 +2734,18 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); // Create a new vector pointer for strided access. - auto *NewPtr = new VPVectorPointerRecipe( - Ptr, ElementTy, /*Stride=*/ true, - GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - VecEndPtr->getDebugLoc()); + auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), + VecEndPtr->getDebugLoc()); NewPtr->insertBefore(MemR); auto *LoadR = cast(MemR); auto *LI = cast(&Ingredient); const DataLayout &DL = LI->getDataLayout(); auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); - VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( - StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); + VPValue *StrideVPV = Plan.getOrAddLiveIn( + ConstantInt::get(StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); auto *StridedLoad = new VPWidenStridedLoadRecipe( *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, LoadR->getDebugLoc()); From 1add5bc2d39b3b575c2e69abb17bd838af8f313b Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 2 Jun 2025 02:27:25 -0700 Subject: [PATCH 08/16] [Unrelated code] Remove Mel-Chen:legalizeAndOptimizeInductions We should reopen it after supporting const strided accessess. 
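For orientation, the reverse loops this transform targets, and which the riscv-vector-reverse test updates throughout this series keep checking, correspond roughly to the following scalar source (the i32 flavor). This is a reconstruction from the test IR with illustrative names, not the original source file.

// Each iteration reads B[i-1] and writes A[i-1] while i counts down from n,
// so the widened loads touch consecutive decreasing addresses, which is
// exactly the pattern convertToStridedAccesses rewrites into a stride of -1.
void reverse_add_one(int *A, const int *B, int n) {
  for (int i = n; i > 0; --i)
    A[i - 1] = B[i - 1] + 1;
}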
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a452e65d34b0f..b9c93f39da7e8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -627,14 +627,12 @@ static SmallVector collectUsersRecursively(VPValue *V) { static void legalizeAndOptimizeInductions(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - SmallVector InductionPhis; - for (VPRecipeBase &R : HeaderVPBB->phis()) - if (auto *IV = dyn_cast(&R)) - InductionPhis.push_back(IV); - bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); - VPBuilder Builder; - for (VPWidenInductionRecipe *PhiR : reverse(InductionPhis)) { + VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *PhiR = dyn_cast(&Phi); + if (!PhiR) + continue; // Try to narrow wide and replicating recipes to uniform recipes, based on // VPlan analysis. // TODO: Apply to all recipes in the future, to replace legacy uniformity @@ -644,8 +642,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { auto *Def = dyn_cast(U); auto *RepR = dyn_cast(U); // Skip recipes that shouldn't be narrowed. - if (!Def || - !isa(Def) || + if (!Def || !isa(Def) || Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))) continue; @@ -658,13 +655,11 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Def->operands(), /*IsUniform*/ true); Clone->insertAfter(Def); Def->replaceAllUsesWith(Clone); - Def->eraseFromParent(); } - Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). - if (auto *PtrIV = dyn_cast(PhiR)) { + if (auto *PtrIV = dyn_cast(&Phi)) { if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; @@ -685,7 +680,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace widened induction with scalar steps for users that only use // scalars. - auto *WideIV = cast(PhiR); + auto *WideIV = cast(&Phi); if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) { return U->usesScalars(WideIV); })) From 78fc57e0d6ed664e3da2025f89842f20710af612 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 2 Jun 2025 02:41:39 -0700 Subject: [PATCH 09/16] [Fix] Remove unused debug info setting. --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8886bc8765b2e..c992d570a3c6e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3247,7 +3247,6 @@ void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { const Align Alignment = getLoadStoreAlignment(&Ingredient); auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); Value *Addr = State.get(getAddr(), /*IsScalar*/ true); Value *Stride = State.get(getStride(), /*IsScalar*/ true); Value *Mask = nullptr; From 0f8cfb406ed81c906bdf0738e8603e1b91dca90f Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 2 Jun 2025 02:49:02 -0700 Subject: [PATCH 10/16] [Fix] Set Opcode as Instruction::Load directly. 
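The remaining fixes touch how the strided load is costed and materialized. As a reference for the final lowering, including the later "[Fix] Pass stride in element size" change that scales the element stride to bytes inside execute(), here is a rough standalone IRBuilder sketch. The helper emitStridedLoad and its parameter list are illustrative names; the intrinsic-building pattern mirrors VPWidenStridedLoadRecipe::execute and the vp.strided.load calls checked in the tests.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Build a call roughly equivalent to the tests' pattern:
//   %wide.strided.load = call <vscale x 4 x float>
//     @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(
//         ptr align 4 %addr, i64 -4, <vscale x 4 x i1> %mask, i32 %evl)
// StrideInElems is the VPlan-level stride (-1 for reverse accesses); it is
// scaled to a byte stride here, as the later fix does in execute().
static CallInst *emitStridedLoad(IRBuilder<> &Builder, Type *DataTy,
                                 Value *Addr, Value *StrideInElems,
                                 Value *Mask, Value *EVL, Type *ScalarTy,
                                 Align Alignment, const DataLayout &DL) {
  Type *StrideTy = StrideInElems->getType();
  Value *StrideInBytes = Builder.CreateMul(
      StrideInElems,
      ConstantInt::get(StrideTy, DL.getTypeAllocSize(ScalarTy)));
  CallInst *Load = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {DataTy, Addr->getType(), StrideTy},
      {Addr, StrideInBytes, Mask, EVL}, /*FMFSource=*/nullptr,
      "wide.strided.load");
  Load->addParamAttr(
      0, Attribute::getWithAlignment(Load->getContext(), Alignment));
  return Load;
}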
--- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b9c93f39da7e8..d5bc733b0e669 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2712,7 +2712,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); const InstructionCost StridedLoadStoreCost = Ctx.TTI.getStridedMemoryOpCost( - Ingredient.getOpcode(), DataTy, + Instruction::Load, DataTy, getLoadStorePointerOperand(&Ingredient), MemR->isMasked(), Alignment, Ctx.CostKind, &Ingredient); return StridedLoadStoreCost < CurrentCost; From e27b3da630b259275e83af4785fc63b08a33154e Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 2 Jun 2025 06:13:36 -0700 Subject: [PATCH 11/16] [Fix] Replace getLoadStorePointerOperand with Ptr->getUnderlyingValue(). --- .../Transforms/Vectorize/VPlanTransforms.cpp | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d5bc733b0e669..0f2e88362432c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2701,6 +2701,14 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, if (!MemR || !isa(MemR) || !MemR->isReverse()) continue; + auto *VecEndPtr = cast(MemR->getAddr()); + VPValue *Ptr = VecEndPtr->getPtr(); + Value *PtrUV = Ptr->getUnderlyingValue(); + // Memory cost model requires the pointer operand of memory access + // instruction. + if (!PtrUV) + continue; + Instruction &Ingredient = MemR->getIngredient(); Type *ElementTy = getLoadStoreType(&Ingredient); @@ -2711,10 +2719,9 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, return false; const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); const InstructionCost StridedLoadStoreCost = - Ctx.TTI.getStridedMemoryOpCost( - Instruction::Load, DataTy, - getLoadStorePointerOperand(&Ingredient), MemR->isMasked(), - Alignment, Ctx.CostKind, &Ingredient); + Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV, + MemR->isMasked(), Alignment, + Ctx.CostKind, &Ingredient); return StridedLoadStoreCost < CurrentCost; }; @@ -2724,10 +2731,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, // The stride of consecutive reverse access must be -1. int64_t Stride = -1; - auto *VecEndPtr = cast(MemR->getAddr()); - VPValue *Ptr = VecEndPtr->getPtr(); - auto *GEP = dyn_cast( - Ptr->getUnderlyingValue()->stripPointerCasts()); + auto *GEP = dyn_cast(PtrUV->stripPointerCasts()); // Create a new vector pointer for strided access. auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true, GEP ? 
GEP->getNoWrapFlags() From f6a722f5827a90676904c4c54e10ef4d235c463d Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 11 Jun 2025 00:42:09 -0700 Subject: [PATCH 12/16] [Fix] Pass stride in element size --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++++- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++-- .../LoopVectorize/RISCV/riscv-vector-reverse.ll | 8 ++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c992d570a3c6e..0d1209004dfad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3259,9 +3259,12 @@ void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { auto *PtrTy = Addr->getType(); auto *StrideTy = Stride->getType(); + const DataLayout &DL = Ingredient.getDataLayout(); + Value *StrideInBytes = Builder.CreateMul( + Stride, ConstantInt::get(StrideTy, DL.getTypeAllocSize(ScalarDataTy))); CallInst *NewLI = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, - {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load"); NewLI->addParamAttr( 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); applyMetadata(*NewLI); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0f2e88362432c..1fe2f58f80789 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2743,8 +2743,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, auto *LI = cast(&Ingredient); const DataLayout &DL = LI->getDataLayout(); auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); - VPValue *StrideVPV = Plan.getOrAddLiveIn( - ConstantInt::get(StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); + VPValue *StrideVPV = + Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride)); auto *StridedLoad = new VPWidenStridedLoadRecipe( *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, LoadR->getDebugLoc()); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 97afa9f87ac24..31681824624be 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -104,7 +104,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> -; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> @@ -238,7 +238,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, 
ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> -; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> @@ -511,7 +511,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> -; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> @@ -645,7 +645,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> -; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> From fe13002cae3582a3a9adab820baef487fd79354c Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 11 Jun 2025 03:51:39 -0700 Subject: [PATCH 13/16] [Fix] New operand Stride for VPVectorPointerRecipe --- .../Transforms/Vectorize/LoopVectorize.cpp | 11 +++++++---- llvm/lib/Transforms/Vectorize/VPlan.h | 19 ++++++++----------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 18 +++++++++++------- .../Transforms/Vectorize/VPlanTransforms.cpp | 15 +++++++-------- .../RISCV/riscv-vector-reverse.ll | 8 ++++---- .../LoopVectorize/vplan-dot-printing.ll | 4 ++-- 6 files changed, 39 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d2d44e170fde3..3910169a346c0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7774,10 +7774,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe( - Ptr, getLoadStoreType(I), /*Strided*/ false, - GEP ? 
GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - I->getDebugLoc()); + const DataLayout &DL = I->getDataLayout(); + auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType()); + VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1)); + VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), + I->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c9e51d9abaf90..b012c0149b39a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1767,24 +1767,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, }; /// A recipe to compute the pointers for widened memory accesses of IndexTy. -/// Supports both consecutive and reverse consecutive accesses. -/// TODO: Support non-unit strided accesses . class VPVectorPointerRecipe : public VPRecipeWithIRFlags, - public VPUnrollPartAccessor<1> { + public VPUnrollPartAccessor<2> { Type *IndexedTy; - /// Indicate whether to compute the pointer for strided memory accesses. - bool Strided; - public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided, + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), - GEPFlags, DL), - IndexedTy(IndexedTy), Strided(Strided) {} + : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, + ArrayRef({Ptr, Stride}), GEPFlags, DL), + IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) + VPValue *getStride() const { return getOperand(1); } + void execute(VPTransformState &State) override; bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -1802,7 +1799,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getOperand(1), getGEPNoWrapFlags(), getDebugLoc()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 0d1209004dfad..53dab59316126 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2384,16 +2384,20 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; unsigned CurrentPart = getUnrollPart(*this); - Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, - /*IsUnitStride*/ true, CurrentPart, Builder); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + + auto *StrideC = dyn_cast(Stride); + bool IsStrideOne = StrideC && StrideC->isOne(); + bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne()); + Type *IndexTy = + getGEPIndexTy(State.VF.isScalable(), + /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder); Value *Ptr = State.get(getOperand(0), VPLane(0)); + Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); - // TODO: Support non-unit-reverse strided accesses. - Value *Index = - Strided - ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1)) - : Increment; + Value *Index = IsStrideOne ? 
Increment : Builder.CreateMul(Increment, Stride); + Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1fe2f58f80789..948145e632afd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2732,22 +2732,21 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, // The stride of consecutive reverse access must be -1. int64_t Stride = -1; auto *GEP = dyn_cast(PtrUV->stripPointerCasts()); + const DataLayout &DL = Ingredient.getDataLayout(); + auto *StrideTy = DL.getIndexType(PtrUV->getType()); + VPValue *StrideVPV = + Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride)); // Create a new vector pointer for strided access. - auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true, + auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, StrideVPV, GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), VecEndPtr->getDebugLoc()); NewPtr->insertBefore(MemR); auto *LoadR = cast(MemR); - auto *LI = cast(&Ingredient); - const DataLayout &DL = LI->getDataLayout(); - auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); - VPValue *StrideVPV = - Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride)); auto *StridedLoad = new VPWidenStridedLoadRecipe( - *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, - LoadR->getDebugLoc()); + *cast(&Ingredient), NewPtr, StrideVPV, &Plan.getVF(), + LoadR->getMask(), *LoadR, LoadR->getDebugLoc()); StridedLoad->insertBefore(LoadR); LoadR->replaceAllUsesWith(StridedLoad); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 31681824624be..ea193aff5593b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -103,7 +103,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> -; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> @@ -237,7 +237,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> -; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> @@ -510,7 +510,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur 
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> -; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> @@ -644,7 +644,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> ; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> -; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]> ; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll index 528f2448616e8..2c757021e76ff 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" + ; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" + +; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>, ir\<1\>\l" + ; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" + ; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" + +; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>, ir\<1\>\l" + ; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" + ; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" + ; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" + From d4ea1d663270b07f5621a1b96c6a6591d06d475f Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 16 Jun 2025 02:17:19 -0700 Subject: [PATCH 14/16] [Comment] Remove the unrelated change, nfc --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 948145e632afd..5c1dfeea81205 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -633,6 +633,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { auto *PhiR = dyn_cast(&Phi); if (!PhiR) continue; + // Try to narrow wide and replicating recipes 
to uniform recipes, based on // VPlan analysis. // TODO: Apply to all recipes in the future, to replace legacy uniformity From bb03212376c41e3a34ee1fb2424b0b5f2e56a853 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 16 Jun 2025 02:25:50 -0700 Subject: [PATCH 15/16] [Comment] Add assert for consecutive, nfc --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 5c1dfeea81205..89573b9f18033 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2702,6 +2702,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, if (!MemR || !isa(MemR) || !MemR->isReverse()) continue; + assert(MemR->isConsecutive() && "Reverse access must be consecutive"); + auto *VecEndPtr = cast(MemR->getAddr()); VPValue *Ptr = VecEndPtr->getPtr(); Value *PtrUV = Ptr->getUnderlyingValue(); From d9e53618e4c97a46c632c44f084ff79d60352e64 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 16 Jun 2025 03:06:21 -0700 Subject: [PATCH 16/16] [Comment] Update comment of VPVectorPointerRecipe --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index b012c0149b39a..7ca9eedc46cf0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1766,7 +1766,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, #endif }; -/// A recipe to compute the pointers for widened memory accesses of IndexTy. +/// A recipe to compute the pointers for widened memory accesses of IndexedTy, +/// with the Stride expressed in units of IndexedTy. class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<2> { Type *IndexedTy;
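With the comment update in PATCH 16, the unit convention is explicit: the Stride operand of VPVectorPointerRecipe is counted in elements of IndexedTy. At execute time the pointer for unroll part Part is offset by Part * VF * Stride elements from the scalar base (the CreateMul of the VF step and the stride), while VPWidenStridedLoadRecipe, per PATCH 12, multiplies the element stride by getTypeAllocSize to produce the byte stride passed to experimental_vp_strided_load. The sketch below covers only that arithmetic, assuming 4-byte elements and a fixed VF of 4; the helper names are illustrative, not LLVM API.

  #include <cstdint>
  #include <cstdio>

  // Only the addressing math, not LLVM API code.  The recipe's stride is
  // in elements of the indexed type; each unroll part advances the
  // pointer by Part * VF * Stride elements, and the strided load scales
  // the element stride by the element allocation size to get the byte
  // stride passed to the vp.strided.load intrinsic.
  int64_t partOffsetInElements(unsigned Part, unsigned VF, int64_t StrideElems) {
    return (int64_t)Part * VF * StrideElems;
  }

  int64_t strideInBytes(int64_t StrideElems, uint64_t EltAllocSize) {
    return StrideElems * (int64_t)EltAllocSize;
  }

  int main() {
    const unsigned VF = 4;          // assumed fixed VF for the example
    const int64_t StrideElems = -1; // reverse consecutive access
    for (unsigned Part = 0; Part < 2; ++Part)
      std::printf("part %u: %lld elements from the base pointer\n", Part,
                  (long long)partOffsetInElements(Part, VF, StrideElems));
    std::printf("byte stride for 4-byte elements: %lld\n",
                (long long)strideInBytes(StrideElems, /*EltAllocSize=*/4));
    return 0;
  }

For the reverse case the stride is -1 element, so part 1 starts VF elements below part 0 and the intrinsic sees a byte stride of -4.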