diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 907839711a39c..3910169a346c0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4184,7 +4184,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( [](const auto *R) { return Instruction::Select; }) .Case( [](const auto *R) { return Instruction::Store; }) - .Case( + .Case( [](const auto *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) @@ -4283,6 +4283,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenStridedLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: @@ -7773,7 +7774,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), + const DataLayout &DL = I->getDataLayout(); + auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType()); + VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1)); + VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne, GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); @@ -8913,16 +8917,20 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. - if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - } + + // Convert reverse memory recipes to strided access recipes if the strided + // access is legal and profitable. 
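  // [Editorial illustration, not part of the patch] The effect of this pass on
  // a reverse load, adapted from the VPlan printout changes in the RISC-V
  // tests updated below (recipe names taken from those tests):
  //
  //   before: vp<%p> = vector-end-pointer inbounds ir<%arrayidx>, vp<%VF>
  //           WIDEN ir<%l> = load vp<%p>
  //   after:  vp<%p> = vector-pointer ir<%arrayidx>, ir<-1>
  //           WIDEN ir<%l> = load vp<%p>, stride = ir<-1>, runtimeVF = vp<%VF>
  //
  // The consecutive reverse load and its vector-end-pointer are replaced by a
  // strided load with an explicit stride of -1; reverse stores are left as-is.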
+ VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CostCtx, Range); for (ElementCount VF : Range) Plan->addVF(VF); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c5b214b355545..7ca9eedc46cf0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenStridedLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -1724,6 +1725,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) + VPValue *getPtr() const { return getOperand(0); } + VPValue *getVFValue() { return getOperand(1); } const VPValue *getVFValue() const { return getOperand(1); } @@ -1763,20 +1766,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, #endif }; -/// A recipe to compute the pointers for widened memory accesses of IndexTy. +/// A recipe to compute the pointers for widened memory accesses of IndexedTy, +/// with the Stride expressed in units of IndexedTy. class VPVectorPointerRecipe : public VPRecipeWithIRFlags, - public VPUnrollPartAccessor<1> { + public VPUnrollPartAccessor<2> { Type *IndexedTy; public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, - DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), - GEPFlags, DL), + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride, + GEPNoWrapFlags GEPFlags, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, + ArrayRef({Ptr, Stride}), GEPFlags, DL), IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) + VPValue *getStride() const { return getOperand(1); } + void execute(VPTransformState &State) override; bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -1794,7 +1800,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getOperand(1), getGEPNoWrapFlags(), getDebugLoc()); } @@ -2931,7 +2937,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata { return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC; } static inline bool classof(const VPUser *U) { @@ -3050,6 +3057,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } }; +/// A recipe for strided load operations, using the base address, stride, and an +/// optional mask. This recipe will generate an vp.strided.load intrinsic call +/// to represent memory accesses with a fixed stride. 
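///
/// Illustrative codegen (editorial note, based on the RISC-V test updates in
/// this patch; the nxv4i32 types are the ones used there): a reverse i32
/// access with element stride -1 and runtime VF %evl becomes
///   %v = call <vscale x 4 x i32>
///            @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(
///                ptr align 4 %addr, i64 -4, <vscale x 4 x i1> splat (i1 true),
///                i32 %evl)
/// where the byte stride -4 is the element stride scaled by the element size,
/// as computed in VPWidenStridedLoadRecipe::execute.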
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride, + VPValue *VF, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe( + VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF}, + /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VPWidenStridedLoadRecipe *clone() override { + return new VPWidenStridedLoadRecipe(cast(Ingredient), getAddr(), + getStride(), getVF(), getMask(), *this, + getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC); + + /// Return the stride operand. + VPValue *getStride() const { return getOperand(1); } + + /// Return the VF operand. + VPValue *getVF() const { return getOperand(2); } + + /// Generate a strided load. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() || Op == getStride() || Op == getVF(); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask. struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..714fef032c9b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -184,8 +184,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && - "Store recipes should not define any values"); + assert( + (isa( + R)) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 06511b61a67c3..53dab59316126 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: @@ -103,6 +104,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return cast(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -184,6 +186,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenStoreEVLSC: @@ -2381,13 +2384,22 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; unsigned CurrentPart = getUnrollPart(*this); - Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, - /*IsUnitStride*/ true, CurrentPart, Builder); + Value *Stride = 
State.get(getStride(), /*IsScalar*/ true); + + auto *StrideC = dyn_cast(Stride); + bool IsStrideOne = StrideC && StrideC->isOne(); + bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne()); + Type *IndexTy = + getGEPIndexTy(State.VF.isScalable(), + /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder); Value *Ptr = State.get(getOperand(0), VPLane(0)); + Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); + Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride); + Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -3065,9 +3077,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); - unsigned Opcode = isa(this) - ? Instruction::Load - : Instruction::Store; + unsigned Opcode = + isa( + this) + ? Instruction::Load + : Instruction::Store; if (!Consecutive) { // TODO: Using the original IR may not be accurate. @@ -3076,6 +3090,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, const Value *Ptr = getLoadStorePointerOperand(&Ingredient); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + if (isa(this)) + return Ctx.TTI.getStridedMemoryOpCost( + Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); + return Ctx.TTI.getAddressComputationCost(Ty) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); @@ -3226,6 +3245,50 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + Value *Addr = State.get(getAddr(), /*IsScalar*/ true); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)), + Builder.getInt32Ty()); + + auto *PtrTy = Addr->getType(); + auto *StrideTy = Stride->getType(); + const DataLayout &DL = Ingredient.getDataLayout(); + Value *StrideInBytes = Builder.CreateMul( + Stride, ConstantInt::get(StrideTy, DL.getTypeAllocSize(ScalarDataTy))); + CallInst *NewLI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, + {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + State.set(this, NewLI); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", stride = "; + getStride()->printAsOperand(O, SlotTracker); + O << ", runtimeVF = "; + getVF()->printAsOperand(O, SlotTracker); +} +#endif + void 
VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 931d4d42f56e4..89573b9f18033 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2144,6 +2144,12 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(L->getMask()); return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); }) + .Case([&](VPWidenStridedLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenStridedLoadRecipe( + *cast(&L->getIngredient()), L->getAddr(), L->getStride(), + &EVL, NewMask, *L, L->getDebugLoc()); + }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); @@ -2198,10 +2204,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - assert(all_of(Plan.getVF().users(), - IsaPred) && - "User of VF that we can't transform to EVL."); + assert( + all_of( + Plan.getVF().users(), + IsaPred) && + "User of VF that we can't transform to EVL."); Plan.getVF().replaceAllUsesWith(&EVL); // Create a scalar phi to track the previous EVL if fixed-order recurrence is @@ -2240,7 +2248,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { NumDefVal <= 1 && "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { + if (isa(EVLRecipe)) { VPValue *CurVPV = CurRecipe->getVPSingleValue(); CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } @@ -2678,6 +2687,80 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } +void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + if (Plan.hasScalarVFOnly()) + return; + + SmallVector ToErase; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *MemR = dyn_cast(&R); + // TODO: support strided store + // TODO: support strided accesses with stride not equal to -1 + if (!MemR || !isa(MemR) || !MemR->isReverse()) + continue; + + assert(MemR->isConsecutive() && "Reverse access must be consecutive"); + + auto *VecEndPtr = cast(MemR->getAddr()); + VPValue *Ptr = VecEndPtr->getPtr(); + Value *PtrUV = Ptr->getUnderlyingValue(); + // Memory cost model requires the pointer operand of memory access + // instruction. 
+ if (!PtrUV) + continue; + + Instruction &Ingredient = MemR->getIngredient(); + Type *ElementTy = getLoadStoreType(&Ingredient); + + auto IsProfitable = [&](ElementCount VF) -> bool { + Type *DataTy = toVectorTy(ElementTy, VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) + return false; + const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); + const InstructionCost StridedLoadStoreCost = + Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV, + MemR->isMasked(), Alignment, + Ctx.CostKind, &Ingredient); + return StridedLoadStoreCost < CurrentCost; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable, + Range)) + continue; + + // The stride of consecutive reverse access must be -1. + int64_t Stride = -1; + auto *GEP = dyn_cast(PtrUV->stripPointerCasts()); + const DataLayout &DL = Ingredient.getDataLayout(); + auto *StrideTy = DL.getIndexType(PtrUV->getType()); + VPValue *StrideVPV = + Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride)); + // Create a new vector pointer for strided access. + auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, StrideVPV, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), + VecEndPtr->getDebugLoc()); + NewPtr->insertBefore(MemR); + + auto *LoadR = cast(MemR); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *cast(&Ingredient), NewPtr, StrideVPV, &Plan.getVF(), + LoadR->getMask(), *LoadR, LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + + ToErase.append({LoadR, VecEndPtr}); + } + } + + for (VPRecipeBase *R : ToErase) + R->eraseFromParent(); +} + void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 8d2eded45da22..1f0404b63248d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -175,6 +175,12 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); + /// Transform reverse memory recipes into strided access recipes when legal + /// and profitable. Clamps \p Range to maintain consistency with widen + /// decisions of \p Plan, and uses \p Ctx to evaluate the cost. + static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 279cdac92d2d1..d9b1f7d4f5d53 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -340,6 +340,7 @@ class VPDef { VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, + VPWidenStridedLoadSC, VPVectorPointerSC, VPVectorEndPointerSC, VPWidenCallSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 38ada33d7ee19..bc9d40834c185 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) .Case( + VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case([&](auto *R) { if (R->getNumOperands() != 3) { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index 29b27cdb7556d..d53fb60e7c7c9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -37,27 +37,23 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64: [[VECTOR_BODY]]: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; RV64-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, splat (i1 true), i32 [[TMP10]]) +; RV64-NEXT: [[TMP11:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP14:%.*]] = sub 
i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP14]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP13]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -68,8 +64,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1 +; RV64-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -96,29 +92,24 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32: [[VECTOR_BODY]]: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 ; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 -; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; 
RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP9]], i32 -4, splat (i1 true), i32 [[TMP10]]) +; RV32-NEXT: [[TMP11:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], 1 +; RV32-NEXT: [[TMP16:%.*]] = mul i32 -1, [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -129,8 +120,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1 +; RV32-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], 1 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -158,39 +149,34 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: 
[[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP20:%.*]] = add [[REVERSE2]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP18:%.*]] = add [[WIDE_STRIDED_LOAD1]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP25]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP24]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP29]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -251,27 +237,23 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64: [[VECTOR_BODY]]: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; RV64-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, splat (i1 true), i32 [[TMP10]]) +; RV64-NEXT: [[TMP11:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP14]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP11]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -282,8 +264,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; 
RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00 +; RV64-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -310,29 +292,24 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32: [[VECTOR_BODY]]: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 -; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP9]], i32 -4, splat (i1 true), i32 [[TMP10]]) +; RV32-NEXT: [[TMP11:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], 1 +; RV32-NEXT: [[TMP16:%.*]] = mul i32 -1, [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP11]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -343,8 +320,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00 +; RV32-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP20]], 1.000000e+00 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -372,39 +349,34 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd [[REVERSE2]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], 
i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP25]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP24]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP29]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b4e49a60e0887..ea193aff5593b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -80,9 +80,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -97,20 +97,20 @@ define void 
@vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> -; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> -; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> -; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -189,9 +189,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<%18> = VF -; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF -; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count +; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF +; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -232,19 +232,19 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> -; CHECK-NEXT: WIDEN ir<%19> = load vp<%4> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr 
inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> -; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> +; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: @@ -333,25 +333,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 ; CHECK-NEXT: %22 = zext i32 %21 to i64 ; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 -; CHECK-NEXT: %24 = mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 %18, 1 -; CHECK-NEXT: %26 = mul i64 -1, %25 -; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24 -; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26 -; CHECK-NEXT: %wide.load = load , ptr %28, align 4 -; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %wide.load) -; CHECK-NEXT: %29 = add %reverse, splat (i32 1) -; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22 -; CHECK-NEXT: %31 = mul i64 0, %18 -; CHECK-NEXT: %32 = sub i64 %18, 1 -; CHECK-NEXT: %33 = mul i64 -1, %32 -; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31 -; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33 -; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4i32( %29) -; CHECK-NEXT: store %reverse4, ptr %35, align 4 +; CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0 +; CHECK-NEXT: %25 = trunc i64 %18 to i32 +; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25) +; CHECK-NEXT: %26 = add %wide.strided.load, splat (i32 1) +; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22 +; CHECK-NEXT: %28 = mul i64 0, %18 +; CHECK-NEXT: %29 = sub i64 %18, 1 +; CHECK-NEXT: %30 = mul i64 -1, %29 +; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28 +; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %31, i64 %30 +; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4i32( %26) +; CHECK-NEXT: store %reverse, ptr %32, align 4 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %36, , label %vector.body +; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %33, , label %vector.body ; CHECK-NEXT: LV: created middle.block ; CHECK-NEXT: LV: draw edge 
from vector.body
 ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -368,7 +364,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: filled BB:
 ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
 ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
 ; CHECK-NEXT: br label %for.body
 ; CHECK-NEXT: LV: draw edge from middle.block
 ; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -378,12 +374,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: filled BB:
 ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
 ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
 ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %37, 1
+; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %34, 1
 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
 ; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -491,9 +487,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
@@ -508,20 +504,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
-; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
-; CHECK-NEXT: WIDEN store vp<%10>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
@@ -600,9 +596,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%18> = VF
-; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF
-; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: Live-in ir<%0> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
@@ -643,19 +639,19 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
+; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT: Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
@@ -744,25 +740,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
 ; CHECK-NEXT: %22 = zext i32 %21 to i64
 ; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
-; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 %18, 1
-; CHECK-NEXT: %26 = mul i64 -1, %25
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26
-; CHECK-NEXT: %wide.load = load , ptr %28, align 4
-; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %wide.load)
-; CHECK-NEXT: %29 = fadd %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %31 = mul i64 0, %18
-; CHECK-NEXT: %32 = sub i64 %18, 1
-; CHECK-NEXT: %33 = mul i64 -1, %32
-; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31
-; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33
-; CHECK-NEXT: %reverse4 = call @llvm.vector.reverse.nxv4f32( %29)
-; CHECK-NEXT: store %reverse4, ptr %35, align 4
+; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0
+; CHECK-NEXT: %25 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, splat (i1 true), i32 %25)
+; CHECK-NEXT: %26 = fadd %wide.strided.load, splat (float 1.000000e+00)
+; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT: %28 = mul i64 0, %18
+; CHECK-NEXT: %29 = sub i64 %18, 1
+; CHECK-NEXT: %30 = mul i64 -1, %29
+; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28
+; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30
+; CHECK-NEXT: %reverse = call @llvm.vector.reverse.nxv4f32( %26)
+; CHECK-NEXT: store %reverse, ptr %32, align 4
 ; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %36, , label %vector.body
+; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %33, , label %vector.body
 ; CHECK-NEXT: LV: created middle.block
 ; CHECK-NEXT: LV: draw edge from vector.body
 ; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -779,7 +771,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: filled BB:
 ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
 ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
 ; CHECK-NEXT: br label %for.body
 ; CHECK-NEXT: LV: draw edge from middle.block
 ; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -789,12 +781,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: filled BB:
 ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
 ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
 ; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00
+; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00
 ; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 91d94e52d0990..c156fc14a2300 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -29,39 +29,33 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP18]], 1
-; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 -1, [[TMP11]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1
-; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP8]], i64 -4, splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP10]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], 1
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 -1, [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[WIDE_STRIDED_LOAD]], splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
 ; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ]
 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL: for.body:
 ; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
 ; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
 ; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
@@ -129,37 +123,30 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100)
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP26]], 1
-; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP15]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1
-; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100)
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, [[TMP9]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP13]], 1
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP15]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[TMP14]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP16]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[WIDE_STRIDED_LOAD]], splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP9]], splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE]], ptr align 4 [[TMP18]], [[VP_REVERSE_MASK]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
 ; IF-EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
index 528f2448616e8..2c757021e76ff 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
@@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" +
 ; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" +
 ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>, ir\<1\>\l" +
 ; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
 ; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" +
 ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>, ir\<1\>\l" +
 ; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
 ; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" +
 ; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" +