From 6f5f6c2fe1f3e5c4ca6893d07042e8f0f841459b Mon Sep 17 00:00:00 2001
From: "Liqin.Weng"
Date: Mon, 23 Jun 2025 16:24:08 +0800
Subject: [PATCH] [LV][VPlan] When the load/store stride is -1, use vle/vse
 instead of vlse/vsse

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  9 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 78 +++++++++----------
 ...-force-tail-with-evl-reverse-load-store.ll | 77 +++++++-----------
 ...orize-force-tail-with-evl-uniform-store.ll | 11 +--
 4 files changed, 75 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..8267a29cb11c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7061,6 +7061,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                              RepR->getUnderlyingInstr(), VF))
         return true;
       }
+
+      // The VPlan-based cost model may compute the cost of a strided (stride
+      // -1) load/store, which the legacy cost model cannot model.
+      if (isa<VPWidenMemoryRecipe>(&R))
+        if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
+          return true;
+
       if (Instruction *UI = GetInstructionForCost(&R)) {
         // If we adjusted the predicate of the recipe, the cost in the legacy
         // cost model may be different.
@@ -7758,7 +7765,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   auto *GEP = dyn_cast<GetElementPtrInst>(
       Ptr->getUnderlyingValue()->stripPointerCasts());
   VPSingleDefRecipe *VectorPtr;
-  if (Reverse) {
+  if (Reverse && !CM.foldTailWithEVL()) {
     // When folding the tail, we may compute an address that we don't in the
     // original scalar loop and it may not be inbounds. Drop Inbounds in that
     // case.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..f18a080124a7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2918,17 +2918,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
@@ -2940,29 +2929,33 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Value *EVL = State.get(getEVL(), VPLane(0));
   Value *Addr = State.get(getAddr(), !CreateGather);
   Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
+  if (VPValue *VPMask = getMask())
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
+  else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
 
   if (CreateGather) {
     NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
                                     {Addr, Mask, EVL}, nullptr,
                                     "wide.masked.gather");
   } else {
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      // FIXME: We may not need to adjust the offset for the element size
+      // here; InstCombine will canonicalize the offset type.
+      auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+      // Step the pointer back by EVL - 1 elements so a unit-stride vp.load
+      // covers the same range as the reverse access.
+      Value *Offset = Builder.CreateSub(Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "",
+                               GEP && GEP->isInBounds());
+    }
     NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
                                     {Addr, Mask, EVL}, nullptr, "vp.op.load");
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   applyMetadata(*NewLI);
-  Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
-  State.set(this, Res);
+  State.set(this, NewLI);
 }
 
 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
@@ -2980,14 +2973,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Load, Ty, Alignment, AS,
+                                       Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3044,6 +3031,8 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   VPValue *StoredValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
@@ -3053,22 +3042,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue);
   Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
   Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
+  if (VPValue *VPMask = getMask())
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
"vp.reverse.mask"); - } else { + else Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - } + Value *Addr = State.get(getAddr(), !CreateScatter); if (CreateScatter) { NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), Intrinsic::vp_scatter, {StoredVal, Addr, Mask, EVL}); } else { + if (isReverse()) { + auto *EltTy = DataTy->getElementType(); + // FIXME: we may need not deal with the size, the InstCombine will deal + // with the Offset Type if (EltTy->getScalarSizeInBits() != + // EVL->getType()->getScalarSizeInBits()) + // EVL = ConstantInt::getSigned(EVL->getType(), + // static_cast(EltTy->getScalarSizeInBits()) / 8); + auto *GEP = dyn_cast(Addr->stripPointerCasts()); + // Value *Offset = + // Builder.CreateSub(State.Builder.getIntN(EVL->getType()->getScalarSizeInBits(), + // 1), EVL); + Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL); + Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds()); + } NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store, {StoredVal, Addr, Mask, EVL}); @@ -3093,14 +3092,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = getLoadStoreAddressSpace(const_cast(&Ingredient)); - InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); - if (!Reverse) - return Cost; - return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - cast(Ty), {}, Ctx.CostKind, - 0); + return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Store, Ty, Alignment, AS, + Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 4d8166eaa46f1..ed389bea3ead5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -31,21 +31,15 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1 ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i32 1, [[TMP5]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP9]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, 
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP16]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -134,23 +128,15 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 [[TMP19]]
+; IF-EVL-NEXT:    [[VP_OP_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT:    [[TMP22:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP15]], i32 [[TMP22]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD2]], ptr align 4 [[TMP17]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -259,31 +245,22 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
 ; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 [[TMP9]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_OP_LOAD]]
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i32 [[TMP19]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i32 [[TMP17]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP18]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 82e8d3d6c611a..fc10b8d093967 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -36,13 +36,10 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sub i64 1, [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
-; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 1, [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[TMP13]], i32 [[TMP14]]
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
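
For reference: instead of emitting llvm.experimental.vp.reverse plus i64
offset arithmetic for a consecutive reverse (stride -1) access, the lowering
now offsets the base pointer by 1 - EVL elements and emits a plain
unit-stride vp.load/vp.store, which RISC-V can select as vle/vse. A minimal
IR sketch of the addressing before and after this patch; %base and %evl are
illustrative placeholders, not names taken from the tests above:

  ; Before: i64 offset arithmetic plus an explicit vp.reverse
  %evl.zext = zext i32 %evl to i64
  %off.i64  = sub i64 1, %evl.zext
  %addr     = getelementptr i32, ptr %base, i64 %off.i64
  %wide     = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %addr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %rev      = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide, <vscale x 4 x i1> splat (i1 true), i32 %evl)

  ; After: a single i32 offset of 1 - EVL, no vp.reverse
  %off   = sub i32 1, %evl
  %addr2 = getelementptr inbounds i32, ptr %base, i32 %off
  %wide2 = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %addr2, <vscale x 4 x i1> splat (i1 true), i32 %evl)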