From 84c1204cda2d9e039526899fa810e8f16b2e5fff Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Thu, 30 May 2024 09:56:26 -0700 Subject: [PATCH 01/15] [LV] Support binary and unary operations with EVL-vectorization The patch adds `VPWidenEVLRecipe`, which represents a `VPWidenRecipe` plus an EVL argument. The new recipe replaces `VPWidenRecipe` in `tryAddExplicitVectorLength` for binary and unary operations. Follow-up patches will extend support to the remaining cases, such as `FCmp` and `ICmp`. --- llvm/lib/Transforms/Vectorize/VPlan.h | 54 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 71 + .../Transforms/Vectorize/VPlanTransforms.cpp | 102 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + ...-force-tail-with-evl-bin-unary-ops-args.ll | 1763 +++++++++++++++++ ...ze-force-tail-with-evl-masked-loadstore.ll | 20 +- .../RISCV/vectorize-vp-intrinsics.ll | 22 +- .../RISCV/vplan-vp-intrinsics.ll | 2 +- 8 files changed, 1970 insertions(+), 65 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9ad98a5371d81..73cc303403adf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -923,6 +923,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenCastSC: case VPRecipeBase::VPWidenGEPSC: case VPRecipeBase::VPWidenSC: + case VPRecipeBase::VPWidenEVLSC: case VPRecipeBase::VPWidenSelectSC: case VPRecipeBase::VPBlendSC: case VPRecipeBase::VPPredInstPHISC: @@ -1107,6 +1108,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPInstructionSC || R->getVPDefID() == VPRecipeBase::VPWidenSC || + R->getVPDefID() == VPRecipeBase::VPWidenEVLSC || R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || R->getVPDefID() == VPRecipeBase::VPWidenCastSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || @@ -1408,13 +1410,18 @@ class VPInstruction : public VPRecipeWithIRFlags { /// traditional vectorization cases where each recipe transforms into a /// vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags { +protected: unsigned Opcode; + template <typename IterT> + VPWidenRecipe(unsigned VPDefOpcode, Instruction &I, + iterator_range<IterT> Operands) + : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {} + public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), - Opcode(I.getOpcode()) {} + : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} ~VPWidenRecipe() override = default; @@ -1443,6 +1450,49 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { #endif }; +class VPWidenEVLRecipe : public VPWidenRecipe { +private: + using VPRecipeWithIRFlags::transferFlags; + +public: + template <typename IterT> + VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL) + : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) { + addOperand(&EVL); + } + + ~VPWidenEVLRecipe() override = default; + + VPWidenRecipe *clone() override final { + SmallVector<VPValue *> Ops(operands()); + VPValue *EVL = Ops.pop_back_val(); + auto *R = new VPWidenEVLRecipe(*getUnderlyingInstr(), + make_range(Ops.begin(), Ops.end()), *EVL); + R->transferFlags(*this); + return R; + } + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC); + + VPValue *getEVL() { return getOperand(getNumOperands() - 1); } + const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } + + /// A helper function to create a widen EVL recipe from a regular widen recipe. + static VPWidenEVLRecipe *create(VPWidenRecipe *W, VPValue &EVL); + + /// Produce widened copies of all Ingredients. + void execute(VPTransformState &State) override final; + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override final; +#endif +}; + /// VPWidenCastRecipe is a recipe to create vector cast instructions. class VPWidenCastRecipe : public VPRecipeWithIRFlags { /// Cast instruction opcode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 49ed733107da9..934e479000689 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/IR/VectorBuilder.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -74,6 +75,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenLoadSC: case VPWidenPHISC: case VPWidenSC: + case VPWidenEVLSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -114,6 +116,7 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPWidenIntOrFpInductionSC: case VPWidenPHISC: case VPWidenSC: + case VPWidenEVLSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -164,6 +167,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPWidenPHISC: case VPWidenPointerInductionSC: case VPWidenSC: + case VPWidenEVLSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -1262,6 +1266,64 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } } +VPWidenEVLRecipe *VPWidenEVLRecipe::create(VPWidenRecipe *W, VPValue &EVL) { + auto *R = new VPWidenEVLRecipe(*W->getUnderlyingInstr(), W->operands(), EVL); + R->transferFlags(*W); + return R; +} + +void VPWidenEVLRecipe::execute(VPTransformState &State) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + VPValue *Op0 = getOperand(0); + + // If it's a scalar operation, hand translation over to VPWidenRecipe. + if (!State.get(Op0, 0)->getType()->isVectorTy()) + return VPWidenRecipe::execute(State); + + VPValue *EVL = getEVL(); + Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true); + unsigned Opcode = getOpcode(); + Instruction *I = getUnderlyingInstr(); + IRBuilderBase &BuilderIR = State.Builder; + VectorBuilder Builder(BuilderIR); + Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); + Value *VPInst = nullptr; + + //===------------------- Binary and Unary Ops ---------------------===// + if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode)) { + // Just widen unops and binops. + + SmallVector<Value *, 4> Ops; + for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { + VPValue *VPOp = getOperand(I); + Ops.push_back(State.get(VPOp, 0)); + } + + Builder.setMask(Mask).setEVL(EVLArg); + VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, + "vp.op"); + + if (I) + if (auto *VecOp = dyn_cast<Instruction>(VPInst)) + VecOp->copyIRFlags(I); + } else { + llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); + } + State.set(this, VPInst, 0); + State.addMetadata(VPInst, I); +} + +bool VPWidenEVLRecipe::onlyFirstLaneUsed(const VPValue *Op) const { + assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); + // EVL in that recipe is always the last operand, thus any use before means + // the VPValue should be vectorized.
+ for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I) + if (getOperand(I) == Op) + return false; + return true; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -1271,6 +1333,15 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, printFlags(O); printOperands(O, SlotTracker); } + +void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN vp "; + printAsOperand(O, SlotTracker); + O << " = " << Instruction::getOpcodeName(Opcode); + printFlags(O); + printOperands(O, SlotTracker); +} #endif void VPWidenCastRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9796ee64f6ef9..e10abc6529d7a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -13,6 +13,7 @@ #include "VPlanTransforms.h" #include "VPRecipeBuilder.h" +#include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" @@ -21,6 +22,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" @@ -1268,6 +1270,7 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { // Walk users of wide canonical IVs and collect to all compares of the form // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). SmallVector<VPValue *> HeaderMasks; + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); for (auto *Wide : WideCanonicalIVs) { for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { auto *HeaderMask = dyn_cast<VPInstruction>(U); @@ -1315,6 +1318,63 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->replaceAllUsesWith(LaneMask); } +/// Replace recipes with their EVL variants. +static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { + DenseSet<VPRecipeBase *> ToRemove; + + SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan); + for (VPValue *HeaderMask : HeaderMasks) { + for (VPUser *U : collectUsersRecursively(HeaderMask)) { + auto *CurRecipe = dyn_cast<VPRecipeBase>(U); + if (!CurRecipe) + continue; + auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { + assert(OrigMask && "Unmasked recipe when folding tail"); + return HeaderMask == OrigMask ?
nullptr : OrigMask; }; + + VPRecipeBase *NewRecipe = + TypeSwitch<VPRecipeBase *, VPRecipeBase *>(CurRecipe) + .Case([&](VPWidenLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenLoadEVLRecipe(L, &EVL, NewMask); + }) + .Case([&](VPWidenStoreRecipe *S) { + VPValue *NewMask = GetNewMask(S->getMask()); + return new VPWidenStoreEVLRecipe(S, &EVL, NewMask); + }) + .Case([&](VPWidenRecipe *W) -> VPRecipeBase * { + unsigned Opcode = W->getOpcode(); + if (!Instruction::isBinaryOp(Opcode) && + !Instruction::isUnaryOp(Opcode)) + return nullptr; + return VPWidenEVLRecipe::create(W, EVL); + }) + .Case([&](VPReductionRecipe *Red) { + return new VPReductionEVLRecipe( + *Red, GetNewMask(Red->getCondOp()), EVL); + }); + + if (NewRecipe) { + [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); + assert(NumDefVal == CurRecipe->getNumDefinedValues() && + "New recipe must define the same number of values as the " + "original."); + assert( + NumDefVal <= 1 && + "Only supports recipes with a single definition or without users."); + NewRecipe->insertBefore(CurRecipe); + if (isa<VPSingleDefRecipe>(NewRecipe)) { + VPValue *CurVPV = CurRecipe->getVPSingleValue(); + CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); + } + CurRecipe->eraseFromParent(); + } + } + recursivelyDeleteDeadRecipes(HeaderMask); + } +} + /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and /// replaces all uses except the canonical IV increment of /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe @@ -1384,48 +1444,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { NextEVLIV->insertBefore(CanonicalIVIncrement); EVLPhi->addOperand(NextEVLIV); - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { - for (VPUser *U : collectUsersRecursively(HeaderMask)) { - VPRecipeBase *NewRecipe = nullptr; - auto *CurRecipe = dyn_cast<VPRecipeBase>(U); - if (!CurRecipe) - continue; - - auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; - if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) { - VPValue *NewMask = GetNewMask(MemR->getMask()); - if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) - NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask); - else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) - NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask); - else - llvm_unreachable("unsupported recipe"); - } else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) { - NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL, - GetNewMask(RedR->getCondOp())); - } + transformRecipestoEVLRecipes(Plan, *VPEVL); - if (NewRecipe) { - [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); - assert(NumDefVal == CurRecipe->getNumDefinedValues() && - "New recipe must define the same number of values as the " - "original."); - assert( - NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); - NewRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe>(NewRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); - } - CurRecipe->eraseFromParent(); - } - } - recursivelyDeleteDeadRecipes(HeaderMask); - } // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 452c977106a77..b8b2c0bd4d5ff 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -356,6 +356,7 @@ class VPDef { VPWidenStoreEVLSC, VPWidenStoreSC, VPWidenSC, + VPWidenEVLSC, VPWidenSelectSC, VPBlendSC, // START: Phi-like recipes. Need to be kept together. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll new file mode 100644 index 0000000000000..e90b4ff4ac54b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll @@ -0,0 +1,1763 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL + +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP + + +define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_and( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.and.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = and i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_and( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = and i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = and i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_or( +; 
IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.or.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr 
inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = or i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_or( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = or i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = or i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_xor( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 
@llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.xor.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = xor i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_xor( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = xor i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + 
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = xor i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_shl( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.shl.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], 
label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = shl i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_shl( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = shl i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_lshr( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 
@llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.lshr.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = lshr i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_lshr( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = lshr i8 [[TMP0]], 1 
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = lshr i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_ashr( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.ashr.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 
[[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = ashr i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_ashr( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = ashr i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = ashr i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_add( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], 
label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.add.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = add i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_add( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture 
readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = add i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = add i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_sub( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.sub.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), 
poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = sub i8 [[TMP20]], 1 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_sub( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = sub i8 [[TMP0]], 1 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = sub i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_mul( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr 
[[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.mul.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = mul i8 [[TMP20]], 3 +; IF-EVL-NEXT: 
[[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_mul( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = mul i8 [[TMP0]], 3 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = mul i8 %0, 3 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_sdiv( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] 
= getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.sdiv.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = sdiv i8 [[TMP20]], 3 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_sdiv( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = sdiv i8 [[TMP0]], 3 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = sdiv i8 %0, 3 + %arrayidx1 = getelementptr inbounds i8, 
ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_udiv( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.udiv.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], 
[ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = udiv i8 [[TMP20]], 3 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_udiv( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = udiv i8 [[TMP0]], 3 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = udiv i8 %0, 3 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_srem( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.srem.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = srem i8 [[TMP20]], 3 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_srem( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = srem i8 [[TMP0]], 3 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: 
[[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = srem i8 %0, 3 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_urem( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.vp.urem.nxv16i8( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP15]], ptr align 1 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: 
[[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; IF-EVL-NEXT: [[TMP:%.*]] = urem i8 [[TMP20]], 3 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_urem( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-VP-NEXT: [[TMP:%.*]] = urem i8 [[TMP0]], 3 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len + %0 = load i8, ptr %arrayidx, align 1 + %tmp = urem i8 %0, 3 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len + store i8 %tmp, ptr %arrayidx1, align 1 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +; Floating point tests + +define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_fadd( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; 
IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fadd.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP:%.*]] = fadd fast float [[TMP21]], 3.000000e+00 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_fadd( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly 
[[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[TMP:%.*]] = fadd fast float [[TMP0]], 3.000000e+00 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds float, ptr %a, i64 %len + %0 = load float, ptr %arrayidx, align 4 + %tmp = fadd fast float %0, 3.000000e+00 + %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len + store float %tmp, ptr %arrayidx1, align 4 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_fsub( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: 
[[TMP16:%.*]] = call fast @llvm.vp.fsub.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP:%.*]] = fsub fast float [[TMP21]], 3.000000e+00 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_fsub( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[TMP:%.*]] = fsub fast float [[TMP0]], 3.000000e+00 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds float, ptr %a, i64 %len + %0 = load float, ptr %arrayidx, align 4 + %tmp = fsub fast float %0, 3.000000e+00 + %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len + store float %tmp, ptr %arrayidx1, align 4 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_fmul(ptr nocapture %a, 
ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_fmul( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fmul.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ 
[[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP:%.*]] = fmul fast float [[TMP21]], 3.000000e+00 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_fmul( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[TMP:%.*]] = fmul fast float [[TMP0]], 3.000000e+00 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds float, ptr %a, i64 %len + %0 = load float, ptr %arrayidx, align 4 + %tmp = fmul fast float %0, 3.000000e+00 + %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len + store float %tmp, ptr %arrayidx1, align 4 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_fdiv( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; 
IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fdiv.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP:%.*]] = fdiv fast float [[TMP21]], 3.000000e+00 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP35:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_fdiv( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[TMP:%.*]] = fdiv fast float [[TMP0]], 3.000000e+00 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds 
float, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds float, ptr %a, i64 %len + %0 = load float, ptr %arrayidx, align 4 + %tmp = fdiv fast float %0, 3.000000e+00 + %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len + store float %tmp, ptr %arrayidx1, align 4 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_frem(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_frem( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP:%.*]] = frem fast float [[TMP0]], 3.000000e+00 +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_frem( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[TMP:%.*]] = frem fast float [[TMP0]], 3.000000e+00 +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds float, ptr %a, i64 %len + %0 = load float, ptr %arrayidx, align 4 + %tmp = frem fast float %0, 3.000000e+00 + %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len + store float %tmp, ptr %arrayidx1, align 4 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} + +define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { +; IF-EVL-LABEL: define void @test_fneg( +; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]: +; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; IF-EVL-NEXT: 
[[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IF-EVL: [[VECTOR_MEMCHECK]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]] +; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fneg.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP:%.*]] = fneg fast float [[TMP21]] +; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = 
getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] +; IF-EVL: [[FINISH_LOOPEXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @test_fneg( +; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1 +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[TMP:%.*]] = fneg fast float [[TMP0]] +; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] +; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 +; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 +; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]] +; NO-VP: [[FINISH_LOOPEXIT]]: +; NO-VP-NEXT: ret void +; +loop.preheader: + br label %loop + +loop: + %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ] + %dec = add nsw i64 %len, 1 + %arrayidx = getelementptr inbounds float, ptr %a, i64 %len + %0 = load float, ptr %arrayidx, align 4 + %tmp = fneg fast float %0 + %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len + store float %tmp, ptr %arrayidx1, align 4 + %.not = icmp eq i64 %dec, 100 + br i1 %.not, label %finish.loopexit, label %loop + +finish.loopexit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll index b8b2558247fa6..4cfcbebb8d7f1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll @@ -50,13 +50,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], [[TMP20]], i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = add [[VP_OP_LOAD]], [[VP_OP_LOAD3]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP23]], ptr align 4 [[TMP22]], [[TMP20]], i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], [[TMP20]], i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] -; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP24:%.*]] 
= icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -65,13 +65,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL: for.body: ; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0 +; IF-EVL-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP25]], 0 ; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; IF-EVL: if.then: ; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]] +; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP25]], [[TMP26]] ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4 ; IF-EVL-NEXT: br label [[FOR_INC]] ; IF-EVL: for.inc: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll index 362bfd61ebd07..83f7dc3702b08 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll @@ -39,15 +39,15 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP18]], ptr align 4 [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD1]], [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] -; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 
[[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -56,10 +56,10 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP22]] ; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 8caa9368bfde1..0b220a0884b74 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -31,7 +31,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: WIDEN vp ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> From 229b94c56fa2d88c14da65238c7fb33c449c4436 Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Wed, 5 Jun 2024 14:22:01 -0700 Subject: [PATCH 02/15] Addressed comments --- .../Transforms/Vectorize/LoopVectorize.cpp | 5 +- llvm/lib/Transforms/Vectorize/VPlan.h | 10 +-- .../Transforms/Vectorize/VPlanAnalysis.cpp | 3 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 11 ++- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 +- .../Transforms/Vectorize/VPlanVerifier.cpp | 71 +++++++++++++++++++ ...-force-tail-with-evl-bin-unary-ops-args.ll | 4 +- 7 files changed, 90 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0200525a718d5..247faa69faccb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8585,14 +8585,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { + bool IsScalarVPlan = 
Plan->hasVF(ElementCount::getFixed(1)); // Now optimize the initial VPlan. - if (!Plan->hasVF(ElementCount::getFixed(1))) + if (!IsScalarVPlan) VPlanTransforms::truncateToMinimalBitwidths( *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); VPlanTransforms::optimize(*Plan, *PSE.getSE()); // TODO: try to put it close to addActiveLaneMask(). // Discard the plan if it is not EVL-compatible - if (CM.foldTailWithEVL() && + if (!IsScalarVPlan && CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) break; assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 73cc303403adf..e9e26efb22132 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1410,9 +1410,9 @@ class VPInstruction : public VPRecipeWithIRFlags { /// traditional vectorization cases where each recipe transforms into a /// vectorized version of itself. class VPWidenRecipe : public VPRecipeWithIRFlags { -protected: unsigned Opcode; +protected: template VPWidenRecipe(unsigned VPDefOpcode, Instruction &I, iterator_range Operands) @@ -1451,7 +1451,6 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { }; class VPWidenEVLRecipe : public VPWidenRecipe { -private: using VPRecipeWithIRFlags::transferFlags; public: @@ -1460,6 +1459,10 @@ class VPWidenEVLRecipe : public VPWidenRecipe { : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) { addOperand(&EVL); } + VPWidenEVLRecipe(VPWidenRecipe *W, VPValue &EVL) + : VPWidenEVLRecipe(*W->getUnderlyingInstr(), W->operands(), EVL) { + this->transferFlags(*W); + } ~VPWidenEVLRecipe() override = default; @@ -1477,9 +1480,6 @@ class VPWidenEVLRecipe : public VPWidenRecipe { VPValue *getEVL() { return getOperand(getNumOperands() - 1); } const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } - /// A helper function to create widen EVL recipe from regular widen recipe. - static VPWidenEVLRecipe *create(VPWidenRecipe *W, VPValue &EVL); - /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override final; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index d79a7e814ecb3..e8423a23f3e58 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -264,7 +264,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { return inferScalarType(R->getOperand(0)); }) .Case( + VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, + VPWidenEVLRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) .Case([V](const VPInterleaveRecipe *R) { // TODO: Use info from interleave group. 
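
Since the rest of this patch wires VPWidenEVLRecipe through type inference and execution, it may help to recall what a VP intrinsic does with its explicit-vector-length operand: only the first EVL lanes are processed. A minimal standalone scalar model of that contract follows; it is purely illustrative, not part of the patch, and the function name is made up for this note.

#include <cstddef>
#include <vector>

// Scalar model of an all-true-mask vp.add: only lanes [0, evl) are computed;
// lanes at or beyond evl are unspecified for the real intrinsic (left as 0 here).
std::vector<int> vpAddModel(const std::vector<int> &a, const std::vector<int> &b,
                            std::size_t evl) {
  std::vector<int> r(a.size(), 0);
  for (std::size_t i = 0; i < evl && i < a.size() && i < b.size(); ++i)
    r[i] = a[i] + b[i];
  return r;
}

The recipes in the following hunks package exactly this "operate on the first EVL lanes" behavior at the VPlan level.
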
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 934e479000689..248d4c2664f2a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1278,8 +1278,8 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { VPValue *Op0 = getOperand(0); // If it's scalar operation, hand translation over to VPWidenRecipe - if (!State.get(Op0, 0)->getType()->isVectorTy()) - return VPWidenRecipe::execute(State); + assert(State.get(Op0, 0)->getType()->isVectorTy() && + "VPWidenEVLRecipe should not be used for scalars"); VPValue *EVL = getEVL(); Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true); @@ -1318,10 +1318,7 @@ bool VPWidenEVLRecipe::onlyFirstLaneUsed(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // EVL in that recipe is always the last operand, thus any use before means // the VPValue should be vectorized. - for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I) - if (getOperand(I) == Op) - return false; - return true; + return getEVL() == Op; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1338,7 +1335,7 @@ void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN vp "; printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode); + O << " = " << Instruction::getOpcodeName(getOpcode()); printFlags(O); printOperands(O, SlotTracker); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e10abc6529d7a..58ef633d7b96a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1320,6 +1320,8 @@ void VPlanTransforms::addActiveLaneMask( /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { + VPDominatorTree VPDT; + VPDT.recalculate(Plan); DenseSet ToRemove; SmallVector HeaderMasks = collectAllHeaderMasks(Plan); @@ -1348,7 +1350,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) return nullptr; - return VPWidenEVLRecipe::create(W, EVL); + return new VPWidenEVLRecipe(W, EVL); }) .Case([&](VPReductionRecipe *Red) { return new VPReductionEVLRecipe( diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 765dc983cab4f..290fe749f48c3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -18,6 +18,7 @@ #include "VPlanDominatorTree.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "loop-vectorize" @@ -35,6 +36,11 @@ class VPlanVerifier { // VPHeaderPHIRecipes. bool verifyPhiRecipes(const VPBasicBlock *VPBB); + // Verify that \p EVL is used correctly. The user must be either in EVL-based + // recipes as a last operand or VPInstruction::Add which is incoming value + // into EVL's recipe. 
+ bool verifyEVLRecipe(const VPInstruction &EVL) const; + bool verifyVPBasicBlock(const VPBasicBlock *VPBB); bool verifyBlock(const VPBlockBase *VPB); @@ -114,6 +120,64 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) { return true; } +bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { + if (EVL.getOpcode() != VPInstruction::ExplicitVectorLength) { + errs() << "verifyEVLRecipe should only be called on " + "VPInstruction::ExplicitVectorLength\n"; + return false; + } + auto VerifyEVLUse = [&](const VPRecipeBase &R, + const unsigned ExpectedIdx) -> bool { + SmallVector Ops(R.operands()); + unsigned UseCount = count(Ops, &EVL); + if (UseCount != 1 || Ops[ExpectedIdx] != &EVL) { + errs() << "EVL is used as non-last operand in EVL-based recipe\n"; + return false; + } + return true; + }; + for (const VPUser *U : EVL.users()) { + if (!TypeSwitch(U) + .Case([&](const VPWidenStoreEVLRecipe *S) { + return VerifyEVLUse(*S, 2); + }) + .Case([&](const VPWidenLoadEVLRecipe *L) { + return VerifyEVLUse(*L, 1); + }) + .Case([&](const VPWidenEVLRecipe *W) { + return VerifyEVLUse( + *W, Instruction::isUnaryOp(W->getOpcode()) ? 1 : 2); + }) + .Case( + [&](const VPScalarCastRecipe *S) { return true; }) + .Case([&](const VPInstruction *I) { + if (I->getOpcode() != Instruction::Add) { + errs() + << "EVL is used as an operand in non-VPInstruction::Add\n"; + return false; + } + if (I->getNumUsers() != 1) { + errs() << "EVL is used in VPInstruction:Add with multiple " + "users\n"; + return false; + } + if (!isa(*I->users().begin())) { + errs() << "Result of VPInstruction::Add with EVL operand is " + "not used by VPEVLBasedIVPHIRecipe\n"; + return false; + } + return true; + }) + .Default([&](const VPUser *U) { + errs() << "EVL has unexpected user\n"; + return false; + })) { + return false; + } + } + return true; +} + bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { if (!verifyPhiRecipes(VPBB)) return false; @@ -150,6 +214,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { } } } + if (const auto *EVL = dyn_cast(&R)) { + if (EVL->getOpcode() == VPInstruction::ExplicitVectorLength && + !verifyEVLRecipe(*EVL)) { + errs() << "EVL VPValue is not used correctly\n"; + return false; + } + } } auto *IRBB = dyn_cast(VPBB); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll index e90b4ff4ac54b..501e27d73737c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll @@ -2,12 +2,12 @@ ; RUN: opt -passes=loop-vectorize \ ; RUN: -force-tail-folding-style=data-with-evl \ ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ -; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL +; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=IF-EVL ; RUN: opt -passes=loop-vectorize \ ; RUN: -force-tail-folding-style=none \ ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ -; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP +; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=NO-VP define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) { From 1416588fdd8b0540f96b0116de8baa515be4772a Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Thu, 13 Jun 2024 
10:20:20 -0700 Subject: [PATCH 03/15] Rebase --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 248d4c2664f2a..2ac459e291163 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1333,7 +1333,7 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN vp "; + O << Indent << "WIDEN-VP "; printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(getOpcode()); printFlags(O); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 0b220a0884b74..0dbb6e0541afc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -31,7 +31,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN vp ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> From e5af0d5197d70a91fc3947518bcb548c05fd4807 Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Thu, 13 Jun 2024 14:36:33 -0700 Subject: [PATCH 04/15] Removed evl verification --- .../Transforms/Vectorize/VPlanVerifier.cpp | 71 ------------------- 1 file changed, 71 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 290fe749f48c3..765dc983cab4f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -18,7 +18,6 @@ #include "VPlanDominatorTree.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "loop-vectorize" @@ -36,11 +35,6 @@ class VPlanVerifier { // VPHeaderPHIRecipes. bool verifyPhiRecipes(const VPBasicBlock *VPBB); - // Verify that \p EVL is used correctly. The user must be either in EVL-based - // recipes as a last operand or VPInstruction::Add which is incoming value - // into EVL's recipe. 
- bool verifyEVLRecipe(const VPInstruction &EVL) const; - bool verifyVPBasicBlock(const VPBasicBlock *VPBB); bool verifyBlock(const VPBlockBase *VPB); @@ -120,64 +114,6 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) { return true; } -bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { - if (EVL.getOpcode() != VPInstruction::ExplicitVectorLength) { - errs() << "verifyEVLRecipe should only be called on " - "VPInstruction::ExplicitVectorLength\n"; - return false; - } - auto VerifyEVLUse = [&](const VPRecipeBase &R, - const unsigned ExpectedIdx) -> bool { - SmallVector Ops(R.operands()); - unsigned UseCount = count(Ops, &EVL); - if (UseCount != 1 || Ops[ExpectedIdx] != &EVL) { - errs() << "EVL is used as non-last operand in EVL-based recipe\n"; - return false; - } - return true; - }; - for (const VPUser *U : EVL.users()) { - if (!TypeSwitch(U) - .Case([&](const VPWidenStoreEVLRecipe *S) { - return VerifyEVLUse(*S, 2); - }) - .Case([&](const VPWidenLoadEVLRecipe *L) { - return VerifyEVLUse(*L, 1); - }) - .Case([&](const VPWidenEVLRecipe *W) { - return VerifyEVLUse( - *W, Instruction::isUnaryOp(W->getOpcode()) ? 1 : 2); - }) - .Case( - [&](const VPScalarCastRecipe *S) { return true; }) - .Case([&](const VPInstruction *I) { - if (I->getOpcode() != Instruction::Add) { - errs() - << "EVL is used as an operand in non-VPInstruction::Add\n"; - return false; - } - if (I->getNumUsers() != 1) { - errs() << "EVL is used in VPInstruction:Add with multiple " - "users\n"; - return false; - } - if (!isa(*I->users().begin())) { - errs() << "Result of VPInstruction::Add with EVL operand is " - "not used by VPEVLBasedIVPHIRecipe\n"; - return false; - } - return true; - }) - .Default([&](const VPUser *U) { - errs() << "EVL has unexpected user\n"; - return false; - })) { - return false; - } - } - return true; -} - bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { if (!verifyPhiRecipes(VPBB)) return false; @@ -214,13 +150,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { } } } - if (const auto *EVL = dyn_cast(&R)) { - if (EVL->getOpcode() == VPInstruction::ExplicitVectorLength && - !verifyEVLRecipe(*EVL)) { - errs() << "EVL VPValue is not used correctly\n"; - return false; - } - } } auto *IRBB = dyn_cast(VPBB); From 61c08ce0d14541aa51801a5e20bf6cabfc9e455b Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Thu, 20 Jun 2024 13:12:58 -0700 Subject: [PATCH 05/15] Moved clone() methods to unreachable for EVL-recipes --- llvm/lib/Transforms/Vectorize/VPlan.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e9e26efb22132..375223a897e34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1467,12 +1467,8 @@ class VPWidenEVLRecipe : public VPWidenRecipe { ~VPWidenEVLRecipe() override = default; VPWidenRecipe *clone() override final { - SmallVector Ops(operands()); - VPValue *EVL = Ops.pop_back_val(); - auto *R = new VPWidenEVLRecipe(*getUnderlyingInstr(), - make_range(Ops.begin(), Ops.end()), *EVL); - R->transferFlags(*this); - return R; + llvm_unreachable("VPWidenStoreEVLRecipe cannot be cloned"); + return nullptr; } VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC); @@ -2646,6 +2642,11 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { setMask(Mask); } + VPWidenLoadEVLRecipe *clone() override { + llvm_unreachable("VPWidenLoadEVLRecipe recipe cannot be cloned"); 
+ return nullptr; + } + VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC) /// Return the EVL operand. @@ -2721,6 +2722,11 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { setMask(Mask); } + VPWidenStoreEVLRecipe *clone() override { + llvm_unreachable("VPWidenStoreEVLRecipe cannot be cloned"); + return nullptr; + } + VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC) /// Return the address accessed by this recipe. From 83512bb1b3c8a25451e6e7790503800b07092476 Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Tue, 25 Jun 2024 13:24:29 -0700 Subject: [PATCH 06/15] Addressed comments --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 58ef633d7b96a..44bbfdf2a10c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1270,7 +1270,6 @@ static SmallVector collectAllHeaderMasks(VPlan &Plan) { // Walk users of wide canonical IVs and collect to all compares of the form // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). SmallVector HeaderMasks; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); for (auto *Wide : WideCanonicalIVs) { for (VPUser *U : SmallVector(Wide->users())) { auto *HeaderMask = dyn_cast(U); @@ -1322,7 +1321,7 @@ void VPlanTransforms::addActiveLaneMask( static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPDominatorTree VPDT; VPDT.recalculate(Plan); - DenseSet ToRemove; + SmallVector ToRemove; SmallVector HeaderMasks = collectAllHeaderMasks(Plan); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { From 730eb10d0dc0e89054dfcf3159f1837c468c17ac Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Tue, 25 Jun 2024 14:42:49 -0700 Subject: [PATCH 07/15] Addressed comments --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 247faa69faccb..0200525a718d5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8585,15 +8585,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { - bool IsScalarVPlan = Plan->hasVF(ElementCount::getFixed(1)); // Now optimize the initial VPlan. - if (!IsScalarVPlan) + if (!Plan->hasVF(ElementCount::getFixed(1))) VPlanTransforms::truncateToMinimalBitwidths( *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); VPlanTransforms::optimize(*Plan, *PSE.getSE()); // TODO: try to put it close to addActiveLaneMask(). 
// Discard the plan if it is not EVL-compatible - if (!IsScalarVPlan && CM.foldTailWithEVL() && + if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) break; assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); From a2890b323b20986ddddb3aa9c0e1b87174ee0239 Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Tue, 16 Jul 2024 10:55:06 -0700 Subject: [PATCH 08/15] Don't use RPOT + rebase --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 3 +- .../LoopVectorize/RISCV/inloop-reduction.ll | 61 ++++++++ ...ze-force-tail-with-evl-masked-loadstore.ll | 60 +++---- ...-force-tail-with-evl-reverse-load-store.ll | 148 +++++++++--------- 5 files changed, 169 insertions(+), 107 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 375223a897e34..6c60c10c7a9e2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2318,7 +2318,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { /// The Operands are {ChainOp, VecOp, EVL, [Condition]}. class VPReductionEVLRecipe : public VPReductionRecipe { public: - VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp) + VPReductionEVLRecipe(VPReductionRecipe &R, VPValue *CondOp, VPValue &EVL) : VPReductionRecipe( VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(), cast_or_null(R.getUnderlyingValue()), @@ -2643,7 +2643,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } VPWidenLoadEVLRecipe *clone() override { - llvm_unreachable("VPWidenLoadEVLRecipe recipe cannot be cloned"); + llvm_unreachable("VPWidenLoadEVLRecipe cannot be cloned"); return nullptr; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 44bbfdf2a10c9..e9b04b2c953d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1354,7 +1354,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { .Case([&](VPReductionRecipe *Red) { return new VPReductionEVLRecipe( *Red, GetNewMask(Red->getCondOp()), EVL); - }); + }) + .Default([&](VPRecipeBase *R) { return nullptr; }); if (NewRecipe) { [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index deb9f0f9bb7e0..0381f6dae9811 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -385,6 +385,67 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[SMIN_LCSSA]] ; +; IF-EVL-LABEL: @smin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 
[[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, [[TMP12]], poison) +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp slt [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = select [[TMP15]], [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP17:%.*]] = select [[TMP12]], [[TMP16]], [[VEC_PHI]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP17]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ 
[[SMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll index 4cfcbebb8d7f1..6d81b55fc8d89 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll @@ -20,43 +20,43 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] -; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]] -; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ne [[VP_OP_LOAD]], zeroinitializer -; IF-EVL-NEXT: [[TMP20:%.*]] = select [[TMP16]], [[TMP19]], zeroinitializer -; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call 
@llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], [[TMP20]], i32 [[TMP12]]) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], [[TMP20]], i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] -; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ne [[VP_OP_LOAD]], zeroinitializer +; IF-EVL-NEXT: [[TMP18:%.*]] = select [[TMP14]], [[TMP17]], zeroinitializer +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[TMP18]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP20]], [[TMP18]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -65,13 +65,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL: for.body: ; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP25]], 0 +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP23]], 0 ; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; IF-EVL: if.then: ; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP25]], [[TMP26]] +; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; IF-EVL-NEXT: 
[[ADD:%.*]] = add i32 [[TMP23]], [[TMP24]] ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4 ; IF-EVL-NEXT: br label [[FOR_INC]] ; IF-EVL: for.inc: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index c5a89d48f77b0..bda0c0100d6f4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -16,46 +16,46 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] ; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 -; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 -; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], -1 -; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP10]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP13]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP10]] -; IF-EVL-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 -; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP20]] -; IF-EVL-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP20]] -; IF-EVL-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]] -; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP8]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP8]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; IF-EVL-NEXT: [[TMP12:%.*]] = mul i64 0, [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = sub i64 1, [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP13]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 0, [[TMP18]] +; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 1, [[TMP18]] +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP20]] +; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -119,61 +119,61 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; IF-EVL-NEXT: 
[[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] ; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 -; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 ; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 -; IF-EVL-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX3]], 0 +; IF-EVL-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX3]], 0 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP12:%.*]] = add zeroinitializer, [[TMP11]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP12]] -; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IV]], shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP14:%.*]] = add i64 [[TMP9]], -1 -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP10]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP18:%.*]] = select [[TMP13]], [[TMP17]], zeroinitializer -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP22]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP25]], 
[[VP_REVERSE_MASK]], i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4 -; IF-EVL-NEXT: [[TMP29:%.*]] = mul i64 0, [[TMP28]] -; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 1, [[TMP28]] -; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]] -; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]] -; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP32]], [[VP_REVERSE_MASK6]], i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP33:%.*]] = zext i32 [[TMP8]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; IF-EVL-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = add zeroinitializer, [[TMP9]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP10]] +; IF-EVL-NEXT: [[TMP11:%.*]] = icmp ule [[VEC_IV]], shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[TMP7]], -1 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP8]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP16:%.*]] = select [[TMP11]], [[TMP15]], zeroinitializer +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]] +; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP20]] +; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP21]] +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP23]], [[VP_REVERSE_MASK]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: 
[[TMP24:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; IF-EVL-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP26]] +; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP26]] +; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP27]] +; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP28]] +; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP30]], [[VP_REVERSE_MASK6]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP31:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP31]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; IF-EVL-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: From 8fb08bc5576430d816ee5e81f397ef6c1a1214aa Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Wed, 24 Jul 2024 08:09:47 -0700 Subject: [PATCH 09/15] Addressed comments + rebase --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++---- .../Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6c60c10c7a9e2..250e1bde9eb1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1476,7 +1476,8 @@ class VPWidenEVLRecipe : public VPWidenRecipe { VPValue *getEVL() { return getOperand(getNumOperands() - 1); } const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } - /// Produce widened copies of all Ingredients. + /// Produce a vp-intrinsic using the opcode and operands of the recipe, + /// processing EVL elements. void execute(VPTransformState &State) override final; /// Returns true if the recipe only uses the first lane of operand \p Op. 
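
The reworded doc comment above describes what VPWidenEVLRecipe::execute emits. As a rough sketch of that lowering, assuming the current VectorBuilder API (the free-standing helper and its name are mine; the real code lives inside execute() and additionally copies IR flags from the underlying instruction):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/VectorBuilder.h"
using namespace llvm;

// Sketch: hand the scalar opcode and the widened operands to VectorBuilder,
// which selects the matching llvm.vp.* intrinsic, predicated by an all-true
// mask and the explicit vector length EVL.
static Value *emitVPBinOp(IRBuilderBase &Builder, unsigned Opcode, Value *LHS,
                          Value *RHS, Value *EVL, ElementCount EC) {
  VectorBuilder VBuilder(Builder);
  VBuilder.setMask(Builder.getAllOnesMask(EC)).setEVL(EVL);
  return VBuilder.createVectorInstruction(Opcode, LHS->getType(), {LHS, RHS},
                                          "vp.op");
}

For Instruction::Add, for example, this yields a call to llvm.vp.add with the mask and EVL as trailing operands, matching the IF-EVL check lines in the tests earlier in this series.
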
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2ac459e291163..6153910a6e0da 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1277,7 +1277,6 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { "explicit vector length."); VPValue *Op0 = getOperand(0); - // If it's scalar operation, hand translation over to VPWidenRecipe assert(State.get(Op0, 0)->getType()->isVectorTy() && "VPWidenEVLRecipe should not be used for scalars"); @@ -1304,9 +1303,8 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); - if (I) - if (auto *VecOp = dyn_cast(VPInst)) - VecOp->copyIRFlags(I); + if (auto *VecOp = dyn_cast_or_null(VPInst)) + VecOp->copyIRFlags(I); } else { llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 0dbb6e0541afc..04b3ba52cbefc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -31,7 +31,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> From 9cffa8fb152fe7ae0d884df4a74876b0bda8a3de Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Mon, 29 Jul 2024 11:28:43 -0700 Subject: [PATCH 10/15] Removed dead code, moved EVL to last args in EVL recipes, added debugloc --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 ++------ llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 4 ++-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 250e1bde9eb1c..2f523d1082041 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2635,7 +2635,7 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { /// using the address to load from, the explicit vector length and an optional /// mask. struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { - VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask) + VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Mask, VPValue &EVL) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), {L.getAddr(), &EVL}, L.isConsecutive(), L.isReverse(), L.getDebugLoc()), @@ -2716,7 +2716,7 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { /// using the value to store, the address to store to, the explicit vector /// length and an optional mask. 
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { - VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask) + VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Mask, VPValue &EVL) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), {S.getAddr(), S.getStoredValue(), &EVL}, S.isConsecutive(), S.isReverse(), S.getDebugLoc()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6153910a6e0da..59be53be65d42 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1273,6 +1273,7 @@ VPWidenEVLRecipe *VPWidenEVLRecipe::create(VPWidenRecipe *W, VPValue &EVL) { } void VPWidenEVLRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " "explicit vector length."); VPValue *Op0 = getOperand(0); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e9b04b2c953d3..160a4d62eeb52 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1319,10 +1319,6 @@ void VPlanTransforms::addActiveLaneMask( /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { - VPDominatorTree VPDT; - VPDT.recalculate(Plan); - SmallVector ToRemove; - SmallVector HeaderMasks = collectAllHeaderMasks(Plan); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { @@ -1338,11 +1334,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { TypeSwitch(CurRecipe) .Case([&](VPWidenLoadRecipe *L) { VPValue *NewMask = GetNewMask(L->getMask()); - return new VPWidenLoadEVLRecipe(L, &EVL, NewMask); + return new VPWidenLoadEVLRecipe(*L, NewMask, EVL); }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); - return new VPWidenStoreEVLRecipe(S, &EVL, NewMask); + return new VPWidenStoreEVLRecipe(*S, NewMask, EVL); }) .Case([&](VPWidenRecipe *W) -> VPRecipeBase * { unsigned Opcode = W->getOpcode(); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 9cf9060458bc9..ab2c99d2ac613 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1140,7 +1140,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, &VecOp, false); VPValue EVL; - VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp); + VPReductionEVLRecipe EVLRecipe(Recipe, &CondOp, EVL); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); EXPECT_FALSE(EVLRecipe.mayReadFromMemory()); EXPECT_FALSE(EVLRecipe.mayWriteToMemory()); @@ -1495,7 +1495,7 @@ TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, &VecOp, false); VPValue EVL; - VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp); + VPReductionEVLRecipe EVLRecipe(Recipe, &CondOp, EVL); EXPECT_TRUE(isa(&EVLRecipe)); VPRecipeBase *BaseR = &EVLRecipe; EXPECT_TRUE(isa(BaseR)); From 73a7df3fccf5001bd6a1553d2921149052db9c09 Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Wed, 7 Aug 2024 12:58:17 -0700 Subject: [PATCH 11/15] Addressed latest comments --- llvm/lib/Transforms/Vectorize/VPlan.h | 20 
++++++++++++++----- .../Transforms/Vectorize/VPlanTransforms.cpp | 8 ++++---- .../Transforms/Vectorize/VPlanTest.cpp | 4 ++-- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 2f523d1082041..e9435b15f9e5b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1431,7 +1431,15 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { return R; } - VP_CLASSOF_IMPL(VPDef::VPWidenSC) + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenSC || + R->getVPDefID() == VPRecipeBase::VPWidenEVLSC; + } + + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && classof(R); + } /// Produce a widened instruction using the opcode and operands of the recipe, /// processing State.VF elements. @@ -1450,6 +1458,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { #endif }; +/// A recipe for widening operations with vector-predication intrinsics.with +/// explicit vector length (EVL). class VPWidenEVLRecipe : public VPWidenRecipe { using VPRecipeWithIRFlags::transferFlags; @@ -1467,7 +1477,7 @@ class VPWidenEVLRecipe : public VPWidenRecipe { ~VPWidenEVLRecipe() override = default; VPWidenRecipe *clone() override final { - llvm_unreachable("VPWidenStoreEVLRecipe cannot be cloned"); + llvm_unreachable("VPWidenEVLRecipe cannot be cloned"); return nullptr; } @@ -2319,7 +2329,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { /// The Operands are {ChainOp, VecOp, EVL, [Condition]}. class VPReductionEVLRecipe : public VPReductionRecipe { public: - VPReductionEVLRecipe(VPReductionRecipe &R, VPValue *CondOp, VPValue &EVL) + VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp) : VPReductionRecipe( VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(), cast_or_null(R.getUnderlyingValue()), @@ -2635,7 +2645,7 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { /// using the address to load from, the explicit vector length and an optional /// mask. struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { - VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Mask, VPValue &EVL) + VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), {L.getAddr(), &EVL}, L.isConsecutive(), L.isReverse(), L.getDebugLoc()), @@ -2716,7 +2726,7 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { /// using the value to store, the address to store to, the explicit vector /// length and an optional mask. 
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { - VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Mask, VPValue &EVL) + VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), {S.getAddr(), S.getStoredValue(), &EVL}, S.isConsecutive(), S.isReverse(), S.getDebugLoc()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 160a4d62eeb52..90cc493610156 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1334,11 +1334,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { TypeSwitch(CurRecipe) .Case([&](VPWidenLoadRecipe *L) { VPValue *NewMask = GetNewMask(L->getMask()); - return new VPWidenLoadEVLRecipe(*L, NewMask, EVL); + return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); - return new VPWidenStoreEVLRecipe(*S, NewMask, EVL); + return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); }) .Case([&](VPWidenRecipe *W) -> VPRecipeBase * { unsigned Opcode = W->getOpcode(); @@ -1348,8 +1348,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { return new VPWidenEVLRecipe(W, EVL); }) .Case([&](VPReductionRecipe *Red) { - return new VPReductionEVLRecipe( - *Red, GetNewMask(Red->getCondOp()), EVL); + return new VPReductionEVLRecipe(*Red, EVL, + GetNewMask(Red->getCondOp())); }) .Default([&](VPRecipeBase *R) { return nullptr; }); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index ab2c99d2ac613..9cf9060458bc9 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1140,7 +1140,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, &VecOp, false); VPValue EVL; - VPReductionEVLRecipe EVLRecipe(Recipe, &CondOp, EVL); + VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); EXPECT_FALSE(EVLRecipe.mayReadFromMemory()); EXPECT_FALSE(EVLRecipe.mayWriteToMemory()); @@ -1495,7 +1495,7 @@ TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, &VecOp, false); VPValue EVL; - VPReductionEVLRecipe EVLRecipe(Recipe, &CondOp, EVL); + VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp); EXPECT_TRUE(isa(&EVLRecipe)); VPRecipeBase *BaseR = &EVLRecipe; EXPECT_TRUE(isa(BaseR)); From 906c4301ab4b9c1984550baf2c67892d5091b9eb Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Tue, 20 Aug 2024 09:02:42 -0700 Subject: [PATCH 12/15] Addressed comments --- llvm/lib/Transforms/Vectorize/VPlan.h | 26 ++- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 -- .../Transforms/Vectorize/VPlanTransforms.cpp | 35 +++-- ...-force-tail-with-evl-reverse-load-store.ll | 148 +++++++++--------- 4 files changed, 103 insertions(+), 119 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e9435b15f9e5b..cec3135395483 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1458,7 +1458,7 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { #endif }; -/// A recipe for widening operations with vector-predication intrinsics.with +/// A recipe for 
widening operations with vector-predication intrinsics with /// explicit vector length (EVL). class VPWidenEVLRecipe : public VPWidenRecipe { using VPRecipeWithIRFlags::transferFlags; @@ -1469,9 +1469,9 @@ class VPWidenEVLRecipe : public VPWidenRecipe { : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) { addOperand(&EVL); } - VPWidenEVLRecipe(VPWidenRecipe *W, VPValue &EVL) - : VPWidenEVLRecipe(*W->getUnderlyingInstr(), W->operands(), EVL) { - this->transferFlags(*W); + VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL) + : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) { + transferFlags(W); } ~VPWidenEVLRecipe() override = default; @@ -1491,7 +1491,13 @@ class VPWidenEVLRecipe : public VPWidenRecipe { void execute(VPTransformState &State) override final; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + // EVL in that recipe is always the last operand, thus any use before means + // the VPValue should be vectorized. + return getEVL() == Op; + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -2653,11 +2659,6 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { setMask(Mask); } - VPWidenLoadEVLRecipe *clone() override { - llvm_unreachable("VPWidenLoadEVLRecipe cannot be cloned"); - return nullptr; - } - VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC) /// Return the EVL operand. @@ -2733,11 +2734,6 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { setMask(Mask); } - VPWidenStoreEVLRecipe *clone() override { - llvm_unreachable("VPWidenStoreEVLRecipe cannot be cloned"); - return nullptr; - } - VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC) /// Return the address accessed by this recipe. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 59be53be65d42..227fabe0c02c3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1266,12 +1266,6 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } } -VPWidenEVLRecipe *VPWidenEVLRecipe::create(VPWidenRecipe *W, VPValue &EVL) { - auto *R = new VPWidenEVLRecipe(*W->getUnderlyingInstr(), W->operands(), EVL); - R->transferFlags(*W); - return R; -} - void VPWidenEVLRecipe::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " @@ -1313,13 +1307,6 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { State.addMetadata(VPInst, I); } -bool VPWidenEVLRecipe::onlyFirstLaneUsed(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - // EVL in that recipe is always the last operand, thus any use before means - // the VPValue should be vectorized. 
- return getEVL() == Op; -} - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 90cc493610156..27ef417652f88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1345,29 +1345,30 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) return nullptr; - return new VPWidenEVLRecipe(W, EVL); + return new VPWidenEVLRecipe(*W, EVL); }) .Case([&](VPReductionRecipe *Red) { - return new VPReductionEVLRecipe(*Red, EVL, - GetNewMask(Red->getCondOp())); + VPValue *NewMask = GetNewMask(Red->getCondOp()); + return new VPReductionEVLRecipe(*Red, EVL, NewMask); }) .Default([&](VPRecipeBase *R) { return nullptr; }); - if (NewRecipe) { - [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); - assert(NumDefVal == CurRecipe->getNumDefinedValues() && - "New recipe must define the same number of values as the " - "original."); - assert( - NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); - NewRecipe->insertBefore(CurRecipe); - if (isa(NewRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); - } - CurRecipe->eraseFromParent(); + if (!NewRecipe) + continue; + + [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); + assert(NumDefVal == CurRecipe->getNumDefinedValues() && + "New recipe must define the same number of values as the " + "original."); + assert( + NumDefVal <= 1 && + "Only supports recipes with a single definition or without users."); + NewRecipe->insertBefore(CurRecipe); + if (isa(NewRecipe)) { + VPValue *CurVPV = CurRecipe->getVPSingleValue(); + CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); } + CurRecipe->eraseFromParent(); } recursivelyDeleteDeadRecipes(HeaderMask); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index bda0c0100d6f4..d62f70c06a5fb 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -16,46 +16,46 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] +; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] ; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 -; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: 
vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 -; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 -; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP8]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; IF-EVL-NEXT: [[TMP12:%.*]] = mul i64 0, [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = sub i64 1, [[TMP11]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP13]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]] -; IF-EVL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 -; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 0, [[TMP18]] -; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 1, [[TMP18]] -; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP19]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP20]] -; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], -1 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP13]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 
[[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 +; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP20]] +; IF-EVL-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP20]] +; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]] +; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]] +; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP8]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -119,61 +119,61 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] +; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] ; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 -; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 ; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 -; IF-EVL-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX3]], 0 +; 
IF-EVL-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX3]], 0 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP10:%.*]] = add zeroinitializer, [[TMP9]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP10]] -; IF-EVL-NEXT: [[TMP11:%.*]] = icmp ule [[VEC_IV]], shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[TMP7]], -1 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP8]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP16:%.*]] = select [[TMP11]], [[TMP15]], zeroinitializer -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP20]] -; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP21]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP23]], [[VP_REVERSE_MASK]], i32 [[TMP6]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; IF-EVL-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP26]] -; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP26]] -; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP27]] -; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP28]] -; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP30]], [[VP_REVERSE_MASK6]], i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP31:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP31]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP12:%.*]] = add zeroinitializer, [[TMP11]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP12]] +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IV]], shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP14:%.*]] = add i64 [[TMP9]], -1 +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP10]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP18:%.*]] = select [[TMP13]], [[TMP17]], zeroinitializer +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]] +; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]] +; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP22]] +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP25]], [[VP_REVERSE_MASK]], i32 [[TMP8]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4 +; IF-EVL-NEXT: [[TMP29:%.*]] = mul i64 0, [[TMP28]] +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 1, [[TMP28]] +; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]] +; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]] +; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP32]], [[VP_REVERSE_MASK6]], i32 [[TMP8]]) +; IF-EVL-NEXT: [[TMP33:%.*]] = zext i32 [[TMP8]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; IF-EVL-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: From 7ff8c04842f9c215c40a9bd4964be65abd49c70d Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Mon, 26 Aug 2024 
12:26:14 -0700 Subject: [PATCH 13/15] Refactored code and removed flag propagation as they're not yet supported by vp intrinsics --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 36 ++++++++----------- ...-force-tail-with-evl-bin-unary-ops-args.ll | 10 +++--- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 227fabe0c02c3..2509932bb0aff 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1267,6 +1267,11 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } void VPWidenEVLRecipe::execute(VPTransformState &State) { + unsigned Opcode = getOpcode(); + // TODO: Support other opcodes + if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) + llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); + State.setDebugLocFrom(getDebugLoc()); assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " "explicit vector length."); @@ -1277,34 +1282,23 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { VPValue *EVL = getEVL(); Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true); - unsigned Opcode = getOpcode(); - Instruction *I = getUnderlyingInstr(); IRBuilderBase &BuilderIR = State.Builder; VectorBuilder Builder(BuilderIR); Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); - Value *VPInst = nullptr; - - //===------------------- Binary and Unary Ops ---------------------===// - if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode)) { - // Just widen unops and binops. - SmallVector Ops; - for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { - VPValue *VPOp = getOperand(I); - Ops.push_back(State.get(VPOp, 0)); - } + SmallVector Ops; + for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { + VPValue *VPOp = getOperand(I); + Ops.push_back(State.get(VPOp, 0)); + } - Builder.setMask(Mask).setEVL(EVLArg); - VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, - "vp.op"); + Builder.setMask(Mask).setEVL(EVLArg); + Value *VPInst = + Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); - if (auto *VecOp = dyn_cast_or_null(VPInst)) - VecOp->copyIRFlags(I); - } else { - llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); - } State.set(this, VPInst, 0); - State.addMetadata(VPInst, I); + State.addMetadata(VPInst, + dyn_cast_or_null(getUnderlyingValue())); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll index 501e27d73737c..c821083b6197a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll @@ -1267,7 +1267,7 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fadd.nxv4f32( [[VP_OP_LOAD]], 
shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fadd.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1362,7 +1362,7 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fsub.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fsub.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1457,7 +1457,7 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fmul.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fmul.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1552,7 +1552,7 @@ define void @test_fdiv(ptr nocapture %a, 
ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fdiv.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fdiv.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1700,7 +1700,7 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fneg.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fneg.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) From 7987c798b564e5c86468014a4561201342819723 Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Mon, 26 Aug 2024 17:18:41 -0700 Subject: [PATCH 14/15] Only set FMFs on vp intrinsics VP intrinsics can only accept FMFs at this moment, thus trying to set other flags will lead to ICE --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 ++ ...vectorize-force-tail-with-evl-bin-unary-ops-args.ll | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2509932bb0aff..c8e9290cd6f9b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1295,6 +1295,8 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { Builder.setMask(Mask).setEVL(EVLArg); Value *VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); + if (isa(VPInst)) + setFlags(cast(VPInst)); State.set(this, VPInst, 0); State.addMetadata(VPInst, diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll index c821083b6197a..501e27d73737c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll @@ -1267,7 +1267,7 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fadd.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fadd.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1362,7 +1362,7 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fsub.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fsub.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1457,7 +1457,7 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 
[[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fmul.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fmul.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1552,7 +1552,7 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fdiv.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fdiv.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) @@ -1700,7 +1700,7 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.fneg.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call fast @llvm.vp.fneg.nxv4f32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP16]], ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) From 1f8726efad19e51da3d799e565d2932b5acf5feb Mon Sep 17 00:00:00 2001 From: Kolya Panchenko Date: Wed, 4 Sep 2024 12:46:21 -0700 Subject: [PATCH 15/15] 
Rebase --- .../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 + ...ze-force-tail-with-evl-masked-loadstore.ll | 2 +- ...-force-tail-with-evl-reverse-load-store.ll | 148 +++++++++--------- 4 files changed, 80 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index e8423a23f3e58..f091ee5a71b29 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -263,9 +263,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) { return inferScalarType(R->getOperand(0)); }) - .Case( + .Case( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) .Case([V](const VPInterleaveRecipe *R) { // TODO: Use info from interleave group. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c8e9290cd6f9b..911c485a3ded6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1295,6 +1295,8 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { Builder.setMask(Mask).setEVL(EVLArg); Value *VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); + // Currently vp-intrinsics only accept FMF flags. + // TODO: Enable other flags when support is added. if (isa(VPInst)) setFlags(cast(VPInst)); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll index 6d81b55fc8d89..99da5058fbf92 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll @@ -38,7 +38,7 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() ; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] ; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index d62f70c06a5fb..c1cf8b0fc541e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -16,46 +16,46 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = 
urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] ; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 -; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 -; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], -1 -; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP10]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP13]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP10]] -; IF-EVL-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 -; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP20]] -; IF-EVL-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP20]] -; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]] -; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP8]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr 
[[PTR:%.*]], i64 [[TMP8]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; IF-EVL-NEXT: [[TMP12:%.*]] = mul i64 0, [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = sub i64 1, [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP13]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 0, [[TMP18]] +; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 1, [[TMP18]] +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP20]] +; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -119,61 +119,61 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] ; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 -; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true) +; 
IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 ; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 -; IF-EVL-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX3]], 0 +; IF-EVL-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX3]], 0 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP12:%.*]] = add zeroinitializer, [[TMP11]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP12]] -; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IV]], shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP14:%.*]] = add i64 [[TMP9]], -1 -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP10]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP18:%.*]] = select [[TMP13]], [[TMP17]], zeroinitializer -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP22]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP25]], [[VP_REVERSE_MASK]], i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4 -; IF-EVL-NEXT: [[TMP29:%.*]] = mul i64 0, [[TMP28]] -; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 1, [[TMP28]] -; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]] -; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]] -; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP8]]) -; 
IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP32]], [[VP_REVERSE_MASK6]], i32 [[TMP8]]) -; IF-EVL-NEXT: [[TMP33:%.*]] = zext i32 [[TMP8]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; IF-EVL-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP10:%.*]] = add zeroinitializer, [[TMP9]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP10]] +; IF-EVL-NEXT: [[TMP11:%.*]] = icmp ule [[VEC_IV]], shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[TMP7]], -1 +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP8]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP16:%.*]] = select [[TMP11]], [[TMP15]], zeroinitializer +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]] +; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP20]] +; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP21]] +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP23]], [[VP_REVERSE_MASK]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; IF-EVL-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP26]] +; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP26]] +; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP27]] +; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP28]] +; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP30]], [[VP_REVERSE_MASK6]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP31:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP31]], [[EVL_BASED_IV]] +; 
IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; IF-EVL-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: