diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index f715e0ec8dbb4..5e15d8d56703e 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -180,9 +180,10 @@ class MemoryDepChecker {
   MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
                    const DenseMap<Value *, const SCEV *> &SymbolicStrides,
-                   unsigned MaxTargetVectorWidthInBits)
+                   unsigned MaxTargetVectorWidthInBits, bool AllowNonPow2Deps)
       : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides),
-        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
+        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
+        AllowNonPow2Deps(AllowNonPow2Deps) {}
 
   /// Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -218,17 +219,28 @@ class MemoryDepChecker {
 
   /// Return true if there are no store-load forwarding dependencies.
   bool isSafeForAnyStoreLoadForwardDistances() const {
-    return MaxStoreLoadForwardSafeDistanceInBits ==
-           std::numeric_limits<uint64_t>::max();
+    return MaxPowerOf2StoreLoadForwardSafeDistanceInBits ==
+               std::numeric_limits<uint64_t>::max() &&
+           MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits ==
+               std::numeric_limits<uint64_t>::max();
   }
 
-  /// Return safe power-of-2 number of elements, which do not prevent store-load
-  /// forwarding, multiplied by the size of the elements in bits.
-  uint64_t getStoreLoadForwardSafeDistanceInBits() const {
+  /// Return the safe power-of-2 number of elements, which do not prevent
+  /// store-load forwarding, multiplied by the size of the elements in bits.
+  uint64_t getPowerOf2StoreLoadForwardSafeDistanceInBits() const {
     assert(!isSafeForAnyStoreLoadForwardDistances() &&
            "Expected the distance, that prevent store-load forwarding, to be "
            "set.");
-    return MaxStoreLoadForwardSafeDistanceInBits;
+    return MaxPowerOf2StoreLoadForwardSafeDistanceInBits;
+  }
+
+  /// Return the safe non-power-of-2 number of elements, which do not prevent
+  /// store-load forwarding, multiplied by the size of the elements in bits.
+  uint64_t getNonPowerOf2StoreLoadForwardSafeDistanceInBits() const {
+    assert(!isSafeForAnyStoreLoadForwardDistances() &&
+           "Expected the distance, that prevent store-load forwarding, to be "
+           "set.");
+    return MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits;
   }
 
   /// In same cases when the dependency check fails we can still
@@ -319,9 +331,14 @@ class MemoryDepChecker {
   /// restrictive.
   uint64_t MaxSafeVectorWidthInBits = -1U;
 
-  /// Maximum power-of-2 number of elements, which do not prevent store-load
-  /// forwarding, multiplied by the size of the elements in bits.
-  uint64_t MaxStoreLoadForwardSafeDistanceInBits =
+  /// Maximum number of elements, which do not prevent store-load forwarding,
+  /// multiplied by the size of the elements in bits (power-of-2).
+  uint64_t MaxPowerOf2StoreLoadForwardSafeDistanceInBits =
+      std::numeric_limits<uint64_t>::max();
+
+  /// Maximum number of elements, which do not prevent store-load forwarding,
+  /// multiplied by the size of the elements in bits (non-power-of-2).
+  uint64_t MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits =
       std::numeric_limits<uint64_t>::max();
 
   /// If we see a non-constant dependence distance we can still try to
@@ -348,6 +365,9 @@ class MemoryDepChecker {
   /// backwards-vectorizable or unknown (triggering a runtime check).
   unsigned MaxTargetVectorWidthInBits = 0;
 
+  /// True if the current target supports non-power-of-2 dependence distances.
+  bool AllowNonPow2Deps = false;
+
   /// Mapping of SCEV expressions to their expanded pointer bounds (pair of
   /// start and end pointer expressions).
   DenseMap<std::pair<const SCEV *, Type *>,
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index d654ac3ec9273..465a3904c3f74 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -382,8 +382,7 @@ class LoopVectorizationLegality {
   const LoopAccessInfo *getLAI() const { return LAI; }
 
   bool isSafeForAnyVectorWidth() const {
-    return LAI->getDepChecker().isSafeForAnyVectorWidth() &&
-           LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
+    return LAI->getDepChecker().isSafeForAnyVectorWidth();
   }
 
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -414,8 +413,15 @@ class LoopVectorizationLegality {
 
   /// Return safe power-of-2 number of elements, which do not prevent store-load
   /// forwarding and safe to operate simultaneously.
-  uint64_t getMaxStoreLoadForwardSafeDistanceInBits() const {
-    return LAI->getDepChecker().getStoreLoadForwardSafeDistanceInBits();
+  uint64_t getPowerOf2MaxStoreLoadForwardSafeDistanceInBits() const {
+    return LAI->getDepChecker().getPowerOf2StoreLoadForwardSafeDistanceInBits();
+  }
+
+  /// Return safe non-power-of-2 number of elements, which do not prevent
+  /// store-load forwarding and safe to operate simultaneously.
+  uint64_t getNonPowerOf2MaxStoreLoadForwardSafeDistanceInBits() const {
+    return LAI->getDepChecker()
+        .getNonPowerOf2StoreLoadForwardSafeDistanceInBits();
   }
 
   /// Returns true if vector representation of the instruction \p I
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 9a7d361b5b512..33f5f75c81d67 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1756,7 +1756,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   // Maximum vector factor.
   uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
       std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
-               MaxStoreLoadForwardSafeDistanceInBits);
+               MaxPowerOf2StoreLoadForwardSafeDistanceInBits);
+  uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 = 0;
 
   // Compute the smallest VF at which the store and load would be misaligned.
   for (uint64_t VF = 2 * TypeByteSize;
@@ -1768,24 +1769,61 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
       break;
     }
   }
+  // RISC-V VLA supports non-power-of-2 vector factors, so iterate backwards to
+  // find the largest VF that either keeps the stores and loads aligned or
+  // leaves at least NumItersForStoreLoadThroughMemory (8) iterations between
+  // the conflicting memory addresses.
+  if (AllowNonPow2Deps) {
+    MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+        std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+                 MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits);
+
+    for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2;
+         VF > MaxVFWithoutSLForwardIssuesPowerOf2; VF -= TypeByteSize) {
+      if (Distance % VF == 0 ||
+          Distance / VF >= NumItersForStoreLoadThroughMemory) {
+        uint64_t GCD =
+            isSafeForAnyStoreLoadForwardDistances()
+                ? VF
+                : std::gcd(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits,
+                           VF);
+        MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
+        break;
+      }
+    }
+  }
 
-  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
+  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
     LLVM_DEBUG(
         dbgs() << "LAA: Distance " << Distance
                << " that could cause a store-load forwarding conflict\n");
     return true;
   }
 
+  // Handle the non-power-of-2 store-load forwarding distance; the power-of-2
+  // distance is handled separately below.
+  if (AllowNonPow2Deps && CommonStride &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+          MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+          VectorizerParams::MaxVectorWidth * TypeByteSize) {
+    uint64_t MaxVF = MaxVFWithoutSLForwardIssuesNonPowerOf2 / CommonStride;
+    uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
+    MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits =
+        std::min(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits);
+  }
+
   if (CommonStride &&
       MaxVFWithoutSLForwardIssuesPowerOf2 <
-          MaxStoreLoadForwardSafeDistanceInBits &&
+          MaxPowerOf2StoreLoadForwardSafeDistanceInBits &&
       MaxVFWithoutSLForwardIssuesPowerOf2 !=
           VectorizerParams::MaxVectorWidth * TypeByteSize) {
     uint64_t MaxVF =
         bit_floor(MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
     uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
-    MaxStoreLoadForwardSafeDistanceInBits =
-        std::min(MaxStoreLoadForwardSafeDistanceInBits, MaxVFInBits);
+    MaxPowerOf2StoreLoadForwardSafeDistanceInBits =
+        std::min(MaxPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits);
   }
   return false;
 }
@@ -2250,7 +2288,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
     return Dependence::Unknown;
   }
 
-  MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
+  if (!AllowNonPow2Deps)
+    MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
   return Dependence::BackwardVectorizable;
 }
 
@@ -2984,8 +3023,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
     MaxTargetVectorWidthInBits =
        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
 
-  DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
-                                                  MaxTargetVectorWidthInBits);
+  DepChecker = std::make_unique<MemoryDepChecker>(
+      *PSE, L, SymbolicStrides, MaxTargetVectorWidthInBits,
+      TTI && TTI->hasActiveVectorLength(0, nullptr, Align()));
   PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
   if (canAnalyzeLoop())
     CanVecMem = analyzeLoop(AA, LI, TLI, DT);
@@ -2999,7 +3039,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
       OS << " with a maximum safe vector width of "
          << DC.getMaxSafeVectorWidthInBits() << " bits";
     if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
-      uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
+      uint64_t SLDist = DC.getNonPowerOf2StoreLoadForwardSafeDistanceInBits();
+      if (SLDist == std::numeric_limits<uint64_t>::max())
+        SLDist = DC.getPowerOf2StoreLoadForwardSafeDistanceInBits();
       OS << ", with a maximum safe store-load forward width of " << SLDist
          << " bits";
     }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 36e14bb27a029..b5689cd38cb3d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1559,9 +1559,10 @@ class LoopVectorizationCostModel {
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
   /// disabled or unsupported, then the scalable part will be equal to
   /// ElementCount::getScalable(0).
-  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
-                                           ElementCount UserVF,
-                                           bool FoldTailByMasking);
+  FixedScalableVFPair
+  computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF,
+                       bool FoldTailByMasking,
+                       bool AllowNonPowerOf2SafeDist = false);
 
   /// \return the maximized element count based on the targets vector
   /// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -3751,7 +3752,9 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
     return false;
   }
 
-  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
+  if ((!Legal->isSafeForAnyVectorWidth() ||
+       !Legal->isSafeForAnyStoreLoadForwardDistances()) &&
+      !getMaxVScale(*TheFunction, TTI)) {
     reportVectorizationInfo("The target does not provide maximum vscale value "
                             "for safe distance analysis.",
                             "ScalableVFUnfeasible", ORE, TheLoop);
@@ -3769,7 +3772,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
   auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());
-  if (Legal->isSafeForAnyVectorWidth())
+  if (Legal->isSafeForAnyVectorWidth() &&
+      Legal->isSafeForAnyStoreLoadForwardDistances())
     return MaxScalableVF;
 
   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
@@ -3786,7 +3790,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
 }
 
 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
-    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
+    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking,
+    bool AllowNonPowerOf2SafeDist) {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -3795,18 +3800,32 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
-  unsigned MaxSafeElementsPowerOf2 =
-      bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+  unsigned MaxSafeElementsNonPowerOf2 =
+      Legal->getMaxSafeVectorWidthInBits() / WidestType;
+  unsigned MaxSafeElementsPowerOf2 = bit_floor(MaxSafeElementsNonPowerOf2);
   if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
-    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
-    MaxSafeElementsPowerOf2 =
-        std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
+    uint64_t SLDist = Legal->getPowerOf2MaxStoreLoadForwardSafeDistanceInBits();
+    if (SLDist != std::numeric_limits<uint64_t>::max()) {
+      unsigned SLVF = SLDist / WidestType;
+      MaxSafeElementsPowerOf2 = std::min(MaxSafeElementsPowerOf2, SLVF);
+    }
+    if (FoldTailByMasking && AllowNonPowerOf2SafeDist) {
+      uint64_t SLDist =
+          Legal->getNonPowerOf2MaxStoreLoadForwardSafeDistanceInBits();
+      if (SLDist != std::numeric_limits<uint64_t>::max()) {
+        unsigned SLVF = SLDist / WidestType;
+        MaxSafeElements = Legal->isSafeForAnyVectorWidth()
+                              ? SLVF
+                              : std::gcd(MaxSafeElementsNonPowerOf2, SLVF);
+      }
+    } else {
+      MaxSafeElements = MaxSafeElementsPowerOf2;
+    }
   }
   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
-  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
-
-  if (!Legal->isSafeForAnyVectorWidth())
-    this->MaxSafeElements = MaxSafeElementsPowerOf2;
+  auto MaxSafeScalableVF = getMaxLegalScalableVF(
+      FoldTailByMasking && AllowNonPowerOf2SafeDist ? MaxSafeElementsNonPowerOf2
+                                                    : MaxSafeElementsPowerOf2);
 
   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                     << ".\n");
@@ -4018,7 +4037,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return Rem->isZero();
   };
 
-  if (MaxPowerOf2RuntimeVF > 0u) {
+  FixedScalableVFPair FoldTailMaxFactors =
+      computeFeasibleMaxVF(MaxTC, UserVF, /*FoldTailByMasking=*/true,
+                           /*AllowNonPowerOf2SafeDist=*/true);
+  if ((Legal->isSafeForAnyStoreLoadForwardDistances() ||
+       has_single_bit(*MaxSafeElements)) &&
+      MaxPowerOf2RuntimeVF) {
    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
           "MaxFixedVF must be a power of 2");
    if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4030,7 +4054,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
   if (ExpectedTC &&
       ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
-    if (MaxPowerOf2RuntimeVF > 0u) {
+    if (MaxPowerOf2RuntimeVF) {
+      assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
+             "MaxFixedVF must be a power of 2");
+      if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
+        // Accept MaxFixedVF if we do not have a tail.
+        LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+        return MaxFactors;
+      }
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the fixed-width
       // factor in preference to allow the generation of a non-predicated loop.
@@ -4054,10 +4085,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
+  bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero() ||
+                            FoldTailMaxFactors.ScalableVF.isNonZero();
   setTailFoldingStyles(ContainsScalableVF, UserIC);
   if (foldTailByMasking()) {
-    if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+    if (foldTailWithEVL()) {
       LLVM_DEBUG(
           dbgs()
          << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
@@ -4069,6 +4101,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     assert(ContainsScalableVF && "Expected scalable vector factor.");
     MaxFactors.FixedVF = ElementCount::getFixed(1);
+    MaxFactors.ScalableVF = FoldTailMaxFactors.ScalableVF;
   }
   return MaxFactors;
 }
@@ -5137,7 +5170,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   }
 
   // We used the distance for the interleave count.
- if (!Legal->isSafeForAnyVectorWidth()) + if (!Legal->isSafeForAnyVectorWidth() || + !Legal->isSafeForAnyStoreLoadForwardDistances()) return 1; // We don't attempt to perform interleaving for loops with uncountable early diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll index fb2ec8c61d745..8886cd6c12892 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll @@ -102,23 +102,38 @@ define void @test_may_clobber1(ptr %p) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP9]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 -; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100 -; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; IF-EVL-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[TMP3]] +; IF-EVL-NEXT: [[TMP12:%.*]] = icmp ult i64 [[AVL]], 50 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP12]], i64 [[AVL]], i64 50 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add i64 [[TMP3]], 100 +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP17]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[TMP3]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 
true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -135,36 +150,17 @@ define void @test_may_clobber1(ptr %p) { ; ; NO-VP-LABEL: @test_may_clobber1( ; NO-VP-NEXT: entry: -; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NO-VP: vector.ph: -; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] -; NO-VP: vector.body: -; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 -; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100 -; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; NO-VP-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; NO-VP: middle.block: -; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; NO-VP-NEXT: br label [[LOOP:%.*]] ; NO-VP: loop: -; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]] ; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 ; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] ; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] ; NO-VP: exit: ; NO-VP-NEXT: ret void ; @@ -189,17 +185,51 @@ exit: define void @test_may_clobber2(ptr %p) { ; IF-EVL-LABEL: @test_may_clobber2( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; IF-EVL-NEXT: br label [[LOOP:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 
[[INDEX_EVL_NEXT:%.*]], [[LOOP]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[AVL]], 9 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP5]], i64 [[AVL]], i64 9 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP9]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 9 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP12]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br label [[EXIT:%.*]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: br label [[LOOP1:%.*]] ; IF-EVL: loop: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP1]] ] +; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV1]] ; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 -; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 9 +; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV1]], 9 ; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; IF-EVL-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV1]], 199 +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -242,23 +272,38 @@ define void @test_may_clobber3(ptr %p) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP9]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; IF-EVL-NEXT: 
[[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 -; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 10 -; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; IF-EVL-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[TMP3]] +; IF-EVL-NEXT: [[TMP12:%.*]] = icmp ult i64 [[AVL]], 10 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP12]], i64 [[AVL]], i64 10 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add i64 [[TMP3]], 10 +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP17]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[TMP3]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -269,42 +314,23 @@ define void @test_may_clobber3(ptr %p) { ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; ; NO-VP-LABEL: @test_may_clobber3( ; NO-VP-NEXT: entry: -; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NO-VP: vector.ph: -; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] -; NO-VP: vector.body: -; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 -; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 10 -; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; NO-VP-NEXT: store <2 x i64> 
[[WIDE_LOAD]], ptr [[TMP5]], align 32 -; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; NO-VP: middle.block: -; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; NO-VP-NEXT: br label [[LOOP:%.*]] ; NO-VP: loop: -; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]] ; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 10 ; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] ; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] ; NO-VP: exit: ; NO-VP-NEXT: ret void ; @@ -357,7 +383,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: @@ -372,7 +398,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -417,31 +443,31 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP7]], 2 +; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP13]], 1 ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]] -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP13]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 2 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult i64 
[[AVL]], 1024 -; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024 -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true) -; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], splat (i1 true), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[EVL_BASED_IV]], 1024 -; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add i64 [[EVL_BASED_IV]], 1024 +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP16]], splat (i1 true), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: @@ -456,7 +482,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 3001 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -497,11 +523,51 @@ exit: define void @non-power-2-storeloadforward(ptr %A) { ; IF-EVL-LABEL: @non-power-2-storeloadforward( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 [[TMP19]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 112, [[TMP20]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP19]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP21]], 4 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] 
] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 112, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[AVL]], 3 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP5]], i64 [[AVL]], i64 3 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = add i64 16, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -3 +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]] +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw i64 [[OFFSET_IDX]], 4 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ] ; IF-EVL-NEXT: [[TMP0:%.*]] = add nsw i64 [[IV]], -3 -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]] ; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[TMP2:%.*]] = add nsw i64 [[IV]], 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] @@ -512,7 +578,7 @@ define void @non-power-2-storeloadforward(ptr %A) { ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32 ; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128 -; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY1]], label [[FOR_END]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index b0b69c74a2299..6170144163441 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -3,12 +3,12 @@ ; RUN: opt -passes=loop-vectorize 
-debug-only=loop-vectorize \ ; RUN: -force-tail-folding-style=data-with-evl \ ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL %s ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ ; RUN: -force-tail-folding-style=none \ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' @@ -96,30 +96,62 @@ for.cond.cleanup: } define void @safe_dep(ptr %p) { -; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<512> = original trip-count -; CHECK-EMPTY: -; CHECK: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]> -; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> -; CHECK-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; CHECK-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]> -; CHECK-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> -; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> -; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]> -; CHECK-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } +; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { +; IF-EVL-NEXT: Live-in vp<[[VF:%[0-9]+]]> = VF +; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in ir<512> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[EVL_NEXT:%.+]]> +; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<512>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[CMP:%[0-9]+]]> = icmp ult vp<[[AVL]]>, ir<50> +; IF-EVL-NEXT: EMIT vp<[[SAFE_AVL:%.+]]> = select vp<[[CMP]]>, vp<[[AVL]]>, ir<50> +; IF-EVL-NEXT: EMIT vp<[[EVL:%[0-9]+]]> = EXPLICIT-VECTOR-LENGTH vp<[[SAFE_AVL]]> +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[VF]]> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-NEXT: WIDEN ir<[[V:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-NEXT: CLONE 
ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> +; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> +; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[V]]>, vp<[[EVL]]> +; IF-EVL-NEXT: EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-NEXT: EMIT vp<[[EVL_NEXT]]> = add nuw vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } +; +; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF +; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: Live-in ir<512> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> +; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; NO-VP-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]> +; NO-VP-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> +; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> +; NO-VP-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } entry: br label %loop
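
Note (illustrative, not part of the patch): the new non-power-of-2 path in MemoryDepChecker::couldPreventStoreLoadForward() scans candidate vector factors downwards and, when more than one dependence contributes a limit, folds the per-dependence results together with std::gcd. The standalone C++ sketch below models that arithmetic under my reading of the hunks above; largestSafeVFBytes and the concrete distances are invented for the example, and only the value 8 (NumItersForStoreLoadThroughMemory) and the "Distance % VF == 0 || Distance / VF >= 8" condition are taken from the patch.

// Simplified scalar model of the non-power-of-2 store-load forwarding clamp.
// Not the LAA implementation; an illustration of the selection arithmetic.
#include <cstdint>
#include <iostream>
#include <numeric> // std::gcd

constexpr uint64_t NumItersForStoreLoadThroughMemory = 8;

// Largest VF in bytes (a multiple of TypeByteSize, at most MaxBytes) for which
// a store->load dependence at 'Distance' bytes either stays aligned
// (Distance % VF == 0) or leaves enough iterations for the store to drain
// through memory before it is re-loaded.
uint64_t largestSafeVFBytes(uint64_t Distance, uint64_t TypeByteSize,
                            uint64_t MaxBytes) {
  for (uint64_t VF = MaxBytes; VF >= 2 * TypeByteSize; VF -= TypeByteSize)
    if (Distance % VF == 0 ||
        Distance / VF >= NumItersForStoreLoadThroughMemory)
      return VF;
  return 0; // No forwarding-friendly factor found.
}

int main() {
  const uint64_t TypeByteSize = 8; // i64 elements, as in the tests.
  const uint64_t MaxBytes = 16 * TypeByteSize;

  // Two hypothetical dependences with distances of 72 and 120 bytes.
  uint64_t SafeA = largestSafeVFBytes(72, TypeByteSize, MaxBytes);
  uint64_t SafeB = largestSafeVFBytes(120, TypeByteSize, MaxBytes);

  // Folding the per-dependence results with gcd keeps a clamp that divides
  // both distances, mirroring the std::gcd call in the patch.
  uint64_t Combined = std::gcd(SafeA, SafeB);
  std::cout << "SafeA=" << SafeA << " SafeB=" << SafeB
            << " Combined=" << Combined << " bytes\n";
  return 0;
}

Running this prints SafeA=72 SafeB=120 Combined=24 bytes, i.e. a 3-element, non-power-of-2 clamp that is compatible with both example distances.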
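A second note on the new IF-EVL check lines (the "icmp ult AVL, 9" / select / @llvm.experimental.get.vector.length sequence in test_may_clobber2): as I read them, the store-load forwarding distance is enforced at run time by clamping the available element count to the safe distance before requesting a vector length. The sketch below is a scalar C++ simulation of that idea, not RVV code; N, Dist and VLMax are example values (Dist mirrors the 9-element distance in the test), and the assert checks that the clamped loop reproduces the scalar loop.

// Scalar simulation of an EVL tail-folded loop with a safe-distance clamp.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  constexpr uint64_t N = 200;    // trip count, as in the test
  constexpr uint64_t Dist = 9;   // the store is 9 elements ahead of the load
  constexpr uint64_t VLMax = 16; // pretend hardware maximum, larger than Dist

  std::vector<int64_t> Ref(N + Dist), Vec;
  for (uint64_t I = 0; I < Ref.size(); ++I)
    Ref[I] = static_cast<int64_t>(I);
  Vec = Ref;

  // Scalar reference: for (i = 0; i < N; ++i) p[i + Dist] = p[i];
  for (uint64_t I = 0; I < N; ++I)
    Ref[I + Dist] = Ref[I];

  // EVL-style loop: clamp the requested length to the safe distance, then do
  // the whole "vector" load before the "vector" store.
  for (uint64_t I = 0; I < N;) {
    uint64_t AVL = N - I;
    uint64_t SafeAVL = std::min(AVL, Dist); // mirrors the icmp/select lines
    uint64_t VL = std::min(SafeAVL, VLMax); // what get.vector.length may grant
    int64_t Tmp[Dist];                      // VL <= Dist, so Dist slots suffice
    for (uint64_t L = 0; L < VL; ++L)
      Tmp[L] = Vec[I + L];
    for (uint64_t S = 0; S < VL; ++S)
      Vec[I + Dist + S] = Tmp[S];
    I += VL;
  }

  assert(Ref == Vec && "clamped vector loop must match the scalar loop");
  std::cout << "ok\n";
  return 0;
}

Because VL never exceeds Dist, the elements loaded in one vector step never overlap the elements stored in the same step, which is why the clamp keeps the backward dependence intact even for a non-power-of-2 distance.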