From 4c16b87b90aee4bebc1b6640af554fd449565d8d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Fri, 26 Jul 2024 14:38:27 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Teach LoopAccessAnalysis to track the maximum number of elements that
still allows store-to-load forwarding separately for power-of-2 and
non-power-of-2 vector factors, instead of folding the limit into
MinDepDistBytes, and use the non-power-of-2 bound to clamp the AVL in
EVL-based vectorization. Dependences that only restrict forwarding no
longer force the loop to be reported as unsafe; such loops can instead
be tail-folded with EVL on targets such as RISC-V.

Created using spr 1.3.5
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  25 ++
 .../Vectorize/LoopVectorizationLegality.h     |  12 +
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  55 ++++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  75 ++++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   5 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  11 +-
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +-
 .../LoopAccessAnalysis/depend_diff_types.ll   |   4 +-
 .../forward-loop-independent.ll               |   5 +-
 .../forward-negative-step.ll                  |   5 +-
 .../num-iters-for-store-load-conflict.ll      |  40 ++--
 .../Analysis/LoopAccessAnalysis/pr64637.ll    |   5 +-
 .../stride-access-dependence.ll               |   5 +-
 .../RISCV/riscv-vector-reverse.ll             |   4 +-
 ...e-force-tail-with-evl-safe-dep-distance.ll | 225 +++++++++++++-----
 .../LoopVectorize/memory-dep-remarks.ll       |  25 +-
 16 files changed, 359 insertions(+), 146 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e83f2e0..b661e117d01ee 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -37,6 +37,8 @@ class Value;
 struct VectorizerParams {
   /// Maximum SIMD width.
   static const unsigned MaxVectorWidth;
+  /// Maximum LMUL factor.
+  static const unsigned MaxVectorLMUL;
 
   /// VF as overridden by the user.
   static unsigned VectorizationFactor;
@@ -222,6 +224,23 @@ class MemoryDepChecker {
     return MaxSafeVectorWidthInBits;
   }
 
+  /// Return the maximum safe power-of-2 number of elements that does not
+  /// prevent store-load forwarding.
+  std::optional<uint64_t> getStoreLoadForwardSafeVFPowerOf2() const {
+    if (MaxStoreLoadForwardSafeVF.first ==
+        std::numeric_limits<uint64_t>::max())
+      return std::nullopt;
+    return MaxStoreLoadForwardSafeVF.first;
+  }
+
+  /// Return the maximum safe non-power-of-2 number of elements that does not
+  /// prevent store-load forwarding.
+  std::optional<uint64_t> getStoreLoadForwardSafeVFNonPowerOf2() const {
+    if (MaxStoreLoadForwardSafeVF.second ==
+        std::numeric_limits<uint64_t>::max())
+      return std::nullopt;
+    return MaxStoreLoadForwardSafeVF.second;
+  }
+
   /// In same cases when the dependency check fails we can still
   /// vectorize the loop with a dynamic array access check.
   bool shouldRetryWithRuntimeCheck() const {
@@ -310,6 +329,12 @@ class MemoryDepChecker {
   /// restrictive.
   uint64_t MaxSafeVectorWidthInBits = -1U;
 
+  /// Maximum numbers of elements (power-of-2 and non-power-of-2) that do not
+  /// prevent store-load forwarding.
+  std::pair<uint64_t, uint64_t> MaxStoreLoadForwardSafeVF =
+      std::make_pair(std::numeric_limits<uint64_t>::max(),
+                     std::numeric_limits<uint64_t>::max());
+
   /// If we see a non-constant dependence distance we can still try to
   /// vectorize this loop with runtime checks.
   bool FoundNonConstantDistanceDependence = false;
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bf..c16a5f9a1344c 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,18 @@ class LoopVectorizationLegality {
     return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
   }
 
+  /// Return the maximum safe power-of-2 number of elements that does not
+  /// prevent store-load forwarding.
+  std::optional<uint64_t> getMaxStoreLoadForwardSafeVFPowerOf2() const {
+    return LAI->getDepChecker().getStoreLoadForwardSafeVFPowerOf2();
+  }
+
+  /// Return the maximum safe non-power-of-2 number of elements that does not
+  /// prevent store-load forwarding.
+  std::optional<uint64_t> getMaxStoreLoadForwardSafeVFNonPowerOf2() const {
+    return LAI->getDepChecker().getStoreLoadForwardSafeVFNonPowerOf2();
+  }
+
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7ef3077..29816bd1d845c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(
 
 /// Maximum SIMD width.
 const unsigned VectorizerParams::MaxVectorWidth = 64;
+/// Maximum LMUL factor.
+const unsigned VectorizerParams::MaxVectorLMUL = 8;
 
 /// We collect dependences up to this threshold.
 static cl::opt<unsigned>
@@ -1764,31 +1766,64 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   // cause any slowdowns.
   const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
   // Maximum vector factor.
-  uint64_t MaxVFWithoutSLForwardIssues = std::min(
-      VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+  uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
+      std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+               MaxStoreLoadForwardSafeVF.first);
+  uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+      std::min(VectorizerParams::MaxVectorLMUL *
+                   VectorizerParams::MaxVectorWidth * TypeByteSize,
+               MaxStoreLoadForwardSafeVF.second);
 
   // Compute the smallest VF at which the store and load would be misaligned.
-  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
-       VF *= 2) {
+  for (uint64_t VF = 2 * TypeByteSize;
+       VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
     // If the number of vector iteration between the store and the load are
     // small we could incur conflicts.
     if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
-      MaxVFWithoutSLForwardIssues = (VF >> 1);
+      MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
+      break;
+    }
+  }
+  // RISCV VLA supports non-power-of-2 vector factors, so iterate in backward
+  // order to find the largest VF that either keeps the stores and loads
+  // aligned, or separates the conflicting memory addresses by at least 8
+  // iterations (NumItersForStoreLoadThroughMemory).
+  for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2,
+                E = 2 * TypeByteSize;
+       VF >= E; VF -= TypeByteSize) {
+    if (Distance % VF == 0 ||
+        Distance / VF >= NumItersForStoreLoadThroughMemory) {
+      uint64_t GCD = MaxStoreLoadForwardSafeVF.second ==
+                             std::numeric_limits<uint64_t>::max()
+                         ? VF
+                         : std::gcd(MaxStoreLoadForwardSafeVF.second, VF);
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
       break;
     }
   }
 
-  if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
     LLVM_DEBUG(
         dbgs() << "LAA: Distance " << Distance
                << " that could cause a store-load forwarding conflict\n");
     return true;
   }
 
-  if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
-      MaxVFWithoutSLForwardIssues !=
-          VectorizerParams::MaxVectorWidth * TypeByteSize)
-    MinDepDistBytes = MaxVFWithoutSLForwardIssues;
+  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.first = 1;
+  else if (MaxVFWithoutSLForwardIssuesPowerOf2 <
+               MaxStoreLoadForwardSafeVF.first &&
+           MaxVFWithoutSLForwardIssuesPowerOf2 !=
+               VectorizerParams::MaxVectorWidth * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.first = MaxVFWithoutSLForwardIssuesPowerOf2;
+  if (MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.second = 1;
+  else if (MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+               MaxStoreLoadForwardSafeVF.second &&
+           MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+               VectorizerParams::MaxVectorWidth * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.second = MaxVFWithoutSLForwardIssuesNonPowerOf2;
   return false;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..28e814e9c89e9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1444,9 +1444,8 @@ class LoopVectorizationCostModel {
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
-  /// \param IsScalableVF true if scalable vector factors enabled.
   /// \param UserIC User specific interleave count.
-  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+  void setTailFoldingStyles(unsigned UserIC) {
     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
     if (!Legal->canFoldTailByMasking()) {
       ChosenTailFoldingStyle =
@@ -1470,11 +1469,9 @@ class LoopVectorizationCostModel {
       // FIXME: use actual opcode/data type for analysis here.
       // FIXME: Investigate opportunity for fixed vector factor.
       bool EVLIsLegal =
-          IsScalableVF && UserIC <= 1 &&
+          UserIC <= 1 &&
           TTI.hasActiveVectorLength(0, nullptr, Align()) &&
-          !EnableVPlanNativePath &&
-          // FIXME: implement support for max safe dependency distance.
-          Legal->isSafeForAnyVectorWidth();
+          !EnableVPlanNativePath;
       if (!EVLIsLegal) {
         // If for some reason EVL mode is unsupported, fallback to
         // DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1492,6 +1489,14 @@ class LoopVectorizationCostModel {
     }
   }
 
+  /// Disables the previously chosen tail folding policy and sets it to None.
+  /// Expects that a tail folding policy was already selected.
+  void disableTailFolding() {
+    assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+    ChosenTailFoldingStyle =
+        std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+  }
+
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const {
     // TODO: check if it is possible to check for None style independent of
@@ -1499,6 +1504,14 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  /// Return the maximum safe number of elements to be processed that does
+  /// not prevent store-load forwarding.
+  /// TODO: need to consider adjusting cost model to use this value as a
+  /// vectorization factor for EVL-based vectorization.
+  std::optional<unsigned> getMaxEVLSafeElements() const {
+    return MaxEVLSafeElements;
+  }
+
   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
@@ -1654,6 +1667,10 @@ class LoopVectorizationCostModel {
   /// true if scalable vectorization is supported and enabled.
   std::optional<bool> IsScalableVectorizationAllowed;
 
+  /// Maximum safe number of elements to be processed, which do not
+  /// prevent store-load forwarding.
+  std::optional<unsigned> MaxEVLSafeElements;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -3903,11 +3920,31 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
-  unsigned MaxSafeElements =
-      llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+  unsigned MaxSafeElements = Legal->getMaxSafeVectorWidthInBits() / WidestType;
+  if (Legal->isSafeForAnyVectorWidth())
+    MaxSafeElements = PowerOf2Ceil(MaxSafeElements);
+  unsigned MaxFixedSafeElements = std::gcd(
+      MaxSafeElements,
+      Legal->getMaxStoreLoadForwardSafeVFPowerOf2().value_or(MaxSafeElements));
+  MaxFixedSafeElements = bit_floor(MaxFixedSafeElements);
+  unsigned MaxScalableSafeElements = MaxFixedSafeElements;
+  if (foldTailWithEVL()) {
+    MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+    std::optional<uint64_t> SafeStoreLoadForwarding =
+        Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2();
+    if (!Legal->isSafeForAnyVectorWidth() || SafeStoreLoadForwarding) {
+      unsigned SLForwardDist =
+          Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2().value_or(
+              MaxSafeElements);
+      if (MaxSafeElements >= SLForwardDist)
+        MaxEVLSafeElements = SLForwardDist;
+      else
+        MaxEVLSafeElements = std::gcd(MaxSafeElements, SLForwardDist);
+    }
+  }
 
-  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
-  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+  auto MaxSafeFixedVF = ElementCount::getFixed(MaxFixedSafeElements);
+  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
 
   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                     << ".\n");
@@ -4077,7 +4114,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
 
-  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+  // If we don't know the precise trip count, or if the trip count that we
+  // found modulo the vectorization factor is not zero, try to fold the tail
+  // by masking.
+  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+  setTailFoldingStyles(UserIC);
+  FixedScalableVFPair MaxFactors =
+      computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
 
   // Avoid tail folding if the trip count is known to be a multiple of any VF
   // we choose.
@@ -4108,15 +4151,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     if (Rem->isZero()) {
       // Accept MaxFixedVF if we do not have a tail.
       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+      disableTailFolding();
       return MaxFactors;
     }
   }
 
-  // If we don't know the precise trip count, or if the trip count that we
-  // found modulo the vectorization factor is not zero, try to fold the tail
-  // by masking.
-  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
   if (foldTailByMasking()) {
     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
       LLVM_DEBUG(
@@ -8388,8 +8427,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       VPlanTransforms::optimize(*Plan, *PSE.getSE());
       // TODO: try to put it close to addActiveLaneMask().
       // Discard the plan if it is not EVL-compatible
-      if (CM.foldTailWithEVL() &&
-          !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+      if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+                                      *Plan, CM.getMaxEVLSafeElements()))
         break;
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..de24688593ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     assert(State.VF.isScalable() && "Expected scalable vector factor.");
     Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+    if (getNumOperands() == 3) {
+      Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+      AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+                                                MaxSafeVF);
+    }
     Value *EVL = State.Builder.CreateIntrinsic(
         State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
         {AVL, VFArg, State.Builder.getTrue()});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..e703bb893d938 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
 /// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+    VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   // The transform updates all users of inductions to work based on EVL, instead
   // of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
   EVLPhi->insertAfter(CanonicalIVPHI);
-  auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
-                                  {EVLPhi, Plan.getTripCount()});
+  SmallVector<VPValue *> Operands = {EVLPhi, Plan.getTripCount()};
+  if (MaxEVLSafeElements)
+    Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+        CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+  auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
+                                  Operands, DebugLoc());
   VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
 
   auto *CanonicalIVIncrement =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c..8158c832f1a95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
   /// VPCanonicalIVPHIRecipe is only used to control the loop after
   /// this transformation.
   /// \returns true if the transformation succeeds, or false if it doesn't.
-  static bool tryAddExplicitVectorLength(VPlan &Plan);
+  static bool
+  tryAddExplicitVectorLength(VPlan &Plan,
+                             const std::optional<unsigned> &MaxEVLSafeElements);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 81d8b01fe7fb7..c5ba25a5c0ace 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -140,11 +140,11 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
 ; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
 ; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
 ; CHECK-EMPTY:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
 ; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
 ; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8
 ; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 ->
 ; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 7fc9958dba552..6e4bcec013a73 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -24,14 +24,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
 ; CHECK-LABEL: 'f'
 ; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
 ; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
 ; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
 ; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
 ; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
index 46e81cd74ab31..f182a3d2aa7d1 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
@@ -47,10 +47,9 @@ exit:
 define void @neg_step_ForwardButPreventsForwarding(ptr nocapture %A, ptr noalias %B) {
 ; CHECK-LABEL: 'neg_step_ForwardButPreventsForwarding'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.A, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.A.plus.1, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll b/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
index d3eda21dee27e..3bb5155369205 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
@@ -4,10 +4,9 @@
 define void @forward_dist_7(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_7'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -40,10 +39,9 @@ exit:
 define void @forward_dist_9(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_9'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -76,10 +74,9 @@ exit:
 define void @forward_dist_11(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_11'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -112,10 +109,9 @@ exit:
 define void @forward_dist_13(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_13'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -148,10 +144,9 @@ exit:
 define void @forward_dist_15(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_15'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -184,10 +179,9 @@ exit:
 define void @forward_dist_17(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_17'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -220,10 +214,9 @@ exit:
 define void @forward_dist_19(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_19'
 ; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
 ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -258,10 +251,9 @@ exit:
 define void @unknown_loop_bounds(i64 %x, i64 %y) {
 ; CHECK-LABEL: 'unknown_loop_bounds'
 ; CHECK-NEXT: inner:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2368 bits
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
 ; CHECK-NEXT: %l = load double, ptr %gep.0, align 8 ->
 ; CHECK-NEXT: store double %l, ptr %gep.1, align 8
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
index d3e589cf99cf3..4c0fb6adce1a8 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
@@ -20,10 +20,9 @@
 define void @foo(ptr noalias nocapture noundef %y, ptr noalias nocapture noundef readnone %x, ptr noalias nocapture noundef readonly %indices, i32 noundef %n) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 96 bits
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
 ; CHECK-NEXT: %1 = load i32, ptr %arrayidx, align 4 ->
 ; CHECK-NEXT: store i32 %add8, ptr %arrayidx12, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
index ef19e173b6599..6aeebebb05766 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -416,10 +416,9 @@ for.body: ; preds = %entry, %for.body
 define void @vectorizable_unscaled_Read_Write(ptr nocapture %A) {
 ; CHECK-LABEL: 'vectorizable_unscaled_Read_Write'
 ; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 32 bits
 ; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
 ; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
 ; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842580425..4906cc1ca3042 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -21,7 +21,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
 ; CHECK-NEXT: LV: Found trip count: 0
 ; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe fixed VF is: 134217728.
 ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
 ; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -226,7 +226,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
 ; CHECK-NEXT: LV: Found trip count: 0
 ; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe fixed VF is: 134217728.
 ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
 ; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index 2dd47d5c1ea8a..6143ca8de8b34 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -103,24 +103,38 @@ define void @test_may_clobber1(ptr %p) {
 ; IF-EVL-NEXT: entry:
 ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IF-EVL: vector.body:
 ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 100)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 100
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
 ; IF-EVL: loop:
 ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -192,17 +206,51 @@ exit:
 define void @test_may_clobber2(ptr %p) {
 ; IF-EVL-LABEL: @test_may_clobber2(
 ; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 9)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 9
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
 ; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
 ; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
 ; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 9
 ; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
 ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -245,24 +293,38 @@ define void @test_may_clobber3(ptr %p) {
 ; IF-EVL-NEXT: entry:
 ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IF-EVL: vector.body:
 ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 10
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 10)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 10
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
 ; IF-EVL: loop:
 ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -273,7 +335,7 @@ define void @test_may_clobber3(ptr %p) {
 ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -363,7 +425,7 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL: scalar.ph:
@@ -378,7 +440,7 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -422,28 +484,38 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; IF-EVL-NEXT: entry:
 ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IF-EVL: vector.body:
 ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
-; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 1024)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 1024
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
 ; IF-EVL: loop:
 ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -454,7 +526,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -495,22 +567,63 @@ exit:
 define void @non-power-2-storeloadforward(ptr %A) {
 ; IF-EVL-LABEL: @non-power-2-storeloadforward(
 ; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 112, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[IND_END:%.*]] = add i64 16, [[N_VEC]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 112, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 3)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 4, i1 true)
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = add i64 16, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP8]], -3
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP8]], 4
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 16, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add nsw i64 [[IV]], -3
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[TMP2:%.*]] = add nsw i64 [[IV]], 4
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP20:%.*]] = add nsw i64 [[IV]], -3
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP22:%.*]] = add nsw i64 [[IV]], 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
 ; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
 ; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
-; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP15:![0-9]+]]
 ; IF-EVL: for.end:
 ; IF-EVL-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
index d96d85512621c..6945afd09dd07 100644
--- a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
+++ b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
@@ -194,9 +194,7 @@ for.body: ; preds = %for.body.preheader,
 ; }
 ; }
 
-; CHECK: remark: source.c:61:12: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding. Memory location is the same as accessed at source.c:60:5
-
+; CHECK-NOT: remark: source.c:{{[0-9]+}}:{{[0-9]+}}:
 define void @test_forwardButPreventsForwarding_dep(i64 %n, ptr nocapture %A, ptr nocapture %B) !dbg !166 {
 entry:
   %cmp11 = icmp sgt i64 %n, 3
@@ -233,7 +231,7 @@ for.body: ; preds = %entry, %for.body
 ; }
 
 ; CHECK: remark: source.c:74:5: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK: Backward loop carried data dependence that prevents store-to-load forwarding. Memory location is the same as accessed at source.c:74:21
+; CHECK: Backward loop carried data dependence that prevents store-to-load forwarding.
 
 define void @test_backwardVectorizableButPreventsForwarding(i64 %n, ptr nocapture %A) !dbg !189 {
 entry:
@@ -328,25 +326,19 @@ for.body: ; preds = %entry, %for.body
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: loop not vectorized
 ; YAML-NEXT: ...
-; YAML-NEXT: --- !Analysis
+; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass: loop-vectorize
-; YAML-NEXT: Name: UnsafeDep
-; YAML-NEXT: DebugLoc: { File: source.c, Line: 61, Column: 12 }
+; YAML-NEXT: Name: VectorizationNotBeneficial
 ; YAML-NEXT: Function: test_forwardButPreventsForwarding_dep
 ; YAML-NEXT: Args:
-; YAML-NEXT: - String: 'loop not vectorized: '
-; YAML-NEXT: - String: 'unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop'
-; YAML-NEXT: - String: "\nForward loop carried data dependence that prevents store-to-load forwarding."
-; YAML-NEXT: - String: ' Memory location is the same as accessed at '
-; YAML-NEXT: - Location: 'source.c:60:5'
-; YAML-NEXT: DebugLoc: { File: source.c, Line: 60, Column: 5 }
+; YAML-NEXT: - String: the cost-model indicates that vectorization is not beneficial
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass: loop-vectorize
-; YAML-NEXT: Name: MissedDetails
+; YAML-NEXT: Name: InterleavingNotBeneficial
 ; YAML-NEXT: Function: test_forwardButPreventsForwarding_dep
 ; YAML-NEXT: Args:
-; YAML-NEXT: - String: loop not vectorized
+; YAML-NEXT: - String: the cost-model indicates that interleaving is not beneficial
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass: loop-vectorize
@@ -357,9 +349,6 @@
 ; YAML-NEXT: - String: 'loop not vectorized: '
 ; YAML-NEXT: - String: 'unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop'
 ; YAML-NEXT: - String: "\nBackward loop carried data dependence that prevents store-to-load forwarding."
-; YAML-NEXT: - String: ' Memory location is the same as accessed at '
-; YAML-NEXT: - Location: 'source.c:74:21'
-; YAML-NEXT: DebugLoc: { File: source.c, Line: 74, Column: 21 }
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass: loop-vectorize
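
Note (illustration only, not part of the patch): the forwarding bound that
couldPreventStoreLoadForward computes is easiest to check with the concrete
numbers from the updated tests. The standalone C++ sketch below is a
simplified, hypothetical re-implementation of just the backward
(non-power-of-2) search; it omits the power-of-2 loop and the gcd
combination with previously recorded limits, and the function name and
simplified interface are invented for the example. It reproduces the
umin(..., 100) and umin(..., 9) AVL clamps seen in test_may_clobber1 and
test_may_clobber2.

    #include <cstdint>
    #include <cstdio>

    // Largest number of bytes per vector iteration that still allows
    // store-to-load forwarding for a dependence of 'Distance' bytes.
    // Mirrors the patch's backward search: a VF is safe if the accesses
    // stay aligned (VF evenly divides the distance) or the conflicting
    // accesses are at least 8 iterations apart.
    uint64_t maxForwardSafeVFBytes(uint64_t Distance, uint64_t TypeByteSize,
                                   uint64_t MaxVFBytes) {
      const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
      for (uint64_t VF = MaxVFBytes; VF >= 2 * TypeByteSize;
           VF -= TypeByteSize) {
        if (Distance % VF == 0 ||
            Distance / VF >= NumItersForStoreLoadThroughMemory)
          return VF;
      }
      return TypeByteSize; // Only scalar iterations are safe.
    }

    int main() {
      // MaxVFBytes = MaxVectorLMUL (8) * MaxVectorWidth (64) * 8 bytes.
      const uint64_t MaxVFBytes = 8 * 64 * 8;
      // test_may_clobber1: i64 store 100 elements ahead -> 800-byte
      // distance; 800 / 8 = 100 elements, matching umin(..., 100).
      std::printf("%llu\n", (unsigned long long)(
          maxForwardSafeVFBytes(800, 8, MaxVFBytes) / 8)); // prints 100
      // test_may_clobber2: store 9 elements ahead -> 72-byte distance;
      // 72 / 8 = 9 elements, matching umin(..., 9).
      std::printf("%llu\n", (unsigned long long)(
          maxForwardSafeVFBytes(72, 8, MaxVFBytes) / 8)); // prints 9
      return 0;
    }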