From 72dccc1b5ded92345de8b63048f2995c223b29dc Mon Sep 17 00:00:00 2001 From: Lou Knauer Date: Thu, 20 Feb 2025 20:56:40 +0100 Subject: [PATCH 1/5] [VPlan] Update entry/exiting blocks in VPRegionBlocks --- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 6ddb88308955f..fd197fc8add2e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -96,6 +96,9 @@ class VPBlockUtils { connectBlocks(NewBlock, Succ); } connectBlocks(BlockPtr, NewBlock); + VPRegionBlock *Parent = BlockPtr->getParent(); + if (Parent && Parent->getExiting() == BlockPtr) + Parent->setExiting(NewBlock); } /// Insert disconnected block \p NewBlock before \p Blockptr. First @@ -112,6 +115,9 @@ class VPBlockUtils { connectBlocks(Pred, NewBlock); } connectBlocks(NewBlock, BlockPtr); + VPRegionBlock *Parent = BlockPtr->getParent(); + if (Parent && Parent->getEntry() == BlockPtr) + Parent->setEntry(NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p From 3819995b4647718b3b66ff941d40e9a3184f6bef Mon Sep 17 00:00:00 2001 From: Lou Knauer Date: Thu, 20 Feb 2025 21:03:45 +0100 Subject: [PATCH 2/5] [VPlan] Cloning and unrolling for VPWidenPHIRecipe --- llvm/lib/Transforms/Vectorize/VPlan.h | 6 +++- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 31 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 8089cfd1ce802..15e90bc18bc87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1974,7 +1974,11 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe { } VPWidenPHIRecipe *clone() override { - llvm_unreachable("cloning not implemented yet"); + auto *Phi = new VPWidenPHIRecipe( + dyn_cast_if_present(getUnderlyingValue())); + for (unsigned I = 0; I < getNumOperands(); I++) + Phi->addOperand(getIncomingValue(I)); + return Phi; } ~VPWidenPHIRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 89e372d6b46cf..0b46e043e873d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -384,6 +384,21 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } + // Handle inner-loop/region header phis. The backedge values will be set + // later. Phis not in a loop header can be unrolled like any other recipes, + // RPO makes sure the predecessors are all visited first. + VPRegionBlock *Region = R.getParent()->getParent(); + if (auto *P = dyn_cast(&R); + P && Region->getEntryBasicBlock() == P->getParent()) { + auto InsertPt = std::next(R.getIterator()); + for (unsigned Part = 1; Part != UF; ++Part) { + VPWidenPHIRecipe *Copy = P->clone(); + Copy->insertBefore(*R.getParent(), InsertPt); + addRecipeForPart(&R, Copy, Part); + } + continue; + } + unrollRecipeByUF(R); } } @@ -442,5 +457,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { Part++; } + // Remap operands of cloned inner-loop header phis to update backedge values, + // a problem unique to outer-loop vectorization. 
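+  // The cloned header phis were created before per-part copies of their
+  // backedge values existed, so walk all inner-loop header phis once more in
+  // RPO and remap each clone's operands to its part; an original phi
+  // (detected via Unroller.contains) restarts the part counter at 1.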
+ ReversePostOrderTraversal> + DeepRPOT(Plan.getEntry()); + for (VPRegionBlock *Region : + VPBlockUtils::blocksOnly(DeepRPOT)) + for (VPRecipeBase &R : Region->getEntryBasicBlock()->phis()) + if (auto *Phi = dyn_cast(&R)) { + if (Unroller.contains(Phi->getVPSingleValue())) { + Part = 1; + continue; + } + Unroller.remapOperands(&R, Part); + Part++; + } + VPlanTransforms::removeDeadRecipes(Plan); } From 407d320aba89140c06e59f326847c5b6854a3359 Mon Sep 17 00:00:00 2001 From: Lou Knauer Date: Thu, 20 Feb 2025 21:04:35 +0100 Subject: [PATCH 3/5] [VPlan] Unrolling of VPInstruction::AnyOf --- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 0b46e043e873d..2360a20d78cd5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -373,6 +373,28 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } + if (auto *Any = dyn_cast(&R); + Any && Any->getOpcode() == VPInstruction::AnyOf) { + VPValue *Res = Any; + VPRecipeBase *FirstOr = nullptr; + for (unsigned Part = 1; Part != UF; ++Part) { + auto *NewAny = new VPInstruction( + VPInstruction::AnyOf, {getValueForPart(Any->getOperand(0), Part)}, + Any->getDebugLoc()); + NewAny->insertAfter(Res->getDefiningRecipe()); + auto *Or = new VPInstruction(Instruction::Or, {Res, NewAny}, + Any->getDebugLoc()); + Or->insertAfter(NewAny->getDefiningRecipe()); + ToSkip.insert(Or); + if (Part == 1) + FirstOr = Or; + Res = Or; + } + Any->getVPSingleValue()->replaceAllUsesWith(Res); + FirstOr->setOperand(0, Any); + continue; + } + auto *SingleDef = dyn_cast(&R); if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) { addUniformForAllParts(SingleDef); From 66556d57feadce2782b3498c52171d8fa564c48a Mon Sep 17 00:00:00 2001 From: Lou Knauer Date: Thu, 20 Feb 2025 21:06:00 +0100 Subject: [PATCH 4/5] [LAA] Basic initial outer-loop support --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 69 ++++++++-- .../LoopAccessAnalysis/outer-loops.ll | 128 ++++++++++++++++++ 2 files changed, 186 insertions(+), 11 deletions(-) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index a1d91de3bb788..6fe7a8a9eed69 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -792,21 +792,65 @@ class AccessAnalysis { } // end anonymous namespace +/// Return true if \p E is invariant with regards to the Loop \p L. +/// If \p E is a recurrence around a inner loop of \p L, then the +/// start and step of that inner loop recurrence must be invariant +/// to \p L. +static bool isInvariantToTheLoop(const Loop *L, ScalarEvolution &SE, + const SCEV *E) { + if (SE.isLoopInvariant(E, L)) + return true; + + if (auto *AddRec = dyn_cast(E); + AddRec && L != AddRec->getLoop() && L->contains(AddRec->getLoop())) { + for (auto *Op : AddRec->operands()) + if (!isInvariantToTheLoop(L, SE, Op)) + return false; + + return true; + } + + return false; +} + /// Try to compute a constant stride for \p AR. Used by getPtrStride and /// isNoWrap. static std::optional getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { - // The access function must stride over the innermost loop. + // The access function must stride over the queried loop. 
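+  // When Lp is an outer loop, the AddRec may recur in one of Lp's inner
+  // loops. Peel off such inner recurrences as long as their steps are
+  // invariant to Lp, and look for a recurrence over Lp itself in the
+  // remaining start expression.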
if (Lp != AR->getLoop()) { - LLVM_DEBUG({ - dbgs() << "LAA: Bad stride - Not striding over innermost loop "; - if (Ptr) - dbgs() << *Ptr << " "; + assert(!Lp->isInnermost() && Lp->contains(AR->getLoop()) && + "Classic SE should have detected invariance"); + while (AR && Lp != AR->getLoop()) { + if (isInvariantToTheLoop(Lp, *PSE.getSE(), AR)) + return {0}; + + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); + if (!isInvariantToTheLoop(Lp, *PSE.getSE(), Step)) { + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Depends on inner loop "; + if (Ptr) + dbgs() << *Ptr << " "; + + dbgs() << "SCEV: " << *AR << "\n"; + }); + return std::nullopt; + } - dbgs() << "SCEV: " << *AR << "\n"; - }); - return std::nullopt; + AR = dyn_cast(AR->getStart()); + } + + if (!AR || Lp != AR->getLoop()) { + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Strides over inner loop "; + if (Ptr) + dbgs() << *Ptr << " "; + + dbgs() << "SCEV: " << *AR << "\n"; + }); + return std::nullopt; + } } // Check the step is constant. @@ -2365,8 +2409,9 @@ bool LoopAccessInfo::canAnalyzeLoop() { << TheLoop->getHeader()->getParent()->getName() << "' from " << TheLoop->getLocStr() << "\n"); - // We can only analyze innermost loops. - if (!TheLoop->isInnermost()) { + // We can only analyze innermost loops if no memory dependency checks + // are needed. + if (!TheLoop->isInnermost() && !TheLoop->isAnnotatedParallel()) { LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n"); recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop"; return false; @@ -2587,6 +2632,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, return true; } + assert(TheLoop->isInnermost()); + for (LoadInst *LD : Loads) { Value *Ptr = LD->getPointerOperand(); // If we did *not* see this pointer before, insert it to the @@ -2812,7 +2859,7 @@ bool LoopAccessInfo::isInvariant(Value *V) const { if (!SE->isSCEVable(V->getType())) return false; const SCEV *S = SE->getSCEV(V); - return SE->isLoopInvariant(S, TheLoop); + return isInvariantToTheLoop(TheLoop, *SE, S); } /// If \p Ptr is a GEP, which has a loop-variant operand, return that operand. diff --git a/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll b/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll new file mode 100644 index 0000000000000..c71d821a7b0b6 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -disable-output -passes='print' %s 2>&1 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; The inner two loops of a naive matrix multiplication. +; Not annotated as parallel, so the outer loop should not be analyzed. +define void @outer_loop_not_parallel(i64 %N, i64 %M, ptr noalias %A, ptr %B, ptr %C) { +; CHECK-LABEL: 'outer_loop_not_parallel' +; CHECK-NEXT: inner.loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ] + %M.is.zero = icmp eq i64 %M, 0 + br i1 %M.is.zero, label %loop.latch, label %inner.loop + +inner.loop: + %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ] + %a = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ] + %b.addr = getelementptr inbounds float, ptr %B, i64 %j + %b = load float, ptr %b.addr, align 4 + %jxM = mul i64 %j, %M + %jxMpi = add i64 %jxM, %i + %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi + %c = load float, ptr %c.addr, align 4 + %mul = fmul float %b, %c + %a.next = fadd float %a, %mul + %j.next = add nuw nsw i64 %j, 1 + %inner.exitcond = icmp eq i64 %j.next, %M + br i1 %inner.exitcond, label %loop.latch, label %inner.loop + +loop.latch: + %a.lcssa = phi float [ 0x0, %loop.header ], [ %a.next, %inner.loop ] + %a.addr = getelementptr inbounds float, ptr %A, i64 %i + store float %a.lcssa, ptr %a.addr, align 4 + %i.next = add nuw nsw i64 %i, 1 + %loop.exitcond = icmp eq i64 %i.next, %N + br i1 %loop.exitcond, label %exit, label %loop.header + +exit: + ret void +} + + +; The inner two loops of a naive matrix multiplication. +; The outer loop is annotated as parallel. +define void @outer_loop_parallel(i64 %N, i64 %M, ptr noalias %A, ptr %B, ptr %C) { +; CHECK-LABEL: 'outer_loop_parallel' +; CHECK-NEXT: inner.loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ] + %M.is.zero = icmp eq i64 %M, 0 + br i1 %M.is.zero, label %loop.latch, label %inner.loop + +inner.loop: + %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ] + %a = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ] + %b.addr = getelementptr inbounds float, ptr %B, i64 %j + %b = load float, ptr %b.addr, align 4, !llvm.access.group !1 + %jxM = mul i64 %j, %M + %jxMpi = add i64 %jxM, %i + %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi + %c = load float, ptr %c.addr, align 4, !llvm.access.group !1 + %mul = fmul float %b, %c + %a.next = fadd float %a, %mul + %j.next = add nuw nsw i64 %j, 1 + %inner.exitcond = icmp eq i64 %j.next, %M + br i1 %inner.exitcond, label %loop.latch, label %inner.loop + +loop.latch: + %a.lcssa = phi float [ 0x0, %loop.header ], [ %a.next, %inner.loop ] + %a.addr = getelementptr inbounds float, ptr %A, i64 %i + store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !1 + %i.next = add nuw nsw i64 %i, 1 + %loop.exitcond = icmp eq i64 %i.next, %N + br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}} +!1 = distinct !{} From 78a89034e061cf16ba22e478ee3edeeb09b55362 Mon Sep 17 00:00:00 2001 From: Lou Knauer Date: Thu, 20 Feb 2025 21:38:40 +0100 Subject: [PATCH 5/5] [LV] Outer-loop vectorization in the default vectorizer codepath --- .../Vectorize/LoopVectorizationLegality.h | 4 + .../Vectorize/LoopVectorizationLegality.cpp | 66 +- .../Transforms/Vectorize/LoopVectorize.cpp | 275 +++++- .../Transforms/Vectorize/VPRecipeBuilder.h | 20 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +- llvm/lib/Transforms/Vectorize/VPlan.h | 8 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 40 +- .../outer-loop-vect-in-classic-path.ll | 831 ++++++++++++++++++ .../outer-loop-vect-in-classic-path.ll | 647 ++++++++++++++ 9 files changed, 1843 insertions(+), 55 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll create mode 100644 llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index e959d93b57275..871a79d081719 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -642,6 +642,10 @@ class LoopVectorizationLegality { /// Keep track of the loop edge to an uncountable exit, comprising a pair /// of (Exiting, Exit) blocks, if there is exactly one early exit. std::optional> UncountableEdge; + + /// Contains true for a nested loop if it or any of its parents up + /// to the loop to vectorize needs a inner-loop active lane mask. 
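+  /// Cached (hence the mutable map) because blockNeedsPredication() is const
+  /// and may be queried repeatedly for blocks of the same inner loop.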
+ mutable DenseMap InnerLoopsNeedingPredication; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 420cbc5384ce4..1b107179ba4ee 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -572,6 +572,11 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const { if (VF.isScalar()) return true; + // The SCEVAddRecForUniformityRewriter does not support accesses to addresses + // invariant w.r.t. the vectorized loop but with recurrences of inner loops. + if (!TheLoop->isInnermost()) + return false; + // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is // never considered uniform. auto *SE = PSE.getSE(); @@ -1207,8 +1212,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() { }); } - if (!LAI->canVectorizeMemory()) - return canVectorizeIndirectUnsafeDependences(); + if (!LAI->canVectorizeMemory()) { + if (canVectorizeIndirectUnsafeDependences()) + return true; + + return false; + } if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("We don't allow storing to uniform addresses", @@ -1403,7 +1412,31 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { "Uncountable exiting block must be a direct predecessor of latch"); return BB == Latch; } - return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); + + if (LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT)) + return true; + + // Blocks in inner loops need predication if the inner loop trip-count + // is not invariant to the vectorized loop. + if (!TheLoop->isInnermost()) { + Loop *BBLoop = LI->getLoopFor(BB); + if (BBLoop != TheLoop) { + if (auto Iter = InnerLoopsNeedingPredication.find(BBLoop); + Iter != InnerLoopsNeedingPredication.end()) + return Iter->second; + + for (Loop *L = BBLoop; L != TheLoop; L = L->getParentLoop()) + if (!isUniformLoop(L, TheLoop)) { + InnerLoopsNeedingPredication[BBLoop] = true; + return true; + } + + InnerLoopsNeedingPredication[BBLoop] = false; + return false; + } + } + + return false; } bool LoopVectorizationLegality::blockCanBePredicated( @@ -1537,9 +1570,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // Helper function to canVectorizeLoopNestCFG. bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { - assert((UseVPlanNativePath || Lp->isInnermost()) && - "VPlan-native path is not enabled."); - // TODO: ORE should be improved to show more accurate information when an // outer loop can't be vectorized because a nested loop is not understood or // legal. Something like: "outer_loop_location: loop not vectorized: @@ -1573,6 +1603,23 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, return false; } + if (Lp != TheLoop && !UseVPlanNativePath) { + // Inner loops must be in loop-simplify form with the latch block being + // also the only exiting block and a dedicated exit. 
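+    // This shape is what later allows modeling each inner loop as a
+    // single-entry, single-exit VPRegionBlock with a single predecessor and
+    // a single successor.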
+ BasicBlock *Exiting = Lp->getExitingBlock(); + if (!Lp->isLoopSimplifyForm() || !Exiting || + Exiting != Lp->getLoopLatch() || !Lp->isLCSSAForm(*DT)) { + reportVectorizationFailure( + "The inner loops must exit through their latch", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + } + return Result; } @@ -1775,9 +1822,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { // Specific checks for outer loops. We skip the remaining legal checks at this // point because they don't support outer loops. - if (!TheLoop->isInnermost()) { - assert(UseVPlanNativePath && "VPlan-native path is not enabled."); - + if (!TheLoop->isInnermost() && UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", "UnsupportedOuterLoop", ORE, TheLoop); @@ -1790,7 +1835,6 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } - assert(TheLoop->isInnermost() && "Inner loop expected."); // Check if we can if-convert non-single-bb loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { @@ -1811,7 +1855,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { } if (isa(PSE.getBackedgeTakenCount())) { - if (TheLoop->getExitingBlock()) { + if (TheLoop->getExitingBlock() || !TheLoop->isInnermost()) { reportVectorizationFailure("Cannot vectorize uncountable loop", "UnsupportedUncountableLoop", ORE, TheLoop); if (DoExtraAnalysis) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e8a5db28ea0a4..555135a73ce28 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,6 +58,7 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanAnalysis.h" +#include "VPlanDominatorTree.h" #include "VPlanHCFGBuilder.h" #include "VPlanHelpers.h" #include "VPlanPatternMatch.h" @@ -401,6 +402,11 @@ static cl::opt EnableEarlyExitVectorization( cl::desc( "Enable vectorization of early exit loops with uncountable exits.")); +static cl::opt ExperimentalOLVInClassicPath( + "experimental-olv-in-classic-vect", cl::init(false), cl::Hidden, + cl::desc("Enable experimental outer-loop vectorization outside the " + "VPlan-native path.")); + // Likelyhood of bypassing the vectorized loop because assumptions about SCEV // variables not overflowing do not hold. See `emitSCEVChecks`. static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; @@ -1085,9 +1091,8 @@ class LoopVectorizationCostModel { assert(VF.isVector() && "Profitable to scalarize relevant only for VF > 1."); assert( - TheLoop->isInnermost() && + (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) && "cost-model should not be used for outer loops (in VPlan-native path)"); - auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); @@ -1097,7 +1102,7 @@ class LoopVectorizationCostModel { /// Returns true if \p I is known to be uniform after vectorization. 
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { assert( - TheLoop->isInnermost() && + (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) && "cost-model should not be used for outer loops (in VPlan-native path)"); // Pseudo probe needs to be duplicated for each unrolled iteration and // vector lane so that profiled loop trip count can be accurately @@ -1117,7 +1122,7 @@ class LoopVectorizationCostModel { /// Returns true if \p I is known to be scalar after vectorization. bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { assert( - TheLoop->isInnermost() && + (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) && "cost-model should not be used for outer loops (in VPlan-native path)"); if (VF.isScalar()) return true; @@ -1190,7 +1195,7 @@ class LoopVectorizationCostModel { InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); assert( - TheLoop->isInnermost() && + (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) && "cost-model should not be used for outer loops (in VPlan-native path)"); std::pair InstOnVF = std::make_pair(I, VF); @@ -2205,7 +2210,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp, return false; } - if (Hints.getInterleave() > 1) { + if (Hints.getInterleave() > 1 && EnableVPlanNativePath) { // TODO: Interleave support is future work. LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " "outer loops.\n"); @@ -2224,7 +2229,8 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. if (L.isInnermost() || VPlanBuildStressTest || - (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { + ((EnableVPlanNativePath || ExperimentalOLVInClassicPath) && + isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); if (!containsIrreducibleCFG(RPOT, *LI)) { @@ -2932,7 +2938,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. - if (EnableVPlanNativePath) + if (EnableVPlanNativePath || ExperimentalOLVInClassicPath) fixNonInductionPHIs(State); // After vectorization, the exit blocks of the original loop will have @@ -3675,6 +3681,31 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { HasUniformUse.insert(Ptr); } + if (!TheLoop->isInnermost()) { + SmallVector Loops(ArrayRef(TheLoop->getSubLoops())); + while (!Loops.empty()) { + auto *Lp = Loops.pop_back_val(); + // Inner-loop inductions can be uniform, as well as their backedge value. + for (PHINode &Phi : Lp->getHeader()->phis()) + if (Legal->isInvariant(&Phi)) { + AddToWorklistIfAllowed(&Phi); + auto *BackedgeVal = Phi.getIncomingValueForBlock(Lp->getLoopLatch()); + assert(Legal->isInvariant(BackedgeVal)); + if (auto *I = dyn_cast(BackedgeVal)) + AddToWorklistIfAllowed(I); + } + + // The exit condition of a inner loop can be uniform. + auto *Br = cast(Lp->getLoopLatch()->getTerminator()); + auto *ICmp = dyn_cast(Br->getCondition()); + if (ICmp && Legal->isInvariant(ICmp->getOperand(0)) && + Legal->isInvariant(ICmp->getOperand(1))) + AddToWorklistIfAllowed(ICmp); + + Loops.append(Lp->getSubLoops().begin(), Lp->getSubLoops().end()); + } + } + // Add to the worklist any operands which have *only* uniform (e.g. lane 0 // demanding) users. 
Since loops are assumed to be in LCSSA form, this // disallows uses outside the loop as well. @@ -6408,14 +6439,23 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { if (!Legal->isInvariant(Op)) return false; + // Consider Op invariant, if it or its operands aren't predicated // instruction in the loop. In that case, it is not trivially hoistable. auto *OpI = dyn_cast(Op); - return !OpI || !TheLoop->contains(OpI) || - (!isPredicatedInst(OpI) && - (!isa(OpI) || OpI->getParent() != TheLoop->getHeader()) && - all_of(OpI->operands(), - [this](Value *Op) { return shouldConsiderInvariant(Op); })); + if (!OpI || !TheLoop->contains(OpI)) + return true; + + // Be pessimistic in case of inner loops and do not assume things are + // invariant. The approach below results in a endless loop in case a + // inner-loop header PHI is part of the operands. + if (!TheLoop->isInnermost()) + return false; + + return !isPredicatedInst(OpI) && + (!isa(OpI) || OpI->getParent() != TheLoop->getHeader()) && + all_of(OpI->operands(), + [this](Value *Op) { return shouldConsiderInvariant(Op); }); } InstructionCost @@ -7134,7 +7174,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { } void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); + assert((OrigLoop->isInnermost() || ExperimentalOLVInClassicPath) && + "Inner loop expected."); CM.collectValuesToIgnore(); CM.collectElementTypesForWidening(); @@ -7577,6 +7618,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { BestPlan.getVectorLoopRegion()->getSingleSuccessor() != BestPlan.getMiddleBlock(); assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop || + ExperimentalOLVInClassicPath || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop) || planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), @@ -8265,7 +8307,7 @@ void VPRecipeBuilder::createHeaderMask() { BlockMaskCache[Header] = BlockMask; } -VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { +VPValue *VPRecipeBuilder::getBlockInMask(const BasicBlock *BB) const { // Return the cached value. BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); assert(BCEntryIt != BlockMaskCache.end() && @@ -8986,7 +9028,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); + assert((OrigLoop->isInnermost() || ExperimentalOLVInClassicPath) && + "Inner loop expected."); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -9295,6 +9338,141 @@ static void addExitUsersForFirstOrderRecurrences( } } +// Called before visiting the first instruction in the entry block +// of the inner-loop region. +static void enterInnerLoopRegion(VPlanHCFGBuilder &HCFGBuilder, + VPRecipeBuilder &RecipeBuilder, + VPRegionBlock &Region, ScalarEvolution &SE, + const Loop *TheLoop, const LoopInfo &LI) { + VPBasicBlock *Entry = Region.getEntryBasicBlock(); + const Loop *InnerLoop = LI.getLoopFor(HCFGBuilder.getIRBBForVPB(Entry)); + assert(InnerLoop->isLoopSimplifyForm() && InnerLoop->getNumBackEdges() == 1 && + InnerLoop->getExitingBlock()); + + // Handle the inner-loop header phis. 
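+  // Only the value incoming from the preheader (operand 0) is set here; the
+  // backedge value (operand 1) is added by exitInnerLoopRegion once the
+  // exiting block of the region has been visited.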
+ const BasicBlock *IRPreheader = InnerLoop->getLoopPreheader(); + for (VPRecipeBase &R : Entry->phis()) { + // TODO: If the phi has only uniform users (can happen for inner-loop + // inductions), then creating a scalar phi instead would be + // beneficial, or even a scalar and a widened phi in case the inner-loop + // induction has uniform and non-uniform users. + auto *Phi = cast(&R); + auto *IRPhi = cast(Phi->getUnderlyingValue()); + Phi->setOperand(0, RecipeBuilder.getVPValueOrAddLiveIn( + IRPhi->getIncomingValueForBlock(IRPreheader))); + + // This will ensure that this instruction is kept and not replaced when + // the entry block instructions are visited. + RecipeBuilder.setRecipe(IRPhi, Phi); + } + + // Handle predication for the inner loop. + VPValue *PreheaderMask = RecipeBuilder.getBlockInMask(IRPreheader); + const SCEV *BTC = SE.getBackedgeTakenCount(InnerLoop); + bool NeedsActiveLaneMask = + !isa(BTC) && SE.isLoopInvariant(BTC, TheLoop); + if (NeedsActiveLaneMask) { + auto *InnerALM = new VPWidenPHIRecipe(nullptr); + if (!PreheaderMask) + PreheaderMask = Region.getPlan()->getOrAddLiveIn( + ConstantInt::getTrue(SE.getContext())); + // The backedge value will be filled in when the exit block of the + // region is visted. + InnerALM->addOperand(PreheaderMask); + InnerALM->insertBefore(*Entry, Entry->getFirstNonPhi()); + RecipeBuilder.setBlockInMask(InnerLoop->getHeader(), InnerALM); + } else { + RecipeBuilder.setBlockInMask(InnerLoop->getHeader(), PreheaderMask); + } +} + +// Called after the exiting block of the region is visited before +// visiting the exit block. +static void exitInnerLoopRegion(VPlanHCFGBuilder &HCFGBuilder, + VPRecipeBuilder &RecipeBuilder, + VPRegionBlock &Region) { + + auto *Entry = Region.getEntryBasicBlock(); + auto *Exiting = Region.getExitingBasicBlock(); + const auto *IRHeader = HCFGBuilder.getIRBBForVPB(Entry); + const auto *IRBr = + cast(HCFGBuilder.getIRBBForVPB(Exiting)->getTerminator()); + bool ExitIfTrue = IRBr->getSuccessor(1) == IRHeader; + + // Create the inner-loop exit condition and the backedge value for the + // inner-loop active-lane mask (if needed). + VPValue *ExitCond = RecipeBuilder.getVPValueOrAddLiveIn(IRBr->getCondition()); + auto *ALM = dyn_cast_or_null( + RecipeBuilder.getBlockInMask(IRHeader)); + VPBuilder Builder(Exiting, Exiting->end()); + DebugLoc DL = IRBr->getDebugLoc(); + if (ALM && ALM->getParent() == Entry) { + assert(!ALM->getUnderlyingValue() && ALM->getNumOperands() == 1); + if (ExitIfTrue) + ExitCond = Builder.createNot(ExitCond, DL); + + auto *ALMBackedgeVal = Builder.createLogicalAnd(ALM, ExitCond, DL); + ALM->addOperand(ALMBackedgeVal); + auto *Any = + Builder.createNaryOp(VPInstruction::AnyOf, {ALMBackedgeVal}, DL); + ExitCond = Builder.createNot(Any, DL); + } else if (!ExitIfTrue) { + ExitCond = Builder.createNot(ExitCond, DL); + } + Builder.createNaryOp(VPInstruction::BranchOnCond, {ExitCond}, DL); + + // Set the backedge values of the inner-loop header phis. + const auto *IRPreheader = + HCFGBuilder.getIRBBForVPB(Region.getSinglePredecessor()); + for (VPRecipeBase &R : Entry->phis()) { + auto *Phi = cast(&R); + if (Phi == ALM) + continue; + + auto *IRPhi = cast(Phi->getUnderlyingValue()); + Phi->setOperand(1, RecipeBuilder.getVPValueOrAddLiveIn( + IRPhi->getIncomingValueForBlock(IRBr->getParent()))); + } + + // Handle the LCSSA phis for inner-loop live-out values. 
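+  // Each LCSSA phi has exactly one incoming value. If that value is defined
+  // inside a masked inner-loop region, it has to be corrected per lane to
+  // the value of the last iteration in which the lane was active, using the
+  // passthrough phi and select constructed below.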
+ auto *ExitBlock = cast(Region.getSingleSuccessor()); + for (VPRecipeBase &R : ExitBlock->phis()) { + auto *Phi = cast(&R); + auto *IRPhi = cast(Phi->getUnderlyingValue()); + assert(Phi->getNumOperands() == 1); + RecipeBuilder.setRecipe(IRPhi, Phi); + VPValue *OutVal = + RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValue(0)); + VPRecipeBase *OutValDef = OutVal->getDefiningRecipe(); + if (OutValDef && OutValDef->getParent()->getParent() == &Region && ALM && + ALM->getParent() == Entry) { + // In case there is a inner-loop active-lane mask, the live out value of + // the inner loop for a vector must contain the values of the last + // iteration where that lane was active. For this, a new phi is created + // that passes through the value from the last iteration if the lane is + // inactive and the current one if not. + auto *PassthroughPhi = new VPWidenPHIRecipe(IRPhi); + PassthroughPhi->addOperand( + Region.getPlan()->getOrAddLiveIn(PoisonValue::get(IRPhi->getType()))); + PassthroughPhi->insertBefore(*Entry, Entry->getFirstNonPhi()); + + auto *Select = + new VPInstruction(Instruction::Select, {ALM, OutVal, PassthroughPhi}, + OutValDef->getDebugLoc()); + Select->insertAfter(OutValDef); + + PassthroughPhi->addOperand(Select); + OutVal = Select; + } + + Phi->setOperand(0, OutVal); + } + + // The mask of the exit block should be that of the preheader. + RecipeBuilder.setBlockInMask(HCFGBuilder.getIRBBForVPB(ExitBlock), + RecipeBuilder.getBlockInMask(IRPreheader)); +} + VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { @@ -9378,9 +9556,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); + BasicBlock *HeaderBB = OrigLoop->getHeader(); bool NeedsMasks = - CM.foldTailByMasking() || + CM.foldTailByMasking() || !OrigLoop->isInnermost() || any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; @@ -9392,12 +9571,30 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. - ReversePostOrderTraversal> RPOT( + ReversePostOrderTraversal> RPOT( HeaderVPBB); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); VPBlockBase *PrevVPBB = nullptr; - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + for (VPBlockBase *VPBlock : RPOT) { + // Handle the entering into a new inner loop. + if (auto *Region = dyn_cast(VPBlock)) { + assert(ExperimentalOLVInClassicPath); + enterInnerLoopRegion(HCFGBuilder, RecipeBuilder, *Region, *PSE.getSE(), + OrigLoop, *LI); + + // The inner-loop region can keep its successor connection and should be + // connected to its RPO predecessor, but when visiting the entry block of + // the inner loop, there should be no connection to the RPO predecessor. + assert(Region->getNumSuccessors() == 1 && PrevVPBB && + "Invalid inner loop (expected preheader and dedicated exit)"); + VPBlockUtils::connectBlocks(PrevVPBB, Region); + PrevVPBB = nullptr; + continue; + } + + VPBasicBlock *VPBB = cast(VPBlock); + // Handle VPBBs down to the latch. 
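+    // The exiting block of the top-level region has no corresponding IR
+    // block (see the assert below); its recipes stem from the initial VPlan
+    // skeleton rather than from scalar-loop instructions.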
if (VPBB == LoopRegion->getExiting()) { assert(!HCFGBuilder.getIRBBForVPB(VPBB) && @@ -9409,7 +9606,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // Create mask based on the IR BB corresponding to VPBB. // TODO: Predicate directly based on VPlan. Builder.setInsertPoint(VPBB, VPBB->begin()); - if (VPBB == HeaderVPBB) { + if (RecipeBuilder.hasBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB))) { + Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi()); + } else if (VPBB == HeaderVPBB) { Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi()); RecipeBuilder.createHeaderMask(); } else if (NeedsMasks) { @@ -9429,7 +9628,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // FIXME: Migrate code relying on the underlying instruction from VPlan0 // to construct recipes below to not use the underlying instruction. if (isa(&R) || - (isa(&R) && !UnderlyingValue)) + (isa(&R) && !UnderlyingValue) || + (isa(&R) && + (!UnderlyingValue || + RecipeBuilder.hasRecipe(cast(UnderlyingValue))))) continue; // FIXME: VPlan0, which models a copy of the original scalar loop, should @@ -9451,6 +9653,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Builder.setInsertPoint(SingleDef); SmallVector Operands; auto *Phi = dyn_cast(Instr); + if (Phi && RecipeBuilder.hasRecipe(Phi)) + // Skip over LCSSA or inner-loop header phis. + continue; + if (Phi && Phi->getParent() == HeaderBB) { // The backedge value will be added in fixHeaderPhis later. Operands.push_back(Plan->getOrAddLiveIn( @@ -9498,6 +9704,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { R.eraseFromParent(); } + // Handle the exit of a inner loop region. + if (auto *Region = VPBB->getParent(); + Region && Region->getExiting() == VPBB) { + exitInnerLoopRegion(HCFGBuilder, RecipeBuilder, *Region); + + if (PrevVPBB) + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + + // The region will already be connected to its single successor. + assert(Region->getNumSuccessors() == 1 && VPBB->getNumSuccessors() == 0); + PrevVPBB = nullptr; + continue; + } + // Flatten the CFG in the loop. Masks for blocks have already been generated // and added to recipes as needed. To do so, first disconnect VPBB from its // successors. Then connect VPBB to the previously visited VPBB. @@ -10460,9 +10680,6 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, } bool LoopVectorizePass::processLoop(Loop *L) { - assert((EnableVPlanNativePath || L->isInnermost()) && - "VPlan-native path is not enabled. Only process inner loops."); - LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" << L->getHeader()->getParent()->getName() << "' from " << L->getLocStr() << "\n"); @@ -10520,11 +10737,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->isInnermost()) + // + // The normal vectorization codepath now also has experimental support for + // outer-loop vectorization. 
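+  // Only enter the native path when it is explicitly enabled; otherwise an
+  // outer loop falls through to the shared planning code below, guarded by
+  // -experimental-olv-in-classic-vect.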
+ if (!L->isInnermost() && EnableVPlanNativePath) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, BFI, PSI, Hints, Requirements); - assert(L->isInnermost() && "Inner loop expected."); + assert((L->isInnermost() || ExperimentalOLVInClassicPath) && + "Inner loop expected."); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); @@ -10534,7 +10755,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { UseInterleaved = EnableInterleavedMemAccesses; // Analyze interleaved memory accesses. - if (UseInterleaved) + if (UseInterleaved && L->isInnermost()) IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); if (LVL.hasUncountableEarlyExit()) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index e8d3ad89e14cf..464f43927f780 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -196,7 +196,17 @@ class VPRecipeBuilder { void createBlockInMask(BasicBlock *BB); /// Returns the *entry* mask for the block \p BB. - VPValue *getBlockInMask(BasicBlock *BB) const; + VPValue *getBlockInMask(const BasicBlock *BB) const; + + /// Returns true if there already is a block-in mask for \p BB. + bool hasBlockInMask(BasicBlock *BB) const { + return BlockMaskCache.contains(BB); + } + + /// Set the block-in mask of \p BB directly. + void setBlockInMask(BasicBlock *BB, VPValue *Mask) { + BlockMaskCache[BB] = Mask; + } /// Create an edge mask for every destination of cases and/or default. void createSwitchEdgeMasks(SwitchInst *SI); @@ -225,6 +235,14 @@ class VPRecipeBuilder { ArrayRef Operands, VFRange &Range); + /// Return true if there already is a recipe for the given ingredient. + bool hasRecipe(Instruction *I) const { return Ingredient2Recipe.contains(I); } + + /// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as + /// last operand. Range.End may be decreased to ensure same recipe behavior + /// from \p Range.Start to \p Range.End. + VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range); + /// Add the incoming values from the backedge to reduction & first-order /// recurrence cross-iteration phis. void fixHeaderPhis(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index cd111365c134c..ac8823df0c2f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -970,7 +970,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) && + // When outer-loop vectorizing and the trip-count is known, it is possible + // that VPlanTransforms::optimizeForVFAndUF() destroys the vector loop region, + // but getVectorLoopRegion() will falsely return the inner loop region. 
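+  // In that situation VFxUF can legitimately end up with no users, so loops
+  // that are not innermost are exempted from the assertion below.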
+ assert((!getVectorLoopRegion() || VFxUF.getNumUsers() || + !State.LI->getLoopFor(getScalarHeader()->getIRBasicBlock()) + ->isInnermost()) && "VFxUF expected to always have users"); unsigned UF = getUF(); if (VF.getNumUsers()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 15e90bc18bc87..f0786d3d9e529 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1994,11 +1994,17 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe { VPSlotTracker &SlotTracker) const override; #endif + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + /// Returns the \p I th incoming VPBasicBlock. VPBasicBlock *getIncomingBlock(unsigned I); /// Returns the \p I th incoming VPValue. - VPValue *getIncomingValue(unsigned I) { return getOperand(I); } + VPValue *getIncomingValue(unsigned I) const { return getOperand(I); } + + /// Return the incoming VPValue for the predecessor \p BB. + VPValue *getIncomingValueForBlock(const VPBasicBlock *BB) const; }; /// A recipe for handling first-order recurrence phis. The start value is the diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d57a6c481748c..42a918e8c76d9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3642,10 +3642,25 @@ VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) { return Pred->getExitingBasicBlock(); } -void VPWidenPHIRecipe::execute(VPTransformState &State) { - assert(EnableVPlanNativePath && - "Non-native vplans are not expected to have VPWidenPHIRecipes."); +VPValue * +VPWidenPHIRecipe::getIncomingValueForBlock(const VPBasicBlock *BB) const { + const VPBasicBlock *Parent = getParent(); + const VPRegionBlock *Region = Parent->getParent(); + if (Region && Region->getEntryBasicBlock() == Parent) { + if (Region->getSinglePredecessor() == BB) + return getOperand(0); + if (Region->getExitingBasicBlock() == BB) + return getOperand(1); + } + + for (unsigned I = 0; I < Parent->getNumPredecessors(); I++) + if (Parent->getPredecessors()[I] == BB) + return getOperand(I); + return nullptr; +} + +void VPWidenPHIRecipe::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); @@ -3657,23 +3672,20 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-PHI "; - - auto *OriginalPhi = cast(getUnderlyingValue()); - // Unless all incoming values are modeled in VPlan print the original PHI - // directly. - // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming - // values as VPValues. - if (getNumOperands() != OriginalPhi->getNumOperands()) { - O << VPlanIngredient(OriginalPhi); - return; - } - printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); } #endif +InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + if (getNumOperands() == 1) + return 0; // LCSSA Phis can be considered free. + + return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); +} + // TODO: It would be good to use the existing VPWidenPHIRecipe instead and // remove VPActiveLaneMaskPHIRecipe. 
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll new file mode 100644 index 0000000000000..bed6c3ece93a6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll @@ -0,0 +1,831 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=aarch64 -mattr=+sve -passes=loop-vectorize,instcombine,simplifycfg \ +; RUN: -force-vector-interleave=1 -experimental-olv-in-classic-vect \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck -check-prefix=CHECK-IC1 %s +; RUN: opt -S -mtriple=aarch64 -mattr=+sve -passes=loop-vectorize,instcombine,simplifycfg \ +; RUN: -force-vector-interleave=2 -experimental-olv-in-classic-vect \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck -check-prefix=CHECK-IC2 %s + +;;; Effectively the inner two loops of: +; for (size_t i = 0; i < N; i++) { +; #pragma clang loop vectorize(enable) +; for (size_t j = 0; j < N; j++) { +; float a = 0.; +; for (size_t k = 0; k < N; k++) +; a += B[i][k] * C[k][j]; +; A[i][j] = a; +; } +; } +define void @foo(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) { +; CHECK-IC1-LABEL: define void @foo( +; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-IC1-NEXT: [[ENTRY:.*]]: +; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC1: [[VECTOR_BODY]]: +; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH9:.*]] ] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH9]] ] +; CHECK-IC1-NEXT: br label %[[INNER_LOOP1:.*]] +; CHECK-IC1: [[INNER_LOOP1]]: +; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT12:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP13:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[VEC_PHI3:%.*]] = phi [ [[ACTIVE_LANE_MASK]], %[[VECTOR_BODY]] ], [ [[TMP19:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[VEC_PHI4:%.*]] = phi [ shufflevector ( insertelement ( poison, float poison, i64 0), poison, zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP14:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[TMP5:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; CHECK-IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[TMP7]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP8:%.*]] = extractelement [[VEC_PHI]], i64 
0 +; CHECK-IC1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], [[M]] +; CHECK-IC1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]] +; CHECK-IC1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]] +; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[VEC_PHI3]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP12:%.*]] = fmul [[BROADCAST_SPLAT]], [[WIDE_MASKED_LOAD]] +; CHECK-IC1-NEXT: [[TMP13]] = fadd [[VEC_PHI2]], [[TMP12]] +; CHECK-IC1-NEXT: [[TMP14]] = select [[VEC_PHI3]], [[TMP13]], [[VEC_PHI4]] +; CHECK-IC1-NEXT: [[TMP15:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC1-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP15]], 1 +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT12]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP16]], [[M]] +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i1 [[TMP17]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP18:%.*]] = xor [[BROADCAST_SPLAT6]], splat (i1 true) +; CHECK-IC1-NEXT: [[TMP19]] = select [[VEC_PHI3]], [[TMP18]], zeroinitializer +; CHECK-IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) +; CHECK-IC1-NEXT: br i1 [[TMP20]], label %[[INNER_LOOP1]], label %[[LOOP_LATCH9]] +; CHECK-IC1: [[LOOP_LATCH9]]: +; CHECK-IC1-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP14]], ptr [[TMP21]], i32 4, [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-IC1-NEXT: [[TMP22:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC1-NEXT: br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-IC1: [[EXIT]]: +; CHECK-IC1-NEXT: ret void +; +; CHECK-IC2-LABEL: define void @foo( +; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-IC2-NEXT: [[ENTRY:.*]]: +; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 +; CHECK-IC2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 +; CHECK-IC2-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP6]], i64 [[N]]) +; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC2: [[VECTOR_BODY]]: +; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH22:.*]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH22]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ 
[[ACTIVE_LANE_MASK_NEXT25:%.*]], %[[LOOP_LATCH22]] ] +; CHECK-IC2-NEXT: br label %[[INNER_LOOP3:.*]] +; CHECK-IC2: [[INNER_LOOP3]]: +; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT27:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT29:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP21:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP22:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI7:%.*]] = phi [ [[ACTIVE_LANE_MASK]], %[[VECTOR_BODY]] ], [ [[TMP33:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI8:%.*]] = phi [ [[ACTIVE_LANE_MASK2]], %[[VECTOR_BODY]] ], [ [[TMP34:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI9:%.*]] = phi [ shufflevector ( insertelement ( poison, float poison, i64 0), poison, zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP23:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI10:%.*]] = phi [ shufflevector ( insertelement ( poison, float poison, i64 0), poison, zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP24:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[TMP40:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]] +; CHECK-IC2-NEXT: [[TMP41:%.*]] = extractelement [[VEC_PHI4]], i64 0 +; CHECK-IC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] +; CHECK-IC2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[TMP14]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP42]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement poison, float [[TMP12]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector [[BROADCAST_SPLATINSERT12]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP13:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP13]], [[M]] +; CHECK-IC2-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]] +; CHECK-IC2-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]] +; CHECK-IC2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP17]], 4 +; CHECK-IC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[DOTIDX]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[VEC_PHI7]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, [[VEC_PHI8]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP19:%.*]] = fmul [[BROADCAST_SPLAT]], [[WIDE_MASKED_LOAD]] +; CHECK-IC2-NEXT: [[TMP20:%.*]] = fmul [[BROADCAST_SPLAT13]], [[WIDE_MASKED_LOAD11]] +; CHECK-IC2-NEXT: [[TMP21]] = fadd [[VEC_PHI5]], [[TMP19]] +; CHECK-IC2-NEXT: [[TMP22]] = fadd [[VEC_PHI6]], [[TMP20]] +; CHECK-IC2-NEXT: [[TMP23]] = select [[VEC_PHI7]], [[TMP21]], [[VEC_PHI9]] +; CHECK-IC2-NEXT: [[TMP24]] = select [[VEC_PHI8]], [[TMP22]], [[VEC_PHI10]] +; CHECK-IC2-NEXT: [[TMP25:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: 
[[TMP26:%.*]] = add nuw nsw i64 [[TMP25]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement poison, i64 [[TMP26]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT27]] = shufflevector [[BROADCAST_SPLATINSERT26]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP27:%.*]] = extractelement [[VEC_PHI4]], i64 0 +; CHECK-IC2-NEXT: [[TMP28:%.*]] = add nuw nsw i64 [[TMP27]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement poison, i64 [[TMP28]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT29]] = shufflevector [[BROADCAST_SPLATINSERT28]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP26]], [[M]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i1 [[TMP29]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[M]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement poison, i1 [[TMP30]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector [[BROADCAST_SPLATINSERT16]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP31:%.*]] = xor [[BROADCAST_SPLAT15]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP32:%.*]] = xor [[BROADCAST_SPLAT17]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP33]] = select [[VEC_PHI7]], [[TMP31]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP34]] = select [[VEC_PHI8]], [[TMP32]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP33]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i1 [[TMP35]], i64 0 +; CHECK-IC2-NEXT: [[TMP36:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP34]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement poison, i1 [[TMP36]], i64 0 +; CHECK-IC2-NEXT: [[TMP37:%.*]] = or [[BROADCAST_SPLATINSERT18]], [[BROADCAST_SPLATINSERT20]] +; CHECK-IC2-NEXT: [[TMP38:%.*]] = shufflevector [[TMP37]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP39:%.*]] = extractelement [[TMP38]], i64 0 +; CHECK-IC2-NEXT: br i1 [[TMP39]], label %[[INNER_LOOP3]], label %[[LOOP_LATCH22]] +; CHECK-IC2: [[LOOP_LATCH22]]: +; CHECK-IC2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-IC2-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX30:%.*]] = shl i64 [[TMP48]], 4 +; CHECK-IC2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i64 [[DOTIDX30]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP23]], ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP24]], ptr [[TMP49]], i32 4, [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-IC2-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP44:%.*]] = shl i64 [[TMP43]], 2 +; CHECK-IC2-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], [[TMP44]] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT25]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP45]], i64 [[TMP4]]) +; CHECK-IC2-NEXT: [[TMP46:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC2-NEXT: br i1 [[TMP46]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-IC2: [[EXIT]]: +; CHECK-IC2-NEXT: ret void +; +entry: + br label %loop.header + 
+loop.header: + %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ] + br label %inner.loop + +inner.loop: + %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ] + %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ] + %b.addr = getelementptr inbounds float, ptr %B, i64 %j + %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3 + %jxM = mul i64 %j, %M + %jxMpi = add i64 %jxM, %i + %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi + %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3 + %mul = fmul float %b.load, %c.load + %a.next = fadd float %a.phi, %mul + %j.next = add nuw nsw i64 %j, 1 + %inner.exitcond = icmp eq i64 %j.next, %M + br i1 %inner.exitcond, label %loop.latch, label %inner.loop + +loop.latch: + %a.lcssa = phi float [ %a.next, %inner.loop ] + %a.addr = getelementptr inbounds float, ptr %A, i64 %i + store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3 + %i.next = add nuw nsw i64 %i, 1 + %loop.exitcond = icmp eq i64 %i.next, %N + br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +;;; Effectively the inner two loops of: +; for (size_t i = 0; i < N; i++) { +; #pragma clang loop vectorize(enable) +; for (size_t j = 0; j < N; j++) { +; float a = 0.; +; for (size_t k = 0; k < j; k++) +; a += B[i][k] * C[k][j]; +; A[i][j] = a; +; } +; } +;;; Note that the inner loop's trip-count depends on the outer loop. +define void @bar(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) { +; CHECK-IC1-LABEL: define void @bar( +; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0]] { +; CHECK-IC1-NEXT: [[ENTRY:.*]]: +; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC1: [[VECTOR_BODY]]: +; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH3:.*]] ] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH3]] ] +; CHECK-IC1-NEXT: br label %[[INNER_LOOP1:.*]] +; CHECK-IC1: [[INNER_LOOP1]]: +; CHECK-IC1-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP13:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[TMP6:%.*]] = extractelement [[VEC_PHI1]], i64 0 +; CHECK-IC1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[TMP7]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP8:%.*]] = extractelement [[VEC_PHI1]], i64 0 +; CHECK-IC1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], [[M]] +; CHECK-IC1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[C]], 
i64 [[TMP9]] +; CHECK-IC1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]] +; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP12:%.*]] = fmul [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]] +; CHECK-IC1-NEXT: [[TMP13]] = fadd [[VEC_PHI3]], [[TMP12]] +; CHECK-IC1-NEXT: [[TMP15:%.*]] = extractelement [[VEC_PHI1]], i64 0 +; CHECK-IC1-NEXT: [[TMP21:%.*]] = add i64 [[TMP15]], 1 +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP21]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT6]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP21]], [[INDEX]] +; CHECK-IC1-NEXT: br i1 [[TMP14]], label %[[LOOP_LATCH3]], label %[[INNER_LOOP1]] +; CHECK-IC1: [[LOOP_LATCH3]]: +; CHECK-IC1-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP13]], ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-IC1-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC1-NEXT: br i1 [[TMP20]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-IC1: [[EXIT]]: +; CHECK-IC1-NEXT: ret void +; +; CHECK-IC2-LABEL: define void @bar( +; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0]] { +; CHECK-IC2-NEXT: [[ENTRY:.*]]: +; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 3 +; CHECK-IC2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]]) +; CHECK-IC2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 2 +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[N]]) +; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC2: [[VECTOR_BODY]]: +; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH11:.*]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH11]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], %[[LOOP_LATCH11]] ] +; CHECK-IC2-NEXT: br label %[[INNER_LOOP3:.*]] +; CHECK-IC2: [[INNER_LOOP3]]: +; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT16:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT18:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP29:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP30:%.*]], %[[INNER_LOOP3]] ] +; CHECK-IC2-NEXT: [[TMP13:%.*]] = 
extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[TMP12]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP9:%.*]] = extractelement [[VEC_PHI4]], i64 0 +; CHECK-IC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, ptr [[TMP10]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[BROADCAST_SPLAT8]], i32 4, [[ACTIVE_LANE_MASK2]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP11:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP11]], [[M]] +; CHECK-IC2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP14]] +; CHECK-IC2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]] +; CHECK-IC2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP17]], 4 +; CHECK-IC2-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[DOTIDX]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, [[ACTIVE_LANE_MASK2]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP19:%.*]] = fmul [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]] +; CHECK-IC2-NEXT: [[TMP20:%.*]] = fmul [[WIDE_MASKED_GATHER13]], [[WIDE_MASKED_LOAD14]] +; CHECK-IC2-NEXT: [[TMP29]] = fadd [[VEC_PHI5]], [[TMP19]] +; CHECK-IC2-NEXT: [[TMP30]] = fadd [[VEC_PHI6]], [[TMP20]] +; CHECK-IC2-NEXT: [[TMP21:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement poison, i64 [[TMP22]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT16]] = shufflevector [[BROADCAST_SPLATINSERT15]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP23:%.*]] = extractelement [[VEC_PHI4]], i64 0 +; CHECK-IC2-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement poison, i64 [[TMP24]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT18]] = shufflevector [[BROADCAST_SPLATINSERT17]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[TMP22]], [[INDEX]] +; CHECK-IC2-NEXT: br i1 [[TMP25]], label %[[LOOP_LATCH11]], label %[[INNER_LOOP3]] +; CHECK-IC2: [[LOOP_LATCH11]]: +; CHECK-IC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-IC2-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX19:%.*]] = shl i64 [[TMP27]], 4 +; CHECK-IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[DOTIDX19]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP29]], ptr [[TMP26]], i32 4, [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP30]], 
ptr [[TMP28]], i32 4, [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-IC2-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP40:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-IC2-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP5]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT22]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP41]], i64 [[TMP5]])
+; CHECK-IC2-NEXT: [[TMP42:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP42]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-IC2: [[EXIT]]:
+; CHECK-IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ br label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+ %mul = fmul float %b.load, %c.load
+ %a.next = fadd float %a.phi, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %i
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Effectively something like:
+; #pragma clang loop vectorize(enable)
+; for (long i = 0; i < N; i++) {
+; long a = A[i];
+; long j = 0;
+; if (a > 0) {
+; do {
+; a -= B[j];
+; j++;
+; } while (a > 0);
+; }
+; A[i] = a + j;
+; }
+;;; Note that the inner loop is behind a branch, so the start value of the inner
+;;; loop mask phi must be set correspondingly. The induction of the inner loop is
+;;; used for uniform memory accesses and as a live-out, so the vectorized code
+;;; should contain two phis for it (one scalar and one widened).
+;;; Also, in this example, the inner loop backedge is the first successor of the
+;;; latch terminator, not the second one as is assumed by VPlan.
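+;;; For illustration only (not part of the checked output; 'p' is a name made up
+;;; for this comment), each scalar iteration effectively computes:
+;   long a = A[i], j = 0;
+;   bool p = (a > 0);          // branch condition; the inner-loop mask starts
+;                              // from this value, ANDed with the outer mask
+;   while (p) { a -= B[j]; j++; p = (a > 0); }
+;   A[i] = a + j;              // j is live-out of the inner loop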
+define void @baz(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B) { +; CHECK-IC1-LABEL: define void @baz( +; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { +; CHECK-IC1-NEXT: [[ENTRY:.*]]: +; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 +; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC1: [[VECTOR_BODY]]: +; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT3:.*]] ] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT3]] ] +; CHECK-IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP6:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-IC1-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; CHECK-IC1-NEXT: br label %[[INNER_LOOP1:.*]] +; CHECK-IC1: [[INNER_LOOP1]]: +; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP9:%.*]], %[[INNER_LOOP1]] ] +; CHECK-IC1-NEXT: [[TMP10:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP10]] +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[TMP8]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP7]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP9]] = sub [[VEC_PHI3]], [[WIDE_MASKED_GATHER]] +; CHECK-IC1-NEXT: [[J2:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC1-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[J2]], 1 +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT6]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP13:%.*]] = extractelement [[TMP9]], i64 0 +; CHECK-IC1-NEXT: [[TMP12:%.*]] = icmp slt i64 [[TMP13]], 1 +; CHECK-IC1-NEXT: br i1 [[TMP12]], label %[[LOOP_LATCH_LOOPEXIT3]], label %[[INNER_LOOP1]] +; CHECK-IC1: [[LOOP_LATCH_LOOPEXIT3]]: +; CHECK-IC1-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_MASKED_LOAD]] +; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-IC1-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC1-NEXT: br i1 [[TMP15]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop 
[[LOOP6:![0-9]+]] +; CHECK-IC1: [[EXIT]]: +; CHECK-IC1-NEXT: ret void +; +; CHECK-IC2-LABEL: define void @baz( +; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { +; CHECK-IC2-NEXT: [[ENTRY:.*]]: +; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-IC2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-IC2-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1 +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]]) +; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC2: [[VECTOR_BODY]]: +; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT11:.*]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT11]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], %[[LOOP_LATCH_LOOPEXIT11]] ] +; CHECK-IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-IC2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP8]], 4 +; CHECK-IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[DOTIDX]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK2]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP10:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP11:%.*]] = icmp sgt [[WIDE_MASKED_LOAD3]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP11]], zeroinitializer +; CHECK-IC2-NEXT: br label %[[INNER_LOOP4:.*]] +; CHECK-IC2: [[INNER_LOOP4]]: +; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT17:%.*]], %[[INNER_LOOP4]] ] +; CHECK-IC2-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT19:%.*]], %[[INNER_LOOP4]] ] +; CHECK-IC2-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP18:%.*]], %[[INNER_LOOP4]] ] +; CHECK-IC2-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP19:%.*]], %[[INNER_LOOP4]] ] +; CHECK-IC2-NEXT: [[TMP14:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP14]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[TMP15]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP16:%.*]] = extractelement [[VEC_PHI5]], i64 0 +; CHECK-IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP16]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = 
insertelement poison, ptr [[TMP17]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP12]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT9]], i32 8, [[TMP13]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP18]] = sub [[VEC_PHI6]], [[WIDE_MASKED_GATHER]] +; CHECK-IC2-NEXT: [[TMP19]] = sub [[VEC_PHI7]], [[WIDE_MASKED_GATHER10]] +; CHECK-IC2-NEXT: [[J6:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[J6]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement poison, i64 [[TMP20]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT17]] = shufflevector [[BROADCAST_SPLATINSERT16]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP22:%.*]] = extractelement [[VEC_PHI5]], i64 0 +; CHECK-IC2-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[TMP22]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i64 [[TMP23]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT19]] = shufflevector [[BROADCAST_SPLATINSERT18]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP24:%.*]] = extractelement [[TMP18]], i64 0 +; CHECK-IC2-NEXT: [[TMP25:%.*]] = icmp slt i64 [[TMP24]], 1 +; CHECK-IC2-NEXT: br i1 [[TMP25]], label %[[LOOP_LATCH_LOOPEXIT11]], label %[[INNER_LOOP4]] +; CHECK-IC2: [[LOOP_LATCH_LOOPEXIT11]]: +; CHECK-IC2-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], [[TMP18]], [[WIDE_MASKED_LOAD]] +; CHECK-IC2-NEXT: [[PREDPHI14:%.*]] = select [[TMP13]], [[TMP19]], [[WIDE_MASKED_LOAD3]] +; CHECK-IC2-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX20:%.*]] = shl i64 [[TMP26]], 4 +; CHECK-IC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[DOTIDX20]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI14]], ptr [[TMP27]], i32 8, [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-IC2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP29:%.*]] = shl i64 [[TMP28]], 1 +; CHECK-IC2-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], [[TMP29]] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP30]], i64 [[TMP4]]) +; CHECK-IC2-NEXT: [[TMP31:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC2-NEXT: br i1 [[TMP31]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-IC2: [[EXIT]]: +; CHECK-IC2-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ] + %a.addr = getelementptr inbounds i64, ptr %A, i64 %i + %a.load = load i64, ptr %a.addr, align 8, !llvm.access.group !3 + %a.is.positive = icmp sgt i64 %a.load, 0 + br i1 %a.is.positive, label %inner.loop, label %loop.latch + +inner.loop: + %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ] + %a.phi = phi i64 [ %a.next, %inner.loop ], [ 0, %loop.header ] + %b.addr = getelementptr inbounds i64, ptr %B, i64 %j + %b.load = load i64, ptr 
%b.addr, align 8, !llvm.access.group !3
+ %a.next = sub i64 %a.phi, %b.load
+ %j.next = add nuw nsw i64 %j, 1
+ %a.is.still.positive = icmp sgt i64 %a.next, 0
+ br i1 %a.is.still.positive, label %inner.loop, label %loop.latch
+
+loop.latch:
+ %a.res = phi i64 [ %a.load, %loop.header ], [ %a.next, %inner.loop ]
+ store i64 %a.res, ptr %a.addr, align 8, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Triple-loop nest with the outermost one being vectorized.
+; #pragma clang loop vectorize(enable)
+; for (size_t i = 0; i < N; i++)
+; for (size_t j = 0; j < M; j++)
+; for (size_t k = 0; k < L; k++)
+; A[k][i] += B[i][k];
+define void @quuz(i64 %N, i64 %M, i64 %L, ptr noalias %A, ptr readonly %B) {
+; CHECK-IC1-LABEL: define void @quuz(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-IC1-NEXT: [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-IC1-NEXT: br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-IC1: [[VECTOR_PH]]:
+; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[M]], i64 0
+; CHECK-IC1-NEXT: [[TMP5:%.*]] = icmp eq [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-IC1-NEXT: [[TMP6:%.*]] = shufflevector [[TMP5]], poison, zeroinitializer
+; CHECK-IC1-NEXT: [[TMP11:%.*]] = xor [[TMP6]], splat (i1 true)
+; CHECK-IC1-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i64()
+; CHECK-IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i64 0
+; CHECK-IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[L]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
+; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1: [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17:.*]] ]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17]] ]
+; CHECK-IC1-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17]] ]
+; CHECK-IC1-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: [[TMP9:%.*]] = mul [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-IC1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[B]], [[TMP9]]
+; CHECK-IC1-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer
+; CHECK-IC1-NEXT: br label %[[MIDDLE_LOOP3:.*]]
+; CHECK-IC1: [[MIDDLE_LOOP3]]:
+; CHECK-IC1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT19:%.*]], %[[MIDDLE_LATCH_LOOPEXIT12:.*]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP12]], %[[VECTOR_BODY]] ], [ [[TMP27:%.*]], 
%[[MIDDLE_LATCH_LOOPEXIT12]] ] +; CHECK-IC1-NEXT: [[TMP13:%.*]] = icmp ne [[BROADCAST_SPLAT2]], zeroinitializer +; CHECK-IC1-NEXT: [[TMP24:%.*]] = select [[VEC_PHI1]], [[TMP13]], zeroinitializer +; CHECK-IC1-NEXT: br label %[[INNER_LOOP5:.*]] +; CHECK-IC1: [[INNER_LOOP5]]: +; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[MIDDLE_LOOP3]] ], [ [[BROADCAST_SPLAT10:%.*]], %[[INNER_LOOP5]] ] +; CHECK-IC1-NEXT: [[TMP14:%.*]] = phi [ [[TMP24]], %[[MIDDLE_LOOP3]] ], [ [[TMP25:%.*]], %[[INNER_LOOP5]] ] +; CHECK-IC1-NEXT: [[K6:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC1-NEXT: [[TMP15:%.*]] = mul i64 [[K6]], [[N]] +; CHECK-IC1-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[TMP15]] +; CHECK-IC1-NEXT: [[TMP17:%.*]] = getelementptr float, [[TMP10]], [[VEC_PHI]] +; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP14]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP17]], i32 4, [[TMP14]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP18]], ptr [[TMP16]], i32 4, [[TMP14]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC1-NEXT: [[TMP31:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC1-NEXT: [[TMP19:%.*]] = add i64 [[TMP31]], 1 +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP19]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT10]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[L]] +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i1 [[TMP20]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP29:%.*]] = xor [[BROADCAST_SPLAT11]], splat (i1 true) +; CHECK-IC1-NEXT: [[TMP25]] = select [[TMP14]], [[TMP29]], zeroinitializer +; CHECK-IC1-NEXT: [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP25]]) +; CHECK-IC1-NEXT: br i1 [[TMP30]], label %[[INNER_LOOP5]], label %[[MIDDLE_LATCH_LOOPEXIT12]] +; CHECK-IC1: [[MIDDLE_LATCH_LOOPEXIT12]]: +; CHECK-IC1-NEXT: [[J4:%.*]] = extractelement [[VEC_PHI2]], i64 0 +; CHECK-IC1-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[J4]], 1 +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i64 [[TMP21]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT19]] = shufflevector [[BROADCAST_SPLATINSERT18]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], [[M]] +; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i1 [[TMP22]], i64 0 +; CHECK-IC1-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer +; CHECK-IC1-NEXT: [[TMP26:%.*]] = xor [[BROADCAST_SPLAT15]], splat (i1 true) +; CHECK-IC1-NEXT: [[TMP27]] = select [[VEC_PHI1]], [[TMP26]], zeroinitializer +; CHECK-IC1-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP27]]) +; CHECK-IC1-NEXT: br i1 [[TMP28]], label %[[MIDDLE_LOOP3]], label %[[OUTER_LATCH_LOOPEXIT17]] +; CHECK-IC1: [[OUTER_LATCH_LOOPEXIT17]]: +; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-IC1-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; 
CHECK-IC1-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC1-NEXT: br i1 [[TMP23]], label %[[VECTOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-IC1: [[EXIT]]: +; CHECK-IC1-NEXT: ret void +; +; CHECK-IC2-LABEL: define void @quuz( +; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { +; CHECK-IC2-NEXT: [[ENTRY:.*:]] +; CHECK-IC2-NEXT: [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-IC2-NEXT: br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]] +; CHECK-IC2: [[VECTOR_PH]]: +; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-IC2-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 3 +; CHECK-IC2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]]) +; CHECK-IC2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 2 +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[N]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[M]], i64 0 +; CHECK-IC2-NEXT: [[TMP25:%.*]] = icmp eq [[BROADCAST_SPLATINSERT]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP9:%.*]] = shufflevector [[TMP25]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP10:%.*]] = xor [[TMP9]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TMP1]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[L]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-IC2: [[VECTOR_BODY]]: +; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35:.*]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ] +; CHECK-IC2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP11]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ] +; CHECK-IC2-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] +; CHECK-IC2-NEXT: [[B_INV_GEP:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] +; CHECK-IC2-NEXT: [[TMP28:%.*]] = mul [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-IC2-NEXT: [[TMP14:%.*]] = mul [[STEP_ADD]], [[BROADCAST_SPLAT6]] +; CHECK-IC2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[B]], [[TMP28]] +; CHECK-IC2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[B]], [[TMP14]] +; CHECK-IC2-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP10]], zeroinitializer +; CHECK-IC2-NEXT: br label %[[MIDDLE_LOOP7:.*]] +; CHECK-IC2: [[MIDDLE_LOOP7]]: +; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi [ 
zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT38:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26:.*]] ] +; CHECK-IC2-NEXT: [[VEC_PHI8:%.*]] = phi [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT40:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ] +; CHECK-IC2-NEXT: [[VEC_PHI9:%.*]] = phi [ [[TMP17]], %[[VECTOR_BODY]] ], [ [[TMP57:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ] +; CHECK-IC2-NEXT: [[VEC_PHI10:%.*]] = phi [ [[TMP18]], %[[VECTOR_BODY]] ], [ [[TMP58:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ] +; CHECK-IC2-NEXT: [[TMP19:%.*]] = icmp ne [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP20:%.*]] = icmp ne [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP21:%.*]] = select [[VEC_PHI9]], [[TMP19]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP22:%.*]] = select [[VEC_PHI10]], [[TMP20]], zeroinitializer +; CHECK-IC2-NEXT: br label %[[INNER_LOOP11:.*]] +; CHECK-IC2: [[INNER_LOOP11]]: +; CHECK-IC2-NEXT: [[VEC_PHI12:%.*]] = phi [ zeroinitializer, %[[MIDDLE_LOOP7]] ], [ [[BROADCAST_SPLAT42:%.*]], %[[INNER_LOOP11]] ] +; CHECK-IC2-NEXT: [[VEC_PHI13:%.*]] = phi [ zeroinitializer, %[[MIDDLE_LOOP7]] ], [ [[BROADCAST_SPLAT44:%.*]], %[[INNER_LOOP11]] ] +; CHECK-IC2-NEXT: [[VEC_PHI14:%.*]] = phi [ [[TMP21]], %[[MIDDLE_LOOP7]] ], [ [[TMP64:%.*]], %[[INNER_LOOP11]] ] +; CHECK-IC2-NEXT: [[VEC_PHI15:%.*]] = phi [ [[TMP22]], %[[MIDDLE_LOOP7]] ], [ [[TMP43:%.*]], %[[INNER_LOOP11]] ] +; CHECK-IC2-NEXT: [[TMP23:%.*]] = extractelement [[VEC_PHI12]], i64 0 +; CHECK-IC2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], [[N]] +; CHECK-IC2-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B_INV_GEP]], i64 [[TMP24]] +; CHECK-IC2-NEXT: [[TMP26:%.*]] = getelementptr float, [[TMP15]], [[VEC_PHI12]] +; CHECK-IC2-NEXT: [[TMP27:%.*]] = getelementptr float, [[TMP16]], [[VEC_PHI13]] +; CHECK-IC2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX7:%.*]] = shl i64 [[TMP13]], 4 +; CHECK-IC2-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[DOTIDX7]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP8]], i32 4, [[VEC_PHI14]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[VEC_PHI15]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP26]], i32 4, [[VEC_PHI14]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP27]], i32 4, [[VEC_PHI15]], poison), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP30:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-IC2-NEXT: [[TMP31:%.*]] = fadd [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_GATHER13]] +; CHECK-IC2-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[DOTIDX21:%.*]] = shl i64 [[TMP32]], 4 +; CHECK-IC2-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[DOTIDX21]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP30]], ptr [[TMP8]], i32 4, [[VEC_PHI14]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP31]], ptr [[TMP33]], i32 4, [[VEC_PHI15]]), !llvm.access.group [[ACC_GRP0]] +; CHECK-IC2-NEXT: [[TMP34:%.*]] = extractelement [[VEC_PHI12]], i64 0 +; CHECK-IC2-NEXT: [[TMP35:%.*]] = add i64 [[TMP34]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT41:%.*]] = insertelement poison, i64 [[TMP35]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT42]] = 
shufflevector [[BROADCAST_SPLATINSERT41]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP36:%.*]] = extractelement [[VEC_PHI13]], i64 0 +; CHECK-IC2-NEXT: [[TMP37:%.*]] = add i64 [[TMP36]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT43:%.*]] = insertelement poison, i64 [[TMP37]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT44]] = shufflevector [[BROADCAST_SPLATINSERT43]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP35]], [[L]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i1 [[TMP38]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector [[BROADCAST_SPLATINSERT18]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP37]], [[L]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement poison, i1 [[TMP65]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector [[BROADCAST_SPLATINSERT20]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP66:%.*]] = xor [[BROADCAST_SPLAT19]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP67:%.*]] = xor [[BROADCAST_SPLAT21]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP64]] = select [[VEC_PHI14]], [[TMP66]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP43]] = select [[VEC_PHI15]], [[TMP67]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP44:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP64]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT22:%.*]] = insertelement poison, i1 [[TMP44]], i64 0 +; CHECK-IC2-NEXT: [[TMP45:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP43]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT24:%.*]] = insertelement poison, i1 [[TMP45]], i64 0 +; CHECK-IC2-NEXT: [[TMP46:%.*]] = or [[BROADCAST_SPLATINSERT22]], [[BROADCAST_SPLATINSERT24]] +; CHECK-IC2-NEXT: [[TMP47:%.*]] = shufflevector [[TMP46]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP48:%.*]] = extractelement [[TMP47]], i64 0 +; CHECK-IC2-NEXT: br i1 [[TMP48]], label %[[INNER_LOOP11]], label %[[MIDDLE_LATCH_LOOPEXIT26]] +; CHECK-IC2: [[MIDDLE_LATCH_LOOPEXIT26]]: +; CHECK-IC2-NEXT: [[TMP49:%.*]] = extractelement [[VEC_PHI]], i64 0 +; CHECK-IC2-NEXT: [[TMP50:%.*]] = add nuw nsw i64 [[TMP49]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement poison, i64 [[TMP50]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT38]] = shufflevector [[BROADCAST_SPLATINSERT37]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP51:%.*]] = extractelement [[VEC_PHI8]], i64 0 +; CHECK-IC2-NEXT: [[TMP52:%.*]] = add nuw nsw i64 [[TMP51]], 1 +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement poison, i64 [[TMP52]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT40]] = shufflevector [[BROADCAST_SPLATINSERT39]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP53:%.*]] = icmp eq i64 [[TMP50]], [[M]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement poison, i1 [[TMP53]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector [[BROADCAST_SPLATINSERT27]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[TMP52]], [[M]] +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement poison, i1 [[TMP54]], i64 0 +; CHECK-IC2-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector [[BROADCAST_SPLATINSERT29]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP55:%.*]] = xor [[BROADCAST_SPLAT28]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP56:%.*]] = xor [[BROADCAST_SPLAT30]], splat (i1 true) +; CHECK-IC2-NEXT: [[TMP57]] = select [[VEC_PHI9]], [[TMP55]], zeroinitializer +; CHECK-IC2-NEXT: [[TMP58]] = select [[VEC_PHI10]], [[TMP56]], zeroinitializer +; 
CHECK-IC2-NEXT: [[TMP59:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP57]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement poison, i1 [[TMP59]], i64 0 +; CHECK-IC2-NEXT: [[TMP60:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP58]]) +; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement poison, i1 [[TMP60]], i64 0 +; CHECK-IC2-NEXT: [[TMP61:%.*]] = or [[BROADCAST_SPLATINSERT31]], [[BROADCAST_SPLATINSERT33]] +; CHECK-IC2-NEXT: [[TMP62:%.*]] = shufflevector [[TMP61]], poison, zeroinitializer +; CHECK-IC2-NEXT: [[TMP63:%.*]] = extractelement [[TMP62]], i64 0 +; CHECK-IC2-NEXT: br i1 [[TMP63]], label %[[MIDDLE_LOOP7]], label %[[OUTER_LATCH_LOOPEXIT35]] +; CHECK-IC2: [[OUTER_LATCH_LOOPEXIT35]]: +; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; CHECK-IC2-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IC2-NEXT: [[TMP40:%.*]] = shl i64 [[TMP39]], 2 +; CHECK-IC2-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP5]]) +; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP41]], i64 [[TMP5]]) +; CHECK-IC2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT4]] +; CHECK-IC2-NEXT: [[TMP42:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-IC2-NEXT: br i1 [[TMP42]], label %[[VECTOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-IC2: [[EXIT]]: +; CHECK-IC2-NEXT: ret void +; +entry: + %N.is.zero = icmp eq i64 %N, 0 + br i1 %N.is.zero, label %exit, label %outer.loop + +outer.loop: + %i = phi i64 [ %i.next, %outer.latch ], [ 0, %entry ] + %a.inv.gep = getelementptr float, ptr %A, i64 %i + %i.x.L = mul i64 %i, %L + %b.inv.gep = getelementptr float, ptr %B, i64 %i.x.L + %M.is.zero = icmp eq i64 %M, 0 + br i1 %M.is.zero, label %outer.latch, label %middle.loop + +middle.loop: + %j = phi i64 [ %j.next, %middle.latch ], [ 0, %outer.loop ] + %L.is.zero = icmp eq i64 %L, 0 + br i1 %L.is.zero, label %middle.latch, label %inner.loop + +inner.loop: + %k = phi i64 [ %k.next, %inner.loop ], [ 0, %middle.loop ] + %k.x.N = mul i64 %k, %N + %a.gep = getelementptr float, ptr %a.inv.gep, i64 %k.x.N + %b.gep = getelementptr float, ptr %b.inv.gep, i64 %k + %a.load = load float, ptr %a.gep, align 4, !llvm.access.group !3 + %b.load = load float, ptr %b.gep, align 4, !llvm.access.group !3 + %res = fadd float %a.load, %b.load + store float %res, ptr %a.gep, align 4, !llvm.access.group !3 + %k.next = add nuw nsw i64 %k, 1 + %inner.exitcond = icmp eq i64 %k.next, %L + br i1 %inner.exitcond, label %middle.latch, label %inner.loop + +middle.latch: + %j.next = add nuw nsw i64 %j, 1 + %middle.exitcond = icmp eq i64 %j.next, %M + br i1 %middle.exitcond, label %outer.latch, label %middle.loop + +outer.latch: + %i.next = add nuw nsw i64 %i, 1 + %outer.exitcond = icmp eq i64 %i.next, %N + br i1 %outer.exitcond, label %exit, label %outer.loop, !llvm.loop !0 + +exit: + ret void +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = !{!"llvm.loop.parallel_accesses", !3} +!3 = distinct !{} +;. 
+; CHECK-IC1: [[ACC_GRP0]] = distinct !{} +; CHECK-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK-IC1: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]} +; CHECK-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META3]], [[META4]]} +; CHECK-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]} +; CHECK-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META3]], [[META4]]} +;. +; CHECK-IC2: [[ACC_GRP0]] = distinct !{} +; CHECK-IC2: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK-IC2: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]} +; CHECK-IC2: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-IC2: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META3]], [[META4]]} +; CHECK-IC2: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]} +; CHECK-IC2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META3]], [[META4]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll new file mode 100644 index 0000000000000..46b7bf6f4c7b3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll @@ -0,0 +1,647 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=4 -force-vector-interleave=1 -experimental-olv-in-classic-vect < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +;;; Effectively the inner two loops of: +; for (size_t i = 0; i < N; i++) { +; #pragma clang loop vectorize(enable) +; for (size_t j = 0; j < N; j++) { +; float a = 0.; +; for (size_t k = 0; k < N; k++) +; a += B[i][k] * C[k][j]; +; A[i][j] = a; +; } +; } +define void @foo(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH9:.*]] ] +; CHECK-NEXT: br label %[[INNER_LOOP1:.*]] +; CHECK: [[INNER_LOOP1]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT12:%.*]], %[[INNER_LOOP1]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP6:%.*]], %[[INNER_LOOP1]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ splat (i1 true), %[[VECTOR_BODY]] ], [ [[TMP15:%.*]], %[[INNER_LOOP1]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x float> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9:%.*]], %[[INNER_LOOP1]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP20]], 
align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP7]], [[M]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = fadd <4 x float> [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT: [[TMP9]] = select <4 x i1> [[VEC_PHI3]], <4 x float> [[TMP6]], <4 x float> [[VEC_PHI4]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP19]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP21]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP21]], [[M]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i1> poison, i1 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT5]], +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15]] = select <4 x i1> [[VEC_PHI3]], <4 x i1> [[TMP22]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i1> [[TMP15]] to i4 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[DOTNOT]], label %[[LOOP_LATCH9]], label %[[INNER_LOOP1]] +; CHECK: [[LOOP_LATCH9]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: br label %[[INNER_LOOP:.*]] +; CHECK: [[INNER_LOOP]]: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[A_PHI:%.*]] = phi float [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0.000000e+00, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[J]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load float, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[JXM:%.*]] = mul i64 [[J]], [[M]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[C]], i64 [[JXM]] +; CHECK-NEXT: [[C_ADDR:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[I]] +; CHECK-NEXT: 
[[C_LOAD:%.*]] = load float, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[B_LOAD]], [[C_LOAD]] +; CHECK-NEXT: [[A_NEXT]] = fadd float [[A_PHI]], [[MUL]] +; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1 +; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[M]] +; CHECK-NEXT: br i1 [[INNER_EXITCOND]], label %[[LOOP_LATCH]], label %[[INNER_LOOP]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; CHECK-NEXT: store float [[A_NEXT]], ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ] + br label %inner.loop + +inner.loop: + %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ] + %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ] + %b.addr = getelementptr inbounds float, ptr %B, i64 %j + %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3 + %jxM = mul i64 %j, %M + %jxMpi = add i64 %jxM, %i + %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi + %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3 + %mul = fmul float %b.load, %c.load + %a.next = fadd float %a.phi, %mul + %j.next = add nuw nsw i64 %j, 1 + %inner.exitcond = icmp eq i64 %j.next, %M + br i1 %inner.exitcond, label %loop.latch, label %inner.loop + +loop.latch: + %a.lcssa = phi float [ %a.next, %inner.loop ] + %a.addr = getelementptr inbounds float, ptr %A, i64 %i + store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3 + %i.next = add nuw nsw i64 %i, 1 + %loop.exitcond = icmp eq i64 %i.next, %N + br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +;;; Effectively the inner two loops of: +; for (size_t i = 0; i < N; i++) { +; #pragma clang loop vectorize(enable) +; for (size_t j = 0; j < N; j++) { +; float a = 0.; +; for (size_t k = 0; k < j; k++) +; a += B[i][k] * C[k][j]; +; A[i][j] = a; +; } +; } +;;; Note that the inner loop's trip-count depends on the outer loop. 
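+;;; For illustration only (not part of the checked output; 'lane' is a name made
+;;; up for this comment): with VF=4 the vector lanes correspond to j, j+1, j+2
+;;; and j+3, so each lane's k-loop runs a different number of iterations:
+;   for (size_t k = 0; k < j + lane; k++)  // lane in {0,1,2,3}
+;     a += B[i][k] * C[k][j + lane];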
+define void @bar(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) { +; CHECK-LABEL: define void @bar( +; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC1:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[N_VEC:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[LOOP_LATCH3:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[LOOP_LATCH3]] ] +; CHECK-NEXT: br label %[[INNER_LOOP1:.*]] +; CHECK: [[INNER_LOOP1]]: +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP44:%.*]], %[[INNER_LOOP1]] ] +; CHECK-NEXT: [[INDEX:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP12]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP14]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP16]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 2 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 3 +; CHECK-NEXT: [[TMP43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP41]] +; CHECK-NEXT: [[TMP44]] = fadd <4 x float> [[VEC_PHI3]], [[TMP43]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0 +; CHECK-NEXT: 
[[INDEX_NEXT:%.*]] = add nuw nsw i64 [[TMP25]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX_NEXT]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP51]], label %[[LOOP_LATCH3]], label %[[INNER_LOOP1]] +; CHECK: [[LOOP_LATCH3]]: +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[N_VEC]] +; CHECK-NEXT: store <4 x float> [[TMP44]], ptr [[TMP28]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[N_VEC]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]] +; CHECK-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC1]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: br label %[[INNER_LOOP:.*]] +; CHECK: [[INNER_LOOP]]: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[A_PHI:%.*]] = phi float [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0.000000e+00, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[J]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load float, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[JXM:%.*]] = mul i64 [[J]], [[M]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr float, ptr [[C]], i64 [[JXM]] +; CHECK-NEXT: [[C_ADDR:%.*]] = getelementptr float, ptr [[TMP52]], i64 [[I]] +; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[B_LOAD]], [[C_LOAD]] +; CHECK-NEXT: [[A_NEXT]] = fadd float [[A_PHI]], [[MUL]] +; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1 +; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[I]] +; CHECK-NEXT: br i1 [[INNER_EXITCOND]], label %[[LOOP_LATCH]], label %[[INNER_LOOP]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; CHECK-NEXT: store float [[A_NEXT]], ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ] + br label %inner.loop + +inner.loop: + %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ] + %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ] + %b.addr = getelementptr inbounds float, ptr %B, i64 %j + %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3 + %jxM = mul i64 %j, %M + %jxMpi = add i64 %jxM, %i + %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi + %c.load = load 
float, ptr %c.addr, align 4, !llvm.access.group !3
+  %mul = fmul float %b.load, %c.load
+  %a.next = fadd float %a.phi, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %i
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Effectively something like:
+; #pragma clang loop vectorize(enable)
+; for (long i = 0; i < N; i++) {
+;   long a = A[i];
+;   long j = 0;
+;   if (a > 0) {
+;     do {
+;       a -= B[j];
+;       j++;
+;     } while (a > 0);
+;   }
+;   A[i] = a + j;
+; }
+;;; Note that the inner loop is behind a branch, so the start value of the
+;;; inner-loop mask phi must be set correspondingly. The induction of the
+;;; inner loop is used for uniform memory accesses and as a live-out, so the
+;;; vectorized code should contain two phis for it (one scalar and one widened).
+;;; Also, in this example, the inner loop backedge is the first successor of
+;;; the latch terminator, not the second one as is assumed by VPlan.
+define void @baz(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B) {
+; CHECK-LABEL: define void @baz(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT9:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK: [[INNER_LOOP1]]:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP20:%.*]], %[[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[A_LOAD:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[A_LOAD]], i64 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ poison, %[[INNER_LOOP1]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK: [[PRED_LOAD_IF3]]:
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[A_ADDR]], align 8, 
!llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[TMP9]], i64 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 +; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP13]], i64 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ [[TMP11]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP14]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 +; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP17]], i64 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP18]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP20]] = sub <4 x i64> [[VEC_PHI2]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP21]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP22]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i64 [[TMP23]], 1 +; CHECK-NEXT: br i1 [[TMP24]], label %[[LOOP_LATCH_LOOPEXIT9]], label %[[INNER_LOOP1]] +; CHECK: [[LOOP_LATCH_LOOPEXIT9]]: +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[TMP20]], <4 x i64> [[WIDE_LOAD]] +; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[A_ADDR1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[A_LOAD1:%.*]] = load i64, ptr [[A_ADDR1]], align 8, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[A_IS_POSITIVE:%.*]] = icmp sgt i64 [[A_LOAD1]], 0 +; CHECK-NEXT: br i1 [[A_IS_POSITIVE]], label %[[INNER_LOOP:.*]], label %[[LOOP_LATCH]] +; CHECK: [[INNER_LOOP]]: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ] +; 
CHECK-NEXT: [[A_PHI:%.*]] = phi i64 [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i64, ptr [[B]], i64 [[J]]
+; CHECK-NEXT: [[B_LOAD:%.*]] = load i64, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[A_NEXT]] = sub i64 [[A_PHI]], [[B_LOAD]]
+; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT: [[A_IS_STILL_POSITIVE:%.*]] = icmp sgt i64 [[A_NEXT]], 0
+; CHECK-NEXT: br i1 [[A_IS_STILL_POSITIVE]], label %[[INNER_LOOP]], label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[A_RES:%.*]] = phi i64 [ [[A_LOAD1]], %[[LOOP_HEADER]] ], [ [[A_NEXT]], %[[INNER_LOOP]] ]
+; CHECK-NEXT: store i64 [[A_RES]], ptr [[A_ADDR1]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  %a.addr = getelementptr inbounds i64, ptr %A, i64 %i
+  %a.load = load i64, ptr %a.addr, align 8, !llvm.access.group !3
+  %a.is.positive = icmp sgt i64 %a.load, 0
+  br i1 %a.is.positive, label %inner.loop, label %loop.latch
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi i64 [ %a.next, %inner.loop ], [ 0, %loop.header ]
+  %b.addr = getelementptr inbounds i64, ptr %B, i64 %j
+  %b.load = load i64, ptr %b.addr, align 8, !llvm.access.group !3
+  %a.next = sub i64 %a.phi, %b.load
+  %j.next = add nuw nsw i64 %j, 1
+  %a.is.still.positive = icmp sgt i64 %a.next, 0
+  br i1 %a.is.still.positive, label %inner.loop, label %loop.latch
+
+loop.latch:
+  %a.res = phi i64 [ %a.load, %loop.header ], [ %a.next, %inner.loop ]
+  store i64 %a.res, ptr %a.addr, align 8, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Triple-loop nest with the outermost loop being vectorized.
+; #pragma clang loop vectorize(enable)
+; for (size_t i = 0; i < N; i++)
+;   for (size_t j = 0; j < M; j++)
+;     for (size_t k = 0; k < L; k++)
+;       A[k][i] += B[i][k];
+define void @quuz(i64 %N, i64 %M, i64 %L, ptr noalias %A, ptr readonly %B) {
+; CHECK-LABEL: define void @quuz(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-NEXT: br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[OUTER_LOOP_PREHEADER:.*]]
+; CHECK: [[OUTER_LOOP_PREHEADER]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[L]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT25:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT25]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: br label %[[MIDDLE_LOOP3:.*]]
+; CHECK: [[MIDDLE_LOOP3]]:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT27:%.*]], %[[MIDDLE_LATCH_LOOPEXIT20:.*]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ [[TMP7]], %[[VECTOR_BODY]] ], [ [[TMP65:%.*]], %[[MIDDLE_LATCH_LOOPEXIT20]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[VEC_PHI4]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT: br label %[[INNER_LOOP5:.*]]
+; CHECK: [[INNER_LOOP5]]:
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP3]] ], [ [[BROADCAST_SPLAT29:%.*]], %[[PRED_STORE_CONTINUE15:.*]] ]
+; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ [[TMP4]], %[[MIDDLE_LOOP3]] ], [ [[TMP58:%.*]], %[[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[VEC_PHI6]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 0
+; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP28]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP29]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: 
[[TMP33:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP32]] +; CHECK-NEXT: [[INDEX:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP33]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd float [[TMP10]], [[TMP15]] +; CHECK-NEXT: store float [[TMP16]], ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 1 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] +; CHECK: [[PRED_STORE_IF10]]: +; CHECK-NEXT: [[TMP18:%.*]] = or disjoint i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP19]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 1 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP24]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP67:%.*]] = fadd float [[TMP22]], [[TMP27]] +; CHECK-NEXT: store float [[TMP67]], ptr [[TMP21]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE11]] +; CHECK: [[PRED_STORE_CONTINUE11]]: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 2 +; CHECK-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP30:%.*]] = or disjoint i64 [[TMP28]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i64> [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr float, ptr [[TMP31]], i64 [[TMP69]] +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP70]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 2 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP36]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP40:%.*]] = fadd float [[TMP34]], [[TMP39]] +; CHECK-NEXT: store float [[TMP40]], ptr [[TMP70]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 3 +; CHECK-NEXT: br i1 [[TMP41]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP42:%.*]] = or disjoint i64 [[TMP28]], 3 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP5]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP43]], i64 [[TMP44]] +; CHECK-NEXT: 
[[TMP46:%.*]] = load float, ptr [[TMP45]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 3
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP48]], i64 [[TMP49]]
+; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[TMP50]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP52:%.*]] = fadd float [[TMP46]], [[TMP51]]
+; CHECK-NEXT: store float [[TMP52]], ptr [[TMP45]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; CHECK: [[PRED_STORE_CONTINUE15]]:
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 0
+; CHECK-NEXT: [[TMP54:%.*]] = add nuw nsw i64 [[TMP71]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i64> poison, i64 [[TMP54]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT29]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT28]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[TMP54]], [[L]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[TMP55]], i64 0
+; CHECK-NEXT: [[TMP56:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT16]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i1> [[TMP56]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP58]] = select <4 x i1> [[VEC_PHI7]], <4 x i1> [[TMP57]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP59:%.*]] = bitcast <4 x i1> [[TMP58]] to i4
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP59]], 0
+; CHECK-NEXT: br i1 [[DOTNOT]], label %[[MIDDLE_LATCH_LOOPEXIT20]], label %[[INNER_LOOP5]]
+; CHECK: [[MIDDLE_LATCH_LOOPEXIT20]]:
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[TMP61:%.*]] = add nuw nsw i64 [[TMP60]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <4 x i64> poison, i64 [[TMP61]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT27]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT26]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP62:%.*]] = icmp eq i64 [[TMP61]], [[M]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i1> poison, i1 [[TMP62]], i64 0
+; CHECK-NEXT: [[TMP63:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT21]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <4 x i1> [[TMP63]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP65]] = select <4 x i1> [[VEC_PHI4]], <4 x i1> [[TMP64]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP66:%.*]] = bitcast <4 x i1> [[TMP65]] to i4
+; CHECK-NEXT: [[DOTNOT30:%.*]] = icmp eq i4 [[TMP66]], 0
+; CHECK-NEXT: br i1 [[DOTNOT30]], label %[[OUTER_LATCH_LOOPEXIT25]], label %[[MIDDLE_LOOP3]]
+; CHECK: [[OUTER_LATCH_LOOPEXIT25]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP_PREHEADER]] ]
+; CHECK-NEXT: br label %[[OUTER_LOOP:.*]]
+; CHECK: [[OUTER_LOOP]]:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[OUTER_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[A_INV_GEP:%.*]] = getelementptr float, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[I_X_L:%.*]] = mul i64 [[I]], [[L]] +; CHECK-NEXT: [[B_INV_GEP:%.*]] = getelementptr float, ptr [[B]], i64 [[I_X_L]] +; CHECK-NEXT: [[M_IS_ZERO:%.*]] = icmp eq i64 [[M]], 0 +; CHECK-NEXT: br i1 [[M_IS_ZERO]], label %[[OUTER_LATCH]], label %[[MIDDLE_LOOP:.*]] +; CHECK: [[MIDDLE_LOOP]]: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[MIDDLE_LATCH:.*]] ], [ 0, %[[OUTER_LOOP]] ] +; CHECK-NEXT: [[L_IS_ZERO:%.*]] = icmp eq i64 [[L]], 0 +; CHECK-NEXT: br i1 [[L_IS_ZERO]], label %[[MIDDLE_LATCH]], label %[[INNER_LOOP:.*]] +; CHECK: [[INNER_LOOP]]: +; CHECK-NEXT: [[K:%.*]] = phi i64 [ [[K_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[MIDDLE_LOOP]] ] +; CHECK-NEXT: [[K_X_N:%.*]] = mul i64 [[K]], [[N]] +; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr float, ptr [[A_INV_GEP]], i64 [[K_X_N]] +; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr float, ptr [[B_INV_GEP]], i64 [[K]] +; CHECK-NEXT: [[A_LOAD:%.*]] = load float, ptr [[A_GEP]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load float, ptr [[B_GEP]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[RES:%.*]] = fadd float [[A_LOAD]], [[B_LOAD]] +; CHECK-NEXT: store float [[RES]], ptr [[A_GEP]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[K_NEXT]] = add nuw nsw i64 [[K]], 1 +; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[K_NEXT]], [[L]] +; CHECK-NEXT: br i1 [[INNER_EXITCOND]], label %[[MIDDLE_LATCH]], label %[[INNER_LOOP]] +; CHECK: [[MIDDLE_LATCH]]: +; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1 +; CHECK-NEXT: [[MIDDLE_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[M]] +; CHECK-NEXT: br i1 [[MIDDLE_EXITCOND]], label %[[OUTER_LATCH]], label %[[MIDDLE_LOOP]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[OUTER_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[OUTER_EXITCOND]], label %[[EXIT]], label %[[OUTER_LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %N.is.zero = icmp eq i64 %N, 0 + br i1 %N.is.zero, label %exit, label %outer.loop + +outer.loop: + %i = phi i64 [ %i.next, %outer.latch ], [ 0, %entry ] + %a.inv.gep = getelementptr float, ptr %A, i64 %i + %i.x.L = mul i64 %i, %L + %b.inv.gep = getelementptr float, ptr %B, i64 %i.x.L + %M.is.zero = icmp eq i64 %M, 0 + br i1 %M.is.zero, label %outer.latch, label %middle.loop + +middle.loop: + %j = phi i64 [ %j.next, %middle.latch ], [ 0, %outer.loop ] + %L.is.zero = icmp eq i64 %L, 0 + br i1 %L.is.zero, label %middle.latch, label %inner.loop + +inner.loop: + %k = phi i64 [ %k.next, %inner.loop ], [ 0, %middle.loop ] + %k.x.N = mul i64 %k, %N + %a.gep = getelementptr float, ptr %a.inv.gep, i64 %k.x.N + %b.gep = getelementptr float, ptr %b.inv.gep, i64 %k + %a.load = load float, ptr %a.gep, align 4, !llvm.access.group !3 + %b.load = load float, ptr %b.gep, align 4, !llvm.access.group !3 + %res = fadd float %a.load, %b.load + store float %res, ptr %a.gep, align 4, !llvm.access.group !3 + %k.next = add nuw nsw i64 %k, 1 + %inner.exitcond = icmp eq i64 %k.next, %L + br i1 %inner.exitcond, label %middle.latch, label %inner.loop + +middle.latch: + %j.next = add nuw nsw i64 %j, 1 + %middle.exitcond = icmp eq i64 %j.next, %M + br i1 %middle.exitcond, label %outer.latch, label %middle.loop + +outer.latch: + %i.next = add nuw nsw i64 %i, 1 + 
%outer.exitcond = icmp eq i64 %i.next, %N + br i1 %outer.exitcond, label %exit, label %outer.loop, !llvm.loop !0 + +exit: + ret void +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = !{!"llvm.loop.parallel_accesses", !3} +!3 = distinct !{} +;. +; CHECK: [[ACC_GRP0]] = distinct !{} +; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]} +; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META4]], [[META3]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META4]], [[META3]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META3]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META4]], [[META3]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META4]], [[META3]]} +;.
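+;;; Shared loop metadata for the tests above: !0 is the loop ID attached to
+;;; each outer-loop latch branch, !1 enables vectorization of that loop, and
+;;; !2 declares every access tagged with !llvm.access.group !3 free of
+;;; loop-carried dependences. A rough C analogue of what the annotations
+;;; assert (an illustrative sketch only, not checked by this test):
+;
+;   #pragma clang loop vectorize(enable)
+;   for (long i = 0; i < N; i++)  // latch branch carries !llvm.loop !0
+;     A[i] += B[i];               // accesses carry !llvm.access.group !3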