diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 966ffb7a1fbd2..e4e794c434adb 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -190,6 +190,38 @@ class SwingSchedulerDDGEdge {
   bool ignoreDependence(bool IgnoreAnti) const;
 };
 
+/// Represents loop-carried dependencies. Because SwingSchedulerDAG, as its
+/// name suggests, cannot contain cyclic dependencies, such dependencies must
+/// be handled separately. After DAG construction is finished, these
+/// dependencies are added to SwingSchedulerDDG.
+/// TODO: Also handle output-dependencies introduced by physical registers.
+struct LoopCarriedEdges {
+  using OrderDep = SmallSetVector<SUnit *, 8>;
+  using OrderDepsType = DenseMap<SUnit *, OrderDep>;
+
+  OrderDepsType OrderDeps;
+
+  const OrderDep *getOrderDepOrNull(SUnit *Key) const {
+    auto Ite = OrderDeps.find(Key);
+    if (Ite == OrderDeps.end())
+      return nullptr;
+    return &Ite->second;
+  }
+
+  /// Returns true if the edge from \p From to \p To is a back-edge that should
+  /// be used when scheduling.
+  bool shouldUseWhenScheduling(const SUnit *From, const SUnit *To) const;
+
+  /// Adds some edges to the original DAG that correspond to loop-carried
+  /// dependencies. Historically, loop-carried edges have been represented by
+  /// non-loop-carried edges in the original DAG. This function appends such
+  /// edges to preserve the previous behavior.
+  void modifySUnits(std::vector<SUnit> &SUnits);
+
+  void dump(SUnit *SU, const TargetRegisterInfo *TRI,
+            const MachineRegisterInfo *MRI) const;
+};
+
 /// Represents dependencies between instructions. This class is a wrapper of
 /// `SUnits` and its dependencies to manipulate back-edges in a natural way.
 /// Currently it only supports back-edges via PHI, which are expressed as
@@ -402,7 +434,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
                               const MachineInstr *OtherMI) const;
 
 private:
-  void addLoopCarriedDependences();
+  LoopCarriedEdges addLoopCarriedDependences();
   void updatePhiDependences();
   void changeDependences();
   unsigned calculateResMII();
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3d161ffbe40a4..fdc6102c719e8 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -266,6 +266,82 @@ struct SUnitWithMemInfo {
   bool getUnderlyingObjects();
 };
 
+/// Add loop-carried chain dependencies. This class handles the same type of
+/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
+/// account dependencies across iterations.
+class LoopCarriedOrderDepsTracker {
+  // Type of instruction that is relevant to order-dependencies.
+  enum class InstrTag {
+    Barrier = 0,      ///< A barrier event instruction.
+    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
+                      ///< not a barrier event.
+    FPExceptions = 2, ///< An instruction that does not match above, but may
+                      ///< raise floating-point exceptions.
+  };
+
+  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
+    TaggedSUnit(SUnit *SU, InstrTag Tag)
+        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
+
+    InstrTag getTag() const { return InstrTag(getInt()); }
+  };
+
+  /// Holds loads and stores with memory related information.
+  struct LoadStoreChunk {
+    SmallVector<SUnitWithMemInfo> Loads;
+    SmallVector<SUnitWithMemInfo> Stores;
+
+    void append(SUnit *SU);
+  };
+
+  SwingSchedulerDAG *DAG;
+  BatchAAResults *BAA;
+  std::vector<SUnit> &SUnits;
+
+  /// The size of SUnits, for convenience.
+  const unsigned N;
+
+  /// Loop-carried edges. LoopCarried[I] holds the NodeNums of the
+  /// loop-carried successors of SUnits[I].
+  std::vector<BitVector> LoopCarried;
+
+  /// Instructions related to chain dependencies. They are one of the
+  /// following:
+  ///
+  ///  1. Barrier event.
+  ///  2. Load, but neither a barrier event, an invariant load, nor a load
+  ///     that may read a trap value.
+  ///  3. Store, but not a barrier event.
+  ///  4. None of them, but may raise floating-point exceptions.
+  ///
+  /// This is used when analyzing loop-carried dependencies in the presence of
+  /// global barrier instructions.
+  std::vector<TaggedSUnit> TaggedSUnits;
+
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+
+public:
+  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA,
+                              const TargetInstrInfo *TII,
+                              const TargetRegisterInfo *TRI);
+
+  /// The main function to compute loop-carried order-dependencies.
+  void computeDependencies();
+
+  const BitVector &getLoopCarried(unsigned Idx) const {
+    return LoopCarried[Idx];
+  }
+
+private:
+  /// Returns the tag for \p SU if the instruction may affect the
+  /// order-dependencies, or std::nullopt otherwise.
+  std::optional<InstrTag> getInstrTag(SUnit *SU) const;
+
+  void addLoopCarriedDependenciesForChunks(const LoadStoreChunk &From,
+                                           const LoadStoreChunk &To);
+
+  void computeDependenciesAux();
+};
+
 } // end anonymous namespace
 
 /// The "main" function for implementing Swing Modulo Scheduling.
@@ -593,13 +669,19 @@ void SwingSchedulerDAG::setMAX_II() {
 /// scheduling part of the Swing Modulo Scheduling algorithm.
 void SwingSchedulerDAG::schedule() {
   buildSchedGraph(AA);
-  addLoopCarriedDependences();
+  const LoopCarriedEdges LCE = addLoopCarriedDependences();
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
   changeDependences();
   postProcessDAG();
   DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
-  LLVM_DEBUG(dump());
+  LLVM_DEBUG({
+    dump();
+    dbgs() << "===== Loop Carried Edges Begin =====\n";
+    for (SUnit &SU : SUnits)
+      LCE.dump(&SU, TRI, &MRI);
+    dbgs() << "===== Loop Carried Edges End =====\n";
+  });
 
   NodeSetType NodeSets;
   findCircuits(NodeSets);
@@ -832,15 +914,6 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
   return false;
 }
 
-/// Return true if the instruction causes a chain between memory
-/// references before and after it.
-static bool isDependenceBarrier(MachineInstr &MI) {
-  return MI.isCall() || MI.mayRaiseFPException() ||
-         MI.hasUnmodeledSideEffects() ||
-         (MI.hasOrderedMemoryRef() &&
-          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
-}
-
 SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
   if (!getUnderlyingObjects())
     return;
@@ -941,28 +1014,111 @@ static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
   return false;
 }
 
+void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
+  const MachineInstr *MI = SU->getInstr();
+  if (!MI->mayLoadOrStore())
+    return;
+  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
+}
+
+LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
+    SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
+    const TargetRegisterInfo *TRI)
+    : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
+      LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
+
+void LoopCarriedOrderDepsTracker::computeDependencies() {
+  // Traverse all instructions and extract only what we are targeting.
+  for (auto &SU : SUnits) {
+    auto Tagged = getInstrTag(&SU);
+
+    // This instruction has no loop-carried order-dependencies.
+    if (!Tagged)
+      continue;
+    TaggedSUnits.emplace_back(&SU, *Tagged);
+  }
+
+  computeDependenciesAux();
+}
+
+std::optional<LoopCarriedOrderDepsTracker::InstrTag>
+LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+  if (TII->isGlobalMemoryObject(MI))
+    return InstrTag::Barrier;
+
+  if (MI->mayStore() ||
+      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
+    return InstrTag::LoadOrStore;
+
+  if (MI->mayRaiseFPException())
+    return InstrTag::FPExceptions;
+
+  return std::nullopt;
+}
+
+void LoopCarriedOrderDepsTracker::addLoopCarriedDependenciesForChunks(
+    const LoadStoreChunk &From, const LoadStoreChunk &To) {
+  // Add dependencies for load-to-store (WAR) from top to bottom.
+  for (const SUnitWithMemInfo &Src : From.Loads)
+    for (const SUnitWithMemInfo &Dst : To.Stores)
+      if (Src.SU->NodeNum < Dst.SU->NodeNum &&
+          hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
+        LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
+
+  // TODO: The following dependencies are missed.
+  //
+  // - Dependencies for load-to-store from bottom to top.
+  // - Dependencies for store-to-load (RAW).
+  // - Dependencies for store-to-store (WAW).
+}
+
+void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
+  SmallVector<LoadStoreChunk> Chunks(1);
+  for (const auto &TSU : TaggedSUnits) {
+    InstrTag Tag = TSU.getTag();
+    SUnit *SU = TSU.getPointer();
+    switch (Tag) {
+    case InstrTag::Barrier:
+      Chunks.emplace_back();
+      break;
+    case InstrTag::LoadOrStore:
+      Chunks.back().append(SU);
+      break;
+    case InstrTag::FPExceptions:
+      // TODO: Handle this properly.
+      break;
+    }
+  }
+
+  // Add dependencies between memory operations. If there are one or more
+  // barrier events between two memory instructions, we don't add a
+  // loop-carried dependence for them.
+  for (const LoadStoreChunk &Chunk : Chunks)
+    addLoopCarriedDependenciesForChunks(Chunk, Chunk);
+
+  // TODO: If there are multiple barrier instructions, also add dependencies
+  // from the last barrier instruction (or the loads/stores below it) to the
+  // first barrier instruction (or the loads/stores above it).
+}
+
 /// Add a chain edge between a load and store if the store can be an
 /// alias of the load on a subsequent iteration, i.e., a loop carried
 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
 /// but that code doesn't create loop carried dependences.
-void SwingSchedulerDAG::addLoopCarriedDependences() {
-  SmallVector<SUnitWithMemInfo> PendingLoads;
-  for (auto &SU : SUnits) {
-    MachineInstr &MI = *SU.getInstr();
-    if (isDependenceBarrier(MI))
-      PendingLoads.clear();
-    else if (MI.mayLoad()) {
-      PendingLoads.emplace_back(&SU);
-    } else if (MI.mayStore()) {
-      SUnitWithMemInfo Store(&SU);
-      for (const SUnitWithMemInfo &Load : PendingLoads)
-        if (hasLoopCarriedMemDep(Load, Store, BAA, TII, TRI)) {
-          SDep Dep(Load.SU, SDep::Barrier);
-          Dep.setLatency(1);
-          SU.addPred(Dep);
-        }
-    }
-  }
+/// TODO: Also compute output-dependencies.
+LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
+  LoopCarriedEdges LCE;
+
+  // Add loop-carried order-dependencies.
+  LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
+  LCODTracker.computeDependencies();
+  for (unsigned I = 0; I != SUnits.size(); I++)
+    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
+      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
+
+  LCE.modifySUnits(SUnits);
+  return LCE;
 }
 
 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
@@ -4002,3 +4158,37 @@ const SwingSchedulerDDG::EdgesType &
 SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
   return getEdges(SU).Succs;
 }
+
+void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
+  // Currently this function simply adds all dependencies represented by this
+  // object. After we properly handle missed dependencies, the logic here will
+  // be more complex, as currently missed edges should not be added to the DAG.
+  for (SUnit &SU : SUnits) {
+    SUnit *Src = &SU;
+    if (auto *OrderDep = getOrderDepOrNull(Src)) {
+      SDep Dep(Src, SDep::Barrier);
+      Dep.setLatency(1);
+      for (SUnit *Dst : *OrderDep)
+        Dst->addPred(Dep);
+    }
+  }
+}
+
+void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
+                            const MachineRegisterInfo *MRI) const {
+  const auto *Order = getOrderDepOrNull(SU);
+
+  if (!Order)
+    return;
+
+  const auto DumpSU = [](const SUnit *SU) {
+    std::ostringstream OSS;
+    OSS << "SU(" << SU->NodeNum << ")";
+    return OSS.str();
+  };
+
+  dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
+         << "   Order\n";
+  for (SUnit *Dst : *Order)
+    dbgs() << "     " << DumpSU(Dst) << "\n";
+}
diff --git a/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
new file mode 100644
index 0000000000000..bcc6a3ea9b285
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=aarch64 -run-pass=pipeliner -debug-only=pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test a case where fenv is enabled, there are instructions that may raise a
+# floating-point exception, and there is a barrier-event instruction. In this
+# case their relative order must not change.
+#
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(7)
+#   Order
+#     SU(2)
+#     SU(3)
+#     SU(4)
+#     SU(5)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  @x = dso_local global i32 0, align 4
+
+  define dso_local void @f(ptr nocapture noundef writeonly %a, float noundef %y, i32 noundef %n) {
+  entry:
+    %cmp6 = icmp sgt i32 %n, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %tmp9 = trunc i64 %indvars.iv to i32
+    %conv = tail call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %tmp9, metadata !"round.dynamic", metadata !"fpexcept.strict") #2
+    %add = tail call float @llvm.experimental.constrained.fadd.f32(float %conv, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #2
+    %0 = shl nuw nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %a, i64 %0
+    store float %add, ptr %scevgep, align 4, !tbaa !6
+    %1 = load volatile i32, ptr @x, align 4, !tbaa !10
+    %2 = zext i32 %1 to i64
+    %3 = add i64 %indvars.iv, %2
+    %tmp = trunc i64 %3 to i32
+    store volatile i32 %tmp, ptr @x, align 4, !tbaa !10
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond.not = icmp eq i64 %wide.trip.count, %indvars.iv.next
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+  attributes #2 = { strictfp }
+
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"float", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+  !10 = !{!11, !11, i64 0}
+  !11 = !{!"int", !8, i64 0}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $x0, $s0, $w1
+
+    %5:gpr32common = COPY $w1
+    %4:fpr32 = COPY $s0
+    %3:gpr64common = COPY $x0
+    dead $wzr = SUBSWri %5, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.for.body.preheader:
+    %8:gpr32 = ORRWrs $wzr, %5, 0
+    %0:gpr64 = SUBREG_TO_REG 0, killed %8, %subreg.sub_32
+    %9:gpr64all = COPY $xzr
+    %7:gpr64all = COPY %9
+    %13:gpr64common = ADRP target-flags(aarch64-page) @x
+    B %bb.3
+
+  bb.2.for.cond.cleanup:
+    RET_ReallyLR
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %1:gpr64common = PHI %7, %bb.1, %2, %bb.3
+    %10:gpr32 = COPY %1.sub_32
+    %11:fpr32 = SCVTFUWSri %10, implicit $fpcr
+    %12:fpr32 = FADDSrr killed %11, %4, implicit $fpcr
+    STRSroX killed %12, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep, !tbaa !6)
+    %14:gpr32 = LDRWui %13, target-flags(aarch64-pageoff, aarch64-nc) @x :: (volatile dereferenceable load (s32) from @x, !tbaa !10)
+    %15:gpr32 = ADDWrr %10, killed %14
+    STRWui killed %15, %13, target-flags(aarch64-pageoff, aarch64-nc) @x :: (volatile store (s32) into @x, !tbaa !10)
+    %16:gpr64common = nuw nsw ADDXri %1, 1, 0
+    %2:gpr64all = COPY %16
+    dead $xzr = SUBSXrr %0, %16, implicit-def $nzcv
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+...
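The tests in this patch check the debug output against the edges recorded in `LoopCarriedEdges`. As a minimal sketch of how those recorded edges can be consumed through the new API from MachinePipeliner.h (a hypothetical fragment assuming the surrounding `SwingSchedulerDAG` member context, e.g. inside `schedule()`; it is not code from this patch), `getOrderDepOrNull` drives the same traversal that `LCE.dump` performs between the "Loop Carried Edges" markers:

```cpp
// Hypothetical fragment (assumes SwingSchedulerDAG member context): walk the
// loop-carried order successors recorded by addLoopCarriedDependences().
const LoopCarriedEdges LCE = addLoopCarriedDependences();
for (SUnit &SU : SUnits) {
  // Nodes without loop-carried order successors yield nullptr.
  if (const LoopCarriedEdges::OrderDep *Succs = LCE.getOrderDepOrNull(&SU))
    for (SUnit *Succ : *Succs)
      dbgs() << "loop-carried order: SU(" << SU.NodeNum << ") -> SU("
             << Succ->NodeNum << ")\n";
}
```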
diff --git a/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
new file mode 100644
index 0000000000000..6116f15811ec7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
@@ -0,0 +1,99 @@
+# RUN: llc -mtriple=aarch64 -run-pass=pipeliner -debug-only=pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test a case where fenv is enabled, there are instructions that may raise a
+# floating-point exception, but there is no barrier-event instruction. In this
+# case no loop-carried dependencies are necessary.
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local float @f(ptr nocapture noundef writeonly %a, float noundef %y, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %conv = tail call float @llvm.experimental.constrained.fptrunc.f32.f64(double 1.000000e+00, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %cmp8 = icmp sgt i32 %n, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+
+  for.cond.cleanup:
+    %acc.0.lcssa = phi float [ %conv, %entry ], [ %mul, %for.body ]
+    ret float %acc.0.lcssa
+
+  for.body:
+    %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %acc.010 = phi float [ %conv, %for.body.preheader ], [ %mul, %for.body ]
+    %tmp = trunc i64 %indvars.iv to i32
+    %conv2 = tail call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %tmp, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %add = tail call float @llvm.experimental.constrained.fadd.f32(float %conv2, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %mul = tail call float @llvm.experimental.constrained.fmul.f32(float %acc.010, float %add, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %0 = shl nuw nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %a, i64 %0
+    store float %add, ptr %scevgep, align 4, !tbaa !6
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond.not = icmp eq i64 %wide.trip.count, %indvars.iv.next
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"float", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $x0, $s0, $w1
+
+    %9:gpr32common = COPY $w1
+    %8:fpr32 = COPY $s0
+    %7:gpr64common = COPY $x0
+    %10:fpr64 = FMOVDi 112
+    %0:fpr32 = FCVTSDr killed %10, implicit $fpcr
+    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.for.body.preheader:
+    %13:gpr32 = ORRWrs $wzr, %9, 0
+    %1:gpr64 = SUBREG_TO_REG 0, killed %13, %subreg.sub_32
+    %14:gpr64all = COPY $xzr
+    %12:gpr64all = COPY %14
+    B %bb.3
+
+  bb.2.for.cond.cleanup:
+    %2:fpr32 = PHI %0, %bb.0, %5, %bb.3
+    $s0 = COPY %2
+    RET_ReallyLR implicit $s0
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %3:gpr64common = PHI %12, %bb.1, %6, %bb.3
+    %4:fpr32 = PHI %0, %bb.1, %5, %bb.3
+    %15:gpr32 = COPY %3.sub_32
+    %16:fpr32 = SCVTFUWSri killed %15, implicit $fpcr
+    %17:fpr32 = FADDSrr killed %16, %8, implicit $fpcr
+    %5:fpr32 = FMULSrr %4, %17, implicit $fpcr
+    STRSroX %17, %7, %3, 0, 1 :: (store (s32) into %ir.scevgep, !tbaa !6)
+    %18:gpr64common = nuw nsw ADDXri %3, 1, 0
+    %6:gpr64all = COPY %18
+    dead $xzr = SUBSXrr %1, %18, implicit-def $nzcv
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
new file mode 100644
index 0000000000000..17ee07f49324a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
@@ -0,0 +1,110 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 0; i < n-1; i++) {
+#     a[i] += a[i];
+#     a[i+1] += i;
+#   }
+# }
+# ```
+#
+# Loop-carried dependencies exist from the store to a[i+1] to the load/store
+# of a[i], but not vice versa.
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(6)
+#   Order
+#     SU(4)
+# Loop carried edges from SU(8)
+#   Order
+#     SU(4)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp12 = icmp sgt i32 %n, 1
+    br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %0 = add i32 %n, -1
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv14 = phi ptr [ %cgep, %for.body.preheader ], [ %cgep18, %for.body ]
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %1 = phi i32 [ %add4, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.013 = phi i32 [ %add2, %for.body ], [ 0, %for.body.preheader ]
+    %add = shl nsw i32 %1, 1
+    %cgep17 = getelementptr i8, ptr %lsr.iv14, i32 -4
+    store i32 %add, ptr %cgep17, align 4, !tbaa !5
+    %add2 = add nuw nsw i32 %i.013, 1
+    %2 = load i32, ptr %lsr.iv14, align 4, !tbaa !5
+    %add4 = add nsw i32 %2, %i.013
+    %3 = add i32 %i.013, %2
+    store i32 %3, ptr %lsr.iv14, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep18 = getelementptr i8, ptr %lsr.iv14, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+
+    %12:intregs = COPY $r1
+    %11:intregs = COPY $r0
+    %13:predregs = C2_cmpgti %12, 1
+    J2_jumpf %13, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %2:intregs = L2_loadri_pi %11, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %1:intregs = A2_addi %12, -1
+    %15:intregs = A2_tfrsi 0
+    %19:intregs = COPY %1
+    J2_loop0r %bb.3, %19, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %3:intregs = PHI %2, %bb.1, %10, %bb.3
+    %5:intregs = PHI %0, %bb.1, %8, %bb.3
+    %6:intregs = PHI %15, %bb.1, %7, %bb.3
+    %16:intregs = nsw S2_asl_i_r %5, 1
+    S2_storeri_io %3, -4, killed %16 :: (store (s32) into %ir.cgep17, !tbaa !5)
+    %7:intregs = nuw nsw A2_addi %6, 1
+    %17:intregs = L2_loadri_io %3, 0 :: (load (s32) from %ir.lsr.iv14, !tbaa !5)
+    %8:intregs = A2_add killed %17, %6
+    S2_storeri_io %3, 0, %8 :: (store (s32) into %ir.lsr.iv14, !tbaa !5)
+    %10:intregs = A2_addi %3, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
new file mode 100644
index 0000000000000..850e602c9146f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
@@ -0,0 +1,105 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 1; i < n; i++) {
+#     a[i] += a[i];
+#     a[i-1] += i;
+#   }
+# }
+# ```
+#
+# Loop-carried dependencies exist from the load/store of a[i] to the store to
+# a[i-1], but not vice versa.
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(5)
+#   Order
+#     SU(7)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: Loop carried edges from SU(3)
+# CHECK-NEXT:   Order
+# CHECK-NEXT:     SU(7)
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp11 = icmp sgt i32 %n, 1
+    br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %load_initial = load i32, ptr %a, align 4
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv = phi ptr [ %cgep, %for.body.preheader ], [ %cgep16, %for.body ]
+    %store_forwarded = phi i32 [ %load_initial, %for.body.preheader ], [ %add, %for.body ]
+    %i.012 = phi i32 [ 1, %for.body.preheader ], [ %inc, %for.body ]
+    %0 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %add = shl nsw i32 %0, 1
+    store i32 %add, ptr %lsr.iv, align 4, !tbaa !5
+    %1 = add i32 %store_forwarded, %i.012
+    %cgep15 = getelementptr i8, ptr %lsr.iv, i32 -4
+    store i32 %1, ptr %cgep15, align 4, !tbaa !5
+    %inc = add nuw nsw i32 %i.012, 1
+    %exitcond.not = icmp eq i32 %n, %inc
+    %cgep16 = getelementptr i8, ptr %lsr.iv, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+
+    %9:intregs = COPY $r1
+    %8:intregs = COPY $r0
+    %10:predregs = C2_cmpgti %9, 1
+    J2_jumpf %10, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %1:intregs = L2_loadri_pi %8, 4 :: (load (s32) from %ir.a)
+    %12:intregs = A2_tfrsi 1
+    %16:intregs = A2_addi %9, -1
+    %17:intregs = COPY %16
+    J2_loop0r %bb.3, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body (machine-block-address-taken):
+    successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+    %2:intregs = PHI %1, %bb.1, %7, %bb.3
+    %3:intregs = PHI %0, %bb.1, %5, %bb.3
+    %4:intregs = PHI %12, %bb.1, %6, %bb.3
+    %13:intregs = L2_loadri_io %2, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %5:intregs = nsw S2_asl_i_r killed %13, 1
+    S2_storeri_io %2, 0, %5 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %14:intregs = A2_add %3, %4
+    S2_storeri_io %2, -4, killed %14 :: (store (s32) into %ir.cgep15, !tbaa !5)
+    %6:intregs = nuw nsw A2_addi %4, 1
+    %7:intregs = A2_addi %2, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
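dep2 above is the one test in this patch whose CHECK lines expect a recorded edge, so it makes a concrete worked example of the WAR scan in `addLoopCarriedDependenciesForChunks`. Below is a self-contained mini-model of that scan; simplified byte offsets stand in for the `BatchAAResults` queries that `hasLoopCarriedMemDep` actually performs, and the SU numbers are the ones from the test comment:

```cpp
#include <cstdio>

// Mini-model of the load-to-store (WAR) scan for swp-loop-carried-order-dep2:
// per iteration, SU(3) loads a[i], SU(5) stores a[i], and SU(7) stores
// a[i-1]; the induction pointer advances by 4 bytes each iteration. A
// loop-carried WAR edge SU(3) -> store exists when the store's address in
// iteration k+1 equals the load's address in iteration k.
int main() {
  const int LoadOff = 0;           // SU(3): load a[i], relative to the pointer
  const int StoreOffs[] = {0, -4}; // SU(5): store a[i], SU(7): store a[i-1]
  const char *Names[] = {"SU(5)", "SU(7)"};
  for (int S = 0; S < 2; ++S)
    if (StoreOffs[S] + 4 == LoadOff) // +4: next iteration's pointer value
      std::printf("loop-carried WAR: SU(3) -> %s\n", Names[S]);
  // Prints only "loop-carried WAR: SU(3) -> SU(7)", matching the CHECK lines.
  return 0;
}
```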
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
new file mode 100644
index 0000000000000..ca59b97dd11e9
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int * restrict a, int * restrict b, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] += i;
+#     b[i] += a[i+1];
+#   }
+# }
+# ```
+#
+# Loop-carried dependencies exist from the load of a[i+1] to the store to a[i].
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(7)
+#   Order
+#     SU(5)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr noalias nocapture noundef %a, ptr noalias nocapture noundef %b, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp11 = icmp sgt i32 %n, 0
+    br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv15 = phi ptr [ %cgep, %for.body.preheader ], [ %cgep20, %for.body ]
+    %lsr.iv13 = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %lsr.iv = phi ptr [ %b, %for.body.preheader ], [ %cgep19, %for.body ]
+    %0 = phi i32 [ %2, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.012 = phi i32 [ %add1, %for.body ], [ 0, %for.body.preheader ]
+    %1 = add i32 %0, %i.012
+    %cgep18 = getelementptr i8, ptr %lsr.iv15, i32 -4
+    store i32 %1, ptr %cgep18, align 4, !tbaa !5
+    %add1 = add nuw nsw i32 %i.012, 1
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %3 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %add4 = add nsw i32 %3, %2
+    store i32 %add4, ptr %lsr.iv, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv13, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep19 = getelementptr i8, ptr %lsr.iv, i32 4
+    %cgep20 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1, $r2
+
+    %14:intregs = COPY $r2
+    %13:intregs = COPY $r1
+    %12:intregs = COPY $r0
+    %15:predregs = C2_cmpgti %14, 0
+    J2_jumpf %15, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %1:intregs = L2_loadri_pi %12, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %17:intregs = A2_tfrsi 0
+    %22:intregs = COPY %14
+    J2_loop0r %bb.3, %22, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %2:intregs = PHI %1, %bb.1, %11, %bb.3
+    %4:intregs = PHI %13, %bb.1, %10, %bb.3
+    %5:intregs = PHI %0, %bb.1, %8, %bb.3
+    %6:intregs = PHI %17, %bb.1, %7, %bb.3
+    %18:intregs = A2_add %5, %6
+    S2_storeri_io %2, -4, killed %18 :: (store (s32) into %ir.cgep18, !tbaa !5)
+    %7:intregs = nuw nsw A2_addi %6, 1
+    %8:intregs = L2_loadri_io %2, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %19:intregs = L2_loadri_io %4, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %20:intregs = nsw A2_add killed %19, %8
+    %10:intregs = S2_storeri_pi %4, 4, killed %20 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %11:intregs = A2_addi %2, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
new file mode 100644
index 0000000000000..4bc4b48735947
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are computed correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 0; i < n-2; i++) {
+#     a[i] += a[i+10];
+#     a[i+2] += i;
+#   }
+# }
+# ```
+#
+# Here is what each instruction does.
+# SU(2): Load a[i+10]
+# SU(3): Store it to a[i]
+# SU(4): Load a[i+2], add i, then store it
+#
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(4)
+#   Order
+#     SU(3)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: Loop carried edges from SU(2)
+# CHECK-NEXT:   Order
+# CHECK-NEXT:     SU(3)
+# CHECK-NEXT:     SU(4)
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) {
+  entry:
+    %cmp13 = icmp sgt i32 %n, 2
+    br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %0 = add i32 %n, -2
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv15 = phi ptr [ %a, %for.body.preheader ], [ %cgep19, %for.body ]
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %i.014 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+    %cgep = getelementptr i8, ptr %lsr.iv15, i32 40
+    %1 = load i32, ptr %cgep, align 4, !tbaa !5
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %add2 = add nsw i32 %2, %1
+    store i32 %add2, ptr %lsr.iv15, align 4, !tbaa !5
+    %cgep18 = getelementptr i8, ptr %lsr.iv15, i32 8
+    %3 = load i32, ptr %cgep18, align 4, !tbaa !5
+    %4 = add i32 %i.014, %3
+    store i32 %4, ptr %cgep18, align 4, !tbaa !5
+    %inc = add nuw nsw i32 %i.014, 1
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep19 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %9:predregs = C2_cmpgti %8, 2
+    J2_jumpf %9, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs = A2_addi %8, -2
+    %11:intregs = A2_tfrsi 0
+    %14:intregs = COPY %0
+    J2_loop0r %bb.3, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %1:intregs = PHI %7, %bb.1, %6, %bb.3
+    %3:intregs = PHI %11, %bb.1, %4, %bb.3
+    %12:intregs = L2_loadri_io %1, 40 :: (load (s32) from %ir.cgep, !tbaa !5)
+    L4_add_memopw_io %1, 0, killed %12 :: (store (s32) into %ir.lsr.iv15, !tbaa !5), (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    L4_add_memopw_io %1, 8, %3 :: (store (s32) into %ir.cgep18, !tbaa !5), (load (s32) from %ir.cgep18, !tbaa !5)
+    %4:intregs = nuw nsw A2_addi %3, 1
+    %6:intregs = A2_addi %1, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
new file mode 100644
index 0000000000000..77c3d569db181
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
@@ -0,0 +1,111 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are correctly added when two
+# arrays may point to the same memory location.
+#
+# ```
+# void f(int *a, int *b, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] += b[i];
+#     b[i] += a[i];
+#   }
+# }
+# ```
+#
+# Here is what each instruction does.
+# SU(2): Load b[i]
+# SU(3): Load a[i]
+# SU(5): Store a[i]
+# SU(6): Load b[i]
+# SU(8): Store b[i]
+#
+# Note that if there is already a dependency between two instructions, we
+# don't add a loop-carried one between them, since the non-loop-carried
+# dependency imposes a stronger constraint than the loop-carried one.
+#
+# FIXME: Currently the following dependencies are missed.
+# Loop carried edges from SU(5)
+#   Order
+#     SU(2)
+# Loop carried edges from SU(6)
+#   Order
+#     SU(5)
+# Loop carried edges from SU(8)
+#   Order
+#     SU(3)
+#     SU(5)

+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, ptr nocapture noundef %b, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp12 = icmp sgt i32 %n, 0
+    br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv15 = phi ptr [ %cgep17, %for.body ], [ %b, %entry ]
+    %lsr.iv14 = phi ptr [ %cgep, %for.body ], [ %a, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %n, %entry ]
+    %0 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %1 = load i32, ptr %lsr.iv14, align 4, !tbaa !5
+    %add = add nsw i32 %1, %0
+    store i32 %add, ptr %lsr.iv14, align 4, !tbaa !5
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %add4 = add nsw i32 %2, %add
+    store i32 %add4, ptr %lsr.iv15, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep = getelementptr i8, ptr %lsr.iv14, i32 4
+    %cgep17 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.3, %bb.1
+    liveins: $r0, $r1, $r2
+
+    %8:intregs = COPY $r2
+    %7:intregs = COPY $r1
+    %6:intregs = COPY $r0
+    %9:predregs = C2_cmpgti %8, 0
+    J2_jumpf %9, %bb.1, implicit-def $pc
+
+  bb.3:
+    %16:intregs = COPY %8
+    J2_loop0r %bb.2, %16, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.2, implicit-def $pc
+
+  bb.1.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.2.for.body:
+    successors: %bb.1, %bb.2
+
+    %0:intregs = PHI %7, %bb.3, %5, %bb.2
+    %1:intregs = PHI %6, %bb.3, %4, %bb.2
+    %10:intregs = L2_loadri_io %0, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %11:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.lsr.iv14, !tbaa !5)
+    %12:intregs = nsw A2_add killed %11, killed %10
+    %4:intregs = S2_storeri_pi %1, 4, %12 :: (store (s32) into %ir.lsr.iv14, !tbaa !5)
+    %13:intregs = L2_loadri_io %0, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %14:intregs = nsw A2_add killed %13, %12
+    %5:intregs = S2_storeri_pi %0, 4, killed %14 :: (store (s32) into %ir.lsr.iv15, !tbaa !5)
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.1, implicit-def $pc
+...
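To make the aliasing premise of dep5 concrete: nothing in the signature of `f` prevents both pointer arguments from referring to the same array, which is why the pipeliner must assume loop-carried ordering between the two access streams. A hypothetical caller, using the C prototype from the test comment above:

```cpp
// Hypothetical caller of the f() from the dep5 comment: with a == b, the
// "a[i]" accesses alias the "b[i]" accesses, so reordering loads and stores
// across iterations would change the values later iterations observe.
extern "C" void f(int *a, int *b, int n);

int main() {
  int buf[4] = {1, 2, 3, 4};
  f(buf, buf, 4); // both parameters point at the same array
  return 0;
}
```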
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir
new file mode 100644
index 0000000000000..4281d15377141
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir
@@ -0,0 +1,154 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are computed correctly
+# when barrier instructions exist in the loop.
+# The original code is as follows.
+#
+# ```
+# volatile int x = 0;
+# void f(int * restrict a, int * restrict b, int * restrict c, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] *= c[i];
+#     b[i] *= c[i];
+#     x += i;
+#     a[i + 1] *= i;
+#     x += i;
+#     b[i + 1] *= i;
+#   }
+# }
+# ```
+#
+# FIXME: Currently the following dependencies are missed.
+# Loop carried edges from SU(16)
+#   Order
+#     SU(6)
+#     SU(8)
+#     SU(10)
+#     SU(11)
+# Loop carried edges from SU(17)
+#   Order
+#     SU(10)
+#     SU(11)
+# Loop carried edges from SU(19)
+#   Order
+#     SU(10)
+#     SU(11)

+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  @x = dso_local global i32 0, align 4
+
+  define dso_local void @f(ptr noalias nocapture noundef %a, ptr noalias nocapture noundef %b, ptr noalias nocapture noundef readonly %c, i32 noundef %n) {
+  entry:
+    %cmp26 = icmp sgt i32 %n, 0
+    br i1 %cmp26, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %.pre28 = load i32, ptr %b, align 4, !tbaa !5
+    %cgep = getelementptr i8, ptr %b, i32 4
+    %cgep37 = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv35 = phi ptr [ %c, %for.body.preheader ], [ %cgep42, %for.body ]
+    %lsr.iv31 = phi ptr [ %cgep37, %for.body.preheader ], [ %cgep41, %for.body ]
+    %lsr.iv = phi ptr [ %cgep, %for.body.preheader ], [ %cgep40, %for.body ]
+    %0 = phi i32 [ %mul11, %for.body ], [ %.pre28, %for.body.preheader ]
+    %1 = phi i32 [ %mul7, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.027 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+    %2 = load i32, ptr %lsr.iv35, align 4, !tbaa !5
+    %mul = mul nsw i32 %1, %2
+    %cgep38 = getelementptr i8, ptr %lsr.iv31, i32 -4
+    store i32 %mul, ptr %cgep38, align 4, !tbaa !5
+    %mul4 = mul nsw i32 %0, %2
+    %cgep39 = getelementptr i8, ptr %lsr.iv, i32 -4
+    store i32 %mul4, ptr %cgep39, align 4, !tbaa !5
+    %3 = load volatile i32, ptr @x, align 4, !tbaa !5
+    %4 = add i32 %i.027, %3
+    store volatile i32 %4, ptr @x, align 4, !tbaa !5
+    %add5 = add nuw nsw i32 %i.027, 1
+    %5 = load i32, ptr %lsr.iv31, align 4, !tbaa !5
+    %mul7 = mul nsw i32 %5, %i.027
+    store i32 %mul7, ptr %lsr.iv31, align 4, !tbaa !5
+    %6 = load volatile i32, ptr @x, align 4, !tbaa !5
+    %7 = add i32 %i.027, %6
+    store volatile i32 %7, ptr @x, align 4, !tbaa !5
+    %8 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %mul11 = mul nsw i32 %8, %i.027
+    store i32 %mul11, ptr %lsr.iv, align 4, !tbaa !5
+    %exitcond.not = icmp eq i32 %n, %add5
+    %cgep40 = getelementptr i8, ptr %lsr.iv, i32 4
+    %cgep41 = getelementptr i8, ptr %lsr.iv31, i32 4
+    %cgep42 = getelementptr i8, ptr %lsr.iv35, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1, $r2, $r3
+
+    %19:intregs = COPY $r3
+    %18:intregs = COPY $r2
+    %17:intregs = COPY $r1
+    %16:intregs = COPY $r0
+    %20:predregs = C2_cmpgti %19, 0
+    J2_jumpf %20, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %3:intregs = L2_loadri_pi %16, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %1:intregs, %2:intregs = L2_loadri_pi %17, 4 :: (load (s32) from %ir.b, !tbaa !5)
+    %22:intregs = A2_tfrsi 0
+    %26:intregs = C4_addipc target-flags(hexagon-pcrel) @x
+    %30:intregs = COPY %19
+    J2_loop0r %bb.3, %30, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %4:intregs = PHI %18, %bb.1, %15, %bb.3
+    %5:intregs = PHI %3, %bb.1, %14, %bb.3
+    %6:intregs = PHI %2, %bb.1, %13, %bb.3
+    %7:intregs = PHI %1, %bb.1, %12, %bb.3
+    %8:intregs = PHI %0, %bb.1, %11, %bb.3
+    %9:intregs = PHI %22, %bb.1, %10, %bb.3
+    %23:intregs, %15:intregs = L2_loadri_pi %4, 4 :: (load (s32) from %ir.lsr.iv35, !tbaa !5)
+    %24:intregs = nsw M2_mpyi %8, %23
+    S2_storeri_io %5, -4, killed %24 :: (store (s32) into %ir.cgep38, !tbaa !5)
+    %25:intregs = nsw M2_mpyi %7, %23
+    S2_storeri_io %6, -4, killed %25 :: (store (s32) into %ir.cgep39, !tbaa !5)
+    L4_add_memopw_io %26, 0, %9 :: (volatile store (s32) into @x, !tbaa !5), (volatile dereferenceable load (s32) from @x, !tbaa !5)
+    %10:intregs = nuw nsw A2_addi %9, 1
+    %27:intregs = L2_loadri_io %5, 0 :: (load (s32) from %ir.lsr.iv31, !tbaa !5)
+    %11:intregs = nsw M2_mpyi killed %27, %9
+    S2_storeri_io %5, 0, %11 :: (store (s32) into %ir.lsr.iv31, !tbaa !5)
+    L4_add_memopw_io %26, 0, %9 :: (volatile store (s32) into @x, !tbaa !5), (volatile dereferenceable load (s32) from @x, !tbaa !5)
+    %28:intregs = L2_loadri_io %6, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %12:intregs = nsw M2_mpyi killed %28, %9
+    S2_storeri_io %6, 0, %12 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %13:intregs = A2_addi %6, 4
+    %14:intregs = A2_addi %5, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
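The FIXME lists running through these tests all correspond to the scans that `addLoopCarriedDependenciesForChunks` leaves as TODO (load-to-store from bottom to top, store-to-load, store-to-store). One hypothetical shape for the missing RAW and WAW scans, written by analogy with the existing WAR loop; this is a sketch, not the patch's code, and whether `hasLoopCarriedMemDep` can be reused unchanged for store-first pairs is exactly what the TODO leaves open:

```cpp
// Hypothetical sketch, not part of the patch: RAW and WAW scans analogous to
// the existing load-to-store (WAR) loop in addLoopCarriedDependenciesForChunks.
for (const SUnitWithMemInfo &Src : From.Stores) {
  // Store-to-load (RAW) across iterations.
  for (const SUnitWithMemInfo &Dst : To.Loads)
    if (Src.SU->NodeNum < Dst.SU->NodeNum &&
        hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
      LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
  // Store-to-store (WAW) across iterations.
  for (const SUnitWithMemInfo &Dst : To.Stores)
    if (Src.SU->NodeNum < Dst.SU->NodeNum &&
        hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
      LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
}
```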