diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 966ffb7a1fbd2..e4e794c434adb 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -190,6 +190,38 @@ class SwingSchedulerDDGEdge {
   bool ignoreDependence(bool IgnoreAnti) const;
 };
 
+/// Represents loop-carried dependencies. Because SwingSchedulerDAG, as its
+/// name suggests, cannot contain cyclic dependencies, such dependencies must
+/// be handled separately. After DAG construction is finished, these
+/// dependencies are added to SwingSchedulerDDG.
+/// TODO: Also handle output-dependencies introduced by physical registers.
+struct LoopCarriedEdges {
+  using OrderDep = SmallSetVector<SUnit *, 8>;
+  using OrderDepsType = DenseMap<SUnit *, OrderDep>;
+
+  OrderDepsType OrderDeps;
+
+  const OrderDep *getOrderDepOrNull(SUnit *Key) const {
+    auto Ite = OrderDeps.find(Key);
+    if (Ite == OrderDeps.end())
+      return nullptr;
+    return &Ite->second;
+  }
+
+  /// Returns true if the edge from \p From to \p To is a back-edge that should
+  /// be used when scheduling.
+  bool shouldUseWhenScheduling(const SUnit *From, const SUnit *To) const;
+
+  /// Adds some edges to the original DAG that correspond to loop-carried
+  /// dependencies. Historically, loop-carried edges have been represented by
+  /// non-loop-carried edges in the original DAG. This function appends such
+  /// edges to preserve the previous behavior.
+  void modifySUnits(std::vector<SUnit> &SUnits);
+
+  void dump(SUnit *SU, const TargetRegisterInfo *TRI,
+            const MachineRegisterInfo *MRI) const;
+};
+
 /// Represents dependencies between instructions. This class is a wrapper of
 /// `SUnits` and its dependencies to manipulate back-edges in a natural way.
 /// Currently it only supports back-edges via PHI, which are expressed as
@@ -402,7 +434,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
                               const MachineInstr *OtherMI) const;
 
 private:
-  void addLoopCarriedDependences();
+  LoopCarriedEdges addLoopCarriedDependences();
   void updatePhiDependences();
   void changeDependences();
   unsigned calculateResMII();
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3d161ffbe40a4..fdc6102c719e8 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -266,6 +266,82 @@ struct SUnitWithMemInfo {
   bool getUnderlyingObjects();
 };
 
+/// Add loop-carried chain dependencies. This class handles the same type of
+/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
+/// account dependencies across iterations.
+class LoopCarriedOrderDepsTracker {
+  // Type of instruction that is relevant to order-dependencies.
+  enum class InstrTag {
+    Barrier = 0,      ///< A barrier event instruction.
+    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
+                      ///< not a barrier event.
+    FPExceptions = 2, ///< An instruction that does not match above, but may
+                      ///< raise floating-point exceptions.
+  };
+
+  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
+    TaggedSUnit(SUnit *SU, InstrTag Tag)
+        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
+
+    InstrTag getTag() const { return InstrTag(getInt()); }
+  };
+
+  /// Holds loads and stores with memory related information.
+  struct LoadStoreChunk {
+    SmallVector<SUnitWithMemInfo> Loads;
+    SmallVector<SUnitWithMemInfo> Stores;
+
+    void append(SUnit *SU);
+  };
+
+  SwingSchedulerDAG *DAG;
+  BatchAAResults *BAA;
+  std::vector<SUnit> &SUnits;
+
+  /// The size of SUnits, for convenience.
+  const unsigned N;
+
+  /// Loop-carried edges. LoopCarried[I] holds the NodeNums of the
+  /// loop-carried successors of SUnits[I].
+  std::vector<BitVector> LoopCarried;
+
+  /// Instructions related to chain dependencies. They are one of the
+  /// following:
+  ///
+  ///  1. Barrier event.
+  ///  2. Load, but neither a barrier event, an invariant load, nor a load
+  ///     that may read a trap value.
+  ///  3. Store, but not a barrier event.
+  ///  4. None of them, but may raise floating-point exceptions.
+  ///
+  /// This is used when analyzing loop-carried dependencies in the presence of
+  /// global barrier instructions.
+  std::vector<TaggedSUnit> TaggedSUnits;
+
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+
+public:
+  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA,
+                              const TargetInstrInfo *TII,
+                              const TargetRegisterInfo *TRI);
+
+  /// The main function to compute loop-carried order-dependencies.
+  void computeDependencies();
+
+  const BitVector &getLoopCarried(unsigned Idx) const {
+    return LoopCarried[Idx];
+  }
+
+private:
+  /// Returns the tag for \p SU if the instruction may affect the
+  /// order-dependencies, or std::nullopt otherwise.
+  std::optional<InstrTag> getInstrTag(SUnit *SU) const;
+
+  void addLoopCarriedDependenciesForChunks(const LoadStoreChunk &From,
+                                           const LoadStoreChunk &To);
+
+  void computeDependenciesAux();
+};
+
 } // end anonymous namespace
 
 /// The "main" function for implementing Swing Modulo Scheduling.
@@ -593,13 +669,19 @@ void SwingSchedulerDAG::setMAX_II() {
 /// scheduling part of the Swing Modulo Scheduling algorithm.
 void SwingSchedulerDAG::schedule() {
   buildSchedGraph(AA);
-  addLoopCarriedDependences();
+  const LoopCarriedEdges LCE = addLoopCarriedDependences();
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
   changeDependences();
   postProcessDAG();
   DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
-  LLVM_DEBUG(dump());
+  LLVM_DEBUG({
+    dump();
+    dbgs() << "===== Loop Carried Edges Begin =====\n";
+    for (SUnit &SU : SUnits)
+      LCE.dump(&SU, TRI, &MRI);
+    dbgs() << "===== Loop Carried Edges End =====\n";
+  });
 
   NodeSetType NodeSets;
   findCircuits(NodeSets);
@@ -832,15 +914,6 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
   return false;
 }
 
-/// Return true if the instruction causes a chain between memory
-/// references before and after it.
-static bool isDependenceBarrier(MachineInstr &MI) {
-  return MI.isCall() || MI.mayRaiseFPException() ||
-         MI.hasUnmodeledSideEffects() ||
-         (MI.hasOrderedMemoryRef() &&
-          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
-}
-
 SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
   if (!getUnderlyingObjects())
     return;
@@ -941,28 +1014,111 @@ static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
   return false;
 }
 
+void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
+  const MachineInstr *MI = SU->getInstr();
+  if (!MI->mayLoadOrStore())
+    return;
+  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
+}
+
+LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
+    SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
+    const TargetRegisterInfo *TRI)
+    : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
+      LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
+
+void LoopCarriedOrderDepsTracker::computeDependencies() {
+  // Traverse all instructions and extract only what we are targeting.
+  for (auto &SU : SUnits) {
+    auto Tagged = getInstrTag(&SU);
+
+    // This instruction has no loop-carried order-dependencies.
+    if (!Tagged)
+      continue;
+    TaggedSUnits.emplace_back(&SU, *Tagged);
+  }
+
+  computeDependenciesAux();
+}
+
+std::optional<LoopCarriedOrderDepsTracker::InstrTag>
+LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+  if (TII->isGlobalMemoryObject(MI))
+    return InstrTag::Barrier;
+
+  if (MI->mayStore() ||
+      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
+    return InstrTag::LoadOrStore;
+
+  if (MI->mayRaiseFPException())
+    return InstrTag::FPExceptions;
+
+  return std::nullopt;
+}
+
+void LoopCarriedOrderDepsTracker::addLoopCarriedDependenciesForChunks(
+    const LoadStoreChunk &From, const LoadStoreChunk &To) {
+  // Add dependencies for load-to-store (WAR) from top to bottom.
+  for (const SUnitWithMemInfo &Src : From.Loads)
+    for (const SUnitWithMemInfo &Dst : To.Stores)
+      if (Src.SU->NodeNum < Dst.SU->NodeNum &&
+          hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
+        LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
+
+  // TODO: The following dependencies are missed.
+  //
+  // - Dependencies for load-to-store from bottom to top.
+  // - Dependencies for store-to-load (RAW).
+  // - Dependencies for store-to-store (WAW).
+}
+
+void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
+  SmallVector<LoadStoreChunk> Chunks(1);
+  for (const auto &TSU : TaggedSUnits) {
+    InstrTag Tag = TSU.getTag();
+    SUnit *SU = TSU.getPointer();
+    switch (Tag) {
+    case InstrTag::Barrier:
+      Chunks.emplace_back();
+      break;
+    case InstrTag::LoadOrStore:
+      Chunks.back().append(SU);
+      break;
+    case InstrTag::FPExceptions:
+      // TODO: Handle this properly.
+      break;
+    }
+  }
+
+  // Add dependencies between memory operations. If there are one or more
+  // barrier events between two memory instructions, we don't add a
+  // loop-carried dependence for them.
+  for (const LoadStoreChunk &Chunk : Chunks)
+    addLoopCarriedDependenciesForChunks(Chunk, Chunk);
+
+  // TODO: If there are multiple barrier instructions, also add dependencies
+  // from the last barrier instruction (or the loads/stores below it) to the
+  // first barrier instruction (or the loads/stores above it).
+}
+
 /// Add a chain edge between a load and store if the store can be an
 /// alias of the load on a subsequent iteration, i.e., a loop carried
 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
 /// but that code doesn't create loop carried dependences.
-void SwingSchedulerDAG::addLoopCarriedDependences() {
-  SmallVector<SUnitWithMemInfo> PendingLoads;
-  for (auto &SU : SUnits) {
-    MachineInstr &MI = *SU.getInstr();
-    if (isDependenceBarrier(MI))
-      PendingLoads.clear();
-    else if (MI.mayLoad()) {
-      PendingLoads.emplace_back(&SU);
-    } else if (MI.mayStore()) {
-      SUnitWithMemInfo Store(&SU);
-      for (const SUnitWithMemInfo &Load : PendingLoads)
-        if (hasLoopCarriedMemDep(Load, Store, BAA, TII, TRI)) {
-          SDep Dep(Load.SU, SDep::Barrier);
-          Dep.setLatency(1);
-          SU.addPred(Dep);
-        }
-    }
-  }
+/// TODO: Also compute output-dependencies.
+LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
+  LoopCarriedEdges LCE;
+
+  // Add loop-carried order-dependencies.
+  LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
+  LCODTracker.computeDependencies();
+  for (unsigned I = 0; I != SUnits.size(); I++)
+    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
+      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
+
+  LCE.modifySUnits(SUnits);
+  return LCE;
 }
 
 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
@@ -4002,3 +4158,37 @@ const SwingSchedulerDDG::EdgesType &
 SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
   return getEdges(SU).Succs;
 }
+
+void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
+  // Currently this function simply adds all dependencies represented by this
+  // object. After we properly handle missed dependencies, the logic here will
+  // be more complex, as currently missed edges should not be added to the DAG.
+  for (SUnit &SU : SUnits) {
+    SUnit *Src = &SU;
+    if (auto *OrderDep = getOrderDepOrNull(Src)) {
+      SDep Dep(Src, SDep::Barrier);
+      Dep.setLatency(1);
+      for (SUnit *Dst : *OrderDep)
+        Dst->addPred(Dep);
+    }
+  }
+}
+
+void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
+                            const MachineRegisterInfo *MRI) const {
+  const auto *Order = getOrderDepOrNull(SU);
+
+  if (!Order)
+    return;
+
+  const auto DumpSU = [](const SUnit *SU) {
+    std::ostringstream OSS;
+    OSS << "SU(" << SU->NodeNum << ")";
+    return OSS.str();
+  };
+
+  dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
+         << "   Order\n";
+  for (SUnit *Dst : *Order)
+    dbgs() << "     " << DumpSU(Dst) << "\n";
+}
diff --git a/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
new file mode 100644
index 0000000000000..bcc6a3ea9b285
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=aarch64 -run-pass=pipeliner -debug-only=pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test a case where fenv is enabled, there are instructions that may raise a
+# floating-point exception, and there is a barrier-event instruction. In this
+# case their relative order must not change.
+#
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(7)
+#   Order
+#     SU(2)
+#     SU(3)
+#     SU(4)
+#     SU(5)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  @x = dso_local global i32 0, align 4
+
+  define dso_local void @f(ptr nocapture noundef writeonly %a, float noundef %y, i32 noundef %n) {
+  entry:
+    %cmp6 = icmp sgt i32 %n, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %tmp9 = trunc i64 %indvars.iv to i32
+    %conv = tail call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %tmp9, metadata !"round.dynamic", metadata !"fpexcept.strict") #2
+    %add = tail call float @llvm.experimental.constrained.fadd.f32(float %conv, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #2
+    %0 = shl nuw nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %a, i64 %0
+    store float %add, ptr %scevgep, align 4, !tbaa !6
+    %1 = load volatile i32, ptr @x, align 4, !tbaa !10
+    %2 = zext i32 %1 to i64
+    %3 = add i64 %indvars.iv, %2
+    %tmp = trunc i64 %3 to i32
+    store volatile i32 %tmp, ptr @x, align 4, !tbaa !10
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond.not = icmp eq i64 %wide.trip.count, %indvars.iv.next
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+  attributes #2 = { strictfp }
+
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"float", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+  !10 = !{!11, !11, i64 0}
+  !11 = !{!"int", !8, i64 0}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $x0, $s0, $w1
+
+    %5:gpr32common = COPY $w1
+    %4:fpr32 = COPY $s0
+    %3:gpr64common = COPY $x0
+    dead $wzr = SUBSWri %5, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.for.body.preheader:
+    %8:gpr32 = ORRWrs $wzr, %5, 0
+    %0:gpr64 = SUBREG_TO_REG 0, killed %8, %subreg.sub_32
+    %9:gpr64all = COPY $xzr
+    %7:gpr64all = COPY %9
+    %13:gpr64common = ADRP target-flags(aarch64-page) @x
+    B %bb.3
+
+  bb.2.for.cond.cleanup:
+    RET_ReallyLR
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %1:gpr64common = PHI %7, %bb.1, %2, %bb.3
+    %10:gpr32 = COPY %1.sub_32
+    %11:fpr32 = SCVTFUWSri %10, implicit $fpcr
+    %12:fpr32 = FADDSrr killed %11, %4, implicit $fpcr
+    STRSroX killed %12, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep, !tbaa !6)
+    %14:gpr32 = LDRWui %13, target-flags(aarch64-pageoff, aarch64-nc) @x :: (volatile dereferenceable load (s32) from @x, !tbaa !10)
+    %15:gpr32 = ADDWrr %10, killed %14
+    STRWui killed %15, %13, target-flags(aarch64-pageoff, aarch64-nc) @x :: (volatile store (s32) into @x, !tbaa !10)
+    %16:gpr64common = nuw nsw ADDXri %1, 1, 0
+    %2:gpr64all = COPY %16
+    dead $xzr = SUBSXrr %0, %16, implicit-def $nzcv
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+...
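The tests in this patch check the debug output against the edges recorded in `LoopCarriedEdges`. As a minimal sketch of how those recorded edges can be consumed through the new API from MachinePipeliner.h (a hypothetical fragment assuming the surrounding `SwingSchedulerDAG` member context, e.g. inside `schedule()`; it is not code from this patch), `getOrderDepOrNull` drives the same traversal that `LCE.dump` performs between the "Loop Carried Edges" markers:

```cpp
// Hypothetical fragment (assumes SwingSchedulerDAG member context): walk the
// loop-carried order successors recorded by addLoopCarriedDependences().
const LoopCarriedEdges LCE = addLoopCarriedDependences();
for (SUnit &SU : SUnits) {
  // Nodes without loop-carried order successors yield nullptr.
  if (const LoopCarriedEdges::OrderDep *Succs = LCE.getOrderDepOrNull(&SU))
    for (SUnit *Succ : *Succs)
      dbgs() << "loop-carried order: SU(" << SU.NodeNum << ") -> SU("
             << Succ->NodeNum << ")\n";
}
```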
diff --git a/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
new file mode 100644
index 0000000000000..6116f15811ec7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
@@ -0,0 +1,99 @@
+# RUN: llc -mtriple=aarch64 -run-pass=pipeliner -debug-only=pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test a case where fenv is enabled, there are instructions that may raise a
+# floating-point exception, but there is no barrier-event instruction. In this
+# case no loop-carried dependencies are necessary.
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local float @f(ptr nocapture noundef writeonly %a, float noundef %y, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %conv = tail call float @llvm.experimental.constrained.fptrunc.f32.f64(double 1.000000e+00, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %cmp8 = icmp sgt i32 %n, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+
+  for.cond.cleanup:
+    %acc.0.lcssa = phi float [ %conv, %entry ], [ %mul, %for.body ]
+    ret float %acc.0.lcssa
+
+  for.body:
+    %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %acc.010 = phi float [ %conv, %for.body.preheader ], [ %mul, %for.body ]
+    %tmp = trunc i64 %indvars.iv to i32
+    %conv2 = tail call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %tmp, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %add = tail call float @llvm.experimental.constrained.fadd.f32(float %conv2, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %mul = tail call float @llvm.experimental.constrained.fmul.f32(float %acc.010, float %add, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %0 = shl nuw nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %a, i64 %0
+    store float %add, ptr %scevgep, align 4, !tbaa !6
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond.not = icmp eq i64 %wide.trip.count, %indvars.iv.next
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+  declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"float", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $x0, $s0, $w1
+
+    %9:gpr32common = COPY $w1
+    %8:fpr32 = COPY $s0
+    %7:gpr64common = COPY $x0
+    %10:fpr64 = FMOVDi 112
+    %0:fpr32 = FCVTSDr killed %10, implicit $fpcr
+    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.for.body.preheader:
+    %13:gpr32 = ORRWrs $wzr, %9, 0
+    %1:gpr64 = SUBREG_TO_REG 0, killed %13, %subreg.sub_32
+    %14:gpr64all = COPY $xzr
+    %12:gpr64all = COPY %14
+    B %bb.3
+
+  bb.2.for.cond.cleanup:
+    %2:fpr32 = PHI %0, %bb.0, %5, %bb.3
+    $s0 = COPY %2
+    RET_ReallyLR implicit $s0
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %3:gpr64common = PHI %12, %bb.1, %6, %bb.3
+    %4:fpr32 = PHI %0, %bb.1, %5, %bb.3
+    %15:gpr32 = COPY %3.sub_32
+    %16:fpr32 = SCVTFUWSri killed %15, implicit $fpcr
+    %17:fpr32 = FADDSrr killed %16, %8, implicit $fpcr
+    %5:fpr32 = FMULSrr %4, %17, implicit $fpcr
+    STRSroX %17, %7, %3, 0, 1 :: (store (s32) into %ir.scevgep, !tbaa !6)
+    %18:gpr64common = nuw nsw ADDXri %3, 1, 0
+    %6:gpr64all = COPY %18
+    dead $xzr = SUBSXrr %1, %18, implicit-def $nzcv
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
new file mode 100644
index 0000000000000..17ee07f49324a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
@@ -0,0 +1,110 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 0; i < n-1; i++) {
+#     a[i] += a[i];
+#     a[i+1] += i;
+#   }
+# }
+# ```
+#
+# Loop-carried dependencies exist from the store to a[i+1] to the load/store
+# of a[i], but not vice versa.
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(6)
+#   Order
+#     SU(4)
+# Loop carried edges from SU(8)
+#   Order
+#     SU(4)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp12 = icmp sgt i32 %n, 1
+    br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %0 = add i32 %n, -1
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv14 = phi ptr [ %cgep, %for.body.preheader ], [ %cgep18, %for.body ]
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %1 = phi i32 [ %add4, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.013 = phi i32 [ %add2, %for.body ], [ 0, %for.body.preheader ]
+    %add = shl nsw i32 %1, 1
+    %cgep17 = getelementptr i8, ptr %lsr.iv14, i32 -4
+    store i32 %add, ptr %cgep17, align 4, !tbaa !5
+    %add2 = add nuw nsw i32 %i.013, 1
+    %2 = load i32, ptr %lsr.iv14, align 4, !tbaa !5
+    %add4 = add nsw i32 %2, %i.013
+    %3 = add i32 %i.013, %2
+    store i32 %3, ptr %lsr.iv14, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep18 = getelementptr i8, ptr %lsr.iv14, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+
+    %12:intregs = COPY $r1
+    %11:intregs = COPY $r0
+    %13:predregs = C2_cmpgti %12, 1
+    J2_jumpf %13, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %2:intregs = L2_loadri_pi %11, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %1:intregs = A2_addi %12, -1
+    %15:intregs = A2_tfrsi 0
+    %19:intregs = COPY %1
+    J2_loop0r %bb.3, %19, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %3:intregs = PHI %2, %bb.1, %10, %bb.3
+    %5:intregs = PHI %0, %bb.1, %8, %bb.3
+    %6:intregs = PHI %15, %bb.1, %7, %bb.3
+    %16:intregs = nsw S2_asl_i_r %5, 1
+    S2_storeri_io %3, -4, killed %16 :: (store (s32) into %ir.cgep17, !tbaa !5)
+    %7:intregs = nuw nsw A2_addi %6, 1
+    %17:intregs = L2_loadri_io %3, 0 :: (load (s32) from %ir.lsr.iv14, !tbaa !5)
+    %8:intregs = A2_add killed %17, %6
+    S2_storeri_io %3, 0, %8 :: (store (s32) into %ir.lsr.iv14, !tbaa !5)
+    %10:intregs = A2_addi %3, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
new file mode 100644
index 0000000000000..850e602c9146f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
@@ -0,0 +1,105 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 1; i < n; i++) {
+#     a[i] += a[i];
+#     a[i-1] += i;
+#   }
+# }
+# ```
+#
+# Loop-carried dependencies exist from the load/store of a[i] to the store to
+# a[i-1], but not vice versa.
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(5)
+#   Order
+#     SU(7)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: Loop carried edges from SU(3)
+# CHECK-NEXT:   Order
+# CHECK-NEXT:     SU(7)
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp11 = icmp sgt i32 %n, 1
+    br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %load_initial = load i32, ptr %a, align 4
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv = phi ptr [ %cgep, %for.body.preheader ], [ %cgep16, %for.body ]
+    %store_forwarded = phi i32 [ %load_initial, %for.body.preheader ], [ %add, %for.body ]
+    %i.012 = phi i32 [ 1, %for.body.preheader ], [ %inc, %for.body ]
+    %0 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %add = shl nsw i32 %0, 1
+    store i32 %add, ptr %lsr.iv, align 4, !tbaa !5
+    %1 = add i32 %store_forwarded, %i.012
+    %cgep15 = getelementptr i8, ptr %lsr.iv, i32 -4
+    store i32 %1, ptr %cgep15, align 4, !tbaa !5
+    %inc = add nuw nsw i32 %i.012, 1
+    %exitcond.not = icmp eq i32 %n, %inc
+    %cgep16 = getelementptr i8, ptr %lsr.iv, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+
+    %9:intregs = COPY $r1
+    %8:intregs = COPY $r0
+    %10:predregs = C2_cmpgti %9, 1
+    J2_jumpf %10, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %1:intregs = L2_loadri_pi %8, 4 :: (load (s32) from %ir.a)
+    %12:intregs = A2_tfrsi 1
+    %16:intregs = A2_addi %9, -1
+    %17:intregs = COPY %16
+    J2_loop0r %bb.3, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body (machine-block-address-taken):
+    successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+    %2:intregs = PHI %1, %bb.1, %7, %bb.3
+    %3:intregs = PHI %0, %bb.1, %5, %bb.3
+    %4:intregs = PHI %12, %bb.1, %6, %bb.3
+    %13:intregs = L2_loadri_io %2, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %5:intregs = nsw S2_asl_i_r killed %13, 1
+    S2_storeri_io %2, 0, %5 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %14:intregs = A2_add %3, %4
+    S2_storeri_io %2, -4, killed %14 :: (store (s32) into %ir.cgep15, !tbaa !5)
+    %6:intregs = nuw nsw A2_addi %4, 1
+    %7:intregs = A2_addi %2, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
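dep2 above is the one test in this patch whose CHECK lines expect a recorded edge, so it makes a concrete worked example of the WAR scan in `addLoopCarriedDependenciesForChunks`. Below is a self-contained mini-model of that scan; simplified byte offsets stand in for the `BatchAAResults` queries that `hasLoopCarriedMemDep` actually performs, and the SU numbers are the ones from the test comment:

```cpp
#include <cstdio>

// Mini-model of the load-to-store (WAR) scan for swp-loop-carried-order-dep2:
// per iteration, SU(3) loads a[i], SU(5) stores a[i], and SU(7) stores
// a[i-1]; the induction pointer advances by 4 bytes each iteration. A
// loop-carried WAR edge SU(3) -> store exists when the store's address in
// iteration k+1 equals the load's address in iteration k.
int main() {
  const int LoadOff = 0;           // SU(3): load a[i], relative to the pointer
  const int StoreOffs[] = {0, -4}; // SU(5): store a[i], SU(7): store a[i-1]
  const char *Names[] = {"SU(5)", "SU(7)"};
  for (int S = 0; S < 2; ++S)
    if (StoreOffs[S] + 4 == LoadOff) // +4: next iteration's pointer value
      std::printf("loop-carried WAR: SU(3) -> %s\n", Names[S]);
  // Prints only "loop-carried WAR: SU(3) -> SU(7)", matching the CHECK lines.
  return 0;
}
```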
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
new file mode 100644
index 0000000000000..ca59b97dd11e9
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int * restrict a, int * restrict b, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] += i;
+#     b[i] += a[i+1];
+#   }
+# }
+# ```
+#
+# Loop-carried dependencies exist from the load of a[i+1] to the store to a[i].
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(7)
+#   Order
+#     SU(5)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr noalias nocapture noundef %a, ptr noalias nocapture noundef %b, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp11 = icmp sgt i32 %n, 0
+    br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv15 = phi ptr [ %cgep, %for.body.preheader ], [ %cgep20, %for.body ]
+    %lsr.iv13 = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %lsr.iv = phi ptr [ %b, %for.body.preheader ], [ %cgep19, %for.body ]
+    %0 = phi i32 [ %2, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.012 = phi i32 [ %add1, %for.body ], [ 0, %for.body.preheader ]
+    %1 = add i32 %0, %i.012
+    %cgep18 = getelementptr i8, ptr %lsr.iv15, i32 -4
+    store i32 %1, ptr %cgep18, align 4, !tbaa !5
+    %add1 = add nuw nsw i32 %i.012, 1
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %3 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %add4 = add nsw i32 %3, %2
+    store i32 %add4, ptr %lsr.iv, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv13, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep19 = getelementptr i8, ptr %lsr.iv, i32 4
+    %cgep20 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1, $r2
+
+    %14:intregs = COPY $r2
+    %13:intregs = COPY $r1
+    %12:intregs = COPY $r0
+    %15:predregs = C2_cmpgti %14, 0
+    J2_jumpf %15, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %1:intregs = L2_loadri_pi %12, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %17:intregs = A2_tfrsi 0
+    %22:intregs = COPY %14
+    J2_loop0r %bb.3, %22, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %2:intregs = PHI %1, %bb.1, %11, %bb.3
+    %4:intregs = PHI %13, %bb.1, %10, %bb.3
+    %5:intregs = PHI %0, %bb.1, %8, %bb.3
+    %6:intregs = PHI %17, %bb.1, %7, %bb.3
+    %18:intregs = A2_add %5, %6
+    S2_storeri_io %2, -4, killed %18 :: (store (s32) into %ir.cgep18, !tbaa !5)
+    %7:intregs = nuw nsw A2_addi %6, 1
+    %8:intregs = L2_loadri_io %2, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %19:intregs = L2_loadri_io %4, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %20:intregs = nsw A2_add killed %19, %8
+    %10:intregs = S2_storeri_pi %4, 4, killed %20 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %11:intregs = A2_addi %2, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
new file mode 100644
index 0000000000000..4bc4b48735947
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are computed correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 0; i < n-2; i++) {
+#     a[i] += a[i+10];
+#     a[i+2] += i;
+#   }
+# }
+# ```
+#
+# Here is what each instruction does.
+# SU(2): Load a[i+10]
+# SU(3): Store it to a[i]
+# SU(4): Load a[i+2], add i, then store it
+#
+# FIXME: Currently the following dependencies are missed.
+#
+# Loop carried edges from SU(4)
+#   Order
+#     SU(3)
+
+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: Loop carried edges from SU(2)
+# CHECK-NEXT:   Order
+# CHECK-NEXT:     SU(3)
+# CHECK-NEXT:     SU(4)
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) {
+  entry:
+    %cmp13 = icmp sgt i32 %n, 2
+    br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %0 = add i32 %n, -2
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv15 = phi ptr [ %a, %for.body.preheader ], [ %cgep19, %for.body ]
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %i.014 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+    %cgep = getelementptr i8, ptr %lsr.iv15, i32 40
+    %1 = load i32, ptr %cgep, align 4, !tbaa !5
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %add2 = add nsw i32 %2, %1
+    store i32 %add2, ptr %lsr.iv15, align 4, !tbaa !5
+    %cgep18 = getelementptr i8, ptr %lsr.iv15, i32 8
+    %3 = load i32, ptr %cgep18, align 4, !tbaa !5
+    %4 = add i32 %i.014, %3
+    store i32 %4, ptr %cgep18, align 4, !tbaa !5
+    %inc = add nuw nsw i32 %i.014, 1
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep19 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %9:predregs = C2_cmpgti %8, 2
+    J2_jumpf %9, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs = A2_addi %8, -2
+    %11:intregs = A2_tfrsi 0
+    %14:intregs = COPY %0
+    J2_loop0r %bb.3, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %1:intregs = PHI %7, %bb.1, %6, %bb.3
+    %3:intregs = PHI %11, %bb.1, %4, %bb.3
+    %12:intregs = L2_loadri_io %1, 40 :: (load (s32) from %ir.cgep, !tbaa !5)
+    L4_add_memopw_io %1, 0, killed %12 :: (store (s32) into %ir.lsr.iv15, !tbaa !5), (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    L4_add_memopw_io %1, 8, %3 :: (store (s32) into %ir.cgep18, !tbaa !5), (load (s32) from %ir.cgep18, !tbaa !5)
+    %4:intregs = nuw nsw A2_addi %3, 1
+    %6:intregs = A2_addi %1, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
new file mode 100644
index 0000000000000..77c3d569db181
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
@@ -0,0 +1,111 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are correctly added when two
+# arrays may point to the same memory location.
+#
+# ```
+# void f(int *a, int *b, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] += b[i];
+#     b[i] += a[i];
+#   }
+# }
+# ```
+#
+# Here is what each instruction does.
+# SU(2): Load b[i]
+# SU(3): Load a[i]
+# SU(5): Store a[i]
+# SU(6): Load b[i]
+# SU(8): Store b[i]
+#
+# Note that if there is already a dependency between two instructions, we
+# don't add a loop-carried one between them, since the non-loop-carried
+# dependency imposes a stronger constraint than the loop-carried one.
+#
+# FIXME: Currently the following dependencies are missed.
+# Loop carried edges from SU(5)
+#   Order
+#     SU(2)
+# Loop carried edges from SU(6)
+#   Order
+#     SU(5)
+# Loop carried edges from SU(8)
+#   Order
+#     SU(3)
+#     SU(5)

+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, ptr nocapture noundef %b, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp12 = icmp sgt i32 %n, 0
+    br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv15 = phi ptr [ %cgep17, %for.body ], [ %b, %entry ]
+    %lsr.iv14 = phi ptr [ %cgep, %for.body ], [ %a, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %n, %entry ]
+    %0 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %1 = load i32, ptr %lsr.iv14, align 4, !tbaa !5
+    %add = add nsw i32 %1, %0
+    store i32 %add, ptr %lsr.iv14, align 4, !tbaa !5
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %add4 = add nsw i32 %2, %add
+    store i32 %add4, ptr %lsr.iv15, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep = getelementptr i8, ptr %lsr.iv14, i32 4
+    %cgep17 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.3, %bb.1
+    liveins: $r0, $r1, $r2
+
+    %8:intregs = COPY $r2
+    %7:intregs = COPY $r1
+    %6:intregs = COPY $r0
+    %9:predregs = C2_cmpgti %8, 0
+    J2_jumpf %9, %bb.1, implicit-def $pc
+
+  bb.3:
+    %16:intregs = COPY %8
+    J2_loop0r %bb.2, %16, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.2, implicit-def $pc
+
+  bb.1.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.2.for.body:
+    successors: %bb.1, %bb.2
+
+    %0:intregs = PHI %7, %bb.3, %5, %bb.2
+    %1:intregs = PHI %6, %bb.3, %4, %bb.2
+    %10:intregs = L2_loadri_io %0, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %11:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.lsr.iv14, !tbaa !5)
+    %12:intregs = nsw A2_add killed %11, killed %10
+    %4:intregs = S2_storeri_pi %1, 4, %12 :: (store (s32) into %ir.lsr.iv14, !tbaa !5)
+    %13:intregs = L2_loadri_io %0, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %14:intregs = nsw A2_add killed %13, %12
+    %5:intregs = S2_storeri_pi %0, 4, killed %14 :: (store (s32) into %ir.lsr.iv15, !tbaa !5)
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.1, implicit-def $pc
+...
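To make the aliasing premise of dep5 concrete: nothing in the signature of `f` prevents both pointer arguments from referring to the same array, which is why the pipeliner must assume loop-carried ordering between the two access streams. A hypothetical caller, using the C prototype from the test comment above:

```cpp
// Hypothetical caller of the f() from the dep5 comment: with a == b, the
// "a[i]" accesses alias the "b[i]" accesses, so reordering loads and stores
// across iterations would change the values later iterations observe.
extern "C" void f(int *a, int *b, int n);

int main() {
  int buf[4] = {1, 2, 3, 4};
  f(buf, buf, 4); // both parameters point at the same array
  return 0;
}
```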
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir
new file mode 100644
index 0000000000000..4281d15377141
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir
@@ -0,0 +1,154 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are computed correctly
+# when barrier instructions exist in the loop.
+# The original code is as follows.
+#
+# ```
+# volatile int x = 0;
+# void f(int * restrict a, int * restrict b, int * restrict c, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] *= c[i];
+#     b[i] *= c[i];
+#     x += i;
+#     a[i + 1] *= i;
+#     x += i;
+#     b[i + 1] *= i;
+#   }
+# }
+# ```
+#
+# FIXME: Currently the following dependencies are missed.
+# Loop carried edges from SU(16)
+#   Order
+#     SU(6)
+#     SU(8)
+#     SU(10)
+#     SU(11)
+# Loop carried edges from SU(17)
+#   Order
+#     SU(10)
+#     SU(11)
+# Loop carried edges from SU(19)
+#   Order
+#     SU(10)
+#     SU(11)

+# CHECK: ===== Loop Carried Edges Begin =====
+# CHECK-NEXT: ===== Loop Carried Edges End =====
+
+--- |
+  @x = dso_local global i32 0, align 4
+
+  define dso_local void @f(ptr noalias nocapture noundef %a, ptr noalias nocapture noundef %b, ptr noalias nocapture noundef readonly %c, i32 noundef %n) {
+  entry:
+    %cmp26 = icmp sgt i32 %n, 0
+    br i1 %cmp26, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %.pre28 = load i32, ptr %b, align 4, !tbaa !5
+    %cgep = getelementptr i8, ptr %b, i32 4
+    %cgep37 = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+
+  for.cond.cleanup:
+    ret void
+
+  for.body:
+    %lsr.iv35 = phi ptr [ %c, %for.body.preheader ], [ %cgep42, %for.body ]
+    %lsr.iv31 = phi ptr [ %cgep37, %for.body.preheader ], [ %cgep41, %for.body ]
+    %lsr.iv = phi ptr [ %cgep, %for.body.preheader ], [ %cgep40, %for.body ]
+    %0 = phi i32 [ %mul11, %for.body ], [ %.pre28, %for.body.preheader ]
+    %1 = phi i32 [ %mul7, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.027 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+    %2 = load i32, ptr %lsr.iv35, align 4, !tbaa !5
+    %mul = mul nsw i32 %1, %2
+    %cgep38 = getelementptr i8, ptr %lsr.iv31, i32 -4
+    store i32 %mul, ptr %cgep38, align 4, !tbaa !5
+    %mul4 = mul nsw i32 %0, %2
+    %cgep39 = getelementptr i8, ptr %lsr.iv, i32 -4
+    store i32 %mul4, ptr %cgep39, align 4, !tbaa !5
+    %3 = load volatile i32, ptr @x, align 4, !tbaa !5
+    %4 = add i32 %i.027, %3
+    store volatile i32 %4, ptr @x, align 4, !tbaa !5
+    %add5 = add nuw nsw i32 %i.027, 1
+    %5 = load i32, ptr %lsr.iv31, align 4, !tbaa !5
+    %mul7 = mul nsw i32 %5, %i.027
+    store i32 %mul7, ptr %lsr.iv31, align 4, !tbaa !5
+    %6 = load volatile i32, ptr @x, align 4, !tbaa !5
+    %7 = add i32 %i.027, %6
+    store volatile i32 %7, ptr @x, align 4, !tbaa !5
+    %8 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %mul11 = mul nsw i32 %8, %i.027
+    store i32 %mul11, ptr %lsr.iv, align 4, !tbaa !5
+    %exitcond.not = icmp eq i32 %n, %add5
+    %cgep40 = getelementptr i8, ptr %lsr.iv, i32 4
+    %cgep41 = getelementptr i8, ptr %lsr.iv31, i32 4
+    %cgep42 = getelementptr i8, ptr %lsr.iv35, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1, $r2, $r3
+
+    %19:intregs = COPY $r3
+    %18:intregs = COPY $r2
+    %17:intregs = COPY $r1
+    %16:intregs = COPY $r0
+    %20:predregs = C2_cmpgti %19, 0
+    J2_jumpf %20, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    %0:intregs, %3:intregs = L2_loadri_pi %16, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %1:intregs, %2:intregs = L2_loadri_pi %17, 4 :: (load (s32) from %ir.b, !tbaa !5)
+    %22:intregs = A2_tfrsi 0
+    %26:intregs = C4_addipc target-flags(hexagon-pcrel) @x
+    %30:intregs = COPY %19
+    J2_loop0r %bb.3, %30, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+
+    %4:intregs = PHI %18, %bb.1, %15, %bb.3
+    %5:intregs = PHI %3, %bb.1, %14, %bb.3
+    %6:intregs = PHI %2, %bb.1, %13, %bb.3
+    %7:intregs = PHI %1, %bb.1, %12, %bb.3
+    %8:intregs = PHI %0, %bb.1, %11, %bb.3
+    %9:intregs = PHI %22, %bb.1, %10, %bb.3
+    %23:intregs, %15:intregs = L2_loadri_pi %4, 4 :: (load (s32) from %ir.lsr.iv35, !tbaa !5)
+    %24:intregs = nsw M2_mpyi %8, %23
+    S2_storeri_io %5, -4, killed %24 :: (store (s32) into %ir.cgep38, !tbaa !5)
+    %25:intregs = nsw M2_mpyi %7, %23
+    S2_storeri_io %6, -4, killed %25 :: (store (s32) into %ir.cgep39, !tbaa !5)
+    L4_add_memopw_io %26, 0, %9 :: (volatile store (s32) into @x, !tbaa !5), (volatile dereferenceable load (s32) from @x, !tbaa !5)
+    %10:intregs = nuw nsw A2_addi %9, 1
+    %27:intregs = L2_loadri_io %5, 0 :: (load (s32) from %ir.lsr.iv31, !tbaa !5)
+    %11:intregs = nsw M2_mpyi killed %27, %9
+    S2_storeri_io %5, 0, %11 :: (store (s32) into %ir.lsr.iv31, !tbaa !5)
+    L4_add_memopw_io %26, 0, %9 :: (volatile store (s32) into @x, !tbaa !5), (volatile dereferenceable load (s32) from @x, !tbaa !5)
+    %28:intregs = L2_loadri_io %6, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %12:intregs = nsw M2_mpyi killed %28, %9
+    S2_storeri_io %6, 0, %12 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %13:intregs = A2_addi %6, 4
+    %14:intregs = A2_addi %5, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+...
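The FIXME lists running through these tests all correspond to the scans that `addLoopCarriedDependenciesForChunks` leaves as TODO (load-to-store from bottom to top, store-to-load, store-to-store). One hypothetical shape for the missing RAW and WAW scans, written by analogy with the existing WAR loop; this is a sketch, not the patch's code, and whether `hasLoopCarriedMemDep` can be reused unchanged for store-first pairs is exactly what the TODO leaves open:

```cpp
// Hypothetical sketch, not part of the patch: RAW and WAW scans analogous to
// the existing load-to-store (WAR) loop in addLoopCarriedDependenciesForChunks.
for (const SUnitWithMemInfo &Src : From.Stores) {
  // Store-to-load (RAW) across iterations.
  for (const SUnitWithMemInfo &Dst : To.Loads)
    if (Src.SU->NodeNum < Dst.SU->NodeNum &&
        hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
      LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
  // Store-to-store (WAW) across iterations.
  for (const SUnitWithMemInfo &Dst : To.Stores)
    if (Src.SU->NodeNum < Dst.SU->NodeNum &&
        hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
      LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
}
```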