diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index dbf320f88fd65..7d96b18e9fbdc 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -3178,6 +3178,32 @@ bool SMSchedule::normalizeNonPipelinedInstructions(
                         << ") is not pipelined; moving from cycle " << OldCycle
                         << " to " << NewCycle << " Instr:" << *SU.getInstr());
     }
+
+    // We traverse the SUs in the order of the original basic block. Computing
+    // NewCycle in this order normally works fine because all dependencies
+    // (except for loop-carried dependencies) don't violate the original order.
+    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
+    // break it. That is, there may exist an artificial dependency from
+    // bottom to top. In such a case, NewCycle may become too large for the SU
+    // to be scheduled in Stage 0. For example, assume that Inst0 is in DNP in
+    // the following case:
+    //
+    //             | Inst0 <-+
+    //    SU order |         | artificial dep
+    //             | Inst1 --+
+    //             v
+    //
+    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
+    // Inst0 must be greater than or equal to N, so Inst0 cannot be
+    // scheduled at Stage 0. In such cases, we reject this schedule at this
+    // time.
+    // FIXME: The reason for this is the existence of artificial dependencies
+    // that contradict the original SU order. If ignoring such artificial
+    // dependencies does not affect correctness, then it is better to ignore
+    // them.
+    if (FirstCycle + InitiationInterval <= NewCycle)
+      return false;
+
     NewLastCycle = std::max(NewLastCycle, NewCycle);
   }
   LastCycle = NewLastCycle;
diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts3.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts3.mir
new file mode 100644
index 0000000000000..0b1dcecba03c4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts3.mir
@@ -0,0 +1,226 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -run-pass=pipeliner -o - %s -aarch64-enable-pipeliner -pipeliner-enable-copytophi=1
+
+--- |
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+
+  @glb = internal unnamed_addr global { [256 x i32], [256 x i32], [256 x i32] } zeroinitializer
+
+  ; Function Attrs: nounwind vscale_range(1,16)
+  define internal void @f(i32 %0, i32 %1) #0 {
+  entry:
+    %reass.sub = sub i32 %1, %0
+    %invariant.op = add i32 %0, 1
+    %invariant.op3 = add i32 %0, 2
+    %omp_loop.cmp5.not = icmp eq i32 %reass.sub, -1
+    br i1 %omp_loop.cmp5.not, label %exit, label %preheader
+
+  preheader:                                        ; preds = %entry
+    %2 = add i32 %1, 1
+    %3 = icmp slt i32 %2, %invariant.op
+    br i1 %3, label %body.preheader, label %vector.ph
+
+  body.preheader:                                   ; preds = %preheader
+    %4 = add i32 %1, 1
+    %5 = sub i32 %4, %0
+    br label %body
+
+  vector.ph:                                        ; preds = %preheader
+    %6 = add i32 %1, 1
+    %7 = sub i32 %6, %0
+    %8 = tail call i32 @llvm.vscale.i32()
+    %9 = shl nuw nsw i32 %8, 2
+    %10 = tail call i32 @llvm.vscale.i32()
+    %11 = shl nuw nsw i32 %10, 2
+    %12 = call i32 @llvm.usub.sat.i32(i32 %7, i32 %11)
+    %active.lane.mask.entry = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %7)
+    %13 = tail call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+    %.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %9, i64 0
+    %.splat = shufflevector <vscale x 4 x i32> %.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+    %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %invariant.op, i64 0
+    %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+    %broadcast.splatinsert7 = insertelement <vscale x 4 x i32> poison, i32 %invariant.op3, i64 0
+    %broadcast.splat8 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert7, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+    %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %vector.ph ], [ %active.lane.mask.next, %vector.body ]
+    %vec.ind = phi <vscale x 4 x i32> [ %13, %vector.ph ], [ %vec.ind.next, %vector.body ]
+    %14 = add <vscale x 4 x i32> %vec.ind, %broadcast.splat
+    %15 = extractelement <vscale x 4 x i32> %14, i64 0
+    %16 = sext i32 %15 to i64
+    %17 = add nsw i64 %16, -1
+    %18 = getelementptr i32, ptr @glb, i64 %17
+    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %14, ptr %18, i32 4, <vscale x 4 x i1> %active.lane.mask)
+    %19 = add <vscale x 4 x i32> %vec.ind, %broadcast.splat8
+    %20 = mul <vscale x 4 x i32> %14, %19
+    %21 = sdiv <vscale x 4 x i32> %20, splat (i32 2)
+    %22 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 1024), i64 %17
+    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %21, ptr %22, i32 4, <vscale x 4 x i1> %active.lane.mask)
+    %23 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 2048), i64 %17
+    %wide.masked.load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %23, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+    %24 = add <vscale x 4 x i32> %wide.masked.load, %21
+    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %24, ptr %23, i32 4, <vscale x 4 x i1> %active.lane.mask)
+    %25 = tail call i32 @llvm.vscale.i32()
+    %26 = shl nuw nsw i32 %25, 2
+    %index.next = add i32 %index, %26
+    %active.lane.mask.next = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 %index, i32 %12)
+    %vec.ind.next = add <vscale x 4 x i32> %vec.ind, %.splat
+    %27 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+    br i1 %27, label %vector.body, label %exit
+
+  exit:                                             ; preds = %vector.body, %body, %entry
+    ret void
+
+  body:                                             ; preds = %body.preheader, %body
+    %lsr.iv2 = phi i32 [ %invariant.op3, %body.preheader ], [ %lsr.iv.next3, %body ]
+    %lsr.iv = phi i32 [ %5, %body.preheader ], [ %lsr.iv.next, %body ]
+    %28 = add i32 %lsr.iv2, -1
+    %29 = sext i32 %28 to i64
+    %30 = add nsw i64 %29, -1
+    %31 = getelementptr i32, ptr @glb, i64 %30
+    store i32 %28, ptr %31, align 4
+    %32 = mul i32 %28, %lsr.iv2
+    %33 = sdiv i32 %32, 2
+    %34 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 1024), i64 %30
+    store i32 %33, ptr %34, align 4
+    %35 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 2048), i64 %30
+    %36 = load i32, ptr %35, align 4
+    %37 = add i32 %36, %33
+    store i32 %37, ptr %35, align 4
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %lsr.iv.next3 = add i32 %lsr.iv2, 1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %exit, label %body
+  }
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <vscale x 4 x i32> @llvm.stepvector.nxv4i32() #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare i32 @llvm.vscale.i32() #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+  declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
+  declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
+
+  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+  declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #3
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i32 @llvm.usub.sat.i32(i32, i32) #4
+
+  attributes #0 = { nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "target-cpu"="neoverse-v1" "target-features"="+sve" }
+  attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+  attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+  attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+  attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.5(0x30000000), %bb.1(0x50000000)
+    liveins: $w0, $w1
+
+    %20:gpr32common = COPY $w1
+    %19:gpr32common = COPY $w0
+    %21:gpr32common = SUBWrr %20, %19
+    dead $wzr = ADDSWri %21, 1, 0, implicit-def $nzcv
+    Bcc 0, %bb.5, implicit $nzcv
+    B %bb.1
+
+  bb.1.preheader:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %22:gpr32common = ADDWri %19, 1, 0
+    %23:gpr32sp = ADDWri %19, 2, 0
+    %25:gpr32common = ADDWri %20, 1, 0
+    dead $wzr = SUBSWrr killed %25, %22, implicit-def $nzcv
+    Bcc 10, %bb.3, implicit $nzcv
+    B %bb.2
+
+  bb.2.body.preheader:
+    successors: %bb.6(0x80000000)
+
+    %1:gpr32sp = COPY %23
+    %55:gpr32sp = ADDWri %21, 1, 0
+    %2:gpr32all = COPY %55
+    %57:gpr64common = MOVaddr target-flags(aarch64-page) @glb, target-flags(aarch64-pageoff, aarch64-nc) @glb
+    B %bb.6
+
+  bb.3.vector.ph:
+    successors: %bb.4(0x80000000)
+
+    %29:gpr32common = ADDWri %21, 1, 0
+    %30:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+    %31:gpr32common = COPY %30.sub_32
+    %32:gpr32 = SUBSWrr %29, %31, implicit-def $nzcv
+    %33:gpr32 = COPY $wzr
+    %34:gpr32 = CSELWr %33, killed %32, 3, implicit $nzcv
+    %4:ppr = WHILELO_PWW_S %33, %29, implicit-def dead $nzcv
+    %5:zpr = INDEX_II_S 0, 1, implicit $vg
+    %6:zpr = DUP_ZR_S %31
+    %7:zpr = DUP_ZR_S %22
+    %8:zpr = DUP_ZR_S %23
+    %27:gpr32all = COPY %33
+    %37:gpr64common = MOVaddr target-flags(aarch64-page) @glb, target-flags(aarch64-pageoff, aarch64-nc) @glb
+    %39:gpr64common = MOVi64imm -1
+    %41:ppr_3b = PTRUE_S 31, implicit $vg
+    %44:gpr64common = MOVi64imm 255
+    %45:gpr64common = MOVi64imm 511
+
+  bb.4.vector.body:
+    successors: %bb.4(0x7c000000), %bb.5(0x04000000)
+
+    %9:gpr32 = PHI %27, %bb.3, %12, %bb.4
+    %10:ppr_3b = PHI %4, %bb.3, %13, %bb.4
+    %11:zpr = PHI %5, %bb.3, %14, %bb.4
+    %35:zpr = ADD_ZZZ_S %11, %7
+    %36:gpr32 = COPY %35.ssub
+    %38:gpr64sp = ADDXrx %37, killed %36, 50
+    ST1W %35, %10, %38, %39 :: (store unknown-size into %ir.18, align 4)
+    %40:zpr = ADD_ZZZ_S %11, %8
+    %42:zpr = MUL_ZPZZ_S_UNDEF %41, %35, killed %40
+    %43:zpr = ASRD_ZPmI_S %41, %42, 1
+    ST1W %43, %10, %38, %44 :: (store unknown-size into %ir.22, align 4)
+    %46:zpr = LD1W %10, %38, %45 :: (load unknown-size from %ir.23, align 4)
+    %47:zpr = ADD_ZZZ_S killed %46, %43
+    ST1W killed %47, %10, %38, %45 :: (store unknown-size into %ir.23, align 4)
+    %50:gpr32 = ADDWrr %9, %31
+    %12:gpr32all = COPY %50
+    %13:ppr = WHILELO_PWW_S %9, %34, implicit-def $nzcv
+    %14:zpr = ADD_ZZZ_S %11, %6
+    Bcc 4, %bb.4, implicit $nzcv
+    B %bb.5
+
+  bb.5.exit:
+    RET_ReallyLR
+
+  bb.6.body:
+    successors: %bb.5(0x04000000), %bb.6(0x7c000000)
+
+    %15:gpr32common = PHI %1, %bb.2, %18, %bb.6
+    %16:gpr32sp = PHI %2, %bb.2, %17, %bb.6
+    %56:gpr32common = SUBWri %15, 1, 0
+    %58:gpr64sp = ADDXrx %57, %56, 50
+    STURWi %56, %58, -4 :: (store (s32) into %ir.31)
+    %59:gpr32 = MADDWrrr %56, %15, $wzr
+    %60:gpr32 = ADDWrs %59, %59, 95
+    %61:gpr32 = SBFMWri killed %60, 1, 31
+    STRWui %61, %58, 255 :: (store (s32) into %ir.34)
+    %62:gpr32 = LDRWui %58, 511 :: (load (s32) from %ir.35)
+    %63:gpr32 = ADDWrr killed %62, %61
+    STRWui killed %63, %58, 511 :: (store (s32) into %ir.35)
+    %64:gpr32 = SUBSWri %16, 1, 0, implicit-def $nzcv
+    %17:gpr32all = COPY %64
+    %65:gpr32sp = ADDWri %15, 1, 0
+    %18:gpr32all = COPY %65
+    Bcc 0, %bb.5, implicit $nzcv
+    B %bb.6
+
+...