diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 187820717b6fd..fa0c753bff2f0 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
       return true;
     }
-    return false;
   }
 
   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 94901c2d1a656..c284a0370ac0f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14520,6 +14520,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use> &Ops) {
   return true;
 }
 
+/// We want to sink the following cases:
+/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14636,6 +14649,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
+  // Sink vscales closer to uses for better isel
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:
+  case Instruction::Add:
+  case Instruction::Sub:
+    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
+      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
+        Ops.push_back(&I->getOperandUse(Op));
+        return true;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
   if (!I->getType()->isVectorTy())
     return false;
 
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index bb58248c6f60e..956d2d941ac71 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -18,10 +18,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    neg x10, x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    neg x9, x9
+; CHECK-NEXT:    mov w10, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
+; CHECK-NEXT:    adds x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,11 +106,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    fmov d2, #2.00000000
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    mov w10, #100 // =0x64
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    neg x9, x9
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    mov z1.d, p0/m, z2.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
+; CHECK-NEXT:    adds x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -193,34 +193,34 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    neg x10, x9
-; CHECK-NEXT:    mov w11, #1000 // =0x3e8
-; CHECK-NEXT:    rdvl x13, #2
+; CHECK-NEXT:    neg x9, x9
+; CHECK-NEXT:    mov w10, #1000 // =0x3e8
+; CHECK-NEXT:    rdvl x12, #2
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x10, x10, x11
+; CHECK-NEXT:    and x10, x9, x10
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    rdvl x11, #4
-; CHECK-NEXT:    add x12, x1, x13
-; CHECK-NEXT:    add x13, x0, x13
+; CHECK-NEXT:    add x11, x1, x12
+; CHECK-NEXT:    add x12, x0, x12
+; CHECK-NEXT:    rdvl x13, #4
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x14, x0, x8
-; CHECK-NEXT:    add x15, x13, x8
+; CHECK-NEXT:    add x15, x12, x8
 ; CHECK-NEXT:    add x16, x1, x8
-; CHECK-NEXT:    add x17, x12, x8
+; CHECK-NEXT:    add x17, x11, x8
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x0, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x13, x8]
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x12, x8]
 ; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x15, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z16.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x16, #1, mul vl]
-; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x12, x8]
+; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x11, x8]
 ; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT:    subs x10, x10, x9
-; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    adds x10, x10, x9
+; CHECK-NEXT:    add x8, x8, x13
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z4.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z17.d, z5.d, #0
 ; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #0
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
index 1bace71db0c11..486f59d7900e9 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT:    b.lt .LBB70_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z0.s, #1 // =0x1
-; CHECK-NEXT:    whilelo p0.s, xzr, x9
+; CHECK-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    cntw x10
 ; CHECK-NEXT:  .LBB70_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
-; CHECK-NEXT:    mad z1.s, p1/m, z2.s, z0.s
-; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    mad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p0.s, x8, x9
+; CHECK-NEXT:    whilelo p1.s, x8, x9
 ; CHECK-NEXT:    b.mi .LBB70_2
 ; CHECK-NEXT:  .LBB70_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 39fe92aae0619..124f81e7864d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT:    whilelt p0.s, wzr, w0
 ; CHECK-NEXT:    b.pl .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    whilelt p0.s, w9, w0
-; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    whilelt p0.s, w8, w0
+; CHECK-NEXT:    add w8, w8, w9
 ; CHECK-NEXT:    b.mi .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %exit
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
new file mode 100644
index 0000000000000..c80aa82ef9a83
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
+; CHECK-LABEL: define void @gep
+; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 4
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 4
+; CHECK-NEXT:    [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph.new
+  %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
+  %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %cmp = icmp eq i32 %lsr.iv.next, 0
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
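
For reference, a minimal IR-level sketch of the effect (the function @sink_example and its value names are illustrative only, not part of the patch or its tests): given a loop whose induction step is a loop-invariant shl of vscale,

  define void @sink_example(i64 %n) "target-features"="+sve2" {
  entry:
    ; step = vscale * 4, computed once outside the loop
    %vs = tail call i64 @llvm.vscale.i64()
    %step = shl nuw nsw i64 %vs, 2
    br label %loop

  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    ; the add's vscale-based operand is what shouldSinkVScale matches
    %iv.next = add nuw i64 %iv, %step
    %done = icmp uge i64 %iv.next, %n
    br i1 %done, label %exit, label %loop

  exit:
    ret void
  }

  declare i64 @llvm.vscale.i64()

with this change, CodeGenPrepare is expected to clone the llvm.vscale.i64 call and the shl into %loop next to the add (mirroring the CHECK lines in sve2-vscale-sinking.ll above), so AArch64 instruction selection sees the scalable step adjacent to its use.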