
[AArch64] Sink vscale calls into loops for better isel #70304


Merged: 4 commits merged on Nov 7, 2023
1 change: 0 additions & 1 deletion llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
return true;
}
return false;
}

if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
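With the early "return false;" removed, a GEP that was not unmerged no longer terminates optimizeInst; it falls through to the later generic operand-sinking step, which consults the target's shouldSinkOperands hook (extended below for AArch64) to decide whether an operand is worth duplicating into the GEP's block. Below is a minimal sketch of what that sinking amounts to, under the assumption that it reduces to cloning the operand's defining instruction next to its user; the helper name is illustrative and this is not CodeGenPrepare's actual implementation.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Use.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Illustrative only: rematerialize the defining instruction of a collected
// use right before its user, so instruction selection sees the vscale
// expression in the same block as the add/sub/gep that consumes it.
static void sinkUseForIllustration(Use &U) {
  auto *Def = cast<Instruction>(U.get());        // e.g. shl (vscale), 4
  auto *UserI = cast<Instruction>(U.getUser());  // e.g. the gep inside the loop
  Instruction *Clone = Def->clone();             // local copy for the user's block
  Clone->insertBefore(UserI);
  U.set(Clone);                                  // the user now reads the local copy
}

In the real pass the duplication only happens once the target hook has declared it profitable, and the original instruction is cleaned up if it becomes dead; the sketch just shows the direction of the transformation.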
29 changes: 29 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14520,6 +14520,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
return true;
}

/// We want to sink the following cases:
/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
if (match(Op, m_VScale()))
return true;
if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
return true;
}
return false;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14636,6 +14649,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
}

// Sink vscales closer to uses for better isel
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
case Instruction::Add:
case Instruction::Sub:
for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
if (shouldSinkVScale(I->getOperand(Op), Ops)) {
Ops.push_back(&I->getOperandUse(Op));
return true;
}
}
break;
default:
break;
}

if (!I->getType()->isVectorTy())
return false;

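The shapes shouldSinkVScale accepts are the ones the AArch64 selector can fold into scalable offsets: a bare vscale, or vscale shifted or multiplied by an immediate. As a self-contained restatement (a standalone predicate for illustration, not part of this patch), the same matching can be written against LLVM's PatternMatch API:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// True for operands the hook above would sink:
// vscale, (shl vscale, C) and (mul vscale, C).
static bool isSinkableVScaleExpr(Value *Op) {
  return match(Op, m_VScale()) ||
         match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
         match(Op, m_Mul(m_VScale(), m_ConstantInt()));
}

For the shl/mul forms the hook also records the use of the vscale operand (operand 0), so both the scaling instruction and the vscale call end up next to the user, while the caller in shouldSinkOperands pushes the add/sub/gep operand use itself. Keeping the whole expression in the loop body is what lets instruction selection fold it into the surrounding address and counter arithmetic, which the test updates below reflect.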
@@ -18,10 +18,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cntd x9
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: mov w10, #100 // =0x64
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: and x10, x9, x10
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,11 +106,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: fmov d2, #2.00000000
; CHECK-NEXT: cntd x9
; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: mov w10, #100 // =0x64
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: and x10, x9, x10
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
; CHECK-NEXT: mov z1.d, p0/m, z2.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -193,34 +193,34 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cntw x9
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: mov w11, #1000 // =0x3e8
; CHECK-NEXT: rdvl x13, #2
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: mov w10, #1000 // =0x3e8
; CHECK-NEXT: rdvl x12, #2
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: and x10, x9, x10
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: rdvl x11, #4
; CHECK-NEXT: add x12, x1, x13
; CHECK-NEXT: add x13, x0, x13
; CHECK-NEXT: add x11, x1, x12
; CHECK-NEXT: add x12, x0, x12
; CHECK-NEXT: rdvl x13, #4
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x14, x0, x8
; CHECK-NEXT: add x15, x13, x8
; CHECK-NEXT: add x15, x12, x8
; CHECK-NEXT: add x16, x1, x8
; CHECK-NEXT: add x17, x12, x8
; CHECK-NEXT: add x17, x11, x8
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
; CHECK-NEXT: ld1b { z6.b }, p1/z, [x13, x8]
; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x16, #1, mul vl]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x12, x8]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x13
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z4.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z17.d, z5.d, #0
; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
; CHECK-NEXT: b.lt .LBB70_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, w3
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z0.s, #1 // =0x1
; CHECK-NEXT: whilelo p0.s, xzr, x9
; CHECK-NEXT: whilelo p1.s, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntw x10
; CHECK-NEXT: .LBB70_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p1/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p0.s, x8, x9
; CHECK-NEXT: whilelo p1.s, x8, x9
; CHECK-NEXT: b.mi .LBB70_2
; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup
; CHECK-NEXT: ret
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
; CHECK-NEXT: whilelt p0.s, wzr, w0
; CHECK-NEXT: b.pl .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, wzr
; CHECK-NEXT: cntw x8
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: cntw x9
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: whilelt p0.s, w9, w0
; CHECK-NEXT: add w9, w9, w8
; CHECK-NEXT: whilelt p0.s, w8, w0
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: b.mi .LBB0_2
; CHECK-NEXT: .LBB0_3: // %exit
; CHECK-NEXT: ret
168 changes: 168 additions & 0 deletions llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -codegenprepare -S -o - %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
; CHECK-LABEL: define void @inc_add
; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
entry:
%wide.trip.count = zext i32 %N to i64
%0 = tail call i64 @llvm.vscale.i64()
%1 = shl nuw nsw i64 %0, 2
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%2 = getelementptr inbounds float, ptr %in1, i64 %index
%wide.load = load <vscale x 4 x float>, ptr %2, align 4
%3 = getelementptr inbounds float, ptr %in2, i64 %index
%wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
%4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
%5 = getelementptr inbounds float, ptr %out, i64 %index
store <vscale x 4 x float> %4, ptr %5, align 4
%index.next = add nuw i64 %index, %1
%6 = icmp eq i64 %index.next, %wide.trip.count
br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}

define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
; CHECK-LABEL: define void @dec_sub
; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
; CHECK-NEXT: [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
; CHECK-NEXT: [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
; CHECK-NEXT: [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
; CHECK-NEXT: [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
entry:
%0 = zext i32 %N to i64
%1 = tail call i64 @llvm.vscale.i64()
%2 = shl nuw nsw i64 %1, 2
%3 = sub nsw i64 1, %2
%invariant.gep = getelementptr float, ptr %in1, i64 %3
%invariant.gep20 = getelementptr float, ptr %in2, i64 %3
%invariant.gep22 = getelementptr float, ptr %out, i64 %3
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%offset.idx = sub i64 %0, %index
%gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
%wide.load = load <vscale x 4 x float>, ptr %gep, align 4
%gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
%wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
%4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
%gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
store <vscale x 4 x float> %4, ptr %gep23, align 4
%index.next = add nuw i64 %index, %2
%5 = icmp eq i64 %index.next, %0
br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}

define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
; CHECK-LABEL: define void @gep
; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4
; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4
; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4
; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
; CHECK: for.exit:
; CHECK-NEXT: ret void
;
entry:
%0 = tail call i64 @llvm.vscale.i64()
%1 = shl i64 %0, 4
br label %for.body

for.body: ; preds = %for.body, %for.body.lr.ph.new
%lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
%ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
%add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
%add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
%lsr.iv.next = add i32 %lsr.iv, -4
%cmp = icmp eq i32 %lsr.iv.next, 0
br i1 %cmp, label %for.exit, label %for.body

for.exit:
ret void
}

declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)

declare i64 @llvm.vscale.i64()

attributes #0 = { "target-features"="+sve2" }