diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4163b0743a9a..0ce25fc9b4fc4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -952,8 +952,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
-    // Returns a scalar boolean value, which is true if any lane of its (only
-    // boolean) vector operand is true.
+    // Returns a scalar boolean value, which is true if any lane of its
+    // (boolean) vector operand is true. It produces the reduced value across
+    // all unrolled iterations. Unrolling will add all copies of its original
+    // operand as additional operands.
    AnyOf,
     // Calculates the first active lane index of the vector predicate operand.
     FirstActiveLane,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 805cd04c5ce35..6daac1d61c2f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -761,8 +761,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
   }
   case VPInstruction::AnyOf: {
-    Value *A = State.get(getOperand(0));
-    return Builder.CreateOrReduce(A);
+    Value *Res = State.get(getOperand(0));
+    for (VPValue *Op : drop_begin(operands()))
+      Res = Builder.CreateOr(Res, State.get(Op));
+    return Builder.CreateOrReduce(Res);
   }
   case VPInstruction::FirstActiveLane: {
     Value *Mask = State.get(getOperand(0));
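Note on the VPlanRecipes.cpp change above: with an interleave count greater than 1, AnyOf now receives one mask operand per unrolled part and emits a chain of vector ors feeding a single reduction. A minimal sketch of the expected IR for VF=4/UF=4, using hypothetical value names:

    %m.01  = or <4 x i1> %mask.part0, %mask.part1
    %m.012 = or <4 x i1> %m.01, %mask.part2
    %m.all = or <4 x i1> %m.012, %mask.part3
    %any   = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %m.all)

This is the pattern the updated tests below check for.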
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 0bc683e557e70..111ef44c96b56 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -344,10 +344,11 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
       continue;
 
-    // Add all VPValues for all parts to ComputeReductionResult which combines
-    // the parts to compute the final reduction value.
+    // Add all VPValues for all parts to AnyOf and Compute*Result which combine
+    // all parts to compute the final value.
     VPValue *Op1;
-    if (match(&R, m_VPInstruction<VPInstruction::ComputeFindLastIVResult>(
+    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
+        match(&R, m_VPInstruction<VPInstruction::ComputeFindLastIVResult>(
                       m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                       m_VPValue(), m_VPValue(Op1))) ||
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index 9dfe70ddf1b05..b5cef4fccddcd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -31,13 +31,43 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 32
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 48
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP40]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP38]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP41]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 32
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 48
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP27]]
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD6]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
 ; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
+; CHECK-NEXT:    [[TMP33:%.*]] = or <vscale x 16 x i1> [[TMP11]], [[TMP30]]
+; CHECK-NEXT:    [[TMP34:%.*]] = or <vscale x 16 x i1> [[TMP33]], [[TMP31]]
+; CHECK-NEXT:    [[TMP35:%.*]] = or <vscale x 16 x i1> [[TMP34]], [[TMP32]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP35]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
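For the scalable vectors in the AArch64 test above, the per-part pointers cannot use constant offsets: part k starts at base + k * vscale * 16 for <vscale x 16 x i8>. A minimal sketch of the addressing for part 1, with hypothetical names:

    %vs   = call i64 @llvm.vscale.i64()
    %off1 = mul nuw i64 %vs, 16          ; elements per part
    %p1   = getelementptr inbounds i8, ptr %base, i64 %off1
    %wl1  = load <vscale x 16 x i8>, ptr %p1, align 1

The fixed-width VF4IC4 runs below use plain constant offsets (4, 8, 12) instead.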
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index 1f8cfa1bfd11c..ae7adc6507e8e 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -16,10 +16,22 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], splat (i32 10)
+; VF4IC4-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD2]], splat (i32 10)
+; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10)
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; VF4IC4-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP6]]
+; VF4IC4-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP7]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP8]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; VF4IC4-NEXT:    br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -91,13 +103,31 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]]
+; VF4IC4-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]]
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -170,10 +200,22 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4IC4-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
+; VF4IC4-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], splat (i8 72)
+; VF4IC4-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], splat (i8 72)
+; VF4IC4-NEXT:    [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], splat (i8 72)
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; VF4IC4-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP15]]
+; VF4IC4-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP16]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP17]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; VF4IC4-NEXT:    br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -240,13 +282,31 @@ define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]]
+; VF4IC4-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]]
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -320,13 +380,31 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]]
+; VF4IC4-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]]
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -407,13 +485,31 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]]
+; VF4IC4-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]]
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
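The reversed tests that follow exercise the same AnyOf combining with downward addressing: part k reads at offset -(4 * k) and then -3 to reach the low address of its reversed 4-element group, and each load is reversed with a shufflevector. A minimal sketch for part 1, using hypothetical names:

    %g1   = getelementptr inbounds i8, ptr %base, i32 -4
    %lo1  = getelementptr inbounds i8, ptr %g1, i32 -3
    %wl1  = load <4 x i8>, ptr %lo1, align 1
    %rev1 = shufflevector <4 x i8> %wl1, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>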
@@ -495,16 +591,46 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
+; VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP25]], align 1
+; VF4IC4-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
+; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1
+; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
+; VF4IC4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; VF4IC4-NEXT:    [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
+; VF4IC4-NEXT:    [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP21]]
+; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
 ; VF4IC4-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -577,13 +703,31 @@ define i8 @same_exit_block_use_loaded_value() {
 ; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]]
+; VF4IC4-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]]
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -657,16 +801,46 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
+; VF4IC4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 -3
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; VF4IC4-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP25]], align 1
+; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
+; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; VF4IC4-NEXT:    [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
+; VF4IC4-NEXT:    [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP21]]
+; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
 ; VF4IC4-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
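In the VF8UF2 checks below, the trip count is small enough that the vector loop body runs exactly once and the backedge is eliminated: the two part masks are or'ed and reduced, and the result feeds the middle.split branch directly. A minimal sketch of that tail, with hypothetical names:

    %m.all = or <8 x i1> %m.part0, %m.part1
    %any   = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %m.all)
    ; no backedge: fall through to middle.split, which branches on %any
    br i1 %any, label %vector.early.exit, label %middle.block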
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index 1a8361ce2bac3..16be32f606064 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -56,12 +56,16 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
 ; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF2:       [[VECTOR_BODY]]:
 ; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8
 ; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
-; VF8UF2-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; VF8UF2-NEXT:    [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF8UF2-NEXT:    [[TMP4:%.*]] = or <8 x i1> [[TMP3]], [[TMP6]]
+; VF8UF2-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]])
 ; VF8UF2-NEXT:    br label %[[MIDDLE_SPLIT:.*]]
 ; VF8UF2:       [[MIDDLE_SPLIT]]:
-; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2-NEXT:    br i1 [[TMP5]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2:       [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF8UF2:       [[VECTOR_EARLY_EXIT]]:
@@ -189,12 +193,16 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
 ; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF2:       [[VECTOR_BODY]]:
 ; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8
 ; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
-; VF8UF2-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; VF8UF2-NEXT:    [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF8UF2-NEXT:    [[TMP4:%.*]] = or <8 x i1> [[TMP3]], [[TMP6]]
+; VF8UF2-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]])
 ; VF8UF2-NEXT:    br label %[[MIDDLE_SPLIT:.*]]
 ; VF8UF2:       [[MIDDLE_SPLIT]]:
-; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2-NEXT:    br i1 [[TMP7]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2:       [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF8UF2:       [[VECTOR_EARLY_EXIT]]: