diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 45148449dfb82..03139faea1ee6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4070,10 +4070,6 @@ InstructionCost AArch64TTIImpl::getShuffleCost( Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, NTp, NMask, CostKind, 0, nullptr, Args, CxtI); - else if (any_of(enumerate(NMask), [&](const auto &ME) { - return ME.value() % LTNumElts == ME.index(); - })) - Cost += LTNumElts - 1; else Cost += LTNumElts; } diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index d67f056366104..7f4030a81e749 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -367,7 +367,7 @@ define void @multipart() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> @@ -409,15 +409,15 @@ define void @vst3(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> @@ -453,15 +453,15 @@ define void @vst4(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index b49f3c9f3eeb2..775ad4c5ecc36 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -22,13 +22,11 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) { define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test1_reduce( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]] +; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]]) ; CHECK-NEXT: ret i32 [[R]] ; @@ -741,70 +739,70 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], -; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]] -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP56]], [[TMP58]] +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], +; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], +; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], +; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]]