From 2241b274f49eb50850006fc3dbd4eff3b6c38437 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 25 Jul 2024 17:46:48 +0100 Subject: [PATCH 1/3] [AArch64] Remove special-case inserted shuffle cost. This special case tried to measure if the shuffle vector will be multiple inserts into an existing vector, with one of the lanes already in-place. If so it reduces the cost by 1 to represent that it can insert n-1 vector lanes. This isn't always true though as the original vector may need to be moved to a new value to start inserting new values into it, if other values from the original are still needed. This didn't affect performance much when I tried it, but should hopefully start to address a regression we see from differences in SLP vectorization lane orders. (cherry picked from commit f67fa3be4db68afc08c7f3d9523f1533fa5687b7) --- .../AArch64/AArch64TargetTransformInfo.cpp | 7 +- .../CostModel/AArch64/shuffle-other.ll | 26 ++-- .../VectorCombine/AArch64/select-shuffle.ll | 140 +++++++++--------- 3 files changed, 84 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 45148449dfb82..7de813f603264 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4065,15 +4065,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost( NMask.push_back(MaskElt % LTNumElts); } // If the sub-mask has at most 2 input sub-vectors then re-cost it using - // getShuffleCost. If not then cost it using the worst case. + // getShuffleCost. If not then cost it using the worst case as the number + // of element moves into a new vector. if (NumSources <= 2) Cost += getShuffleCost(NumSources <= 1 ? 
TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, NTp, NMask, CostKind, 0, nullptr, Args, CxtI); - else if (any_of(enumerate(NMask), [&](const auto &ME) { - return ME.value() % LTNumElts == ME.index(); - })) - Cost += LTNumElts - 1; else Cost += LTNumElts; } diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index d67f056366104..7f4030a81e749 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -367,7 +367,7 @@ define void @multipart() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> @@ -409,15 +409,15 @@ define void @vst3(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = 
shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = 
shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> @@ -453,15 +453,15 @@ define void @vst4(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 56 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index b49f3c9f3eeb2..775ad4c5ecc36 
100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -22,13 +22,11 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) { define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test1_reduce( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]] +; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]]) ; CHECK-NEXT: ret i32 [[R]] ; @@ -741,70 +739,70 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 
x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = 
shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], -; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]] -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]] -; CHECK-NEXT: 
[[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> 
poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> +; 
CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP56]], [[TMP58]] +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] +; 
CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], +; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], +; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], +; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] From 8408f493f0a6967105182d50374ceddfddf08b94 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 25 Jul 2024 21:47:18 +0100 Subject: [PATCH 2/3] [AArch64][PhaseOrdering] Add a SLP lane-ordering phase-ordering test. NFC (cherry picked from commit e0c14c05a3e3178164ca7368516c080b823ef484) --- .../PhaseOrdering/AArch64/slpordering.ll | 482 ++++++++++++++++++ 1 file changed, 482 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll new file mode 100644 index 0000000000000..0e14481e4ea0a --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -O3 < %s | FileCheck %s + +; Check unrolling / SLP vectorization where the order of lanes is important for +; producing efficient shuffles. 
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64" + +; Function Attrs: nounwind uwtable +define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 { +; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering +; CHECK-SAME: (ptr nocapture noundef readonly [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IP1]] to i64 +; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 +; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = 
getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> +; 
CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 
x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]] +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[TMP68:%.*]] = 
shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], +; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], +; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], +; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16 +; CHECK-NEXT: [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] +; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[RDD119]], 1 +; CHECK-NEXT: ret i32 [[SHR120]] +; +entry: + %p1.addr = alloca ptr, align 8 + %ip1.addr = alloca i32, align 4 + %p2.addr = alloca ptr, align 8 + %ip2.addr = alloca i32, align 4 + %emp = alloca [4 x [4 x i32]], align 4 + %r0 = alloca i32, align 4 + %r1 = alloca i32, align 4 + %r2 = alloca i32, align 4 + %r3 = alloca i32, align 4 + %sum = alloca i32, align 4 + %i = alloca i32, align 4 + %e0 = alloca i32, align 4 + %e1 = alloca i32, align 4 + %e2 = alloca i32, align 4 + %e3 = alloca i32, align 4 + %i65 = alloca i32, align 4 + %e071 = alloca i32, align 4 + %e179 = alloca i32, align 4 + %e287 = alloca i32, align 4 + %e395 = alloca i32, align 4 + store ptr %p1, ptr %p1.addr, align 8, !tbaa !4 + store i32 %ip1, ptr %ip1.addr, align 4, !tbaa !8 + store ptr %p2, ptr %p2.addr, align 8, !tbaa !4 + store i32 %ip2, ptr %ip2.addr, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 64, ptr %emp) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r0) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r1) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r2) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r3) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #2 + store i32 0, ptr %sum, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2 + store 
i32 0, ptr %i, align 4, !tbaa !8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !tbaa !8 + %cmp = icmp slt i32 %0, 4 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2 + br label %for.end + +for.body: ; preds = %for.cond + %1 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx = getelementptr inbounds i8, ptr %1, i64 0 + %2 = load i8, ptr %rrrayidx, align 1, !tbaa !10 + %conv = zext i8 %2 to i32 + %3 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx1 = getelementptr inbounds i8, ptr %3, i64 0 + %4 = load i8, ptr %rrrayidx1, align 1, !tbaa !10 + %conv2 = zext i8 %4 to i32 + %sub = sub nsw i32 %conv, %conv2 + %5 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx3 = getelementptr inbounds i8, ptr %5, i64 4 + %6 = load i8, ptr %rrrayidx3, align 1, !tbaa !10 + %conv4 = zext i8 %6 to i32 + %7 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx5 = getelementptr inbounds i8, ptr %7, i64 4 + %8 = load i8, ptr %rrrayidx5, align 1, !tbaa !10 + %conv6 = zext i8 %8 to i32 + %sub7 = sub nsw i32 %conv4, %conv6 + %shl = shl i32 %sub7, 16 + %rdd = add nsw i32 %sub, %shl + store i32 %rdd, ptr %r0, align 4, !tbaa !8 + %9 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx8 = getelementptr inbounds i8, ptr %9, i64 1 + %10 = load i8, ptr %rrrayidx8, align 1, !tbaa !10 + %conv9 = zext i8 %10 to i32 + %11 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx10 = getelementptr inbounds i8, ptr %11, i64 1 + %12 = load i8, ptr %rrrayidx10, align 1, !tbaa !10 + %conv11 = zext i8 %12 to i32 + %sub12 = sub nsw i32 %conv9, %conv11 + %13 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx13 = getelementptr inbounds i8, ptr %13, i64 5 + %14 = load i8, ptr %rrrayidx13, align 1, !tbaa !10 + %conv14 = zext i8 %14 to i32 + %15 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx15 = getelementptr inbounds i8, ptr %15, i64 5 + 
%16 = load i8, ptr %rrrayidx15, align 1, !tbaa !10 + %conv16 = zext i8 %16 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %shl18 = shl i32 %sub17, 16 + %rdd19 = add nsw i32 %sub12, %shl18 + store i32 %rdd19, ptr %r1, align 4, !tbaa !8 + %17 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx20 = getelementptr inbounds i8, ptr %17, i64 2 + %18 = load i8, ptr %rrrayidx20, align 1, !tbaa !10 + %conv21 = zext i8 %18 to i32 + %19 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx22 = getelementptr inbounds i8, ptr %19, i64 2 + %20 = load i8, ptr %rrrayidx22, align 1, !tbaa !10 + %conv23 = zext i8 %20 to i32 + %sub24 = sub nsw i32 %conv21, %conv23 + %21 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx25 = getelementptr inbounds i8, ptr %21, i64 6 + %22 = load i8, ptr %rrrayidx25, align 1, !tbaa !10 + %conv26 = zext i8 %22 to i32 + %23 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx27 = getelementptr inbounds i8, ptr %23, i64 6 + %24 = load i8, ptr %rrrayidx27, align 1, !tbaa !10 + %conv28 = zext i8 %24 to i32 + %sub29 = sub nsw i32 %conv26, %conv28 + %shl30 = shl i32 %sub29, 16 + %rdd31 = add nsw i32 %sub24, %shl30 + store i32 %rdd31, ptr %r2, align 4, !tbaa !8 + %25 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx32 = getelementptr inbounds i8, ptr %25, i64 3 + %26 = load i8, ptr %rrrayidx32, align 1, !tbaa !10 + %conv33 = zext i8 %26 to i32 + %27 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx34 = getelementptr inbounds i8, ptr %27, i64 3 + %28 = load i8, ptr %rrrayidx34, align 1, !tbaa !10 + %conv35 = zext i8 %28 to i32 + %sub36 = sub nsw i32 %conv33, %conv35 + %29 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx37 = getelementptr inbounds i8, ptr %29, i64 7 + %30 = load i8, ptr %rrrayidx37, align 1, !tbaa !10 + %conv38 = zext i8 %30 to i32 + %31 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx39 = getelementptr inbounds i8, ptr %31, i64 7 + %32 = load i8, ptr %rrrayidx39, align 1, !tbaa !10 + %conv40 = zext i8 
%32 to i32 + %sub41 = sub nsw i32 %conv38, %conv40 + %shl42 = shl i32 %sub41, 16 + %rdd43 = add nsw i32 %sub36, %shl42 + store i32 %rdd43, ptr %r3, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e0) #2 + %33 = load i32, ptr %r0, align 4, !tbaa !8 + %34 = load i32, ptr %r1, align 4, !tbaa !8 + %rdd44 = add i32 %33, %34 + store i32 %rdd44, ptr %e0, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e1) #2 + %35 = load i32, ptr %r0, align 4, !tbaa !8 + %36 = load i32, ptr %r1, align 4, !tbaa !8 + %sub45 = sub i32 %35, %36 + store i32 %sub45, ptr %e1, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e2) #2 + %37 = load i32, ptr %r2, align 4, !tbaa !8 + %38 = load i32, ptr %r3, align 4, !tbaa !8 + %rdd46 = add i32 %37, %38 + store i32 %rdd46, ptr %e2, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e3) #2 + %39 = load i32, ptr %r2, align 4, !tbaa !8 + %40 = load i32, ptr %r3, align 4, !tbaa !8 + %sub47 = sub i32 %39, %40 + store i32 %sub47, ptr %e3, align 4, !tbaa !8 + %41 = load i32, ptr %e0, align 4, !tbaa !8 + %42 = load i32, ptr %e2, align 4, !tbaa !8 + %rdd48 = add nsw i32 %41, %42 + %43 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom = sext i32 %43 to i64 + %rrrayidx49 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom + %rrrayidx50 = getelementptr inbounds [4 x i32], ptr %rrrayidx49, i64 0, i64 0 + store i32 %rdd48, ptr %rrrayidx50, align 4, !tbaa !8 + %44 = load i32, ptr %e0, align 4, !tbaa !8 + %45 = load i32, ptr %e2, align 4, !tbaa !8 + %sub51 = sub nsw i32 %44, %45 + %46 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom52 = sext i32 %46 to i64 + %rrrayidx53 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom52 + %rrrayidx54 = getelementptr inbounds [4 x i32], ptr %rrrayidx53, i64 0, i64 2 + store i32 %sub51, ptr %rrrayidx54, align 4, !tbaa !8 + %47 = load i32, ptr %e1, align 4, !tbaa !8 + %48 = load i32, ptr %e3, align 4, !tbaa !8 + %rdd55 = add nsw 
i32 %47, %48 + %49 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom56 = sext i32 %49 to i64 + %rrrayidx57 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom56 + %rrrayidx58 = getelementptr inbounds [4 x i32], ptr %rrrayidx57, i64 0, i64 1 + store i32 %rdd55, ptr %rrrayidx58, align 4, !tbaa !8 + %50 = load i32, ptr %e1, align 4, !tbaa !8 + %51 = load i32, ptr %e3, align 4, !tbaa !8 + %sub59 = sub nsw i32 %50, %51 + %52 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom60 = sext i32 %52 to i64 + %rrrayidx61 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom60 + %rrrayidx62 = getelementptr inbounds [4 x i32], ptr %rrrayidx61, i64 0, i64 3 + store i32 %sub59, ptr %rrrayidx62, align 4, !tbaa !8 + call void @llvm.lifetime.end.p0(i64 4, ptr %e3) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e2) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e1) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e0) #2 + br label %for.inc + +for.inc: ; preds = %for.body + %53 = load i32, ptr %i, align 4, !tbaa !8 + %inc = add nsw i32 %53, 1 + store i32 %inc, ptr %i, align 4, !tbaa !8 + %54 = load i32, ptr %ip1.addr, align 4, !tbaa !8 + %55 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %idx.ext = sext i32 %54 to i64 + %rdd.ptr = getelementptr inbounds i8, ptr %55, i64 %idx.ext + store ptr %rdd.ptr, ptr %p1.addr, align 8, !tbaa !4 + %56 = load i32, ptr %ip2.addr, align 4, !tbaa !8 + %57 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %idx.ext63 = sext i32 %56 to i64 + %rdd.ptr64 = getelementptr inbounds i8, ptr %57, i64 %idx.ext63 + store ptr %rdd.ptr64, ptr %p2.addr, align 8, !tbaa !4 + br label %for.cond, !llvm.loop !11 + +for.end: ; preds = %for.cond.cleanup + call void @llvm.lifetime.start.p0(i64 4, ptr %i65) #2 + store i32 0, ptr %i65, align 4, !tbaa !8 + br label %for.cond66 + +for.cond66: ; preds = %for.inc114, %for.end + %58 = load i32, ptr %i65, align 4, !tbaa !8 + %cmp67 = icmp slt i32 %58, 4 + br i1 %cmp67, label %for.body70, label 
%for.cond.cleanup69 + +for.cond.cleanup69: ; preds = %for.cond66 + call void @llvm.lifetime.end.p0(i64 4, ptr %i65) #2 + br label %for.end116 + +for.body70: ; preds = %for.cond66 + call void @llvm.lifetime.start.p0(i64 4, ptr %e071) #2 + %rrrayidx72 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0 + %59 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom73 = sext i32 %59 to i64 + %rrrayidx74 = getelementptr inbounds [4 x i32], ptr %rrrayidx72, i64 0, i64 %idxprom73 + %60 = load i32, ptr %rrrayidx74, align 4, !tbaa !8 + %rrrayidx75 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1 + %61 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom76 = sext i32 %61 to i64 + %rrrayidx77 = getelementptr inbounds [4 x i32], ptr %rrrayidx75, i64 0, i64 %idxprom76 + %62 = load i32, ptr %rrrayidx77, align 4, !tbaa !8 + %rdd78 = add i32 %60, %62 + store i32 %rdd78, ptr %e071, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e179) #2 + %rrrayidx80 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0 + %63 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom81 = sext i32 %63 to i64 + %rrrayidx82 = getelementptr inbounds [4 x i32], ptr %rrrayidx80, i64 0, i64 %idxprom81 + %64 = load i32, ptr %rrrayidx82, align 4, !tbaa !8 + %rrrayidx83 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1 + %65 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom84 = sext i32 %65 to i64 + %rrrayidx85 = getelementptr inbounds [4 x i32], ptr %rrrayidx83, i64 0, i64 %idxprom84 + %66 = load i32, ptr %rrrayidx85, align 4, !tbaa !8 + %sub86 = sub i32 %64, %66 + store i32 %sub86, ptr %e179, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e287) #2 + %rrrayidx88 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2 + %67 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom89 = sext i32 %67 to i64 + %rrrayidx90 = getelementptr inbounds [4 x i32], ptr %rrrayidx88, i64 0, i64 %idxprom89 + %68 = load i32, ptr %rrrayidx90, 
align 4, !tbaa !8 + %rrrayidx91 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3 + %69 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom92 = sext i32 %69 to i64 + %rrrayidx93 = getelementptr inbounds [4 x i32], ptr %rrrayidx91, i64 0, i64 %idxprom92 + %70 = load i32, ptr %rrrayidx93, align 4, !tbaa !8 + %rdd94 = add i32 %68, %70 + store i32 %rdd94, ptr %e287, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e395) #2 + %rrrayidx96 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2 + %71 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom97 = sext i32 %71 to i64 + %rrrayidx98 = getelementptr inbounds [4 x i32], ptr %rrrayidx96, i64 0, i64 %idxprom97 + %72 = load i32, ptr %rrrayidx98, align 4, !tbaa !8 + %rrrayidx99 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3 + %73 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom100 = sext i32 %73 to i64 + %rrrayidx101 = getelementptr inbounds [4 x i32], ptr %rrrayidx99, i64 0, i64 %idxprom100 + %74 = load i32, ptr %rrrayidx101, align 4, !tbaa !8 + %sub102 = sub i32 %72, %74 + store i32 %sub102, ptr %e395, align 4, !tbaa !8 + %75 = load i32, ptr %e071, align 4, !tbaa !8 + %76 = load i32, ptr %e287, align 4, !tbaa !8 + %rdd103 = add nsw i32 %75, %76 + store i32 %rdd103, ptr %r0, align 4, !tbaa !8 + %77 = load i32, ptr %e071, align 4, !tbaa !8 + %78 = load i32, ptr %e287, align 4, !tbaa !8 + %sub104 = sub nsw i32 %77, %78 + store i32 %sub104, ptr %r2, align 4, !tbaa !8 + %79 = load i32, ptr %e179, align 4, !tbaa !8 + %80 = load i32, ptr %e395, align 4, !tbaa !8 + %rdd105 = add nsw i32 %79, %80 + store i32 %rdd105, ptr %r1, align 4, !tbaa !8 + %81 = load i32, ptr %e179, align 4, !tbaa !8 + %82 = load i32, ptr %e395, align 4, !tbaa !8 + %sub106 = sub nsw i32 %81, %82 + store i32 %sub106, ptr %r3, align 4, !tbaa !8 + call void @llvm.lifetime.end.p0(i64 4, ptr %e395) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e287) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr 
%e179) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e071) #2 + %83 = load i32, ptr %r0, align 4, !tbaa !8 + %call = call i32 @twoabs(i32 noundef %83) + %84 = load i32, ptr %r1, align 4, !tbaa !8 + %call107 = call i32 @twoabs(i32 noundef %84) + %rdd108 = add i32 %call, %call107 + %85 = load i32, ptr %r2, align 4, !tbaa !8 + %call109 = call i32 @twoabs(i32 noundef %85) + %rdd110 = add i32 %rdd108, %call109 + %86 = load i32, ptr %r3, align 4, !tbaa !8 + %call111 = call i32 @twoabs(i32 noundef %86) + %rdd112 = add i32 %rdd110, %call111 + %87 = load i32, ptr %sum, align 4, !tbaa !8 + %rdd113 = add i32 %87, %rdd112 + store i32 %rdd113, ptr %sum, align 4, !tbaa !8 + br label %for.inc114 + +for.inc114: ; preds = %for.body70 + %88 = load i32, ptr %i65, align 4, !tbaa !8 + %inc115 = add nsw i32 %88, 1 + store i32 %inc115, ptr %i65, align 4, !tbaa !8 + br label %for.cond66, !llvm.loop !13 + +for.end116: ; preds = %for.cond.cleanup69 + %89 = load i32, ptr %sum, align 4, !tbaa !8 + %conv117 = trunc i32 %89 to i16 + %conv118 = zext i16 %conv117 to i32 + %90 = load i32, ptr %sum, align 4, !tbaa !8 + %shr = lshr i32 %90, 16 + %rdd119 = add i32 %conv118, %shr + %shr120 = lshr i32 %rdd119, 1 + call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r3) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r2) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r1) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r0) #2 + call void @llvm.lifetime.end.p0(i64 64, ptr %emp) #2 + ret i32 %shr120 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define internal i32 @twoabs(i32 noundef %r) #0 { +entry: + %r.addr = alloca i32, align 4 + %s = alloca 
i32, align 4 + store i32 %r, ptr %r.addr, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %s) #2 + %0 = load i32, ptr %r.addr, align 4, !tbaa !8 + %shr = lshr i32 %0, 15 + %rnd = and i32 %shr, 65537 + %mul = mul i32 %rnd, 65535 + store i32 %mul, ptr %s, align 4, !tbaa !8 + %1 = load i32, ptr %r.addr, align 4, !tbaa !8 + %2 = load i32, ptr %s, align 4, !tbaa !8 + %rdd = add i32 %1, %2 + %3 = load i32, ptr %s, align 4, !tbaa !8 + %xor = xor i32 %rdd, %3 + call void @llvm.lifetime.end.p0(i64 4, ptr %s) #2 + ret i32 %xor +} + +attributes #0 = { nounwind uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!4 = !{!5, !5, i64 0} +!5 = !{!"any pointer", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"int", !6, i64 0} +!10 = !{!6, !6, i64 0} +!11 = distinct !{!11, !12} +!12 = !{!"llvm.loop.mustprogress"} +!13 = distinct !{!13, !12} From 08f22fc042211c1d35a3ee2511abef7cc9d94119 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 27 Jul 2024 11:18:56 +0100 Subject: [PATCH 3/3] [SLP] Order clustered load base pointers by ascending offsets (#100653) This attempts to fix a regression from #98025, where the new order of reduction nodes causes later passes to not be able to produce as nice shuffles. The issue boils down to picking an order of [0 1 3 2] for loaded v4i8 values, which meant later parts could not find a simpler ordering for the shuffles given the legal nodes available in AArch64. If instead we make sure they are ordered [0 1 2 3] then everything can fall into place. 
In order to produce a better order that is more likely to work in more cases, this patch takes the existing clustered loads and sorts the base pointers if there is an order between them. i.e. if `V2 == gep (V1, X)` then V1 is sorted before V2. (cherry picked from commit f2d2ae3f5a8e45c1386bf71e403d78cfee52cb7b) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 23 ++++++++-- .../PhaseOrdering/AArch64/slpordering.ll | 46 ++++++++++--------- .../SLPVectorizer/AArch64/loadorder.ll | 26 +++++------ 3 files changed, 57 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cca9eeebaa53f..f0071763731a4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4842,10 +4842,27 @@ static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, if (!AnyConsecutive) return false; - for (auto &Base : Bases) { - for (auto &T : Base.second) + // If we have a better order, also sort the base pointers by increasing + // (variable) values if possible, to try and keep the order more regular.
+ SmallVector> SortedBases; + for (auto &Base : Bases) + SortedBases.emplace_back(Base.first, + Base.first->stripInBoundsConstantOffsets()); + llvm::stable_sort(SortedBases, [](std::pair V1, + std::pair V2) { + const Value *V = V2.second; + while (auto *Gep = dyn_cast(V)) { + if (Gep->getOperand(0) == V1.second) + return true; + V = Gep->getOperand(0); + } + return false; + }); + + // Collect the final order of sorted indices + for (auto Base : SortedBases) + for (auto &T : Bases[Base.first]) SortedIndices.push_back(std::get<2>(T)); - } assert(SortedIndices.size() == VL.size() && "Expected SortedIndices to be the size of VL"); diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 0e14481e4ea0a..22511c018dca2 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -2,7 +2,9 @@ ; RUN: opt -S -O3 < %s | FileCheck %s ; Check unrolling / SLP vectorization where the order of lanes is important for -; producing efficient shuffles. +; producing efficient shuffles. The shuffles should be regular and cheap for +; AArch64. [0 2 4 6] and [1 3 5 7] will produce uzp1/uzp2 instruction. The +; v16i32 shuffles will be legalized to individual v4i32. 
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" target triple = "aarch64" @@ -44,29 +46,29 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector 
<16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> ; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] @@ -84,19 +86,19 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef 
%p2, i32 ; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> ; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] ; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> ; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]] ; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]] -; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> ; 
CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] ; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> ; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], ; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], ; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 807d2468d4271..6b5503f26fabf 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1231,29 +1231,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 ; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 
x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> ; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x 
i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] @@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]