diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 45148449dfb82..7de813f603264 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4065,15 +4065,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost( NMask.push_back(MaskElt % LTNumElts); } // If the sub-mask has at most 2 input sub-vectors then re-cost it using - // getShuffleCost. If not then cost it using the worst case. + // getShuffleCost. If not then cost it using the worst case as the number + // of element moves into a new vector. if (NumSources <= 2) Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, NTp, NMask, CostKind, 0, nullptr, Args, CxtI); - else if (any_of(enumerate(NMask), [&](const auto &ME) { - return ME.value() % LTNumElts == ME.index(); - })) - Cost += LTNumElts - 1; else Cost += LTNumElts; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cca9eeebaa53f..f0071763731a4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4842,10 +4842,27 @@ static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, if (!AnyConsecutive) return false; - for (auto &Base : Bases) { - for (auto &T : Base.second) + // If we have a better order, also sort the base pointers by increasing + // (variable) values if possible, to try and keep the order more regular. + SmallVector> SortedBases; + for (auto &Base : Bases) + SortedBases.emplace_back(Base.first, + Base.first->stripInBoundsConstantOffsets()); + llvm::stable_sort(SortedBases, [](std::pair V1, + std::pair V2) { + const Value *V = V2.second; + while (auto *Gep = dyn_cast(V)) { + if (Gep->getOperand(0) == V1.second) + return true; + V = Gep->getOperand(0); + } + return false; + }); + + // Collect the final order of sorted indices + for (auto Base : SortedBases) + for (auto &T : Bases[Base.first]) SortedIndices.push_back(std::get<2>(T)); - } assert(SortedIndices.size() == VL.size() && "Expected SortedIndices to be the size of VL"); diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index d67f056366104..7f4030a81e749 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -367,7 +367,7 @@ define void @multipart() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> @@ -409,15 +409,15 @@ define void @vst3(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> @@ -453,15 +453,15 @@ define void @vst4(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll new file mode 100644 index 0000000000000..22511c018dca2 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -0,0 +1,484 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -O3 < %s | FileCheck %s + +; Check unrolling / SLP vectorization where the order of lanes is important for +; producing efficient shuffles. The shuffles should be regular and cheap for +; AArch64. [0 2 4 6] and [1 3 5 7] will produce uzp1/uzp2 instruction. The +; v16i32 shuffles will be legalized to individual v4i32. + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64" + +; Function Attrs: nounwind uwtable +define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 { +; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering +; CHECK-SAME: (ptr nocapture noundef readonly [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IP1]] to i64 +; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 +; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]] +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], +; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], +; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], +; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16 +; CHECK-NEXT: [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] +; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[RDD119]], 1 +; CHECK-NEXT: ret i32 [[SHR120]] +; +entry: + %p1.addr = alloca ptr, align 8 + %ip1.addr = alloca i32, align 4 + %p2.addr = alloca ptr, align 8 + %ip2.addr = alloca i32, align 4 + %emp = alloca [4 x [4 x i32]], align 4 + %r0 = alloca i32, align 4 + %r1 = alloca i32, align 4 + %r2 = alloca i32, align 4 + %r3 = alloca i32, align 4 + %sum = alloca i32, align 4 + %i = alloca i32, align 4 + %e0 = alloca i32, align 4 + %e1 = alloca i32, align 4 + %e2 = alloca i32, align 4 + %e3 = alloca i32, align 4 + %i65 = alloca i32, align 4 + %e071 = alloca i32, align 4 + %e179 = alloca i32, align 4 + %e287 = alloca i32, align 4 + %e395 = alloca i32, align 4 + store ptr %p1, ptr %p1.addr, align 8, !tbaa !4 + store i32 %ip1, ptr %ip1.addr, align 4, !tbaa !8 + store ptr %p2, ptr %p2.addr, align 8, !tbaa !4 + store i32 %ip2, ptr %ip2.addr, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 64, ptr %emp) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r0) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r1) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r2) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %r3) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #2 + store i32 0, ptr %sum, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2 + store i32 0, ptr %i, align 4, !tbaa !8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !tbaa !8 + %cmp = icmp slt i32 %0, 4 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2 + br label %for.end + +for.body: ; preds = %for.cond + %1 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx = getelementptr inbounds i8, ptr %1, i64 0 + %2 = load i8, ptr %rrrayidx, align 1, !tbaa !10 + %conv = zext i8 %2 to i32 + %3 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx1 = getelementptr inbounds i8, ptr %3, i64 0 + %4 = load i8, ptr %rrrayidx1, align 1, !tbaa !10 + %conv2 = zext i8 %4 to i32 + %sub = sub nsw i32 %conv, %conv2 + %5 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx3 = getelementptr inbounds i8, ptr %5, i64 4 + %6 = load i8, ptr %rrrayidx3, align 1, !tbaa !10 + %conv4 = zext i8 %6 to i32 + %7 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx5 = getelementptr inbounds i8, ptr %7, i64 4 + %8 = load i8, ptr %rrrayidx5, align 1, !tbaa !10 + %conv6 = zext i8 %8 to i32 + %sub7 = sub nsw i32 %conv4, %conv6 + %shl = shl i32 %sub7, 16 + %rdd = add nsw i32 %sub, %shl + store i32 %rdd, ptr %r0, align 4, !tbaa !8 + %9 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx8 = getelementptr inbounds i8, ptr %9, i64 1 + %10 = load i8, ptr %rrrayidx8, align 1, !tbaa !10 + %conv9 = zext i8 %10 to i32 + %11 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx10 = getelementptr inbounds i8, ptr %11, i64 1 + %12 = load i8, ptr %rrrayidx10, align 1, !tbaa !10 + %conv11 = zext i8 %12 to i32 + %sub12 = sub nsw i32 %conv9, %conv11 + %13 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx13 = getelementptr inbounds i8, ptr %13, i64 5 + %14 = load i8, ptr %rrrayidx13, align 1, !tbaa !10 + %conv14 = zext i8 %14 to i32 + %15 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx15 = getelementptr inbounds i8, ptr %15, i64 5 + %16 = load i8, ptr %rrrayidx15, align 1, !tbaa !10 + %conv16 = zext i8 %16 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %shl18 = shl i32 %sub17, 16 + %rdd19 = add nsw i32 %sub12, %shl18 + store i32 %rdd19, ptr %r1, align 4, !tbaa !8 + %17 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx20 = getelementptr inbounds i8, ptr %17, i64 2 + %18 = load i8, ptr %rrrayidx20, align 1, !tbaa !10 + %conv21 = zext i8 %18 to i32 + %19 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx22 = getelementptr inbounds i8, ptr %19, i64 2 + %20 = load i8, ptr %rrrayidx22, align 1, !tbaa !10 + %conv23 = zext i8 %20 to i32 + %sub24 = sub nsw i32 %conv21, %conv23 + %21 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx25 = getelementptr inbounds i8, ptr %21, i64 6 + %22 = load i8, ptr %rrrayidx25, align 1, !tbaa !10 + %conv26 = zext i8 %22 to i32 + %23 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx27 = getelementptr inbounds i8, ptr %23, i64 6 + %24 = load i8, ptr %rrrayidx27, align 1, !tbaa !10 + %conv28 = zext i8 %24 to i32 + %sub29 = sub nsw i32 %conv26, %conv28 + %shl30 = shl i32 %sub29, 16 + %rdd31 = add nsw i32 %sub24, %shl30 + store i32 %rdd31, ptr %r2, align 4, !tbaa !8 + %25 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx32 = getelementptr inbounds i8, ptr %25, i64 3 + %26 = load i8, ptr %rrrayidx32, align 1, !tbaa !10 + %conv33 = zext i8 %26 to i32 + %27 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx34 = getelementptr inbounds i8, ptr %27, i64 3 + %28 = load i8, ptr %rrrayidx34, align 1, !tbaa !10 + %conv35 = zext i8 %28 to i32 + %sub36 = sub nsw i32 %conv33, %conv35 + %29 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %rrrayidx37 = getelementptr inbounds i8, ptr %29, i64 7 + %30 = load i8, ptr %rrrayidx37, align 1, !tbaa !10 + %conv38 = zext i8 %30 to i32 + %31 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %rrrayidx39 = getelementptr inbounds i8, ptr %31, i64 7 + %32 = load i8, ptr %rrrayidx39, align 1, !tbaa !10 + %conv40 = zext i8 %32 to i32 + %sub41 = sub nsw i32 %conv38, %conv40 + %shl42 = shl i32 %sub41, 16 + %rdd43 = add nsw i32 %sub36, %shl42 + store i32 %rdd43, ptr %r3, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e0) #2 + %33 = load i32, ptr %r0, align 4, !tbaa !8 + %34 = load i32, ptr %r1, align 4, !tbaa !8 + %rdd44 = add i32 %33, %34 + store i32 %rdd44, ptr %e0, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e1) #2 + %35 = load i32, ptr %r0, align 4, !tbaa !8 + %36 = load i32, ptr %r1, align 4, !tbaa !8 + %sub45 = sub i32 %35, %36 + store i32 %sub45, ptr %e1, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e2) #2 + %37 = load i32, ptr %r2, align 4, !tbaa !8 + %38 = load i32, ptr %r3, align 4, !tbaa !8 + %rdd46 = add i32 %37, %38 + store i32 %rdd46, ptr %e2, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e3) #2 + %39 = load i32, ptr %r2, align 4, !tbaa !8 + %40 = load i32, ptr %r3, align 4, !tbaa !8 + %sub47 = sub i32 %39, %40 + store i32 %sub47, ptr %e3, align 4, !tbaa !8 + %41 = load i32, ptr %e0, align 4, !tbaa !8 + %42 = load i32, ptr %e2, align 4, !tbaa !8 + %rdd48 = add nsw i32 %41, %42 + %43 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom = sext i32 %43 to i64 + %rrrayidx49 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom + %rrrayidx50 = getelementptr inbounds [4 x i32], ptr %rrrayidx49, i64 0, i64 0 + store i32 %rdd48, ptr %rrrayidx50, align 4, !tbaa !8 + %44 = load i32, ptr %e0, align 4, !tbaa !8 + %45 = load i32, ptr %e2, align 4, !tbaa !8 + %sub51 = sub nsw i32 %44, %45 + %46 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom52 = sext i32 %46 to i64 + %rrrayidx53 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom52 + %rrrayidx54 = getelementptr inbounds [4 x i32], ptr %rrrayidx53, i64 0, i64 2 + store i32 %sub51, ptr %rrrayidx54, align 4, !tbaa !8 + %47 = load i32, ptr %e1, align 4, !tbaa !8 + %48 = load i32, ptr %e3, align 4, !tbaa !8 + %rdd55 = add nsw i32 %47, %48 + %49 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom56 = sext i32 %49 to i64 + %rrrayidx57 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom56 + %rrrayidx58 = getelementptr inbounds [4 x i32], ptr %rrrayidx57, i64 0, i64 1 + store i32 %rdd55, ptr %rrrayidx58, align 4, !tbaa !8 + %50 = load i32, ptr %e1, align 4, !tbaa !8 + %51 = load i32, ptr %e3, align 4, !tbaa !8 + %sub59 = sub nsw i32 %50, %51 + %52 = load i32, ptr %i, align 4, !tbaa !8 + %idxprom60 = sext i32 %52 to i64 + %rrrayidx61 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom60 + %rrrayidx62 = getelementptr inbounds [4 x i32], ptr %rrrayidx61, i64 0, i64 3 + store i32 %sub59, ptr %rrrayidx62, align 4, !tbaa !8 + call void @llvm.lifetime.end.p0(i64 4, ptr %e3) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e2) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e1) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e0) #2 + br label %for.inc + +for.inc: ; preds = %for.body + %53 = load i32, ptr %i, align 4, !tbaa !8 + %inc = add nsw i32 %53, 1 + store i32 %inc, ptr %i, align 4, !tbaa !8 + %54 = load i32, ptr %ip1.addr, align 4, !tbaa !8 + %55 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %idx.ext = sext i32 %54 to i64 + %rdd.ptr = getelementptr inbounds i8, ptr %55, i64 %idx.ext + store ptr %rdd.ptr, ptr %p1.addr, align 8, !tbaa !4 + %56 = load i32, ptr %ip2.addr, align 4, !tbaa !8 + %57 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %idx.ext63 = sext i32 %56 to i64 + %rdd.ptr64 = getelementptr inbounds i8, ptr %57, i64 %idx.ext63 + store ptr %rdd.ptr64, ptr %p2.addr, align 8, !tbaa !4 + br label %for.cond, !llvm.loop !11 + +for.end: ; preds = %for.cond.cleanup + call void @llvm.lifetime.start.p0(i64 4, ptr %i65) #2 + store i32 0, ptr %i65, align 4, !tbaa !8 + br label %for.cond66 + +for.cond66: ; preds = %for.inc114, %for.end + %58 = load i32, ptr %i65, align 4, !tbaa !8 + %cmp67 = icmp slt i32 %58, 4 + br i1 %cmp67, label %for.body70, label %for.cond.cleanup69 + +for.cond.cleanup69: ; preds = %for.cond66 + call void @llvm.lifetime.end.p0(i64 4, ptr %i65) #2 + br label %for.end116 + +for.body70: ; preds = %for.cond66 + call void @llvm.lifetime.start.p0(i64 4, ptr %e071) #2 + %rrrayidx72 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0 + %59 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom73 = sext i32 %59 to i64 + %rrrayidx74 = getelementptr inbounds [4 x i32], ptr %rrrayidx72, i64 0, i64 %idxprom73 + %60 = load i32, ptr %rrrayidx74, align 4, !tbaa !8 + %rrrayidx75 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1 + %61 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom76 = sext i32 %61 to i64 + %rrrayidx77 = getelementptr inbounds [4 x i32], ptr %rrrayidx75, i64 0, i64 %idxprom76 + %62 = load i32, ptr %rrrayidx77, align 4, !tbaa !8 + %rdd78 = add i32 %60, %62 + store i32 %rdd78, ptr %e071, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e179) #2 + %rrrayidx80 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0 + %63 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom81 = sext i32 %63 to i64 + %rrrayidx82 = getelementptr inbounds [4 x i32], ptr %rrrayidx80, i64 0, i64 %idxprom81 + %64 = load i32, ptr %rrrayidx82, align 4, !tbaa !8 + %rrrayidx83 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1 + %65 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom84 = sext i32 %65 to i64 + %rrrayidx85 = getelementptr inbounds [4 x i32], ptr %rrrayidx83, i64 0, i64 %idxprom84 + %66 = load i32, ptr %rrrayidx85, align 4, !tbaa !8 + %sub86 = sub i32 %64, %66 + store i32 %sub86, ptr %e179, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e287) #2 + %rrrayidx88 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2 + %67 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom89 = sext i32 %67 to i64 + %rrrayidx90 = getelementptr inbounds [4 x i32], ptr %rrrayidx88, i64 0, i64 %idxprom89 + %68 = load i32, ptr %rrrayidx90, align 4, !tbaa !8 + %rrrayidx91 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3 + %69 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom92 = sext i32 %69 to i64 + %rrrayidx93 = getelementptr inbounds [4 x i32], ptr %rrrayidx91, i64 0, i64 %idxprom92 + %70 = load i32, ptr %rrrayidx93, align 4, !tbaa !8 + %rdd94 = add i32 %68, %70 + store i32 %rdd94, ptr %e287, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %e395) #2 + %rrrayidx96 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2 + %71 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom97 = sext i32 %71 to i64 + %rrrayidx98 = getelementptr inbounds [4 x i32], ptr %rrrayidx96, i64 0, i64 %idxprom97 + %72 = load i32, ptr %rrrayidx98, align 4, !tbaa !8 + %rrrayidx99 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3 + %73 = load i32, ptr %i65, align 4, !tbaa !8 + %idxprom100 = sext i32 %73 to i64 + %rrrayidx101 = getelementptr inbounds [4 x i32], ptr %rrrayidx99, i64 0, i64 %idxprom100 + %74 = load i32, ptr %rrrayidx101, align 4, !tbaa !8 + %sub102 = sub i32 %72, %74 + store i32 %sub102, ptr %e395, align 4, !tbaa !8 + %75 = load i32, ptr %e071, align 4, !tbaa !8 + %76 = load i32, ptr %e287, align 4, !tbaa !8 + %rdd103 = add nsw i32 %75, %76 + store i32 %rdd103, ptr %r0, align 4, !tbaa !8 + %77 = load i32, ptr %e071, align 4, !tbaa !8 + %78 = load i32, ptr %e287, align 4, !tbaa !8 + %sub104 = sub nsw i32 %77, %78 + store i32 %sub104, ptr %r2, align 4, !tbaa !8 + %79 = load i32, ptr %e179, align 4, !tbaa !8 + %80 = load i32, ptr %e395, align 4, !tbaa !8 + %rdd105 = add nsw i32 %79, %80 + store i32 %rdd105, ptr %r1, align 4, !tbaa !8 + %81 = load i32, ptr %e179, align 4, !tbaa !8 + %82 = load i32, ptr %e395, align 4, !tbaa !8 + %sub106 = sub nsw i32 %81, %82 + store i32 %sub106, ptr %r3, align 4, !tbaa !8 + call void @llvm.lifetime.end.p0(i64 4, ptr %e395) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e287) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e179) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %e071) #2 + %83 = load i32, ptr %r0, align 4, !tbaa !8 + %call = call i32 @twoabs(i32 noundef %83) + %84 = load i32, ptr %r1, align 4, !tbaa !8 + %call107 = call i32 @twoabs(i32 noundef %84) + %rdd108 = add i32 %call, %call107 + %85 = load i32, ptr %r2, align 4, !tbaa !8 + %call109 = call i32 @twoabs(i32 noundef %85) + %rdd110 = add i32 %rdd108, %call109 + %86 = load i32, ptr %r3, align 4, !tbaa !8 + %call111 = call i32 @twoabs(i32 noundef %86) + %rdd112 = add i32 %rdd110, %call111 + %87 = load i32, ptr %sum, align 4, !tbaa !8 + %rdd113 = add i32 %87, %rdd112 + store i32 %rdd113, ptr %sum, align 4, !tbaa !8 + br label %for.inc114 + +for.inc114: ; preds = %for.body70 + %88 = load i32, ptr %i65, align 4, !tbaa !8 + %inc115 = add nsw i32 %88, 1 + store i32 %inc115, ptr %i65, align 4, !tbaa !8 + br label %for.cond66, !llvm.loop !13 + +for.end116: ; preds = %for.cond.cleanup69 + %89 = load i32, ptr %sum, align 4, !tbaa !8 + %conv117 = trunc i32 %89 to i16 + %conv118 = zext i16 %conv117 to i32 + %90 = load i32, ptr %sum, align 4, !tbaa !8 + %shr = lshr i32 %90, 16 + %rdd119 = add i32 %conv118, %shr + %shr120 = lshr i32 %rdd119, 1 + call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r3) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r2) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r1) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr %r0) #2 + call void @llvm.lifetime.end.p0(i64 64, ptr %emp) #2 + ret i32 %shr120 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define internal i32 @twoabs(i32 noundef %r) #0 { +entry: + %r.addr = alloca i32, align 4 + %s = alloca i32, align 4 + store i32 %r, ptr %r.addr, align 4, !tbaa !8 + call void @llvm.lifetime.start.p0(i64 4, ptr %s) #2 + %0 = load i32, ptr %r.addr, align 4, !tbaa !8 + %shr = lshr i32 %0, 15 + %rnd = and i32 %shr, 65537 + %mul = mul i32 %rnd, 65535 + store i32 %mul, ptr %s, align 4, !tbaa !8 + %1 = load i32, ptr %r.addr, align 4, !tbaa !8 + %2 = load i32, ptr %s, align 4, !tbaa !8 + %rdd = add i32 %1, %2 + %3 = load i32, ptr %s, align 4, !tbaa !8 + %xor = xor i32 %rdd, %3 + call void @llvm.lifetime.end.p0(i64 4, ptr %s) #2 + ret i32 %xor +} + +attributes #0 = { nounwind uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!4 = !{!5, !5, i64 0} +!5 = !{!"any pointer", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"int", !6, i64 0} +!10 = !{!6, !6, i64 0} +!11 = distinct !{!11, !12} +!12 = !{!"llvm.loop.mustprogress"} +!13 = distinct !{!13, !12} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 807d2468d4271..6b5503f26fabf 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1231,29 +1231,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 ; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> ; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] @@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]] diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index b49f3c9f3eeb2..775ad4c5ecc36 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -22,13 +22,11 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) { define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test1_reduce( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]] +; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]]) ; CHECK-NEXT: ret i32 [[R]] ; @@ -741,70 +739,70 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], -; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]] -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[TMP57]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP56]], [[TMP58]] +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], +; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], +; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], +; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]]