diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89f63c3b66aad..d3c22dea72efb 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -343,6 +343,9 @@ class Vectorizer {
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  /// Propagates the best alignment in a chain of contiguous accesses.
+  void propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const;
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -716,6 +719,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
   unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
   unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
 
+  // We know that the accesses are contiguous. Propagate alignment
+  // information so that slices of the chain can still be vectorized.
+  propagateBestAlignmentsInChain(C);
+  LLVM_DEBUG({
+    dbgs() << "LSV: Chain after alignment propagation:\n";
+    dumpChain(C);
+  });
+
   std::vector<Chain> Ret;
   for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
     // Find candidate chains of size not greater than the largest vector reg.
@@ -823,6 +834,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
                         << Alignment.value() << " to " << NewAlign.value()
                         << "\n");
       Alignment = NewAlign;
+      setLoadStoreAlignment(C[CBegin].Inst, Alignment);
     }
   }
 
@@ -880,14 +892,6 @@ bool Vectorizer::vectorizeChain(Chain &C) {
       VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
 
   Align Alignment = getLoadStoreAlignment(C[0].Inst);
-  // If this is a load/store of an alloca, we might have upgraded the alloca's
-  // alignment earlier. Get the new alignment.
-  if (AS == DL.getAllocaAddrSpace()) {
-    Alignment = std::max(
-        Alignment,
-        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
-                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
-  }
 
   // All elements of the chain must have the same scalar-type size.
 #ifndef NDEBUG
@@ -1634,3 +1638,32 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
             .sextOrTrunc(OrigBitWidth);
   return std::nullopt;
 }
+
+void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
+  auto PropagateAlignments = [](auto ChainIt) {
+    ChainElem BestAlignedElem = *ChainIt.begin();
+    Align BestAlignSoFar = getLoadStoreAlignment(BestAlignedElem.Inst);
+
+    for (const ChainElem &E : ChainIt) {
+      Align OrigAlign = getLoadStoreAlignment(E.Inst);
+      if (OrigAlign > BestAlignSoFar) {
+        BestAlignedElem = E;
+        BestAlignSoFar = OrigAlign;
+        continue;
+      }
+
+      APInt DeltaFromBestAlignedElem =
+          APIntOps::abdu(E.OffsetFromLeader, BestAlignedElem.OffsetFromLeader);
+      // commonAlignment is equivalent to a greatest common power-of-two
+      // divisor; it returns the largest power of 2 that divides both A and B.
+      Align NewAlign = commonAlignment(
+          BestAlignSoFar, DeltaFromBestAlignedElem.getLimitedValue());
+      if (NewAlign > OrigAlign)
+        setLoadStoreAlignment(E.Inst, NewAlign);
+    }
+  };
+
+  // Propagate forwards and backwards.
+  PropagateAlignments(C);
+  PropagateAlignments(reverse(C));
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
index fe8a7e58a6a57..0931caa1fde8a 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -155,7 +155,7 @@ define void @variadics1(ptr %vlist) {
 ; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
 ; CHECK-NEXT: [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
 ; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 8
 ; CHECK-NEXT: [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT: [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; CHECK-NEXT: [[X5:%.*]] = fadd double [[X42]], [[X31]]
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
new file mode 100644
index 0000000000000..aeface5f91abd
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
@@ -0,0 +1,450 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; The IR has the first float3 labeled with align 16, and that 16 should
+; be propagated such that the second set of 4 values
+; can also be vectorized together.
+%struct.float3 = type { float, float, float }
+%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
+
+define void @testStore(ptr %1) {
+; CHECK-LABEL: define void @testStore(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+  store float 0.000000e+00, ptr %1, align 16
+  %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
+  store float 0.000000e+00, ptr %getElem, align 4
+  %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
+  store float 0.000000e+00, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
+  store float 0.000000e+00, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
+  store float 0.000000e+00, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
+  store float 0.000000e+00, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
+  store i32 0, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
+  store i32 0, ptr %getElem13, align 4
+  ret void
+}
+
+define void @testLoad(ptr %1) {
+; CHECK-LABEL: define void @testLoad(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: ret void
+;
+  %l1 = load float, ptr %1, align 16
+  %getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
+  %l2 = load float, ptr %getElem, align 4
+  %getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
+  %l3 = load float, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
+  %l4 = load float, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
+  %l5 = load float, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
+  %l6 = load float, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
+  %l7 = load i32, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
+  %l8 = load i32, ptr %getElem13, align 4
+  ret void
+}
+
+; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
+
+define void @testStorei8(ptr %1) {
+; CHECK-LABEL: define void @testStorei8(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+  store float 0.000000e+00, ptr %1, align 16
+  %getElem = getelementptr inbounds i8, ptr %1, i64 4
+  store float 0.000000e+00, ptr %getElem, align 4
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
+  store float 0.000000e+00, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
+  store float 0.000000e+00, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
+  store float 0.000000e+00, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
+  store float 0.000000e+00, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
+  store i32 0, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
+  store i32 0, ptr %getElem13, align 4
+  ret void
+}
+
+define void @testLoadi8(ptr %1) {
+; CHECK-LABEL: define void @testLoadi8(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: ret void
+;
+  %l1 = load float, ptr %1, align 16
+  %getElem = getelementptr inbounds i8, ptr %1, i64 4
+  %l2 = load float, ptr %getElem, align 4
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 8
+  %l3 = load float, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 12
+  %l4 = load float, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 16
+  %l5 = load float, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 20
+  %l6 = load float, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 24
+  %l7 = load i32, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 28
+  %l8 = load i32, ptr %getElem13, align 4
+  ret void
+}
+
+
+; This version of the test adjusts the struct to hold two i32s at the beginning,
+; but still assumes that the first float3 is 16 aligned. If the alignment
+; propagation works correctly, it should be able to load this struct in three
+; loads: a 2x32, a 4x32, and a 4x32. Without the alignment propagation, the last
+; 4x32 will instead be a 2x32 and a 2x32.
+%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
+
+define void @testStore_2(ptr %1) {
+; CHECK-LABEL: define void @testStore_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+  store i32 0, ptr %1, align 8
+  %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
+  store i32 0, ptr %getElem, align 4
+  %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
+  store float 0.000000e+00, ptr %getElem1, align 16
+  %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
+  store float 0.000000e+00, ptr %getElem2, align 4
+  %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
+  store float 0.000000e+00, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
+  store float 0.000000e+00, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
+  store float 0.000000e+00, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
+  store float 0.000000e+00, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
+  store i32 0, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
+  store i32 0, ptr %getElem13, align 4
+  ret void
+}
+
+define void @testLoad_2(ptr %1) {
+; CHECK-LABEL: define void @testLoad_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: ret void
+;
+  %l = load i32, ptr %1, align 8
+  %getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
+  %l2 = load i32, ptr %getElem, align 4
+  %getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
+  %l3 = load float, ptr %getElem1, align 16
+  %getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
+  %l4 = load float, ptr %getElem2, align 4
+  %getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
+  %l5 = load float, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
+  %l6 = load float, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
+  %l7 = load float, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
+  %l8 = load float, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
+  %l9 = load i32, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
+  %l0 = load i32, ptr %getElem13, align 4
+  ret void
+}
+
+; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
+
+define void @testStorei8_2(ptr %1) {
+; CHECK-LABEL: define void @testStorei8_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: ret void
+;
+  store i32 0, ptr %1, align 8
+  %getElem = getelementptr inbounds i8, ptr %1, i64 4
+  store i32 0, ptr %getElem, align 4
+  %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+  store float 0.000000e+00, ptr %getElem1, align 16
+  %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+  store float 0.000000e+00, ptr %getElem2, align 4
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+  store float 0.000000e+00, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+  store float 0.000000e+00, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+  store float 0.000000e+00, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+  store float 0.000000e+00, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+  store i32 0, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+  store i32 0, ptr %getElem13, align 4
+  ret void
+}
+
+define void @testLoadi8_2(ptr %1) {
+; CHECK-LABEL: define void @testLoadi8_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: ret void
+;
+  %l = load i32, ptr %1, align 8
+  %getElem = getelementptr inbounds i8, ptr %1, i64 4
+  %l2 = load i32, ptr %getElem, align 4
+  %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+  %l3 = load float, ptr %getElem1, align 16
+  %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+  %l4 = load float, ptr %getElem2, align 4
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+  %l5 = load float, ptr %getElem8, align 8
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+  %l6 = load float, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+  %l7 = load float, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+  %l8 = load float, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+  %l9 = load i32, ptr %getElem12, align 8
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+  %l0 = load i32, ptr %getElem13, align 4
+  ret void
+}
+
+; Test that the alignment propagation works both forwards and backwards.
+; With the "align 16" placed where it is,
+; we should end up with a v2 followed by two v4s followed by a v2.
+define void @test_forward_and_reverse(ptr %1) {
+; CHECK-LABEL: define void @test_forward_and_reverse(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 16
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; CHECK-NEXT: ret void
+;
+  %l = load i32, ptr %1, align 4
+  %getElem = getelementptr inbounds i8, ptr %1, i64 4
+  %l2 = load i32, ptr %getElem, align 4
+  %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+  %l3 = load float, ptr %getElem1, align 4
+  %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+  %l4 = load float, ptr %getElem2, align 4
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+  %l5 = load float, ptr %getElem8, align 4
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+  %l6 = load float, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+  %l7 = load float, ptr %getElem10, align 16
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+  %l8 = load float, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+  %l9 = load float, ptr %getElem12, align 4
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+  %l0 = load float, ptr %getElem13, align 4
+  %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
+  %l11 = load i32, ptr %getElem14, align 4
+  %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
+  %l12 = load i32, ptr %getElem15, align 4
+  ret void
+}
+
+; Test an edge case where the defined alignment is max align
+define void @test_forward_and_reverse_max_align(ptr %1) {
+; CHECK-LABEL: define void @test_forward_and_reverse_max_align(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 4294967296
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: ret void
+;
+  %l = load i32, ptr %1, align 4
+  %getElem = getelementptr inbounds i8, ptr %1, i64 4
+  %l2 = load i32, ptr %getElem, align 4
+  %getElem1 = getelementptr inbounds i8, ptr %1, i64 8
+  %l3 = load float, ptr %getElem1, align 4
+  %getElem2 = getelementptr inbounds i8, ptr %1, i64 12
+  %l4 = load float, ptr %getElem2, align 4
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 16
+  %l5 = load float, ptr %getElem8, align 4
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 20
+  %l6 = load float, ptr %getElem9, align 4
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 24
+  %l7 = load float, ptr %getElem10, align 4294967296
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 28
+  %l8 = load float, ptr %getElem11, align 4
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 32
+  %l9 = load float, ptr %getElem12, align 4
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 36
+  %l0 = load float, ptr %getElem13, align 4
+  %getElem14 = getelementptr inbounds i8, ptr %1, i64 40
+  %l11 = load i32, ptr %getElem14, align 4
+  %getElem15 = getelementptr inbounds i8, ptr %1, i64 44
+  %l12 = load i32, ptr %getElem15, align 4
+  ret void
+}
+
+define void @test_i8_elements(ptr %1) {
+; CHECK-LABEL: define void @test_i8_elements(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GETELEM1]], align 4
+; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
+; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
+; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 6
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GETELEM10]], align 4
+; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i8> [[TMP4]], i32 0
+; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i8> [[TMP4]], i32 1
+; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i8> [[TMP4]], i32 2
+; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr [[GETELEM14]], align 4
+; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
+; CHECK-NEXT: ret void
+;
+  %l = load i8, ptr %1, align 1
+  %getElem = getelementptr inbounds i8, ptr %1, i64 1
+  %l2 = load i8, ptr %getElem, align 1
+  %getElem1 = getelementptr inbounds i8, ptr %1, i64 2
+  %l3 = load i8, ptr %getElem1, align 1
+  %getElem2 = getelementptr inbounds i8, ptr %1, i64 3
+  %l4 = load i8, ptr %getElem2, align 1
+  %getElem8 = getelementptr inbounds i8, ptr %1, i64 4
+  %l5 = load i8, ptr %getElem8, align 1
+  %getElem9 = getelementptr inbounds i8, ptr %1, i64 5
+  %l6 = load i8, ptr %getElem9, align 1
+  %getElem10 = getelementptr inbounds i8, ptr %1, i64 6
+  %l7 = load i8, ptr %getElem10, align 4
+  %getElem11 = getelementptr inbounds i8, ptr %1, i64 7
+  %l8 = load i8, ptr %getElem11, align 1
+  %getElem12 = getelementptr inbounds i8, ptr %1, i64 8
+  %l9 = load i8, ptr %getElem12, align 1
+  %getElem13 = getelementptr inbounds i8, ptr %1, i64 9
+  %l0 = load i8, ptr %getElem13, align 1
+  %getElem14 = getelementptr inbounds i8, ptr %1, i64 10
+  %l11 = load i8, ptr %getElem14, align 1
+  %getElem15 = getelementptr inbounds i8, ptr %1, i64 11
+  %l12 = load i8, ptr %getElem15, align 1
+  ret void
+}