diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index b44f4f8c8687d..60606d34c32c3 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -536,6 +536,15 @@ class VectorType : public Type { EltCnt.divideCoefficientBy(2)); } + static VectorType *getOneNthElementsVectorType(VectorType *VTy, + unsigned Denominator) { + auto EltCnt = VTy->getElementCount(); + assert(EltCnt.isKnownMultipleOf(Denominator) && + "Cannot take one-nth of a vector"); + return VectorType::get(VTy->getScalarType(), + EltCnt.divideCoefficientBy(Denominator)); + } + /// This static method returns a VectorType with twice as many elements as the /// input type and the same element type. static VectorType *getDoubleElementsVectorType(VectorType *VTy) { diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 82f72131b9d2f..65a7fc0ce2c1c 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -148,6 +148,9 @@ namespace Intrinsic { ExtendArgument, TruncArgument, HalfVecArgument, + OneThirdVecArgument, + OneFifthVecArgument, + OneSeventhVecArgument, SameVecWidthArgument, VecOfAnyPtrsToElt, VecElementArgument, @@ -159,6 +162,9 @@ namespace Intrinsic { AArch64Svcount, } Kind; + // These three have to be contiguous. + static_assert(OneFifthVecArgument == OneThirdVecArgument + 1 && + OneSeventhVecArgument == OneFifthVecArgument + 1); union { unsigned Integer_Width; unsigned Float_Width; @@ -178,15 +184,17 @@ namespace Intrinsic { unsigned getArgumentNumber() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || - Kind == SameVecWidthArgument || Kind == VecElementArgument || - Kind == Subdivide2Argument || Kind == Subdivide4Argument || - Kind == VecOfBitcastsToInt); + Kind == OneThirdVecArgument || Kind == OneFifthVecArgument || + Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument || + Kind == VecElementArgument || Kind == Subdivide2Argument || + Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt); return Argument_Info >> 3; } ArgKind getArgumentKind() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || - Kind == SameVecWidthArgument || + Kind == OneThirdVecArgument || Kind == OneFifthVecArgument || + Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument || Kind == VecElementArgument || Kind == Subdivide2Argument || Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt); return (ArgKind)(Argument_Info & 7); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index ee877349a3314..d4ce4b1d199d7 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -327,6 +327,9 @@ def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; def IIT_V6 : IIT_Vec<6, 60>; def IIT_V10 : IIT_Vec<10, 61>; +def IIT_ONE_THIRD_VEC_ARG : IIT_Base<62>; +def IIT_ONE_FIFTH_VEC_ARG : IIT_Base<63>; +def IIT_ONE_SEVENTH_VEC_ARG : IIT_Base<64>; } defvar IIT_all_FixedTypes = !filter(iit, IIT_all, @@ -467,6 +470,15 @@ class LLVMVectorElementType : LLVMMatchType; class LLVMHalfElementsVectorType : LLVMMatchType; +class LLVMOneThirdElementsVectorType + : LLVMMatchType; + +class LLVMOneFifthElementsVectorType + : LLVMMatchType; + +class LLVMOneSeventhElementsVectorType + : LLVMMatchType; + // Match the type of another intrinsic parameter that is expected to be a // vector type (i.e. 
) but with each element subdivided to // form a vector with more elements that are smaller than the original. @@ -2728,6 +2740,54 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType [llvm_anyvector_ty], [IntrNoMem]>; +def int_vector_interleave3 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMOneThirdElementsVectorType<0>, + LLVMOneThirdElementsVectorType<0>, + LLVMOneThirdElementsVectorType<0>], + [IntrNoMem]>; + +def int_vector_deinterleave3 : DefaultAttrsIntrinsic<[LLVMOneThirdElementsVectorType<0>, + LLVMOneThirdElementsVectorType<0>, + LLVMOneThirdElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + +def int_vector_interleave5 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>], + [IntrNoMem]>; + +def int_vector_deinterleave5 : DefaultAttrsIntrinsic<[LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>, + LLVMOneFifthElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + +def int_vector_interleave7 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>], + [IntrNoMem]>; + +def int_vector_deinterleave7 : DefaultAttrsIntrinsic<[LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>, + LLVMOneSeventhElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===-------------- Intrinsics to perform partial reduction ---------------===// def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b0a624680231e..c95f7b7eb8dec 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -5825,15 +5825,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(SDNode *N) { - SDLoc dl(N); + SDLoc DL(N); + unsigned Factor = N->getNumOperands(); + + SmallVector Ops(Factor); + for (unsigned i = 0; i != Factor; i++) + Ops[i] = GetPromotedInteger(N->getOperand(i)); + + SmallVector ResVTs(Factor, Ops[0].getValueType()); + SDValue Res = DAG.getNode(N->getOpcode(), DL, DAG.getVTList(ResVTs), Ops); + + for (unsigned i = 0; i != Factor; i++) + SetPromotedInteger(SDValue(N, i), Res.getValue(i)); - SDValue V0 = GetPromotedInteger(N->getOperand(0)); - SDValue V1 = GetPromotedInteger(N->getOperand(1)); - EVT ResVT = V0.getValueType(); - SDValue Res = DAG.getNode(N->getOpcode(), dl, - DAG.getVTList(ResVT, ResVT), V0, V1); - SetPromotedInteger(SDValue(N, 0), Res.getValue(0)); - SetPromotedInteger(SDValue(N, 1), Res.getValue(1)); return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index f39d9ca15496a..03d0298e99ad4 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1668,6 +1668,15 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, return; } + if (getTypeAction(SubVecVT) == TargetLowering::TypeWidenVector && + Vec.isUndef() && SubVecVT.getVectorElementType() == MVT::i1) { + SDValue WideSubVec = GetWidenedVector(SubVec); + if (WideSubVec.getValueType() == VecVT) { + std::tie(Lo, Hi) = DAG.SplitVector(WideSubVec, SDLoc(WideSubVec)); + return; + } + } + // Spill the vector to the stack. // In cases where the vector is illegal it will be broken down into parts // and stored in parts - we should use the alignment for the smallest part. @@ -3183,34 +3192,53 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, } void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) { + unsigned Factor = N->getNumOperands(); + + SmallVector Ops(Factor * 2); + for (unsigned i = 0; i != Factor; ++i) { + SDValue OpLo, OpHi; + GetSplitVector(N->getOperand(i), OpLo, OpHi); + Ops[i * 2] = OpLo; + Ops[i * 2 + 1] = OpHi; + } + + SmallVector VTs(Factor, Ops[0].getValueType()); - SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi; - GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); - GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); - EVT VT = Op0Lo.getValueType(); SDLoc DL(N); - SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, - DAG.getVTList(VT, VT), Op0Lo, Op0Hi); - SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, - DAG.getVTList(VT, VT), Op1Lo, Op1Hi); + SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs, + ArrayRef(Ops).slice(0, Factor)); + SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs, + ArrayRef(Ops).slice(Factor, Factor)); - SetSplitVector(SDValue(N, 0), ResLo.getValue(0), ResHi.getValue(0)); - SetSplitVector(SDValue(N, 1), ResLo.getValue(1), ResHi.getValue(1)); + for (unsigned i = 0; i != Factor; ++i) + SetSplitVector(SDValue(N, i), ResLo.getValue(i), ResHi.getValue(i)); } void DAGTypeLegalizer::SplitVecRes_VECTOR_INTERLEAVE(SDNode *N) { - SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi; - GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); - GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); - EVT VT = Op0Lo.getValueType(); + unsigned Factor = N->getNumOperands(); + + SmallVector Ops(Factor * 2); + for (unsigned i = 0; i != Factor; ++i) { + SDValue OpLo, OpHi; + GetSplitVector(N->getOperand(i), OpLo, OpHi); + Ops[i] = OpLo; + Ops[i + Factor] = OpHi; + } + + SmallVector VTs(Factor, Ops[0].getValueType()); + SDLoc DL(N); - SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, - DAG.getVTList(VT, VT), Op0Lo, Op1Lo), - DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, - DAG.getVTList(VT, VT), Op0Hi, Op1Hi)}; + SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs, + ArrayRef(Ops).slice(0, Factor)), + DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs, + ArrayRef(Ops).slice(Factor, Factor))}; - SetSplitVector(SDValue(N, 0), Res[0].getValue(0), Res[0].getValue(1)); - SetSplitVector(SDValue(N, 1), Res[1].getValue(0), Res[1].getValue(1)); + for (unsigned i = 0; i != Factor; ++i) { + unsigned IdxLo = 2 * i; + unsigned IdxHi = 2 * i + 1; + SetSplitVector(SDValue(N, i), Res[IdxLo / Factor].getValue(IdxLo % Factor), + Res[IdxHi / Factor].getValue(IdxHi % Factor)); + } } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 428e7a316d247..4e1ce6af3abc8 100644 
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8251,10 +8251,28 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitCallBrLandingPad(I); return; case Intrinsic::vector_interleave2: - visitVectorInterleave(I); + visitVectorInterleave(I, 2); + return; + case Intrinsic::vector_interleave3: + visitVectorInterleave(I, 3); + return; + case Intrinsic::vector_interleave5: + visitVectorInterleave(I, 5); + return; + case Intrinsic::vector_interleave7: + visitVectorInterleave(I, 7); return; case Intrinsic::vector_deinterleave2: - visitVectorDeinterleave(I); + visitVectorDeinterleave(I, 2); + return; + case Intrinsic::vector_deinterleave3: + visitVectorDeinterleave(I, 3); + return; + case Intrinsic::vector_deinterleave5: + visitVectorDeinterleave(I, 5); + return; + case Intrinsic::vector_deinterleave7: + visitVectorDeinterleave(I, 7); return; case Intrinsic::experimental_vector_compress: setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl, @@ -12565,26 +12583,31 @@ void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) { setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask)); } -void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) { +void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I, + unsigned Factor) { auto DL = getCurSDLoc(); SDValue InVec = getValue(I.getOperand(0)); - EVT OutVT = - InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + SmallVector ValueVTs; + ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(), + ValueVTs); + + EVT OutVT = ValueVTs[0]; unsigned OutNumElts = OutVT.getVectorMinNumElements(); - // ISD Node needs the input vectors split into two equal parts - SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, - DAG.getVectorIdxConstant(0, DL)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, - DAG.getVectorIdxConstant(OutNumElts, DL)); + SmallVector SubVecs(Factor); + for (unsigned i = 0; i != Factor; ++i) { + assert(ValueVTs[i] == OutVT && "Expected VTs to be the same"); + SubVecs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getVectorIdxConstant(OutNumElts * i, DL)); + } - // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing - // legalisation and combines. - if (OutVT.isFixedLengthVector()) { - SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + // Use VECTOR_SHUFFLE for fixed-length vectors with factor of 2 to benefit + // from existing legalisation and combines. 
+ if (OutVT.isFixedLengthVector() && Factor == 2) { + SDValue Even = DAG.getVectorShuffle(OutVT, DL, SubVecs[0], SubVecs[1], createStrideMask(0, 2, OutNumElts)); - SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + SDValue Odd = DAG.getVectorShuffle(OutVT, DL, SubVecs[0], SubVecs[1], createStrideMask(1, 2, OutNumElts)); SDValue Res = DAG.getMergeValues({Even, Odd}, getCurSDLoc()); setValue(&I, Res); @@ -12592,32 +12615,43 @@ void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) { } SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, - DAG.getVTList(OutVT, OutVT), Lo, Hi); + DAG.getVTList(ValueVTs), SubVecs); setValue(&I, Res); } -void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) { +void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I, + unsigned Factor) { auto DL = getCurSDLoc(); - EVT InVT = getValue(I.getOperand(0)).getValueType(); - SDValue InVec0 = getValue(I.getOperand(0)); - SDValue InVec1 = getValue(I.getOperand(1)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT InVT = getValue(I.getOperand(0)).getValueType(); EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing - // legalisation and combines. - if (OutVT.isFixedLengthVector()) { + SmallVector InVecs(Factor); + for (unsigned i = 0; i < Factor; ++i) { + InVecs[i] = getValue(I.getOperand(i)); + assert(InVecs[i].getValueType() == InVecs[0].getValueType() && + "Expected VTs to be the same"); + } + + // Use VECTOR_SHUFFLE for fixed-length vectors with factor of 2 to benefit + // from existing legalisation and combines. + if (OutVT.isFixedLengthVector() && Factor == 2) { unsigned NumElts = InVT.getVectorMinNumElements(); - SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1); + SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVecs); setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT), createInterleaveMask(NumElts, 2))); return; } - SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, - DAG.getVTList(InVT, InVT), InVec0, InVec1); - Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0), - Res.getValue(1)); + SmallVector ValueVTs(Factor, InVT); + SDValue Res = + DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, DAG.getVTList(ValueVTs), InVecs); + + SmallVector Results(Factor); + for (unsigned i = 0; i < Factor; ++i) + Results[i] = Res.getValue(i); + + Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Results); setValue(&I, Res); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index ed85deef64fa7..ece48c9bedf72 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -659,8 +659,8 @@ class SelectionDAGBuilder { void visitVectorReduce(const CallInst &I, unsigned Intrinsic); void visitVectorReverse(const CallInst &I); void visitVectorSplice(const CallInst &I); - void visitVectorInterleave(const CallInst &I); - void visitVectorDeinterleave(const CallInst &I); + void visitVectorInterleave(const CallInst &I, unsigned Factor); + void visitVectorDeinterleave(const CallInst &I, unsigned Factor); void visitStepVector(const CallInst &I); void visitUserOp1(const Instruction &I) { diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ec1184e8d835d..107caebede139 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -362,6 +362,24 @@ DecodeIITType(unsigned &NextElt, ArrayRef 
Infos, IITDescriptor::get(IITDescriptor::HalfVecArgument, ArgInfo)); return; } + case IIT_ONE_THIRD_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::OneThirdVecArgument, ArgInfo)); + return; + } + case IIT_ONE_FIFTH_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::OneFifthVecArgument, ArgInfo)); + return; + } + case IIT_ONE_SEVENTH_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::OneSeventhVecArgument, ArgInfo)); + return; + } case IIT_SAME_VEC_WIDTH_ARG: { unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); OutputTable.push_back( @@ -555,6 +573,12 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::HalfVecArgument: return VectorType::getHalfElementsVectorType( cast(Tys[D.getArgumentNumber()])); + case IITDescriptor::OneThirdVecArgument: + case IITDescriptor::OneFifthVecArgument: + case IITDescriptor::OneSeventhVecArgument: + return VectorType::getOneNthElementsVectorType( + cast(Tys[D.getArgumentNumber()]), + 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2); case IITDescriptor::SameVecWidthArgument: { Type *EltTy = DecodeFixedType(Infos, Tys, Context); Type *Ty = Tys[D.getArgumentNumber()]; @@ -931,6 +955,16 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, return !isa(ArgTys[D.getArgumentNumber()]) || VectorType::getHalfElementsVectorType( cast(ArgTys[D.getArgumentNumber()])) != Ty; + case IITDescriptor::OneThirdVecArgument: + case IITDescriptor::OneFifthVecArgument: + case IITDescriptor::OneSeventhVecArgument: + // If this is a forward reference, defer the check for later. + if (D.getArgumentNumber() >= ArgTys.size()) + return IsDeferredCheck || DeferCheck(Ty); + return !isa(ArgTys[D.getArgumentNumber()]) || + VectorType::getOneNthElementsVectorType( + cast(ArgTys[D.getArgumentNumber()]), + 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2) != Ty; case IITDescriptor::SameVecWidthArgument: { if (D.getArgumentNumber() >= ArgTys.size()) { // Defer check and subsequent check for the vector element type. 
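For reference, the three new descriptor kinds map to element-count denominators 3, 5 and 7 via 3 + (D.Kind - OneThirdVecArgument) * 2, which is why the static_assert in Intrinsics.h requires them to be contiguous. Below is a minimal IR sketch (illustrative, not part of the patch) of the new factor-3 intrinsics, assuming the usual overloaded-name mangling on the wide vector type; the deinterleave3 form mirrors the tests added further down.

define {<2 x i32>, <2 x i32>, <2 x i32>} @example_deinterleave3(<6 x i32> %v) {
  ; Splits %v into three vectors holding elements {0,3}, {1,4} and {2,5}.
  %res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v)
  ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
}

define <6 x i32> @example_interleave3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
  ; Inverse operation: produces <a0, b0, c0, a1, b1, c1>.
  %res = call <6 x i32> @llvm.vector.interleave3.v6i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)
  ret <6 x i32> %res
}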
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5e5bc0819a10c..217baa608f873 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1186,6 +1186,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, ISD::VECTOR_REVERSE}, VT, Custom); + setOperationAction({ISD::VECTOR_INTERLEAVE, ISD::VECTOR_DEINTERLEAVE}, + VT, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); @@ -1341,6 +1344,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS}, VT, Custom); + setOperationAction({ISD::VECTOR_INTERLEAVE, ISD::VECTOR_DEINTERLEAVE}, + VT, Custom); + setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom); @@ -10972,78 +10978,161 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, SDLoc DL(Op); MVT VecVT = Op.getSimpleValueType(); - assert(VecVT.isScalableVector() && - "vector_interleave on non-scalable vector!"); + const unsigned Factor = Op->getNumValues(); + assert(Factor <= 8); // 1 bit element vectors need to be widened to e8 if (VecVT.getVectorElementType() == MVT::i1) return widenVectorOpsToi8(Op, DL, DAG); - // If the VT is LMUL=8, we need to split and reassemble. - if (VecVT.getSizeInBits().getKnownMinValue() == + // Convert to scalable vectors first. + if (VecVT.isFixedLengthVector()) { + MVT ContainerVT = getContainerForFixedLengthVector(VecVT); + SmallVector Ops(Factor); + for (unsigned i = 0U; i < Factor; ++i) + Ops[i] = convertToScalableVector(ContainerVT, Op.getOperand(i), DAG, + Subtarget); + + SmallVector VTs(Factor, ContainerVT); + SDValue NewDeinterleave = + DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs, Ops); + + SmallVector Res(Factor); + for (unsigned i = 0U; i < Factor; ++i) + Res[i] = convertFromScalableVector(VecVT, NewDeinterleave.getValue(i), + DAG, Subtarget); + return DAG.getMergeValues(Res, DL); + } + + // If concatenating would exceed LMUL=8, we need to split. 
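+  // In that case the concatenated source is split down the middle, each half
+  // is deinterleaved at the same factor, and the per-index results are
+  // rejoined with CONCAT_VECTORS below.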
+ if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) > (8 * RISCV::RVVBitsPerBlock)) { - auto [Op0Lo, Op0Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); - auto [Op1Lo, Op1Hi] = DAG.SplitVectorOperand(Op.getNode(), 1); - EVT SplitVT = Op0Lo.getValueType(); - - SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, - DAG.getVTList(SplitVT, SplitVT), Op0Lo, Op0Hi); - SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, - DAG.getVTList(SplitVT, SplitVT), Op1Lo, Op1Hi); - - SDValue Even = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, - ResLo.getValue(0), ResHi.getValue(0)); - SDValue Odd = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, ResLo.getValue(1), - ResHi.getValue(1)); - return DAG.getMergeValues({Even, Odd}, DL); + SmallVector Ops(Factor * 2); + for (unsigned i = 0; i != Factor; ++i) { + auto [OpLo, OpHi] = DAG.SplitVectorOperand(Op.getNode(), i); + Ops[i * 2] = OpLo; + Ops[i * 2 + 1] = OpHi; + } + + SmallVector VTs(Factor, Ops[0].getValueType()); + + SDValue Lo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs, + ArrayRef(Ops).slice(0, Factor)); + SDValue Hi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs, + ArrayRef(Ops).slice(Factor, Factor)); + + SmallVector Res(Factor); + for (unsigned i = 0; i != Factor; ++i) + Res[i] = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo.getValue(i), + Hi.getValue(i)); + + return DAG.getMergeValues(Res, DL); } - // Concatenate the two vectors as one vector to deinterleave + SmallVector Ops(Op->op_values()); + + // Concatenate the vectors as one vector to deinterleave MVT ConcatVT = MVT::getVectorVT(VecVT.getVectorElementType(), - VecVT.getVectorElementCount().multiplyCoefficientBy(2)); - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, - Op.getOperand(0), Op.getOperand(1)); + VecVT.getVectorElementCount().multiplyCoefficientBy( + PowerOf2Ceil(Factor))); + if (Ops.size() < PowerOf2Ceil(Factor)) + Ops.append(PowerOf2Ceil(Factor) - Factor, DAG.getUNDEF(VecVT)); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Ops); + + if (Factor == 2) { + // We can deinterleave through vnsrl.wi if the element type is smaller than + // ELEN + if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) { + SDValue Even = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 0, DAG); + SDValue Odd = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 1, DAG); + return DAG.getMergeValues({Even, Odd}, DL); + } + + // For the indices, use the vmv.v.x of an i8 constant to fill the largest + // possibly mask vector, then extract the required subvector. Doing this + // (instead of a vid, vmsne sequence) reduces LMUL, and allows the mask + // creation to be rematerialized during register allocation to reduce + // register pressure if needed. 
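+    // After the bitcast each bit of the splatted i8 becomes one mask element,
+    // so 0b01010101 selects the even lanes of the concatenated vector and
+    // 0b10101010 the odd lanes; vcompress packs the selected lanes to the
+    // front, and the low half of each result is the deinterleaved output.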
+ + MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1); + + SDValue EvenSplat = DAG.getConstant(0b01010101, DL, MVT::nxv8i8); + EvenSplat = DAG.getBitcast(MVT::nxv64i1, EvenSplat); + SDValue EvenMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, + EvenSplat, DAG.getVectorIdxConstant(0, DL)); + + SDValue OddSplat = DAG.getConstant(0b10101010, DL, MVT::nxv8i8); + OddSplat = DAG.getBitcast(MVT::nxv64i1, OddSplat); + SDValue OddMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, OddSplat, + DAG.getVectorIdxConstant(0, DL)); + + // vcompress the even and odd elements into two separate vectors + SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat, + EvenMask, DAG.getUNDEF(ConcatVT)); + SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat, + OddMask, DAG.getUNDEF(ConcatVT)); + + // Extract the result half of the gather for even and odd + SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide, + DAG.getVectorIdxConstant(0, DL)); + SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide, + DAG.getVectorIdxConstant(0, DL)); - // We can deinterleave through vnsrl.wi if the element type is smaller than - // ELEN - if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) { - SDValue Even = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 0, DAG); - SDValue Odd = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 1, DAG); return DAG.getMergeValues({Even, Odd}, DL); } - // For the indices, use the vmv.v.x of an i8 constant to fill the largest - // possibly mask vector, then extract the required subvector. Doing this - // (instead of a vid, vmsne sequence) reduces LMUL, and allows the mask - // creation to be rematerialized during register allocation to reduce - // register pressure if needed. + // Store with unit-stride store and load it back with segmented load. + MVT XLenVT = Subtarget.getXLenVT(); + SDValue VL = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget).second; + SDValue Passthru = DAG.getUNDEF(ConcatVT); - MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1); + // Allocate a stack slot. 
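+  // The slot is sized for the (power-of-two padded) ConcatVT so the
+  // unit-stride vse below can store the whole concatenated vector; the
+  // vlsegN reload then deinterleaves it into a Factor-field vector tuple,
+  // whose fields are unpacked with TUPLE_EXTRACT.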
+ Align Alignment = DAG.getReducedAlign(VecVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(ConcatVT.getStoreSize(), Alignment); + auto &MF = DAG.getMachineFunction(); + auto FrameIndex = cast(StackPtr.getNode())->getIndex(); + auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); - SDValue EvenSplat = DAG.getConstant(0b01010101, DL, MVT::nxv8i8); - EvenSplat = DAG.getBitcast(MVT::nxv64i1, EvenSplat); - SDValue EvenMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, EvenSplat, - DAG.getVectorIdxConstant(0, DL)); + SDValue StoreOps[] = {DAG.getEntryNode(), + DAG.getTargetConstant(Intrinsic::riscv_vse, DL, XLenVT), + Concat, StackPtr, VL}; - SDValue OddSplat = DAG.getConstant(0b10101010, DL, MVT::nxv8i8); - OddSplat = DAG.getBitcast(MVT::nxv64i1, OddSplat); - SDValue OddMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, OddSplat, - DAG.getVectorIdxConstant(0, DL)); + SDValue Chain = DAG.getMemIntrinsicNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), StoreOps, + ConcatVT.getVectorElementType(), PtrInfo, Alignment, + MachineMemOperand::MOStore, MemoryLocation::UnknownSize); - // vcompress the even and odd elements into two separate vectors - SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat, - EvenMask, DAG.getUNDEF(ConcatVT)); - SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat, - OddMask, DAG.getUNDEF(ConcatVT)); + static const Intrinsic::ID VlsegIntrinsicsIds[] = { + Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, Intrinsic::riscv_vlseg4, + Intrinsic::riscv_vlseg5, Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, + Intrinsic::riscv_vlseg8}; - // Extract the result half of the gather for even and odd - SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide, - DAG.getVectorIdxConstant(0, DL)); - SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide, - DAG.getVectorIdxConstant(0, DL)); + SDValue LoadOps[] = { + Chain, + DAG.getTargetConstant(VlsegIntrinsicsIds[Factor - 2], DL, XLenVT), + Passthru, + StackPtr, + VL, + DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)}; + + unsigned Sz = + Factor * VecVT.getVectorMinNumElements() * VecVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, Factor); + + SDValue Load = DAG.getMemIntrinsicNode( + ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList({VecTupTy, MVT::Other}), + LoadOps, ConcatVT.getVectorElementType(), PtrInfo, Alignment, + MachineMemOperand::MOLoad, MemoryLocation::UnknownSize); + + SmallVector Res(Factor); + + for (unsigned i = 0U; i < Factor; ++i) + Res[i] = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, VecVT, Load, + DAG.getVectorIdxConstant(i, DL)); - return DAG.getMergeValues({Even, Odd}, DL); + return DAG.getMergeValues(Res, DL); } SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, @@ -11051,36 +11140,125 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, SDLoc DL(Op); MVT VecVT = Op.getSimpleValueType(); - assert(VecVT.isScalableVector() && - "vector_interleave on non-scalable vector!"); + const unsigned Factor = Op.getNumOperands(); + assert(Factor <= 8); // i1 vectors need to be widened to i8 if (VecVT.getVectorElementType() == MVT::i1) return widenVectorOpsToi8(Op, DL, DAG); + // Convert to scalable vectors first. 
+ if (VecVT.isFixedLengthVector()) { + MVT ContainerVT = getContainerForFixedLengthVector(VecVT); + SmallVector Ops(Factor); + for (unsigned i = 0U; i < Factor; ++i) + Ops[i] = convertToScalableVector(ContainerVT, Op.getOperand(i), DAG, + Subtarget); + + SmallVector VTs(Factor, ContainerVT); + SDValue NewInterleave = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs, Ops); + + SmallVector Res(Factor); + for (unsigned i = 0U; i < Factor; ++i) + Res[i] = convertFromScalableVector(VecVT, NewInterleave.getValue(i), DAG, + Subtarget); + return DAG.getMergeValues(Res, DL); + } + MVT XLenVT = Subtarget.getXLenVT(); SDValue VL = DAG.getRegister(RISCV::X0, XLenVT); - // If the VT is LMUL=8, we need to split and reassemble. - if (VecVT.getSizeInBits().getKnownMinValue() == (8 * RISCV::RVVBitsPerBlock)) { - auto [Op0Lo, Op0Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); - auto [Op1Lo, Op1Hi] = DAG.SplitVectorOperand(Op.getNode(), 1); - EVT SplitVT = Op0Lo.getValueType(); + // If the VT is larger than LMUL=8, we need to split and reassemble. + if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) > + (8 * RISCV::RVVBitsPerBlock)) { + SmallVector Ops(Factor * 2); + for (unsigned i = 0; i != Factor; ++i) { + auto [OpLo, OpHi] = DAG.SplitVectorOperand(Op.getNode(), i); + Ops[i] = OpLo; + Ops[i + Factor] = OpHi; + } + + SmallVector VTs(Factor, Ops[0].getValueType()); - SDValue ResLo = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, - DAG.getVTList(SplitVT, SplitVT), Op0Lo, Op1Lo); - SDValue ResHi = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, - DAG.getVTList(SplitVT, SplitVT), Op0Hi, Op1Hi); + SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs, + ArrayRef(Ops).take_front(Factor)), + DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs, + ArrayRef(Ops).drop_front(Factor))}; - SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, - ResLo.getValue(0), ResLo.getValue(1)); - SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, - ResHi.getValue(0), ResHi.getValue(1)); - return DAG.getMergeValues({Lo, Hi}, DL); + SmallVector Concats(Factor); + for (unsigned i = 0; i != Factor; ++i) { + unsigned IdxLo = 2 * i; + unsigned IdxHi = 2 * i + 1; + Concats[i] = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, + Res[IdxLo / Factor].getValue(IdxLo % Factor), + Res[IdxHi / Factor].getValue(IdxHi % Factor)); + } + + return DAG.getMergeValues(Concats, DL); } SDValue Interleaved; + // Spill to the stack using a segment store for simplicity. + if (Factor != 2) { + EVT MemVT = + EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), + VecVT.getVectorElementCount() * Factor); + + // Allocate a stack slot. 
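+    // The slot holds MemVT (Factor * VecVT elements): the operands are packed
+    // into a vector tuple and written with vssegN, which stores them
+    // element-interleaved, and Factor contiguous VecVT-sized loads then read
+    // back the interleaved results.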
+ Align Alignment = DAG.getReducedAlign(VecVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment); + EVT PtrVT = StackPtr.getValueType(); + auto &MF = DAG.getMachineFunction(); + auto FrameIndex = cast(StackPtr.getNode())->getIndex(); + auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); + + static const Intrinsic::ID IntrIds[] = { + Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, + Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, + Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, + Intrinsic::riscv_vsseg8, + }; + + unsigned Sz = + Factor * VecVT.getVectorMinNumElements() * VecVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, Factor); + + SDValue StoredVal = DAG.getUNDEF(VecTupTy); + for (unsigned i = 0; i < Factor; i++) + StoredVal = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, + Op.getOperand(i), DAG.getConstant(i, DL, XLenVT)); + + SDValue Ops[] = {DAG.getEntryNode(), + DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT), + StoredVal, + StackPtr, + VL, + DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), + DL, XLenVT)}; + + SDValue Chain = DAG.getMemIntrinsicNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, + VecVT.getVectorElementType(), PtrInfo, Alignment, + MachineMemOperand::MOStore, MemoryLocation::UnknownSize); + + SmallVector Loads(Factor); + + SDValue Increment = + DAG.getVScale(DL, PtrVT, + APInt(PtrVT.getFixedSizeInBits(), + VecVT.getStoreSize().getKnownMinValue())); + for (unsigned i = 0; i != Factor; ++i) { + if (i != 0) + StackPtr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, Increment); + + Loads[i] = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); + } + + return DAG.getMergeValues(Loads, DL); + } + // If the element type is smaller than ELEN, then we can interleave with // vwaddu.vv and vwmaccu.vx if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 2fc5b40a89afa..7115eacf84920 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,RV64 ; Integers @@ -152,11 +152,268 @@ define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) ret {<8 x i64>, <8 x i64>} %retval } -declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>) -declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>) -declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>) -declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>) -declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>) +define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) { +; CHECK-LABEL: vector_deinterleave3_v2i32_v6i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 4 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v) + ret {<2 x i32>, <2 x i32>, <2 x i32>} %res +} + + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) { +; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 6 +; CHECK-NEXT: vslidedown.vi v11, v8, 4 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 8 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v10, a1 +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v14 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave5.v10i16(<10 x i16> %v) + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res +} + +define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) { +; RV32-LABEL: vector_deinterleave7_v14i8_v2i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr s1, vlenb +; RV32-NEXT: vsetivli zero, 
2, e8, m1, ta, ma +; RV32-NEXT: vslidedown.vi v10, v8, 10 +; RV32-NEXT: vslidedown.vi v11, v8, 8 +; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: srli s0, s1, 3 +; RV32-NEXT: add a0, s0, s0 +; RV32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; RV32-NEXT: vslideup.vx v11, v10, s0 +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vslideup.vx v10, v9, s0 +; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v8, 12 +; RV32-NEXT: srli a0, s1, 2 +; RV32-NEXT: add a1, a0, s0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV32-NEXT: vslideup.vx v11, v9, a0 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 32 +; RV32-NEXT: vs1r.v v11, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v8, 4 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV32-NEXT: vslideup.vx v10, v9, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 3 +; RV32-NEXT: mv a0, s0 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: add s0, a0, s0 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 6 +; RV32-NEXT: srli s1, s1, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl1r.v v9, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, s0, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v9, v8, a0 +; RV32-NEXT: add a0, s1, s1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; RV32-NEXT: vslideup.vx v9, v8, s1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs1r.v v9, (a0) +; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; RV32-NEXT: vlseg7e8.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 48 +; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave7_v14i8_v2i8: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: .cfi_def_cfa_offset 64 +; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr s1, vlenb +; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v10, v8, 10 +; RV64-NEXT: vslidedown.vi v11, v8, 8 +; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: srli s0, s1, 3 +; RV64-NEXT: add 
a0, s0, s0 +; RV64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; RV64-NEXT: vslideup.vx v11, v10, s0 +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vslideup.vx v10, v9, s0 +; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v8, 12 +; RV64-NEXT: srli a0, s1, 2 +; RV64-NEXT: add a1, a0, s0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV64-NEXT: vslideup.vx v11, v9, a0 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 32 +; RV64-NEXT: vs1r.v v11, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v8, 4 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV64-NEXT: vslideup.vx v10, v9, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a1, 3 +; RV64-NEXT: mv a0, s0 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: add s0, a0, s0 +; RV64-NEXT: addi a1, sp, 32 +; RV64-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 6 +; RV64-NEXT: srli s1, s1, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl1r.v v9, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, s0, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v9, v8, a0 +; RV64-NEXT: add a0, s1, s1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; RV64-NEXT: vslideup.vx v9, v8, s1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 1 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs1r.v v9, (a0) +; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; RV64-NEXT: vlseg7e8.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 64 +; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v) + ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res +} + ; Floats @@ -267,9 +524,125 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double ret {<4 x double>, <4 x double>} %retval } -declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>) -declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>) -declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>) -declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>) -declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>) -declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>) +define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(<6 x float> %v) { +; CHECK-LABEL: vector_deinterleave3_v632_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; 
CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 4 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<2 x float>, <2 x float>, <2 x float>} @llvm.vector.deinterleave3.v6f32(<6 x float> %v) + ret {<2 x float>, <2 x float>, <2 x float>} %res +} + + +define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave5_v10f16_v2f16(<10 x half> %v) { +; CHECK-LABEL: vector_deinterleave5_v10f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 6 +; CHECK-NEXT: vslidedown.vi v11, v8, 4 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 8 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v10, a1 +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v14 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @llvm.vector.deinterleave5.v10f16(<10 x half> %v) + ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res +} + +define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave7_v7f16_v1f16(<7 x half> %v) { +; CHECK-LABEL: vector_deinterleave7_v7f16_v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vslidedown.vi v11, v8, 1 +; CHECK-NEXT: vmv1r.v v12, v8 +; 
CHECK-NEXT: vslidedown.vi v14, v8, 5 +; CHECK-NEXT: vslidedown.vi v15, v8, 6 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: add a3, a0, a0 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v10, v9, a1 +; CHECK-NEXT: vslideup.vx v12, v11, a1 +; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v13, v8, 4 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v13, v14, a1 +; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v13, v15, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v12, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg7e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave7.v7f16(<7 x half> %v) + ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index bb71c2973bb57..81b6de9e662d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -1,8 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64 ; Integers @@ -104,11 +104,6 @@ define {, } @vector_deinterleave_nxv4i64_nxv ret {, } %retval } -declare {, } @llvm.vector.deinterleave2.nxv32i1() -declare {, } @llvm.vector.deinterleave2.nxv32i8() -declare {, } @llvm.vector.deinterleave2.nxv16i16() -declare {, } @llvm.vector.deinterleave2.nxv8i32() -declare {, } @llvm.vector.deinterleave2.nxv4i64() define {, } @vector_deinterleave_nxv64i1_nxv128i1( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1: @@ -228,11 +223,6 @@ define {, } @vector_deinterleave_nxv8i64_nxv ret {, } %retval } -declare {, } @llvm.vector.deinterleave2.nxv128i1() -declare {, } @llvm.vector.deinterleave2.nxv128i8() -declare {, } @llvm.vector.deinterleave2.nxv64i16() -declare {, } @llvm.vector.deinterleave2.nxv32i32() -declare {, } @llvm.vector.deinterleave2.nxv16i64() ; Floats @@ -358,12 +348,6 @@ define {, } @vector_deinterleave_nxv2f ret {, } %retval } -declare {,} @llvm.vector.deinterleave2.nxv4f16() -declare {, } @llvm.vector.deinterleave2.nxv8f16() -declare {, } 
@llvm.vector.deinterleave2.nxv4f32() -declare {, } @llvm.vector.deinterleave2.nxv16f16() -declare {, } @llvm.vector.deinterleave2.nxv8f32() -declare {, } @llvm.vector.deinterleave2.nxv4f64() define {, } @vector_deinterleave_nxv32bf16_nxv64bf16( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv32bf16_nxv64bf16: @@ -461,6 +445,629 @@ define {, } @vector_deinterleave_nxv8f ret {, } %retval } -declare {, } @llvm.vector.deinterleave2.nxv64f16() -declare {, } @llvm.vector.deinterleave2.nxv32f32() -declare {, } @llvm.vector.deinterleave2.nxv16f64() +define {, , } @vector_deinterleave_nxv16i1_nxv48i1( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv48i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v0, a1 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v16, v10, 1, v0 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v0, a0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v18, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v20, v10, 1, v0 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vlseg3e8.v v8, (a1) +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , } @llvm.vector.deinterleave3.nxv48i1( %vec) + ret {, , } %retval +} + + +define {, , } @vector_deinterleave_nxv16i8_nxv48i8( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vlseg3e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , } @llvm.vector.deinterleave3.nxv48i8( %vec) + ret {, , } %retval +} + + +define {, , } @vector_deinterleave_nxv8i16_nxv24i16( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vlseg3e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , } @llvm.vector.deinterleave3.nxv24i16( %vec) + ret {, , } %retval +} + + +define {, , } @vector_deinterleave_nxv4i32_nxv12i32( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv12i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: 
vlseg3e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , } @llvm.vector.deinterleave3.nxv12i32( %vec) + ret {, , } %retval +} + + +define {, , } @vector_deinterleave_nxv2i64_nxv6i64( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: vlseg3e64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , } @llvm.vector.deinterleave3.nxv6i64( %vec) + ret {, , } %retval +} + +define {, , , , } @vector_deinterleave_nxv16i1_nxv80i1( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv80i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a2, a0, 1 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: vslidedown.vx v10, v9, a2 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v20, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v10, v15 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v11, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v8, v21 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v9, v14 +; CHECK-NEXT: vs8r.v v16, (a0) +; CHECK-NEXT: vmv1r.v v12, v13 +; CHECK-NEXT: vs8r.v v8, (a1) +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg5e8.v v8, (a0) +; CHECK-NEXT: vlseg5e8.v v14, (a1) +; CHECK-NEXT: vmv2r.v v20, v8 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v21, v14 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v0, v20, 0 +; CHECK-NEXT: vmv1r.v v14, v9 +; CHECK-NEXT: vmsne.vi v8, v14, 0 +; CHECK-NEXT: vmv1r.v v23, v16 +; CHECK-NEXT: vmsne.vi v9, v22, 0 +; CHECK-NEXT: vmv1r.v v16, v11 +; CHECK-NEXT: vmsne.vi v10, v16, 0 +; CHECK-NEXT: vmv1r.v v13, v18 +; CHECK-NEXT: vmsne.vi v11, v12, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , } @llvm.vector.deinterleave5.nxv80i1( %vec) + ret {, , , , } %retval +} + + +define {, , , , } @vector_deinterleave_nxv16i8_nxv80i8( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv80i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, 
m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e8.v v12, (a0) +; CHECK-NEXT: vlseg5e8.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , } @llvm.vector.deinterleave5.nxv80i8( %vec) + ret {, , , , } %retval +} + + +define {, , , , } @vector_deinterleave_nxv8i16_nxv40i16( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv40i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e16.v v12, (a0) +; CHECK-NEXT: vlseg5e16.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , } @llvm.vector.deinterleave5.nxv40i16( %vec) + ret {, , , , } %retval +} + + +define {, , , , } @vector_deinterleave_nxv4i32_nxv20i32( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv20i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e32.v v12, (a0) +; CHECK-NEXT: vlseg5e32.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , } @llvm.vector.deinterleave5.nxv20i32( %vec) + ret 
{, , , , } %retval +} + + +define {, , , , } @vector_deinterleave_nxv2i64_nxv10i64( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv10i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e64.v v12, (a0) +; CHECK-NEXT: vlseg5e64.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , } @llvm.vector.deinterleave5.nxv10i64( %vec) + ret {, , , , } %retval +} + +define {, , , , , , } @vector_deinterleave_nxv16i1_nxv112i1( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv112i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: srli a0, a1, 1 +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: vslidedown.vx v11, v8, a2 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 +; CHECK-NEXT: sub a1, a1, a3 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v20, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v22, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v10, v15 +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmerge.vim v24, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v11, v24 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v23 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v14 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v12, v25 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v13, v14 +; CHECK-NEXT: vs8r.v v16, (a0) +; CHECK-NEXT: vmv1r.v v14, v15 +; CHECK-NEXT: vs8r.v v8, (a1) +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg7e8.v v8, (a0) +; CHECK-NEXT: vlseg7e8.v v16, (a1) +; CHECK-NEXT: vmv2r.v v24, v8 +; CHECK-NEXT: vmv2r.v v26, v10 +; CHECK-NEXT: vmv2r.v v28, v12 +; CHECK-NEXT: vmv1r.v v25, v16 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v0, v24, 0 
+; CHECK-NEXT: vmv1r.v v16, v9 +; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: vmv1r.v v27, v18 +; CHECK-NEXT: vmsne.vi v9, v26, 0 +; CHECK-NEXT: vmv1r.v v18, v11 +; CHECK-NEXT: vmsne.vi v10, v18, 0 +; CHECK-NEXT: vmv1r.v v29, v20 +; CHECK-NEXT: vmsne.vi v11, v28, 0 +; CHECK-NEXT: vmv1r.v v20, v13 +; CHECK-NEXT: vmsne.vi v12, v20, 0 +; CHECK-NEXT: vmv1r.v v15, v22 +; CHECK-NEXT: vmsne.vi v13, v14, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , } @llvm.vector.deinterleave7.nxv112i1( %vec) + ret {, , , , , , } %retval +} + + +define {, , , , , , } @vector_deinterleave_nxv16i8_nxv112i8( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv112i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v30, v21 +; CHECK-NEXT: vmv1r.v v28, v19 +; CHECK-NEXT: vmv1r.v v29, v20 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v27, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v24, v15 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v25, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg7e8.v v14, (a0) +; CHECK-NEXT: vlseg7e8.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v14 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v15 +; CHECK-NEXT: vmv2r.v v12, v16 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v17 +; CHECK-NEXT: vmv2r.v v16, v18 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v19 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , } @llvm.vector.deinterleave7.nxv112i8( %vec) + ret {, , , , , , } %retval +} + + +define {, , , , , , } @vector_deinterleave_nxv8i16_nxv56i16( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv56i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v30, v21 +; CHECK-NEXT: vmv1r.v v28, v19 +; CHECK-NEXT: vmv1r.v v29, v20 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v27, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v24, v15 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v25, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg7e16.v v14, (a0) +; CHECK-NEXT: vlseg7e16.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v14 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v15 +; CHECK-NEXT: vmv2r.v v12, v16 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v17 +; CHECK-NEXT: vmv2r.v v16, v18 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v19 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval 
= call {, , , , , , } @llvm.vector.deinterleave7.nxv56i16( %vec) + ret {, , , , , , } %retval +} + + +define {, , , , , , } @vector_deinterleave_nxv4i32_nxv28i32( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv28i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v30, v21 +; CHECK-NEXT: vmv1r.v v28, v19 +; CHECK-NEXT: vmv1r.v v29, v20 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v27, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v24, v15 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v25, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg7e32.v v14, (a0) +; CHECK-NEXT: vlseg7e32.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v14 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v15 +; CHECK-NEXT: vmv2r.v v12, v16 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v17 +; CHECK-NEXT: vmv2r.v v16, v18 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v19 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , } @llvm.vector.deinterleave7.nxv28i32( %vec) + ret {, , , , , , } %retval +} + + +define {, , , , , , } @vector_deinterleave_nxv2i64_nxv14i64( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv14i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v30, v21 +; CHECK-NEXT: vmv1r.v v28, v19 +; CHECK-NEXT: vmv1r.v v29, v20 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v27, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v24, v15 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v25, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg7e64.v v14, (a0) +; CHECK-NEXT: vlseg7e64.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v14 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v15 +; CHECK-NEXT: vmv2r.v v12, v16 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v17 +; CHECK-NEXT: vmv2r.v v16, v18 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v19 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , } @llvm.vector.deinterleave7.nxv14i64( %vec) + ret {, , , , , , } %retval +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index 08aa02c7e869a..1e4cb06480163 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -122,10 +122,237 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { ret <4 x i64> %res } -declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) -declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) -declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) -declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) +define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: vector_interleave3_v6i32_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave3_v6i32_v2i32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle32.v v10, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %res = call <6 x i32> @llvm.vector.interleave3.v6i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) + ret <6 x i32> %res +} + + +define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) { +; CHECK-LABEL: vector_interleave5_v10i16_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 
0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: vle16.v v10, (a4) +; CHECK-NEXT: vle16.v v11, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v11, v10, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v11, 4 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave5_v10i16_v2i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: vle16.v v10, (a4) +; ZVBB-NEXT: vle16.v v11, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: add a1, a4, a1 +; ZVBB-NEXT: vle16.v v12, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v11, v10, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v11, 4 +; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v12, 8 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %res = call <10 x i16> @llvm.vector.interleave5.v10i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) + ret <10 x i16> %res +} + +define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) { +; CHECK-LABEL: vector_interleave7_v14i8_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: vsetvli a5, zero, e8, mf8, ta, ma +; CHECK-NEXT: vsseg7e8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a4) +; CHECK-NEXT: add a4, a4, a1 +; CHECK-NEXT: vle8.v v10, (a2) +; CHECK-NEXT: add a2, a4, a1 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle8.v v11, (a2) +; CHECK-NEXT: vle8.v v12, (a4) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v13, (a1) +; CHECK-NEXT: vle8.v v14, (a3) +; 
CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v13, 4 +; CHECK-NEXT: vslideup.vi v8, v14, 4 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 6 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave7_v14i8_v2i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 3 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: vsetvli a5, zero, e8, mf8, ta, ma +; ZVBB-NEXT: vsseg7e8.v v8, (a0) +; ZVBB-NEXT: vle8.v v9, (a4) +; ZVBB-NEXT: add a4, a4, a1 +; ZVBB-NEXT: vle8.v v10, (a2) +; ZVBB-NEXT: add a2, a4, a1 +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle8.v v11, (a2) +; ZVBB-NEXT: vle8.v v12, (a4) +; ZVBB-NEXT: vle8.v v8, (a0) +; ZVBB-NEXT: vle8.v v13, (a1) +; ZVBB-NEXT: vle8.v v14, (a3) +; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v13, 4 +; ZVBB-NEXT: vslideup.vi v8, v14, 4 +; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v9, 6 +; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v12, 8 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %res = call <14 x i8> @llvm.vector.interleave7.v14i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) + ret <14 x i8> %res +} + ; Floats @@ -270,13 +497,240 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> ret <4 x double> %res } +define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; CHECK-LABEL: vector_interleave3_v632_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: 
.cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave3_v632_v2f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle32.v v10, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <6 x float> %res +} + -declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>) -declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>) -declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>) -declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>) -declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>) +define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) { +; CHECK-LABEL: vector_interleave5_v10f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: vle16.v v10, (a4) +; CHECK-NEXT: vle16.v v11, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v11, v10, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v11, 4 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave5_v10f16_v2f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVBB-NEXT: addi a0, sp, 16 
+; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: vle16.v v10, (a4) +; ZVBB-NEXT: vle16.v v11, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: add a1, a4, a1 +; ZVBB-NEXT: vle16.v v12, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v11, v10, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v11, 4 +; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v12, 8 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %res = call <10 x half> @llvm.vector.interleave5.v10f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) + ret <10 x half> %res +} + +define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) { +; CHECK-LABEL: vector_interleave7_v7f16_v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: vsetvli a5, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg7e16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: add a4, a4, a1 +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: add a2, a4, a1 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: vle16.v v12, (a4) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v13, (a1) +; CHECK-NEXT: vle16.v v14, (a3) +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v11, 1 +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v13, 2 +; CHECK-NEXT: vslideup.vi v8, v14, 2 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave7_v7f16_v1f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: vsetvli a5, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg7e16.v v8, (a0) +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: add a4, a4, a1 +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: add 
a2, a4, a1 +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: vle16.v v12, (a4) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vle16.v v13, (a1) +; ZVBB-NEXT: vle16.v v14, (a3) +; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v11, 1 +; ZVBB-NEXT: vslideup.vi v8, v10, 1 +; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v13, 2 +; ZVBB-NEXT: vslideup.vi v8, v14, 2 +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v9, 3 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) + ret <7 x half> %res +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32: {{.*}} ; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 864acb320d8fe..d2143089a008b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=ZVBB,ZVBB-RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=ZVBB,ZVBB-RV64 ; Integers @@ -151,11 +151,6 @@ define @vector_interleave_nxv4i64_nxv2i64( ret %res } -declare @llvm.vector.interleave2.nxv32i1(, ) -declare @llvm.vector.interleave2.nxv32i8(, ) -declare @llvm.vector.interleave2.nxv16i16(, ) -declare @llvm.vector.interleave2.nxv8i32(, ) -declare @llvm.vector.interleave2.nxv4i64(, ) define @vector_interleave_nxv128i1_nxv64i1( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1: @@ -324,11 +319,6 @@ define @vector_interleave_nxv16i64_nxv8i64( %res } -declare @llvm.vector.interleave2.nxv128i1(, ) -declare @llvm.vector.interleave2.nxv128i8(, ) -declare @llvm.vector.interleave2.nxv64i16(, ) -declare @llvm.vector.interleave2.nxv32i32(, ) -declare @llvm.vector.interleave2.nxv16i64(, ) ; Floats @@ -565,12 +555,6 @@ define @vector_interleave_nxv4f64_nxv2f64( @llvm.vector.interleave2.nxv4f16(, ) -declare @llvm.vector.interleave2.nxv8f16(, ) 
-declare @llvm.vector.interleave2.nxv4f32(, ) -declare @llvm.vector.interleave2.nxv16f16(, ) -declare @llvm.vector.interleave2.nxv8f32(, ) -declare @llvm.vector.interleave2.nxv4f64(, ) define @vector_interleave_nxv64bf16_nxv32bf16( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv64bf16_nxv32bf16: @@ -734,6 +718,2856 @@ define @vector_interleave_nxv8i32_nxv4i32_poison2( %res } -declare @llvm.vector.interleave2.nxv64f16(, ) -declare @llvm.vector.interleave2.nxv32f32(, ) -declare @llvm.vector.interleave2.nxv16f64(, ) +define @vector_interleave_nxv48i1_nxv16i1( %a, %b, %c) nounwind { +; CHECK-LABEL: vector_interleave_nxv48i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vsseg3e8.v v14, (a0) +; CHECK-NEXT: vl2r.v v8, (a2) +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: vl2r.v v10, (a3) +; CHECK-NEXT: vl2r.v v12, (a0) +; CHECK-NEXT: add a0, a2, a2 +; CHECK-NEXT: vmsne.vi v14, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v8, a2 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v14, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv48i1_nxv16i1: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmv1r.v v10, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmv.v.i v12, 0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 +; ZVBB-NEXT: slli a2, a1, 1 +; ZVBB-NEXT: vmv1r.v v0, v10 +; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 +; ZVBB-NEXT: add a2, a3, a2 +; ZVBB-NEXT: vsseg3e8.v v14, (a0) +; ZVBB-NEXT: vl2r.v v8, (a2) +; ZVBB-NEXT: srli a2, a1, 2 +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: vl2r.v v10, (a3) +; ZVBB-NEXT: vl2r.v v12, (a0) +; ZVBB-NEXT: add a0, a2, a2 +; ZVBB-NEXT: vmsne.vi v14, v8, 0 +; ZVBB-NEXT: vmsne.vi v8, v10, 0 +; ZVBB-NEXT: vmsne.vi v0, v12, 0 +; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v8, a2 +; ZVBB-NEXT: add a0, a1, a1 +; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v14, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv48i1( %a, %b, %c) + ret %res +} + + +define @vector_interleave_nxv48i8_nxv16i8( %a, %b, %c) nounwind { +; CHECK-LABEL: vector_interleave_nxv48i8_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi 
sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vsseg3e8.v v8, (a0) +; CHECK-NEXT: vl2r.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2r.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2r.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv48i8_nxv16i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; ZVBB-NEXT: vsseg3e8.v v8, (a0) +; ZVBB-NEXT: vl2r.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2r.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2r.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv48i8( %a, %b, %c) + ret %res +} + + +define @vector_interleave_nxv24i16_nxv8i16( %a, %b, %c) nounwind { +; CHECK-LABEL: vector_interleave_nxv24i16_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv24i16_nxv8i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: vl2re16.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re16.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re16.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv24i16( %a, %b, %c) + ret %res +} + + +define @vector_interleave_nxv12i32_nxv4i32( %a, %b, %c) nounwind { +; CHECK-LABEL: vector_interleave_nxv12i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: vl2re32.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re32.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; 
CHECK-NEXT: vl2re32.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv12i32_nxv4i32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: vl2re32.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re32.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re32.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv12i32( %a, %b, %c) + ret %res +} + + +define @vector_interleave_nxv6i64_nxv2i64( %a, %b, %c) nounwind { +; CHECK-LABEL: vector_interleave_nxv6i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg3e64.v v8, (a0) +; CHECK-NEXT: vl2re64.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re64.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re64.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv6i64_nxv2i64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; ZVBB-NEXT: vsseg3e64.v v8, (a0) +; ZVBB-NEXT: vl2re64.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re64.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re64.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv6i64( %a, %b, %c) + ret %res +} + +define @vector_interleave_nxv80i1_nxv16i1( %a, %b, %c, %d, %e) nounwind { +; CHECK-LABEL: vector_interleave_nxv80i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 +; CHECK-NEXT: add a2, a4, a1 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: vmv2r.v v20, v14 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v21, v18 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v22, v16 +; CHECK-NEXT: 
vmv1r.v v16, v19 +; CHECK-NEXT: add a5, a2, a1 +; CHECK-NEXT: vmv1r.v v23, v8 +; CHECK-NEXT: vmv1r.v v18, v9 +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmerge.vim v24, v12, 1, v0 +; CHECK-NEXT: vsetvli a6, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg5e8.v v20, (a4) +; CHECK-NEXT: vmv1r.v v19, v25 +; CHECK-NEXT: vsseg5e8.v v15, (a0) +; CHECK-NEXT: vl1r.v v8, (a5) +; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: vl1r.v v10, (a4) +; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: vl1r.v v12, (a4) +; CHECK-NEXT: add a4, a0, a1 +; CHECK-NEXT: vl1r.v v14, (a4) +; CHECK-NEXT: add a4, a4, a1 +; CHECK-NEXT: vl1r.v v9, (a5) +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: vl1r.v v16, (a5) +; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: vl1r.v v11, (a2) +; CHECK-NEXT: add a2, a3, a3 +; CHECK-NEXT: vl1r.v v15, (a4) +; CHECK-NEXT: add a4, a1, a1 +; CHECK-NEXT: vl1r.v v13, (a0) +; CHECK-NEXT: vl1r.v v17, (a5) +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v18, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v10, 0 +; CHECK-NEXT: vmsne.vi v8, v14, 0 +; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v18, a3 +; CHECK-NEXT: vslideup.vx v9, v8, a3 +; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v9, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv80i1_nxv16i1: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 10 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmv.v.i v12, 0 +; ZVBB-NEXT: addi a4, sp, 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add a0, sp, a0 +; ZVBB-NEXT: addi a0, a0, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 +; ZVBB-NEXT: add a2, a4, a1 +; ZVBB-NEXT: srli a3, a1, 2 +; ZVBB-NEXT: vmv2r.v v20, v14 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 +; ZVBB-NEXT: vmv1r.v v21, v18 +; ZVBB-NEXT: vmv1r.v v0, v10 +; ZVBB-NEXT: vmerge.vim v8, v12, 1, v0 +; ZVBB-NEXT: vmv1r.v v22, v16 +; ZVBB-NEXT: vmv1r.v v16, v19 +; ZVBB-NEXT: add a5, a2, a1 +; ZVBB-NEXT: vmv1r.v v23, v8 +; ZVBB-NEXT: vmv1r.v v18, v9 +; ZVBB-NEXT: vmv1r.v v0, v11 +; ZVBB-NEXT: vmerge.vim v24, v12, 1, v0 +; ZVBB-NEXT: vsetvli a6, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg5e8.v v20, (a4) +; ZVBB-NEXT: vmv1r.v v19, v25 +; ZVBB-NEXT: vsseg5e8.v v15, (a0) +; ZVBB-NEXT: vl1r.v v8, (a5) +; ZVBB-NEXT: add a5, a5, a1 +; ZVBB-NEXT: vl1r.v v10, (a4) +; ZVBB-NEXT: add a4, a5, a1 +; ZVBB-NEXT: vl1r.v v12, (a4) +; ZVBB-NEXT: add a4, a0, a1 +; ZVBB-NEXT: vl1r.v v14, (a4) +; ZVBB-NEXT: add a4, a4, a1 +; ZVBB-NEXT: vl1r.v v9, (a5) +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: vl1r.v v16, (a5) +; ZVBB-NEXT: add a5, a5, a1 +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: vl1r.v v11, (a2) +; ZVBB-NEXT: add a2, a3, a3 +; ZVBB-NEXT: vl1r.v v15, (a4) +; ZVBB-NEXT: add a4, a1, a1 +; ZVBB-NEXT: vl1r.v v13, (a0) +; ZVBB-NEXT: vl1r.v v17, (a5) +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmsne.vi v18, v8, 0 +; ZVBB-NEXT: vmsne.vi v0, v10, 0 +; ZVBB-NEXT: vmsne.vi v8, v14, 0 +; ZVBB-NEXT: vmsne.vi v9, v12, 
0 +; ZVBB-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v18, a3 +; ZVBB-NEXT: vslideup.vx v9, v8, a3 +; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v9, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmsne.vi v8, v16, 0 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 10 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave5.nxv80i1( %a, %b, %c, %d, %e) + ret %res +} + + +define @vector_interleave_nxv80i8_nxv16i8( %a, %b, %c, %d, %e) nounwind { +; +; RV32-LABEL: vector_interleave_nxv80i8_nxv16i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e8.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e8.v v17, (a1) +; RV32-NEXT: vl1r.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1r.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v11, (a6) +; RV32-NEXT: vl1r.v v8, (a0) +; RV32-NEXT: vl1r.v v9, (a3) +; RV32-NEXT: vl1r.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v15, (a5) +; RV32-NEXT: vl1r.v v12, (a6) +; RV32-NEXT: vl1r.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8r.v v16, (a2) +; RV32-NEXT: vl8r.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv80i8_nxv16i8: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, 
a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e8.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e8.v v17, (a1) +; RV64-NEXT: vl1r.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1r.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v11, (a6) +; RV64-NEXT: vl1r.v v8, (a0) +; RV64-NEXT: vl1r.v v9, (a3) +; RV64-NEXT: vl1r.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v15, (a5) +; RV64-NEXT: vl1r.v v12, (a6) +; RV64-NEXT: vl1r.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8r.v v16, (a2) +; RV64-NEXT: vl8r.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv80i8_nxv16i8: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e8.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e8.v v17, (a1) +; ZVBB-RV32-NEXT: vl1r.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1r.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v11, (a6) +; ZVBB-RV32-NEXT: vl1r.v v8, (a0) +; ZVBB-RV32-NEXT: vl1r.v v9, (a3) +; ZVBB-RV32-NEXT: vl1r.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v15, (a5) +; ZVBB-RV32-NEXT: vl1r.v v12, (a6) +; ZVBB-RV32-NEXT: vl1r.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8r.v v16, (a2) +; ZVBB-RV32-NEXT: vl8r.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: 
vector_interleave_nxv80i8_nxv16i8: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e8.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e8.v v17, (a1) +; ZVBB-RV64-NEXT: vl1r.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1r.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v11, (a6) +; ZVBB-RV64-NEXT: vl1r.v v8, (a0) +; ZVBB-RV64-NEXT: vl1r.v v9, (a3) +; ZVBB-RV64-NEXT: vl1r.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v15, (a5) +; ZVBB-RV64-NEXT: vl1r.v v12, (a6) +; ZVBB-RV64-NEXT: vl1r.v v13, (a1) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8r.v v16, (a2) +; ZVBB-RV64-NEXT: vl8r.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret + %res = call @llvm.vector.interleave5.nxv80i8( %a, %b, %c, %d, %e) + ret %res +} + + +define @vector_interleave_nxv40i8_nxv8i8( %a, %b, %c, %d, %e) nounwind { +; CHECK-LABEL: vector_interleave_nxv40i8_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg5e8.v v8, (a0) +; CHECK-NEXT: vl1r.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1r.v v11, (a3) +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: vl1r.v v9, (a2) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1r.v v12, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv40i8_nxv8i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: 
addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg5e8.v v8, (a0) +; ZVBB-NEXT: vl1r.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1r.v v11, (a3) +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: vl1r.v v9, (a2) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1r.v v12, (a1) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave5.nxv40i8( %a, %b, %c, %d, %e) + ret %res +} + + +define @vector_interleave_nxv20i32_nxv4i32( %a, %b, %c, %d, %e) nounwind { +; +; RV32-LABEL: vector_interleave_nxv20i32_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e32.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e32.v v17, (a1) +; RV32-NEXT: vl1re32.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re32.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v11, (a6) +; RV32-NEXT: vl1re32.v v8, (a0) +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: vl1re32.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v15, (a5) +; RV32-NEXT: vl1re32.v v12, (a6) +; RV32-NEXT: vl1re32.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv20i32_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, 
v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e32.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e32.v v17, (a1) +; RV64-NEXT: vl1re32.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re32.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v11, (a6) +; RV64-NEXT: vl1re32.v v8, (a0) +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: vl1re32.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v15, (a5) +; RV64-NEXT: vl1re32.v v12, (a6) +; RV64-NEXT: vl1re32.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv20i32_nxv4i32: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e32.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e32.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re32.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re32.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re32.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v15, (a5) +; ZVBB-RV32-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re32.v v8, (a0) +; 
ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv20i32_nxv4i32: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e32.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e32.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re32.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re32.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re32.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v15, (a5) +; ZVBB-RV64-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV64-NEXT: vl1re32.v v13, (a1) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret + %res = call @llvm.vector.interleave5.nxv20i32( %a, %b, %c, %d, %e) + ret %res +} + + +define @vector_interleave_nxv10i64_nxv2i64( %a, %b, %c, %d, %e) nounwind { +; +; RV32-LABEL: vector_interleave_nxv10i64_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v 
v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e64.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e64.v v17, (a1) +; RV32-NEXT: vl1re64.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re64.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v11, (a6) +; RV32-NEXT: vl1re64.v v8, (a0) +; RV32-NEXT: vl1re64.v v9, (a3) +; RV32-NEXT: vl1re64.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v15, (a5) +; RV32-NEXT: vl1re64.v v12, (a6) +; RV32-NEXT: vl1re64.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re64.v v16, (a2) +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv10i64_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e64.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e64.v v17, (a1) +; RV64-NEXT: vl1re64.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re64.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v11, (a6) +; RV64-NEXT: vl1re64.v v8, (a0) +; RV64-NEXT: vl1re64.v v9, (a3) +; RV64-NEXT: vl1re64.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v15, (a5) +; RV64-NEXT: vl1re64.v v12, (a6) +; RV64-NEXT: vl1re64.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re64.v v16, (a2) +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv10i64_nxv2i64: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 
4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e64.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e64.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re64.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re64.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v15, (a5) +; ZVBB-RV32-NEXT: vl1re64.v v12, (a6) +; ZVBB-RV32-NEXT: vl1re64.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv10i64_nxv2i64: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e64.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e64.v v17, 
(a1) +; ZVBB-RV64-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re64.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re64.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v15, (a5) +; ZVBB-RV64-NEXT: vl1re64.v v12, (a6) +; ZVBB-RV64-NEXT: vl1re64.v v13, (a1) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret + %res = call @llvm.vector.interleave5.nxv10i64( %a, %b, %c, %d, %e) + ret %res +} + +define @vector_interleave_nxv112i1_nxv16i1( %a, %b, %c, %d, %e, %f, %g) nounwind { +; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v14, 0 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmerge.vim v16, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v22, v14, 1, v0 +; CHECK-NEXT: add a3, a4, a2 +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: add a5, a0, a2 +; CHECK-NEXT: vmv4r.v v24, v16 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v18, v14, 1, v0 +; CHECK-NEXT: add a6, a3, a2 +; CHECK-NEXT: vmv1r.v v25, v22 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v8, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v26, v18 +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmerge.vim v20, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v27, v8 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vim v10, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v28, v20 +; CHECK-NEXT: vmv1r.v v18, v23 +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: vmv1r.v v29, v10 +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vim v30, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v22, v11 +; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg7e8.v v24, (a4) +; CHECK-NEXT: vmv1r.v v23, v31 +; CHECK-NEXT: vsseg7e8.v v17, (a0) +; CHECK-NEXT: vl1r.v v8, (a6) +; CHECK-NEXT: add a6, a7, a2 +; CHECK-NEXT: vl1r.v v10, (a4) +; CHECK-NEXT: add a4, a6, a2 +; CHECK-NEXT: vl1r.v v12, (a6) +; CHECK-NEXT: add a6, a4, a2 +; CHECK-NEXT: vl1r.v v14, (a6) +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: vl1r.v v16, (a5) +; CHECK-NEXT: add a5, a6, a2 +; CHECK-NEXT: vl1r.v v18, (a5) +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vl1r.v v9, (a7) +; CHECK-NEXT: add a7, a5, a2 +; CHECK-NEXT: vl1r.v v20, (a7) +; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: vl1r.v v11, (a3) +; CHECK-NEXT: add a3, a1, a1 +; CHECK-NEXT: vl1r.v v13, (a4) +; CHECK-NEXT: add a4, a2, a2 +; CHECK-NEXT: vl1r.v v15, (a0) +; CHECK-NEXT: 
vl1r.v v19, (a5) +; CHECK-NEXT: vl1r.v v17, (a6) +; CHECK-NEXT: vl1r.v v21, (a7) +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v22, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v10, 0 +; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: vmsne.vi v10, v14, 0 +; CHECK-NEXT: vmsne.vi v11, v18, 0 +; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: vmsne.vi v12, v20, 0 +; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v22, a1 +; CHECK-NEXT: vslideup.vx v9, v10, a1 +; CHECK-NEXT: vslideup.vx v8, v11, a1 +; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v9, a2 +; CHECK-NEXT: vslideup.vx v8, v12, a2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 14 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmv.v.i v14, 0 +; ZVBB-NEXT: addi a4, sp, 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 3 +; ZVBB-NEXT: sub a0, a1, a0 +; ZVBB-NEXT: add a0, sp, a0 +; ZVBB-NEXT: addi a0, a0, 16 +; ZVBB-NEXT: csrr a2, vlenb +; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmerge.vim v22, v14, 1, v0 +; ZVBB-NEXT: add a3, a4, a2 +; ZVBB-NEXT: srli a1, a2, 2 +; ZVBB-NEXT: add a5, a0, a2 +; ZVBB-NEXT: vmv4r.v v24, v16 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmerge.vim v18, v14, 1, v0 +; ZVBB-NEXT: add a6, a3, a2 +; ZVBB-NEXT: vmv1r.v v25, v22 +; ZVBB-NEXT: vmv1r.v v0, v10 +; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v26, v18 +; ZVBB-NEXT: vmv1r.v v0, v11 +; ZVBB-NEXT: vmerge.vim v20, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v27, v8 +; ZVBB-NEXT: vmv1r.v v0, v12 +; ZVBB-NEXT: vmerge.vim v10, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v28, v20 +; ZVBB-NEXT: vmv1r.v v18, v23 +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vmv1r.v v29, v10 +; ZVBB-NEXT: vmv1r.v v20, v9 +; ZVBB-NEXT: vmv1r.v v0, v13 +; ZVBB-NEXT: vmerge.vim v30, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v22, v11 +; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg7e8.v v24, (a4) +; ZVBB-NEXT: vmv1r.v v23, v31 +; ZVBB-NEXT: vsseg7e8.v v17, (a0) +; ZVBB-NEXT: vl1r.v v8, (a6) +; ZVBB-NEXT: add a6, a7, a2 +; ZVBB-NEXT: vl1r.v v10, (a4) +; ZVBB-NEXT: add a4, a6, a2 +; ZVBB-NEXT: vl1r.v v12, (a6) +; ZVBB-NEXT: add a6, a4, a2 +; ZVBB-NEXT: vl1r.v v14, (a6) +; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: vl1r.v v16, (a5) +; ZVBB-NEXT: add a5, a6, a2 +; ZVBB-NEXT: vl1r.v v18, (a5) +; ZVBB-NEXT: add a5, a5, a2 +; ZVBB-NEXT: vl1r.v v9, (a7) +; ZVBB-NEXT: add a7, a5, a2 +; ZVBB-NEXT: vl1r.v v20, (a7) +; ZVBB-NEXT: add a7, a7, a2 +; ZVBB-NEXT: srli a2, a2, 1 +; ZVBB-NEXT: vl1r.v v11, (a3) +; ZVBB-NEXT: add a3, a1, a1 +; ZVBB-NEXT: vl1r.v v13, (a4) +; ZVBB-NEXT: add a4, a2, a2 +; ZVBB-NEXT: vl1r.v v15, (a0) +; ZVBB-NEXT: vl1r.v v19, (a5) +; ZVBB-NEXT: vl1r.v v17, (a6) +; ZVBB-NEXT: vl1r.v v21, (a7) +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmsne.vi v22, v8, 0 +; ZVBB-NEXT: vmsne.vi v0, v10, 0 +; ZVBB-NEXT: vmsne.vi v9, v12, 0 +; ZVBB-NEXT: vmsne.vi v10, v14, 0 +; ZVBB-NEXT: vmsne.vi v11, v18, 0 +; ZVBB-NEXT: vmsne.vi v8, v16, 0 +; ZVBB-NEXT: vmsne.vi v12, v20, 0 +; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v22, a1 +; ZVBB-NEXT: vslideup.vx v9, v10, a1 +; ZVBB-NEXT: vslideup.vx v8, 
v11, a1 +; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v9, a2 +; ZVBB-NEXT: vslideup.vx v8, v12, a2 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 14 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave7.nxv112i1( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + + +define @vector_interleave_nxv112i8_nxv16i8( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v2, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e8.v v1, (a0) +; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e8.v v21, (a1) +; RV32-NEXT: vl1r.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v11, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v13, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1r.v v18, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v19, (a6) +; RV32-NEXT: vl1r.v v16, (a0) +; RV32-NEXT: vl1r.v v8, (a4) +; RV32-NEXT: vl1r.v v17, (a3) +; RV32-NEXT: vl1r.v v9, (a7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 14 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v20, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1r.v v21, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vl1r.v v22, (a6) +; RV32-NEXT: vl1r.v v23, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v12, (a5) +; RV32-NEXT: vs4r.v v8, (a2) +; RV32-NEXT: vs8r.v v16, (a0) +; RV32-NEXT: vl8r.v v16, (a2) +; RV32-NEXT: vl8r.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; 
RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v2, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e8.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e8.v v21, (a1) +; RV64-NEXT: vl1r.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v11, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v13, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1r.v v18, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v19, (a6) +; RV64-NEXT: vl1r.v v16, (a0) +; RV64-NEXT: vl1r.v v8, (a4) +; RV64-NEXT: vl1r.v v17, (a3) +; RV64-NEXT: vl1r.v v9, (a7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1r.v v21, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vl1r.v v22, (a6) +; RV64-NEXT: vl1r.v v23, (a1) +; RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v12, (a5) +; RV64-NEXT: vs4r.v v8, (a2) +; RV64-NEXT: vs8r.v v16, (a0) +; RV64-NEXT: vl8r.v v16, (a2) +; RV64-NEXT: vl8r.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v2, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e8.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e8.v v21, (a1) +; ZVBB-RV32-NEXT: vl1r.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; 
ZVBB-RV32-NEXT: vl1r.v v11, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1r.v v18, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v19, (a6) +; ZVBB-RV32-NEXT: vl1r.v v16, (a0) +; ZVBB-RV32-NEXT: vl1r.v v8, (a4) +; ZVBB-RV32-NEXT: vl1r.v v17, (a3) +; ZVBB-RV32-NEXT: vl1r.v v9, (a7) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 14 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v21, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vl1r.v v22, (a6) +; ZVBB-RV32-NEXT: vl1r.v v23, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v12, (a5) +; ZVBB-RV32-NEXT: vs4r.v v8, (a2) +; ZVBB-RV32-NEXT: vs8r.v v16, (a0) +; ZVBB-RV32-NEXT: vl8r.v v16, (a2) +; ZVBB-RV32-NEXT: vl8r.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v2, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: slli a5, a2, 2 +; ZVBB-RV64-NEXT: vmv1r.v v4, v14 +; ZVBB-RV64-NEXT: slli a6, a2, 4 +; ZVBB-RV64-NEXT: add a7, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v6, v18 +; ZVBB-RV64-NEXT: sub a5, a6, a5 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: add a6, a7, a2 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vsseg7e8.v v1, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 +; ZVBB-RV64-NEXT: vsseg7e8.v v21, (a1) +; ZVBB-RV64-NEXT: vl1r.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v11, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v13, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1r.v v18, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v19, (a6) +; ZVBB-RV64-NEXT: vl1r.v v16, (a0) +; ZVBB-RV64-NEXT: vl1r.v v8, (a4) +; ZVBB-RV64-NEXT: vl1r.v v17, (a3) +; ZVBB-RV64-NEXT: vl1r.v v9, (a7) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 14 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 
+; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v20, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v21, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vl1r.v v22, (a6) +; ZVBB-RV64-NEXT: vl1r.v v23, (a1) +; ZVBB-RV64-NEXT: add a5, a0, a5 +; ZVBB-RV64-NEXT: vs2r.v v12, (a5) +; ZVBB-RV64-NEXT: vs4r.v v8, (a2) +; ZVBB-RV64-NEXT: vs8r.v v16, (a0) +; ZVBB-RV64-NEXT: vl8r.v v16, (a2) +; ZVBB-RV64-NEXT: vl8r.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret + %res = call @llvm.vector.interleave7.nxv112i8( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + + +define @vector_interleave_nxv56i16_nxv8i16( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v2, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e16.v v1, (a0) +; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e16.v v21, (a1) +; RV32-NEXT: vl1re16.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v11, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v13, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re16.v v18, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v19, (a6) +; RV32-NEXT: vl1re16.v v16, (a0) +; RV32-NEXT: vl1re16.v v8, (a4) +; RV32-NEXT: vl1re16.v v17, (a3) +; RV32-NEXT: vl1re16.v v9, (a7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 14 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v20, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v21, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vl1re16.v v22, (a6) +; RV32-NEXT: vl1re16.v v23, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v12, (a5) +; RV32-NEXT: vs4r.v v8, (a2) +; RV32-NEXT: vs8r.v v16, (a0) +; RV32-NEXT: vl8re16.v v16, (a2) +; RV32-NEXT: vl8re16.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16: +; RV64: # 
%bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v2, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e16.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e16.v v21, (a1) +; RV64-NEXT: vl1re16.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v11, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v13, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re16.v v18, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v19, (a6) +; RV64-NEXT: vl1re16.v v16, (a0) +; RV64-NEXT: vl1re16.v v8, (a4) +; RV64-NEXT: vl1re16.v v17, (a3) +; RV64-NEXT: vl1re16.v v9, (a7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v21, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vl1re16.v v22, (a6) +; RV64-NEXT: vl1re16.v v23, (a1) +; RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v12, (a5) +; RV64-NEXT: vs4r.v v8, (a2) +; RV64-NEXT: vs8r.v v16, (a0) +; RV64-NEXT: vl8re16.v v16, (a2) +; RV64-NEXT: vl8re16.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v2, 
v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re16.v v18, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v19, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v16, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v8, (a4) +; ZVBB-RV32-NEXT: vl1re16.v v17, (a3) +; ZVBB-RV32-NEXT: vl1re16.v v9, (a7) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 14 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v21, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vl1re16.v v22, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v23, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v12, (a5) +; ZVBB-RV32-NEXT: vs4r.v v8, (a2) +; ZVBB-RV32-NEXT: vs8r.v v16, (a0) +; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v2, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: slli a5, a2, 2 +; ZVBB-RV64-NEXT: vmv1r.v v4, v14 +; ZVBB-RV64-NEXT: slli a6, a2, 4 +; ZVBB-RV64-NEXT: add a7, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v6, v18 +; ZVBB-RV64-NEXT: sub a5, a6, a5 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: add a6, a7, a2 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 +; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v 
v11, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re16.v v18, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v19, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v16, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v8, (a4) +; ZVBB-RV64-NEXT: vl1re16.v v17, (a3) +; ZVBB-RV64-NEXT: vl1re16.v v9, (a7) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 14 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v20, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v21, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vl1re16.v v22, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v23, (a1) +; ZVBB-RV64-NEXT: add a5, a0, a5 +; ZVBB-RV64-NEXT: vs2r.v v12, (a5) +; ZVBB-RV64-NEXT: vs4r.v v8, (a2) +; ZVBB-RV64-NEXT: vs8r.v v16, (a0) +; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret + %res = call @llvm.vector.interleave7.nxv56i16( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + + +define @vector_interleave_nxv28i32_nxv4i32( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v2, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e32.v v1, (a0) +; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e32.v v21, (a1) +; RV32-NEXT: vl1re32.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v11, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v13, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re32.v v18, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v19, (a6) +; RV32-NEXT: vl1re32.v v16, (a0) +; RV32-NEXT: vl1re32.v v8, (a4) +; RV32-NEXT: vl1re32.v v17, (a3) +; RV32-NEXT: vl1re32.v v9, (a7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 14 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v20, (a6) +; 
RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v21, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vl1re32.v v22, (a6) +; RV32-NEXT: vl1re32.v v23, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v12, (a5) +; RV32-NEXT: vs4r.v v8, (a2) +; RV32-NEXT: vs8r.v v16, (a0) +; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v2, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e32.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e32.v v21, (a1) +; RV64-NEXT: vl1re32.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v11, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v13, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re32.v v18, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v19, (a6) +; RV64-NEXT: vl1re32.v v16, (a0) +; RV64-NEXT: vl1re32.v v8, (a4) +; RV64-NEXT: vl1re32.v v17, (a3) +; RV64-NEXT: vl1re32.v v9, (a7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v21, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vl1re32.v v22, (a6) +; RV64-NEXT: vl1re32.v v23, (a1) +; RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v12, (a5) +; RV64-NEXT: vs4r.v v8, (a2) +; RV64-NEXT: vs8r.v v16, (a0) +; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, 
sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v2, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1) +; ZVBB-RV32-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re32.v v18, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v19, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v16, (a0) +; ZVBB-RV32-NEXT: vl1re32.v v8, (a4) +; ZVBB-RV32-NEXT: vl1re32.v v17, (a3) +; ZVBB-RV32-NEXT: vl1re32.v v9, (a7) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 14 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v21, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vl1re32.v v22, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v23, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v12, (a5) +; ZVBB-RV32-NEXT: vs4r.v v8, (a2) +; ZVBB-RV32-NEXT: vs8r.v v16, (a0) +; ZVBB-RV32-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: 
vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a4)
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a3)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re32.v v22, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v23, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+  %res = call <vscale x 28 x i32> @llvm.vector.interleave7.nxv28i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g)
+  ret <vscale x 28 x i32> %res
+}
+
+define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e64.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e64.v v21, (a1)
+; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v11, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v13, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re64.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v19, (a6)
+; RV32-NEXT: vl1re64.v v16, (a0)
+; RV32-NEXT: vl1re64.v v8, (a4)
+; RV32-NEXT: vl1re64.v v17, (a3)
+; RV32-NEXT: vl1re64.v v9, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v21, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re64.v v22, (a6)
+; RV32-NEXT: vl1re64.v v23, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v12, (a5)
+; RV32-NEXT: vs4r.v v8, (a2)
+; RV32-NEXT: vs8r.v v16, (a0)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e64.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e64.v v21, (a1)
+; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v11, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v13, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re64.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v19, (a6)
+; RV64-NEXT: vl1re64.v v16, (a0)
+; RV64-NEXT: vl1re64.v v8, (a4)
+; RV64-NEXT: vl1re64.v v17, (a3)
+; RV64-NEXT: vl1re64.v v9, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v21, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re64.v v22, (a6)
+; RV64-NEXT: vl1re64.v v23, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v12, (a5)
+; RV64-NEXT: vs4r.v v8, (a2)
+; RV64-NEXT: vs8r.v v16, (a0)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a4)
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a3)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re64.v v22, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v23, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a4)
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a3)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re64.v v22, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v23, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+  %res = call <vscale x 14 x i64> @llvm.vector.interleave7.nxv14i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g)
+  ret <vscale x 14 x i64> %res
+}
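+
+; Note (illustrative sketch, not a generated check): by analogy with
+; @llvm.vector.interleave2, @llvm.vector.interleave7 is assumed here to merge
+; its seven operands element-wise, i.e. element 7*i+j of the result is
+; element i of operand j. For a hypothetical fixed-length form:
+;
+;   %r = call <14 x i32> @llvm.vector.interleave7.v14i32(
+;            <2 x i32> %v0, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3,
+;            <2 x i32> %v4, <2 x i32> %v5, <2 x i32> %v6)
+;
+; the result would be <v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0],
+;                      v0[1], v1[1], v2[1], v3[1], v4[1], v5[1], v6[1]>,
+; which is what the vsseg7/vl1re* sequences above reconstruct through memory.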