diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c19265613c706..7cee9261f59cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15928,17 +15928,32 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
     return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
   }

-  // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
-  // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
-  // this element size leads to the best codegen, since e.g. setcc results
-  // might need to be truncated otherwise.
-  EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
+  // Results of setcc operations get widened to 128 bits if their input
+  // operands are 128 bits wide; otherwise, vectors that are less than 64 bits
+  // get widened to neatly fit a 64-bit register, so e.g. <4 x i1> gets
+  // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
+  // size leads to the best codegen, since e.g. setcc results might need to be
+  // truncated otherwise.
+  unsigned ExtendedWidth = 64;
+  if (Vec.getOpcode() == ISD::SETCC &&
+      Vec.getOperand(0).getValueSizeInBits() >= 128) {
+    ExtendedWidth = 128;
+  }
+  EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));

   // any_ext doesn't work with umin/umax, so only use it for uadd.
   unsigned ExtendOp =
       ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
   SDValue Extended = DAG.getNode(
       ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
+  // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
+  // in that case we bitcast the sign-extended values from v2i64 to v4i32
+  // before reduction for optimal code generation.
+  if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
+      NumElems == 2 && ExtendedWidth == 128) {
+    Extended = DAG.getBitcast(MVT::v4i32, Extended);
+    ExtendedVT = MVT::i32;
+  }
   switch (ScalarOpcode) {
   case ISD::AND:
     Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 767ca91a58bb1..f317a7b808342 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -12,8 +12,7 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
 ; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    umaxv b0, v0.8b
+; CHECK-NEXT:    umaxv h0, v0.8h
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    bic w0, w8, w9
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 58020d28702b2..625e8ae6a98dc 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -15,8 +15,15 @@ declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)

-define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v1:
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
+
+define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -29,8 +36,8 @@ define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }

-define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v2:
+define i32 @reduce_and_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
@@ -46,8 +53,8 @@ define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }

-define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v4:
+define i32 @reduce_and_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
@@ -63,8 +70,8 @@ define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }

-define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v8:
+define i32 @reduce_and_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    uminv b0, v0.8b
@@ -78,8 +85,8 @@ define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }

-define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v16:
+define i32 @reduce_and_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: 
reduce_and_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: uminv b0, v0.16b @@ -93,8 +100,8 @@ define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_and_v32: +define i32 @reduce_and_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 @@ -109,8 +116,193 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_or_v1: +define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[0] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: uminv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: uminv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <8 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: uminv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <16 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i32> %a0, zeroinitializer + %y = call i1 
@llvm.vector.reduce.and.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uminv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <8 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_and_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_and_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.2d, v1.2d, #0 +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.b[0] @@ -123,8 +315,8 @@ define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_or_v2: +define i32 @reduce_or_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v2i8: ; CHECK: // 
%bb.0: ; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: sshr v0.2s, v0.2s, #24 @@ -140,8 +332,8 @@ define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_or_v4: +define i32 @reduce_or_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 @@ -157,8 +349,8 @@ define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_or_v8: +define i32 @reduce_or_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 ; CHECK-NEXT: umaxv b0, v0.8b @@ -172,8 +364,8 @@ define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_or_v16: +define i32 @reduce_or_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: umaxv b0, v0.16b @@ -187,8 +379,8 @@ define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind { ret i32 %z } -define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { -; CHECK-LABEL: reduce_or_v32: +define i32 @reduce_or_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 @@ -202,3 +394,468 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } + +define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[0] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: umaxv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: umaxv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; 
CHECK-NEXT: ret + %x = icmp slt <8 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umaxv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <16 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: umaxv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <8 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_or_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define 
i32 @reduce_or_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_or_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.2d, v1.2d, #0 +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.b[0] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i8> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i8> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: addv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i8> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <8 x i8> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <16 x i8> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <32 x i8> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 
@reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[0] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: addv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <8 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <16 x i16> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; 
CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <8 x i32> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel w0, w0, w1, lt +; CHECK-NEXT: ret + %x = icmp slt <1 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <2 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} + +define i32 @reduce_xor_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind { +; CHECK-LABEL: reduce_xor_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmlt v1.2d, v1.2d, #0 +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %x = icmp slt <4 x i64> %a0, zeroinitializer + %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x) + %z = select i1 %y, i32 %a1, i32 %a2 + ret i32 %z +} diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll index c0f1720e1cf8b..5212acc6fca0f 100644 --- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll +++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll @@ -58,11 +58,11 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h ; NEON-FIXED-NEXT: adrp x8, .LCPI1_0 ; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0] +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI1_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.8b, v1.8h -; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; NEON-FIXED-NEXT: umaxv b1, v1.8b +; NEON-FIXED-NEXT: xtn v2.8b, v1.8h +; NEON-FIXED-NEXT: umaxv h1, v1.8h +; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; NEON-FIXED-NEXT: umaxv b2, v2.8b ; NEON-FIXED-NEXT: fmov w8, s2 ; NEON-FIXED-NEXT: bfi x9, x8, #1, #3 @@ -78,12 +78,12 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h -; SVE-FIXED-NEXT: index z2.b, #0, #1 +; SVE-FIXED-NEXT: index z3.b, #0, #1 
; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.8b, v1.8h -; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; SVE-FIXED-NEXT: umaxv b1, v1.8b +; SVE-FIXED-NEXT: xtn v2.8b, v1.8h +; SVE-FIXED-NEXT: umaxv h1, v1.8h +; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; SVE-FIXED-NEXT: umaxv b2, v2.8b ; SVE-FIXED-NEXT: fmov w8, s2 ; SVE-FIXED-NEXT: bfi x9, x8, #1, #3 @@ -106,11 +106,11 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s ; NEON-FIXED-NEXT: adrp x8, .LCPI2_0 ; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0] +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI2_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.4h, v1.4s -; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; NEON-FIXED-NEXT: umaxv h1, v1.4h +; NEON-FIXED-NEXT: xtn v2.4h, v1.4s +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; NEON-FIXED-NEXT: umaxv h2, v2.4h ; NEON-FIXED-NEXT: fmov w8, s2 ; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 @@ -126,12 +126,12 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s -; SVE-FIXED-NEXT: index z2.h, #0, #1 +; SVE-FIXED-NEXT: index z3.h, #0, #1 ; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.4h, v1.4s -; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; SVE-FIXED-NEXT: umaxv h1, v1.4h +; SVE-FIXED-NEXT: xtn v2.4h, v1.4s +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; SVE-FIXED-NEXT: umaxv h2, v2.4h ; SVE-FIXED-NEXT: fmov w8, s2 ; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 @@ -154,11 +154,11 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d ; NEON-FIXED-NEXT: adrp x8, .LCPI3_0 ; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0] +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI3_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.2s, v1.2d -; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; NEON-FIXED-NEXT: xtn v2.2s, v1.2d +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s ; NEON-FIXED-NEXT: fmov w8, s2 ; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 @@ -174,12 +174,12 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d -; SVE-FIXED-NEXT: index z2.s, #0, #1 +; SVE-FIXED-NEXT: index z3.s, #0, #1 ; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.2s, v1.2d -; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; SVE-FIXED-NEXT: xtn v2.2s, v1.2d +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s ; SVE-FIXED-NEXT: fmov w8, s2 ; SVE-FIXED-NEXT: bfi x9, x8, #3, #1 @@ -202,11 +202,11 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s ; NEON-FIXED-NEXT: adrp x8, .LCPI4_0 ; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0] +; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI4_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.4h, v1.4s -; 
NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; NEON-FIXED-NEXT: umaxv h1, v1.4h +; NEON-FIXED-NEXT: xtn v3.4h, v1.4s +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b ; NEON-FIXED-NEXT: umaxv h3, v3.4h ; NEON-FIXED-NEXT: fmov w8, s3 ; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 @@ -222,12 +222,12 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s -; SVE-FIXED-NEXT: index z3.h, #0, #1 +; SVE-FIXED-NEXT: index z4.h, #0, #1 ; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.4h, v1.4s -; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; SVE-FIXED-NEXT: umaxv h1, v1.4h +; SVE-FIXED-NEXT: xtn v3.4h, v1.4s +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b ; SVE-FIXED-NEXT: umaxv h3, v3.4h ; SVE-FIXED-NEXT: fmov w8, s3 ; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 @@ -250,11 +250,11 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % ; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d ; NEON-FIXED-NEXT: adrp x8, .LCPI5_0 ; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0] +; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI5_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.2s, v1.2d -; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; NEON-FIXED-NEXT: xtn v3.2s, v1.2d +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b ; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s ; NEON-FIXED-NEXT: fmov w8, s3 ; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 @@ -270,12 +270,12 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d -; SVE-FIXED-NEXT: index z3.s, #0, #1 +; SVE-FIXED-NEXT: index z4.s, #0, #1 ; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.2s, v1.2d -; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; SVE-FIXED-NEXT: xtn v3.2s, v1.2d +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b ; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s ; SVE-FIXED-NEXT: fmov w8, s3 ; SVE-FIXED-NEXT: bfi x9, x8, #3, #1