From 5b47f49008649d3fe2b060ad38712f0b75fd7d01 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Wed, 2 Jul 2025 14:19:05 -0700
Subject: [PATCH] [RISCV][TTI] Model partial reduce of ext for zvqdotq

This is the RISCV follow-up to f575b18f, leveraging the infrastructure
recently added there.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  21 +-
 .../LoopVectorize/RISCV/partial-reduce.ll     | 663 ++++++++++++++++++
 2 files changed, 680 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 67a51c12b508e..f84823dca3ad3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -303,16 +303,29 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost(
   // zve32x is broken for partial_reduce_umla, but let's make sure we
   // don't generate them.
   if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
-      Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
-      InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
+      Opcode != Instruction::Add || !InputTypeA->isIntegerTy(8) ||
       !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
     return InstructionCost::getInvalid();
 
+  // We support both the plain dot product idiom, and the use of dot product
+  // to compute a reduction of an extended value.
+  if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
+    return InstructionCost::getInvalid();
+
+  InstructionCost IntMatCost = 0;
+  if (!BinOp) {
+    // Cost to produce one vmv.v.i -- since the constant is shared across any
+    // unrolled copies, don't need to scale by LT.first.
+    Type *Tp = VectorType::get(InputTypeA, VF);
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+    IntMatCost = getRISCVInstructionCost(RISCV::VMV_V_I, LT.second, CostKind);
+  }
+
   Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
   // Note: Asuming all vqdot* variants are equal cost
-  return LT.first *
-         getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
+  return IntMatCost + LT.first * getRISCVInstructionCost(RISCV::VQDOT_VV,
+                                                         LT.second, CostKind);
 }
 
 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce.ll
new file mode 100644
index 0000000000000..83475796abc6c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt -passes=loop-vectorize -mattr=+v -S < %s | FileCheck %s --check-prefixes=CHECK,V
+; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ
+; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V
+; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ
+
+target triple = "riscv64-none-unknown-elf"
+
+; == Partial reductions with add of an extend
+
+define i32 @zext_add_reduc_i8_i32(ptr %a) {
+; V-LABEL: define i32 @zext_add_reduc_i8_i32(
+; V-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; V-NEXT: entry:
+; V-NEXT: 
[[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; V: vector.ph: +; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; V-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; V-NEXT: br label [[VECTOR_BODY:%.*]] +; V: vector.body: +; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; V-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; V-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; V-NEXT: [[TMP9]] = add [[TMP8]], [[VEC_PHI]] +; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; V-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; V-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; V: middle.block: +; V-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP9]]) +; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; V: scalar.ph: +; +; ZVQDOTQ-LABEL: define i32 @zext_add_reduc_i8_i32( +; ZVQDOTQ-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; ZVQDOTQ-NEXT: entry: +; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; ZVQDOTQ: vector.ph: +; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; ZVQDOTQ: vector.body: +; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP8]]) +; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; ZVQDOTQ-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ZVQDOTQ: middle.block: +; ZVQDOTQ-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32( [[PARTIAL_REDUCE]]) +; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label 
[[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; ZVQDOTQ: scalar.ph: +; +; FIXED-V-LABEL: define i32 @zext_add_reduc_i8_i32( +; FIXED-V-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-V-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-V-NEXT: [[TMP5]] = add <8 x i32> [[TMP3]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP6]] = add <8 x i32> [[TMP4]], [[VEC_PHI1]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-V-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-V-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] +; FIXED-V-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-V-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @zext_add_reduc_i8_i32( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP4]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; 
FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: +; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-ZVQDOTQ: scalar.ph: +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %add = add i32 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i64 @zext_add_reduc_i8_i64(ptr %a) { +; CHECK-LABEL: define i64 @zext_add_reduc_i8_i64( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP9]] = add [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; +; FIXED-LABEL: define i64 @zext_add_reduc_i8_i64( +; FIXED-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; FIXED-NEXT: entry: +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED: vector.ph: +; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED: vector.body: +; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], 
[[VECTOR_BODY]] ] +; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; FIXED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64> +; FIXED-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i64> +; FIXED-NEXT: [[TMP5]] = add <4 x i64> [[TMP3]], [[VEC_PHI]] +; FIXED-NEXT: [[TMP6]] = add <4 x i64> [[TMP4]], [[VEC_PHI1]] +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED: middle.block: +; FIXED-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP6]], [[TMP5]] +; FIXED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]]) +; FIXED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED: scalar.ph: +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i64 + %add = add i64 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %add +} + + +define i64 @zext_add_reduc_i16_i64(ptr %a) { +; CHECK-LABEL: define i64 @zext_add_reduc_i16_i64( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP9]] = add [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: 
+; +; FIXED-LABEL: define i64 @zext_add_reduc_i16_i64( +; FIXED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; FIXED-NEXT: entry: +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED: vector.ph: +; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED: vector.body: +; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 0 +; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP0]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 2 +; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 +; FIXED-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i64> +; FIXED-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[WIDE_LOAD2]] to <4 x i64> +; FIXED-NEXT: [[TMP5]] = add <4 x i64> [[TMP3]], [[VEC_PHI]] +; FIXED-NEXT: [[TMP6]] = add <4 x i64> [[TMP4]], [[VEC_PHI1]] +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED: middle.block: +; FIXED-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP6]], [[TMP5]] +; FIXED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]]) +; FIXED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED: scalar.ph: +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i16, ptr %a, i64 %iv + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %add = add i64 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %add +} + + + +define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) { +; V-LABEL: define i32 @zext_add_reduc_i8_i32_predicated( +; V-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; V-NEXT: entry: +; V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; V: vector.ph: +; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; V-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; V-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] +; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; V-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; V-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; V-NEXT: br label [[VECTOR_BODY:%.*]] +; V: vector.body: +; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 1025) +; V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; V-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], 
i32 1, [[ACTIVE_LANE_MASK]], poison) +; V-NEXT: [[TMP7:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; V-NEXT: [[TMP8]] = add [[TMP7]], [[VEC_PHI]] +; V-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], [[VEC_PHI]] +; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; V-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; V-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; V: middle.block: +; V-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP9]]) +; V-NEXT: br label [[FOR_EXIT:%.*]] +; V: scalar.ph: +; +; ZVQDOTQ-LABEL: define i32 @zext_add_reduc_i8_i32_predicated( +; ZVQDOTQ-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; ZVQDOTQ-NEXT: entry: +; ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; ZVQDOTQ: vector.ph: +; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; ZVQDOTQ-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; ZVQDOTQ-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] +; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; ZVQDOTQ-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; ZVQDOTQ: vector.body: +; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 1025) +; ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; ZVQDOTQ-NEXT: [[TMP7:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP8]]) +; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; ZVQDOTQ-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; ZVQDOTQ: middle.block: +; ZVQDOTQ-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32( [[PARTIAL_REDUCE]]) +; ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]] +; ZVQDOTQ: scalar.ph: +; +; FIXED-V-LABEL: define i32 @zext_add_reduc_i8_i32_predicated( +; FIXED-V-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 1025) +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
<8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; FIXED-V-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP3]] = add <8 x i32> [[TMP2]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP4:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP3]], <8 x i32> [[VEC_PHI]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-V-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1032 +; FIXED-V-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @zext_add_reduc_i8_i32_predicated( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 1025) +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1032 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: +; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]]) +; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]] +; FIXED-ZVQDOTQ: scalar.ph: +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %add = add i32 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body, !llvm.loop !0 + +for.exit: ; preds = %for.body + ret i32 %add +} + + +define i32 @zext_sub_reduc_i8_i32(ptr %a) { +; CHECK-LABEL: define i32 @zext_sub_reduc_i8_i32( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP9]] = sub [[VEC_PHI]], [[TMP8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; +; FIXED-LABEL: define i32 @zext_sub_reduc_i8_i32( +; FIXED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; FIXED-NEXT: entry: +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED: vector.ph: +; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED: vector.body: +; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-NEXT: [[TMP5]] = sub <8 x i32> [[VEC_PHI]], [[TMP3]] +; FIXED-NEXT: [[TMP6]] = sub <8 x i32> [[VEC_PHI1]], [[TMP4]] +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED: middle.block: +; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] +; FIXED-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED: scalar.ph: +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = 
zext i8 %load.a to i32 + %add = sub i32 %accum, %ext.a + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + + +define i32 @sext_add_reduc_i8_i32(ptr %a) { +; V-LABEL: define i32 @sext_add_reduc_i8_i32( +; V-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; V-NEXT: entry: +; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; V: vector.ph: +; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; V-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; V-NEXT: br label [[VECTOR_BODY:%.*]] +; V: vector.body: +; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; V-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; V-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to +; V-NEXT: [[TMP9]] = add [[TMP8]], [[VEC_PHI]] +; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; V-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; V-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; V: middle.block: +; V-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP9]]) +; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; V: scalar.ph: +; +; ZVQDOTQ-LABEL: define i32 @sext_add_reduc_i8_i32( +; ZVQDOTQ-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; ZVQDOTQ-NEXT: entry: +; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; ZVQDOTQ: vector.ph: +; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; ZVQDOTQ: vector.body: +; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP8]]) +; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; ZVQDOTQ-NEXT: 
[[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; ZVQDOTQ: middle.block: +; ZVQDOTQ-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32( [[PARTIAL_REDUCE]]) +; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; ZVQDOTQ: scalar.ph: +; +; FIXED-V-LABEL: define i32 @sext_add_reduc_i8_i32( +; FIXED-V-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-V-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-V-NEXT: [[TMP5]] = add <8 x i32> [[TMP3]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP6]] = add <8 x i32> [[TMP4]], [[VEC_PHI1]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-V-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-V-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] +; FIXED-V-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-V-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @sext_add_reduc_i8_i32( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: 
[[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP4]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: +; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-ZVQDOTQ: scalar.ph: +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %add = add i32 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
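
; Illustration appended by the editor, not part of the patch: a minimal LLVM IR
; sketch of the two idioms the updated getPartialReductionCost now prices for
; zvqdotq, modeled on the tests above. The value names (%a.vec, %acc, and
; friends) are hypothetical, not taken from the patch.
;
;   ; Plain dot-product idiom: both inputs are i8 extended to i32, multiplied,
;   ; then accumulated via a partial add reduction (VF a multiple of 4).
;   %a.ext = zext <vscale x 4 x i8> %a.vec to <vscale x 4 x i32>
;   %b.ext = zext <vscale x 4 x i8> %b.vec to <vscale x 4 x i32>
;   %mul = mul <vscale x 4 x i32> %a.ext, %b.ext
;   %acc.next = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %acc, <vscale x 4 x i32> %mul)
;
;   ; Partial reduction of a lone extend (no mul), as in zext_add_reduc_i8_i32:
;   ; the new IntMatCost term charges the single vmv.v.i assumed to materialize
;   ; the constant vqdot operand, which is shared across unrolled copies.
;   %c.ext = zext <vscale x 4 x i8> %c.vec to <vscale x 4 x i32>
;   %acc2.next = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %acc2, <vscale x 4 x i32> %c.ext)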