diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1ebc62f984390..d029997e9565f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6567,6 +6567,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, CmpInst::BAD_ICMP_PREDICATE, CostKind); } + // When folding the tail with EVL, a phi of an out-of-loop reduction is + // widened into a vp_merge, so cost it as that intrinsic, not as a PHI. + if (VF.isVector() && foldTailWithEVL() && + Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { + IntrinsicCostAttributes ICA( + Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF), + {ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); + return TTI.getIntrinsicInstrCost(ICA, CostKind); + } + return TTI.getCFInstrCost(Instruction::PHI, CostKind); } case Instruction::UDiv: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll new file mode 100644 index 0000000000000..aa1bb25af930d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll @@ -0,0 +1,25 @@ +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s + +; CHECK: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC vp<%{{.+}}> = call llvm.vp.merge(ir, ir<%add>, ir<%rdx>, vp<%{{.+}}>) +; CHECK: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %loop ] + +define i32 @add(ptr %a, i64 %n, i32 %start) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %add, %loop ] + %arrayidx = 
getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i32 %add +}