-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VectorCombine] New folding pattern for extract/binop/shuffle chains #145232
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Rajveer Singh Bharadwaj (Rajveer100) Changes: Resolves #144654. This adds a new `foldShuffleChainsToReduce` for horizontal reduction of patterns like:
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
%1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
%2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
%3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
%5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
%7 = extractelement <8 x i16> %6, i64 0
ret i16 %7
} ...which can be reduced to an llvm.vector.reduce.umin.v8i16(%a0) intrinsic call. Similar transformations apply to other ops when costs permit. Full diff: https://github.com/llvm/llvm-project/pull/145232.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..aca939c4f534d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -129,6 +129,7 @@ class VectorCombine {
bool foldShuffleOfIntrinsics(Instruction &I);
bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
+ bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2910,6 +2911,130 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
return foldSelectShuffle(*Shuffle, true);
}
+/// Try to fold a halving shuffle/umin chain that ends in an extract of lane 0
+/// into one reduction intrinsic:
+///   %s1 = shufflevector <N x T> %x, poison, <N/2, ..., N-1, poison, ...>
+///   %m1 = call @llvm.umin(%x, %s1)
+///   %s2 = shufflevector %m1, poison, <N/4, ..., N/2-1, poison, ...>
+///   %m2 = call @llvm.umin(%m1, %s2)
+///   ...
+///   %r = extractelement %mK, 0
+/// -->
+///   %r = call @llvm.vector.reduce.umin(%x)
+///
+/// \p I must be the first shuffle of the chain. The chain is followed with
+/// getNextNode(), so only chains whose instructions are adjacent are matched,
+/// and only llvm.umin is handled. The chain's instructions are not erased
+/// here; the final extractelement is handed to replaceValue().
+/// \returns true if the extractelement was replaced by the reduction.
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+  if (!SVI)
+    return false;
+
+  // The vector being reduced is the first shuffle's source. The shuffle must
+  // not change the vector type, otherwise the reduce call built below (which
+  // is overloaded on the shuffle's type) would not match its operand.
+  Value *Src = SVI->getOperand(0);
+  auto *SrcTy = dyn_cast<FixedVectorType>(Src->getType());
+  if (!SrcTy || SVI->getType() != SrcTy)
+    return false;
+
+  // Repeated halving only visits every lane when the lane count is a power of
+  // two (6 -> 3 -> 1 would leave lanes out of the reduction).
+  unsigned NumElts = SrcTy->getNumElements();
+  if (NumElts < 2 || (NumElts & (NumElts - 1)) != 0)
+    return false;
+
+  // Number of lanes the next shuffle has to move down to lane 0.
+  int HalfWidth = NumElts / 2;
+  // Partial reduction so far: the source first, then each matched umin.
+  Value *Acc = Src;
+  // Shuffle that has been matched but not yet consumed by a umin.
+  ShuffleVectorInst *PendingShuffle = nullptr;
+  Instruction *LastOp = nullptr;
+
+  for (Instruction *Cur = SVI; Cur; Cur = Cur->getNextNode()) {
+    if (auto *SV = dyn_cast<ShuffleVectorInst>(Cur)) {
+      if (PendingShuffle || HalfWidth < 1)
+        return false;
+
+      // Every shuffle has to read the current partial result (the original
+      // code never checked this for shuffles after the first one).
+      if (SV->getOperand(0) != Acc || !isa<PoisonValue>(SV->getOperand(1)))
+        return false;
+
+      ArrayRef<int> Mask = SV->getShuffleMask();
+      if (Mask.size() != NumElts)
+        return false;
+
+      // Low HalfWidth lanes take the upper half of the live lanes; all
+      // remaining lanes must be poison (-1 in the mask).
+      for (int Pos = 0, E = Mask.size(); Pos != E; ++Pos) {
+        int Expected = Pos < HalfWidth ? HalfWidth + Pos : -1;
+        if (Mask[Pos] != Expected)
+          return false;
+      }
+
+      HalfWidth /= 2;
+      PendingShuffle = SV;
+      continue;
+    }
+
+    if (auto *II = dyn_cast<IntrinsicInst>(Cur)) {
+      if (!PendingShuffle || II->getIntrinsicID() != Intrinsic::umin)
+        return false;
+
+      // The umin must combine the partial result with its shuffled copy, in
+      // either order. This also covers the first umin, whose operands the
+      // original code accepted unchecked.
+      Value *LHS = II->getArgOperand(0);
+      Value *RHS = II->getArgOperand(1);
+      bool Matches = (LHS == Acc && RHS == PendingShuffle) ||
+                     (LHS == PendingShuffle && RHS == Acc);
+      if (!Matches)
+        return false;
+
+      Acc = II;
+      PendingShuffle = nullptr;
+      continue;
+    }
+
+    if (auto *EE = dyn_cast<ExtractElementInst>(Cur)) {
+      // The chain is complete only once every halving step has been seen and
+      // lane 0 of the last umin is extracted.
+      if (PendingShuffle || HalfWidth != 0 || Acc == Src ||
+          EE->getVectorOperand() != Acc)
+        return false;
+
+      auto *Idx = dyn_cast<ConstantInt>(EE->getIndexOperand());
+      if (!Idx || !Idx->isZero())
+        return false;
+
+      LastOp = EE;
+      break;
+    }
+
+    // Any other instruction interrupts the chain.
+    return false;
+  }
+
+  if (!LastOp)
+    return false;
+
+  auto *ReducedResult = Builder.CreateIntrinsic(
+      Intrinsic::vector_reduce_umin, {SVI->getType()}, {Src});
+  replaceValue(*LastOp, *ReducedResult);
+
+  return true;
+}
+
/// Determine if its more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3607,6 +3732,7 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ MadeChange |= foldShuffleChainsToReduce(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..6f21eb5097fde
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+; Input: three halving shuffle + umin steps (lanes 4-7, then 2-3, then 1), after which lane 0 is extracted.
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
|
@llvm/pr-subscribers-vectorizers Author: Rajveer Singh Bharadwaj (Rajveer100) Changes: Resolves #144654. This adds a new `foldShuffleChainsToReduce` for horizontal reduction of patterns like:
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
%1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
%2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
%3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
%5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
%7 = extractelement <8 x i16> %6, i64 0
ret i16 %7
} ...which can be reduced to an llvm.vector.reduce.umin.v8i16(%a0) intrinsic call. Similar transformations apply to other ops when costs permit. Full diff: https://github.com/llvm/llvm-project/pull/145232.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..aca939c4f534d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -129,6 +129,7 @@ class VectorCombine {
bool foldShuffleOfIntrinsics(Instruction &I);
bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
+ bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2910,6 +2911,130 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
return foldSelectShuffle(*Shuffle, true);
}
+/// Try to fold a halving shuffle/umin chain that ends in an extract of lane 0
+/// into one reduction intrinsic:
+///   %s1 = shufflevector <N x T> %x, poison, <N/2, ..., N-1, poison, ...>
+///   %m1 = call @llvm.umin(%x, %s1)
+///   %s2 = shufflevector %m1, poison, <N/4, ..., N/2-1, poison, ...>
+///   %m2 = call @llvm.umin(%m1, %s2)
+///   ...
+///   %r = extractelement %mK, 0
+/// -->
+///   %r = call @llvm.vector.reduce.umin(%x)
+///
+/// \p I must be the first shuffle of the chain. The chain is followed with
+/// getNextNode(), so only chains whose instructions are adjacent are matched,
+/// and only llvm.umin is handled. The chain's instructions are not erased
+/// here; the final extractelement is handed to replaceValue().
+/// \returns true if the extractelement was replaced by the reduction.
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+  if (!SVI)
+    return false;
+
+  // The vector being reduced is the first shuffle's source. The shuffle must
+  // not change the vector type, otherwise the reduce call built below (which
+  // is overloaded on the shuffle's type) would not match its operand.
+  Value *Src = SVI->getOperand(0);
+  auto *SrcTy = dyn_cast<FixedVectorType>(Src->getType());
+  if (!SrcTy || SVI->getType() != SrcTy)
+    return false;
+
+  // Repeated halving only visits every lane when the lane count is a power of
+  // two (6 -> 3 -> 1 would leave lanes out of the reduction).
+  unsigned NumElts = SrcTy->getNumElements();
+  if (NumElts < 2 || (NumElts & (NumElts - 1)) != 0)
+    return false;
+
+  // Number of lanes the next shuffle has to move down to lane 0.
+  int HalfWidth = NumElts / 2;
+  // Partial reduction so far: the source first, then each matched umin.
+  Value *Acc = Src;
+  // Shuffle that has been matched but not yet consumed by a umin.
+  ShuffleVectorInst *PendingShuffle = nullptr;
+  Instruction *LastOp = nullptr;
+
+  for (Instruction *Cur = SVI; Cur; Cur = Cur->getNextNode()) {
+    if (auto *SV = dyn_cast<ShuffleVectorInst>(Cur)) {
+      if (PendingShuffle || HalfWidth < 1)
+        return false;
+
+      // Every shuffle has to read the current partial result (the original
+      // code never checked this for shuffles after the first one).
+      if (SV->getOperand(0) != Acc || !isa<PoisonValue>(SV->getOperand(1)))
+        return false;
+
+      ArrayRef<int> Mask = SV->getShuffleMask();
+      if (Mask.size() != NumElts)
+        return false;
+
+      // Low HalfWidth lanes take the upper half of the live lanes; all
+      // remaining lanes must be poison (-1 in the mask).
+      for (int Pos = 0, E = Mask.size(); Pos != E; ++Pos) {
+        int Expected = Pos < HalfWidth ? HalfWidth + Pos : -1;
+        if (Mask[Pos] != Expected)
+          return false;
+      }
+
+      HalfWidth /= 2;
+      PendingShuffle = SV;
+      continue;
+    }
+
+    if (auto *II = dyn_cast<IntrinsicInst>(Cur)) {
+      if (!PendingShuffle || II->getIntrinsicID() != Intrinsic::umin)
+        return false;
+
+      // The umin must combine the partial result with its shuffled copy, in
+      // either order. This also covers the first umin, whose operands the
+      // original code accepted unchecked.
+      Value *LHS = II->getArgOperand(0);
+      Value *RHS = II->getArgOperand(1);
+      bool Matches = (LHS == Acc && RHS == PendingShuffle) ||
+                     (LHS == PendingShuffle && RHS == Acc);
+      if (!Matches)
+        return false;
+
+      Acc = II;
+      PendingShuffle = nullptr;
+      continue;
+    }
+
+    if (auto *EE = dyn_cast<ExtractElementInst>(Cur)) {
+      // The chain is complete only once every halving step has been seen and
+      // lane 0 of the last umin is extracted.
+      if (PendingShuffle || HalfWidth != 0 || Acc == Src ||
+          EE->getVectorOperand() != Acc)
+        return false;
+
+      auto *Idx = dyn_cast<ConstantInt>(EE->getIndexOperand());
+      if (!Idx || !Idx->isZero())
+        return false;
+
+      LastOp = EE;
+      break;
+    }
+
+    // Any other instruction interrupts the chain.
+    return false;
+  }
+
+  if (!LastOp)
+    return false;
+
+  auto *ReducedResult = Builder.CreateIntrinsic(
+      Intrinsic::vector_reduce_umin, {SVI->getType()}, {Src});
+  replaceValue(*LastOp, *ReducedResult);
+
+  return true;
+}
+
/// Determine if its more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3607,6 +3732,7 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ MadeChange |= foldShuffleChainsToReduce(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..6f21eb5097fde
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+; Input: three halving shuffle + umin steps (lanes 4-7, then 2-3, then 1), after which lane 0 is extracted.
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
|
20aa7b1
to
fea1941
Compare
fea1941
to
b184ba5
Compare
I have updated the implementation itself, re-checking for other potential issues and adding cost analysis/tests. |
d130a70
to
1da772b
Compare
@RKSimon |
@Rajveer100 Please can you investigate the CI failures? |
llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
Outdated
Show resolved
Hide resolved
1da772b
to
dea9e0b
Compare
dea9e0b
to
0116da2
Compare
Resolves llvm#144654 Part of llvm#143088 This adds a new `foldShuffleChainsToReduce` for horizontal reduction of patterns like: ```llvm define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 { %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) %7 = extractelement <8 x i16> %6, i64 0 ret i16 %7 } ``` ...which can be reduced to a llvm.vector.reduce.umin.v8i16(%a0) intrinsic call. Similar transformation for other ops when costs permit to do so.
0116da2
to
44a3268
Compare
b24a5b8
to
61d835b
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
Added negative tests and support for other binary operations as well. Let me know if anything else is needed. |
61d835b
to
09adb45
Compare
Resolves #144654
Part of #143088
This adds a new `foldShuffleChainsToReduce` fold for horizontal reduction of patterns like the shuffle/umin/extractelement chain shown above, which can be reduced to a single llvm.vector.reduce.umin.v8i16(%a0) intrinsic call.
Similar transformations are applied for other ops when costs permit.