Closed
Description
LLVM fails to optimize the following right shift+narrow operations down to a single narrowing right shift instruction on AArch64:
; Arithmetic right shift of every i32 lane by 5, followed by a call to the
; AArch64 NEON sqxtn intrinsic to narrow the four i32 lanes to four i16 lanes.
define dso_local noundef <4 x i16> @NarrowShrI32By5(<4 x i32> noundef %v) #0 {
entry:
  %shifted = ashr <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
  %narrowed = tail call noundef <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %shifted)
  ret <4 x i16> %narrowed
}

; Narrowing intrinsic used by @NarrowShrI32By5.
declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) #1
; Logical right shift of every i32 lane by 5, followed by a call to the
; AArch64 NEON uqxtn intrinsic to narrow the four i32 lanes to four i16 lanes.
define dso_local noundef <4 x i16> @NarrowShrU32By5(<4 x i32> noundef %v) #0 {
entry:
  %shifted = lshr <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
  %narrowed = tail call noundef <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %shifted)
  ret <4 x i16> %narrowed
}

; Narrowing intrinsic used by @NarrowShrU32By5.
declare <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32>) #1
; Logical right shift of every i32 lane by 5, followed by a call to the
; AArch64 NEON sqxtun intrinsic to narrow the four i32 lanes to four i16 lanes.
; NOTE(review): the function name says "I32" (signed input) but the shift is
; lshr, not ashr. With lshr a negative lane becomes a large positive value, so
; sqxtun would saturate it to 0xFFFF, whereas a signed narrowing shift
; (sqshrun) would clamp it to 0. Presumably ashr was intended here -- confirm
; against the original source before relying on a sqshrun fold.
define dso_local noundef <4 x i16> @NarrowShrI32By5ToU16(<4 x i32> noundef %0) #0 {
%2 = lshr <4 x i32> %0, <i32 5, i32 5, i32 5, i32 5>
%3 = tail call noundef <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %2)
ret <4 x i16> %3
}
; Narrowing intrinsic used by @NarrowShrI32By5ToU16.
declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) #1
; Attribute group #0 is attached to the three function definitions in this
; snippet; it pins the target CPU to "generic" with NEON enabled.
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
; Attribute group #1 is attached to the intrinsic declarations.
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
Here is the assembly that is currently generated when the above code is compiled with llc:
// Current llc output: the shift and the narrow are emitted as two separate
// instructions.
NarrowShrI32By5: // @NarrowShrI32By5
// Signed shift right of each of the four 32-bit lanes by 5.
sshr v0.4s, v0.4s, #5
// Signed saturating narrow of the four 32-bit lanes to four 16-bit lanes.
sqxtn v0.4h, v0.4s
ret
// Current llc output: the shift and the narrow are emitted as two separate
// instructions.
NarrowShrU32By5: // @NarrowShrU32By5
// Unsigned shift right of each of the four 32-bit lanes by 5.
ushr v0.4s, v0.4s, #5
// Unsigned saturating narrow of the four 32-bit lanes to four 16-bit lanes.
uqxtn v0.4h, v0.4s
ret
// Current llc output: the shift and the narrow are emitted as two separate
// instructions.
NarrowShrI32By5ToU16: // @NarrowShrI32By5ToU16
// Unsigned (logical) shift right of each of the four 32-bit lanes by 5.
// NOTE(review): the label suggests a signed input, for which sshr would be
// expected -- confirm whether the reproducer meant an arithmetic shift.
ushr v0.4s, v0.4s, #5
// Signed-to-unsigned saturating narrow of the four 32-bit lanes to 16-bit lanes.
sqxtun v0.4h, v0.4s
ret
The snippet above can be found at https://godbolt.org/z/jq3zMKPz1.
Here is the expected, more optimized assembly, which uses a single saturating narrowing right shift instruction per function:
NarrowShrI32By5:
        // Signed saturating shift right narrow: shift each 32-bit lane right by 5
        // and saturate the result into a signed 16-bit lane. The destination of a
        // narrowing shift from .4s uses the half-width arrangement .4h.
        sqshrn  v0.4h, v0.4s, #5
        ret
NarrowShrU32By5:
        // Unsigned saturating shift right narrow: shift each 32-bit lane right by 5
        // and saturate the result into an unsigned 16-bit lane. The destination of
        // a narrowing shift from .4s uses the half-width arrangement .4h.
        uqshrn  v0.4h, v0.4s, #5
        ret
NarrowShrI32By5ToU16:
        // Signed saturating shift right unsigned narrow: arithmetic shift of each
        // signed 32-bit lane right by 5, saturated into an unsigned 16-bit lane.
        // The destination of a narrowing shift from .4s uses the half-width
        // arrangement .4h.
        // NOTE(review): this matches shift + sqxtun only when the shift is
        // arithmetic (ashr / sshr); for a logical shift (lshr / ushr) negative
        // lanes would saturate to 0xFFFF rather than 0 -- confirm which shift the
        // reproducer intends.
        sqshrun v0.4h, v0.4s, #5
        ret