Skip to content

LLVM fails to optimize right shift by constant+saturating narrow to single narrowing right shift on AArch64 #112925

Closed
@johnplatts

Description

@johnplatts

LLVM fails to optimize the following right shift+narrow operations down to a single narrowing right shift instruction on AArch64:

; Arithmetic-shifts every i32 lane right by 5, then narrows the result to i16
; lanes through the signed saturating narrow intrinsic (sqxtn).
define dso_local noundef <4 x i16> @NarrowShrI32By5(<4 x i32> noundef %v) #0 {
  %shifted = ashr <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
  %narrowed = tail call noundef <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %shifted)
  ret <4 x i16> %narrowed
}

declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) #1

; Logical-shifts every i32 lane right by 5, then narrows the result to i16
; lanes through the unsigned saturating narrow intrinsic (uqxtn).
define dso_local noundef <4 x i16> @NarrowShrU32By5(<4 x i32> noundef %v) #0 {
  %shifted = lshr <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
  %narrowed = tail call noundef <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %shifted)
  ret <4 x i16> %narrowed
}

declare <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32>) #1

; Logical-shifts every i32 lane right by 5, then narrows the result to i16
; lanes through the signed-to-unsigned saturating narrow intrinsic (sqxtun).
; NOTE(review): the function name says I32 (signed) but the shift is lshr, not
; ashr. A single sqshrun shifts its source as a signed value, so it would only
; match an ashr input for lanes with the sign bit set - confirm which shift
; was intended.
define dso_local noundef <4 x i16> @NarrowShrI32By5ToU16(<4 x i32> noundef %v) #0 {
  %shifted = lshr <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
  %narrowed = tail call noundef <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %shifted)
  ret <4 x i16> %narrowed
}

declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) #1

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

Here is the assembly that is currently generated when the above code is compiled with llc:

// Current output: the shift and the saturating narrow are two separate instructions.
NarrowShrI32By5:                        // @NarrowShrI32By5
        sshr    v0.4s, v0.4s, #5        // signed (arithmetic) shift right of each 32-bit lane by 5
        sqxtn   v0.4h, v0.4s            // signed saturating narrow: 32-bit lanes -> 16-bit lanes
        ret
// Current output: the shift and the saturating narrow are two separate instructions.
NarrowShrU32By5:                        // @NarrowShrU32By5
        ushr    v0.4s, v0.4s, #5        // unsigned (logical) shift right of each 32-bit lane by 5
        uqxtn   v0.4h, v0.4s            // unsigned saturating narrow: 32-bit lanes -> 16-bit lanes
        ret
// Current output: the shift and the saturating narrow are two separate instructions.
NarrowShrI32By5ToU16:                   // @NarrowShrI32By5ToU16
        ushr    v0.4s, v0.4s, #5        // unsigned (logical) shift right of each 32-bit lane by 5
        sqxtun  v0.4h, v0.4s            // signed-to-unsigned saturating narrow: 32-bit lanes -> 16-bit lanes
        ret

The snippet above can be found at https://godbolt.org/z/jq3zMKPz1.

Here is a more optimized version of the above code:

// Expected output: one signed saturating shift-right-narrow replaces sshr + sqxtn.
NarrowShrI32By5:
        sqshrn  v0.4h, v0.4s, #5        // narrowing op: destination is the .4h arrangement, source is .4s
        ret
// Expected output: one unsigned saturating shift-right-narrow replaces ushr + uqxtn.
NarrowShrU32By5:
        uqshrn  v0.4h, v0.4s, #5        // narrowing op: destination is the .4h arrangement, source is .4s
        ret
// Expected output: one signed-to-unsigned saturating shift-right-narrow.
// NOTE(review): sqshrun shifts its source as a signed value (arithmetic shift),
// while the IR for this function uses lshr. The two differ for lanes with the
// sign bit set, so this form matches an ashr + sqxtun input - confirm intent.
NarrowShrI32By5ToU16:
        sqshrun v0.4h, v0.4s, #5        // narrowing op: destination is the .4h arrangement, source is .4s
        ret

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions