From 60933f22a1d2402fe4dfb9d826a44156c6d7f917 Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Tue, 25 Feb 2025 00:31:28 -0800
Subject: [PATCH 1/4] Pre-commit for cm_stride

---
 .../RISCV/riscv-vector-reverse-output.ll      | 417 ++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..2b8ac0ab685ab
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; This is the loop in c++ being vectorized in this file with
+;; vector.reverse
+;;   #pragma clang loop vectorize_width(4, scalable)
+;;   for (int i = N-1; i >= 0; --i)
+;;     a[i] = b[i] + 1.0;
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN:   -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN:   | FileCheck --check-prefix=RV64 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
+; RUN:   -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN:   | FileCheck --check-prefix=RV32 %s
+
+define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
+; RV64-LABEL: define void @vector_reverse_i64(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT: [[ENTRY:.*:]]
+; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64: [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64: [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT: [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP21]]
+; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP25:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP27]]
+; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP28]]
+; RV64-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP25]])
+; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV64-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[ADD9:%.*]] = add i32 [[TMP32]], 1
+; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV64-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+; RV32-LABEL: define void @vector_reverse_i64(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV32-NEXT: [[ENTRY:.*:]]
+; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT: br
i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV32: [[FOR_BODY_PREHEADER]]: +; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; RV32-NEXT: [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV32-NEXT: [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] +; RV32-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0 +; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 +; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] +; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32 +; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]] +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP15]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 4 +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP18:%.*]] = add [[REVERSE]], splat (i32 1) +; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32 +; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]] +; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]] +; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 [[TMP21]] +; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP22]] +; RV32-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; RV32-NEXT: store [[REVERSE2]], ptr [[TMP24]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV32: [[FOR_COND_CLEANUP]]: +; RV32-NEXT: ret void +; RV32: [[FOR_BODY]]: +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] 
], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] +; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] +; RV32-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV32-NEXT: [[ADD9:%.*]] = add i32 [[TMP26]], 1 +; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] +; RV32-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 +; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom + %1 = load i32, ptr %arrayidx, align 4 + %add9 = add i32 %1, 1 + %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom + store i32 %add9, ptr %arrayidx3, align 4 + %cmp = icmp ugt i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +} + +define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) { +; RV64-LABEL: define void @vector_reverse_f32( +; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*:]] +; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64: [[FOR_BODY_PREHEADER]]: +; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64: [[VECTOR_SCEVCHECK]]: +; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64: [[VECTOR_PH]]: +; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] +; RV64-NEXT: 
[[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; RV64-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-NEXT: [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64: [[VECTOR_BODY]]: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] +; RV64-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0 +; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1 +; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 +; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] +; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]] +; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]] +; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP25:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] +; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]] +; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]] +; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP27]] +; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP28]] +; RV64-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP25]]) +; RV64-NEXT: store [[REVERSE2]], ptr [[TMP30]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; RV64: [[MIDDLE_BLOCK]]: +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64: [[SCALAR_PH]]: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64: [[FOR_COND_CLEANUP]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] +; RV64-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; RV64-NEXT: [[CONV1:%.*]] = fadd float [[TMP32]], 1.000000e+00 +; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] +; RV64-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], 
align 4 +; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; +; RV32-LABEL: define void @vector_reverse_f32( +; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; RV32-NEXT: [[ENTRY:.*:]] +; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV32: [[FOR_BODY_PREHEADER]]: +; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; RV32-NEXT: [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV32-NEXT: [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] +; RV32-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0 +; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 +; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32 +; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]] +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP15]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 4 +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP18:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32 +; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]] +; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]] +; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 [[TMP21]] +; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP22]] +; RV32-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP18]]) +; RV32-NEXT: store [[REVERSE2]], ptr [[TMP24]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: 
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV32: [[FOR_COND_CLEANUP]]: +; RV32-NEXT: ret void +; RV32: [[FOR_BODY]]: +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] +; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] +; RV32-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; RV32-NEXT: [[CONV1:%.*]] = fadd float [[TMP26]], 1.000000e+00 +; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] +; RV32-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 +; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom + %1 = load float, ptr %arrayidx, align 4 + %conv1 = fadd float %1, 1.000000e+00 + %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom + store float %conv1, ptr %arrayidx3, align 4 + %cmp = icmp ugt i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +} + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +;. +; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; RV64: [[META1]] = !{!"llvm.loop.mustprogress"} +; RV64: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; RV64: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; RV64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} +; RV64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +;. 
+; RV32: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; RV32: [[META1]] = !{!"llvm.loop.mustprogress"} +; RV32: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; RV32: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; RV32: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META3]], [[META2]]} +; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} +; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]} +;. From 2c6b5bdb92815e1d26d0bb60ce42d95570406aec Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 25 Feb 2025 05:14:10 -0800 Subject: [PATCH 2/4] UF2 test --- .../RISCV/riscv-vector-reverse-output.ll | 221 +++++++++++++++++- 1 file changed, 212 insertions(+), 9 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index 2b8ac0ab685ab..a15a8342154ff 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -7,13 +7,17 @@ ;; a[i] = b[i] + 1.0; ; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \ -; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \ +; RUN: -riscv-v-vector-bits-min=128 -S < %s \ ; RUN: | FileCheck --check-prefix=RV64 %s ; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \ -; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \ +; RUN: -riscv-v-vector-bits-min=128 -S < %s \ ; RUN: | FileCheck --check-prefix=RV32 %s +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \ +; RUN: -riscv-v-vector-bits-min=128 -force-vector-interleave=2 -S < %s \ +; RUN: | FileCheck --check-prefix=RV64-UF2 %s + define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) { ; RV64-LABEL: define void @vector_reverse_i64( ; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { @@ -174,6 +178,105 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) { ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; +; RV64-UF2-LABEL: define void @vector_reverse_i64( +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; RV64-UF2-NEXT: [[ENTRY:.*:]] +; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64-UF2: [[FOR_BODY_PREHEADER]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64-UF2: [[VECTOR_SCEVCHECK]]: +; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; 
RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-UF2: [[VECTOR_PH]]: +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0 +; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1 +; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] +; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD2]]) +; RV64-UF2-NEXT: [[TMP30:%.*]] = add [[REVERSE]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP31:%.*]] = add [[REVERSE3]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]] +; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP33]] +; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i64 [[TMP34]] +; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]] +; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 [[TMP38]] +; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP30]]) +; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP36]], align 4 +; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP31]]) +; RV64-UF2-NEXT: store [[REVERSE5]], ptr [[TMP40]], align 4 +; RV64-UF2-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64-UF2: [[FOR_COND_CLEANUP]]: +; RV64-UF2-NEXT: ret void +; RV64-UF2: [[FOR_BODY]]: +; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] +; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] +; RV64-UF2-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV64-UF2-NEXT: [[ADD9:%.*]] = add i32 [[TMP42]], 1 +; RV64-UF2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] +; RV64-UF2-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 +; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; RV64-UF2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; entry: %cmp7 = icmp sgt i32 %n, 0 br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup @@ -360,6 +463,105 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) { ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; +; RV64-UF2-LABEL: define void @vector_reverse_f32( +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; RV64-UF2-NEXT: [[ENTRY:.*:]] +; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64-UF2: [[FOR_BODY_PREHEADER]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64-UF2: [[VECTOR_SCEVCHECK]]: +; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-UF2-NEXT: 
[[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-UF2: [[VECTOR_PH]]: +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0 +; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1 +; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] +; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD2]]) +; RV64-UF2-NEXT: [[TMP30:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP31:%.*]] = fadd [[REVERSE3]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]] +; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP33]] +; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP35]], i64 [[TMP34]] +; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]] +; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP39]], i64 [[TMP38]] +; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP30]]) +; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP36]], align 4 +; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call 
@llvm.vector.reverse.nxv4f32( [[TMP31]]) +; RV64-UF2-NEXT: store [[REVERSE5]], ptr [[TMP40]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64-UF2: [[FOR_COND_CLEANUP]]: +; RV64-UF2-NEXT: ret void +; RV64-UF2: [[FOR_BODY]]: +; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] +; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] +; RV64-UF2-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; RV64-UF2-NEXT: [[CONV1:%.*]] = fadd float [[TMP42]], 1.000000e+00 +; RV64-UF2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] +; RV64-UF2-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 +; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; RV64-UF2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; entry: %cmp7 = icmp sgt i32 %n, 0 br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup @@ -391,13 +593,6 @@ for.body: ; preds = %for.body.preheader, !2 = !{!"llvm.loop.vectorize.width", i32 4} !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} !4 = !{!"llvm.loop.vectorize.enable", i1 true} -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ;. ; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} ; RV64: [[META1]] = !{!"llvm.loop.mustprogress"} @@ -415,3 +610,11 @@ for.body: ; preds = %for.body.preheader, ; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} ; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]} ;. 
+; RV64-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; RV64-UF2: [[META1]] = !{!"llvm.loop.mustprogress"} +; RV64-UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; RV64-UF2: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; RV64-UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; RV64-UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} +; RV64-UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +;. From e9f9d387a93750be9c5d09307f2169386915b496 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 11 Mar 2025 06:03:11 -0700 Subject: [PATCH 3/4] Refine test case --- .../RISCV/riscv-vector-reverse-output.ll | 866 +++++++----------- 1 file changed, 355 insertions(+), 511 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index a15a8342154ff..402ba8ba30987 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -18,603 +18,447 @@ ; RUN: -riscv-v-vector-bits-min=128 -force-vector-interleave=2 -S < %s \ ; RUN: | FileCheck --check-prefix=RV64-UF2 %s -define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) { -; RV64-LABEL: define void @vector_reverse_i64( -; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; RV64-NEXT: [[ENTRY:.*:]] -; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64: [[FOR_BODY_PREHEADER]]: -; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64: [[VECTOR_SCEVCHECK]]: -; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { +; RV64-LABEL: define void @vector_reverse_i32( +; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV64: [[VECTOR_PH]]: -; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] -; RV64-NEXT: [[N_VEC:%.*]] = 
sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; RV64-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-NEXT: [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV64-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64: [[VECTOR_BODY]]: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] -; RV64-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0 -; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1 -; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]] -; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]] -; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP21]] -; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 ; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP25:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] -; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]] -; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]] -; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP27]] -; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP28]] -; RV64-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP25]]) -; RV64-NEXT: store [[REVERSE2]], ptr [[TMP30]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] -; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) +; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] +; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] +; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; RV64-NEXT: 
store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] -; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] ; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64: [[FOR_COND_CLEANUP]]: -; RV64-NEXT: ret void ; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 -; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] -; RV64-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV64-NEXT: [[ADD9:%.*]] = add i32 [[TMP32]], 1 -; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] -; RV64-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 -; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 +; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] +; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1 +; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] +; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 +; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 +; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; RV64: [[EXIT]]: +; RV64-NEXT: ret void ; -; RV32-LABEL: define void @vector_reverse_i64( -; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; RV32-NEXT: [[ENTRY:.*:]] -; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV32: [[FOR_BODY_PREHEADER]]: -; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV32-LABEL: define void @vector_reverse_i32( +; RV32-SAME: ptr noalias [[A:%.*]], ptr 
noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; RV32-NEXT: [[ENTRY:.*]]: +; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV32: [[VECTOR_PH]]: -; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; RV32-NEXT: [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV32-NEXT: [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV32-NEXT: br label %[[VECTOR_BODY:.*]] ; RV32: [[VECTOR_BODY]]: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] -; RV32-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0 -; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 -; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 -; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] -; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32 -; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] -; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]] -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]] -; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP15]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 4 +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV32-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] +; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP18:%.*]] = add [[REVERSE]], splat (i32 1) -; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] -; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32 -; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]] -; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]] -; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 [[TMP21]] -; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP22]] -; RV32-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) -; RV32-NEXT: store [[REVERSE2]], ptr [[TMP24]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP6]] -; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] +; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] +; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] +; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] -; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] ; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV32: [[FOR_COND_CLEANUP]]: -; RV32-NEXT: ret void ; RV32: [[FOR_BODY]]: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 -; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 -; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] -; RV32-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV32-NEXT: [[ADD9:%.*]] = add i32 [[TMP26]], 1 -; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] -; RV32-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 -; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 -; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 +; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] +; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1 +; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] +; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 +; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 +; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; RV32: 
[[EXIT]]: +; RV32-NEXT: ret void ; -; RV64-UF2-LABEL: define void @vector_reverse_i64( -; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; RV64-UF2-NEXT: [[ENTRY:.*:]] -; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64-UF2: [[FOR_BODY_PREHEADER]]: -; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 -; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64-UF2: [[VECTOR_SCEVCHECK]]: -; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-UF2-LABEL: define void @vector_reverse_i32( +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; RV64-UF2-NEXT: [[ENTRY:.*]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] -; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]] ; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-UF2-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0 -; RV64-UF2-NEXT: 
[[TMP19:%.*]] = add nsw i32 [[TMP18]], -1 -; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP8]], -1 +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] +; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD1]]) +; RV64-UF2-NEXT: [[TMP19:%.*]] = add [[REVERSE]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP20:%.*]] = add [[REVERSE2]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] ; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] ; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] ; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP29]], align 4 -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD2]]) -; RV64-UF2-NEXT: [[TMP30:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP31:%.*]] = add [[REVERSE3]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP33]] -; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i64 [[TMP34]] -; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]] -; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 [[TMP38]] -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( 
[[TMP30]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP36]], align 4 -; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP31]]) -; RV64-UF2-NEXT: store [[REVERSE5]], ptr [[TMP40]], align 4 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 +; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] -; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] ; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64-UF2: [[FOR_COND_CLEANUP]]: -; RV64-UF2-NEXT: ret void ; RV64-UF2: [[FOR_BODY]]: -; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 -; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 -; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] -; RV64-UF2-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV64-UF2-NEXT: [[ADD9:%.*]] = add i32 [[TMP42]], 1 -; RV64-UF2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] -; RV64-UF2-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 -; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 -; RV64-UF2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 +; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] +; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1 +; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] +; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 
4 +; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 +; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; RV64-UF2: [[EXIT]]: +; RV64-UF2-NEXT: ret void ; entry: - %cmp7 = icmp sgt i32 %n, 0 - br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: ; preds = %entry - %0 = zext i32 %n to i64 br label %for.body -for.cond.cleanup: ; preds = %for.body, %entry - ret void +for.body: + %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] + %iv.next = add nsw i64 %dec.iv, -1 + %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next + %0 = load i32, ptr %arrayidx.b, align 4 + %add = add i32 %0, 1 + %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next + store i32 %add, ptr %arrayidx.a, align 4 + %cmp = icmp ugt i64 %dec.iv, 1 + br i1 %cmp, label %for.body, label %exit, !llvm.loop !0 -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] - %i.0 = add nsw i32 %i.0.in8, -1 - %idxprom = zext i32 %i.0 to i64 - %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom - %1 = load i32, ptr %arrayidx, align 4 - %add9 = add i32 %1, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom - store i32 %add9, ptr %arrayidx3, align 4 - %cmp = icmp ugt i64 %indvars.iv, 1 - %indvars.iv.next = add nsw i64 %indvars.iv, -1 - br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +exit: + ret void } -define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) { +define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-LABEL: define void @vector_reverse_f32( -; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*:]] -; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64: [[FOR_BODY_PREHEADER]]: -; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64: [[VECTOR_SCEVCHECK]]: -; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] ; RV64: [[VECTOR_PH]]: -; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; RV64-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-NEXT: [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV64-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64: [[VECTOR_BODY]]: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] -; RV64-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0 -; RV64-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1 -; RV64-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; RV64-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP14]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP14]] -; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] -; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 ; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP25:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; RV64-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP14]] -; RV64-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP14]] -; RV64-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP27]] -; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP28]] -; RV64-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP25]]) -; RV64-NEXT: store [[REVERSE2]], ptr [[TMP30]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] -; RV64-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP17:%.*]] = 
sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] +; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) +; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] -; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] ; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64: [[FOR_COND_CLEANUP]]: -; RV64-NEXT: ret void ; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 -; RV64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] -; RV64-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; RV64-NEXT: [[CONV1:%.*]] = fadd float [[TMP32]], 1.000000e+00 -; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] -; RV64-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 -; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 +; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] +; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00 +; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] +; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 +; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 +; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; RV64: [[EXIT]]: +; RV64-NEXT: ret void ; ; RV32-LABEL: define void @vector_reverse_f32( -; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; RV32-NEXT: [[ENTRY:.*:]] -; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV32: 
[[FOR_BODY_PREHEADER]]: -; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV32-NEXT: [[ENTRY:.*]]: +; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV32: [[VECTOR_PH]]: -; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; RV32-NEXT: [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV32-NEXT: [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV32-NEXT: br label %[[VECTOR_BODY:.*]] ; RV32: [[VECTOR_BODY]]: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] -; RV32-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0 -; RV32-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 -; RV32-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 -; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] -; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32 -; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]] -; RV32-NEXT: [[TMP15:%.*]] = sub i32 1, [[TMP13]] -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]] -; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP15]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 4 +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV32-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] +; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP18:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] -; RV32-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32 -; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP20]] -; RV32-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP20]] -; RV32-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds float, ptr [[TMP19]], i32 [[TMP21]] -; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP22]] -; RV32-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP18]]) -; RV32-NEXT: store [[REVERSE2]], ptr [[TMP24]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV32-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] +; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] +; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] +; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) +; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] -; RV32-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] ; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV32: [[FOR_COND_CLEANUP]]: -; RV32-NEXT: ret void ; RV32: [[FOR_BODY]]: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 -; RV32-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 -; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] -; RV32-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; RV32-NEXT: [[CONV1:%.*]] = fadd float [[TMP26]], 1.000000e+00 -; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] -; RV32-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 -; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 -; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 +; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] +; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], 
align 4 +; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00 +; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] +; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 +; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 +; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; RV32: [[EXIT]]: +; RV32-NEXT: ret void ; ; RV64-UF2-LABEL: define void @vector_reverse_f32( -; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; RV64-UF2-NEXT: [[ENTRY:.*:]] -; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 -; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] -; RV64-UF2: [[FOR_BODY_PREHEADER]]: -; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 -; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; RV64-UF2: [[VECTOR_SCEVCHECK]]: -; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 -; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) -; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] -; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] -; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 -; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV64-UF2-NEXT: [[ENTRY:.*]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]] -; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-UF2-NEXT: [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]] ; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64-UF2: [[VECTOR_BODY]]: ; 
RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-UF2-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32 -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0 -; RV64-UF2-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1 -; RV64-UF2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP14]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP8]], -1 +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] +; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD1]]) +; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd [[REVERSE2]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] ; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] ; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP14]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] ; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] ; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP29]], align 4 -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD2]]) -; RV64-UF2-NEXT: [[TMP30:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP31:%.*]] = fadd [[REVERSE3]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = mul i64 0, [[TMP14]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP33]] -; RV64-UF2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr 
[[TMP35]], i64 [[TMP34]] -; RV64-UF2-NEXT: [[TMP37:%.*]] = mul i64 -1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP38:%.*]] = sub i64 1, [[TMP14]] -; RV64-UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]] -; RV64-UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP39]], i64 [[TMP38]] -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP30]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP36]], align 4 -; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP31]]) -; RV64-UF2-NEXT: store [[REVERSE5]], ptr [[TMP40]], align 4 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV64-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP19]]) +; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 +; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP20]]) +; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] -; RV64-UF2-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ] +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] ; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] -; RV64-UF2: [[FOR_COND_CLEANUP]]: -; RV64-UF2-NEXT: ret void ; RV64-UF2: [[FOR_BODY]]: -; RV64-UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 -; RV64-UF2-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 -; RV64-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] -; RV64-UF2-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; RV64-UF2-NEXT: [[CONV1:%.*]] = fadd float [[TMP42]], 1.000000e+00 -; RV64-UF2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] -; RV64-UF2-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 -; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 -; RV64-UF2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; 
RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 +; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] +; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00 +; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] +; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 +; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 +; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; RV64-UF2: [[EXIT]]: +; RV64-UF2-NEXT: ret void ; entry: - %cmp7 = icmp sgt i32 %n, 0 - br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: ; preds = %entry - %0 = zext i32 %n to i64 br label %for.body -for.cond.cleanup: ; preds = %for.body, %entry - ret void +for.body: + %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] + %iv.next = add nsw i64 %dec.iv, -1 + %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next + %0 = load float, ptr %arrayidx.b, align 4 + %fadd = fadd float %0, 1.000000e+00 + %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next + store float %fadd, ptr %arrayidx.a, align 4 + %cmp = icmp ugt i64 %dec.iv, 1 + br i1 %cmp, label %for.body, label %exit, !llvm.loop !0 -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] - %i.0 = add nsw i32 %i.0.in8, -1 - %idxprom = zext i32 %i.0 to i64 - %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom - %1 = load float, ptr %arrayidx, align 4 - %conv1 = fadd float %1, 1.000000e+00 - %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom - store float %conv1, ptr %arrayidx3, align 4 - %cmp = icmp ugt i64 %indvars.iv, 1 - %indvars.iv.next = add nsw i64 %indvars.iv, -1 - br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +exit: + ret void } -!0 = distinct !{!0, !1, !2, !3, !4} -!1 = !{!"llvm.loop.mustprogress"} -!2 = !{!"llvm.loop.vectorize.width", i32 4} -!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -!4 = !{!"llvm.loop.vectorize.enable", i1 true} +!0 = distinct !{!0, !1, !2, !3} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} -; RV64: [[META1]] = !{!"llvm.loop.mustprogress"} -; RV64: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} -; RV64: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; RV64: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; RV64: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; RV64: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; RV64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; RV64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ;. 
-; RV32: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
-; RV32: [[META1]] = !{!"llvm.loop.mustprogress"}
-; RV32: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; RV32: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
-; RV32: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META3]], [[META2]]}
-; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
-; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]}
+; RV32: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; RV32: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV32: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV32: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; RV32: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
 ;.
-; RV64-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
-; RV64-UF2: [[META1]] = !{!"llvm.loop.mustprogress"}
-; RV64-UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; RV64-UF2: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; RV64-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV64-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; RV64-UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; RV64-UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
-; RV64-UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; RV64-UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
 ;.

From 9d825d97c33d5c00427ea06be80dcf42275aed5a Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Tue, 18 Mar 2025 00:04:36 -0700
Subject: [PATCH 4/4] Apply --check-globals=none

---
 .../RISCV/riscv-vector-reverse-output.ll | 25 +------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 402ba8ba30987..55a969b7c9e76 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -1,5 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ;; This is the loop in c++ being vectorize in this file with
 ;; vector.reverse
 ;; #pragma clang loop vectorize_width(4, scalable)
@@ -440,25 +439,3 @@ exit:
 !1 = !{!"llvm.loop.vectorize.width", i32 4}
 !2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}
-;.
-; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; RV64: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; RV64: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; RV64: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; RV64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
-; RV32: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; RV32: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; RV32: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; RV32: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; RV32: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
-; RV64-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; RV64-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; RV64-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; RV64-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; RV64-UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; RV64-UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.