Skip to content

[mlir][OpenMP] rewrite conversion of privatisation for omp.parallel #111844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 262 additions & 0 deletions flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s

! Combinational testing of control flow graph and builder insertion points
! in mlir-to-llvm conversion:
! - mixing multiple delayed privatizations and multiple reductions
! - multiple blocks in the private alloc region
! - private alloc region has to read from the mold variable
! - firstprivate
! - multiple blocks in the private copy region
! - multiple blocks in the reduction init region
! - reduction init region has to read from the mold variable
! - re-used omp.private ops
! - re-used omp.reduction.declare ops
! - unstructured code inside of the parallel region
! - needs private dealloc region, and this has multiple blocks
! - needs reduction cleanup region, and this has multiple blocks

! This maybe belongs in the mlir tests, but what we are doing here is complex
! enough that I find the kind of minimised mlir code preferred by mlir reviewers
! hard to read without some fortran here for reference. Nothing like this would
! be generated by other upstream users of the MLIR OpenMP dialect.

! Input for the test: one parallel region that applies firstprivate to the
! allocatable arrays a and b and a + reduction to the allocatable arrays c and d.
subroutine worst_case(a, b, c, d)
real, allocatable :: a(:), b(:), c(:), d(:)
! NOTE(review): i is declared but not referenced in this subroutine.
integer i

!$omp parallel firstprivate(a,b) reduction(+:c,d)
! Conditional stop inside the region: the expected IR below has a separate
! block for the stop call and one for the fall-through path.
if (sum(a) == 1) stop 1
!$omp end parallel
end subroutine

! CHECK-LABEL: define internal void @worst_case_..omp_par
! CHECK-NEXT: omp.par.entry:
! [reduction alloc regions inlined here]
! CHECK: br label %omp.private.latealloc

! CHECK: omp.private.latealloc: ; preds = %omp.par.entry
! CHECK-NEXT: br label %omp.private.alloc5

! CHECK: omp.private.alloc5: ; preds = %omp.private.latealloc
! [begin private alloc for first var]
! [read the length from the mold argument]
! [if it is non-zero...]
! CHECK: br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7

! CHECK: omp.private.alloc7: ; preds = %omp.private.alloc5
! [finish private alloc for first var with zero extent]
! CHECK: br label %omp.private.alloc8

! CHECK: omp.private.alloc8: ; preds = %omp.private.alloc6, %omp.private.alloc7
! CHECK-NEXT: br label %omp.region.cont4

! CHECK: omp.region.cont4: ; preds = %omp.private.alloc8
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: br label %omp.private.alloc

! CHECK: omp.private.alloc: ; preds = %omp.region.cont4
! [begin private alloc for second var]
! [read the length from the mold argument]
! [if it is non-zero...]
! CHECK: br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2

! CHECK: omp.private.alloc2: ; preds = %omp.private.alloc
! [finish private alloc for second var with zero extent]
! CHECK: br label %omp.private.alloc3

! CHECK: omp.private.alloc3: ; preds = %omp.private.alloc1, %omp.private.alloc2
! CHECK-NEXT: br label %omp.region.cont

! CHECK: omp.region.cont: ; preds = %omp.private.alloc3
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: br label %omp.private.copy

! CHECK: omp.private.copy: ; preds = %omp.region.cont
! CHECK-NEXT: br label %omp.private.copy10

! CHECK: omp.private.copy10: ; preds = %omp.private.copy
! [begin firstprivate copy for first var]
! [read the length, is it non-zero?]
! CHECK: br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12

! CHECK: omp.private.copy12: ; preds = %omp.private.copy11, %omp.private.copy10
! CHECK-NEXT: br label %omp.region.cont9

! CHECK: omp.region.cont9: ; preds = %omp.private.copy12
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: br label %omp.private.copy14

! CHECK: omp.private.copy14: ; preds = %omp.region.cont9
! [begin firstprivate copy for second var]
! [read the length, is it non-zero?]
! CHECK: br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16

! CHECK: omp.private.copy16: ; preds = %omp.private.copy15, %omp.private.copy14
! CHECK-NEXT: br label %omp.region.cont13

! CHECK: omp.region.cont13: ; preds = %omp.private.copy16
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: br label %omp.reduction.init

! CHECK: omp.reduction.init: ; preds = %omp.region.cont13
! [deferred stores for results of reduction alloc regions]
! CHECK: br label %[[VAL_96:.*]]

! CHECK: omp.reduction.neutral: ; preds = %omp.reduction.init
! [start of reduction initialization region]
! [null check:]
! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19

! CHECK: omp.reduction.neutral19: ; preds = %omp.reduction.neutral
! [malloc and assign the default value to the reduction variable]
! CHECK: br label %omp.reduction.neutral20

! CHECK: omp.reduction.neutral20: ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19
! CHECK-NEXT: br label %omp.region.cont17

! CHECK: omp.region.cont17: ; preds = %omp.reduction.neutral20
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: br label %omp.reduction.neutral22

! CHECK: omp.reduction.neutral22: ; preds = %omp.region.cont17
! [start of reduction initialization region]
! [null check:]
! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24

! CHECK: omp.reduction.neutral24: ; preds = %omp.reduction.neutral22
! [malloc and assign the default value to the reduction variable]
! CHECK: br label %omp.reduction.neutral25

! CHECK: omp.reduction.neutral25: ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24
! CHECK-NEXT: br label %omp.region.cont21

! CHECK: omp.region.cont21: ; preds = %omp.reduction.neutral25
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: br label %omp.par.region

! CHECK: omp.par.region: ; preds = %omp.region.cont21
! CHECK-NEXT: br label %omp.par.region27

! CHECK: omp.par.region27: ; preds = %omp.par.region
! [call SUM runtime function]
! [if (sum(a) == 1)]
! CHECK: br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29

! CHECK: omp.par.region29: ; preds = %omp.par.region27
! CHECK-NEXT: br label %omp.region.cont26

! CHECK: omp.region.cont26: ; preds = %omp.par.region28, %omp.par.region29
! [omp parallel region done, call into the runtime to complete reduction]
! CHECK: %[[VAL_233:.*]] = call i32 @__kmpc_reduce(
! CHECK: switch i32 %[[VAL_233]], label %reduce.finalize [
! CHECK-NEXT: i32 1, label %reduce.switch.nonatomic
! CHECK-NEXT: i32 2, label %reduce.switch.atomic
! CHECK-NEXT: ]

! CHECK: reduce.switch.atomic: ; preds = %omp.region.cont26
! CHECK-NEXT: unreachable

! CHECK: reduce.switch.nonatomic: ; preds = %omp.region.cont26
! CHECK-NEXT: %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8
! CHECK-NEXT: br label %omp.reduction.nonatomic.body

! [various blocks implementing the reduction]

! CHECK: omp.region.cont35: ; preds =
! CHECK-NEXT: %{{.*}} = phi ptr
! CHECK-NEXT: call void @__kmpc_end_reduce(
! CHECK-NEXT: br label %reduce.finalize

! CHECK: reduce.finalize: ; preds =
! CHECK-NEXT: br label %omp.par.pre_finalize

! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize
! CHECK-NEXT: %{{.*}} = load ptr, ptr
! CHECK-NEXT: br label %omp.reduction.cleanup

! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize
! [null check]
! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42

! CHECK: omp.reduction.cleanup42: ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup
! CHECK-NEXT: br label %omp.region.cont40

! CHECK: omp.region.cont40: ; preds = %omp.reduction.cleanup42
! CHECK-NEXT: %{{.*}} = load ptr, ptr
! CHECK-NEXT: br label %omp.reduction.cleanup44

! CHECK: omp.reduction.cleanup44: ; preds = %omp.region.cont40
! [null check]
! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46

! CHECK: omp.reduction.cleanup46: ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44
! CHECK-NEXT: br label %omp.region.cont43

! CHECK: omp.region.cont43: ; preds = %omp.reduction.cleanup46
! CHECK-NEXT: br label %omp.private.dealloc

! CHECK: omp.private.dealloc: ; preds = %omp.region.cont43
! [null check]
! CHECK: br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49

! CHECK: omp.private.dealloc49: ; preds = %omp.private.dealloc48, %omp.private.dealloc
! CHECK-NEXT: br label %omp.region.cont47

! CHECK: omp.region.cont47: ; preds = %omp.private.dealloc49
! CHECK-NEXT: br label %omp.private.dealloc51

! CHECK: omp.private.dealloc51: ; preds = %omp.region.cont47
! [null check]
! CHECK: br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53

! CHECK: omp.private.dealloc53: ; preds = %omp.private.dealloc52, %omp.private.dealloc51
! CHECK-NEXT: br label %omp.region.cont50

! CHECK: omp.region.cont50: ; preds = %omp.private.dealloc53
! CHECK-NEXT: br label %omp.par.outlined.exit.exitStub

! CHECK: omp.private.dealloc52: ; preds = %omp.private.dealloc51
! [dealloc memory]
! CHECK: br label %omp.private.dealloc53

! CHECK: omp.private.dealloc48: ; preds = %omp.private.dealloc
! [dealloc memory]
! CHECK: br label %omp.private.dealloc49

! CHECK: omp.reduction.cleanup45: ; preds = %omp.reduction.cleanup44
! CHECK-NEXT: call void @free(
! CHECK-NEXT: br label %omp.reduction.cleanup46

! CHECK: omp.reduction.cleanup41: ; preds = %omp.reduction.cleanup
! CHECK-NEXT: call void @free(
! CHECK-NEXT: br label %omp.reduction.cleanup42

! CHECK: omp.par.region28: ; preds = %omp.par.region27
! CHECK-NEXT: call {} @_FortranAStopStatement

! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22
! [source length was zero: finish initializing array]
! CHECK: br label %omp.reduction.neutral25

! CHECK: omp.reduction.neutral18: ; preds = %omp.reduction.neutral
! [source length was zero: finish initializing array]
! CHECK: br label %omp.reduction.neutral20

! CHECK: omp.private.copy15: ; preds = %omp.private.copy14
! [source length was non-zero: call assign runtime]
! CHECK: br label %omp.private.copy16

! CHECK: omp.private.copy11: ; preds = %omp.private.copy10
! [source length was non-zero: call assign runtime]
! CHECK: br label %omp.private.copy12

! CHECK: omp.private.alloc1: ; preds = %omp.private.alloc
! [var extent was non-zero: malloc a private array]
! CHECK: br label %omp.private.alloc3

! CHECK: omp.private.alloc6: ; preds = %omp.private.alloc5
! [var extent was non-zero: malloc a private array]
! CHECK: br label %omp.private.alloc8

! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont50
! CHECK-NEXT: ret void
46 changes: 46 additions & 0 deletions flang/test/Integration/OpenMP/private-global.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s

! Regression test for https://github.com/llvm/llvm-project/issues/106297

! Reproducer: table is PRIVATE in the parallel region, is assigned 50 there,
! and is then read back; the stop is taken only if some element is not 50.
program bug
implicit none
integer :: table(10)
!$OMP PARALLEL PRIVATE(table)
! Whole-array assignment followed by a read of the same array in the region.
table = 50
if (any(table/=50)) then
stop 'fail 3'
end if
!$OMP END PARALLEL
print *,'ok'
End Program


! CHECK-LABEL: define internal void {{.*}}..omp_par(
! CHECK: omp.par.entry:
! CHECK: %[[VAL_9:.*]] = alloca i32, align 4
! CHECK: %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4
! CHECK: store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4
! CHECK: %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4
! CHECK: %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4
! ...
! check that we use the private copy of table for the assignment
! CHECK: omp.par.region1:
! CHECK: %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0
! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8
! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8
! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8
! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9)
! ...
! check that we use the private copy of table for table/=50
! CHECK: omp.par.region3:
! CHECK: %[[VAL_44:.*]] = sub nsw i64 %{{.*}}, 1
! CHECK: %[[VAL_45:.*]] = mul nsw i64 %[[VAL_44]], 1
! CHECK: %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1
! CHECK: %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0
! CHECK: %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]]
! CHECK: %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4
! CHECK: %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50
Loading
Loading