llvm
diff --git a/‎flang/lib/Lower/OpenMP/OpenMP.cpp
Lines changed: 60 additions & 40 deletions b/‎flang/lib/Lower/OpenMP/OpenMP.cpp
Lines changed: 60 additions & 40 deletions
diff --git a/‎flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
Lines changed: 54 additions & 47 deletions b/‎flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
Lines changed: 54 additions & 47 deletions
@@ -521,7 +521,7 @@ struct OpWithBodyGenInfo {
 /// \param [in]   op - the operation the body belongs to.
 /// \param [in] info - options controlling code-gen for the construction.
 template <typename Op>
-static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
+static void createBodyOfOp(mlir::Operation &op, OpWithBodyGenInfo &info) {
   fir::FirOpBuilder &firOpBuilder = info.converter.getFirOpBuilder();
 
   auto insertMarker = [](fir::FirOpBuilder &builder) {
@@ -537,10 +537,10 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
   auto regionArgs =
       [&]() -> llvm::SmallVector<const Fortran::semantics::Symbol *> {
     if (info.genRegionEntryCB != nullptr) {
-      return info.genRegionEntryCB(op);
+      return info.genRegionEntryCB(&op);
     }
 
-    firOpBuilder.createBlock(&op.getRegion());
+    firOpBuilder.createBlock(&op.getRegion(0));
     return {};
   }();
   // Mark the earliest insertion point.
@@ -556,7 +556,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
   // Start with privatization, so that the lowering of the nested
   // code will use the right symbols.
   constexpr bool isLoop = std::is_same_v<Op, mlir::omp::WsloopOp> ||
-                          std::is_same_v<Op, mlir::omp::SimdLoopOp>;
+                          std::is_same_v<Op, mlir::omp::SimdOp>;
   bool privatize = info.clauses && !info.outerCombined;
 
   firOpBuilder.setInsertionPoint(marker);
@@ -582,9 +582,9 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
     // a lot of complications for our approach if the terminator generation
     // is delayed past this point. Insert a temporary terminator here, then
     // delete it.
-    firOpBuilder.setInsertionPointToEnd(&op.getRegion().back());
-    auto *temp = Fortran::lower::genOpenMPTerminator(
-        firOpBuilder, op.getOperation(), info.loc);
+    firOpBuilder.setInsertionPointToEnd(&op.getRegion(0).back());
+    auto *temp =
+        Fortran::lower::genOpenMPTerminator(firOpBuilder, &op, info.loc);
     firOpBuilder.setInsertionPointAfter(marker);
     genNestedEvaluations(info.converter, info.eval);
     temp->erase();
@@ -626,23 +626,36 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
     return exit;
   };
 
-  if (auto *exitBlock = getUniqueExit(op.getRegion())) {
+  if (auto *exitBlock = getUniqueExit(op.getRegion(0))) {
     firOpBuilder.setInsertionPointToEnd(exitBlock);
-    auto *term = Fortran::lower::genOpenMPTerminator(
-        firOpBuilder, op.getOperation(), info.loc);
+    auto *term =
+        Fortran::lower::genOpenMPTerminator(firOpBuilder, &op, info.loc);
     // Only insert lastprivate code when there actually is an exit block.
     // Such a block may not exist if the nested code produced an infinite
     // loop (this may not make sense in production code, but a user could
     // write that and we should handle it).
     firOpBuilder.setInsertionPoint(term);
     if (privatize) {
+      // DataSharingProcessor::processStep2() may create operations before/after
+      // the one passed as argument. We need to treat loop wrappers and their
+      // nested loop as a unit, so we need to pass the top level wrapper (if
+      // present). Otherwise, these operations will be inserted within a
+      // wrapper region.
+      mlir::Operation *privatizationTopLevelOp = &op;
+      if (auto loopNest = llvm::dyn_cast<mlir::omp::LoopNestOp>(op)) {
+        llvm::SmallVector<mlir::omp::LoopWrapperInterface> wrappers;
+        loopNest.gatherWrappers(wrappers);
+        if (!wrappers.empty())
+          privatizationTopLevelOp = &*wrappers.back();
+      }
+
       if (!info.dsp) {
         assert(tempDsp.has_value());
-        tempDsp->processStep2(op, isLoop);
+        tempDsp->processStep2(privatizationTopLevelOp, isLoop);
       } else {
         if (isLoop && regionArgs.size() > 0)
           info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0]));
-        info.dsp->processStep2(op, isLoop);
+        info.dsp->processStep2(privatizationTopLevelOp, isLoop);
       }
     }
   }
@@ -719,7 +732,7 @@ template <typename OpTy, typename... Args>
 static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) {
   auto op = info.converter.getFirOpBuilder().create<OpTy>(
       info.loc, std::forward<Args>(args)...);
-  createBodyOfOp<OpTy>(op, info);
+  createBodyOfOp<OpTy>(*op, info);
   return op;
 }
 
@@ -1689,13 +1702,12 @@ genLoopAndReductionVars(
   return llvm::SmallVector<const Fortran::semantics::Symbol *>(loopArgs);
 }
 
-static void
-createSimdLoop(Fortran::lower::AbstractConverter &converter,
-               Fortran::semantics::SemanticsContext &semaCtx,
-               Fortran::lower::pft::Evaluation &eval,
-               llvm::omp::Directive ompDirective,
-               const Fortran::parser::OmpClauseList &loopOpClauseList,
-               mlir::Location loc) {
+static void createSimd(Fortran::lower::AbstractConverter &converter,
+                       Fortran::semantics::SemanticsContext &semaCtx,
+                       Fortran::lower::pft::Evaluation &eval,
+                       llvm::omp::Directive ompDirective,
+                       const Fortran::parser::OmpClauseList &loopOpClauseList,
+                       mlir::Location loc) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   DataSharingProcessor dsp(converter, semaCtx, loopOpClauseList, eval);
   dsp.processStep1();
@@ -1720,11 +1732,20 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
   cp.processTODO<clause::Aligned, clause::Allocate, clause::Linear,
                  clause::Nontemporal, clause::Order>(loc, ompDirective);
 
+  // Create omp.simd wrapper.
   mlir::TypeRange resultType;
-  auto simdLoopOp = firOpBuilder.create<mlir::omp::SimdLoopOp>(
-      loc, resultType, lowerBound, upperBound, step, alignedVars,
-      /*alignment_values=*/nullptr, ifClauseOperand, nontemporalVars,
-      orderClauseOperand, simdlenClauseOperand, safelenClauseOperand,
+  auto simdOp = firOpBuilder.create<mlir::omp::SimdOp>(
+      loc, resultType, alignedVars, /*alignment_values=*/nullptr,
+      ifClauseOperand, nontemporalVars, orderClauseOperand,
+      simdlenClauseOperand, safelenClauseOperand);
+
+  firOpBuilder.createBlock(&simdOp.getRegion());
+  firOpBuilder.setInsertionPoint(
+      Fortran::lower::genOpenMPTerminator(firOpBuilder, simdOp, loc));
+
+  // Create nested omp.loop_nest and fill body with loop contents.
+  auto loopOp = firOpBuilder.create<mlir::omp::LoopNestOp>(
+      loc, lowerBound, upperBound, step,
       /*inclusive=*/firOpBuilder.getUnitAttr());
 
   auto *nestedEval = getCollapsedLoopEval(
@@ -1734,11 +1755,11 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
     return genLoopVars(op, converter, loc, iv);
   };
 
-  createBodyOfOp<mlir::omp::SimdLoopOp>(
-      simdLoopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
-                      .setClauses(&loopOpClauseList)
-                      .setDataSharingProcessor(&dsp)
-                      .setGenRegionEntryCb(ivCallback));
+  createBodyOfOp<mlir::omp::SimdOp>(
+      *loopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
+                   .setClauses(&loopOpClauseList)
+                   .setDataSharingProcessor(&dsp)
+                   .setGenRegionEntryCb(ivCallback));
 }
 
 static void createWsloop(Fortran::lower::AbstractConverter &converter,
@@ -1819,11 +1840,11 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter,
   };
 
   createBodyOfOp<mlir::omp::WsloopOp>(
-      wsLoopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
-                    .setClauses(&beginClauseList)
-                    .setDataSharingProcessor(&dsp)
-                    .setReductions(&reductionSymbols, &reductionTypes)
-                    .setGenRegionEntryCb(ivCallback));
+      *wsLoopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
+                     .setClauses(&beginClauseList)
+                     .setDataSharingProcessor(&dsp)
+                     .setReductions(&reductionSymbols, &reductionTypes)
+                     .setGenRegionEntryCb(ivCallback));
 }
 
 static void createSimdWsloop(
@@ -2200,7 +2221,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                                                       global.getSymName()));
   }();
   auto genInfo = OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval);
-  createBodyOfOp<mlir::omp::CriticalOp>(criticalOp, genInfo);
+  createBodyOfOp<mlir::omp::CriticalOp>(*criticalOp, genInfo);
 }
 
 static void
@@ -2285,8 +2306,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
 
   } else if (llvm::omp::allSimdSet.test(ompDirective)) {
     // 2.9.3.1 SIMD construct
-    createSimdLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
-                   currentLocation);
+    createSimd(converter, semaCtx, eval, ompDirective, loopOpClauseList,
+               currentLocation);
     genOpenMPReduction(converter, semaCtx, loopOpClauseList);
   } else {
     createWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
@@ -2410,10 +2431,9 @@ mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder,
                                                      mlir::Operation *op,
                                                      mlir::Location loc) {
   if (mlir::isa<mlir::omp::WsloopOp, mlir::omp::DeclareReductionOp,
-                mlir::omp::AtomicUpdateOp, mlir::omp::SimdLoopOp>(op))
+                mlir::omp::AtomicUpdateOp, mlir::omp::LoopNestOp>(op))
     return builder.create<mlir::omp::YieldOp>(loc);
-  else
-    return builder.create<mlir::omp::TerminatorOp>(loc);
+  return builder.create<mlir::omp::TerminatorOp>(loc);
 }
 
 void Fortran::lower::genOpenMPConstruct(
 
@@ -180,14 +180,16 @@ func.func @_QPsimd1(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref
   omp.parallel  {
     %1 = fir.alloca i32 {adapt.valuebyref, pinned}
     %2 = fir.load %arg0 : !fir.ref<i32>
-    omp.simdloop for (%arg2) : i32 = (%c1_i32) to (%2) step (%c1_i32)  {
-      fir.store %arg2 to %1 : !fir.ref<i32>
-      %3 = fir.load %1 : !fir.ref<i32>
-      %4 = fir.convert %3 : (i32) -> i64
-      %5 = arith.subi %4, %c1_i64 : i64
-      %6 = fir.coordinate_of %arg1, %5 : (!fir.ref<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
-      fir.store %3 to %6 : !fir.ref<i32>
-      omp.yield
+    omp.simd {
+      omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%2) step (%c1_i32) {
+        fir.store %arg2 to %1 : !fir.ref<i32>
+        %3 = fir.load %1 : !fir.ref<i32>
+        %4 = fir.convert %3 : (i32) -> i64
+        %5 = arith.subi %4, %c1_i64 : i64
+        %6 = fir.coordinate_of %arg1, %5 : (!fir.ref<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+        fir.store %3 to %6 : !fir.ref<i32>
+        omp.yield
+      }
     }
     omp.terminator
   }
@@ -202,8 +204,8 @@ func.func @_QPsimd1(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref
 // CHECK:      %[[ONE_3:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CHECK:      %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {pinned} : (i64) -> !llvm.ptr
 // CHECK:      %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr -> i32
-// CHECK: omp.simdloop
-// CHECK-SAME: (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) {
+// CHECK: omp.simd {
+// CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) {
 // CHECK:   llvm.store %[[I]], %[[I_VAR]] : i32, !llvm.ptr
 // CHECK:   %[[I1:.*]] = llvm.load %[[I_VAR]] : !llvm.ptr -> i32
 // CHECK:   %[[I1_EXT:.*]] = llvm.sext %[[I1]] : i32 to i64
@@ -212,6 +214,7 @@ func.func @_QPsimd1(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref
 // CHECK:   llvm.store %[[I1]], %[[ARR_I_REF]] : i32, !llvm.ptr
 // CHECK: omp.yield
 // CHECK: }
+// CHECK: }
 // CHECK: omp.terminator
 // CHECK: }
 // CHECK: llvm.return
@@ -471,55 +474,59 @@ func.func @_QPomp_target() {
 
 // -----
 
-func.func @_QPsimdloop_with_nested_loop() {
+func.func @_QPsimd_with_nested_loop() {
   %0 = fir.alloca i32 {adapt.valuebyref}
-  %1 = fir.alloca !fir.array<10xi32> {bindc_name = "a", uniq_name = "_QFsimdloop_with_nested_loopEa"}
-  %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimdloop_with_nested_loopEi"}
-  %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFsimdloop_with_nested_loopEj"}
+  %1 = fir.alloca !fir.array<10xi32> {bindc_name = "a", uniq_name = "_QFsimd_with_nested_loopEa"}
+  %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimd_with_nested_loopEi"}
+  %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFsimd_with_nested_loopEj"}
   %c1_i32 = arith.constant 1 : i32
   %c10_i32 = arith.constant 10 : i32
   %c1_i32_0 = arith.constant 1 : i32
-  omp.simdloop   for  (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_0) {
-    fir.store %arg0 to %0 : !fir.ref<i32>
-    %c1_i32_1 = arith.constant 1 : i32
-    %4 = fir.convert %c1_i32_1 : (i32) -> index
-    %c10_i32_2 = arith.constant 10 : i32
-    %5 = fir.convert %c10_i32_2 : (i32) -> index
-    %c1 = arith.constant 1 : index
-    %6 = fir.do_loop %arg1 = %4 to %5 step %c1 -> index {
-      %8 = fir.convert %arg1 : (index) -> i32
-      fir.store %8 to %3 : !fir.ref<i32>
-      %9 = fir.load %0 : !fir.ref<i32>
-      %10 = fir.load %0 : !fir.ref<i32>
-      %11 = fir.convert %10 : (i32) -> i64
-      %c1_i64 = arith.constant 1 : i64
-      %12 = arith.subi %11, %c1_i64 : i64
-      %13 = fir.coordinate_of %1, %12 : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
-      fir.store %9 to %13 : !fir.ref<i32>
-      %14 = arith.addi %arg1, %c1 : index
-      fir.result %14 : index
+  omp.simd {
+    omp.loop_nest (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_0) {
+      fir.store %arg0 to %0 : !fir.ref<i32>
+      %c1_i32_1 = arith.constant 1 : i32
+      %4 = fir.convert %c1_i32_1 : (i32) -> index
+      %c10_i32_2 = arith.constant 10 : i32
+      %5 = fir.convert %c10_i32_2 : (i32) -> index
+      %c1 = arith.constant 1 : index
+      %6 = fir.do_loop %arg1 = %4 to %5 step %c1 -> index {
+        %8 = fir.convert %arg1 : (index) -> i32
+        fir.store %8 to %3 : !fir.ref<i32>
+        %9 = fir.load %0 : !fir.ref<i32>
+        %10 = fir.load %0 : !fir.ref<i32>
+        %11 = fir.convert %10 : (i32) -> i64
+        %c1_i64 = arith.constant 1 : i64
+        %12 = arith.subi %11, %c1_i64 : i64
+        %13 = fir.coordinate_of %1, %12 : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+        fir.store %9 to %13 : !fir.ref<i32>
+        %14 = arith.addi %arg1, %c1 : index
+        fir.result %14 : index
+      }
+      %7 = fir.convert %6 : (index) -> i32
+      fir.store %7 to %3 : !fir.ref<i32>
+      omp.yield
     }
-    %7 = fir.convert %6 : (index) -> i32
-    fir.store %7 to %3 : !fir.ref<i32>
-    omp.yield
   }
   return
 }
 
-// CHECK-LABEL:   llvm.func @_QPsimdloop_with_nested_loop() {
+// CHECK-LABEL:   llvm.func @_QPsimd_with_nested_loop() {
 // CHECK:           %[[LOWER:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:           %[[UPPER:.*]] = llvm.mlir.constant(10 : i32) : i32
 // CHECK:           %[[STEP:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK:           omp.simdloop   for  (%[[CNT:.*]]) : i32 = (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[STEP]]) {
-// CHECK:             llvm.br ^bb1(%[[VAL_1:.*]], %[[VAL_2:.*]] : i64, i64)
-// CHECK:           ^bb1(%[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64):
-// CHECK:             %[[VAL_5:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK:             %[[VAL_6:.*]] = llvm.icmp "sgt" %[[VAL_4]], %[[VAL_5]] : i64
-// CHECK:             llvm.cond_br %[[VAL_6]], ^bb2, ^bb3
-// CHECK:           ^bb2:
-// CHECK:             llvm.br ^bb1(%[[VAL_7:.*]], %[[VAL_8:.*]] : i64, i64)
-// CHECK:           ^bb3:
-// CHECK:             omp.yield
+// CHECK:           omp.simd {
+// CHECK-NEXT:        omp.loop_nest (%[[CNT:.*]]) : i32 = (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[STEP]]) {
+// CHECK:               llvm.br ^bb1(%[[VAL_1:.*]], %[[VAL_2:.*]] : i64, i64)
+// CHECK:             ^bb1(%[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64):
+// CHECK:               %[[VAL_5:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK:               %[[VAL_6:.*]] = llvm.icmp "sgt" %[[VAL_4]], %[[VAL_5]] : i64
+// CHECK:               llvm.cond_br %[[VAL_6]], ^bb2, ^bb3
+// CHECK:             ^bb2:
+// CHECK:               llvm.br ^bb1(%[[VAL_7:.*]], %[[VAL_8:.*]] : i64, i64)
+// CHECK:             ^bb3:
+// CHECK:               omp.yield
+// CHECK:             }
 // CHECK:           }
 // CHECK:           llvm.return
 // CHECK:         }