Diffstat (limited to 'runtime/src/kmp_tasking.cpp')
-rw-r--r-- | runtime/src/kmp_tasking.cpp | 675
1 file changed, 457 insertions, 218 deletions
diff --git a/runtime/src/kmp_tasking.cpp b/runtime/src/kmp_tasking.cpp index 7292ed1452d2b..d037299f14772 100644 --- a/runtime/src/kmp_tasking.cpp +++ b/runtime/src/kmp_tasking.cpp @@ -4,10 +4,9 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source Licenses. See LICENSE.txt for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -31,10 +30,7 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data); static int __kmp_realloc_task_threads_data(kmp_info_t *thread, kmp_task_team_t *task_team); - -#if OMP_45_ENABLED static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); -#endif #ifdef BUILD_TIED_TASK_STACK @@ -375,7 +371,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { // Check if deque is full if (TCR_4(thread_data->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td)) { - if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + if (__kmp_enable_task_throttling && + __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, thread->th.th_current_task)) { KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " "TASK_NOT_PUSHED for task %p\n", @@ -391,11 +388,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { // Lock the deque for the task push operation if (!locked) { __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); -#if OMP_45_ENABLED // Need to recheck as we can get a proxy task from thread outside of OpenMP if (TCR_4(thread_data->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td)) { - if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + if (__kmp_enable_task_throttling && + __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, thread->th.th_current_task)) { __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; " @@ -407,7 +404,6 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { __kmp_realloc_task_deque(thread, thread_data); } } -#endif } // Must have room since no thread can add tasks but calling thread KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < @@ -551,10 +547,8 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { task->ompt_task_info.frame.enter_frame = ompt_data_none; task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; -#if OMP_40_ENABLED task->ompt_task_info.ndeps = 0; task->ompt_task_info.deps = NULL; -#endif /* OMP_40_ENABLED */ } // __ompt_task_start: @@ -741,16 +735,11 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, static void __kmp_free_task_and_ancestors(kmp_int32 gtid, kmp_taskdata_t *taskdata, kmp_info_t *thread) { -#if OMP_45_ENABLED // Proxy tasks must always be allowed to free their parents // because they can be run in background even in serial mode. 
kmp_int32 team_serial = (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) && !taskdata->td_flags.proxy; -#else - kmp_int32 team_serial = - taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser; -#endif KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; @@ -815,10 +804,8 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); kmp_info_t *thread = __kmp_threads[gtid]; -#if OMP_45_ENABLED kmp_task_team_t *task_team = thread->th.th_task_team; // might be NULL for serial teams... -#endif // OMP_45_ENABLED kmp_int32 children = 0; KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming " @@ -875,28 +862,41 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, } KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); - taskdata->td_flags.complete = 1; // mark the task as completed + bool detach = false; + if (taskdata->td_flags.detachable == TASK_DETACHABLE) { + if (taskdata->td_allow_completion_event.type == + KMP_EVENT_ALLOW_COMPLETION) { + // event hasn't been fulfilled yet. Try to detach task. + __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); + if (taskdata->td_allow_completion_event.type == + KMP_EVENT_ALLOW_COMPLETION) { + taskdata->td_flags.proxy = TASK_PROXY; // proxify! + detach = true; + } + __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); + } + } KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); - // Only need to keep track of count if team parallel and tasking not - // serialized - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { - // Predecrement simulated by "- 1" calculation - children = - KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; - KMP_DEBUG_ASSERT(children >= 0); -#if OMP_40_ENABLED - if (taskdata->td_taskgroup) - KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); - __kmp_release_deps(gtid, taskdata); -#if OMP_45_ENABLED - } else if (task_team && task_team->tt.tt_found_proxy_tasks) { - // if we found proxy tasks there could exist a dependency chain - // with the proxy task as origin - __kmp_release_deps(gtid, taskdata); -#endif // OMP_45_ENABLED -#endif // OMP_40_ENABLED + if (!detach) { + taskdata->td_flags.complete = 1; // mark the task as completed + + // Only need to keep track of count if team parallel and tasking not + // serialized + if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + // Predecrement simulated by "- 1" calculation + children = + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; + KMP_DEBUG_ASSERT(children >= 0); + if (taskdata->td_taskgroup) + KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); + __kmp_release_deps(gtid, taskdata); + } else if (task_team && task_team->tt.tt_found_proxy_tasks) { + // if we found proxy tasks there could exist a dependency chain + // with the proxy task as origin + __kmp_release_deps(gtid, taskdata); + } } // td_flags.executing must be marked as 0 after __kmp_release_deps has been @@ -909,7 +909,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", gtid, taskdata, children)); -#if OMP_40_ENABLED /* If the tasks' destructor thunk flag has been set, we need to invoke the destructor thunk that has been generated 
by the compiler. The code is placed here, since at this point other tasks might have been released @@ -921,7 +920,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, KMP_ASSERT(destr_thunk); destr_thunk(gtid, task); } -#endif // OMP_40_ENABLED // bookkeeping for resuming task: // GEH - note tasking_ser => task_serial @@ -943,7 +941,8 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, // johnmc: if an asynchronous inquiry peers into the runtime system // it doesn't see the freed task as the current task. thread->th.th_current_task = resumed_task; - __kmp_free_task_and_ancestors(gtid, taskdata, thread); + if (!detach) + __kmp_free_task_and_ancestors(gtid, taskdata, thread); // TODO: GEH - make sure root team implicit task is initialized properly. // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 ); @@ -1052,9 +1051,7 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, task->td_flags.tiedness = TASK_TIED; task->td_flags.tasktype = TASK_IMPLICIT; -#if OMP_45_ENABLED task->td_flags.proxy = TASK_FULL; -#endif // All implicit tasks are executed immediately, not deferred task->td_flags.task_serial = 1; @@ -1066,19 +1063,16 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, task->td_flags.complete = 0; task->td_flags.freed = 0; -#if OMP_40_ENABLED task->td_depnode = NULL; -#endif task->td_last_tied = task; + task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; if (set_curr_task) { // only do this init first time thread is created KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); // Not used: don't need to deallocate implicit task KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); -#if OMP_40_ENABLED task->td_taskgroup = NULL; // An implicit task does not have taskgroup task->td_dephash = NULL; -#endif __kmp_push_current_task_to_thread(this_thr, team, tid); } else { KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); @@ -1188,11 +1182,14 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); } -#if OMP_45_ENABLED - if (flags->proxy == TASK_PROXY) { - flags->tiedness = TASK_UNTIED; - flags->merged_if0 = 1; - + // Detachable tasks are not proxy tasks yet but could be in the future. Doing + // the tasking setup + // when that happens is too late. + if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) { + if (flags->proxy == TASK_PROXY) { + flags->tiedness = TASK_UNTIED; + flags->merged_if0 = 1; + } /* are we running in a sequential parallel or tskm_immediate_exec... 
we need tasking support enabled */ if ((thread->th.th_task_team) == NULL) { @@ -1226,7 +1223,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, if (task_team->tt.tt_found_proxy_tasks == FALSE) TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE); } -#endif // Calculate shared structure offset including padding after kmp_task_t struct // to align pointers in shared struct @@ -1282,23 +1278,18 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_taskwait_counter = 0; taskdata->td_taskwait_thread = 0; KMP_DEBUG_ASSERT(taskdata->td_parent != NULL); -#if OMP_45_ENABLED // avoid copying icvs for proxy tasks if (flags->proxy == TASK_FULL) -#endif copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); taskdata->td_flags.tiedness = flags->tiedness; taskdata->td_flags.final = flags->final; taskdata->td_flags.merged_if0 = flags->merged_if0; -#if OMP_40_ENABLED taskdata->td_flags.destructors_thunk = flags->destructors_thunk; -#endif // OMP_40_ENABLED -#if OMP_45_ENABLED taskdata->td_flags.proxy = flags->proxy; + taskdata->td_flags.detachable = flags->detachable; taskdata->td_task_team = thread->th.th_task_team; taskdata->td_size_alloc = shareds_offset + sizeof_shareds; -#endif taskdata->td_flags.tasktype = TASK_EXPLICIT; // GEH - TODO: fix this to copy parent task's value of tasking_ser flag @@ -1325,35 +1316,28 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); // start at one because counts current task and children KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); -#if OMP_40_ENABLED taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits taskgroup from the parent task taskdata->td_dephash = NULL; taskdata->td_depnode = NULL; -#endif if (flags->tiedness == TASK_UNTIED) taskdata->td_last_tied = NULL; // will be set when the task is scheduled else taskdata->td_last_tied = taskdata; - + taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; #if OMPT_SUPPORT if (UNLIKELY(ompt_enabled.enabled)) __ompt_task_init(taskdata, gtid); #endif // Only need to keep track of child task counts if team parallel and tasking not -// serialized or if it is a proxy task -#if OMP_45_ENABLED +// serialized or if it is a proxy or detachable task if (flags->proxy == TASK_PROXY || + flags->detachable == TASK_DETACHABLE || !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) -#else - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) -#endif { KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); -#if OMP_40_ENABLED if (parent_task->td_taskgroup) KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); -#endif // Only need to keep track of allocated child tasks for explicit tasks since // implicit not deallocated if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { @@ -1378,18 +1362,12 @@ kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, input_flags->native = FALSE; // __kmp_task_alloc() sets up all other runtime flags -#if OMP_45_ENABLED - KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) " + KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) " "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", - input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t, + input_flags->proxy ? "proxy" : "", + input_flags->detachable ? 
"detachable" : "", sizeof_kmp_task_t, sizeof_shareds, task_entry)); -#else - KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) " - "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", - gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", - sizeof_kmp_task_t, sizeof_shareds, task_entry)); -#endif retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, sizeof_shareds, task_entry); @@ -1399,7 +1377,16 @@ kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, return retval; } -#if OMP_50_ENABLED +kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 flags, + size_t sizeof_kmp_task_t, + size_t sizeof_shareds, + kmp_routine_entry_t task_entry, + kmp_int64 device_id) { + return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, + sizeof_shareds, task_entry); +} + /*! @ingroup TASKING @param loc_ref location of the original task directive @@ -1419,7 +1406,6 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, kmp_task_affinity_info_t *affin_list) { return 0; } -#endif // __kmp_invoke_task: invoke the specified task // @@ -1430,14 +1416,11 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); kmp_info_t *thread; -#if OMP_40_ENABLED int discard = 0 /* false */; -#endif KA_TRACE( 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", gtid, taskdata, current_task)); KMP_DEBUG_ASSERT(task); -#if OMP_45_ENABLED if (taskdata->td_flags.proxy == TASK_PROXY && taskdata->td_flags.complete == 1) { // This is a proxy task that was already completed but it needs to run @@ -1455,7 +1438,6 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, return; } -#endif #if OMPT_SUPPORT // For untied tasks, the first task executed only calls __kmpc_omp_task and @@ -1473,17 +1455,12 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, } #endif -#if OMP_45_ENABLED // Proxy tasks are not handled by the runtime if (taskdata->td_flags.proxy != TASK_PROXY) { -#endif ANNOTATE_HAPPENS_AFTER(task); __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded -#if OMP_45_ENABLED } -#endif -#if OMP_40_ENABLED // TODO: cancel tasks if the parallel region has also been cancelled // TODO: check if this sequence can be hoisted above __kmp_task_start // if cancellation has been enabled for this run ... @@ -1541,7 +1518,6 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, break; } #endif // KMP_STATS_ENABLED -#endif // OMP_40_ENABLED // OMPT task begin #if OMPT_SUPPORT @@ -1581,15 +1557,11 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, } #endif -#if OMP_40_ENABLED } -#endif // OMP_40_ENABLED -#if OMP_45_ENABLED // Proxy tasks are not handled by the runtime if (taskdata->td_flags.proxy != TASK_PROXY) { -#endif ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); #if OMPT_SUPPORT if (UNLIKELY(ompt_enabled.enabled)) { @@ -1601,9 +1573,7 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, } else #endif __kmp_task_finish<false>(gtid, task, current_task); -#if OMP_45_ENABLED } -#endif KA_TRACE( 30, @@ -1684,14 +1654,10 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, bool serialize_immediate) { kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); -/* Should we execute the new task or queue it? For now, let's just always try to - queue it. If the queue fills up, then we'll execute it. 
*/ -#if OMP_45_ENABLED + /* Should we execute the new task or queue it? For now, let's just always try + to queue it. If the queue fills up, then we'll execute it. */ if (new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer -#else - if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer -#endif { // Execute this task immediately kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; if (serialize_immediate) @@ -1880,10 +1846,8 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, bool must_wait = !taskdata->td_flags.team_serial && !taskdata->td_flags.final; -#if OMP_45_ENABLED must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks); -#endif if (must_wait) { kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskdata->td_incomplete_child_tasks)), @@ -2018,51 +1982,110 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { return TASK_CURRENT_NOT_QUEUED; } -#if OMP_50_ENABLED // Task Reduction implementation +// +// Note: initial implementation didn't take into account the possibility +// to specify omp_orig for initializer of the UDR (user defined reduction). +// Corrected implementation takes into account the omp_orig object. +// Compiler is free to use old implementation if omp_orig is not specified. -typedef struct kmp_task_red_flags { - unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) +/*! +@ingroup BASIC_TYPES +@{ +*/ + +/*! +Flags for special info per task reduction item. +*/ +typedef struct kmp_taskred_flags { + /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ + unsigned lazy_priv : 1; unsigned reserved31 : 31; -} kmp_task_red_flags_t; - -// internal structure for reduction data item related info -typedef struct kmp_task_red_data { - void *reduce_shar; // shared reduction item - size_t reduce_size; // size of data item - void *reduce_priv; // thread specific data - void *reduce_pend; // end of private data for comparison op - void *reduce_init; // data initialization routine - void *reduce_fini; // data finalization routine - void *reduce_comb; // data combiner routine - kmp_task_red_flags_t flags; // flags for additional info from compiler -} kmp_task_red_data_t; - -// structure sent us by compiler - one per reduction item +} kmp_taskred_flags_t; + +/*! +Internal struct for reduction data item related info set up by compiler. +*/ typedef struct kmp_task_red_input { - void *reduce_shar; // shared reduction item - size_t reduce_size; // size of data item - void *reduce_init; // data initialization routine - void *reduce_fini; // data finalization routine - void *reduce_comb; // data combiner routine - kmp_task_red_flags_t flags; // flags for additional info from compiler + void *reduce_shar; /**< shared between tasks item to reduce into */ + size_t reduce_size; /**< size of data item in bytes */ + // three compiler-generated routines (init, fini are optional): + void *reduce_init; /**< data initialization routine (single parameter) */ + void *reduce_fini; /**< data finalization routine */ + void *reduce_comb; /**< data combiner routine */ + kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ } kmp_task_red_input_t; /*! 
-@ingroup TASKING -@param gtid Global thread ID -@param num Number of data items to reduce -@param data Array of data for reduction -@return The taskgroup identifier +Internal struct for reduction data item related info saved by the library. +*/ +typedef struct kmp_taskred_data { + void *reduce_shar; /**< shared between tasks item to reduce into */ + size_t reduce_size; /**< size of data item */ + kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ + void *reduce_priv; /**< array of thread specific items */ + void *reduce_pend; /**< end of private data for faster comparison op */ + // three compiler-generated routines (init, fini are optional): + void *reduce_comb; /**< data combiner routine */ + void *reduce_init; /**< data initialization routine (two parameters) */ + void *reduce_fini; /**< data finalization routine */ + void *reduce_orig; /**< original item (can be used in UDR initializer) */ +} kmp_taskred_data_t; -Initialize task reduction for the taskgroup. +/*! +Internal struct for reduction data item related info set up by compiler. + +New interface: added reduce_orig field to provide omp_orig for UDR initializer. */ -void *__kmpc_task_reduction_init(int gtid, int num, void *data) { +typedef struct kmp_taskred_input { + void *reduce_shar; /**< shared between tasks item to reduce into */ + void *reduce_orig; /**< original reduction item used for initialization */ + size_t reduce_size; /**< size of data item */ + // three compiler-generated routines (init, fini are optional): + void *reduce_init; /**< data initialization routine (two parameters) */ + void *reduce_fini; /**< data finalization routine */ + void *reduce_comb; /**< data combiner routine */ + kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ +} kmp_taskred_input_t; +/*! 
+@} +*/ + +template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); +template <> +void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, + kmp_task_red_input_t &src) { + item.reduce_orig = NULL; +} +template <> +void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, + kmp_taskred_input_t &src) { + if (src.reduce_orig != NULL) { + item.reduce_orig = src.reduce_orig; + } else { + item.reduce_orig = src.reduce_shar; + } // non-NULL reduce_orig means new interface used +} + +template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j); +template <> +void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, + int offset) { + ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); +} +template <> +void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, + int offset) { + ((void (*)(void *, void *))item.reduce_init)( + (char *)(item.reduce_priv) + offset, item.reduce_orig); +} + +template <typename T> +void *__kmp_task_reduction_init(int gtid, int num, T *data) { kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; kmp_int32 nth = thread->th.th_team_nproc; - kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; - kmp_task_red_data_t *arr; + kmp_taskred_data_t *arr; // check input data just in case KMP_ASSERT(tg != NULL); @@ -2075,33 +2098,34 @@ void *__kmpc_task_reduction_init(int gtid, int num, void *data) { } KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", gtid, tg, num)); - arr = (kmp_task_red_data_t *)__kmp_thread_malloc( - thread, num * sizeof(kmp_task_red_data_t)); + arr = (kmp_taskred_data_t *)__kmp_thread_malloc( + thread, num * sizeof(kmp_taskred_data_t)); for (int i = 0; i < num; ++i) { - void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); - size_t size = input[i].reduce_size - 1; + size_t size = data[i].reduce_size - 1; // round the size up to cache line per thread-specific item size += CACHE_LINE - size % CACHE_LINE; - KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory - arr[i].reduce_shar = input[i].reduce_shar; + KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory + arr[i].reduce_shar = data[i].reduce_shar; arr[i].reduce_size = size; - arr[i].reduce_init = input[i].reduce_init; - arr[i].reduce_fini = input[i].reduce_fini; - arr[i].reduce_comb = input[i].reduce_comb; - arr[i].flags = input[i].flags; - if (!input[i].flags.lazy_priv) { + arr[i].flags = data[i].flags; + arr[i].reduce_comb = data[i].reduce_comb; + arr[i].reduce_init = data[i].reduce_init; + arr[i].reduce_fini = data[i].reduce_fini; + __kmp_assign_orig<T>(arr[i], data[i]); + if (!arr[i].flags.lazy_priv) { // allocate cache-line aligned block and fill it with zeros arr[i].reduce_priv = __kmp_allocate(nth * size); arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; - if (f_init != NULL) { - // initialize thread-specific items + if (arr[i].reduce_init != NULL) { + // initialize all thread-specific items for (int j = 0; j < nth; ++j) { - f_init((char *)(arr[i].reduce_priv) + j * size); + __kmp_call_init<T>(arr[i], j * size); } } } else { // only allocate space for pointers now, - // objects will be lazily allocated/initialized once requested + // objects will be lazily allocated/initialized if/when requested + // note that __kmp_allocate zeroes the allocated memory arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); } } @@ -2112,6 +2136,59 @@ void 
*__kmpc_task_reduction_init(int gtid, int num, void *data) { /*! @ingroup TASKING +@param gtid Global thread ID +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for the taskgroup. + +Note: this entry supposes the optional compiler-generated initializer routine +has single parameter - pointer to object to be initialized. That means +the reduction either does not use omp_orig object, or the omp_orig is accessible +without help of the runtime library. +*/ +void *__kmpc_task_reduction_init(int gtid, int num, void *data) { + return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); +} + +/*! +@ingroup TASKING +@param gtid Global thread ID +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for the taskgroup. + +Note: this entry supposes the optional compiler-generated initializer routine +has two parameters, pointer to object to be initialized and pointer to omp_orig +*/ +void *__kmpc_taskred_init(int gtid, int num, void *data) { + return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); +} + +// Copy task reduction data (except for shared pointers). +template <typename T> +void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, + kmp_taskgroup_t *tg, void *reduce_data) { + kmp_taskred_data_t *arr; + KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," + " from data %p\n", + thr, tg, reduce_data)); + arr = (kmp_taskred_data_t *)__kmp_thread_malloc( + thr, num * sizeof(kmp_taskred_data_t)); + // threads will share private copies, thunk routines, sizes, flags, etc.: + KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); + for (int i = 0; i < num; ++i) { + arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers + } + tg->reduce_data = (void *)arr; + tg->reduce_num_data = num; +} + +/*! 
+@ingroup TASKING @param gtid Global thread ID @param tskgrp The taskgroup ID (optional) @param data Shared location of the item @@ -2129,7 +2206,7 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { if (tg == NULL) tg = thread->th.th_current_task->td_taskgroup; KMP_ASSERT(tg != NULL); - kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); + kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); kmp_int32 num = tg->reduce_num_data; kmp_int32 tid = thread->th.th_info.ds.ds_tid; @@ -2153,17 +2230,21 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { found: if (p_priv[tid] == NULL) { // allocate thread specific object lazily - void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); p_priv[tid] = __kmp_allocate(arr[i].reduce_size); - if (f_init != NULL) { - f_init(p_priv[tid]); + if (arr[i].reduce_init != NULL) { + if (arr[i].reduce_orig != NULL) { // new interface + ((void (*)(void *, void *))arr[i].reduce_init)( + p_priv[tid], arr[i].reduce_orig); + } else { // old interface (single parameter) + ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); + } } } return p_priv[tid]; } } tg = tg->parent; - arr = (kmp_task_red_data_t *)(tg->reduce_data); + arr = (kmp_taskred_data_t *)(tg->reduce_data); num = tg->reduce_num_data; } KMP_ASSERT2(0, "Unknown task reduction item"); @@ -2175,7 +2256,7 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { kmp_int32 nth = th->th.th_team_nproc; KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 - kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; + kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; kmp_int32 num = tg->reduce_num_data; for (int i = 0; i < num; ++i) { void *sh_data = arr[i].reduce_shar; @@ -2208,9 +2289,112 @@ static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { tg->reduce_data = NULL; tg->reduce_num_data = 0; } -#endif -#if OMP_40_ENABLED +// Cleanup task reduction data for parallel or worksharing, +// do not touch task private data other threads still working with. 
+// Called from __kmpc_end_taskgroup() +static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { + __kmp_thread_free(th, tg->reduce_data); + tg->reduce_data = NULL; + tg->reduce_num_data = 0; +} + +template <typename T> +void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, + int num, T *data) { + kmp_info_t *thr = __kmp_threads[gtid]; + kmp_int32 nth = thr->th.th_team_nproc; + __kmpc_taskgroup(loc, gtid); // form new taskgroup first + if (nth == 1) { + KA_TRACE(10, + ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", + gtid, thr->th.th_current_task->td_taskgroup)); + return (void *)thr->th.th_current_task->td_taskgroup; + } + kmp_team_t *team = thr->th.th_team; + void *reduce_data; + kmp_taskgroup_t *tg; + reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); + if (reduce_data == NULL && + __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, + (void *)1)) { + // single thread enters this block to initialize common reduction data + KMP_DEBUG_ASSERT(reduce_data == NULL); + // first initialize own data, then make a copy other threads can use + tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); + reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); + KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); + // fini counters should be 0 at this point + KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); + KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); + KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); + } else { + while ( + (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == + (void *)1) { // wait for task reduction initialization + KMP_CPU_PAUSE(); + } + KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here + tg = thr->th.th_current_task->td_taskgroup; + __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); + } + return tg; +} + +/*! +@ingroup TASKING +@param loc Source location info +@param gtid Global thread ID +@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for a parallel or worksharing. + +Note: this entry supposes the optional compiler-generated initializer routine +has single parameter - pointer to object to be initialized. That means +the reduction either does not use omp_orig object, or the omp_orig is accessible +without help of the runtime library. +*/ +void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, + int num, void *data) { + return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, + (kmp_task_red_input_t *)data); +} + +/*! +@ingroup TASKING +@param loc Source location info +@param gtid Global thread ID +@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for a parallel or worksharing. + +Note: this entry supposes the optional compiler-generated initializer routine +has two parameters, pointer to object to be initialized and pointer to omp_orig +*/ +void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, + void *data) { + return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, + (kmp_taskred_input_t *)data); +} + +/*! 
+@ingroup TASKING +@param loc Source location info +@param gtid Global thread ID +@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise + +Finalize task reduction for a parallel or worksharing. +*/ +void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { + __kmpc_end_taskgroup(loc, gtid); +} + // __kmpc_taskgroup: Start a new taskgroup void __kmpc_taskgroup(ident_t *loc, int gtid) { kmp_info_t *thread = __kmp_threads[gtid]; @@ -2221,10 +2405,8 @@ void __kmpc_taskgroup(ident_t *loc, int gtid) { KMP_ATOMIC_ST_RLX(&tg_new->count, 0); KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); tg_new->parent = taskdata->td_taskgroup; -#if OMP_50_ENABLED tg_new->reduce_data = NULL; tg_new->reduce_num_data = 0; -#endif taskdata->td_taskgroup = tg_new; #if OMPT_SUPPORT && OMPT_OPTIONAL @@ -2293,14 +2475,9 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { } #endif -#if OMP_45_ENABLED if (!taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && - thread->th.th_task_team->tt.tt_found_proxy_tasks)) -#else - if (!taskdata->td_flags.team_serial) -#endif - { + thread->th.th_task_team->tt.tt_found_proxy_tasks)) { kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U); while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { @@ -2326,10 +2503,54 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { } KMP_DEBUG_ASSERT(taskgroup->count == 0); -#if OMP_50_ENABLED - if (taskgroup->reduce_data != NULL) // need to reduce? - __kmp_task_reduction_fini(thread, taskgroup); -#endif + if (taskgroup->reduce_data != NULL) { // need to reduce? + int cnt; + void *reduce_data; + kmp_team_t *t = thread->th.th_team; + kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; + // check if <priv> data of the first reduction variable shared for the team + void *priv0 = arr[0].reduce_priv; + if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && + ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { + // finishing task reduction on parallel + cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); + if (cnt == thread->th.th_team_nproc - 1) { + // we are the last thread passing __kmpc_reduction_modifier_fini() + // finalize task reduction: + __kmp_task_reduction_fini(thread, taskgroup); + // cleanup fields in the team structure: + // TODO: is relaxed store enough here (whole barrier should follow)? + __kmp_thread_free(thread, reduce_data); + KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); + KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); + } else { + // we are not the last thread passing __kmpc_reduction_modifier_fini(), + // so do not finalize reduction, just clean own copy of the data + __kmp_task_reduction_clean(thread, taskgroup); + } + } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != + NULL && + ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { + // finishing task reduction on worksharing + cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); + if (cnt == thread->th.th_team_nproc - 1) { + // we are the last thread passing __kmpc_reduction_modifier_fini() + __kmp_task_reduction_fini(thread, taskgroup); + // cleanup fields in team structure: + // TODO: is relaxed store enough here (whole barrier should follow)? 
+ __kmp_thread_free(thread, reduce_data); + KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); + KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); + } else { + // we are not the last thread passing __kmpc_reduction_modifier_fini(), + // so do not finalize reduction, just clean own copy of the data + __kmp_task_reduction_clean(thread, taskgroup); + } + } else { + // finishing task reduction on taskgroup + __kmp_task_reduction_fini(thread, taskgroup); + } + } // Restore parent taskgroup for the current task taskdata->td_taskgroup = taskgroup->parent; __kmp_thread_free(thread, taskgroup); @@ -2346,7 +2567,6 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { } #endif } -#endif // __kmp_remove_my_task: remove a task from my own deque static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, @@ -2589,11 +2809,7 @@ static inline int __kmp_execute_tasks_template( nthreads = task_team->tt.tt_nproc; unfinished_threads = &(task_team->tt.tt_unfinished_threads); -#if OMP_45_ENABLED KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks); -#else - KMP_DEBUG_ASSERT(nthreads > 1); -#endif KMP_DEBUG_ASSERT(*unfinished_threads >= 0); while (1) { // Outer loop keeps trying to find tasks in case of single thread @@ -2706,8 +2922,7 @@ static inline int __kmp_execute_tasks_template( if (thread->th.th_task_team == NULL) { break; } - // Yield before executing next task - KMP_YIELD(__kmp_library == library_throughput); + KMP_YIELD(__kmp_library == library_throughput); // Yield before next task // If execution of a stolen task results in more tasks being placed on our // run queue, reset use_own_tasks if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { @@ -2719,17 +2934,11 @@ static inline int __kmp_execute_tasks_template( } } -// The task source has been exhausted. If in final spin loop of barrier, check -// if termination condition is satisfied. -#if OMP_45_ENABLED - // The work queue may be empty but there might be proxy tasks still - // executing + // The task source has been exhausted. If in final spin loop of barrier, + // check if termination condition is satisfied. The work queue may be empty + // but there might be proxy tasks still executing. if (final_spin && - KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks) == 0) -#else - if (final_spin) -#endif - { + KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks) == 0) { // First, decrement the #unfinished threads, if that has not already been // done. This decrement might be to the spin location, and result in the // termination condition being satisfied. @@ -2765,14 +2974,11 @@ static inline int __kmp_execute_tasks_template( return FALSE; } -#if OMP_45_ENABLED // We could be getting tasks from target constructs; if this is the only // thread, keep trying to execute tasks from own queue if (nthreads == 1) use_own_tasks = 1; - else -#endif - { + else { KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid)); return FALSE; @@ -2839,7 +3045,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); KMP_DEBUG_ASSERT(threads_data != NULL); - if ((__kmp_tasking_mode == tskm_task_teams) && + if (__kmp_tasking_mode == tskm_task_teams && (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) { // Release any threads sleeping at the barrier, so that they can steal // tasks and execute them. 
In extra barrier mode, tasks do not sleep @@ -2895,12 +3101,8 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes * of the kmp_task_team_t structs for consecutive barriers can overlap * (and will, unless the master thread is the last thread to exit the barrier - * release phase, which is not typical). - * - * The existence of such a struct is useful outside the context of tasking, - * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro, - * so that any performance differences show up when comparing the 2.5 vs. 3.0 - * libraries. + * release phase, which is not typical). The existence of such a struct is + * useful outside the context of tasking. * * We currently use the existence of the threads array as an indicator that * tasks were spawned since the last barrier. If the structure is to be @@ -3129,9 +3331,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, } TCW_4(task_team->tt.tt_found_tasks, FALSE); -#if OMP_45_ENABLED TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); -#endif task_team->tt.tt_nproc = nthreads = team->t.t_nproc; KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); @@ -3243,10 +3443,8 @@ void __kmp_wait_to_unref_task_teams(void) { break; } - // If we are oversubscribed, or have waited a bit (and library mode is - // throughput), yield. Pause is in the following code. - KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); - KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput + // If oversubscribed or have waited a bit, yield. + KMP_YIELD_OVERSUB_ELSE_SPIN(spins); } } @@ -3295,9 +3493,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { team->t.t_nproc != task_team->tt.tt_nproc) { TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); TCW_4(task_team->tt.tt_found_tasks, FALSE); -#if OMP_45_ENABLED TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); -#endif KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team->t.t_nproc); TCW_4(task_team->tt.tt_active, TRUE); @@ -3368,13 +3564,9 @@ void __kmp_task_team_wait( ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: " "setting active to false, setting local and team's pointer to NULL\n", __kmp_gtid_from_thread(this_thr), task_team)); -#if OMP_45_ENABLED KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE); TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); -#else - KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1); -#endif KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); TCW_SYNC_4(task_team->tt.tt_active, FALSE); KMP_MB(); @@ -3411,15 +3603,13 @@ void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { __kmp_abort_thread(); break; } - KMP_YIELD(TRUE); // GH: We always yield here + KMP_YIELD(TRUE); } #if USE_ITT_BUILD KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); #endif /* USE_ITT_BUILD */ } -#if OMP_45_ENABLED - // __kmp_give_task puts a task into a given thread queue if: // - the queue for that thread was created // - there's space in that queue @@ -3639,6 +3829,56 @@ void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { taskdata)); } +kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, + kmp_task_t *task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { + td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; + td->td_allow_completion_event.ed.task = task; 
+ __kmp_init_tas_lock(&td->td_allow_completion_event.lock); + } + return &td->td_allow_completion_event; +} + +void __kmp_fulfill_event(kmp_event_t *event) { + if (event->type == KMP_EVENT_ALLOW_COMPLETION) { + kmp_task_t *ptask = event->ed.task; + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); + bool detached = false; + int gtid = __kmp_get_gtid(); + + if (taskdata->td_flags.proxy == TASK_PROXY) { + // The associated task code completed before this call and detached. + detached = true; + event->type = KMP_EVENT_UNINITIALIZED; + } else { + // The associated task has not completed but could be completing at this + // point. + // We need to take the lock to avoid races + __kmp_acquire_tas_lock(&event->lock, gtid); + if (taskdata->td_flags.proxy == TASK_PROXY) + detached = true; + event->type = KMP_EVENT_UNINITIALIZED; + __kmp_release_tas_lock(&event->lock, gtid); + } + + if (detached) { + // If the task detached complete the proxy task + if (gtid >= 0) { + kmp_team_t *team = taskdata->td_team; + kmp_info_t *thread = __kmp_get_thread(); + if (thread->th.th_team == team) { + __kmpc_proxy_task_completed(gtid, ptask); + return; + } + } + + // fallback + __kmpc_proxy_task_completed_ooo(ptask); + } + } +} + // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task // for taskloop // @@ -4206,6 +4446,7 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, case 0: // no schedule clause specified, we can choose the default // let's try to schedule (team_size*10) tasks grainsize = thread->th.th_team_nproc * 10; + KMP_FALLTHROUGH(); case 2: // num_tasks provided if (grainsize > tc) { num_tasks = tc; // too big num_tasks requested, adjust values @@ -4289,5 +4530,3 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, } KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); } - -#endif |
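
For readers coming to this change from the user side: the new `__kmpc_task_allow_completion_event` and `__kmp_fulfill_event` entry points are the runtime half of the OpenMP 5.0 `detach` clause and `omp_fulfill_event()`. Below is a minimal sketch of the usage pattern these hunks enable; `async_write` and its callback are hypothetical stand-ins for any event-driven library, the exact lowering is compiler-dependent, and the handle-to-pointer casts assume `omp_event_handle_t` is pointer-sized (as it is in this runtime, where it wraps a `kmp_event_t *`).

```c
#include <omp.h>
#include <stdint.h>

/* Hypothetical asynchronous API: invokes done(arg) from some other thread
 * once the operation has finished. */
extern void async_write(const char *buf, void (*done)(void *), void *arg);

static void write_done(void *arg) {
  /* May run on a non-OpenMP thread; maps to __kmp_fulfill_event(), which
   * completes the detached (by then proxified) task. */
  omp_fulfill_event((omp_event_handle_t)(uintptr_t)arg);
}

void post_write(const char *buf) {
  omp_event_handle_t ev;
#pragma omp task detach(ev) /* event created via __kmpc_task_allow_completion_event */
  async_write(buf, write_done, (void *)(uintptr_t)ev);
#pragma omp taskwait /* returns only after write_done() has fulfilled ev */
}
```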
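The reworked reduction bookkeeping (`kmp_taskred_input_t`, `__kmpc_taskred_init`, the two-parameter `reduce_init`) exists so that a user-defined reduction whose initializer reads `omp_orig` can participate in task reductions. A hedged user-level sketch of code that needs the new interface follows; the `minval` reduction is illustrative rather than taken from this patch, and whether a given compiler emits `__kmpc_taskred_init` or the older `__kmpc_task_reduction_init` depends on its lowering.

```c
#include <omp.h>

/* UDR whose private copies must start from the original variable's value
 * (omp_orig) rather than from a constant identity; carrying omp_orig to the
 * initializer is what the reduce_orig field added in this patch is for. */
#pragma omp declare reduction(minval : double : \
        omp_out = (omp_in < omp_out ? omp_in : omp_out)) \
    initializer(omp_priv = omp_orig)

double task_min(const double *a, int n) {
  double m = a[0];
#pragma omp parallel
#pragma omp single
  /* Each participating task gets a lazily created private copy through
   * __kmpc_task_reduction_get_th_data(); the taskgroup combines them. */
#pragma omp taskgroup task_reduction(minval : m)
  for (int i = 0; i < n; ++i) {
#pragma omp task in_reduction(minval : m)
    if (a[i] < m)
      m = a[i];
  }
  return m;
}
```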
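Similarly, `__kmpc_task_reduction_modifier_init`, `__kmpc_taskred_modifier_init`, and `__kmpc_task_reduction_modifier_fini`, together with the `t_tg_reduce_data`/`t_tg_fini_counter` team fields they coordinate through, back the OpenMP 5.0 `task` reduction modifier on parallel and worksharing constructs. A brief, illustrative sketch of the construct they serve:

```c
#include <omp.h>

double sum_with_tasks(const double *a, int n) {
  double sum = 0.0;
  /* The 'task' modifier makes the reduction usable from explicit tasks via
   * in_reduction; per the hunks above, each thread opens an internal
   * taskgroup in modifier_init and closes it in modifier_fini. */
#pragma omp parallel for reduction(task, + : sum)
  for (int i = 0; i < n; ++i) {
#pragma omp task in_reduction(+ : sum)
    sum += a[i];
  }
  return sum;
}
```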