Diffstat (limited to 'contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp')
-rw-r--r--  contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp | 154
1 file changed, 85 insertions(+), 69 deletions(-)
diff --git a/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp b/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp
index a31490cbd162..4b188763a58a 100644
--- a/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp
+++ b/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp
@@ -231,13 +231,12 @@ void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_threads) {
KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
global_tid, num_threads));
-
+ __kmp_assert_valid_gtid(global_tid);
__kmp_push_num_threads(loc, global_tid, num_threads);
}
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
-
/* the num_threads are automatically popped */
}
@@ -245,7 +244,7 @@ void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
kmp_int32 proc_bind) {
KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
proc_bind));
-
+ __kmp_assert_valid_gtid(global_tid);
__kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}
@@ -298,8 +297,8 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
}
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(gtid);
}
+ OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
#if INCLUDE_SSC_MARKS
@@ -325,6 +324,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
#if KMP_STATS_ENABLED
if (previous_state == stats_state_e::SERIAL_REGION) {
KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+ KMP_SET_THREAD_STATE(previous_state);
} else {
KMP_POP_PARTITIONED_TIMER();
}
@@ -347,7 +347,7 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
KA_TRACE(20,
("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
global_tid, num_teams, num_threads));
-
+ __kmp_assert_valid_gtid(global_tid);
__kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}
@@ -437,6 +437,7 @@ void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
#if KMP_STATS_ENABLED
if (previous_state == stats_state_e::SERIAL_REGION) {
KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+ KMP_SET_THREAD_STATE(previous_state);
} else {
KMP_POP_PARTITIONED_TIMER();
}
@@ -462,9 +463,10 @@ conditional parallel region, like this,
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
-// The implementation is now in kmp_runtime.cpp so that it can share static
-// functions with kmp_fork_call since the tasks to be done are similar in
-// each case.
+ // The implementation is now in kmp_runtime.cpp so that it can share static
+ // functions with kmp_fork_call since the tasks to be done are similar in
+ // each case.
+ __kmp_assert_valid_gtid(global_tid);
#if OMPT_SUPPORT
OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
@@ -492,6 +494,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
return;
// Not autopar code
+ __kmp_assert_valid_gtid(global_tid);
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
@@ -667,17 +670,6 @@ void __kmpc_flush(ident_t *loc) {
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
-#if KMP_OS_CNK
- // The flushing thread needs to yield here; this prevents a
- // busy-waiting thread from saturating the pipeline. flush is
- // often used in loops like this:
- // while (!flag) {
- // #pragma omp flush(flag)
- // }
- // and adding the yield here is good for at least a 10x speedup
- // when running >2 threads per core (on the NAS LU benchmark).
- __kmp_yield();
-#endif
#else
#error Unknown or unsupported architecture
#endif
@@ -701,6 +693,7 @@ Execute a barrier.
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
KMP_COUNT_BLOCK(OMP_BARRIER);
KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
@@ -720,8 +713,8 @@ void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
__kmp_threads[global_tid]->th.th_ident = loc;
// TODO: explicit barrier_wait_id:
@@ -750,6 +743,7 @@ kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
int status = 0;
KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
@@ -764,12 +758,12 @@ kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (status) {
- if (ompt_enabled.ompt_callback_master) {
+ if (ompt_enabled.ompt_callback_masked) {
kmp_info_t *this_thr = __kmp_threads[global_tid];
kmp_team_t *team = this_thr->th.th_team;
int tid = __kmp_tid_from_gtid(global_tid);
- ompt_callbacks.ompt_callback(ompt_callback_master)(
+ ompt_callbacks.ompt_callback(ompt_callback_masked)(
ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
&(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
OMPT_GET_RETURN_ADDRESS(0));
@@ -804,16 +798,16 @@ thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
-
+ __kmp_assert_valid_gtid(global_tid);
KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
KMP_POP_PARTITIONED_TIMER();
#if OMPT_SUPPORT && OMPT_OPTIONAL
kmp_info_t *this_thr = __kmp_threads[global_tid];
kmp_team_t *team = this_thr->th.th_team;
- if (ompt_enabled.ompt_callback_master) {
+ if (ompt_enabled.ompt_callback_masked) {
int tid = __kmp_tid_from_gtid(global_tid);
- ompt_callbacks.ompt_callback(ompt_callback_master)(
+ ompt_callbacks.ompt_callback(ompt_callback_masked)(
ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
&(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
OMPT_GET_RETURN_ADDRESS(0));
@@ -821,9 +815,6 @@ void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
#endif
if (__kmp_env_consistency_check) {
- if (global_tid < 0)
- KMP_WARNING(ThreadIdentInvalid);
-
if (KMP_MASTER_GTID(global_tid))
__kmp_pop_sync(global_tid, ct_master, loc);
}
@@ -842,6 +833,7 @@ void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
KMP_DEBUG_ASSERT(__kmp_init_serial);
KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
+ __kmp_assert_valid_gtid(gtid);
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
@@ -859,8 +851,8 @@ void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
kmp_team_t *team;
ompt_wait_id_t lck;
void *codeptr_ra;
+ OMPT_STORE_RETURN_ADDRESS(gtid);
if (ompt_enabled.enabled) {
- OMPT_STORE_RETURN_ADDRESS(gtid);
team = __kmp_team_from_gtid(gtid);
lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
/* OMPT state update */
@@ -913,6 +905,7 @@ void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
kmp_info_t *th;
KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
+ __kmp_assert_valid_gtid(gtid);
#if USE_ITT_BUILD
__kmp_itt_ordered_end(gtid);
@@ -1135,7 +1128,7 @@ static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
/*!
@ingroup WORK_SHARING
@param loc source location information.
-@param global_tid global thread number .
+@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@@ -1158,6 +1151,7 @@ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
kmp_user_lock_p lck;
KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
// TODO: add THR_OVHD_STATE
@@ -1255,7 +1249,7 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
if (hint & kmp_lock_hint_hle)
return KMP_TSX_LOCK(hle);
if (hint & kmp_lock_hint_rtm)
- return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
+ return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
if (hint & kmp_lock_hint_adaptive)
return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
@@ -1274,9 +1268,9 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
return lockseq_tas;
- // HLE lock for speculation
+ // Use RTM lock for speculation
if (hint & omp_lock_hint_speculative)
- return KMP_TSX_LOCK(hle);
+ return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
return __kmp_user_lock_seq;
}
@@ -1297,6 +1291,7 @@ __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
return kmp_mutex_impl_spin;
#if KMP_USE_TSX
case locktag_hle:
+ case locktag_rtm_spin:
return kmp_mutex_impl_speculative;
#endif
default:
@@ -1308,7 +1303,7 @@ __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
switch (ilock->type) {
#if KMP_USE_TSX
case locktag_adaptive:
- case locktag_rtm:
+ case locktag_rtm_queuing:
return kmp_mutex_impl_speculative;
#endif
case locktag_nested_tas:
@@ -1342,7 +1337,8 @@ static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
case lk_hle:
- case lk_rtm:
+ case lk_rtm_queuing:
+ case lk_rtm_spin:
case lk_adaptive:
return kmp_mutex_impl_speculative;
#endif
@@ -1380,6 +1376,7 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
#endif
KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
// Check if it is initialized.
@@ -1595,8 +1592,8 @@ this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
int status;
-
KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
@@ -1612,8 +1609,8 @@ kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
__kmp_threads[global_tid]->th.th_ident = loc;
@@ -1639,7 +1636,7 @@ still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
-
+ __kmp_assert_valid_gtid(global_tid);
__kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
@@ -1655,8 +1652,8 @@ There is no equivalent "end" function, since the
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
kmp_int32 ret;
-
KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
@@ -1676,8 +1673,8 @@ kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
__kmp_threads[global_tid]->th.th_ident = loc;
@@ -1694,14 +1691,9 @@ kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
if (__kmp_env_consistency_check) {
/* there's no __kmpc_end_master called; so the (stats) */
/* actions of __kmpc_end_master are done here */
-
- if (global_tid < 0) {
- KMP_WARNING(ThreadIdentInvalid);
- }
if (ret) {
/* only one thread should do the pop since only */
/* one did the push (see __kmpc_master()) */
-
__kmp_pop_sync(global_tid, ct_master, loc);
}
}
@@ -1722,6 +1714,7 @@ should introduce an explicit barrier if it is required.
*/
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
+ __kmp_assert_valid_gtid(global_tid);
kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
if (rc) {
@@ -1774,6 +1767,7 @@ only be called by the thread that executed the block of code protected
by the `single` construct.
*/
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
+ __kmp_assert_valid_gtid(global_tid);
__kmp_exit_single(global_tid);
KMP_POP_PARTITIONED_TIMER();
@@ -1847,7 +1841,7 @@ void ompc_set_dynamic(int flag) {
__kmp_save_internal_controls(thread);
- set__dynamic(thread, flag ? TRUE : FALSE);
+ set__dynamic(thread, flag ? true : false);
}
void ompc_set_nested(int flag) {
@@ -2053,8 +2047,8 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
void *cpy_data, void (*cpy_func)(void *, void *),
kmp_int32 didit) {
void **data_ptr;
-
KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
+ __kmp_assert_valid_gtid(gtid);
KMP_MB();
@@ -2077,8 +2071,8 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(gtid);
}
+ OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
@@ -2091,11 +2085,9 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
// Consider next barrier a user-visible barrier for barrier region boundaries
// Nesting checks are already handled by the single construct checks
-
+ {
#if OMPT_SUPPORT
- if (ompt_enabled.enabled) {
OMPT_STORE_RETURN_ADDRESS(gtid);
- }
#endif
#if USE_ITT_NOTIFY
__kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
@@ -2107,6 +2099,7 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
ompt_frame->enter_frame = ompt_data_none;
}
#endif
+ }
}
/* -------------------------------------------------------------------------- */
@@ -2153,7 +2146,8 @@ __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
kmp_dyna_lockseq_t seq) {
#if KMP_USE_TSX
// Don't have nested lock implementation for speculative locks
- if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
+ if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
+ seq == lockseq_rtm_spin || seq == lockseq_adaptive)
seq = __kmp_user_lock_seq;
#endif
switch (seq) {
@@ -3338,7 +3332,7 @@ __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
th->th.th_team = team;
th->th.th_team_nproc = team->t.t_nproc;
th->th.th_task_team = team->t.t_task_team[task_state];
- th->th.th_task_state = task_state;
+ __kmp_type_convert(task_state, &(th->th.th_task_state));
}
/* 2.a.i. Reduce Block without a terminating barrier */
@@ -3370,6 +3364,7 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
kmp_team_t *team;
int teams_swapped = 0, task_state;
KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
// why do we need this initialization here at all?
// Reduction clause can not be used as a stand-alone directive.
@@ -3469,8 +3464,8 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
__kmp_threads[global_tid]->th.th_ident = loc;
@@ -3523,6 +3518,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
PACKED_REDUCTION_METHOD_T packed_reduction_method;
KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
@@ -3597,6 +3593,7 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
int teams_swapped = 0, task_state;
KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
// why do we need this initialization here at all?
// Reduction clause can not be a stand-alone directive.
@@ -3656,8 +3653,8 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
__kmp_threads[global_tid]->th.th_ident =
@@ -3715,6 +3712,7 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
int teams_swapped = 0, task_state;
KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
th = __kmp_thread_from_gtid(global_tid);
teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
@@ -3737,8 +3735,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
__kmp_threads[global_tid]->th.th_ident = loc;
@@ -3763,8 +3761,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
__kmp_threads[global_tid]->th.th_ident = loc;
@@ -3784,8 +3782,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
if (ompt_frame->enter_frame.ptr == NULL)
ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
- OMPT_STORE_RETURN_ADDRESS(global_tid);
}
+ OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
@@ -3871,6 +3869,7 @@ e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
const struct kmp_dim *dims) {
+ __kmp_assert_valid_gtid(gtid);
int j, idx;
kmp_int64 last, trace_count;
kmp_info_t *th = __kmp_threads[gtid];
@@ -3966,7 +3965,8 @@ void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
#endif
if (flags == NULL) {
// we are the first thread, allocate the array of flags
- size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+ size_t size =
+ (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
KMP_MB();
sh_buf->doacross_flags = flags;
@@ -3990,7 +3990,9 @@ void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
}
void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
- kmp_int32 shft, num_dims, i;
+ __kmp_assert_valid_gtid(gtid);
+ kmp_int64 shft;
+ size_t num_dims, i;
kmp_uint32 flag;
kmp_int64 iter_number; // iteration number of "collapsed" loop nest
kmp_info_t *th = __kmp_threads[gtid];
@@ -4007,7 +4009,7 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
// calculate sequential iteration number and check out-of-bounds condition
pr_buf = th->th.th_dispatch;
KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
- num_dims = pr_buf->th_doacross_info[0];
+ num_dims = (size_t)pr_buf->th_doacross_info[0];
lo = pr_buf->th_doacross_info[2];
up = pr_buf->th_doacross_info[3];
st = pr_buf->th_doacross_info[4];
@@ -4045,7 +4047,7 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
#endif
for (i = 1; i < num_dims; ++i) {
kmp_int64 iter, ln;
- kmp_int32 j = i * 4;
+ size_t j = i * 4;
ln = pr_buf->th_doacross_info[j + 1];
lo = pr_buf->th_doacross_info[j + 2];
up = pr_buf->th_doacross_info[j + 3];
@@ -4091,7 +4093,7 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_dependences) {
ompt_callbacks.ompt_callback(ompt_callback_dependences)(
- &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
+ &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
}
#endif
KA_TRACE(20,
@@ -4100,7 +4102,9 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
}
void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
- kmp_int32 shft, num_dims, i;
+ __kmp_assert_valid_gtid(gtid);
+ kmp_int64 shft;
+ size_t num_dims, i;
kmp_uint32 flag;
kmp_int64 iter_number; // iteration number of "collapsed" loop nest
kmp_info_t *th = __kmp_threads[gtid];
@@ -4118,7 +4122,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
// out-of-bounds checks)
pr_buf = th->th.th_dispatch;
KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
- num_dims = pr_buf->th_doacross_info[0];
+ num_dims = (size_t)pr_buf->th_doacross_info[0];
lo = pr_buf->th_doacross_info[2];
st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -4137,7 +4141,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
#endif
for (i = 1; i < num_dims; ++i) {
kmp_int64 iter, ln;
- kmp_int32 j = i * 4;
+ size_t j = i * 4;
ln = pr_buf->th_doacross_info[j + 1];
lo = pr_buf->th_doacross_info[j + 2];
st = pr_buf->th_doacross_info[j + 4];
@@ -4157,7 +4161,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_dependences) {
ompt_callbacks.ompt_callback(ompt_callback_dependences)(
- &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
+ &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
}
#endif
shft = iter_number % 32; // use 32-bit granularity
@@ -4171,6 +4175,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
}
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
+ __kmp_assert_valid_gtid(gtid);
kmp_int32 num_done;
kmp_info_t *th = __kmp_threads[gtid];
kmp_team_t *team = th->th.th_team;
@@ -4181,7 +4186,8 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) {
KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
return; // nothing to do
}
- num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
+ num_done =
+ KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
if (num_done == th->th.th_team_nproc) {
// we are the last thread, need to free shared resources
int idx = pr_buf->th_doacross_buf_idx - 1;
@@ -4204,11 +4210,21 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) {
KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
-/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
+/* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
}
+void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
+ return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
+}
+
+void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
+ omp_allocator_handle_t free_allocator) {
+ return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
+ free_allocator);
+}
+
void omp_free(void *ptr, omp_allocator_handle_t allocator) {
__kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}
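
Note (not part of the diff): the tail of this change adds omp_calloc and omp_realloc entry points next to the existing omp_alloc/omp_free. A minimal C sketch of how these OpenMP 5.x allocator routines are called from user code, assuming <omp.h> and the predefined omp_default_mem_alloc allocator:

/* Illustrative sketch only; omp_default_mem_alloc is the predefined
   default-memory-space allocator from the OpenMP API. */
#include <omp.h>
#include <stdio.h>

int main(void) {
  /* Uninitialized allocation of 8 doubles. */
  double *a = (double *)omp_alloc(8 * sizeof(double), omp_default_mem_alloc);
  /* Zero-initialized allocation (routed to __kmpc_calloc by this change). */
  double *b = (double *)omp_calloc(8, sizeof(double), omp_default_mem_alloc);
  if (!a || !b)
    return 1;
  /* Grow the first buffer; the new and old allocators may differ. */
  a = (double *)omp_realloc(a, 16 * sizeof(double), omp_default_mem_alloc,
                            omp_default_mem_alloc);
  printf("b[0] = %f\n", b[0]); /* prints 0.0: omp_calloc zero-fills */
  omp_free(a, omp_default_mem_alloc);
  omp_free(b, omp_default_mem_alloc);
  return 0;
}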