Diffstat (limited to 'runtime/src/kmp_runtime.cpp')
-rw-r--r-- | runtime/src/kmp_runtime.cpp | 741
1 file changed, 383 insertions(+), 358 deletions(-)
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp index 0db376159fc7a..7f6c149c792e7 100644 --- a/runtime/src/kmp_runtime.cpp +++ b/runtime/src/kmp_runtime.cpp @@ -4,10 +4,9 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source Licenses. See LICENSE.txt for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -47,16 +46,8 @@ char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes"; #endif /* defined(KMP_GOMP_COMPAT) */ -char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: " -#if OMP_50_ENABLED - "5.0 (201611)"; -#elif OMP_45_ENABLED - "4.5 (201511)"; -#elif OMP_40_ENABLED - "4.0 (201307)"; -#else - "3.1 (201107)"; -#endif +char const __kmp_version_omp_api[] = + KMP_VERSION_PREFIX "API version: 5.0 (201611)"; #ifdef KMP_DEBUG char const __kmp_version_lock[] = @@ -80,7 +71,7 @@ static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc); -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED +#if KMP_AFFINITY_SUPPORTED static void __kmp_partition_places(kmp_team_t *team, int update_master_only = 0); #endif @@ -328,7 +319,7 @@ void __kmp_infinite_loop(void) { static int done = FALSE; while (!done) { - KMP_YIELD(1); + KMP_YIELD(TRUE); } } @@ -533,22 +524,10 @@ static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, &team->t.t_disp_buffer[num_disp_buff], sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer", header, team_id); - - __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data, - sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, - team_id); } -static void __kmp_init_allocator() { -#if OMP_50_ENABLED - __kmp_init_memkind(); -#endif -} -static void __kmp_fini_allocator() { -#if OMP_50_ENABLED - __kmp_fini_memkind(); -#endif -} +static void __kmp_init_allocator() { __kmp_init_memkind(); } +static void __kmp_fini_allocator() { __kmp_fini_memkind(); } /* ------------------------------------------------------------------------ */ @@ -673,24 +652,6 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { #endif /* KMP_OS_WINDOWS */ #endif /* KMP_DYNAMIC_LIB */ -/* Change the library type to "status" and return the old type */ -/* called from within initialization routines where __kmp_initz_lock is held */ -int __kmp_change_library(int status) { - int old_status; - - old_status = __kmp_yield_init & - 1; // check whether KMP_LIBRARY=throughput (even init count) - - if (status) { - __kmp_yield_init |= 1; // throughput => turnaround (odd init count) - } else { - __kmp_yield_init &= ~1; // turnaround => throughput (even init count) - } - - return old_status; // return previous setting of whether - // KMP_LIBRARY=throughput -} - /* __kmp_parallel_deo -- Wait until it's our turn. 
*/ void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { int gtid = *gtid_ref; @@ -709,8 +670,8 @@ void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { #ifdef BUILD_PARALLEL_ORDERED if (!team->t.t_serialized) { KMP_MB(); - KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), - KMP_EQ, NULL); + KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, + NULL); KMP_MB(); } #endif /* BUILD_PARALLEL_ORDERED */ @@ -751,6 +712,7 @@ int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); th = __kmp_threads[gtid]; team = th->th.th_team; @@ -773,10 +735,7 @@ int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { } #if USE_ITT_BUILD if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && - KMP_MASTER_GTID(gtid) && -#if OMP_40_ENABLED - th->th.th_teams_microtask == NULL && -#endif + KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && team->t.t_active_level == 1) { // Only report metadata by master of active team at level 1 __kmp_itt_metadata_single(id_ref); @@ -814,16 +773,13 @@ void __kmp_exit_single(int gtid) { * otherwise the number of threads to use * The forkjoin lock is held by the caller. */ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, - int master_tid, int set_nthreads -#if OMP_40_ENABLED - , - int enter_teams -#endif /* OMP_40_ENABLED */ - ) { + int master_tid, int set_nthreads, + int enter_teams) { int capacity; int new_nthreads; KMP_DEBUG_ASSERT(__kmp_init_serial); KMP_DEBUG_ASSERT(root && parent_team); + kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; // If dyn-var is set, dynamically adjust the number of desired threads, // according to the method specified by dynamic_mode. @@ -913,10 +869,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, } // Respect OMP_THREAD_LIMIT - if (root->r.r_cg_nthreads + new_nthreads - + int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; + int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; + if (cg_nthreads + new_nthreads - (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > - __kmp_cg_max_nth) { - int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads + + max_cg_threads) { + int tl_nthreads = max_cg_threads - cg_nthreads + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); if (tl_nthreads <= 0) { tl_nthreads = 1; @@ -1069,11 +1027,9 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, __kmp_gtid_from_tid(i, team), team->t.t_id, i, team->t.t_bar[bs_forkjoin_barrier].b_arrived, team->t.t_bar[bs_plain_barrier].b_arrived)); -#if OMP_40_ENABLED thr->th.th_teams_microtask = master_th->th.th_teams_microtask; thr->th.th_teams_level = master_th->th.th_teams_level; thr->th.th_teams_size = master_th->th.th_teams_size; -#endif { // Initialize threads' barrier data. 
int b; kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; @@ -1087,12 +1043,11 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, } } -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED +#if KMP_AFFINITY_SUPPORTED __kmp_partition_places(team); #endif } -#if OMP_50_ENABLED if (__kmp_display_affinity && team->t.t_display_affinity != 1) { for (i = 0; i < team->t.t_nproc; i++) { kmp_info_t *thr = team->t.t_threads[i]; @@ -1103,7 +1058,6 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, } } } -#endif KMP_MB(); } @@ -1188,6 +1142,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); this_thr = __kmp_threads[global_tid]; serial_team = this_thr->th.th_serial_team; @@ -1208,7 +1163,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_task_team = NULL; } -#if OMP_40_ENABLED kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; @@ -1219,7 +1173,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } // Reset for next parallel region this_thr->th.th_set_proc_bind = proc_bind_default; -#endif /* OMP_40_ENABLED */ #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; @@ -1254,15 +1207,13 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); - new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1, + new_team = + __kmp_allocate_team(this_thr->th.th_root, 1, 1, #if OMPT_SUPPORT - ompt_parallel_data, + ompt_parallel_data, #endif -#if OMP_40_ENABLED - proc_bind, -#endif - &this_thr->th.th_current_task->td_icvs, - 0 USE_NESTED_HOT_ARG(NULL)); + proc_bind, &this_thr->th.th_current_task->td_icvs, + 0 USE_NESTED_HOT_ARG(NULL)); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); KMP_ASSERT(new_team); @@ -1319,13 +1270,11 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { __kmp_nested_nth.nth[level + 1]; } -#if OMP_40_ENABLED if (__kmp_nested_proc_bind.used && (level + 1 < __kmp_nested_proc_bind.used)) { this_thr->th.th_current_task->td_icvs.proc_bind = __kmp_nested_proc_bind.bind_types[level + 1]; } -#endif /* OMP_40_ENABLED */ #if USE_DEBUGGER serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
@@ -1339,9 +1288,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; -#if OMP_50_ENABLED serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save -#endif propagateFPControl(serial_team); @@ -1391,11 +1338,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { KMP_MB(); } -#if OMP_40_ENABLED KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); -#endif -#if OMP_50_ENABLED // Perform the display affinity functionality for // serialized parallel regions if (__kmp_display_affinity) { @@ -1407,7 +1351,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_prev_num_threads = 1; } } -#endif if (__kmp_env_consistency_check) __kmp_push_parallel(global_tid, NULL); @@ -1465,10 +1408,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, int master_active; int master_set_numthreads; int level; -#if OMP_40_ENABLED int active_level; int teams_level; -#endif #if KMP_NESTED_HOT_TEAMS kmp_hot_team_ptr_t **p_hot_teams; #endif @@ -1491,6 +1432,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); /* setup current data */ master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with @@ -1520,10 +1462,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, level = parent_team->t.t_level; // used to launch non-serial teams even if nested is not allowed active_level = parent_team->t.t_active_level; -#if OMP_40_ENABLED // needed to check nesting inside the teams teams_level = master_th->th.th_teams_level; -#endif #if KMP_NESTED_HOT_TEAMS p_hot_teams = &master_th->th.th_hot_teams; if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { @@ -1551,7 +1491,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, master_th->th.th_ident = loc; -#if OMP_40_ENABLED if (master_th->th.th_teams_microtask && ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level) { // AC: This is start of parallel that is nested inside teams construct. @@ -1645,9 +1584,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, KMP_ATOMIC_INC(&root->r.r_in_parallel); parent_team->t.t_active_level++; parent_team->t.t_level++; -#if OMP_50_ENABLED parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save -#endif /* Change number of threads in the team if requested */ if (master_set_numthreads) { // The parallel has num_threads clause @@ -1695,7 +1632,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, return TRUE; } // Parallel closely nested in teams construct -#endif /* OMP_40_ENABLED */ #if KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec) { @@ -1708,10 +1644,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, master_th->th.th_current_task->td_icvs.max_active_levels) { nthreads = 1; } else { -#if OMP_40_ENABLED int enter_teams = ((ap == NULL && active_level == 0) || (ap && teams_level > 0 && teams_level == level)); -#endif nthreads = master_set_numthreads ? master_set_numthreads @@ -1723,11 +1657,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, // parallel out of teams construct). This code moved here from // __kmp_reserve_threads() to speedup nested serialized parallels. 
if (nthreads > 1) { - if ((!get__nested(master_th) && (root->r.r_in_parallel -#if OMP_40_ENABLED - && !enter_teams -#endif /* OMP_40_ENABLED */ - )) || + if ((get__max_active_levels(master_th) == 1 && + (root->r.r_in_parallel && !enter_teams)) || (__kmp_library == library_serial)) { KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" " threads\n", @@ -1738,18 +1669,12 @@ int __kmp_fork_call(ident_t *loc, int gtid, if (nthreads > 1) { /* determine how many new threads we can use */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); - nthreads = __kmp_reserve_threads( - root, parent_team, master_tid, nthreads -#if OMP_40_ENABLED - /* AC: If we execute teams from parallel region (on host), then - teams should be created but each can only have 1 thread if - nesting is disabled. If teams called from serial region, then - teams and their threads should be created regardless of the - nesting setting. */ - , - enter_teams -#endif /* OMP_40_ENABLED */ - ); + /* AC: If we execute teams from parallel region (on host), then teams + should be created but each can only have 1 thread if nesting is + disabled. If teams called from serial region, then teams and their + threads should be created regardless of the nesting setting. */ + nthreads = __kmp_reserve_threads(root, parent_team, master_tid, + nthreads, enter_teams); if (nthreads == 1) { // Free lock for single thread execution here; for multi-thread // execution it will be freed later after team of threads created @@ -1782,7 +1707,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, if (call_context == fork_context_intel) { /* TODO this sucks, use the compiler itself to pass args! :) */ master_th->th.th_serial_team->t.t_ident = loc; -#if OMP_40_ENABLED if (!ap) { // revert change made in __kmpc_serialized_parallel() master_th->th.th_serial_team->t.t_level--; @@ -1877,14 +1801,13 @@ int __kmp_fork_call(ident_t *loc, int gtid, // AC: call special invoker for outer "parallel" of teams construct invoker(gtid); } else { -#endif /* OMP_40_ENABLED */ argv = args; for (i = argc - 1; i >= 0; --i) // TODO: revert workaround for Intel(R) 64 tracker #96 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX *argv++ = va_arg(*ap, void *); #else - *argv++ = va_arg(ap, void *); + *argv++ = va_arg(ap, void *); #endif KMP_MB(); @@ -1950,9 +1873,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif -#if OMP_40_ENABLED } -#endif /* OMP_40_ENABLED */ } else if (call_context == fork_context_gnu) { #if OMPT_SUPPORT ompt_lw_taskteam_t lwt; @@ -1989,10 +1910,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); master_th->th.th_current_task->td_flags.executing = 0; -#if OMP_40_ENABLED - if (!master_th->th.th_teams_microtask || level > teams_level) -#endif /* OMP_40_ENABLED */ - { + if (!master_th->th.th_teams_microtask || level > teams_level) { /* Increment our nested depth level */ KMP_ATOMIC_INC(&root->r.r_in_parallel); } @@ -2006,7 +1924,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, nthreads_icv = 0; // don't update } -#if OMP_40_ENABLED // Figure out the proc_bind_policy for the new team. 
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; kmp_proc_bind_t proc_bind_icv = @@ -2032,25 +1949,17 @@ int __kmp_fork_call(ident_t *loc, int gtid, // Reset for next parallel region master_th->th.th_set_proc_bind = proc_bind_default; -#endif /* OMP_40_ENABLED */ - if ((nthreads_icv > 0) -#if OMP_40_ENABLED - || (proc_bind_icv != proc_bind_default) -#endif /* OMP_40_ENABLED */ - ) { + if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { kmp_internal_control_t new_icvs; copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); new_icvs.next = NULL; if (nthreads_icv > 0) { new_icvs.nproc = nthreads_icv; } - -#if OMP_40_ENABLED if (proc_bind_icv != proc_bind_default) { new_icvs.proc_bind = proc_bind_icv; } -#endif /* OMP_40_ENABLED */ /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); @@ -2058,10 +1967,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, #if OMPT_SUPPORT ompt_parallel_data, #endif -#if OMP_40_ENABLED - proc_bind, -#endif - &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); + proc_bind, &new_icvs, + argc USE_NESTED_HOT_ARG(master_th)); } else { /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); @@ -2069,9 +1976,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, #if OMPT_SUPPORT ompt_parallel_data, #endif -#if OMP_40_ENABLED proc_bind, -#endif &master_th->th.th_current_task->td_icvs, argc USE_NESTED_HOT_ARG(master_th)); } @@ -2089,15 +1994,12 @@ int __kmp_fork_call(ident_t *loc, int gtid, return_address); #endif KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe -// TODO: parent_team->t.t_level == INT_MAX ??? -#if OMP_40_ENABLED + // TODO: parent_team->t.t_level == INT_MAX ??? if (!master_th->th.th_teams_microtask || level > teams_level) { -#endif /* OMP_40_ENABLED */ int new_level = parent_team->t.t_level + 1; KMP_CHECK_UPDATE(team->t.t_level, new_level); new_level = parent_team->t.t_active_level + 1; KMP_CHECK_UPDATE(team->t.t_active_level, new_level); -#if OMP_40_ENABLED } else { // AC: Do not increase parallel level at start of the teams construct int new_level = parent_team->t.t_level; @@ -2105,17 +2007,12 @@ int __kmp_fork_call(ident_t *loc, int gtid, new_level = parent_team->t.t_active_level; KMP_CHECK_UPDATE(team->t.t_active_level, new_level); } -#endif /* OMP_40_ENABLED */ kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); -#if OMP_40_ENABLED KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); -#endif -#if OMP_50_ENABLED KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); -#endif // Update the floating point rounding in the team if required. 
propagateFPControl(team); @@ -2191,27 +2088,23 @@ int __kmp_fork_call(ident_t *loc, int gtid, /* now, setup the arguments */ argv = (void **)team->t.t_argv; -#if OMP_40_ENABLED if (ap) { -#endif /* OMP_40_ENABLED */ for (i = argc - 1; i >= 0; --i) { // TODO: revert workaround for Intel(R) 64 tracker #96 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX void *new_argv = va_arg(*ap, void *); #else - void *new_argv = va_arg(ap, void *); + void *new_argv = va_arg(ap, void *); #endif KMP_CHECK_UPDATE(*argv, new_argv); argv++; } -#if OMP_40_ENABLED } else { for (i = 0; i < argc; ++i) { // Get args from parent team for teams construct KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); } } -#endif /* OMP_40_ENABLED */ /* now actually fork the threads */ KMP_CHECK_UPDATE(team->t.t_master_active, master_active); @@ -2230,10 +2123,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, #if USE_ITT_BUILD if (team->t.t_active_level == 1 // only report frames at level 1 -#if OMP_40_ENABLED - && !master_th->th.th_teams_microtask // not in teams construct -#endif /* OMP_40_ENABLED */ - ) { + && !master_th->th.th_teams_microtask) { // not in teams construct #if USE_ITT_NOTIFY if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && (__kmp_forkjoin_frames_mode == 3 || @@ -2271,12 +2161,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, } #endif /* USE_ITT_BUILD */ -#if OMP_40_ENABLED // AC: skip __kmp_internal_fork at teams construct, let only master // threads execute - if (ap) -#endif /* OMP_40_ENABLED */ - { + if (ap) { __kmp_internal_fork(loc, gtid, team); KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " "master_th=%p, gtid=%d\n", @@ -2293,9 +2180,25 @@ int __kmp_fork_call(ident_t *loc, int gtid, team->t.t_id, team->t.t_pkfn)); } // END of timer KMP_fork_call block +#if KMP_STATS_ENABLED + // If beginning a teams construct, then change thread state + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (!ap) { + KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); + } +#endif + if (!team->t.t_invoke(gtid)) { KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); } + +#if KMP_STATS_ENABLED + // If was beginning of a teams construct, then reset thread state + if (!ap) { + KMP_SET_THREAD_STATE(previous_state); + } +#endif + KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn)); KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -2340,18 +2243,14 @@ void __kmp_join_call(ident_t *loc, int gtid , enum fork_context_e fork_context #endif -#if OMP_40_ENABLED , - int exit_teams -#endif /* OMP_40_ENABLED */ - ) { + int exit_teams) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); kmp_team_t *team; kmp_team_t *parent_team; kmp_info_t *master_th; kmp_root_t *root; int master_active; - int i; KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); @@ -2382,7 +2281,6 @@ void __kmp_join_call(ident_t *loc, int gtid #endif if (team->t.t_serialized) { -#if OMP_40_ENABLED if (master_th->th.th_teams_microtask) { // We are in teams construct int level = team->t.t_level; @@ -2398,7 +2296,6 @@ void __kmp_join_call(ident_t *loc, int gtid team->t.t_serialized++; } } -#endif /* OMP_40_ENABLED */ __kmpc_end_serialized_parallel(loc, gtid); #if OMPT_SUPPORT @@ -2412,20 +2309,14 @@ void __kmp_join_call(ident_t *loc, int gtid master_active = team->t.t_master_active; -#if OMP_40_ENABLED - if (!exit_teams) -#endif /* OMP_40_ENABLED */ - { + if (!exit_teams) { // AC: No barrier for internal teams at exit from teams construct. 
// But there is barrier for external team (league). __kmp_internal_join(loc, gtid, team); - } -#if OMP_40_ENABLED - else { + } else { master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel) } -#endif /* OMP_40_ENABLED */ KMP_MB(); @@ -2442,11 +2333,8 @@ void __kmp_join_call(ident_t *loc, int gtid } // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. - if (team->t.t_active_level == 1 -#if OMP_40_ENABLED - && !master_th->th.th_teams_microtask /* not in teams construct */ -#endif /* OMP_40_ENABLED */ - ) { + if (team->t.t_active_level == 1 && + !master_th->th.th_teams_microtask) { /* not in teams construct */ master_th->th.th_ident = loc; // only one notification scheme (either "submit" or "forking/joined", not // both) @@ -2461,7 +2349,6 @@ void __kmp_join_call(ident_t *loc, int gtid } // active_level == 1 #endif /* USE_ITT_BUILD */ -#if OMP_40_ENABLED if (master_th->th.th_teams_microtask && !exit_teams && team->t.t_pkfn != (microtask_t)__kmp_teams_master && team->t.t_level == master_th->th.th_teams_level + 1) { @@ -2474,21 +2361,24 @@ void __kmp_join_call(ident_t *loc, int gtid team->t.t_active_level--; KMP_ATOMIC_DEC(&root->r.r_in_parallel); - /* Restore number of threads in the team if needed */ + // Restore number of threads in the team if needed. This code relies on + // the proper adjustment of th_teams_size.nth after the fork in + // __kmp_teams_master on each teams master in the case that + // __kmp_reserve_threads reduced it. if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { int old_num = master_th->th.th_team_nproc; int new_num = master_th->th.th_teams_size.nth; kmp_info_t **other_threads = team->t.t_threads; team->t.t_nproc = new_num; - for (i = 0; i < old_num; ++i) { + for (int i = 0; i < old_num; ++i) { other_threads[i]->th.th_team_nproc = new_num; } // Adjust states of non-used threads of the team - for (i = old_num; i < new_num; ++i) { + for (int i = old_num; i < new_num; ++i) { // Re-initialize thread's barrier data. - int b; + KMP_DEBUG_ASSERT(other_threads[i]); kmp_balign_t *balign = other_threads[i]->th.th_bar; - for (b = 0; b < bs_last_barrier; ++b) { + for (int b = 0; b < bs_last_barrier; ++b) { balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); #if USE_DEBUGGER @@ -2511,7 +2401,6 @@ void __kmp_join_call(ident_t *loc, int gtid return; } -#endif /* OMP_40_ENABLED */ /* do cleanup and restore the parent team */ master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; @@ -2524,11 +2413,8 @@ void __kmp_join_call(ident_t *loc, int gtid from the serial user code called after this function returns. */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); -#if OMP_40_ENABLED if (!master_th->th.th_teams_microtask || - team->t.t_level > master_th->th.th_teams_level) -#endif /* OMP_40_ENABLED */ - { + team->t.t_level > master_th->th.th_teams_level) { /* Decrement our nested depth level */ KMP_ATOMIC_DEC(&root->r.r_in_parallel); } @@ -2553,14 +2439,12 @@ void __kmp_join_call(ident_t *loc, int gtid master_th, team)); __kmp_pop_current_task_from_thread(master_th); -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED +#if KMP_AFFINITY_SUPPORTED // Restore master thread's partition. 
master_th->th.th_first_place = team->t.t_first_place; master_th->th.th_last_place = team->t.t_last_place; -#endif /* OMP_40_ENABLED */ -#if OMP_50_ENABLED +#endif // KMP_AFFINITY_SUPPORTED master_th->th.th_def_allocator = team->t.t_def_allocator; -#endif updateHWFPControl(team); @@ -2791,9 +2675,13 @@ int __kmp_get_max_active_levels(int gtid) { return thread->th.th_current_task->td_icvs.max_active_levels; } +KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); +KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); + /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { kmp_info_t *thread; + kmp_sched_t orig_kind; // kmp_team_t *team; KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", @@ -2804,6 +2692,9 @@ void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { // Valid parameters should fit in one of two intervals - standard or extended: // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 + orig_kind = kind; + kind = __kmp_sched_without_mods(kind); + if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { // TODO: Hint needs attention in case we change the default schedule. @@ -2834,6 +2725,8 @@ void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2]; } + __kmp_sched_apply_mods_intkind( + orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); if (kind == kmp_sched_auto || chunk < 1) { // ignore parameter chunk for schedule auto thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; @@ -2853,12 +2746,12 @@ void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { thread = __kmp_threads[gtid]; th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; - - switch (th_type) { + switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { case kmp_sch_static: case kmp_sch_static_greedy: case kmp_sch_static_balanced: *kind = kmp_sched_static; + __kmp_sched_apply_mods_stdkind(kind, th_type); *chunk = 0; // chunk was not set, try to show this fact via zero value return; case kmp_sch_static_chunked: @@ -2887,6 +2780,7 @@ void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { KMP_FATAL(UnknownSchedulingType, th_type); } + __kmp_sched_apply_mods_stdkind(kind, th_type); *chunk = thread->th.th_current_task->td_icvs.sched.chunk; } @@ -2910,7 +2804,6 @@ int __kmp_get_ancestor_thread_num(int gtid, int level) { if (level > ii) return -1; -#if OMP_40_ENABLED if (thr->th.th_teams_microtask) { // AC: we are in teams region where multiple nested teams have same level int tlevel = thr->th.th_teams_level; // the level of the teams construct @@ -2926,7 +2819,6 @@ int __kmp_get_ancestor_thread_num(int gtid, int level) { } } } -#endif if (ii == level) return __kmp_tid_from_gtid(gtid); @@ -2970,7 +2862,6 @@ int __kmp_get_team_size(int gtid, int level) { if (level > ii) return -1; -#if OMP_40_ENABLED if (thr->th.th_teams_microtask) { // AC: we are in teams region where multiple nested teams have same level int tlevel = thr->th.th_teams_level; // the level of the teams construct @@ -2986,7 +2877,6 @@ int __kmp_get_team_size(int gtid, int level) { } } } -#endif while (ii > level) { for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { @@ -3015,15 +2905,18 @@ kmp_r_sched_t __kmp_get_schedule_global() { // __kmp_guided. 
__kmp_sched should keep original value, so that user can set // KMP_SCHEDULE multiple times, and thus have different run-time schedules in // different roots (even in OMP 2.5) - if (__kmp_sched == kmp_sch_static) { + enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); + enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); + if (s == kmp_sch_static) { // replace STATIC with more detailed schedule (balanced or greedy) r_sched.r_sched_type = __kmp_static; - } else if (__kmp_sched == kmp_sch_guided_chunked) { + } else if (s == kmp_sch_guided_chunked) { // replace GUIDED with more detailed schedule (iterative or analytical) r_sched.r_sched_type = __kmp_guided; } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other r_sched.r_sched_type = __kmp_sched; } + SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it was not ever set) @@ -3099,9 +2992,7 @@ static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { /* setup dispatch buffers */ for (i = 0; i < num_disp_buff; ++i) { team->t.t_disp_buffer[i].buffer_index = i; -#if OMP_45_ENABLED team->t.t_disp_buffer[i].doacross_buf_idx = i; -#endif } } @@ -3146,14 +3037,10 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) { kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals -#if OMP_40_ENABLED KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); -#endif /* OMP_40_ENABLED */ kmp_internal_control_t g_icvs = { 0, // int serial_nesting_level; //corresponds to value of th_team_serialized - (kmp_int8)__kmp_dflt_nested, // int nested; //internal control - // for nested parallelism (per thread) (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic // adjustment of threads (per thread) (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for @@ -3166,14 +3053,13 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) { __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for // next parallel region (per thread) // (use a max ub on value if __kmp_parallel_initialize not called yet) + __kmp_cg_max_nth, // int thread_limit; __kmp_dflt_max_active_levels, // int max_active_levels; //internal control // for max_active_levels r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule -// {sched,chunk} pair -#if OMP_40_ENABLED + // {sched,chunk} pair __kmp_nested_proc_bind.bind_types[0], __kmp_default_device, -#endif /* OMP_40_ENABLED */ NULL // struct kmp_internal_control *next; }; @@ -3208,8 +3094,6 @@ static void __kmp_initialize_root(kmp_root_t *root) { root->r.r_active = FALSE; root->r.r_in_parallel = 0; root->r.r_blocktime = __kmp_dflt_blocktime; - root->r.r_nested = __kmp_dflt_nested; - root->r.r_cg_nthreads = 1; /* setup the root team for this task */ /* allocate the root team structure */ @@ -3222,10 +3106,7 @@ static void __kmp_initialize_root(kmp_root_t *root) { #if OMPT_SUPPORT ompt_data_none, // root parallel id #endif -#if OMP_40_ENABLED - __kmp_nested_proc_bind.bind_types[0], -#endif - &r_icvs, + __kmp_nested_proc_bind.bind_types[0], &r_icvs, 0 // argc USE_NESTED_HOT_ARG(NULL) // master thread is unknown ); @@ -3262,10 +3143,7 @@ static void __kmp_initialize_root(kmp_root_t *root) { #if OMPT_SUPPORT ompt_data_none, // root parallel id #endif -#if OMP_40_ENABLED - __kmp_nested_proc_bind.bind_types[0], -#endif - &r_icvs, + __kmp_nested_proc_bind.bind_types[0], &r_icvs, 0 // argc USE_NESTED_HOT_ARG(NULL) // master 
thread is unknown ); @@ -3406,9 +3284,7 @@ void __kmp_print_structure(void) { thread->th.th_team_master); __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); -#if OMP_40_ENABLED __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); -#endif __kmp_print_structure_thread(" Next in pool: ", thread->th.th_next_pool); __kmp_printf("\n"); @@ -3434,7 +3310,6 @@ void __kmp_print_structure(void) { __kmp_print_structure_thread(" Uber Thread: ", root->r.r_uber_thread); __kmp_printf(" Active?: %2d\n", root->r.r_active); - __kmp_printf(" Nested?: %2d\n", root->r.r_nested); __kmp_printf(" In Parallel: %2d\n", KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); __kmp_printf("\n"); @@ -3775,15 +3650,12 @@ int __kmp_register_root(int initial_thread) { if (!root_thread->th.th_serial_team) { kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); - root_thread->th.th_serial_team = - __kmp_allocate_team(root, 1, 1, + root_thread->th.th_serial_team = __kmp_allocate_team( + root, 1, 1, #if OMPT_SUPPORT - ompt_data_none, // root parallel id -#endif -#if OMP_40_ENABLED - proc_bind_default, + ompt_data_none, // root parallel id #endif - &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); + proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); } KMP_ASSERT(root_thread->th.th_serial_team); KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", @@ -3834,21 +3706,27 @@ int __kmp_register_root(int initial_thread) { KMP_INIT_BARRIER_STATE); #if KMP_AFFINITY_SUPPORTED -#if OMP_40_ENABLED root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; -#endif if (TCR_4(__kmp_init_middle)) { __kmp_affinity_set_init_mask(gtid, TRUE); } #endif /* KMP_AFFINITY_SUPPORTED */ -#if OMP_50_ENABLED root_thread->th.th_def_allocator = __kmp_def_allocator; root_thread->th.th_prev_level = 0; root_thread->th.th_prev_num_threads = 1; -#endif + + kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); + tmp->cg_root = root_thread; + tmp->cg_thread_limit = __kmp_cg_max_nth; + tmp->cg_nthreads = 1; + KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" + " cg_nthreads init to 1\n", + root_thread, tmp)); + tmp->up = NULL; + root_thread->th.th_cg_roots = tmp; __kmp_root_counter++; @@ -3864,11 +3742,11 @@ int __kmp_register_root(int initial_thread) { ompt_thread_initial, __ompt_get_thread_data_internal()); } ompt_data_t *task_data; - __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); - if (ompt_enabled.ompt_callback_task_create) { - ompt_callbacks.ompt_callback(ompt_callback_task_create)( - NULL, NULL, task_data, ompt_task_initial, 0, NULL); - // initial task has nothing to return to + ompt_data_t *parallel_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); } ompt_set_thread_state(root_thread, ompt_state_work_serial); @@ -3958,6 +3836,13 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) { #endif /* KMP_OS_WINDOWS */ #if OMPT_SUPPORT + ompt_data_t *task_data; + ompt_data_t *parallel_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); + if 
(ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); + } if (ompt_enabled.ompt_callback_thread_end) { ompt_callbacks.ompt_callback(ompt_callback_thread_end)( &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); @@ -3966,8 +3851,19 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) { TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. - root->r.r_cg_nthreads--; - + i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; + KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" + " to %d\n", + root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, + root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); + if (i == 1) { + // need to free contention group structure + KMP_DEBUG_ASSERT(root->r.r_uber_thread == + root->r.r_uber_thread->th.th_cg_roots->cg_root); + KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); + __kmp_free(root->r.r_uber_thread->th.th_cg_roots); + root->r.r_uber_thread->th.th_cg_roots = NULL; + } __kmp_reap_thread(root->r.r_uber_thread, 1); // We canot put root thread to __kmp_thread_pool, so we have to reap it istead @@ -4001,7 +3897,6 @@ void __kmp_unregister_root_current_thread(int gtid) { KMP_MB(); -#if OMP_45_ENABLED kmp_info_t *thread = __kmp_threads[gtid]; kmp_team_t *team = thread->th.th_team; kmp_task_team_t *task_team = thread->th.th_task_team; @@ -4014,7 +3909,6 @@ void __kmp_unregister_root_current_thread(int gtid) { #endif __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); } -#endif __kmp_reset_root(gtid, root); @@ -4098,12 +3992,10 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; else // no tasking --> always safe to reap this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; -#if OMP_40_ENABLED this_thr->th.th_set_proc_bind = proc_bind_default; #if KMP_AFFINITY_SUPPORTED this_thr->th.th_new_place = this_thr->th.th_current_place; #endif -#endif this_thr->th.th_root = master->th.th_root; /* setup the thread's cache of the team structure */ @@ -4141,6 +4033,33 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, this_thr->th.th_pri_head = NULL; } + if (this_thr != master && // Master's CG root is initialized elsewhere + this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set + // Make new thread's CG root same as master's + KMP_DEBUG_ASSERT(master->th.th_cg_roots); + kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; + if (tmp) { + // worker changes CG, need to check if old CG should be freed + int i = tmp->cg_nthreads--; + KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" + " on node %p of thread %p to %d\n", + this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); + if (i == 1) { + __kmp_free(tmp); // last thread left CG --> free it + } + } + this_thr->th.th_cg_roots = master->th.th_cg_roots; + // Increment new thread's CG root's counter to add the new thread + this_thr->th.th_cg_roots->cg_nthreads++; + KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" + " node %p of thread %p to %d\n", + this_thr, this_thr->th.th_cg_roots, + this_thr->th.th_cg_roots->cg_root, + this_thr->th.th_cg_roots->cg_nthreads)); + this_thr->th.th_current_task->td_icvs.thread_limit = + this_thr->th.th_cg_roots->cg_thread_limit; + } + /* Initialize dynamic dispatch */ { volatile kmp_disp_t *dispatch = 
this_thr->th.th_dispatch; @@ -4155,9 +4074,7 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); dispatch->th_disp_index = 0; -#if OMP_45_ENABLED dispatch->th_doacross_buf_idx = 0; -#endif if (!dispatch->th_disp_buffer) { dispatch->th_disp_buffer = (dispatch_private_info_t *)__kmp_allocate(disp_size); @@ -4222,22 +4139,25 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, /* first, try to get one from the thread pool */ if (__kmp_thread_pool) { - new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; if (new_thr == __kmp_thread_pool_insert_pt) { __kmp_thread_pool_insert_pt = NULL; } TCW_4(new_thr->th.th_in_pool, FALSE); - // Don't touch th_active_in_pool or th_active. - // The worker thread adjusts those flags as it sleeps/awakens. - __kmp_thread_pool_nth--; + __kmp_suspend_initialize_thread(new_thr); + __kmp_lock_suspend_mx(new_thr); + if (new_thr->th.th_active_in_pool == TRUE) { + KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + new_thr->th.th_active_in_pool = FALSE; + } + __kmp_unlock_suspend_mx(new_thr); KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); KMP_ASSERT(!new_thr->th.th_team); KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); - KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0); /* setup the thread structure */ __kmp_initialize_info(new_thr, team, new_tid, @@ -4245,7 +4165,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); TCW_4(__kmp_nth, __kmp_nth + 1); - root->r.r_cg_nthreads++; new_thr->th.th_task_state = 0; new_thr->th.th_task_state_top = 0; @@ -4334,10 +4253,8 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, #if OMPT_SUPPORT ompt_data_none, // root parallel id #endif -#if OMP_40_ENABLED - proc_bind_default, -#endif - &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); + proc_bind_default, &r_icvs, + 0 USE_NESTED_HOT_ARG(NULL)); } KMP_ASSERT(serial_team); serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for @@ -4381,17 +4298,15 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, new_thr->th.th_blocking = false; #endif -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED +#if KMP_AFFINITY_SUPPORTED new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; #endif -#if OMP_50_ENABLED new_thr->th.th_def_allocator = __kmp_def_allocator; new_thr->th.th_prev_level = 0; new_thr->th.th_prev_num_threads = 1; -#endif TCW_4(new_thr->th.th_in_pool, FALSE); new_thr->th.th_active_in_pool = FALSE; @@ -4401,8 +4316,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, __kmp_all_nth++; __kmp_nth++; - root->r.r_cg_nthreads++; - // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low // numbers of procs, and method #2 (keyed API call) for higher numbers. 
if (__kmp_adjust_gtid_mode) { @@ -4504,8 +4417,6 @@ static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, team->t.t_ordered.dt.t_value = 0; team->t.t_master_active = FALSE; - memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t)); - #ifdef KMP_DEBUG team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ #endif @@ -4540,7 +4451,7 @@ __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { } #endif -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED +#if KMP_AFFINITY_SUPPORTED // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. // It calculats the worker + master thread's partition based upon the parent @@ -4579,12 +4490,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { th->th.th_first_place = first_place; th->th.th_last_place = last_place; th->th.th_new_place = masters_place; -#if OMP_50_ENABLED if (__kmp_display_affinity && masters_place != th->th.th_current_place && team->t.t_display_affinity != 1) { team->t.t_display_affinity = 1; } -#endif KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", @@ -4618,12 +4527,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { th->th.th_first_place = first_place; th->th.th_last_place = last_place; th->th.th_new_place = place; -#if OMP_50_ENABLED if (__kmp_display_affinity && place != th->th.th_current_place && team->t.t_display_affinity != 1) { team->t.t_display_affinity = 1; } -#endif KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", @@ -4645,12 +4552,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { th->th.th_first_place = first_place; th->th.th_last_place = last_place; th->th.th_new_place = place; -#if OMP_50_ENABLED if (__kmp_display_affinity && place != th->th.th_current_place && team->t.t_display_affinity != 1) { team->t.t_display_affinity = 1; } -#endif s_count++; if ((s_count == S) && rem && (gap_ct == gap)) { @@ -4719,12 +4624,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { th->th.th_first_place = place; th->th.th_new_place = place; -#if OMP_50_ENABLED if (__kmp_display_affinity && place != th->th.th_current_place && team->t.t_display_affinity != 1) { team->t.t_display_affinity = 1; } -#endif s_count = 1; while (s_count < S) { if (place == last_place) { @@ -4816,12 +4719,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { th->th.th_first_place = first; th->th.th_new_place = place; th->th.th_last_place = last; -#if OMP_50_ENABLED if (__kmp_display_affinity && place != th->th.th_current_place && team->t.t_display_affinity != 1) { team->t.t_display_affinity = 1; } -#endif KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " "partition = [%d,%d], spacing = %.4f\n", @@ -4850,12 +4751,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { th->th.th_first_place = place; th->th.th_last_place = place; th->th.th_new_place = place; -#if OMP_50_ENABLED if (__kmp_display_affinity && place != th->th.th_current_place && team->t.t_display_affinity != 1) { team->t.t_display_affinity = 1; } -#endif s_count++; if ((s_count == S) && rem && (gap_ct == gap)) { @@ -4901,7 +4800,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); } -#endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */ +#endif // 
KMP_AFFINITY_SUPPORTED /* allocate a new team data structure to use. take one off of the free pool if available */ @@ -4910,9 +4809,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #if OMPT_SUPPORT ompt_data_t ompt_parallel_data, #endif -#if OMP_40_ENABLED kmp_proc_bind_t new_proc_bind, -#endif kmp_internal_control_t *new_icvs, int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); @@ -4954,7 +4851,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #endif // Optimization to use a "hot" team if (use_hot_team && new_nproc > 1) { - KMP_DEBUG_ASSERT(new_nproc == max_nproc); + KMP_DEBUG_ASSERT(new_nproc <= max_nproc); #if KMP_NESTED_HOT_TEAMS team = hot_teams[level].hot_team; #else @@ -4993,7 +4890,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, team->t.t_threads[0], team)); __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); -#if OMP_40_ENABLED #if KMP_AFFINITY_SUPPORTED if ((team->t.t_size_changed == 0) && (team->t.t_proc_bind == new_proc_bind)) { @@ -5012,7 +4908,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #else KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #endif /* KMP_AFFINITY_SUPPORTED */ -#endif /* OMP_40_ENABLED */ } else if (team->t.t_nproc > new_nproc) { KA_TRACE(20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", @@ -5060,10 +4955,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, __kmp_reinitialize_team(team, new_icvs, root->r.r_uber_thread->th.th_ident); - /* update the remaining threads */ + // Update remaining threads for (f = 0; f < new_nproc; ++f) { team->t.t_threads[f]->th.th_team_nproc = new_nproc; } + // restore the current task state of the master thread: should be the // implicit task KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, @@ -5079,12 +4975,10 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif -#if OMP_40_ENABLED KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED __kmp_partition_places(team); #endif -#endif } else { // team->t.t_nproc < new_nproc #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; @@ -5191,6 +5085,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, for (f = 0; f < team->t.t_nproc; ++f) __kmp_initialize_info(team->t.t_threads[f], team, f, __kmp_gtid_from_tid(f, team)); + if (level) { // set th_task_state for new threads in nested hot team // __kmp_initialize_info() no longer zeroes th_task_state, so we should // only need to set the th_task_state for the new threads. 
th_task_state @@ -5215,15 +5110,12 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } #endif -#if OMP_40_ENABLED KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED __kmp_partition_places(team); #endif -#endif } // Check changes in number of threads -#if OMP_40_ENABLED kmp_info_t *master = team->t.t_threads[0]; if (master->th.th_teams_microtask) { for (f = 1; f < new_nproc; ++f) { @@ -5234,7 +5126,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, thr->th.th_teams_size = master->th.th_teams_size; } } -#endif /* OMP_40_ENABLED */ #if KMP_NESTED_HOT_TEAMS if (level) { // Sync barrier state for nested hot teams, not needed for outermost hot @@ -5315,9 +5206,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } } -#if OMP_40_ENABLED team->t.t_proc_bind = new_proc_bind; -#endif KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id)); @@ -5382,9 +5271,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } } -#if OMP_40_ENABLED team->t.t_proc_bind = new_proc_bind; -#endif #if OMPT_SUPPORT __ompt_team_assign_id(team, ompt_parallel_data); @@ -5479,8 +5366,8 @@ void __kmp_free_team(kmp_root_t *root, for (tt_idx = 0; tt_idx < 2; ++tt_idx) { kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; if (task_team != NULL) { - for (f = 0; f < team->t.t_nproc; - ++f) { // Have all threads unref task teams + for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams + KMP_DEBUG_ASSERT(team->t.t_threads[f]); team->t.t_threads[f]->th.th_task_team = NULL; } KA_TRACE( @@ -5511,6 +5398,32 @@ void __kmp_free_team(kmp_root_t *root, /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = (volatile kmp_team_t *)team; + } else { // Check if team was created for the masters in a teams construct + // See if first worker is a CG root + KMP_DEBUG_ASSERT(team->t.t_threads[1] && + team->t.t_threads[1]->th.th_cg_roots); + if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { + // Clean up the CG root nodes on workers so that this team can be re-used + for (f = 1; f < team->t.t_nproc; ++f) { + kmp_info_t *thr = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && + thr->th.th_cg_roots->cg_root == thr); + // Pop current CG root off list + kmp_cg_root_t *tmp = thr->th.th_cg_roots; + thr->th.th_cg_roots = tmp->up; + KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" + " up to node %p. 
cg_nthreads was %d\n", + thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); + int i = tmp->cg_nthreads--; + if (i == 1) { + __kmp_free(tmp); // free CG if we are the last thread in it + } + // Restore current task's thread_limit from CG root + if (thr->th.th_cg_roots) + thr->th.th_current_task->td_icvs.thread_limit = + thr->th.th_cg_roots->cg_thread_limit; + } + } } KMP_MB(); @@ -5566,7 +5479,6 @@ kmp_team_t *__kmp_reap_team(kmp_team_t *team) { void __kmp_free_thread(kmp_info_t *this_th) { int gtid; kmp_info_t **scan; - kmp_root_t *root = this_th->th.th_root; KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); @@ -5591,6 +5503,29 @@ void __kmp_free_thread(kmp_info_t *this_th) { TCW_PTR(this_th->th.th_root, NULL); TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ + while (this_th->th.th_cg_roots) { + this_th->th.th_cg_roots->cg_nthreads--; + KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" + " %p of thread %p to %d\n", + this_th, this_th->th.th_cg_roots, + this_th->th.th_cg_roots->cg_root, + this_th->th.th_cg_roots->cg_nthreads)); + kmp_cg_root_t *tmp = this_th->th.th_cg_roots; + if (tmp->cg_root == this_th) { // Thread is a cg_root + KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); + KA_TRACE( + 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); + this_th->th.th_cg_roots = tmp->up; + __kmp_free(tmp); + } else { // Worker thread + if (tmp->cg_nthreads == 0) { // last thread leaves contention group + __kmp_free(tmp); + } + this_th->th.th_cg_roots = NULL; + break; + } + } + /* If the implicit task assigned to this thread can be used by other threads * -> multiple threads can share the data and try to free the task at * __kmp_reap_thread at exit. This duplicate use of the task data can happen @@ -5631,10 +5566,20 @@ void __kmp_free_thread(kmp_info_t *this_th) { (this_th->th.th_info.ds.ds_gtid < this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); TCW_4(this_th->th.th_in_pool, TRUE); - __kmp_thread_pool_nth++; + __kmp_suspend_initialize_thread(this_th); + __kmp_lock_suspend_mx(this_th); + if (this_th->th.th_active == TRUE) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + this_th->th.th_active_in_pool = TRUE; + } +#if KMP_DEBUG + else { + KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); + } +#endif + __kmp_unlock_suspend_mx(this_th); TCW_4(__kmp_nth, __kmp_nth - 1); - root->r.r_cg_nthreads--; #ifdef KMP_ADJUST_BLOCKTIME /* Adjust blocktime back to user setting or default if necessary */ @@ -5851,7 +5796,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { gtid = thread->th.th_info.ds.ds_gtid; if (!is_root) { - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { /* Assume the threads are at the fork barrier here */ KA_TRACE( @@ -5882,10 +5826,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); } - - // Decrement # of [worker] threads in the pool. - KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0); - --__kmp_thread_pool_nth; } __kmp_free_implicit_task(thread); @@ -6264,16 +6204,13 @@ void __kmp_internal_end_thread(int gtid_req) { } } #if KMP_DYNAMIC_LIB - // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber - // thread, because we will better shutdown later in the library destructor. - // The reason of this change is performance problem when non-openmp thread in - // a loop forks and joins many openmp threads. 
We can save a lot of time - // keeping worker threads alive until the program shutdown. - // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) - // and Windows(DPD200287443) that occurs when using critical sections from - // foreign threads. - KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); - return; + if (__kmp_pause_status != kmp_hard_paused) + // AC: lets not shutdown the dynamic library at the exit of uber thread, + // because we will better shutdown later in the library destructor. + { + KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); + return; + } #endif /* synchronize the termination process */ __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); @@ -6409,6 +6346,7 @@ void __kmp_register_library_startup(void) { // library. Assume the other library is alive. // WARN( ... ); // TODO: Issue a warning. file_name = "unknown library"; + KMP_FALLTHROUGH(); // Attention! Falling to the next case. That's intentional. case 1: { // Neighbor is alive. // Check it is allowed. @@ -6747,11 +6685,9 @@ static void __kmp_do_serial_initialize(void) { __kmp_env_print(); } -#if OMP_40_ENABLED if (__kmp_display_env || __kmp_display_env_verbose) { __kmp_env_print_2(); } -#endif // OMP_40_ENABLED #if OMPT_SUPPORT ompt_post_init(); @@ -6919,6 +6855,7 @@ void __kmp_parallel_initialize(void) { if (!__kmp_init_middle) { __kmp_do_middle_initialize(); } + __kmp_resume_if_hard_paused(); /* begin initialization */ KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); @@ -6984,10 +6921,7 @@ void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, // this_thr->th.th_info.ds.ds_tid ] ); dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ -#if OMP_45_ENABLED - dispatch->th_doacross_buf_idx = - 0; /* reset the doacross dispatch buffer counter */ -#endif + dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter if (__kmp_env_consistency_check) __kmp_push_parallel(gtid, team->t.t_ident); @@ -7046,21 +6980,33 @@ int __kmp_invoke_task_func(int gtid) { } #endif - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - rc = - __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, - tid, (int)team->t.t_argc, (void **)team->t.t_argv +#if KMP_STATS_ENABLED + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (previous_state == stats_state_e::TEAMS_REGION) { + KMP_PUSH_PARTITIONED_TIMER(OMP_teams); + } else { + KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); + } + KMP_SET_THREAD_STATE(IMPLICIT_TASK); +#endif + + rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, + tid, (int)team->t.t_argc, (void **)team->t.t_argv #if OMPT_SUPPORT - , - exit_runtime_p + , + exit_runtime_p #endif - ); + ); #if OMPT_SUPPORT - *exit_runtime_p = NULL; + *exit_runtime_p = NULL; #endif + +#if KMP_STATS_ENABLED + if (previous_state == stats_state_e::TEAMS_REGION) { + KMP_SET_THREAD_STATE(previous_state); } + KMP_POP_PARTITIONED_TIMER(); +#endif #if USE_ITT_BUILD if (__itt_stack_caller_create_ptr) { @@ -7074,7 +7020,6 @@ int __kmp_invoke_task_func(int gtid) { return rc; } -#if OMP_40_ENABLED void __kmp_teams_master(int gtid) { // This routine is called by all master threads in teams construct kmp_info_t *thr = __kmp_threads[gtid]; @@ -7085,6 +7030,19 @@ void __kmp_teams_master(int gtid) { KMP_DEBUG_ASSERT(thr->th.th_set_nproc); KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, __kmp_tid_from_gtid(gtid), 
thr->th.th_teams_microtask)); + + // This thread is a new CG root. Set up the proper variables. + kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); + tmp->cg_root = thr; // Make thr the CG root + // Init to thread limit that was stored when league masters were forked + tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; + tmp->cg_nthreads = 1; // Init counter to one active thread, this one + KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" + " cg_nthreads to 1\n", + thr, tmp)); + tmp->up = thr->th.th_cg_roots; + thr->th.th_cg_roots = tmp; + // Launch league of teams now, but not let workers execute // (they hang on fork barrier until next parallel) #if INCLUDE_SSC_MARKS @@ -7096,7 +7054,9 @@ void __kmp_teams_master(int gtid) { #if INCLUDE_SSC_MARKS SSC_MARK_JOINING(); #endif - + // If the team size was reduced from the limit, set it to the new size + if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) + thr->th.th_teams_size.nth = thr->th.th_team_nproc; // AC: last parameter "1" eliminates join barrier which won't work because // worker threads are in a fork barrier waiting for more parallel regions __kmp_join_call(loc, gtid @@ -7121,7 +7081,6 @@ int __kmp_invoke_teams_master(int gtid) { __kmp_run_after_invoked_task(gtid, 0, this_thr, team); return 1; } -#endif /* OMP_40_ENABLED */ /* this sets the requested number of threads for the next parallel region encountered by this team. since this should be enclosed in the forkjoin @@ -7135,8 +7094,6 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { thr->th.th_set_nproc = num_threads; } -#if OMP_40_ENABLED - /* this sets the requested number of teams for the teams region and/or the number of threads for the next parallel region encountered */ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, @@ -7170,10 +7127,14 @@ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, num_threads = __kmp_teams_max_nth / num_teams; } } else { + // This thread will be the master of the league masters + // Store new thread limit; old limit is saved in th_cg_roots list + thr->th.th_current_task->td_icvs.thread_limit = num_threads; + if (num_teams * num_threads > __kmp_teams_max_nth) { int new_threads = __kmp_teams_max_nth / num_teams; if (!__kmp_reserve_warn) { // user asked for too many threads - __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT + __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, new_threads), KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); @@ -7190,8 +7151,6 @@ void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { thr->th.th_set_proc_bind = proc_bind; } -#endif /* OMP_40_ENABLED */ - /* Launch the worker threads into the microtask. */ void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { @@ -7216,15 +7175,11 @@ void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { int i; for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { team->t.t_disp_buffer[i].buffer_index = i; -#if OMP_45_ENABLED team->t.t_disp_buffer[i].doacross_buf_idx = i; -#endif } } else { team->t.t_disp_buffer[0].buffer_index = 0; -#if OMP_45_ENABLED team->t.t_disp_buffer[0].doacross_buf_idx = 0; -#endif } KMP_MB(); /* Flush all pending memory write invalidates. 
*/ @@ -7282,11 +7237,13 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( - ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr); + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, + codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( - ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr); + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, + codeptr); } #endif if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { @@ -7476,12 +7433,10 @@ void __kmp_cleanup(void) { __kmp_nested_proc_bind.bind_types = NULL; __kmp_nested_proc_bind.size = 0; __kmp_nested_proc_bind.used = 0; -#if OMP_50_ENABLED if (__kmp_affinity_format) { KMP_INTERNAL_FREE(__kmp_affinity_format); __kmp_affinity_format = NULL; } -#endif __kmp_i18n_catclose(); @@ -7625,13 +7580,14 @@ void __kmp_aux_set_library(enum library_type arg) { switch (__kmp_library) { case library_serial: { KMP_INFORM(LibraryIsSerial); - (void)__kmp_change_library(TRUE); } break; case library_turnaround: - (void)__kmp_change_library(TRUE); + if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) + __kmp_use_yield = 2; // only yield when oversubscribed break; case library_throughput: - (void)__kmp_change_library(FALSE); + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) + __kmp_dflt_blocktime = 200; break; default: KMP_FATAL(UnknownLibraryType, arg); @@ -7696,7 +7652,6 @@ int __kmp_aux_get_num_teams() { /* ------------------------------------------------------------------------ */ -#if OMP_50_ENABLED /* * Affinity Format Parser * @@ -7969,7 +7924,6 @@ void __kmp_aux_display_affinity(int gtid, const char *format) { __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); __kmp_str_buf_free(&buf); } -#endif // OMP_50_ENABLED /* ------------------------------------------------------------------------ */ @@ -8023,11 +7977,7 @@ void __kmp_aux_set_defaults(char const *str, int len) { } __kmp_env_initialize(str); - if (__kmp_settings -#if OMP_40_ENABLED - || __kmp_display_env || __kmp_display_env_verbose -#endif // OMP_40_ENABLED - ) { + if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { __kmp_env_print(); } } // __kmp_aux_set_defaults @@ -8104,7 +8054,7 @@ __kmp_determine_reduction_method( #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS -#if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD // basic tuning @@ -8190,3 +8140,78 @@ __kmp_determine_reduction_method( kmp_int32 __kmp_get_reduce_method(void) { return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); } + +// Soft pause sets up threads to ignore blocktime and just go to sleep. +// Spin-wait code checks __kmp_pause_status and reacts accordingly. +void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } + +// Hard pause shuts down the runtime completely. Resume happens naturally when +// OpenMP is used subsequently. +void __kmp_hard_pause() { + __kmp_pause_status = kmp_hard_paused; + __kmp_internal_end_thread(-1); +} + +// Soft resume sets __kmp_pause_status, and wakes up all threads. 
+void __kmp_resume_if_soft_paused() {
+  if (__kmp_pause_status == kmp_soft_paused) {
+    __kmp_pause_status = kmp_not_paused;
+
+    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
+      kmp_info_t *thread = __kmp_threads[gtid];
+      if (thread) { // Wake it if sleeping
+        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
+        if (fl.is_sleeping())
+          fl.resume(gtid);
+        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
+          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
+        } else { // thread holds the lock and may sleep soon
+          do { // until either the thread sleeps, or we can get the lock
+            if (fl.is_sleeping()) {
+              fl.resume(gtid);
+              break;
+            } else if (__kmp_try_suspend_mx(thread)) {
+              __kmp_unlock_suspend_mx(thread);
+              break;
+            }
+          } while (1);
+        }
+      }
+    }
+  }
+}
+
+// This function is called via __kmpc_pause_resource. Returns 0 if successful.
+// TODO: add warning messages
+int __kmp_pause_resource(kmp_pause_status_t level) {
+  if (level == kmp_not_paused) { // requesting resume
+    if (__kmp_pause_status == kmp_not_paused) {
+      // error message about runtime not being paused, so can't resume
+      return 1;
+    } else {
+      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
+                       __kmp_pause_status == kmp_hard_paused);
+      __kmp_pause_status = kmp_not_paused;
+      return 0;
+    }
+  } else if (level == kmp_soft_paused) { // requesting soft pause
+    if (__kmp_pause_status != kmp_not_paused) {
+      // error message about already being paused
+      return 1;
+    } else {
+      __kmp_soft_pause();
+      return 0;
+    }
+  } else if (level == kmp_hard_paused) { // requesting hard pause
+    if (__kmp_pause_status != kmp_not_paused) {
+      // error message about already being paused
+      return 1;
+    } else {
+      __kmp_hard_pause();
+      return 0;
+    }
+  } else {
+    // error message about invalid level
+    return 1;
+  }
+}
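
The pause/resume logic added above reads most easily as a three-state machine over kmp_pause_status_t: a pause request (soft or hard) is only honoured from the not-paused state, and a resume request is only honoured from one of the paused states; every other combination returns 1. Below is a minimal, self-contained model of just that decision table. It is not the runtime code: the real __kmp_soft_pause() and __kmp_hard_pause() also put threads to sleep or tear the library down via __kmp_internal_end_thread(-1), and the enum, global, helper name, and main() here are illustrative only.

// Standalone model of the pause/resume transitions in the hunk above.
// Everything here is a sketch; it only tracks the status word and the
// 0/1 return convention, not the thread wake-up/shutdown side effects.
#include <cstdio>

enum pause_status { not_paused, soft_paused, hard_paused };

static pause_status g_status = not_paused;

// Mirrors the return convention of __kmp_pause_resource():
// 0 on success, 1 on a request that is invalid in the current state.
static int pause_resource(pause_status level) {
  if (level == not_paused) {          // requesting resume
    if (g_status == not_paused)
      return 1;                       // nothing to resume
    g_status = not_paused;            // soft- or hard-paused -> running
    return 0;
  }
  if (level == soft_paused || level == hard_paused) { // requesting a pause
    if (g_status != not_paused)
      return 1;                       // already paused
    g_status = level;                 // running -> requested pause level
    return 0;
  }
  return 1;                           // unknown level
}

int main() {
  std::printf("%d\n", pause_resource(soft_paused)); // 0: running -> soft paused
  std::printf("%d\n", pause_resource(hard_paused)); // 1: already paused
  std::printf("%d\n", pause_resource(not_paused));  // 0: paused -> running
  std::printf("%d\n", pause_resource(not_paused));  // 1: nothing to resume
  return 0;
}

Keeping the transitions this strict means a repeated pause, or a resume of a runtime that is not paused, is reported back to the caller rather than silently ignored.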
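
The contention-group bookkeeping touched in several hunks above (__kmp_teams_master pushing a fresh kmp_cg_root_t with cg_nthreads = 1 and a captured cg_thread_limit, __kmp_free_thread decrementing cg_nthreads and freeing a node once the last member has left) amounts to a per-thread stack of reference-counted group descriptors. The sketch below shows that push/join/leave discipline in isolation, assuming plain reference counting; the cg_node type, the helper names, and main() are invented for illustration and deliberately collapse the runtime's separate root-versus-worker cases.

// Illustrative sketch of the reference-counted contention-group stack
// (kmp_cg_root_t / th_cg_roots) used above. Types and helpers are invented.
#include <cassert>
#include <cstdio>

struct cg_node {
  int nthreads;     // threads currently charged to this contention group
  int thread_limit; // thread-limit ICV captured when the group was created
  cg_node *up;      // enclosing contention group, if any
};

// Like __kmp_teams_master: push a fresh group with one member (its root).
static cg_node *push_group(cg_node *top, int thread_limit) {
  return new cg_node{1, thread_limit, top};
}

// Like a worker being charged to the group when it joins the team.
static void join_group(cg_node *g) { ++g->nthreads; }

// Like the teardown in __kmp_free_thread: drop one member and free the
// descriptor once nobody references it. Returns the new top of the stack.
static cg_node *leave_group(cg_node *top) {
  assert(top && top->nthreads > 0);
  cg_node *up = top->up;
  if (--top->nthreads == 0) {
    delete top; // last thread out frees the group descriptor
    return up;
  }
  return top;   // others are still inside; keep the node alive
}

int main() {
  cg_node *stack = push_group(nullptr, /*thread_limit=*/8);
  join_group(stack);                                 // a worker arrives
  std::printf("members: %d\n", stack->nthreads);     // 2
  stack = leave_group(stack);                        // worker leaves
  stack = leave_group(stack);                        // root leaves, node freed
  std::printf("empty: %d\n", stack == nullptr);      // 1
  return 0;
}

The thread_limit stored in each node is what lets a task restore its thread-limit ICV from the enclosing group when a node is popped, as done in the __kmp_free_implicit_task path above.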