Diffstat (limited to 'openmp/runtime')
44 files changed, 620 insertions, 332 deletions
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index f76619ec0e3c..45a294b666fa 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -533,6 +533,7 @@ kmp_set_disp_num_buffers 890 omp_pause_resource_all 757 omp_get_supported_active_levels 758 omp_fulfill_event 759 + omp_display_env 733 omp_null_allocator DATA omp_default_mem_alloc DATA diff --git a/openmp/runtime/src/exports_so.txt b/openmp/runtime/src/exports_so.txt index f7de5fd6474f..30222418163d 100644 --- a/openmp/runtime/src/exports_so.txt +++ b/openmp/runtime/src/exports_so.txt @@ -119,5 +119,7 @@ GOMP_4.0 { } GOMP_3.0; GOMP_4.5 { } GOMP_4.0; +GOMP_5.0 { +} GOMP_4.5; # end of file # diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt index 3a3035b26673..b2ba63c02870 100644 --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -324,7 +324,7 @@ WrongMessageCatalog "Incompatible message catalog \"%1$s\": Version \"% StgIgnored "%1$s: ignored because %2$s has been defined" # %1, -- name of ignored variable, %2 -- name of variable with higher priority. OBSOLETE "%1$s: overrides %3$s specified before" - # %1, %2 -- name and value of the overriding variable, %3 -- name of overriden variable. + # %1, %2 -- name and value of the overriding variable, %3 -- name of overridden variable. AffTilesNoHWLOC "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead" AffTilesNoTiles "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead" TopologyExtraTile "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)" diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index 2246e7012bee..f62afc2b693d 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -228,36 +228,36 @@ typedef uintptr_t omp_uintptr_t; typedef enum { - OMP_ATK_THREADMODEL = 1, - OMP_ATK_ALIGNMENT = 2, - OMP_ATK_ACCESS = 3, - OMP_ATK_POOL_SIZE = 4, - OMP_ATK_FALLBACK = 5, - OMP_ATK_FB_DATA = 6, - OMP_ATK_PINNED = 7, - OMP_ATK_PARTITION = 8 + omp_atk_threadmodel = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 } omp_alloctrait_key_t; typedef enum { - OMP_ATV_FALSE = 0, - OMP_ATV_TRUE = 1, - OMP_ATV_DEFAULT = 2, - OMP_ATV_CONTENDED = 3, - OMP_ATV_UNCONTENDED = 4, - OMP_ATV_SEQUENTIAL = 5, - OMP_ATV_PRIVATE = 6, - OMP_ATV_ALL = 7, - OMP_ATV_THREAD = 8, - OMP_ATV_PTEAM = 9, - OMP_ATV_CGROUP = 10, - OMP_ATV_DEFAULT_MEM_FB = 11, - OMP_ATV_NULL_FB = 12, - OMP_ATV_ABORT_FB = 13, - OMP_ATV_ALLOCATOR_FB = 14, - OMP_ATV_ENVIRONMENT = 15, - OMP_ATV_NEAREST = 16, - OMP_ATV_BLOCKED = 17, - OMP_ATV_INTERLEAVED = 18 + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_default = 2, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_sequential = 5, + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 } omp_alloctrait_value_t; typedef struct { @@ -355,6 +355,9 @@ extern int __KAI_KMPC_CONVENTION omp_get_supported_active_levels(void); + /* OpenMP 5.1 Display Environment */ + extern void omp_display_env(int verbose); + 
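The omp_display_env() entry point declared above exposes the OMP_DISPLAY_ENV report to user code. A minimal usage sketch (illustrative, not part of the patch); a non-zero argument requests the verbose listing, which in this runtime also covers the implementation-specific KMP_* settings:

    #include <omp.h>

    int main(void) {
      omp_display_env(0); /* same report as OMP_DISPLAY_ENV=TRUE */
      omp_display_env(1); /* verbose report, as with OMP_DISPLAY_ENV=VERBOSE */
      return 0;
    }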
# undef __KAI_KMPC_CONVENTION # undef __KMP_IMP diff --git a/openmp/runtime/src/include/omp_lib.f.var b/openmp/runtime/src/include/omp_lib.f.var index d631438f55ad..bf40c78707a8 100644 --- a/openmp/runtime/src/include/omp_lib.f.var +++ b/openmp/runtime/src/include/omp_lib.f.var @@ -488,6 +488,11 @@ integer (kind=kmp_size_t_kind) omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + use omp_lib_kinds + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + ! *** ! *** kmp_* entry points ! *** diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var index ac568486d204..fbbb7b9df94d 100644 --- a/openmp/runtime/src/include/omp_lib.f90.var +++ b/openmp/runtime/src/include/omp_lib.f90.var @@ -503,6 +503,12 @@ integer (kind=kmp_size_t_kind) :: omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + use omp_lib_kinds + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + + ! *** ! *** kmp_* entry points ! *** diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var index 8775128157bd..f1b6b03f7725 100644 --- a/openmp/runtime/src/include/omp_lib.h.var +++ b/openmp/runtime/src/include/omp_lib.h.var @@ -580,6 +580,11 @@ integer (kind=kmp_size_t_kind) :: omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + import + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + ! *** ! *** kmp_* entry points ! *** diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 086ab3bb011e..5f9b7c895619 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -872,36 +872,36 @@ extern int __kmp_hws_abs_flag; // absolute or per-item number requested typedef uintptr_t omp_uintptr_t; typedef enum { - OMP_ATK_THREADMODEL = 1, - OMP_ATK_ALIGNMENT = 2, - OMP_ATK_ACCESS = 3, - OMP_ATK_POOL_SIZE = 4, - OMP_ATK_FALLBACK = 5, - OMP_ATK_FB_DATA = 6, - OMP_ATK_PINNED = 7, - OMP_ATK_PARTITION = 8 + omp_atk_threadmodel = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 } omp_alloctrait_key_t; typedef enum { - OMP_ATV_FALSE = 0, - OMP_ATV_TRUE = 1, - OMP_ATV_DEFAULT = 2, - OMP_ATV_CONTENDED = 3, - OMP_ATV_UNCONTENDED = 4, - OMP_ATV_SEQUENTIAL = 5, - OMP_ATV_PRIVATE = 6, - OMP_ATV_ALL = 7, - OMP_ATV_THREAD = 8, - OMP_ATV_PTEAM = 9, - OMP_ATV_CGROUP = 10, - OMP_ATV_DEFAULT_MEM_FB = 11, - OMP_ATV_NULL_FB = 12, - OMP_ATV_ABORT_FB = 13, - OMP_ATV_ALLOCATOR_FB = 14, - OMP_ATV_ENVIRONMENT = 15, - OMP_ATV_NEAREST = 16, - OMP_ATV_BLOCKED = 17, - OMP_ATV_INTERLEAVED = 18 + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_default = 2, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_sequential = 5, + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 } omp_alloctrait_value_t; typedef void *omp_memspace_handle_t; @@ -1548,7 +1548,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 tc; kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put after ub */ - + kmp_lock_t 
*th_steal_lock; // lock used for chunk stealing // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. @@ -1581,7 +1581,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 tc; /* trip count (number of iterations) */ kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put after ub */ - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -1722,11 +1722,7 @@ typedef struct kmp_disp { kmp_int32 th_disp_index; kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags - union { // we can use union here because doacross cannot be used in - // nonmonotonic loops - kmp_int64 *th_doacross_info; // info on loop bounds - kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable) - }; + kmp_int64 *th_doacross_info; // info on loop bounds #if KMP_USE_INTERNODE_ALIGNMENT char more_padding[INTERNODE_CACHE_LINE]; #endif @@ -2435,10 +2431,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { int th_teams_level; /* save initial level of teams construct */ /* it is 0 on device but may be any on host */ -/* The blocktime info is copied from the team struct to the thread sruct */ -/* at the start of a barrier, and the values stored in the team are used */ -/* at points in the code where the team struct is no longer guaranteed */ -/* to exist (from the POV of worker threads). */ +/* The blocktime info is copied from the team struct to the thread struct */ +/* at the start of a barrier, and the values stored in the team are used */ +/* at points in the code where the team struct is no longer guaranteed */ +/* to exist (from the POV of worker threads). */ #if KMP_USE_MONITOR int th_team_bt_intervals; int th_team_bt_set; @@ -3908,6 +3904,8 @@ static inline void __kmp_resume_if_hard_paused() { } } +extern void __kmp_omp_display_env(int verbose); + #ifdef __cplusplus } #endif diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 650e9ff35e1b..47e70477ced6 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -601,7 +601,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, int depth = 3; int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread - int labels[3] = {0}; // package [,node] [,tile] - head of lables array + int labels[3] = {0}; // package [,node] [,tile] - head of labels array if (__kmp_numa_detected) ++depth; if (__kmp_tile_depth) @@ -828,7 +828,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, } int depth_full = depth; // number of levels before compressing - // Find any levels with radiix 1, and remove them from the map + // Find any levels with radix 1, and remove them from the map // (except for the package level). depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, levels); @@ -918,7 +918,7 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, return 0; } - // Contruct the data structure to be returned. + // Construct the data structure to be returned. 
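The allocator trait enumerators renamed in the omp.h and kmp.h hunks above (omp_atk_* / omp_atv_*) match the lowercase spellings required by the OpenMP 5.0 specification. A sketch of user code exercising them through the standard allocator API (illustrative only, not part of the patch):

    #include <omp.h>
    #include <stdlib.h>

    int main(void) {
      /* Request 64-byte alignment; fall back to the default memory
         allocator if the request cannot be satisfied. */
      omp_alloctrait_t traits[2] = {
          {omp_atk_alignment, 64},
          {omp_atk_fallback, omp_atv_default_mem_fb}};
      omp_allocator_handle_t al =
          omp_init_allocator(omp_default_mem_space, 2, traits);
      double *buf = (double *)omp_alloc(1000 * sizeof(double), al);
      if (buf == NULL)
        return EXIT_FAILURE;
      omp_free(buf, al);
      omp_destroy_allocator(al);
      return 0;
    }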
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); int avail_ct = 0; @@ -967,7 +967,7 @@ static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, return -1; } - // Contruct the data structure to be returned. + // Construct the data structure to be returned. *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); @@ -1849,7 +1849,7 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, return 0; } - // Find any levels with radiix 1, and remove them from the map + // Find any levels with radix 1, and remove them from the map // (except for the package level). int new_depth = 0; for (level = 0; level < depth; level++) { @@ -1968,7 +1968,8 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; else @@ -1982,7 +1983,8 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_nThreadsPerCore; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2 * __kmp_nThreadsPerCore; @@ -4328,7 +4330,7 @@ static void __kmp_aux_affinity_initialize(void) { } #endif // KMP_USE_HWLOC -// If the user has specified that a paricular topology discovery method is to be +// If the user has specified that a particular topology discovery method is to be // used, then we abort if that method fails. The exception is group affinity, // which might have been implicitly set. @@ -4647,7 +4649,7 @@ static void __kmp_aux_affinity_initialize(void) { #undef KMP_EXIT_AFF_NONE void __kmp_affinity_initialize(void) { - // Much of the code above was written assumming that if a machine was not + // Much of the code above was written assuming that if a machine was not // affinity capable, then __kmp_affinity_type == affinity_none. We now // explicitly represent this as __kmp_affinity_type == affinity_disabled. // There are too many checks for __kmp_affinity_type == affinity_none @@ -4713,7 +4715,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { KMP_CPU_ZERO(th->th.th_affin_mask); } - // Copy the thread mask to the kmp_info_t strucuture. If + // Copy the thread mask to the kmp_info_t structure. If // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, // then the full mask is the same as the mask of the initialization thread. @@ -4823,7 +4825,7 @@ void __kmp_affinity_set_place(int gtid) { (th->th.th_new_place >= th->th.th_last_place)); } - // Copy the thread mask to the kmp_info_t strucuture, + // Copy the thread mask to the kmp_info_t structure, // and set this thread's affinity. 
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h index f270bb6dbb8d..664a42393191 100644 --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -303,8 +303,9 @@ class KMPNativeAffinity : public KMPAffinity { int retval = syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int retval = + int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { return 0; @@ -322,8 +323,9 @@ class KMPNativeAffinity : public KMPAffinity { int retval = syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int retval = + int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { return 0; diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp index 16893d0ffca5..314f56d9b5c6 100644 --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -186,7 +186,7 @@ typedef struct thr_data { -1: not all pool blocks are the same size >0: (common) block size for all bpool calls made so far */ - bfhead_t *last_pool; /* Last pool owned by this thread (delay dealocation) */ + bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */ } thr_data_t; /* Minimum allocation quantum: */ @@ -195,7 +195,7 @@ typedef struct thr_data { #define MaxSize \ (bufsize)( \ ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1))) -// Maximun for the requested size. +// Maximum for the requested size. /* End sentinel: value placed in bsize field of dummy block delimiting end of pool block. The most negative number which will fit in a @@ -577,7 +577,7 @@ static void *bget(kmp_info_t *th, bufsize requested_size) { if (thr->acqfcn != 0) { if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) { /* Request is too large to fit in a single expansion block. - Try to satisy it by a direct buffer acquisition. */ + Try to satisfy it by a direct buffer acquisition. 
*/ bdhead_t *bdh; size += sizeof(bdhead_t) - sizeof(bhead_t); @@ -1348,27 +1348,27 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, al->memspace = ms; // not used currently for (i = 0; i < ntraits; ++i) { switch (traits[i].key) { - case OMP_ATK_THREADMODEL: - case OMP_ATK_ACCESS: - case OMP_ATK_PINNED: + case omp_atk_threadmodel: + case omp_atk_access: + case omp_atk_pinned: break; - case OMP_ATK_ALIGNMENT: + case omp_atk_alignment: al->alignment = traits[i].value; KMP_ASSERT(IS_POWER_OF_TWO(al->alignment)); break; - case OMP_ATK_POOL_SIZE: + case omp_atk_pool_size: al->pool_size = traits[i].value; break; - case OMP_ATK_FALLBACK: + case omp_atk_fallback: al->fb = (omp_alloctrait_value_t)traits[i].value; KMP_DEBUG_ASSERT( - al->fb == OMP_ATV_DEFAULT_MEM_FB || al->fb == OMP_ATV_NULL_FB || - al->fb == OMP_ATV_ABORT_FB || al->fb == OMP_ATV_ALLOCATOR_FB); + al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb || + al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb); break; - case OMP_ATK_FB_DATA: + case omp_atk_fb_data: al->fb_data = RCAST(kmp_allocator_t *, traits[i].value); break; - case OMP_ATK_PARTITION: + case omp_atk_partition: al->memkind = RCAST(void **, traits[i].value); break; default: @@ -1377,17 +1377,17 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, } if (al->fb == 0) { // set default allocator - al->fb = OMP_ATV_DEFAULT_MEM_FB; + al->fb = omp_atv_default_mem_fb; al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al->fb_data != NULL); - } else if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + } else if (al->fb == omp_atv_default_mem_fb) { al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; } if (__kmp_memkind_available) { // Let's use memkind library if available if (ms == omp_high_bw_mem_space) { - if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_hbw_interleave) { + if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) { al->memkind = mk_hbw_interleave; } else if (mk_hbw_preferred) { // AC: do not try to use MEMKIND_HBW for now, because memkind library @@ -1402,7 +1402,7 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, return omp_null_allocator; } } else { - if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_interleave) { + if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) { al->memkind = mk_interleave; } else { al->memkind = mk_default; @@ -1477,12 +1477,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (used + desc.size_a > al->pool_size) { // not enough space, need to go fallback path KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1491,12 +1491,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // pool has enough space ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { 
+ if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1507,12 +1507,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // custom allocator, pool size not requested ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1533,12 +1533,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (used + desc.size_a > al->pool_size) { // not enough space, need to go fallback path KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1546,14 +1546,14 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else { // pool has enough space ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) { + if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } } else { // custom allocator, pool size not requested ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) { + if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } @@ -1961,7 +1961,7 @@ void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) { this_thr->th.th_free_lists[index].th_free_list_other = ptr; } else { // either queue blocks owner is changing or size limit exceeded - // return old queue to allocating thread (q_th) synchroneously, + // return old queue to allocating thread (q_th) synchronously, // and start new list for alloc_thr's tasks void *old_ptr; void *tail = head; diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index a6d87b5d7a2e..4aa7a084f53a 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -549,6 +549,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, if (((tid >> level) & (branch_factor - 1)) != 0) { 
kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); + KMP_MB(); // Synchronize parent and child threads. KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " "arrived(%p): %llu => %llu\n", @@ -590,6 +591,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); + KMP_MB(); // Synchronize parent and child threads. #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - write min of the thread time and a child time to // the thread. diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index ac9a93590ad0..9cfa64d6ff9e 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -92,7 +92,7 @@ construct, since the master thread is necessarily thread zero). If multiple non-OpenMP threads all enter an OpenMP construct then this will be a unique thread identifier among all the threads created by -the OpenMP runtime (but the value cannote be defined in terms of +the OpenMP runtime (but the value cannot be defined in terms of OpenMP thread ids returned by omp_get_thread_num()). */ kmp_int32 __kmpc_global_thread_num(ident_t *loc) { @@ -4023,6 +4023,9 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { lo = pr_buf->th_doacross_info[2]; up = pr_buf->th_doacross_info[3]; st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif if (st == 1) { // most common case if (vec[0] < lo || vec[0] > up) { KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " @@ -4048,6 +4051,10 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { } iter_number = (kmp_uint64)(lo - vec[0]) / (-st); } +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_sink; +#endif for (i = 1; i < num_dims; ++i) { kmp_int64 iter, ln; kmp_int32 j = i * 4; @@ -4081,6 +4088,10 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { iter = (kmp_uint64)(lo - vec[i]) / (-st); } iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_sink; +#endif } shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 @@ -4089,6 +4100,12 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { KMP_YIELD(TRUE); } KMP_MB(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims); + } +#endif KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", gtid, (iter_number << 5) + shft)); @@ -4116,6 +4133,9 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { num_dims = pr_buf->th_doacross_info[0]; lo = pr_buf->th_doacross_info[2]; st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif if (st == 1) { // most common case iter_number = vec[0] - lo; } else if (st > 0) { @@ -4123,6 +4143,10 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { } else { // negative increment iter_number = (kmp_uint64)(lo - vec[0]) / (-st); } +#if OMPT_SUPPORT && OMPT_OPTIONAL + 
deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_source; +#endif for (i = 1; i < num_dims; ++i) { kmp_int64 iter, ln; kmp_int32 j = i * 4; @@ -4137,7 +4161,17 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { iter = (kmp_uint64)(lo - vec[i]) / (-st); } iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_source; +#endif + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims); } +#endif shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 flag = 1 << shft; diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index a91ffa2ba299..9d7b81733eba 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -372,10 +372,10 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, // before spending time on this). // For now use dynamically allocated per-thread lock, // free memory in __kmp_dispatch_next when status==0. - KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); - th->th.th_dispatch->th_steal_lock = + KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); + pr->u.p.th_steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); - __kmp_init_lock(th->th.th_dispatch->th_steal_lock); + __kmp_init_lock(pr->u.p.th_steal_lock); } break; } else { @@ -968,7 +968,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, // all parm3 will be the same, it still exists a bad case like using 0 and 1 // rather than program life-time increment. So the dedicated variable is // required. The 'static_steal_counter' is used. - if (schedule == kmp_sch_static_steal) { + if (pr->schedule == kmp_sch_static_steal) { // Other threads will inspect this variable when searching for a victim. // This is a flag showing that other threads may steal from this thread // since then. @@ -1195,7 +1195,7 @@ int __kmp_dispatch_next_algorithm(int gtid, if (traits_t<T>::type_size > 4) { // use lock for 8-byte and CAS for 4-byte induction // variable. TODO (optional): check and use 16-byte CAS - kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; + kmp_lock_t *lck = pr->u.p.th_steal_lock; KMP_DEBUG_ASSERT(lck != NULL); if (pr->u.p.count < (UT)pr->u.p.ub) { __kmp_acquire_lock(lck, gtid); @@ -1210,37 +1210,38 @@ int __kmp_dispatch_next_algorithm(int gtid, kmp_info_t **other_threads = team->t.t_threads; int while_limit = pr->u.p.parm3; int while_index = 0; + T id = pr->u.p.static_steal_counter; // loop id + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop // TODO: algorithm of searching for a victim // should be cleaned up and measured while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template<T> *victim; T remaining; T victimIdx = pr->u.p.parm4; T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - dispatch_private_info_template<T> *victim = - reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); - while ((victim == NULL || victim == pr || - (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) && + victim = reinterpret_cast<dispatch_private_info_template<T> *>( + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); + while ((victim == pr || id != victim->u.p.static_steal_counter) && oldVictimIdx != victimIdx) { victimIdx = (victimIdx + 1) % nproc; victim = reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); } - if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) { + if (victim == pr || id != victim->u.p.static_steal_counter) { continue; // try once more (nproc attempts in total) // no victim is ready yet to participate in stealing - // because all victims are still in kmp_init_dispatch + // because no victim passed kmp_init_dispatch yet } if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid continue; // not enough chunks to steal, goto next victim } - lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; + lck = victim->u.p.th_steal_lock; KMP_ASSERT(lck != NULL); __kmp_acquire_lock(lck, gtid); limit = victim->u.p.ub; // keep initial ub @@ -1250,7 +1251,7 @@ int __kmp_dispatch_next_algorithm(int gtid, pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim continue; // not enough chunks to steal } - // stealing succeded, reduce victim's ub by 1/4 of undone chunks or + // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or // by 1 if (remaining > 3) { // steal 1/4 of remaining @@ -1268,10 +1269,10 @@ int __kmp_dispatch_next_algorithm(int gtid, status = 1; while_index = 0; // now update own count and ub with stolen range but init chunk - __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); + __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); pr->u.p.count = init + 1; pr->u.p.ub = limit; - __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); + __kmp_release_lock(pr->u.p.th_steal_lock, gtid); } // while (search for victim) } // if (try to find victim and steal) } else { @@ -1308,32 +1309,32 @@ int __kmp_dispatch_next_algorithm(int gtid, kmp_info_t **other_threads = team->t.t_threads; int while_limit = pr->u.p.parm3; int while_index = 0; - + T id = pr->u.p.static_steal_counter; // loop id + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop // TODO: algorithm of searching for a victim // should be cleaned up and measured while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template<T> *victim; union_i4 vold, vnew; kmp_int32 remaining; T victimIdx = pr->u.p.parm4; T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - dispatch_private_info_template<T> *victim = - reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); - while ((victim == NULL || victim == pr || - (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) && + victim = reinterpret_cast<dispatch_private_info_template<T> *>( + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); + while ((victim == pr || id != victim->u.p.static_steal_counter) && oldVictimIdx != victimIdx) { victimIdx = (victimIdx + 1) % nproc; victim = reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); } - if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) { + if (victim == pr || id != victim->u.p.static_steal_counter) { continue; // try once more (nproc attempts in total) // no victim is ready yet to participate in stealing - // because all victims are still in kmp_init_dispatch + // because no victim passed kmp_init_dispatch yet } pr->u.p.parm4 = victimIdx; // new victim found while (1) { // CAS loop if victim has enough chunks to steal @@ -1357,7 +1358,7 @@ int __kmp_dispatch_next_algorithm(int gtid, (volatile kmp_int64 *)&victim->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { - // stealing succedded + // stealing succeeded KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); status = 1; @@ -1372,7 +1373,7 @@ int __kmp_dispatch_next_algorithm(int gtid, #endif break; } // if (check CAS result) - KMP_CPU_PAUSE(); // CAS failed, repeate attempt + KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt } // while (try to steal from particular victim) } // while (search for victim) } // if (try to find victim and steal) @@ -1532,7 +1533,7 @@ int __kmp_dispatch_next_algorithm(int gtid, } if ((T)remaining < pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default - // use dynamic-style shcedule + // use dynamic-style schedule // atomically increment iterations, get old value init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec); @@ -1601,7 +1602,7 @@ int __kmp_dispatch_next_algorithm(int gtid, KMP_DEBUG_ASSERT(init % chunk == 0); // compare with K*nproc*(chunk+1), K=2 by default if ((T)remaining < pr->u.p.parm2) { - // use dynamic-style shcedule + // use dynamic-style schedule // atomically increment iterations, get old value init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk); @@ -1892,7 +1893,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, typedef typename traits_t<T>::unsigned_t UT; typedef typename traits_t<T>::signed_t ST; // This is potentially slightly misleading, schedule(runtime) will appear here - // even if the actual runtme schedule is static. (Which points out a + // even if the actual runtime schedule is static. (Which points out a // disadvantage of schedule(runtime): even when static scheduling is used it // costs more than a compile time choice to use static scheduling would.) 
KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); @@ -1909,7 +1910,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, gtid, p_lb, p_ub, p_st, p_last)); if (team->t.t_serialized) { - /* NOTE: serialize this dispatch becase we are not at the active level */ + /* NOTE: serialize this dispatch because we are not at the active level */ pr = reinterpret_cast<dispatch_private_info_template<T> *>( th->th.th_dispatch->th_disp_buffer); /* top of the stack */ KMP_DEBUG_ASSERT(pr); @@ -2068,14 +2069,19 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, if (pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4) { int i; + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index kmp_info_t **other_threads = team->t.t_threads; // loop complete, safe to destroy locks used for stealing for (i = 0; i < th->th.th_team_nproc; ++i) { - kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; + dispatch_private_info_template<T> *buf = + reinterpret_cast<dispatch_private_info_template<T> *>( + &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); + kmp_lock_t *lck = buf->u.p.th_steal_lock; KMP_ASSERT(lck != NULL); __kmp_destroy_lock(lck); __kmp_free(lck); - other_threads[i]->th.th_dispatch->th_steal_lock = NULL; + buf->u.p.th_steal_lock = NULL; } } #endif diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h index 8b3e98435a3f..1f98e4b80a79 100644 --- a/openmp/runtime/src/kmp_dispatch.h +++ b/openmp/runtime/src/kmp_dispatch.h @@ -75,7 +75,7 @@ template <typename T> struct dispatch_private_infoXX_template { ST st; // signed UT tc; // unsigned T static_steal_counter; // for static_steal only; maybe better to put after ub - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h index 3d7faea04272..c615b7b08958 100644 --- a/openmp/runtime/src/kmp_dispatch_hier.h +++ b/openmp/runtime/src/kmp_dispatch_hier.h @@ -993,7 +993,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate( sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST); } - // Have threads "register" themselves by modifiying the active count for each + // Have threads "register" themselves by modifying the active count for each // level they are involved in. The active count will act as nthreads for that // level regarding the scheduling algorithms for (int i = 0; i < n; ++i) { diff --git a/openmp/runtime/src/kmp_environment.h b/openmp/runtime/src/kmp_environment.h index 76a9672f3240..a7ea9e955788 100644 --- a/openmp/runtime/src/kmp_environment.h +++ b/openmp/runtime/src/kmp_environment.h @@ -1,5 +1,5 @@ /* - * kmp_environment.h -- Handle environment varoiables OS-independently. + * kmp_environment.h -- Handle environment variables OS-independently. 
*/ //===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index 89172c0b704c..ab57907e088e 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -1371,6 +1371,13 @@ void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) { #endif } +// display environment variables when requested +void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { +#ifndef KMP_STUB + __kmp_omp_display_env(verbose); +#endif +} + // GCC compatibility (versioned symbols) #ifdef KMP_USE_VERSION_SYMBOLS diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h index 41cafab12537..22fb2bb2f5ca 100644 --- a/openmp/runtime/src/kmp_ftn_os.h +++ b/openmp/runtime/src/kmp_ftn_os.h @@ -133,6 +133,7 @@ #define FTN_PAUSE_RESOURCE omp_pause_resource #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels +#define FTN_DISPLAY_ENV omp_display_env #define FTN_FULFILL_EVENT omp_fulfill_event #endif /* KMP_FTN_PLAIN */ @@ -256,6 +257,7 @@ #define FTN_PAUSE_RESOURCE omp_pause_resource_ #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_ +#define FTN_DISPLAY_ENV omp_display_env_ #define FTN_FULFILL_EVENT omp_fulfill_event_ #endif /* KMP_FTN_APPEND */ @@ -377,6 +379,7 @@ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT #endif /* KMP_FTN_UPPER */ @@ -500,6 +503,7 @@ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE_ #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_ +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV_ #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_ #endif /* KMP_FTN_UAPPEND */ @@ -654,4 +658,26 @@ #define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED \ GOMP_parallel_loop_nonmonotonic_guided +// All GOMP_5.0 symbols +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_nonmonotonic_runtime +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_maybe_nonmonotonic_runtime + #endif /* KMP_FTN_OS_H */ diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp index e0739a737d9c..ab4f27bfc067 100644 --- a/openmp/runtime/src/kmp_gsupport.cpp +++ b/openmp/runtime/src/kmp_gsupport.cpp @@ -275,7 +275,7 @@ void 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) { #define KMP_DISPATCH_FINI_CHUNK_ULL __kmp_aux_dispatch_fini_chunk_8u #define KMP_DISPATCH_NEXT_ULL __kmpc_dispatch_next_8u -// The parallel contruct +// The parallel construct #ifndef KMP_DEBUG static @@ -325,7 +325,7 @@ static enum sched_type schedule, long start, long end, long incr, long chunk_size) { - // Intialize the loop worksharing construct. + // Initialize the loop worksharing construct. KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size, schedule != kmp_sch_static); @@ -635,6 +635,15 @@ LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT), {}) LOOP_RUNTIME_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT), {}) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START), kmp_ord_static) @@ -911,6 +920,18 @@ LOOP_NEXT_ULL( LOOP_RUNTIME_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT), + {}) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT), {}) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START), kmp_ord_static) @@ -1513,6 +1534,12 @@ PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) { int gtid = __kmp_entry_gtid(); @@ -1985,6 +2012,28 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC, 45, KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED, 45, "GOMP_4.5"); +// GOMP_5.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START, + 50, "GOMP_5.0"); 
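For reference, the kind of GCC-compiled loop that binds to the GOMP_5.0 entry points registered here and continued below. This is a sketch that assumes GCC's OpenMP 5.0 lowering (the exact compiler version emitting these calls is an assumption, not stated by the patch), in which a plain schedule(runtime) loop uses the "maybe nonmonotonic" variants and an explicit nonmonotonic modifier uses the "nonmonotonic" ones:

    /* Built with gcc -fopenmp, linked against libomp's GOMP-compatible ABI. */
    void scale(long n, double a, double *x) {
      #pragma omp parallel for schedule(runtime)
      for (long i = 0; i < n; ++i) /* GOMP_loop_maybe_nonmonotonic_runtime_* */
        x[i] *= a;

      #pragma omp parallel for schedule(nonmonotonic: runtime)
      for (long i = 0; i < n; ++i) /* GOMP_loop_nonmonotonic_runtime_* */
        x[i] *= a;
    }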
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME, + 50, "GOMP_5.0"); + #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus diff --git a/openmp/runtime/src/kmp_i18n.cpp b/openmp/runtime/src/kmp_i18n.cpp index 53c442715b0b..d2651cfabdf3 100644 --- a/openmp/runtime/src/kmp_i18n.cpp +++ b/openmp/runtime/src/kmp_i18n.cpp @@ -639,7 +639,7 @@ kmp_msg_t __kmp_msg_format(unsigned id_arg, ...) { // numbers, for example: "%2$s %1$s". __kmp_str_buf_vprint(&buffer, __kmp_i18n_catgets(id), args); #elif KMP_OS_WINDOWS - // On Winodws, printf() family functions does not recognize GNU style + // On Windows, printf() family functions does not recognize GNU style // parameter numbers, so we have to use FormatMessage() instead. It recognizes // parameter numbers, e. g.: "%2!s! "%1!s!". { diff --git a/openmp/runtime/src/kmp_i18n.h b/openmp/runtime/src/kmp_i18n.h index 9d79a21bb2df..3fd6099ad149 100644 --- a/openmp/runtime/src/kmp_i18n.h +++ b/openmp/runtime/src/kmp_i18n.h @@ -32,7 +32,7 @@ extern "C" { __kmp_i18n_catgets() returns read-only string. It should not be freed. - KMP_I18N_STR macro simplifies acces to strings in message catalog a bit. + KMP_I18N_STR macro simplifies access to strings in message catalog a bit. Following two lines are equivalent: __kmp_i18n_catgets( kmp_i18n_str_Warning ) diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 2cc9e08278c4..8bf7ef2deb71 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -1239,6 +1239,9 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck, KMP_MB(); // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck); + // Synchronize writes to both runtime thread structures + // and writes in user code. + KMP_MB(); #ifdef DEBUG_QUEUING_LOCKS TRACE_LOCK(gtid + 1, "acq spin"); @@ -3018,7 +3021,7 @@ kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0}; // User lock allocator for dynamically dispatched indirect locks. Every entry of -// the indirect lock table holds the address and type of the allocated indrect +// the indirect lock table holds the address and type of the allocated indirect // lock (kmp_indirect_lock_t), and the size of the table doubles when it is // full. A destroyed indirect lock object is returned to the reusable pool of // locks, unique to each lock type. diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index 75a15f084c69..e54f6812b8b3 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -42,7 +42,7 @@ typedef struct ident ident_t; // ---------------------------------------------------------------------------- // We need to know the size of the area we can assume that the compiler(s) -// allocated for obects of type omp_lock_t and omp_nest_lock_t. The Intel +// allocated for objects of type omp_lock_t and omp_nest_lock_t. The Intel // compiler always allocates a pointer-sized area, as does visual studio. 
// // gcc however, only allocates 4 bytes for regular locks, even on 64-bit @@ -861,11 +861,11 @@ __kmp_destroy_nested_user_lock_with_checks(kmp_user_lock_p lck) { // // In other cases, the calling code really should differentiate between an // unimplemented function and one that is implemented but returning NULL / -// invalied value. If this is the case, no get function wrapper exists. +// invalid value. If this is the case, no get function wrapper exists. extern int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck); -// no set function; fields set durining local allocation +// no set function; fields set during local allocation extern const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck); @@ -899,7 +899,7 @@ static inline void __kmp_set_user_lock_flags(kmp_user_lock_p lck, } } -// The fuction which sets up all of the vtbl pointers for kmp_user_lock_t. +// The function which sets up all of the vtbl pointers for kmp_user_lock_t. extern void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind); // Macros for binding user lock functions. @@ -1128,7 +1128,7 @@ extern int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32); extern int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32); // Function tables for indirect locks. Set/unset/test differentiate functions -// with/withuot consistency checking. +// with/without consistency checking. extern void (*__kmp_indirect_init[])(kmp_user_lock_p); extern void (**__kmp_indirect_destroy)(kmp_user_lock_p); extern int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32); diff --git a/openmp/runtime/src/kmp_omp.h b/openmp/runtime/src/kmp_omp.h index 27b550d1f663..c7ba32a14338 100644 --- a/openmp/runtime/src/kmp_omp.h +++ b/openmp/runtime/src/kmp_omp.h @@ -47,7 +47,7 @@ typedef struct { } kmp_omp_nthr_item_t; typedef struct { - kmp_int32 num; // Number of items in the arrray. + kmp_int32 num; // Number of items in the array. kmp_uint64 array; // Address of array of kmp_omp_num_threads_item_t. } kmp_omp_nthr_info_t; diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index acd157db8e52..e0c8cf241044 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -3529,7 +3529,7 @@ static int __kmp_expand_threads(int nNeed) { // > __kmp_max_nth in one of two ways: // // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] - // may not be resused by another thread, so we may need to increase + // may not be reused by another thread, so we may need to increase // __kmp_threads_capacity to __kmp_max_nth + 1. // // 2) New foreign root(s) are encountered. We always register new foreign @@ -4515,11 +4515,11 @@ __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { #if KMP_AFFINITY_SUPPORTED // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. -// It calculats the worker + master thread's partition based upon the parent +// It calculates the worker + master thread's partition based upon the parent // thread's partition, and binds each worker to a thread in their partition. // The master thread's partition should already include its current binding. 
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { - // Copy the master thread's place partion to the team struct + // Copy the master thread's place partition to the team struct kmp_info_t *master_th = team->t.t_threads[0]; KMP_DEBUG_ASSERT(master_th != NULL); kmp_proc_bind_t proc_bind = team->t.t_proc_bind; @@ -5536,7 +5536,7 @@ kmp_team_t *__kmp_reap_team(kmp_team_t *team) { // locality problems on programs where the size of the hot team regularly // grew and shrunk. // -// Now, for single-level parallelism, the OMP tid is alway == gtid. +// Now, for single-level parallelism, the OMP tid is always == gtid. void __kmp_free_thread(kmp_info_t *this_th) { int gtid; kmp_info_t **scan; @@ -5609,7 +5609,7 @@ void __kmp_free_thread(kmp_info_t *this_th) { // scan is the address of a link in the list, possibly the address of // __kmp_thread_pool itself. // - // In the absence of nested parallism, the for loop will have 0 iterations. + // In the absence of nested parallelism, the for loop will have 0 iterations. if (__kmp_thread_pool_insert_pt != NULL) { scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); } else { @@ -6088,7 +6088,7 @@ void __kmp_internal_end_library(int gtid_req) { only place to clear __kmp_serial_init */ /* we'll check this later too, after we get the lock */ // 2009-09-06: We do not set g_abort without setting g_done. This check looks - // redundaant, because the next check will work in any case. + // redundant, because the next check will work in any case. if (__kmp_global.g.g_abort) { KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); /* TODO abort? */ @@ -8217,7 +8217,6 @@ __kmp_determine_reduction_method( return (retval); } - // this function is for testing set/get/determine reduce method kmp_int32 __kmp_get_reduce_method(void) { return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); @@ -8297,3 +8296,12 @@ int __kmp_pause_resource(kmp_pause_status_t level) { return 1; } } + + +void __kmp_omp_display_env(int verbose) { + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (__kmp_init_serial == 0) + __kmp_do_serial_initialize(); + __kmp_display_env_impl(!verbose, verbose); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp index 17c149806c89..28d0ffe0fb9d 100644 --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -667,7 +667,7 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid, // stride for next chunks calculation. // Last iteration flag set for the team that will execute // the last iteration of the loop. - // The routine is called for dist_schedue(static,chunk) only. + // The routine is called for dist_schedule(static,chunk) only. typedef typename traits_t<T>::unsigned_t UT; typedef typename traits_t<T>::signed_t ST; kmp_uint32 team_id; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index c7dec4d218c6..5745cbba585f 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -364,7 +364,7 @@ static void __kmp_stg_parse_int( char const *name, // I: Name of environment variable (used in warning messages). char const *value, // I: Value of environment variable to parse. - int min, // I: Miminal allowed value. + int min, // I: Minimum allowed value. int max, // I: Maximum allowed value. int *out // O: Output (parsed) value. 
) { @@ -1305,7 +1305,7 @@ static void __kmp_stg_print_max_task_priority(kmp_str_buf_t *buffer, } // __kmp_stg_print_max_task_priority // KMP_TASKLOOP_MIN_TASKS -// taskloop threashold to switch from recursive to linear tasks creation +// taskloop threshold to switch from recursive to linear tasks creation static void __kmp_stg_parse_taskloop_min_tasks(char const *name, char const *value, void *data) { int tmp; @@ -2041,7 +2041,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, // If we see a parse error, emit a warning and scan to the next ",". // // FIXME - there's got to be a better way to print an error -// message, hopefully without overwritting peices of buf. +// message, hopefully without overwriting peices of buf. #define EMIT_WARN(skip, errlist) \ { \ char ch; \ @@ -4395,7 +4395,7 @@ static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer, // ----------------------------------------------------------------------------- // KMP_HW_SUBSET (was KMP_PLACE_THREADS) -// The longest observable sequense of items is +// The longest observable sequence of items is // Socket-Node-Tile-Core-Thread // So, let's limit to 5 levels for now // The input string is usually short enough, let's use 512 limit for now @@ -5720,7 +5720,11 @@ void __kmp_env_print() { } // __kmp_env_print void __kmp_env_print_2() { + __kmp_display_env_impl(__kmp_display_env, __kmp_display_env_verbose); +} // __kmp_env_print_2 + +void __kmp_display_env_impl(int display_env, int display_env_verbose) { kmp_env_blk_t block; kmp_str_buf_t buffer; @@ -5737,9 +5741,9 @@ void __kmp_env_print_2() { for (int i = 0; i < __kmp_stg_count; ++i) { if (__kmp_stg_table[i].print != NULL && - ((__kmp_display_env && + ((display_env && strncmp(__kmp_stg_table[i].name, "OMP_", 4) == 0) || - __kmp_display_env_verbose)) { + display_env_verbose)) { __kmp_stg_table[i].print(&buffer, __kmp_stg_table[i].name, __kmp_stg_table[i].data); } @@ -5754,7 +5758,6 @@ void __kmp_env_print_2() { __kmp_str_buf_free(&buffer); __kmp_printf("\n"); - -} // __kmp_env_print_2 +} // end of file diff --git a/openmp/runtime/src/kmp_settings.h b/openmp/runtime/src/kmp_settings.h index 3247ffc6af74..d61c40694cf6 100644 --- a/openmp/runtime/src/kmp_settings.h +++ b/openmp/runtime/src/kmp_settings.h @@ -17,6 +17,7 @@ void __kmp_reset_global_vars(void); void __kmp_env_initialize(char const *); void __kmp_env_print(); void __kmp_env_print_2(); +void __kmp_display_env_impl(int display_env, int display_env_verbose); int __kmp_initial_threads_capacity(int req_nproc); void __kmp_init_dflt_team_nth(); diff --git a/openmp/runtime/src/kmp_stats.cpp b/openmp/runtime/src/kmp_stats.cpp index dabd0c35b85c..55ac18a4312c 100644 --- a/openmp/runtime/src/kmp_stats.cpp +++ b/openmp/runtime/src/kmp_stats.cpp @@ -270,7 +270,7 @@ void explicitTimer::stop(tsc_tick_count tick, /* ************* partitionedTimers member functions ************* */ partitionedTimers::partitionedTimers() { timer_stack.reserve(8); } -// initialize the paritioned timers to an initial timer +// initialize the partitioned timers to an initial timer void partitionedTimers::init(explicitTimer timer) { KMP_DEBUG_ASSERT(this->timer_stack.size() == 0); timer_stack.push_back(timer); @@ -609,7 +609,7 @@ void kmp_stats_output_module::printTimerStats(FILE *statsOut, totalStats[s].format(tag, true).c_str()); } - // Print historgram of statistics + // Print histogram of statistics if (theStats[0].haveHist()) { fprintf(statsOut, "\nTimer distributions\n"); for (int s = 0; s < TIMER_LAST; s++) 
{ diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h index ee95658fd9b7..a36528f3fca3 100644 --- a/openmp/runtime/src/kmp_stats.h +++ b/openmp/runtime/src/kmp_stats.h @@ -195,7 +195,7 @@ enum stats_state_e { // from a dynamically scheduled loop // OMP_critical -- Time thread spends executing critical section // OMP_critical_wait -- Time thread spends waiting to enter -// a critcal seciton +// a critical section // OMP_single -- Time spent executing a "single" region // OMP_master -- Time spent executing a "master" region // OMP_task_immediate -- Time spent executing non-deferred tasks @@ -522,7 +522,7 @@ public: void windup(); }; -// Special wrapper around the partioned timers to aid timing code blocks +// Special wrapper around the partitioned timers to aid timing code blocks // It avoids the need to have an explicit end, leaving the scope suffices. class blockPartitionedTimer { partitionedTimers *part_timers; @@ -920,7 +920,7 @@ extern kmp_stats_output_module __kmp_stats_output; #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) /*! - * \brief Initializes the paritioned timers to begin with name. + * \brief Initializes the partitioned timers to begin with name. * * @param name timer which you want this thread to begin with * diff --git a/openmp/runtime/src/kmp_str.h b/openmp/runtime/src/kmp_str.h index 09faadb68f1a..9e669bbe4742 100644 --- a/openmp/runtime/src/kmp_str.h +++ b/openmp/runtime/src/kmp_str.h @@ -72,12 +72,12 @@ struct kmp_str_fname { typedef struct kmp_str_fname kmp_str_fname_t; void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path); void __kmp_str_fname_free(kmp_str_fname_t *fname); -// Compares file name with specified patern. If pattern is NULL, any fname +// Compares file name with specified pattern. If pattern is NULL, any fname // matched. int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern); /* The compiler provides source locations in string form - ";file;func;line;col;;". It is not convenient for manupulation. This + ";file;func;line;col;;". It is not convenient for manipulation. This structure keeps source location in more convenient form. Usage: diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp index 6b5041988d5c..0fc022a03a2d 100644 --- a/openmp/runtime/src/kmp_stub.cpp +++ b/openmp/runtime/src/kmp_stub.cpp @@ -147,7 +147,7 @@ void *kmp_malloc(size_t size) { i; void *res; #if KMP_OS_WINDOWS - // If succesfull returns a pointer to the memory block, otherwise returns + // If successful returns a pointer to the memory block, otherwise returns // NULL. // Sets errno to ENOMEM or EINVAL if memory allocation failed or parameter // validation failed. 
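The kmp_str.h hunk above mentions the compiler-provided ident string ";file;func;line;col;;" that the runtime unpacks into a more convenient structure. A minimal, self-contained sketch of splitting that format (illustrative only, not the runtime's parser; the helper name and field handling here are made up for the example):

#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Split an ident string of the form ";file;func;line;col;;" on ';'.
static std::vector<std::string> split_ident(const std::string &psource) {
  std::vector<std::string> fields;
  std::string::size_type begin = 0, end;
  while ((end = psource.find(';', begin)) != std::string::npos) {
    fields.push_back(psource.substr(begin, end - begin));
    begin = end + 1;
  }
  fields.push_back(psource.substr(begin));
  return fields;
}

int main() {
  // The leading ';' produces an empty first field, then file, func, line, col.
  std::vector<std::string> f = split_ident(";demo.c;main;42;7;;");
  if (f.size() >= 5)
    std::printf("file=%s func=%s line=%d col=%d\n", f[1].c_str(), f[2].c_str(),
                std::atoi(f[3].c_str()), std::atoi(f[4].c_str()));
  return 0;
}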
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp index e1618f5cd9df..a654951f5b3b 100644 --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -35,7 +35,7 @@ static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0); static void __kmp_init_node(kmp_depnode_t *node) { node->dn.successors = NULL; - node->dn.task = NULL; // will point to the rigth task + node->dn.task = NULL; // will point to the right task // once dependences have been processed for (int i = 0; i < MAX_MTX_DEPS; ++i) node->dn.mtx_locks[i] = NULL; @@ -205,7 +205,7 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread, return new_head; } -static inline void __kmp_track_dependence(kmp_depnode_t *source, +static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source, kmp_depnode_t *sink, kmp_task_t *sink_task) { #ifdef KMP_SUPPORT_GRAPH_OUTPUT @@ -224,11 +224,14 @@ static inline void __kmp_track_dependence(kmp_depnode_t *source, */ if (ompt_enabled.ompt_callback_task_dependence) { kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); - kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); + ompt_data_t *sink_data; + if (sink_task) + sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data); + else + sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data; ompt_callbacks.ompt_callback(ompt_callback_task_dependence)( - &(task_source->ompt_task_info.task_data), - &(task_sink->ompt_task_info.task_data)); + &(task_source->ompt_task_info.task_data), sink_data); } #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ } @@ -246,7 +249,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, if (dep->dn.task) { KMP_ACQUIRE_DEPNODE(gtid, dep); if (dep->dn.task) { - __kmp_track_dependence(dep, node, task); + __kmp_track_dependence(gtid, dep, node, task); dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -272,7 +275,7 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, // synchronously add source to sink' list of successors KMP_ACQUIRE_DEPNODE(gtid, sink); if (sink->dn.task) { - __kmp_track_dependence(sink, source, task); + __kmp_track_dependence(gtid, sink, source, task); sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -473,8 +476,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, npredecessors++; // Update predecessors and obtain current value to check if there are still - // any outstandig dependences (some tasks may have finished while we processed - // the dependences) + // any outstanding dependences (some tasks may have finished while we + // processed the dependences) npredecessors = node->dn.npredecessors.fetch_add(npredecessors) + npredecessors; @@ -498,7 +501,7 @@ task'' @param noalias_dep_list List of depend items with no aliasing @return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not -suspendend and queued, or TASK_CURRENT_QUEUED if it was suspended and queued +suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued Schedule a non-thread-switchable task with dependences for execution */ @@ -540,47 +543,40 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, ompt_enabled.ompt_callback_dependences) { kmp_int32 i; - new_taskdata->ompt_task_info.ndeps = ndeps + 
ndeps_noalias; - new_taskdata->ompt_task_info.deps = - (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( - thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); - KMP_ASSERT(new_taskdata->ompt_task_info.deps != NULL); + KMP_ASSERT(ompt_deps != NULL); for (i = 0; i < ndeps; i++) { - new_taskdata->ompt_task_info.deps[i].variable.ptr = - (void *)dep_list[i].base_addr; + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; if (dep_list[i].flags.in && dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_inout; + ompt_deps[i].dependence_type = ompt_dependence_type_inout; else if (dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_out; + ompt_deps[i].dependence_type = ompt_dependence_type_out; else if (dep_list[i].flags.in) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_in; + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset; } for (i = 0; i < ndeps_noalias; i++) { - new_taskdata->ompt_task_info.deps[ndeps + i].variable.ptr = - (void *)noalias_dep_list[i].base_addr; + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_inout; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; else if (noalias_dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_out; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; else if (noalias_dep_list[i].flags.in) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_in; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; } ompt_callbacks.ompt_callback(ompt_callback_dependences)( - &(new_taskdata->ompt_task_info.task_data), - new_taskdata->ompt_task_info.deps, new_taskdata->ompt_task_info.ndeps); + &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps); /* We can now free the allocated memory for the dependencies */ - /* For OMPD we might want to delay the free until task_end */ - KMP_OMPT_DEPS_FREE(thread, new_taskdata->ompt_task_info.deps); - new_taskdata->ompt_task_info.deps = NULL; - new_taskdata->ompt_task_info.ndeps = 0; + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); } #endif /* OMPT_OPTIONAL */ #endif /* OMPT_SUPPORT */ @@ -642,6 +638,23 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, return ret; } +#if OMPT_SUPPORT +void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task, + ompt_data_t *taskwait_task_data) { + if (ompt_enabled.ompt_callback_task_schedule) { + ompt_data_t task_data = ompt_data_none; + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + current_task ? &(current_task->ompt_task_info.task_data) : &task_data, + ompt_task_switch, taskwait_task_data); + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + taskwait_task_data, ompt_task_complete, + current_task ? 
&(current_task->ompt_task_info.task_data) : &task_data); + } + current_task->ompt_task_info.frame.enter_frame.ptr = NULL; + *taskwait_task_data = ompt_data_none; +} +#endif /* OMPT_SUPPORT */ + /*! @ingroup TASKING @param loc_ref location of the original task directive @@ -668,6 +681,74 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; +#if OMPT_SUPPORT + // this function represents a taskwait construct with depend clause + // We signal 4 events: + // - creation of the taskwait task + // - dependences of the taskwait task + // - schedule and finish of the taskwait task + ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data; + KMP_ASSERT(taskwait_task_data->ptr == NULL); + if (ompt_enabled.enabled) { + if (!current_task->ompt_task_info.frame.enter_frame.ptr) + current_task->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_task_create) { + ompt_data_t task_data = ompt_data_none; + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + current_task ? &(current_task->ompt_task_info.task_data) : &task_data, + current_task ? &(current_task->ompt_task_info.frame) : NULL, + taskwait_task_data, + ompt_task_explicit | ompt_task_undeferred | ompt_task_mergeable, 1, + OMPT_GET_RETURN_ADDRESS(0)); + } + } + +#if OMPT_OPTIONAL + /* OMPT grab all dependences if requested by the tool */ + if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) { + kmp_int32 i; + + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + + KMP_ASSERT(ompt_deps != NULL); + + for (i = 0; i < ndeps; i++) { + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; + if (dep_list[i].flags.in && dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_inout; + else if (dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_out; + else if (dep_list[i].flags.in) + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + } + for (i = 0; i < ndeps_noalias; i++) { + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; + if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; + else if (noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; + else if (noalias_dep_list[i].flags.in) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + } + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + taskwait_task_data, ompt_deps, ompt_ndeps); + /* We can now free the allocated memory for the dependencies */ + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); + ompt_deps = NULL; + } +#endif /* OMPT_OPTIONAL */ +#endif /* OMPT_SUPPORT */ + // We can return immediately as: // - dependences are not computed in serial teams (except with proxy tasks) // - if the dephash is not yet created it means we have nothing to wait for @@ -682,6 +763,9 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, 
kmp_int32 ndeps, KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ return; } @@ -694,6 +778,9 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ return; } @@ -705,6 +792,9 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, __kmp_task_stealing_constraint); } +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n", gtid, loc_ref)); } diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 15ffc1454fe9..2ddc2e7a6fd7 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -298,6 +298,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, static void __kmp_realloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data) { kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size); kmp_int32 new_size = 2 * size; KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " @@ -381,8 +382,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { } else { __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); locked = 1; - // expand deque to push the task which is not allowed to execute - __kmp_realloc_task_deque(thread, thread_data); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } } // Lock the deque for the task push operation @@ -547,8 +551,6 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { task->ompt_task_info.frame.enter_frame = ompt_data_none; task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; - task->ompt_task_info.ndeps = 0; - task->ompt_task_info.deps = NULL; } // __ompt_task_start: @@ -573,24 +575,20 @@ static inline void __ompt_task_start(kmp_task_t *task, // __ompt_task_finish: // Build and trigger final task-schedule event -static inline void -__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task, - ompt_task_status_t status = ompt_task_complete) { - kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); - if (__kmp_omp_cancellation && taskdata->td_taskgroup && - taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { - status = ompt_task_cancel; - } - - /* let OMPT know that we're returning to the callee task */ +static inline void __ompt_task_finish(kmp_task_t *task, + kmp_taskdata_t *resumed_task, + ompt_task_status_t status) { if (ompt_enabled.ompt_callback_task_schedule) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + if (__kmp_omp_cancellation && taskdata->td_taskgroup && + taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { + status = ompt_task_cancel; + } + + /* let OMPT know that we're returning to the callee task */ 
ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( &(taskdata->ompt_task_info.task_data), status, - &((resumed_task ? resumed_task - : (taskdata->ompt_task_info.scheduling_parent - ? taskdata->ompt_task_info.scheduling_parent - : taskdata->td_parent)) - ->ompt_task_info.task_data)); + (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL)); } } #endif @@ -799,6 +797,10 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid, // gtid: global thread ID for calling thread // task: task to be finished // resumed_task: task to be resumed. (may be NULL if task is serialized) +// +// template<ompt>: effectively ompt_enabled.enabled!=0 +// the version with ompt=false is inlined, allowing to optimize away all ompt +// code in this case template <bool ompt> static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task) { @@ -845,10 +847,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, return; } } -#if OMPT_SUPPORT - if (ompt) - __ompt_task_finish(task, resumed_task); -#endif // Check mutexinoutset dependencies, release locks kmp_depnode_t *node = taskdata->td_depnode; @@ -861,7 +859,37 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, } } + // bookkeeping for resuming task: + // GEH - note tasking_ser => task_serial + KMP_DEBUG_ASSERT( + (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == + taskdata->td_flags.task_serial); + if (taskdata->td_flags.task_serial) { + if (resumed_task == NULL) { + resumed_task = taskdata->td_parent; // In a serialized task, the resumed + // task is the parent + } + } else { + KMP_DEBUG_ASSERT(resumed_task != + NULL); // verify that resumed task is passed as argument + } + + /* If the tasks' destructor thunk flag has been set, we need to invoke the + destructor thunk that has been generated by the compiler. The code is + placed here, since at this point other tasks might have been released + hence overlapping the destructor invocations with some other work in the + released tasks. The OpenMP spec is not specific on when the destructors + are invoked, so we should be free to choose. */ + if (taskdata->td_flags.destructors_thunk) { + kmp_routine_entry_t destr_thunk = task->data1.destructors; + KMP_ASSERT(destr_thunk); + destr_thunk(gtid, task); + } + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + bool detach = false; if (taskdata->td_flags.detachable == TASK_DETACHABLE) { if (taskdata->td_allow_completion_event.type == @@ -870,21 +898,41 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); if (taskdata->td_allow_completion_event.type == KMP_EVENT_ALLOW_COMPLETION) { + // task finished execution + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task + +#if OMPT_SUPPORT + // For a detached task, which is not completed, we switch back + // the omp_fulfill_event signals completion + // locking is necessary to avoid a race with ompt_task_late_fulfill + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_detach); +#endif + + // no access to taskdata after this point! + // __kmp_fulfill_event might free taskdata at any time from now + taskdata->td_flags.proxy = TASK_PROXY; // proxify! 
detach = true; } __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); } } - KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); - KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); if (!detach) { taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPT_SUPPORT + // This is not a detached task, we are done here + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_complete); +#endif + // Only need to keep track of count if team parallel and tasking not - // serialized - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + // serialized, or task is detachable and event has already been fulfilled + if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || + taskdata->td_flags.detachable == TASK_DETACHABLE) { // Predecrement simulated by "- 1" calculation children = KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; @@ -897,45 +945,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, // with the proxy task as origin __kmp_release_deps(gtid, taskdata); } + // td_flags.executing must be marked as 0 after __kmp_release_deps has been + // called. Othertwise, if a task is executed immediately from the + // release_deps code, the flag will be reset to 1 again by this same + // function + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task } - // td_flags.executing must be marked as 0 after __kmp_release_deps has been - // called. Othertwise, if a task is executed immediately from the release_deps - // code, the flag will be reset to 1 again by this same function - KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); - taskdata->td_flags.executing = 0; // suspend the finishing task KA_TRACE( 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", gtid, taskdata, children)); - /* If the tasks' destructor thunk flag has been set, we need to invoke the - destructor thunk that has been generated by the compiler. The code is - placed here, since at this point other tasks might have been released - hence overlapping the destructor invokations with some other work in the - released tasks. The OpenMP spec is not specific on when the destructors - are invoked, so we should be free to choose. */ - if (taskdata->td_flags.destructors_thunk) { - kmp_routine_entry_t destr_thunk = task->data1.destructors; - KMP_ASSERT(destr_thunk); - destr_thunk(gtid, task); - } - - // bookkeeping for resuming task: - // GEH - note tasking_ser => task_serial - KMP_DEBUG_ASSERT( - (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == - taskdata->td_flags.task_serial); - if (taskdata->td_flags.task_serial) { - if (resumed_task == NULL) { - resumed_task = taskdata->td_parent; // In a serialized task, the resumed - // task is the parent - } - } else { - KMP_DEBUG_ASSERT(resumed_task != - NULL); // verify that resumed task is passed as argument - } - // Free this task and then ancestor tasks if they have no children. 
// Restore th_current_task first as suggested by John: // johnmc: if an asynchronous inquiry peers into the runtime system @@ -1304,7 +1326,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_flags.task_serial = (parent_task->td_flags.final || taskdata->td_flags.team_serial || - taskdata->td_flags.tasking_ser); + taskdata->td_flags.tasking_ser || flags->merged_if0); taskdata->td_flags.started = 0; taskdata->td_flags.executing = 0; @@ -1411,7 +1433,7 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, // // gtid: global thread ID of caller // task: the task to invoke -// current_task: the task to resume after task invokation +// current_task: the task to resume after task invocation static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); @@ -2911,7 +2933,7 @@ static inline int __kmp_execute_tasks_template( // met, then return now, so that the barrier gather/release pattern can // proceed. If this thread is in the last spin loop in the barrier, // waiting to be released, we know that the termination condition will not - // be satisified, so don't waste any cycles checking it. + // be satisfied, so don't waste any cycles checking it. if (flag == NULL || (!final_spin && flag->done_check())) { KA_TRACE( 15, @@ -3096,7 +3118,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * to each thread in the team, so that it can steal work from it. * * Enter the existence of the kmp_task_team_t struct. It employs a reference - * counting mechanims, and is allocated by the master thread before calling + * counting mechanism, and is allocated by the master thread before calling * __kmp_<barrier_kind>_release, and then is release by the last thread to * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes * of the kmp_task_team_t structs for consecutive barriers can overlap @@ -3107,7 +3129,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * We currently use the existence of the threads array as an indicator that * tasks were spawned since the last barrier. If the structure is to be * useful outside the context of tasking, then this will have to change, but - * not settting the field minimizes the performance impact of tasking on + * not setting the field minimizes the performance impact of tasking on * barriers, when no explicit tasks were spawned (pushed, actually). */ @@ -3651,7 +3673,11 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, return result; __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); - __kmp_realloc_task_deque(thread, thread_data); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } else { @@ -3847,22 +3873,30 @@ void __kmp_fulfill_event(kmp_event_t *event) { bool detached = false; int gtid = __kmp_get_gtid(); + // The associated task might have completed or could be completing at this + // point. + // We need to take the lock to avoid races + __kmp_acquire_tas_lock(&event->lock, gtid); if (taskdata->td_flags.proxy == TASK_PROXY) { - // The associated task code completed before this call and detached. detached = true; - event->type = KMP_EVENT_UNINITIALIZED; } else { - // The associated task has not completed but could be completing at this - // point. 
- // We need to take the lock to avoid races - __kmp_acquire_tas_lock(&event->lock, gtid); - if (taskdata->td_flags.proxy == TASK_PROXY) - detached = true; - event->type = KMP_EVENT_UNINITIALIZED; - __kmp_release_tas_lock(&event->lock, gtid); +#if OMPT_SUPPORT + // The OMPT event must occur under mutual exclusion, + // otherwise the tool might access ptask after free + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); +#endif } + event->type = KMP_EVENT_UNINITIALIZED; + __kmp_release_tas_lock(&event->lock, gtid); if (detached) { +#if OMPT_SUPPORT + // We free ptask afterwards and know the task is finished, + // so locking is not necessary + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); +#endif // If the task detached complete the proxy task if (gtid >= 0) { kmp_team_t *team = taskdata->td_team; @@ -3888,14 +3922,13 @@ void __kmp_fulfill_event(kmp_event_t *event) { kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { kmp_task_t *task; kmp_taskdata_t *taskdata; - kmp_taskdata_t *taskdata_src; - kmp_taskdata_t *parent_task = thread->th.th_current_task; + kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); + kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task size_t shareds_offset; size_t task_size; KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src)); - taskdata_src = KMP_TASK_TO_TASKDATA(task_src); KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == TASK_FULL); // it should not be proxy task KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); @@ -3923,9 +3956,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { } taskdata->td_alloc_thread = thread; taskdata->td_parent = parent_task; - taskdata->td_taskgroup = - parent_task - ->td_taskgroup; // task inherits the taskgroup from the parent task + // task inherits the taskgroup from the parent task + taskdata->td_taskgroup = parent_task->td_taskgroup; + // tied task needs to initialize the td_last_tied at creation, + // untied one does this when it is scheduled for execution + if (taskdata->td_flags.tiedness == TASK_TIED) + taskdata->td_last_tied = taskdata; // Only need to keep track of child task counts if team parallel and tasking // not serialized @@ -4255,7 +4291,7 @@ int __kmp_taskloop_task(int gtid, void *ptask) { // grainsize Number of loop iterations per task // extras Number of chunks with grainsize+1 iterations // tc Iterations count -// num_t_min Threashold to launch tasks recursively +// num_t_min Threshold to launch tasks recursively // task_dup Tasks duplication routine // codeptr_ra Return address for OMPT events void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, @@ -4267,7 +4303,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, void *codeptr_ra, #endif void *task_dup) { -#if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); KMP_DEBUG_ASSERT(num_tasks > num_t_min); @@ -4275,7 +4310,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, task_dup)); -#endif p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; kmp_uint64 lower = *lb; kmp_info_t *thread = __kmp_threads[gtid]; @@ -4319,9 +4353,14 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, *ub = ub0; // adjust upper bound 
for the 1st half // create auxiliary task for 2nd half of the loop + // make sure new task has same parent task as the pattern task + kmp_taskdata_t *current_task = thread->th.th_current_task; + thread->th.th_current_task = taskdata->td_parent; kmp_task_t *new_task = __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), sizeof(__taskloop_params_t), &__kmp_taskloop_task); + // restore current task + thread->th.th_current_task = current_task; __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; p->task = next_task; p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); diff --git a/openmp/runtime/src/kmp_utility.cpp b/openmp/runtime/src/kmp_utility.cpp index 44a99d0455b3..6e6785deb445 100644 --- a/openmp/runtime/src/kmp_utility.cpp +++ b/openmp/runtime/src/kmp_utility.cpp @@ -194,7 +194,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { KA_TRACE(trace_level, (" PSN")); } if ((buf.edx >> 19) & 1) { - /* CLFULSH - Cache Flush Instruction Available */ + /* CLFLUSH - Cache Flush Instruction Available */ cflush_size = data[1] * 8; /* Bits 15-08: CLFLUSH line size = 8 (64 bytes) */ KA_TRACE(trace_level, (" CLFLUSH(%db)", cflush_size)); diff --git a/openmp/runtime/src/kmp_version.h b/openmp/runtime/src/kmp_version.h index 9e726b3805b2..6ce40eecb5de 100644 --- a/openmp/runtime/src/kmp_version.h +++ b/openmp/runtime/src/kmp_version.h @@ -30,7 +30,7 @@ extern "C" { just before version string. */ #define KMP_VERSION_MAGIC_STR "\x00@(#) " #define KMP_VERSION_MAGIC_LEN 6 // Length of KMP_VERSION_MAGIC_STR. -#define KMP_VERSION_PREF_STR "Intel(R) OMP " +#define KMP_VERSION_PREF_STR "LLVM OMP " #define KMP_VERSION_PREFIX KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR /* declare all the version string constants for KMP_VERSION env. variable */ diff --git a/openmp/runtime/src/kmp_wrapper_malloc.h b/openmp/runtime/src/kmp_wrapper_malloc.h index 1544c5df3d64..c027e0b297d0 100644 --- a/openmp/runtime/src/kmp_wrapper_malloc.h +++ b/openmp/runtime/src/kmp_wrapper_malloc.h @@ -15,11 +15,11 @@ #define KMP_WRAPPER_MALLOC_H /* This header serves for 3 purposes: - 1. Declaring standard memory allocation rourines in OS-independent way. + 1. Declaring standard memory allocation routines in OS-independent way. 2. Passing source location info through memory allocation wrappers. 3. Enabling native memory debugging capabilities. - 1. Declaring standard memory allocation rourines in OS-independent way. + 1. Declaring standard memory allocation routines in OS-independent way. ----------------------------------------------------------------------- On Linux* OS, alloca() function is declared in <alloca.h> header, while on Windows* OS there is no <alloca.h> header, function _alloca() (note @@ -103,9 +103,9 @@ #error Unknown or unsupported OS. #endif -/* KMP_SRC_LOC_DECL -- Declaring source location paramemters, to be used in +/* KMP_SRC_LOC_DECL -- Declaring source location parameters, to be used in function declaration. - KMP_SRC_LOC_PARM -- Source location paramemters, to be used to pass + KMP_SRC_LOC_PARM -- Source location parameters, to be used to pass parameters to underlying levels. KMP_SRC_LOC_CURR -- Source location arguments describing current location, to be used at top-level. 
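The kmp_wrapper_malloc.h hunk above documents macros for threading source-location information through the memory allocation wrappers: a declaration form, a parameter-forwarding form, and a "current location" form. A hedged sketch of how such macros compose (the names and expansions below are simplified stand-ins, not the runtime's KMP_SRC_LOC_* definitions, which also depend on the debug configuration):

#include <cstdio>
#include <cstdlib>

#define SRC_LOC_DECL , char const *file_, int line_ // appended to declarations
#define SRC_LOC_PARM , file_, line_                 // forwarded to callees
#define SRC_LOC_CURR , __FILE__, __LINE__           // supplied at the top level

// Bottom-level allocator that records where the request originated.
static void *malloc_src_loc(size_t size SRC_LOC_DECL) {
  std::printf("allocating %zu bytes, requested at %s:%d\n", size, file_, line_);
  return std::malloc(size);
}

// An intermediate wrapper forwards the location it received unchanged.
static void *wrapper(size_t size SRC_LOC_DECL) {
  return malloc_src_loc(size SRC_LOC_PARM);
}

int main() {
  void *p = wrapper(64 SRC_LOC_CURR); // top-level call records this file/line
  std::free(p);
  return 0;
}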
diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h index 958b5943af38..f753ab4ebc6d 100644 --- a/openmp/runtime/src/ompt-internal.h +++ b/openmp/runtime/src/ompt-internal.h @@ -57,8 +57,6 @@ typedef struct { ompt_data_t task_data; struct kmp_taskdata *scheduling_parent; int thread_num; - int ndeps; - ompt_dependence_t *deps; } ompt_task_info_t; typedef struct { diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp index 7fb81bb7d1a0..a7288f08a661 100644 --- a/openmp/runtime/src/ompt-specific.cpp +++ b/openmp/runtime/src/ompt-specific.cpp @@ -262,8 +262,6 @@ void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, lwt->ompt_task_info.frame.enter_frame = ompt_data_none; lwt->ompt_task_info.frame.exit_frame = ompt_data_none; lwt->ompt_task_info.scheduling_parent = NULL; - lwt->ompt_task_info.deps = NULL; - lwt->ompt_task_info.ndeps = 0; lwt->heap = 0; lwt->parent = 0; } diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h index 5ba240c1a950..fa5c5662c649 100644 --- a/openmp/runtime/src/ompt-specific.h +++ b/openmp/runtime/src/ompt-specific.h @@ -102,7 +102,7 @@ inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) { inline const char *ompt_get_runtime_version() { return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; } -#endif // OMPT_SUPPRORT +#endif // OMPT_SUPPORT // macros providing the OMPT callbacks for reduction clause #if OMPT_SUPPORT && OMPT_OPTIONAL diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h index d730c48ec705..db1c0d0d9d21 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h @@ -2303,7 +2303,7 @@ ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_ * name of the metadata), and a value (the actual data). The encoding of * the value depends on the type of the metadata. * - * The type of metadata is specified by an enumerated type __itt_metdata_type. + * The type of metadata is specified by an enumerated type __itt_metadata_type. * @{ */ @@ -3196,7 +3196,7 @@ ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, #define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b) #define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex) #else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_relation_add_to_current_ex(domain,clock_domain,timestame,relation,tail) +#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail) #define __itt_relation_add_to_current_ex_ptr 0 #define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail) #define __itt_relation_add_ex_ptr 0 diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp index 8f9e2a655ae4..4936b9baaf80 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp @@ -762,7 +762,7 @@ static const char* __itt_fsplit(const char* s, const char* sep, const char** out /* This function return value of env variable that placed into static buffer. * !!! The same static buffer is used for subsequent calls. !!! - * This was done to aviod dynamic allocation for few calls. + * This was done to avoid dynamic allocation for few calls. * Actually we need this function only four times. 
*/ static const char* __itt_get_env_var(const char* name) @@ -1012,7 +1012,7 @@ static void __itt_reinit_all_pointers(void) static void __itt_nullify_all_pointers(void) { int i; - /* Nulify all pointers except domain_create, string_handle_create and counter_create */ + /* Nullify all pointers except domain_create, string_handle_create and counter_create */ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; } diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 1daa3d31047e..3b5910fc95e8 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -164,7 +164,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { if (gCode > 0) { // Linux* OS only // The optimal situation: the OS returns the size of the buffer it expects. // - // A verification of correct behavior is that Isetaffinity on a NULL + // A verification of correct behavior is that setaffinity on a NULL // buffer with the same size fails with errno set to EFAULT. sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL); KA_TRACE(30, ("__kmp_affinity_determine_capable: " @@ -286,7 +286,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { if (gCode == 0) { KMP_AFFINITY_ENABLE(KMP_CPU_SET_SIZE_LIMIT); KA_TRACE(10, ("__kmp_affinity_determine_capable: " - "affinity supported (mask size %d)\n"< + "affinity supported (mask size %d)\n", (int)__kmp_affin_mask_size)); KMP_INTERNAL_FREE(buf); return; @@ -2207,7 +2207,7 @@ int __kmp_get_load_balance(int max) { #else // Linux* OS -// The fuction returns number of running (not sleeping) threads, or -1 in case +// The function returns number of running (not sleeping) threads, or -1 in case // of error. Error could be reported if Linux* OS kernel too old (without // "/proc" support). Counting running threads stops if max running threads // encountered. |
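The z_Linux_util.cpp hunks above are part of __kmp_affinity_determine_capable, which probes how large an affinity mask buffer the kernel expects before enabling affinity support (and double-checks by calling setaffinity on a NULL buffer, expecting EFAULT). A rough sketch of the same probing idea using the glibc sched_getaffinity wrapper rather than the raw syscalls the runtime issues (the return-value conventions differ, and the starting size and cap below are arbitrary choices for the example):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <cerrno>
#include <cstdio>
#include <sched.h>

int main() {
  // Grow the mask buffer until the kernel accepts it (EINVAL means too small).
  for (size_t bytes = sizeof(cpu_set_t); bytes <= 4096; bytes *= 2) {
    cpu_set_t *mask = CPU_ALLOC((int)(bytes * 8)); // CPU_ALLOC takes a CPU count
    if (!mask)
      return 1;
    size_t alloc_size = CPU_ALLOC_SIZE((int)(bytes * 8));
    if (sched_getaffinity(0, alloc_size, mask) == 0) {
      std::printf("affinity mask size accepted: %zu bytes\n", alloc_size);
      CPU_FREE(mask);
      return 0;
    }
    int err = errno; // EINVAL means the buffer was too small; try a bigger one
    CPU_FREE(mask);
    if (err != EINVAL) {
      std::printf("sched_getaffinity failed (errno %d), affinity not usable\n", err);
      return 1;
    }
  }
  std::printf("could not determine affinity mask size\n");
  return 1;
}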