Diffstat (limited to 'openmp/runtime')
44 files changed, 620 insertions, 332 deletions
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index f76619ec0e3c..45a294b666fa 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -533,6 +533,7 @@ kmp_set_disp_num_buffers 890 omp_pause_resource_all 757 omp_get_supported_active_levels 758 omp_fulfill_event 759 + omp_display_env 733 omp_null_allocator DATA omp_default_mem_alloc DATA diff --git a/openmp/runtime/src/exports_so.txt b/openmp/runtime/src/exports_so.txt index f7de5fd6474f..30222418163d 100644 --- a/openmp/runtime/src/exports_so.txt +++ b/openmp/runtime/src/exports_so.txt @@ -119,5 +119,7 @@ GOMP_4.0 { } GOMP_3.0; GOMP_4.5 { } GOMP_4.0; +GOMP_5.0 { +} GOMP_4.5; # end of file # diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt index 3a3035b26673..b2ba63c02870 100644 --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -324,7 +324,7 @@ WrongMessageCatalog "Incompatible message catalog \"%1$s\": Version \"% StgIgnored "%1$s: ignored because %2$s has been defined" # %1, -- name of ignored variable, %2 -- name of variable with higher priority. OBSOLETE "%1$s: overrides %3$s specified before" - # %1, %2 -- name and value of the overriding variable, %3 -- name of overriden variable. + # %1, %2 -- name and value of the overriding variable, %3 -- name of overridden variable. AffTilesNoHWLOC "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead" AffTilesNoTiles "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead" TopologyExtraTile "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)" diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index 2246e7012bee..f62afc2b693d 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -228,36 +228,36 @@ typedef uintptr_t omp_uintptr_t; typedef enum { - OMP_ATK_THREADMODEL = 1, - OMP_ATK_ALIGNMENT = 2, - OMP_ATK_ACCESS = 3, - OMP_ATK_POOL_SIZE = 4, - OMP_ATK_FALLBACK = 5, - OMP_ATK_FB_DATA = 6, - OMP_ATK_PINNED = 7, - OMP_ATK_PARTITION = 8 + omp_atk_threadmodel = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 } omp_alloctrait_key_t; typedef enum { - OMP_ATV_FALSE = 0, - OMP_ATV_TRUE = 1, - OMP_ATV_DEFAULT = 2, - OMP_ATV_CONTENDED = 3, - OMP_ATV_UNCONTENDED = 4, - OMP_ATV_SEQUENTIAL = 5, - OMP_ATV_PRIVATE = 6, - OMP_ATV_ALL = 7, - OMP_ATV_THREAD = 8, - OMP_ATV_PTEAM = 9, - OMP_ATV_CGROUP = 10, - OMP_ATV_DEFAULT_MEM_FB = 11, - OMP_ATV_NULL_FB = 12, - OMP_ATV_ABORT_FB = 13, - OMP_ATV_ALLOCATOR_FB = 14, - OMP_ATV_ENVIRONMENT = 15, - OMP_ATV_NEAREST = 16, - OMP_ATV_BLOCKED = 17, - OMP_ATV_INTERLEAVED = 18 + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_default = 2, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_sequential = 5, + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 } omp_alloctrait_value_t; typedef struct { @@ -355,6 +355,9 @@ extern int __KAI_KMPC_CONVENTION omp_get_supported_active_levels(void); + /* OpenMP 5.1 Display Environment */ + extern void omp_display_env(int verbose); + 
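The omp_display_env() entry point declared above exposes the OMP_DISPLAY_ENV report to user code. A minimal usage sketch (illustrative, not part of the patch); a non-zero argument requests the verbose listing, which in this runtime also covers the implementation-specific KMP_* settings:

    #include <omp.h>

    int main(void) {
      omp_display_env(0); /* same report as OMP_DISPLAY_ENV=TRUE */
      omp_display_env(1); /* verbose report, as with OMP_DISPLAY_ENV=VERBOSE */
      return 0;
    }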
# undef __KAI_KMPC_CONVENTION # undef __KMP_IMP diff --git a/openmp/runtime/src/include/omp_lib.f.var b/openmp/runtime/src/include/omp_lib.f.var index d631438f55ad..bf40c78707a8 100644 --- a/openmp/runtime/src/include/omp_lib.f.var +++ b/openmp/runtime/src/include/omp_lib.f.var @@ -488,6 +488,11 @@ integer (kind=kmp_size_t_kind) omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + use omp_lib_kinds + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + ! *** ! *** kmp_* entry points ! *** diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var index ac568486d204..fbbb7b9df94d 100644 --- a/openmp/runtime/src/include/omp_lib.f90.var +++ b/openmp/runtime/src/include/omp_lib.f90.var @@ -503,6 +503,12 @@ integer (kind=kmp_size_t_kind) :: omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + use omp_lib_kinds + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + + ! *** ! *** kmp_* entry points ! *** diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var index 8775128157bd..f1b6b03f7725 100644 --- a/openmp/runtime/src/include/omp_lib.h.var +++ b/openmp/runtime/src/include/omp_lib.h.var @@ -580,6 +580,11 @@ integer (kind=kmp_size_t_kind) :: omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + import + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + ! *** ! *** kmp_* entry points ! *** diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 086ab3bb011e..5f9b7c895619 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -872,36 +872,36 @@ extern int __kmp_hws_abs_flag; // absolute or per-item number requested typedef uintptr_t omp_uintptr_t; typedef enum { - OMP_ATK_THREADMODEL = 1, - OMP_ATK_ALIGNMENT = 2, - OMP_ATK_ACCESS = 3, - OMP_ATK_POOL_SIZE = 4, - OMP_ATK_FALLBACK = 5, - OMP_ATK_FB_DATA = 6, - OMP_ATK_PINNED = 7, - OMP_ATK_PARTITION = 8 + omp_atk_threadmodel = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 } omp_alloctrait_key_t; typedef enum { - OMP_ATV_FALSE = 0, - OMP_ATV_TRUE = 1, - OMP_ATV_DEFAULT = 2, - OMP_ATV_CONTENDED = 3, - OMP_ATV_UNCONTENDED = 4, - OMP_ATV_SEQUENTIAL = 5, - OMP_ATV_PRIVATE = 6, - OMP_ATV_ALL = 7, - OMP_ATV_THREAD = 8, - OMP_ATV_PTEAM = 9, - OMP_ATV_CGROUP = 10, - OMP_ATV_DEFAULT_MEM_FB = 11, - OMP_ATV_NULL_FB = 12, - OMP_ATV_ABORT_FB = 13, - OMP_ATV_ALLOCATOR_FB = 14, - OMP_ATV_ENVIRONMENT = 15, - OMP_ATV_NEAREST = 16, - OMP_ATV_BLOCKED = 17, - OMP_ATV_INTERLEAVED = 18 + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_default = 2, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_sequential = 5, + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 } omp_alloctrait_value_t; typedef void *omp_memspace_handle_t; @@ -1548,7 +1548,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 tc; kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put after ub */ - + kmp_lock_t 
*th_steal_lock; // lock used for chunk stealing // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. @@ -1581,7 +1581,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 tc; /* trip count (number of iterations) */ kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put after ub */ - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -1722,11 +1722,7 @@ typedef struct kmp_disp { kmp_int32 th_disp_index; kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags - union { // we can use union here because doacross cannot be used in - // nonmonotonic loops - kmp_int64 *th_doacross_info; // info on loop bounds - kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable) - }; + kmp_int64 *th_doacross_info; // info on loop bounds #if KMP_USE_INTERNODE_ALIGNMENT char more_padding[INTERNODE_CACHE_LINE]; #endif @@ -2435,10 +2431,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { int th_teams_level; /* save initial level of teams construct */ /* it is 0 on device but may be any on host */ -/* The blocktime info is copied from the team struct to the thread sruct */ -/* at the start of a barrier, and the values stored in the team are used */ -/* at points in the code where the team struct is no longer guaranteed */ -/* to exist (from the POV of worker threads). */ +/* The blocktime info is copied from the team struct to the thread struct */ +/* at the start of a barrier, and the values stored in the team are used */ +/* at points in the code where the team struct is no longer guaranteed */ +/* to exist (from the POV of worker threads). */ #if KMP_USE_MONITOR int th_team_bt_intervals; int th_team_bt_set; @@ -3908,6 +3904,8 @@ static inline void __kmp_resume_if_hard_paused() { } } +extern void __kmp_omp_display_env(int verbose); + #ifdef __cplusplus } #endif diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 650e9ff35e1b..47e70477ced6 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -601,7 +601,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, int depth = 3; int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread - int labels[3] = {0}; // package [,node] [,tile] - head of lables array + int labels[3] = {0}; // package [,node] [,tile] - head of labels array if (__kmp_numa_detected) ++depth; if (__kmp_tile_depth) @@ -828,7 +828,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, } int depth_full = depth; // number of levels before compressing - // Find any levels with radiix 1, and remove them from the map + // Find any levels with radix 1, and remove them from the map // (except for the package level). depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, levels); @@ -918,7 +918,7 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, return 0; } - // Contruct the data structure to be returned. + // Construct the data structure to be returned. 
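The allocator trait enumerators renamed in the omp.h and kmp.h hunks above (omp_atk_* / omp_atv_*) match the lowercase spellings required by the OpenMP 5.0 specification. A sketch of user code exercising them through the standard allocator API (illustrative only, not part of the patch):

    #include <omp.h>
    #include <stdlib.h>

    int main(void) {
      /* Request 64-byte alignment; fall back to the default memory
         allocator if the request cannot be satisfied. */
      omp_alloctrait_t traits[2] = {
          {omp_atk_alignment, 64},
          {omp_atk_fallback, omp_atv_default_mem_fb}};
      omp_allocator_handle_t al =
          omp_init_allocator(omp_default_mem_space, 2, traits);
      double *buf = (double *)omp_alloc(1000 * sizeof(double), al);
      if (buf == NULL)
        return EXIT_FAILURE;
      omp_free(buf, al);
      omp_destroy_allocator(al);
      return 0;
    }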
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); int avail_ct = 0; @@ -967,7 +967,7 @@ static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, return -1; } - // Contruct the data structure to be returned. + // Construct the data structure to be returned. *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); @@ -1849,7 +1849,7 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, return 0; } - // Find any levels with radiix 1, and remove them from the map + // Find any levels with radix 1, and remove them from the map // (except for the package level). int new_depth = 0; for (level = 0; level < depth; level++) { @@ -1968,7 +1968,8 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; else @@ -1982,7 +1983,8 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_nThreadsPerCore; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2 * __kmp_nThreadsPerCore; @@ -4328,7 +4330,7 @@ static void __kmp_aux_affinity_initialize(void) { } #endif // KMP_USE_HWLOC -// If the user has specified that a paricular topology discovery method is to be +// If the user has specified that a particular topology discovery method is to be // used, then we abort if that method fails. The exception is group affinity, // which might have been implicitly set. @@ -4647,7 +4649,7 @@ static void __kmp_aux_affinity_initialize(void) { #undef KMP_EXIT_AFF_NONE void __kmp_affinity_initialize(void) { - // Much of the code above was written assumming that if a machine was not + // Much of the code above was written assuming that if a machine was not // affinity capable, then __kmp_affinity_type == affinity_none. We now // explicitly represent this as __kmp_affinity_type == affinity_disabled. // There are too many checks for __kmp_affinity_type == affinity_none @@ -4713,7 +4715,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { KMP_CPU_ZERO(th->th.th_affin_mask); } - // Copy the thread mask to the kmp_info_t strucuture. If + // Copy the thread mask to the kmp_info_t structure. If // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, // then the full mask is the same as the mask of the initialization thread. @@ -4823,7 +4825,7 @@ void __kmp_affinity_set_place(int gtid) { (th->th.th_new_place >= th->th.th_last_place)); } - // Copy the thread mask to the kmp_info_t strucuture, + // Copy the thread mask to the kmp_info_t structure, // and set this thread's affinity. 
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h index f270bb6dbb8d..664a42393191 100644 --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -303,8 +303,9 @@ class KMPNativeAffinity : public KMPAffinity { int retval = syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int retval = + int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { return 0; @@ -322,8 +323,9 @@ class KMPNativeAffinity : public KMPAffinity { int retval = syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int retval = + int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { return 0; diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp index 16893d0ffca5..314f56d9b5c6 100644 --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -186,7 +186,7 @@ typedef struct thr_data { -1: not all pool blocks are the same size >0: (common) block size for all bpool calls made so far */ - bfhead_t *last_pool; /* Last pool owned by this thread (delay dealocation) */ + bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */ } thr_data_t; /* Minimum allocation quantum: */ @@ -195,7 +195,7 @@ typedef struct thr_data { #define MaxSize \ (bufsize)( \ ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1))) -// Maximun for the requested size. +// Maximum for the requested size. /* End sentinel: value placed in bsize field of dummy block delimiting end of pool block. The most negative number which will fit in a @@ -577,7 +577,7 @@ static void *bget(kmp_info_t *th, bufsize requested_size) { if (thr->acqfcn != 0) { if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) { /* Request is too large to fit in a single expansion block. - Try to satisy it by a direct buffer acquisition. */ + Try to satisfy it by a direct buffer acquisition. 
*/ bdhead_t *bdh; size += sizeof(bdhead_t) - sizeof(bhead_t); @@ -1348,27 +1348,27 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, al->memspace = ms; // not used currently for (i = 0; i < ntraits; ++i) { switch (traits[i].key) { - case OMP_ATK_THREADMODEL: - case OMP_ATK_ACCESS: - case OMP_ATK_PINNED: + case omp_atk_threadmodel: + case omp_atk_access: + case omp_atk_pinned: break; - case OMP_ATK_ALIGNMENT: + case omp_atk_alignment: al->alignment = traits[i].value; KMP_ASSERT(IS_POWER_OF_TWO(al->alignment)); break; - case OMP_ATK_POOL_SIZE: + case omp_atk_pool_size: al->pool_size = traits[i].value; break; - case OMP_ATK_FALLBACK: + case omp_atk_fallback: al->fb = (omp_alloctrait_value_t)traits[i].value; KMP_DEBUG_ASSERT( - al->fb == OMP_ATV_DEFAULT_MEM_FB || al->fb == OMP_ATV_NULL_FB || - al->fb == OMP_ATV_ABORT_FB || al->fb == OMP_ATV_ALLOCATOR_FB); + al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb || + al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb); break; - case OMP_ATK_FB_DATA: + case omp_atk_fb_data: al->fb_data = RCAST(kmp_allocator_t *, traits[i].value); break; - case OMP_ATK_PARTITION: + case omp_atk_partition: al->memkind = RCAST(void **, traits[i].value); break; default: @@ -1377,17 +1377,17 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, } if (al->fb == 0) { // set default allocator - al->fb = OMP_ATV_DEFAULT_MEM_FB; + al->fb = omp_atv_default_mem_fb; al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al->fb_data != NULL); - } else if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + } else if (al->fb == omp_atv_default_mem_fb) { al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; } if (__kmp_memkind_available) { // Let's use memkind library if available if (ms == omp_high_bw_mem_space) { - if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_hbw_interleave) { + if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) { al->memkind = mk_hbw_interleave; } else if (mk_hbw_preferred) { // AC: do not try to use MEMKIND_HBW for now, because memkind library @@ -1402,7 +1402,7 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, return omp_null_allocator; } } else { - if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_interleave) { + if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) { al->memkind = mk_interleave; } else { al->memkind = mk_default; @@ -1477,12 +1477,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (used + desc.size_a > al->pool_size) { // not enough space, need to go fallback path KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1491,12 +1491,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // pool has enough space ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { 
+ if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1507,12 +1507,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // custom allocator, pool size not requested ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1533,12 +1533,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (used + desc.size_a > al->pool_size) { // not enough space, need to go fallback path KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1546,14 +1546,14 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else { // pool has enough space ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) { + if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } } else { // custom allocator, pool size not requested ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) { + if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } @@ -1961,7 +1961,7 @@ void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) { this_thr->th.th_free_lists[index].th_free_list_other = ptr; } else { // either queue blocks owner is changing or size limit exceeded - // return old queue to allocating thread (q_th) synchroneously, + // return old queue to allocating thread (q_th) synchronously, // and start new list for alloc_thr's tasks void *old_ptr; void *tail = head; diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index a6d87b5d7a2e..4aa7a084f53a 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -549,6 +549,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, if (((tid >> level) & (branch_factor - 1)) != 0) { 
kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); + KMP_MB(); // Synchronize parent and child threads. KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " "arrived(%p): %llu => %llu\n", @@ -590,6 +591,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); + KMP_MB(); // Synchronize parent and child threads. #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - write min of the thread time and a child time to // the thread. diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index ac9a93590ad0..9cfa64d6ff9e 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -92,7 +92,7 @@ construct, since the master thread is necessarily thread zero). If multiple non-OpenMP threads all enter an OpenMP construct then this will be a unique thread identifier among all the threads created by -the OpenMP runtime (but the value cannote be defined in terms of +the OpenMP runtime (but the value cannot be defined in terms of OpenMP thread ids returned by omp_get_thread_num()). */ kmp_int32 __kmpc_global_thread_num(ident_t *loc) { @@ -4023,6 +4023,9 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { lo = pr_buf->th_doacross_info[2]; up = pr_buf->th_doacross_info[3]; st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif if (st == 1) { // most common case if (vec[0] < lo || vec[0] > up) { KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " @@ -4048,6 +4051,10 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { } iter_number = (kmp_uint64)(lo - vec[0]) / (-st); } +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_sink; +#endif for (i = 1; i < num_dims; ++i) { kmp_int64 iter, ln; kmp_int32 j = i * 4; @@ -4081,6 +4088,10 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { iter = (kmp_uint64)(lo - vec[i]) / (-st); } iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_sink; +#endif } shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 @@ -4089,6 +4100,12 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { KMP_YIELD(TRUE); } KMP_MB(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims); + } +#endif KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", gtid, (iter_number << 5) + shft)); @@ -4116,6 +4133,9 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { num_dims = pr_buf->th_doacross_info[0]; lo = pr_buf->th_doacross_info[2]; st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif if (st == 1) { // most common case iter_number = vec[0] - lo; } else if (st > 0) { @@ -4123,6 +4143,10 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { } else { // negative increment iter_number = (kmp_uint64)(lo - vec[0]) / (-st); } +#if OMPT_SUPPORT && OMPT_OPTIONAL + 
deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_source; +#endif for (i = 1; i < num_dims; ++i) { kmp_int64 iter, ln; kmp_int32 j = i * 4; @@ -4137,7 +4161,17 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { iter = (kmp_uint64)(lo - vec[i]) / (-st); } iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_source; +#endif + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims); } +#endif shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 flag = 1 << shft; diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index a91ffa2ba299..9d7b81733eba 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -372,10 +372,10 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, // before spending time on this). // For now use dynamically allocated per-thread lock, // free memory in __kmp_dispatch_next when status==0. - KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); - th->th.th_dispatch->th_steal_lock = + KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); + pr->u.p.th_steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); - __kmp_init_lock(th->th.th_dispatch->th_steal_lock); + __kmp_init_lock(pr->u.p.th_steal_lock); } break; } else { @@ -968,7 +968,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, // all parm3 will be the same, it still exists a bad case like using 0 and 1 // rather than program life-time increment. So the dedicated variable is // required. The 'static_steal_counter' is used. - if (schedule == kmp_sch_static_steal) { + if (pr->schedule == kmp_sch_static_steal) { // Other threads will inspect this variable when searching for a victim. // This is a flag showing that other threads may steal from this thread // since then. @@ -1195,7 +1195,7 @@ int __kmp_dispatch_next_algorithm(int gtid, if (traits_t<T>::type_size > 4) { // use lock for 8-byte and CAS for 4-byte induction // variable. TODO (optional): check and use 16-byte CAS - kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; + kmp_lock_t *lck = pr->u.p.th_steal_lock; KMP_DEBUG_ASSERT(lck != NULL); if (pr->u.p.count < (UT)pr->u.p.ub) { __kmp_acquire_lock(lck, gtid); @@ -1210,37 +1210,38 @@ int __kmp_dispatch_next_algorithm(int gtid, kmp_info_t **other_threads = team->t.t_threads; int while_limit = pr->u.p.parm3; int while_index = 0; + T id = pr->u.p.static_steal_counter; // loop id + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop // TODO: algorithm of searching for a victim // should be cleaned up and measured while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template<T> *victim; T remaining; T victimIdx = pr->u.p.parm4; T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - dispatch_private_info_template<T> *victim = - reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); - while ((victim == NULL || victim == pr || - (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) && + victim = reinterpret_cast<dispatch_private_info_template<T> *>( + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); + while ((victim == pr || id != victim->u.p.static_steal_counter) && oldVictimIdx != victimIdx) { victimIdx = (victimIdx + 1) % nproc; victim = reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); } - if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) { + if (victim == pr || id != victim->u.p.static_steal_counter) { continue; // try once more (nproc attempts in total) // no victim is ready yet to participate in stealing - // because all victims are still in kmp_init_dispatch + // because no victim passed kmp_init_dispatch yet } if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid continue; // not enough chunks to steal, goto next victim } - lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; + lck = victim->u.p.th_steal_lock; KMP_ASSERT(lck != NULL); __kmp_acquire_lock(lck, gtid); limit = victim->u.p.ub; // keep initial ub @@ -1250,7 +1251,7 @@ int __kmp_dispatch_next_algorithm(int gtid, pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim continue; // not enough chunks to steal } - // stealing succeded, reduce victim's ub by 1/4 of undone chunks or + // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or // by 1 if (remaining > 3) { // steal 1/4 of remaining @@ -1268,10 +1269,10 @@ int __kmp_dispatch_next_algorithm(int gtid, status = 1; while_index = 0; // now update own count and ub with stolen range but init chunk - __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); + __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); pr->u.p.count = init + 1; pr->u.p.ub = limit; - __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); + __kmp_release_lock(pr->u.p.th_steal_lock, gtid); } // while (search for victim) } // if (try to find victim and steal) } else { @@ -1308,32 +1309,32 @@ int __kmp_dispatch_next_algorithm(int gtid, kmp_info_t **other_threads = team->t.t_threads; int while_limit = pr->u.p.parm3; int while_index = 0; - + T id = pr->u.p.static_steal_counter; // loop id + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop // TODO: algorithm of searching for a victim // should be cleaned up and measured while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template<T> *victim; union_i4 vold, vnew; kmp_int32 remaining; T victimIdx = pr->u.p.parm4; T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - dispatch_private_info_template<T> *victim = - reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); - while ((victim == NULL || victim == pr || - (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) && + victim = reinterpret_cast<dispatch_private_info_template<T> *>( + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); + while ((victim == pr || id != victim->u.p.static_steal_counter) && oldVictimIdx != victimIdx) { victimIdx = (victimIdx + 1) % nproc; victim = reinterpret_cast<dispatch_private_info_template<T> *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); } - if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) { + if (victim == pr || id != victim->u.p.static_steal_counter) { continue; // try once more (nproc attempts in total) // no victim is ready yet to participate in stealing - // because all victims are still in kmp_init_dispatch + // because no victim passed kmp_init_dispatch yet } pr->u.p.parm4 = victimIdx; // new victim found while (1) { // CAS loop if victim has enough chunks to steal @@ -1357,7 +1358,7 @@ int __kmp_dispatch_next_algorithm(int gtid, (volatile kmp_int64 *)&victim->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { - // stealing succedded + // stealing succeeded KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); status = 1; @@ -1372,7 +1373,7 @@ int __kmp_dispatch_next_algorithm(int gtid, #endif break; } // if (check CAS result) - KMP_CPU_PAUSE(); // CAS failed, repeate attempt + KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt } // while (try to steal from particular victim) } // while (search for victim) } // if (try to find victim and steal) @@ -1532,7 +1533,7 @@ int __kmp_dispatch_next_algorithm(int gtid, } if ((T)remaining < pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default - // use dynamic-style shcedule + // use dynamic-style schedule // atomically increment iterations, get old value init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec); @@ -1601,7 +1602,7 @@ int __kmp_dispatch_next_algorithm(int gtid, KMP_DEBUG_ASSERT(init % chunk == 0); // compare with K*nproc*(chunk+1), K=2 by default if ((T)remaining < pr->u.p.parm2) { - // use dynamic-style shcedule + // use dynamic-style schedule // atomically increment iterations, get old value init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk); @@ -1892,7 +1893,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, typedef typename traits_t<T>::unsigned_t UT; typedef typename traits_t<T>::signed_t ST; // This is potentially slightly misleading, schedule(runtime) will appear here - // even if the actual runtme schedule is static. (Which points out a + // even if the actual runtime schedule is static. (Which points out a // disadvantage of schedule(runtime): even when static scheduling is used it // costs more than a compile time choice to use static scheduling would.) 
KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); @@ -1909,7 +1910,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, gtid, p_lb, p_ub, p_st, p_last)); if (team->t.t_serialized) { - /* NOTE: serialize this dispatch becase we are not at the active level */ + /* NOTE: serialize this dispatch because we are not at the active level */ pr = reinterpret_cast<dispatch_private_info_template<T> *>( th->th.th_dispatch->th_disp_buffer); /* top of the stack */ KMP_DEBUG_ASSERT(pr); @@ -2068,14 +2069,19 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, if (pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4) { int i; + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index kmp_info_t **other_threads = team->t.t_threads; // loop complete, safe to destroy locks used for stealing for (i = 0; i < th->th.th_team_nproc; ++i) { - kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; + dispatch_private_info_template<T> *buf = + reinterpret_cast<dispatch_private_info_template<T> *>( + &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); + kmp_lock_t *lck = buf->u.p.th_steal_lock; KMP_ASSERT(lck != NULL); __kmp_destroy_lock(lck); __kmp_free(lck); - other_threads[i]->th.th_dispatch->th_steal_lock = NULL; + buf->u.p.th_steal_lock = NULL; } } #endif diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h index 8b3e98435a3f..1f98e4b80a79 100644 --- a/openmp/runtime/src/kmp_dispatch.h +++ b/openmp/runtime/src/kmp_dispatch.h @@ -75,7 +75,7 @@ template <typename T> struct dispatch_private_infoXX_template { ST st; // signed UT tc; // unsigned T static_steal_counter; // for static_steal only; maybe better to put after ub - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h index 3d7faea04272..c615b7b08958 100644 --- a/openmp/runtime/src/kmp_dispatch_hier.h +++ b/openmp/runtime/src/kmp_dispatch_hier.h @@ -993,7 +993,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate( sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST); } - // Have threads "register" themselves by modifiying the active count for each + // Have threads "register" themselves by modifying the active count for each // level they are involved in. The active count will act as nthreads for that // level regarding the scheduling algorithms for (int i = 0; i < n; ++i) { diff --git a/openmp/runtime/src/kmp_environment.h b/openmp/runtime/src/kmp_environment.h index 76a9672f3240..a7ea9e955788 100644 --- a/openmp/runtime/src/kmp_environment.h +++ b/openmp/runtime/src/kmp_environment.h @@ -1,5 +1,5 @@ /* - * kmp_environment.h -- Handle environment varoiables OS-independently. + * kmp_environment.h -- Handle environment variables OS-independently. 
*/ //===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index 89172c0b704c..ab57907e088e 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -1371,6 +1371,13 @@ void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) { #endif } +// display environment variables when requested +void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { +#ifndef KMP_STUB + __kmp_omp_display_env(verbose); +#endif +} + // GCC compatibility (versioned symbols) #ifdef KMP_USE_VERSION_SYMBOLS diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h index 41cafab12537..22fb2bb2f5ca 100644 --- a/openmp/runtime/src/kmp_ftn_os.h +++ b/openmp/runtime/src/kmp_ftn_os.h @@ -133,6 +133,7 @@ #define FTN_PAUSE_RESOURCE omp_pause_resource #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels +#define FTN_DISPLAY_ENV omp_display_env #define FTN_FULFILL_EVENT omp_fulfill_event #endif /* KMP_FTN_PLAIN */ @@ -256,6 +257,7 @@ #define FTN_PAUSE_RESOURCE omp_pause_resource_ #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_ +#define FTN_DISPLAY_ENV omp_display_env_ #define FTN_FULFILL_EVENT omp_fulfill_event_ #endif /* KMP_FTN_APPEND */ @@ -377,6 +379,7 @@ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT #endif /* KMP_FTN_UPPER */ @@ -500,6 +503,7 @@ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE_ #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_ +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV_ #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_ #endif /* KMP_FTN_UAPPEND */ @@ -654,4 +658,26 @@ #define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED \ GOMP_parallel_loop_nonmonotonic_guided +// All GOMP_5.0 symbols +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_nonmonotonic_runtime +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_maybe_nonmonotonic_runtime + #endif /* KMP_FTN_OS_H */ diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp index e0739a737d9c..ab4f27bfc067 100644 --- a/openmp/runtime/src/kmp_gsupport.cpp +++ b/openmp/runtime/src/kmp_gsupport.cpp @@ -275,7 +275,7 @@ void 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) { #define KMP_DISPATCH_FINI_CHUNK_ULL __kmp_aux_dispatch_fini_chunk_8u #define KMP_DISPATCH_NEXT_ULL __kmpc_dispatch_next_8u -// The parallel contruct +// The parallel construct #ifndef KMP_DEBUG static @@ -325,7 +325,7 @@ static enum sched_type schedule, long start, long end, long incr, long chunk_size) { - // Intialize the loop worksharing construct. + // Initialize the loop worksharing construct. KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size, schedule != kmp_sch_static); @@ -635,6 +635,15 @@ LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT), {}) LOOP_RUNTIME_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT), {}) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START), kmp_ord_static) @@ -911,6 +920,18 @@ LOOP_NEXT_ULL( LOOP_RUNTIME_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT), + {}) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT), {}) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START), kmp_ord_static) @@ -1513,6 +1534,12 @@ PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) { int gtid = __kmp_entry_gtid(); @@ -1985,6 +2012,28 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC, 45, KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED, 45, "GOMP_4.5"); +// GOMP_5.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START, + 50, "GOMP_5.0"); 
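For reference, the kind of GCC-compiled loop that binds to the GOMP_5.0 entry points registered here and continued below. This is a sketch that assumes GCC's OpenMP 5.0 lowering (the exact compiler version emitting these calls is an assumption, not stated by the patch), in which a plain schedule(runtime) loop uses the "maybe nonmonotonic" variants and an explicit nonmonotonic modifier uses the "nonmonotonic" ones:

    /* Built with gcc -fopenmp, linked against libomp's GOMP-compatible ABI. */
    void scale(long n, double a, double *x) {
      #pragma omp parallel for schedule(runtime)
      for (long i = 0; i < n; ++i) /* GOMP_loop_maybe_nonmonotonic_runtime_* */
        x[i] *= a;

      #pragma omp parallel for schedule(nonmonotonic: runtime)
      for (long i = 0; i < n; ++i) /* GOMP_loop_nonmonotonic_runtime_* */
        x[i] *= a;
    }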
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME, + 50, "GOMP_5.0"); + #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus diff --git a/openmp/runtime/src/kmp_i18n.cpp b/openmp/runtime/src/kmp_i18n.cpp index 53c442715b0b..d2651cfabdf3 100644 --- a/openmp/runtime/src/kmp_i18n.cpp +++ b/openmp/runtime/src/kmp_i18n.cpp @@ -639,7 +639,7 @@ kmp_msg_t __kmp_msg_format(unsigned id_arg, ...) { // numbers, for example: "%2$s %1$s". __kmp_str_buf_vprint(&buffer, __kmp_i18n_catgets(id), args); #elif KMP_OS_WINDOWS - // On Winodws, printf() family functions does not recognize GNU style + // On Windows, printf() family functions does not recognize GNU style // parameter numbers, so we have to use FormatMessage() instead. It recognizes // parameter numbers, e. g.: "%2!s! "%1!s!". { diff --git a/openmp/runtime/src/kmp_i18n.h b/openmp/runtime/src/kmp_i18n.h index 9d79a21bb2df..3fd6099ad149 100644 --- a/openmp/runtime/src/kmp_i18n.h +++ b/openmp/runtime/src/kmp_i18n.h @@ -32,7 +32,7 @@ extern "C" { __kmp_i18n_catgets() returns read-only string. It should not be freed. - KMP_I18N_STR macro simplifies acces to strings in message catalog a bit. + KMP_I18N_STR macro simplifies access to strings in message catalog a bit. Following two lines are equivalent: __kmp_i18n_catgets( kmp_i18n_str_Warning ) diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 2cc9e08278c4..8bf7ef2deb71 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -1239,6 +1239,9 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck, KMP_MB(); // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck); + // Synchronize writes to both runtime thread structures + // and writes in user code. + KMP_MB(); #ifdef DEBUG_QUEUING_LOCKS TRACE_LOCK(gtid + 1, "acq spin"); @@ -3018,7 +3021,7 @@ kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0}; // User lock allocator for dynamically dispatched indirect locks. Every entry of -// the indirect lock table holds the address and type of the allocated indrect +// the indirect lock table holds the address and type of the allocated indirect // lock (kmp_indirect_lock_t), and the size of the table doubles when it is // full. A destroyed indirect lock object is returned to the reusable pool of // locks, unique to each lock type. diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index 75a15f084c69..e54f6812b8b3 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -42,7 +42,7 @@ typedef struct ident ident_t; // ---------------------------------------------------------------------------- // We need to know the size of the area we can assume that the compiler(s) -// allocated for obects of type omp_lock_t and omp_nest_lock_t. The Intel +// allocated for objects of type omp_lock_t and omp_nest_lock_t. The Intel // compiler always allocates a pointer-sized area, as does visual studio. 
// // gcc however, only allocates 4 bytes for regular locks, even on 64-bit @@ -861,11 +861,11 @@ __kmp_destroy_nested_user_lock_with_checks(kmp_user_lock_p lck) { // // In other cases, the calling code really should differentiate between an // unimplemented function and one that is implemented but returning NULL / -// invalied value. If this is the case, no get function wrapper exists. +// invalid value. If this is the case, no get function wrapper exists. extern int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck); -// no set function; fields set durining local allocation +// no set function; fields set during local allocation extern const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck); @@ -899,7 +899,7 @@ static inline void __kmp_set_user_lock_flags(kmp_user_lock_p lck, } } -// The fuction which sets up all of the vtbl pointers for kmp_user_lock_t. +// The function which sets up all of the vtbl pointers for kmp_user_lock_t. extern void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind); // Macros for binding user lock functions. @@ -1128,7 +1128,7 @@ extern int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32); extern int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32); // Function tables for indirect locks. Set/unset/test differentiate functions -// with/withuot consistency checking. +// with/without consistency checking. extern void (*__kmp_indirect_init[])(kmp_user_lock_p); extern void (**__kmp_indirect_destroy)(kmp_user_lock_p); extern int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32); diff --git a/openmp/runtime/src/kmp_omp.h b/openmp/runtime/src/kmp_omp.h index 27b550d1f663..c7ba32a14338 100644 --- a/openmp/runtime/src/kmp_omp.h +++ b/openmp/runtime/src/kmp_omp.h @@ -47,7 +47,7 @@ typedef struct { } kmp_omp_nthr_item_t; typedef struct { - kmp_int32 num; // Number of items in the arrray. + kmp_int32 num; // Number of items in the array. kmp_uint64 array; // Address of array of kmp_omp_num_threads_item_t. } kmp_omp_nthr_info_t; diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index acd157db8e52..e0c8cf241044 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -3529,7 +3529,7 @@ static int __kmp_expand_threads(int nNeed) { // > __kmp_max_nth in one of two ways: // // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] - // may not be resused by another thread, so we may need to increase + // may not be reused by another thread, so we may need to increase // __kmp_threads_capacity to __kmp_max_nth + 1. // // 2) New foreign root(s) are encountered. We always register new foreign @@ -4515,11 +4515,11 @@ __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { #if KMP_AFFINITY_SUPPORTED // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. -// It calculats the worker + master thread's partition based upon the parent +// It calculates the worker + master thread's partition based upon the parent // thread's partition, and binds each worker to a thread in their partition. // The master thread's partition should already include its current binding. 
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { - // Copy the master thread's place partion to the team struct + // Copy the master thread's place partition to the team struct kmp_info_t *master_th = team->t.t_threads[0]; KMP_DEBUG_ASSERT(master_th != NULL); kmp_proc_bind_t proc_bind = team->t.t_proc_bind; @@ -5536,7 +5536,7 @@ kmp_team_t *__kmp_reap_team(kmp_team_t *team) { // locality problems on programs where the size of the hot team regularly // grew and shrunk. // -// Now, for single-level parallelism, the OMP tid is alway == gtid. +// Now, for single-level parallelism, the OMP tid is always == gtid. void __kmp_free_thread(kmp_info_t *this_th) { int gtid; kmp_info_t **scan; @@ -5609,7 +5609,7 @@ void __kmp_free_thread(kmp_info_t *this_th) { // scan is the address of a link in the list, possibly the address of // __kmp_thread_pool itself. // - // In the absence of nested parallism, the for loop will have 0 iterations. + // In the absence of nested parallelism, the for loop will have 0 iterations. if (__kmp_thread_pool_insert_pt != NULL) { scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); } else { @@ -6088,7 +6088,7 @@ void __kmp_internal_end_library(int gtid_req) { only place to clear __kmp_serial_init */ /* we'll check this later too, after we get the lock */ // 2009-09-06: We do not set g_abort without setting g_done. This check looks - // redundaant, because the next check will work in any case. + // redundant, because the next check will work in any case. if (__kmp_global.g.g_abort) { KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); /* TODO abort? */ @@ -8217,7 +8217,6 @@ __kmp_determine_reduction_method( return (retval); } - // this function is for testing set/get/determine reduce method kmp_int32 __kmp_get_reduce_method(void) { return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); @@ -8297,3 +8296,12 @@ int __kmp_pause_resource(kmp_pause_status_t level) { return 1; } } + + +void __kmp_omp_display_env(int verbose) { + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (__kmp_init_serial == 0) + __kmp_do_serial_initialize(); + __kmp_display_env_impl(!verbose, verbose); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp index 17c149806c89..28d0ffe0fb9d 100644 --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -667,7 +667,7 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid, // stride for next chunks calculation. // Last iteration flag set for the team that will execute // the last iteration of the loop. - // The routine is called for dist_schedue(static,chunk) only. + // The routine is called for dist_schedule(static,chunk) only. typedef typename traits_t<T>::unsigned_t UT; typedef typename traits_t<T>::signed_t ST; kmp_uint32 team_id; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index c7dec4d218c6..5745cbba585f 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -364,7 +364,7 @@ static void __kmp_stg_parse_int( char const *name, // I: Name of environment variable (used in warning messages). char const *value, // I: Value of environment variable to parse. - int min, // I: Miminal allowed value. + int min, // I: Minimum allowed value. int max, // I: Maximum allowed value. int *out // O: Output (parsed) value. 
) { @@ -1305,7 +1305,7 @@ static void __kmp_stg_print_max_task_priority(kmp_str_buf_t *buffer, } // __kmp_stg_print_max_task_priority // KMP_TASKLOOP_MIN_TASKS -// taskloop threashold to switch from recursive to linear tasks creation +// taskloop threshold to switch from recursive to linear tasks creation static void __kmp_stg_parse_taskloop_min_tasks(char const *name, char const *value, void *data) { int tmp; @@ -2041,7 +2041,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, // If we see a parse error, emit a warning and scan to the next ",". // // FIXME - there's got to be a better way to print an error -// message, hopefully without overwritting peices of buf. +// message, hopefully without overwriting peices of buf. #define EMIT_WARN(skip, errlist) \ { \ char ch; \ @@ -4395,7 +4395,7 @@ static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer, // ----------------------------------------------------------------------------- // KMP_HW_SUBSET (was KMP_PLACE_THREADS) -// The longest observable sequense of items is +// The longest observable sequence of items is // Socket-Node-Tile-Core-Thread // So, let's limit to 5 levels for now // The input string is usually short enough, let's use 512 limit for now @@ -5720,7 +5720,11 @@ void __kmp_env_print() { } // __kmp_env_print void __kmp_env_print_2() { + __kmp_display_env_impl(__kmp_display_env, __kmp_display_env_verbose); +} // __kmp_env_print_2 + +void __kmp_display_env_impl(int display_env, int display_env_verbose) { kmp_env_blk_t block; kmp_str_buf_t buffer; @@ -5737,9 +5741,9 @@ void __kmp_env_print_2() { for (int i = 0; i < __kmp_stg_count; ++i) { if (__kmp_stg_table[i].print != NULL && - ((__kmp_display_env && + ((display_env && strncmp(__kmp_stg_table[i].name, "OMP_", 4) == 0) || - __kmp_display_env_verbose)) { + display_env_verbose)) { __kmp_stg_table[i].print(&buffer, __kmp_stg_table[i].name, __kmp_stg_table[i].data); } @@ -5754,7 +5758,6 @@ void __kmp_env_print_2() { __kmp_str_buf_free(&buffer); __kmp_printf("\n"); - -} // __kmp_env_print_2 +} // end of file diff --git a/openmp/runtime/src/kmp_settings.h b/openmp/runtime/src/kmp_settings.h index 3247ffc6af74..d61c40694cf6 100644 --- a/openmp/runtime/src/kmp_settings.h +++ b/openmp/runtime/src/kmp_settings.h @@ -17,6 +17,7 @@ void __kmp_reset_global_vars(void); void __kmp_env_initialize(char const *); void __kmp_env_print(); void __kmp_env_print_2(); +void __kmp_display_env_impl(int display_env, int display_env_verbose); int __kmp_initial_threads_capacity(int req_nproc); void __kmp_init_dflt_team_nth(); diff --git a/openmp/runtime/src/kmp_stats.cpp b/openmp/runtime/src/kmp_stats.cpp index dabd0c35b85c..55ac18a4312c 100644 --- a/openmp/runtime/src/kmp_stats.cpp +++ b/openmp/runtime/src/kmp_stats.cpp @@ -270,7 +270,7 @@ void explicitTimer::stop(tsc_tick_count tick, /* ************* partitionedTimers member functions ************* */ partitionedTimers::partitionedTimers() { timer_stack.reserve(8); } -// initialize the paritioned timers to an initial timer +// initialize the partitioned timers to an initial timer void partitionedTimers::init(explicitTimer timer) { KMP_DEBUG_ASSERT(this->timer_stack.size() == 0); timer_stack.push_back(timer); @@ -609,7 +609,7 @@ void kmp_stats_output_module::printTimerStats(FILE *statsOut, totalStats[s].format(tag, true).c_str()); } - // Print historgram of statistics + // Print histogram of statistics if (theStats[0].haveHist()) { fprintf(statsOut, "\nTimer distributions\n"); for (int s = 0; s < TIMER_LAST; s++) 
{ diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h index ee95658fd9b7..a36528f3fca3 100644 --- a/openmp/runtime/src/kmp_stats.h +++ b/openmp/runtime/src/kmp_stats.h @@ -195,7 +195,7 @@ enum stats_state_e { // from a dynamically scheduled loop // OMP_critical -- Time thread spends executing critical section // OMP_critical_wait -- Time thread spends waiting to enter -// a critcal seciton +// a critical section // OMP_single -- Time spent executing a "single" region // OMP_master -- Time spent executing a "master" region // OMP_task_immediate -- Time spent executing non-deferred tasks @@ -522,7 +522,7 @@ public: void windup(); }; -// Special wrapper around the partioned timers to aid timing code blocks +// Special wrapper around the partitioned timers to aid timing code blocks // It avoids the need to have an explicit end, leaving the scope suffices. class blockPartitionedTimer { partitionedTimers *part_timers; @@ -920,7 +920,7 @@ extern kmp_stats_output_module __kmp_stats_output; #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) /*! - * \brief Initializes the paritioned timers to begin with name. + * \brief Initializes the partitioned timers to begin with name. * * @param name timer which you want this thread to begin with * diff --git a/openmp/runtime/src/kmp_str.h b/openmp/runtime/src/kmp_str.h index 09faadb68f1a..9e669bbe4742 100644 --- a/openmp/runtime/src/kmp_str.h +++ b/openmp/runtime/src/kmp_str.h @@ -72,12 +72,12 @@ struct kmp_str_fname { typedef struct kmp_str_fname kmp_str_fname_t; void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path); void __kmp_str_fname_free(kmp_str_fname_t *fname); -// Compares file name with specified patern. If pattern is NULL, any fname +// Compares file name with specified pattern. If pattern is NULL, any fname // matched. int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern); /* The compiler provides source locations in string form - ";file;func;line;col;;". It is not convenient for manupulation. This + ";file;func;line;col;;". It is not convenient for manipulation. This structure keeps source location in more convenient form. Usage: diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp index 6b5041988d5c..0fc022a03a2d 100644 --- a/openmp/runtime/src/kmp_stub.cpp +++ b/openmp/runtime/src/kmp_stub.cpp @@ -147,7 +147,7 @@ void *kmp_malloc(size_t size) { i; void *res; #if KMP_OS_WINDOWS - // If succesfull returns a pointer to the memory block, otherwise returns + // If successful returns a pointer to the memory block, otherwise returns // NULL. // Sets errno to ENOMEM or EINVAL if memory allocation failed or parameter // validation failed. 
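The kmp_str.h hunk above mentions the compiler-provided ident string ";file;func;line;col;;" that the runtime unpacks into a more convenient structure. A minimal, self-contained sketch of splitting that format (illustrative only, not the runtime's parser; the helper name and field handling here are made up for the example):

#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Split an ident string of the form ";file;func;line;col;;" on ';'.
static std::vector<std::string> split_ident(const std::string &psource) {
  std::vector<std::string> fields;
  std::string::size_type begin = 0, end;
  while ((end = psource.find(';', begin)) != std::string::npos) {
    fields.push_back(psource.substr(begin, end - begin));
    begin = end + 1;
  }
  fields.push_back(psource.substr(begin));
  return fields;
}

int main() {
  // The leading ';' produces an empty first field, then file, func, line, col.
  std::vector<std::string> f = split_ident(";demo.c;main;42;7;;");
  if (f.size() >= 5)
    std::printf("file=%s func=%s line=%d col=%d\n", f[1].c_str(), f[2].c_str(),
                std::atoi(f[3].c_str()), std::atoi(f[4].c_str()));
  return 0;
}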
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp index e1618f5cd9df..a654951f5b3b 100644 --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -35,7 +35,7 @@ static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0); static void __kmp_init_node(kmp_depnode_t *node) { node->dn.successors = NULL; - node->dn.task = NULL; // will point to the rigth task + node->dn.task = NULL; // will point to the right task // once dependences have been processed for (int i = 0; i < MAX_MTX_DEPS; ++i) node->dn.mtx_locks[i] = NULL; @@ -205,7 +205,7 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread, return new_head; } -static inline void __kmp_track_dependence(kmp_depnode_t *source, +static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source, kmp_depnode_t *sink, kmp_task_t *sink_task) { #ifdef KMP_SUPPORT_GRAPH_OUTPUT @@ -224,11 +224,14 @@ static inline void __kmp_track_dependence(kmp_depnode_t *source, */ if (ompt_enabled.ompt_callback_task_dependence) { kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); - kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); + ompt_data_t *sink_data; + if (sink_task) + sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data); + else + sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data; ompt_callbacks.ompt_callback(ompt_callback_task_dependence)( - &(task_source->ompt_task_info.task_data), - &(task_sink->ompt_task_info.task_data)); + &(task_source->ompt_task_info.task_data), sink_data); } #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ } @@ -246,7 +249,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, if (dep->dn.task) { KMP_ACQUIRE_DEPNODE(gtid, dep); if (dep->dn.task) { - __kmp_track_dependence(dep, node, task); + __kmp_track_dependence(gtid, dep, node, task); dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -272,7 +275,7 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, // synchronously add source to sink' list of successors KMP_ACQUIRE_DEPNODE(gtid, sink); if (sink->dn.task) { - __kmp_track_dependence(sink, source, task); + __kmp_track_dependence(gtid, sink, source, task); sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -473,8 +476,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, npredecessors++; // Update predecessors and obtain current value to check if there are still - // any outstandig dependences (some tasks may have finished while we processed - // the dependences) + // any outstanding dependences (some tasks may have finished while we + // processed the dependences) npredecessors = node->dn.npredecessors.fetch_add(npredecessors) + npredecessors; @@ -498,7 +501,7 @@ task'' @param noalias_dep_list List of depend items with no aliasing @return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not -suspendend and queued, or TASK_CURRENT_QUEUED if it was suspended and queued +suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued Schedule a non-thread-switchable task with dependences for execution */ @@ -540,47 +543,40 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, ompt_enabled.ompt_callback_dependences) { kmp_int32 i; - new_taskdata->ompt_task_info.ndeps = ndeps + 
ndeps_noalias; - new_taskdata->ompt_task_info.deps = - (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( - thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); - KMP_ASSERT(new_taskdata->ompt_task_info.deps != NULL); + KMP_ASSERT(ompt_deps != NULL); for (i = 0; i < ndeps; i++) { - new_taskdata->ompt_task_info.deps[i].variable.ptr = - (void *)dep_list[i].base_addr; + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; if (dep_list[i].flags.in && dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_inout; + ompt_deps[i].dependence_type = ompt_dependence_type_inout; else if (dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_out; + ompt_deps[i].dependence_type = ompt_dependence_type_out; else if (dep_list[i].flags.in) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_in; + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset; } for (i = 0; i < ndeps_noalias; i++) { - new_taskdata->ompt_task_info.deps[ndeps + i].variable.ptr = - (void *)noalias_dep_list[i].base_addr; + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_inout; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; else if (noalias_dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_out; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; else if (noalias_dep_list[i].flags.in) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_in; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; } ompt_callbacks.ompt_callback(ompt_callback_dependences)( - &(new_taskdata->ompt_task_info.task_data), - new_taskdata->ompt_task_info.deps, new_taskdata->ompt_task_info.ndeps); + &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps); /* We can now free the allocated memory for the dependencies */ - /* For OMPD we might want to delay the free until task_end */ - KMP_OMPT_DEPS_FREE(thread, new_taskdata->ompt_task_info.deps); - new_taskdata->ompt_task_info.deps = NULL; - new_taskdata->ompt_task_info.ndeps = 0; + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); } #endif /* OMPT_OPTIONAL */ #endif /* OMPT_SUPPORT */ @@ -642,6 +638,23 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, return ret; } +#if OMPT_SUPPORT +void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task, + ompt_data_t *taskwait_task_data) { + if (ompt_enabled.ompt_callback_task_schedule) { + ompt_data_t task_data = ompt_data_none; + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + current_task ? &(current_task->ompt_task_info.task_data) : &task_data, + ompt_task_switch, taskwait_task_data); + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + taskwait_task_data, ompt_task_complete, + current_task ? 
&(current_task->ompt_task_info.task_data) : &task_data); + } + current_task->ompt_task_info.frame.enter_frame.ptr = NULL; + *taskwait_task_data = ompt_data_none; +} +#endif /* OMPT_SUPPORT */ + /*! @ingroup TASKING @param loc_ref location of the original task directive @@ -668,6 +681,74 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; +#if OMPT_SUPPORT + // this function represents a taskwait construct with depend clause + // We signal 4 events: + // - creation of the taskwait task + // - dependences of the taskwait task + // - schedule and finish of the taskwait task + ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data; + KMP_ASSERT(taskwait_task_data->ptr == NULL); + if (ompt_enabled.enabled) { + if (!current_task->ompt_task_info.frame.enter_frame.ptr) + current_task->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_task_create) { + ompt_data_t task_data = ompt_data_none; + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + current_task ? &(current_task->ompt_task_info.task_data) : &task_data, + current_task ? &(current_task->ompt_task_info.frame) : NULL, + taskwait_task_data, + ompt_task_explicit | ompt_task_undeferred | ompt_task_mergeable, 1, + OMPT_GET_RETURN_ADDRESS(0)); + } + } + +#if OMPT_OPTIONAL + /* OMPT grab all dependences if requested by the tool */ + if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) { + kmp_int32 i; + + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + + KMP_ASSERT(ompt_deps != NULL); + + for (i = 0; i < ndeps; i++) { + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; + if (dep_list[i].flags.in && dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_inout; + else if (dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_out; + else if (dep_list[i].flags.in) + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + } + for (i = 0; i < ndeps_noalias; i++) { + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; + if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; + else if (noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; + else if (noalias_dep_list[i].flags.in) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + } + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + taskwait_task_data, ompt_deps, ompt_ndeps); + /* We can now free the allocated memory for the dependencies */ + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); + ompt_deps = NULL; + } +#endif /* OMPT_OPTIONAL */ +#endif /* OMPT_SUPPORT */ + // We can return immediately as: // - dependences are not computed in serial teams (except with proxy tasks) // - if the dephash is not yet created it means we have nothing to wait for @@ -682,6 +763,9 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, 
kmp_int32 ndeps, KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ return; } @@ -694,6 +778,9 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ return; } @@ -705,6 +792,9 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, __kmp_task_stealing_constraint); } +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n", gtid, loc_ref)); } diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 15ffc1454fe9..2ddc2e7a6fd7 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -298,6 +298,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, static void __kmp_realloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data) { kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size); kmp_int32 new_size = 2 * size; KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " @@ -381,8 +382,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { } else { __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); locked = 1; - // expand deque to push the task which is not allowed to execute - __kmp_realloc_task_deque(thread, thread_data); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } } // Lock the deque for the task push operation @@ -547,8 +551,6 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { task->ompt_task_info.frame.enter_frame = ompt_data_none; task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; - task->ompt_task_info.ndeps = 0; - task->ompt_task_info.deps = NULL; } // __ompt_task_start: @@ -573,24 +575,20 @@ static inline void __ompt_task_start(kmp_task_t *task, // __ompt_task_finish: // Build and trigger final task-schedule event -static inline void -__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task, - ompt_task_status_t status = ompt_task_complete) { - kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); - if (__kmp_omp_cancellation && taskdata->td_taskgroup && - taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { - status = ompt_task_cancel; - } - - /* let OMPT know that we're returning to the callee task */ +static inline void __ompt_task_finish(kmp_task_t *task, + kmp_taskdata_t *resumed_task, + ompt_task_status_t status) { if (ompt_enabled.ompt_callback_task_schedule) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + if (__kmp_omp_cancellation && taskdata->td_taskgroup && + taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { + status = ompt_task_cancel; + } + + /* let OMPT know that we're returning to the callee task */ 
ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( &(taskdata->ompt_task_info.task_data), status, - &((resumed_task ? resumed_task - : (taskdata->ompt_task_info.scheduling_parent - ? taskdata->ompt_task_info.scheduling_parent - : taskdata->td_parent)) - ->ompt_task_info.task_data)); + (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL)); } } #endif @@ -799,6 +797,10 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid, // gtid: global thread ID for calling thread // task: task to be finished // resumed_task: task to be resumed. (may be NULL if task is serialized) +// +// template<ompt>: effectively ompt_enabled.enabled!=0 +// the version with ompt=false is inlined, allowing to optimize away all ompt +// code in this case template <bool ompt> static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task) { @@ -845,10 +847,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, return; } } -#if OMPT_SUPPORT - if (ompt) - __ompt_task_finish(task, resumed_task); -#endif // Check mutexinoutset dependencies, release locks kmp_depnode_t *node = taskdata->td_depnode; @@ -861,7 +859,37 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, } } + // bookkeeping for resuming task: + // GEH - note tasking_ser => task_serial + KMP_DEBUG_ASSERT( + (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == + taskdata->td_flags.task_serial); + if (taskdata->td_flags.task_serial) { + if (resumed_task == NULL) { + resumed_task = taskdata->td_parent; // In a serialized task, the resumed + // task is the parent + } + } else { + KMP_DEBUG_ASSERT(resumed_task != + NULL); // verify that resumed task is passed as argument + } + + /* If the tasks' destructor thunk flag has been set, we need to invoke the + destructor thunk that has been generated by the compiler. The code is + placed here, since at this point other tasks might have been released + hence overlapping the destructor invocations with some other work in the + released tasks. The OpenMP spec is not specific on when the destructors + are invoked, so we should be free to choose. */ + if (taskdata->td_flags.destructors_thunk) { + kmp_routine_entry_t destr_thunk = task->data1.destructors; + KMP_ASSERT(destr_thunk); + destr_thunk(gtid, task); + } + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + bool detach = false; if (taskdata->td_flags.detachable == TASK_DETACHABLE) { if (taskdata->td_allow_completion_event.type == @@ -870,21 +898,41 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); if (taskdata->td_allow_completion_event.type == KMP_EVENT_ALLOW_COMPLETION) { + // task finished execution + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task + +#if OMPT_SUPPORT + // For a detached task, which is not completed, we switch back + // the omp_fulfill_event signals completion + // locking is necessary to avoid a race with ompt_task_late_fulfill + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_detach); +#endif + + // no access to taskdata after this point! + // __kmp_fulfill_event might free taskdata at any time from now + taskdata->td_flags.proxy = TASK_PROXY; // proxify! 
detach = true; } __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); } } - KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); - KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); if (!detach) { taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPT_SUPPORT + // This is not a detached task, we are done here + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_complete); +#endif + // Only need to keep track of count if team parallel and tasking not - // serialized - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + // serialized, or task is detachable and event has already been fulfilled + if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || + taskdata->td_flags.detachable == TASK_DETACHABLE) { // Predecrement simulated by "- 1" calculation children = KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; @@ -897,45 +945,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, // with the proxy task as origin __kmp_release_deps(gtid, taskdata); } + // td_flags.executing must be marked as 0 after __kmp_release_deps has been + // called. Othertwise, if a task is executed immediately from the + // release_deps code, the flag will be reset to 1 again by this same + // function + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task } - // td_flags.executing must be marked as 0 after __kmp_release_deps has been - // called. Othertwise, if a task is executed immediately from the release_deps - // code, the flag will be reset to 1 again by this same function - KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); - taskdata->td_flags.executing = 0; // suspend the finishing task KA_TRACE( 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", gtid, taskdata, children)); - /* If the tasks' destructor thunk flag has been set, we need to invoke the - destructor thunk that has been generated by the compiler. The code is - placed here, since at this point other tasks might have been released - hence overlapping the destructor invokations with some other work in the - released tasks. The OpenMP spec is not specific on when the destructors - are invoked, so we should be free to choose. */ - if (taskdata->td_flags.destructors_thunk) { - kmp_routine_entry_t destr_thunk = task->data1.destructors; - KMP_ASSERT(destr_thunk); - destr_thunk(gtid, task); - } - - // bookkeeping for resuming task: - // GEH - note tasking_ser => task_serial - KMP_DEBUG_ASSERT( - (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == - taskdata->td_flags.task_serial); - if (taskdata->td_flags.task_serial) { - if (resumed_task == NULL) { - resumed_task = taskdata->td_parent; // In a serialized task, the resumed - // task is the parent - } - } else { - KMP_DEBUG_ASSERT(resumed_task != - NULL); // verify that resumed task is passed as argument - } - // Free this task and then ancestor tasks if they have no children. 
// Restore th_current_task first as suggested by John: // johnmc: if an asynchronous inquiry peers into the runtime system @@ -1304,7 +1326,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_flags.task_serial = (parent_task->td_flags.final || taskdata->td_flags.team_serial || - taskdata->td_flags.tasking_ser); + taskdata->td_flags.tasking_ser || flags->merged_if0); taskdata->td_flags.started = 0; taskdata->td_flags.executing = 0; @@ -1411,7 +1433,7 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, // // gtid: global thread ID of caller // task: the task to invoke -// current_task: the task to resume after task invokation +// current_task: the task to resume after task invocation static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); @@ -2911,7 +2933,7 @@ static inline int __kmp_execute_tasks_template( // met, then return now, so that the barrier gather/release pattern can // proceed. If this thread is in the last spin loop in the barrier, // waiting to be released, we know that the termination condition will not - // be satisified, so don't waste any cycles checking it. + // be satisfied, so don't waste any cycles checking it. if (flag == NULL || (!final_spin && flag->done_check())) { KA_TRACE( 15, @@ -3096,7 +3118,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * to each thread in the team, so that it can steal work from it. * * Enter the existence of the kmp_task_team_t struct. It employs a reference - * counting mechanims, and is allocated by the master thread before calling + * counting mechanism, and is allocated by the master thread before calling * __kmp_<barrier_kind>_release, and then is release by the last thread to * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes * of the kmp_task_team_t structs for consecutive barriers can overlap @@ -3107,7 +3129,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * We currently use the existence of the threads array as an indicator that * tasks were spawned since the last barrier. If the structure is to be * useful outside the context of tasking, then this will have to change, but - * not settting the field minimizes the performance impact of tasking on + * not setting the field minimizes the performance impact of tasking on * barriers, when no explicit tasks were spawned (pushed, actually). */ @@ -3651,7 +3673,11 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, return result; __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); - __kmp_realloc_task_deque(thread, thread_data); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } else { @@ -3847,22 +3873,30 @@ void __kmp_fulfill_event(kmp_event_t *event) { bool detached = false; int gtid = __kmp_get_gtid(); + // The associated task might have completed or could be completing at this + // point. + // We need to take the lock to avoid races + __kmp_acquire_tas_lock(&event->lock, gtid); if (taskdata->td_flags.proxy == TASK_PROXY) { - // The associated task code completed before this call and detached. detached = true; - event->type = KMP_EVENT_UNINITIALIZED; } else { - // The associated task has not completed but could be completing at this - // point. 
- // We need to take the lock to avoid races - __kmp_acquire_tas_lock(&event->lock, gtid); - if (taskdata->td_flags.proxy == TASK_PROXY) - detached = true; - event->type = KMP_EVENT_UNINITIALIZED; - __kmp_release_tas_lock(&event->lock, gtid); +#if OMPT_SUPPORT + // The OMPT event must occur under mutual exclusion, + // otherwise the tool might access ptask after free + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); +#endif } + event->type = KMP_EVENT_UNINITIALIZED; + __kmp_release_tas_lock(&event->lock, gtid); if (detached) { +#if OMPT_SUPPORT + // We free ptask afterwards and know the task is finished, + // so locking is not necessary + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); +#endif // If the task detached complete the proxy task if (gtid >= 0) { kmp_team_t *team = taskdata->td_team; @@ -3888,14 +3922,13 @@ void __kmp_fulfill_event(kmp_event_t *event) { kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { kmp_task_t *task; kmp_taskdata_t *taskdata; - kmp_taskdata_t *taskdata_src; - kmp_taskdata_t *parent_task = thread->th.th_current_task; + kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); + kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task size_t shareds_offset; size_t task_size; KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src)); - taskdata_src = KMP_TASK_TO_TASKDATA(task_src); KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == TASK_FULL); // it should not be proxy task KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); @@ -3923,9 +3956,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { } taskdata->td_alloc_thread = thread; taskdata->td_parent = parent_task; - taskdata->td_taskgroup = - parent_task - ->td_taskgroup; // task inherits the taskgroup from the parent task + // task inherits the taskgroup from the parent task + taskdata->td_taskgroup = parent_task->td_taskgroup; + // tied task needs to initialize the td_last_tied at creation, + // untied one does this when it is scheduled for execution + if (taskdata->td_flags.tiedness == TASK_TIED) + taskdata->td_last_tied = taskdata; // Only need to keep track of child task counts if team parallel and tasking // not serialized @@ -4255,7 +4291,7 @@ int __kmp_taskloop_task(int gtid, void *ptask) { // grainsize Number of loop iterations per task // extras Number of chunks with grainsize+1 iterations // tc Iterations count -// num_t_min Threashold to launch tasks recursively +// num_t_min Threshold to launch tasks recursively // task_dup Tasks duplication routine // codeptr_ra Return address for OMPT events void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, @@ -4267,7 +4303,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, void *codeptr_ra, #endif void *task_dup) { -#if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); KMP_DEBUG_ASSERT(num_tasks > num_t_min); @@ -4275,7 +4310,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, task_dup)); -#endif p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; kmp_uint64 lower = *lb; kmp_info_t *thread = __kmp_threads[gtid]; @@ -4319,9 +4353,14 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, *ub = ub0; // adjust upper bound 
for the 1st half // create auxiliary task for 2nd half of the loop + // make sure new task has same parent task as the pattern task + kmp_taskdata_t *current_task = thread->th.th_current_task; + thread->th.th_current_task = taskdata->td_parent; kmp_task_t *new_task = __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), sizeof(__taskloop_params_t), &__kmp_taskloop_task); + // restore current task + thread->th.th_current_task = current_task; __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; p->task = next_task; p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); diff --git a/openmp/runtime/src/kmp_utility.cpp b/openmp/runtime/src/kmp_utility.cpp index 44a99d0455b3..6e6785deb445 100644 --- a/openmp/runtime/src/kmp_utility.cpp +++ b/openmp/runtime/src/kmp_utility.cpp @@ -194,7 +194,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { KA_TRACE(trace_level, (" PSN")); } if ((buf.edx >> 19) & 1) { - /* CLFULSH - Cache Flush Instruction Available */ + /* CLFLUSH - Cache Flush Instruction Available */ cflush_size = data[1] * 8; /* Bits 15-08: CLFLUSH line size = 8 (64 bytes) */ KA_TRACE(trace_level, (" CLFLUSH(%db)", cflush_size)); diff --git a/openmp/runtime/src/kmp_version.h b/openmp/runtime/src/kmp_version.h index 9e726b3805b2..6ce40eecb5de 100644 --- a/openmp/runtime/src/kmp_version.h +++ b/openmp/runtime/src/kmp_version.h @@ -30,7 +30,7 @@ extern "C" { just before version string. */ #define KMP_VERSION_MAGIC_STR "\x00@(#) " #define KMP_VERSION_MAGIC_LEN 6 // Length of KMP_VERSION_MAGIC_STR. -#define KMP_VERSION_PREF_STR "Intel(R) OMP " +#define KMP_VERSION_PREF_STR "LLVM OMP " #define KMP_VERSION_PREFIX KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR /* declare all the version string constants for KMP_VERSION env. variable */ diff --git a/openmp/runtime/src/kmp_wrapper_malloc.h b/openmp/runtime/src/kmp_wrapper_malloc.h index 1544c5df3d64..c027e0b297d0 100644 --- a/openmp/runtime/src/kmp_wrapper_malloc.h +++ b/openmp/runtime/src/kmp_wrapper_malloc.h @@ -15,11 +15,11 @@ #define KMP_WRAPPER_MALLOC_H /* This header serves for 3 purposes: - 1. Declaring standard memory allocation rourines in OS-independent way. + 1. Declaring standard memory allocation routines in OS-independent way. 2. Passing source location info through memory allocation wrappers. 3. Enabling native memory debugging capabilities. - 1. Declaring standard memory allocation rourines in OS-independent way. + 1. Declaring standard memory allocation routines in OS-independent way. ----------------------------------------------------------------------- On Linux* OS, alloca() function is declared in <alloca.h> header, while on Windows* OS there is no <alloca.h> header, function _alloca() (note @@ -103,9 +103,9 @@ #error Unknown or unsupported OS. #endif -/* KMP_SRC_LOC_DECL -- Declaring source location paramemters, to be used in +/* KMP_SRC_LOC_DECL -- Declaring source location parameters, to be used in function declaration. - KMP_SRC_LOC_PARM -- Source location paramemters, to be used to pass + KMP_SRC_LOC_PARM -- Source location parameters, to be used to pass parameters to underlying levels. KMP_SRC_LOC_CURR -- Source location arguments describing current location, to be used at top-level. 
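The kmp_wrapper_malloc.h hunk above documents macros for threading source-location information through the memory allocation wrappers: a declaration form, a parameter-forwarding form, and a "current location" form. A hedged sketch of how such macros compose (the names and expansions below are simplified stand-ins, not the runtime's KMP_SRC_LOC_* definitions, which also depend on the debug configuration):

#include <cstdio>
#include <cstdlib>

#define SRC_LOC_DECL , char const *file_, int line_ // appended to declarations
#define SRC_LOC_PARM , file_, line_                 // forwarded to callees
#define SRC_LOC_CURR , __FILE__, __LINE__           // supplied at the top level

// Bottom-level allocator that records where the request originated.
static void *malloc_src_loc(size_t size SRC_LOC_DECL) {
  std::printf("allocating %zu bytes, requested at %s:%d\n", size, file_, line_);
  return std::malloc(size);
}

// An intermediate wrapper forwards the location it received unchanged.
static void *wrapper(size_t size SRC_LOC_DECL) {
  return malloc_src_loc(size SRC_LOC_PARM);
}

int main() {
  void *p = wrapper(64 SRC_LOC_CURR); // top-level call records this file/line
  std::free(p);
  return 0;
}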
diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h index 958b5943af38..f753ab4ebc6d 100644 --- a/openmp/runtime/src/ompt-internal.h +++ b/openmp/runtime/src/ompt-internal.h @@ -57,8 +57,6 @@ typedef struct { ompt_data_t task_data; struct kmp_taskdata *scheduling_parent; int thread_num; - int ndeps; - ompt_dependence_t *deps; } ompt_task_info_t; typedef struct { diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp index 7fb81bb7d1a0..a7288f08a661 100644 --- a/openmp/runtime/src/ompt-specific.cpp +++ b/openmp/runtime/src/ompt-specific.cpp @@ -262,8 +262,6 @@ void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, lwt->ompt_task_info.frame.enter_frame = ompt_data_none; lwt->ompt_task_info.frame.exit_frame = ompt_data_none; lwt->ompt_task_info.scheduling_parent = NULL; - lwt->ompt_task_info.deps = NULL; - lwt->ompt_task_info.ndeps = 0; lwt->heap = 0; lwt->parent = 0; } diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h index 5ba240c1a950..fa5c5662c649 100644 --- a/openmp/runtime/src/ompt-specific.h +++ b/openmp/runtime/src/ompt-specific.h @@ -102,7 +102,7 @@ inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) { inline const char *ompt_get_runtime_version() { return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; } -#endif // OMPT_SUPPRORT +#endif // OMPT_SUPPORT // macros providing the OMPT callbacks for reduction clause #if OMPT_SUPPORT && OMPT_OPTIONAL diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h index d730c48ec705..db1c0d0d9d21 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h @@ -2303,7 +2303,7 @@ ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_ * name of the metadata), and a value (the actual data). The encoding of * the value depends on the type of the metadata. * - * The type of metadata is specified by an enumerated type __itt_metdata_type. + * The type of metadata is specified by an enumerated type __itt_metadata_type. * @{ */ @@ -3196,7 +3196,7 @@ ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, #define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b) #define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex) #else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_relation_add_to_current_ex(domain,clock_domain,timestame,relation,tail) +#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail) #define __itt_relation_add_to_current_ex_ptr 0 #define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail) #define __itt_relation_add_ex_ptr 0 diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp index 8f9e2a655ae4..4936b9baaf80 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp @@ -762,7 +762,7 @@ static const char* __itt_fsplit(const char* s, const char* sep, const char** out /* This function return value of env variable that placed into static buffer. * !!! The same static buffer is used for subsequent calls. !!! - * This was done to aviod dynamic allocation for few calls. + * This was done to avoid dynamic allocation for few calls. * Actually we need this function only four times. 
*/ static const char* __itt_get_env_var(const char* name) @@ -1012,7 +1012,7 @@ static void __itt_reinit_all_pointers(void) static void __itt_nullify_all_pointers(void) { int i; - /* Nulify all pointers except domain_create, string_handle_create and counter_create */ + /* Nullify all pointers except domain_create, string_handle_create and counter_create */ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; } diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 1daa3d31047e..3b5910fc95e8 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -164,7 +164,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { if (gCode > 0) { // Linux* OS only // The optimal situation: the OS returns the size of the buffer it expects. // - // A verification of correct behavior is that Isetaffinity on a NULL + // A verification of correct behavior is that setaffinity on a NULL // buffer with the same size fails with errno set to EFAULT. sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL); KA_TRACE(30, ("__kmp_affinity_determine_capable: " @@ -286,7 +286,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { if (gCode == 0) { KMP_AFFINITY_ENABLE(KMP_CPU_SET_SIZE_LIMIT); KA_TRACE(10, ("__kmp_affinity_determine_capable: " - "affinity supported (mask size %d)\n"< + "affinity supported (mask size %d)\n", (int)__kmp_affin_mask_size)); KMP_INTERNAL_FREE(buf); return; @@ -2207,7 +2207,7 @@ int __kmp_get_load_balance(int max) { #else // Linux* OS -// The fuction returns number of running (not sleeping) threads, or -1 in case +// The function returns number of running (not sleeping) threads, or -1 in case // of error. Error could be reported if Linux* OS kernel too old (without // "/proc" support). Counting running threads stops if max running threads // encountered. |
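The z_Linux_util.cpp hunks above are part of __kmp_affinity_determine_capable, which probes how large an affinity mask buffer the kernel expects before enabling affinity support (and double-checks by calling setaffinity on a NULL buffer, expecting EFAULT). A rough sketch of the same probing idea using the glibc sched_getaffinity wrapper rather than the raw syscalls the runtime issues (the return-value conventions differ, and the starting size and cap below are arbitrary choices for the example):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <cerrno>
#include <cstdio>
#include <sched.h>

int main() {
  // Grow the mask buffer until the kernel accepts it (EINVAL means too small).
  for (size_t bytes = sizeof(cpu_set_t); bytes <= 4096; bytes *= 2) {
    cpu_set_t *mask = CPU_ALLOC((int)(bytes * 8)); // CPU_ALLOC takes a CPU count
    if (!mask)
      return 1;
    size_t alloc_size = CPU_ALLOC_SIZE((int)(bytes * 8));
    if (sched_getaffinity(0, alloc_size, mask) == 0) {
      std::printf("affinity mask size accepted: %zu bytes\n", alloc_size);
      CPU_FREE(mask);
      return 0;
    }
    int err = errno; // EINVAL means the buffer was too small; try a bigger one
    CPU_FREE(mask);
    if (err != EINVAL) {
      std::printf("sched_getaffinity failed (errno %d), affinity not usable\n", err);
      return 1;
    }
  }
  std::printf("could not determine affinity mask size\n");
  return 1;
}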