author    Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit    cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree      209fb2a2d68f8f277793fc8df46c753d31bc853b /openmp/runtime/src/kmp_tasking.cpp
parent    706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'openmp/runtime/src/kmp_tasking.cpp')
-rw-r--r--  openmp/runtime/src/kmp_tasking.cpp  207
1 file changed, 123 insertions(+), 84 deletions(-)
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 15ffc1454fe9..2ddc2e7a6fd7 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -298,6 +298,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
static void __kmp_realloc_task_deque(kmp_info_t *thread,
kmp_thread_data_t *thread_data) {
kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
+ KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
kmp_int32 new_size = 2 * size;
KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
@@ -381,8 +382,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
} else {
__kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
locked = 1;
- // expand deque to push the task which is not allowed to execute
- __kmp_realloc_task_deque(thread, thread_data);
+ if (TCR_4(thread_data->td.td_deque_ntasks) >=
+ TASK_DEQUE_SIZE(thread_data->td)) {
+ // expand deque to push the task which is not allowed to execute
+ __kmp_realloc_task_deque(thread, thread_data);
+ }
}
}
// Lock the deque for the task push operation
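Both this hunk and the matching one in __kmp_give_task further down replace an unconditional reallocation with a guarded one: the deque is doubled only while its lock is held and only when it is actually full, and the new assert in __kmp_realloc_task_deque records that precondition. A minimal sketch of the check-then-grow-under-lock pattern, with hypothetical names standing in for the kmp_thread_data_t fields (not code from this patch):

```cpp
// Sketch only -- hypothetical stand-ins for td_deque_lock, td_deque and
// td_deque_ntasks; the real deque is a fixed-size ring buffer, not a vector.
#include <cstddef>
#include <mutex>
#include <vector>

struct TaskDeque {
  std::mutex lock;            // plays the role of td_deque_lock
  std::vector<void *> slots;  // plays the role of td_deque
  std::size_t ntasks = 0;     // plays the role of td_deque_ntasks
};

// Caller must already hold d.lock, mirroring __kmp_realloc_task_deque.
static void grow_if_full(TaskDeque &d) {
  if (d.ntasks >= d.slots.size())  // the guard this hunk adds
    d.slots.resize(d.slots.empty() ? 8 : d.slots.size() * 2);
}

static void push_task(TaskDeque &d, void *task) {
  std::lock_guard<std::mutex> g(d.lock);
  grow_if_full(d);             // only reallocates when the deque is full
  d.slots[d.ntasks++] = task;  // the real code also wraps head/tail indices
}
```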
@@ -547,8 +551,6 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
task->ompt_task_info.frame.enter_frame = ompt_data_none;
task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
- task->ompt_task_info.ndeps = 0;
- task->ompt_task_info.deps = NULL;
}
// __ompt_task_start:
@@ -573,24 +575,20 @@ static inline void __ompt_task_start(kmp_task_t *task,
// __ompt_task_finish:
// Build and trigger final task-schedule event
-static inline void
-__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
- ompt_task_status_t status = ompt_task_complete) {
- kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
- if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
- taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
- status = ompt_task_cancel;
- }
-
- /* let OMPT know that we're returning to the callee task */
+static inline void __ompt_task_finish(kmp_task_t *task,
+ kmp_taskdata_t *resumed_task,
+ ompt_task_status_t status) {
if (ompt_enabled.ompt_callback_task_schedule) {
+ kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+ if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
+ taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
+ status = ompt_task_cancel;
+ }
+
+ /* let OMPT know that we're returning to the callee task */
ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
&(taskdata->ompt_task_info.task_data), status,
- &((resumed_task ? resumed_task
- : (taskdata->ompt_task_info.scheduling_parent
- ? taskdata->ompt_task_info.scheduling_parent
- : taskdata->td_parent))
- ->ompt_task_info.task_data));
+ (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
}
}
#endif
@@ -799,6 +797,10 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
+//
+// template<ompt>: effectively ompt_enabled.enabled!=0
+// the version with ompt=false is inlined, allowing to optimize away all ompt
+// code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
kmp_taskdata_t *resumed_task) {
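The new comment spells out the specialization trick used for __kmp_task_finish: the function is instantiated once with ompt=true and once with ompt=false, and in the false instantiation every `if (ompt)` test is a compile-time constant, so the OMPT code disappears from the fast path. A minimal sketch of the same technique, with hypothetical function names (not code from this patch):

```cpp
// Sketch only -- emit_task_finish_event() and release_dependences() are
// hypothetical stand-ins for the OMPT callback and the common bookkeeping.
#include <cstdio>

static void emit_task_finish_event(int gtid) {
  std::printf("tool: task finished on thread %d\n", gtid);
}
static void release_dependences(int gtid) { (void)gtid; /* ... */ }

// The bool is a template parameter, not a runtime argument: when ompt is
// false the branch below is constant-false and the compiler drops it.
template <bool ompt>
static void finish_task(int gtid) {
  if (ompt)
    emit_task_finish_event(gtid);
  release_dependences(gtid);
}

void finish_task_dispatch(int gtid, bool tool_attached) {
  if (tool_attached)
    finish_task<true>(gtid);   // instrumented instantiation
  else
    finish_task<false>(gtid);  // lean instantiation, tool code elided
}
```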
@@ -845,10 +847,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
return;
}
}
-#if OMPT_SUPPORT
- if (ompt)
- __ompt_task_finish(task, resumed_task);
-#endif
// Check mutexinoutset dependencies, release locks
kmp_depnode_t *node = taskdata->td_depnode;
@@ -861,7 +859,37 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
}
}
+ // bookkeeping for resuming task:
+ // GEH - note tasking_ser => task_serial
+ KMP_DEBUG_ASSERT(
+ (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
+ taskdata->td_flags.task_serial);
+ if (taskdata->td_flags.task_serial) {
+ if (resumed_task == NULL) {
+ resumed_task = taskdata->td_parent; // In a serialized task, the resumed
+ // task is the parent
+ }
+ } else {
+ KMP_DEBUG_ASSERT(resumed_task !=
+ NULL); // verify that resumed task is passed as argument
+ }
+
+ /* If the tasks' destructor thunk flag has been set, we need to invoke the
+ destructor thunk that has been generated by the compiler. The code is
+ placed here, since at this point other tasks might have been released
+ hence overlapping the destructor invocations with some other work in the
+ released tasks. The OpenMP spec is not specific on when the destructors
+ are invoked, so we should be free to choose. */
+ if (taskdata->td_flags.destructors_thunk) {
+ kmp_routine_entry_t destr_thunk = task->data1.destructors;
+ KMP_ASSERT(destr_thunk);
+ destr_thunk(gtid, task);
+ }
+
KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
+ KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
+ KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
+
bool detach = false;
if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
if (taskdata->td_allow_completion_event.type ==
@@ -870,21 +898,41 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
__kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
if (taskdata->td_allow_completion_event.type ==
KMP_EVENT_ALLOW_COMPLETION) {
+ // task finished execution
+ KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
+ taskdata->td_flags.executing = 0; // suspend the finishing task
+
+#if OMPT_SUPPORT
+ // For a detached task, which is not completed, we switch back
+ // the omp_fulfill_event signals completion
+ // locking is necessary to avoid a race with ompt_task_late_fulfill
+ if (ompt)
+ __ompt_task_finish(task, resumed_task, ompt_task_detach);
+#endif
+
+ // no access to taskdata after this point!
+ // __kmp_fulfill_event might free taskdata at any time from now
+
taskdata->td_flags.proxy = TASK_PROXY; // proxify!
detach = true;
}
__kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
}
}
- KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
- KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
if (!detach) {
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPT_SUPPORT
+ // This is not a detached task, we are done here
+ if (ompt)
+ __ompt_task_finish(task, resumed_task, ompt_task_complete);
+#endif
+
// Only need to keep track of count if team parallel and tasking not
- // serialized
- if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+ // serialized, or task is detachable and event has already been fulfilled
+ if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
+ taskdata->td_flags.detachable == TASK_DETACHABLE) {
// Predecrement simulated by "- 1" calculation
children =
KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
@@ -897,45 +945,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
// with the proxy task as origin
__kmp_release_deps(gtid, taskdata);
}
+ // td_flags.executing must be marked as 0 after __kmp_release_deps has been
+ // called. Othertwise, if a task is executed immediately from the
+ // release_deps code, the flag will be reset to 1 again by this same
+ // function
+ KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
+ taskdata->td_flags.executing = 0; // suspend the finishing task
}
- // td_flags.executing must be marked as 0 after __kmp_release_deps has been
- // called. Othertwise, if a task is executed immediately from the release_deps
- // code, the flag will be reset to 1 again by this same function
- KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
- taskdata->td_flags.executing = 0; // suspend the finishing task
KA_TRACE(
20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
gtid, taskdata, children));
- /* If the tasks' destructor thunk flag has been set, we need to invoke the
- destructor thunk that has been generated by the compiler. The code is
- placed here, since at this point other tasks might have been released
- hence overlapping the destructor invokations with some other work in the
- released tasks. The OpenMP spec is not specific on when the destructors
- are invoked, so we should be free to choose. */
- if (taskdata->td_flags.destructors_thunk) {
- kmp_routine_entry_t destr_thunk = task->data1.destructors;
- KMP_ASSERT(destr_thunk);
- destr_thunk(gtid, task);
- }
-
- // bookkeeping for resuming task:
- // GEH - note tasking_ser => task_serial
- KMP_DEBUG_ASSERT(
- (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
- taskdata->td_flags.task_serial);
- if (taskdata->td_flags.task_serial) {
- if (resumed_task == NULL) {
- resumed_task = taskdata->td_parent; // In a serialized task, the resumed
- // task is the parent
- }
- } else {
- KMP_DEBUG_ASSERT(resumed_task !=
- NULL); // verify that resumed task is passed as argument
- }
-
// Free this task and then ancestor tasks if they have no children.
// Restore th_current_task first as suggested by John:
// johnmc: if an asynchronous inquiry peers into the runtime system
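The reorganized body above is the runtime side of OpenMP 5.0 detachable tasks: if the task body has returned while its allow-completion event is still pending, the task is suspended and proxified (a tool sees ompt_task_detach rather than ompt_task_complete), and the parent's incomplete-child count is only decremented once the event is fulfilled. For context, a minimal user-level example of the feature this code supports; it fulfills the event from inside the task body only to stay self-contained, whereas real code would hand the handle to an asynchronous agent:

```cpp
// Sketch only -- shows the omp task detach / omp_fulfill_event pairing that
// the detach branches in __kmp_task_finish implement.
#include <omp.h>
#include <cstdio>

int main() {
  #pragma omp parallel
  #pragma omp single
  {
    omp_event_handle_t evt;
    #pragma omp task detach(evt)
    {
      std::printf("task body done; completion still pending\n");
      // A real program would pass `evt` to a device or I/O callback and
      // return; fulfilling it here keeps the example self-contained.
      omp_fulfill_event(evt);
    }
    #pragma omp taskwait  // returns only after the event has been fulfilled
  }
  return 0;
}
```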
@@ -1304,7 +1326,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
taskdata->td_flags.task_serial =
(parent_task->td_flags.final || taskdata->td_flags.team_serial ||
- taskdata->td_flags.tasking_ser);
+ taskdata->td_flags.tasking_ser || flags->merged_if0);
taskdata->td_flags.started = 0;
taskdata->td_flags.executing = 0;
@@ -1411,7 +1433,7 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
//
// gtid: global thread ID of caller
// task: the task to invoke
-// current_task: the task to resume after task invokation
+// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
kmp_taskdata_t *current_task) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
@@ -2911,7 +2933,7 @@ static inline int __kmp_execute_tasks_template(
// met, then return now, so that the barrier gather/release pattern can
// proceed. If this thread is in the last spin loop in the barrier,
// waiting to be released, we know that the termination condition will not
- // be satisified, so don't waste any cycles checking it.
+ // be satisfied, so don't waste any cycles checking it.
if (flag == NULL || (!final_spin && flag->done_check())) {
KA_TRACE(
15,
@@ -3096,7 +3118,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
* to each thread in the team, so that it can steal work from it.
*
* Enter the existence of the kmp_task_team_t struct. It employs a reference
- * counting mechanims, and is allocated by the master thread before calling
+ * counting mechanism, and is allocated by the master thread before calling
* __kmp_<barrier_kind>_release, and then is release by the last thread to
* exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
* of the kmp_task_team_t structs for consecutive barriers can overlap
@@ -3107,7 +3129,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
* We currently use the existence of the threads array as an indicator that
* tasks were spawned since the last barrier. If the structure is to be
* useful outside the context of tasking, then this will have to change, but
- * not settting the field minimizes the performance impact of tasking on
+ * not setting the field minimizes the performance impact of tasking on
* barriers, when no explicit tasks were spawned (pushed, actually).
*/
@@ -3651,7 +3673,11 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
return result;
__kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
- __kmp_realloc_task_deque(thread, thread_data);
+ if (TCR_4(thread_data->td.td_deque_ntasks) >=
+ TASK_DEQUE_SIZE(thread_data->td)) {
+ // expand deque to push the task which is not allowed to execute
+ __kmp_realloc_task_deque(thread, thread_data);
+ }
} else {
@@ -3847,22 +3873,30 @@ void __kmp_fulfill_event(kmp_event_t *event) {
bool detached = false;
int gtid = __kmp_get_gtid();
+ // The associated task might have completed or could be completing at this
+ // point.
+ // We need to take the lock to avoid races
+ __kmp_acquire_tas_lock(&event->lock, gtid);
if (taskdata->td_flags.proxy == TASK_PROXY) {
- // The associated task code completed before this call and detached.
detached = true;
- event->type = KMP_EVENT_UNINITIALIZED;
} else {
- // The associated task has not completed but could be completing at this
- // point.
- // We need to take the lock to avoid races
- __kmp_acquire_tas_lock(&event->lock, gtid);
- if (taskdata->td_flags.proxy == TASK_PROXY)
- detached = true;
- event->type = KMP_EVENT_UNINITIALIZED;
- __kmp_release_tas_lock(&event->lock, gtid);
+#if OMPT_SUPPORT
+ // The OMPT event must occur under mutual exclusion,
+ // otherwise the tool might access ptask after free
+ if (UNLIKELY(ompt_enabled.enabled))
+ __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
+#endif
}
+ event->type = KMP_EVENT_UNINITIALIZED;
+ __kmp_release_tas_lock(&event->lock, gtid);
if (detached) {
+#if OMPT_SUPPORT
+ // We free ptask afterwards and know the task is finished,
+ // so locking is not necessary
+ if (UNLIKELY(ompt_enabled.enabled))
+ __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
+#endif
// If the task detached complete the proxy task
if (gtid >= 0) {
kmp_team_t *team = taskdata->td_team;
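This hunk removes the proxy fast path from __kmp_fulfill_event: the event lock is now taken unconditionally, the choice between the early- and late-fulfill cases is made under that lock, and the early-fulfill OMPT event fires while the lock is still held, so a concurrently finishing task cannot free the task data underneath the tool callback. A minimal sketch of that lock-then-decide pattern, with hypothetical names in place of the kmp_event_t / TAS-lock machinery (not code from this patch):

```cpp
// Sketch only -- Task, Event and the notify_*/complete_* helpers are
// hypothetical stand-ins for kmp_taskdata_t, kmp_event_t and the OMPT
// callbacks / proxy completion in the real runtime.
#include <mutex>

struct Task { bool proxy = false; };  // ~ td_flags.proxy == TASK_PROXY

static void notify_early_fulfill(Task &) {}
static void notify_late_fulfill(Task &) {}
static void complete_proxy_task(Task &) {}

struct Event {
  std::mutex lock;       // ~ the TAS lock inside kmp_event_t
  Task *task = nullptr;
  bool live = true;      // ~ type == KMP_EVENT_ALLOW_COMPLETION
};

void fulfill_event(Event &ev) {
  bool detached;
  {
    // Always take the lock: the task may be completing on another thread.
    std::lock_guard<std::mutex> g(ev.lock);
    detached = ev.task->proxy;         // has the task body already returned?
    if (!detached)
      notify_early_fulfill(*ev.task);  // must stay under the lock
    ev.live = false;                   // the event may not be fulfilled twice
  }
  if (detached) {
    notify_late_fulfill(*ev.task);     // task has finished; no lock needed
    complete_proxy_task(*ev.task);     // finish the proxified task
  }
}
```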
@@ -3888,14 +3922,13 @@ void __kmp_fulfill_event(kmp_event_t *event) {
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
kmp_task_t *task;
kmp_taskdata_t *taskdata;
- kmp_taskdata_t *taskdata_src;
- kmp_taskdata_t *parent_task = thread->th.th_current_task;
+ kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
+ kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
size_t shareds_offset;
size_t task_size;
KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
task_src));
- taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
TASK_FULL); // it should not be proxy task
KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
@@ -3923,9 +3956,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
}
taskdata->td_alloc_thread = thread;
taskdata->td_parent = parent_task;
- taskdata->td_taskgroup =
- parent_task
- ->td_taskgroup; // task inherits the taskgroup from the parent task
+ // task inherits the taskgroup from the parent task
+ taskdata->td_taskgroup = parent_task->td_taskgroup;
+ // tied task needs to initialize the td_last_tied at creation,
+ // untied one does this when it is scheduled for execution
+ if (taskdata->td_flags.tiedness == TASK_TIED)
+ taskdata->td_last_tied = taskdata;
// Only need to keep track of child task counts if team parallel and tasking
// not serialized
@@ -4255,7 +4291,7 @@ int __kmp_taskloop_task(int gtid, void *ptask) {
// grainsize Number of loop iterations per task
// extras Number of chunks with grainsize+1 iterations
// tc Iterations count
-// num_t_min Threashold to launch tasks recursively
+// num_t_min Threshold to launch tasks recursively
// task_dup Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
@@ -4267,7 +4303,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
void *codeptr_ra,
#endif
void *task_dup) {
-#if KMP_DEBUG
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
KMP_DEBUG_ASSERT(task != NULL);
KMP_DEBUG_ASSERT(num_tasks > num_t_min);
@@ -4275,7 +4310,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
" %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
task_dup));
-#endif
p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
kmp_uint64 lower = *lb;
kmp_info_t *thread = __kmp_threads[gtid];
@@ -4319,9 +4353,14 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
*ub = ub0; // adjust upper bound for the 1st half
// create auxiliary task for 2nd half of the loop
+ // make sure new task has same parent task as the pattern task
+ kmp_taskdata_t *current_task = thread->th.th_current_task;
+ thread->th.th_current_task = taskdata->td_parent;
kmp_task_t *new_task =
__kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+ // restore current task
+ thread->th.th_current_task = current_task;
__taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
p->task = next_task;
p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
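The save/switch/restore around __kmpc_omp_task_alloc exists because the allocator derives the new task's parent from the thread's current-task pointer; temporarily pointing that slot at the pattern task's parent makes the auxiliary task for the second half a sibling of the pattern task rather than its child. A minimal sketch of the idea, with hypothetical names in place of kmp_info_t / kmp_taskdata_t (not code from this patch):

```cpp
// Sketch only -- Thread/Task and alloc_task() are hypothetical stand-ins for
// kmp_info_t, kmp_taskdata_t and __kmpc_omp_task_alloc.
struct Task { Task *parent = nullptr; };
struct Thread { Task *current_task = nullptr; };

static Task *alloc_task(Thread &th) {
  Task *t = new Task;
  t->parent = th.current_task;  // parent comes from the thread's current task
  return t;
}

Task *alloc_sibling_of(Thread &th, Task &pattern) {
  Task *saved = th.current_task;      // save the real current task
  th.current_task = pattern.parent;   // pretend we are in the pattern's parent
  Task *aux = alloc_task(th);         // new task now shares that parent
  th.current_task = saved;            // restore the current task
  return aux;
}
```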