author     Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit     cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree       209fb2a2d68f8f277793fc8df46c753d31bc853b /openmp/runtime/src/kmp_tasking.cpp
parent     706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'openmp/runtime/src/kmp_tasking.cpp')
-rw-r--r--   openmp/runtime/src/kmp_tasking.cpp   207
1 file changed, 123 insertions, 84 deletions
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 15ffc1454fe9..2ddc2e7a6fd7 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -298,6 +298,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
 static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                      kmp_thread_data_t *thread_data) {
   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
+  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
   kmp_int32 new_size = 2 * size;
 
   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
@@ -381,8 +382,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
     } else {
       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
       locked = 1;
-      // expand deque to push the task which is not allowed to execute
-      __kmp_realloc_task_deque(thread, thread_data);
+      if (TCR_4(thread_data->td.td_deque_ntasks) >=
+          TASK_DEQUE_SIZE(thread_data->td)) {
+        // expand deque to push the task which is not allowed to execute
+        __kmp_realloc_task_deque(thread, thread_data);
+      }
     }
   }
   // Lock the deque for the task push operation
@@ -547,8 +551,6 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
   task->ompt_task_info.frame.enter_frame = ompt_data_none;
   task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
   task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
-  task->ompt_task_info.ndeps = 0;
-  task->ompt_task_info.deps = NULL;
 }
 
 // __ompt_task_start:
@@ -573,24 +575,20 @@ static inline void __ompt_task_start(kmp_task_t *task,
 
 // __ompt_task_finish:
 //   Build and trigger final task-schedule event
-static inline void
-__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
-                   ompt_task_status_t status = ompt_task_complete) {
-  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
-      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
-    status = ompt_task_cancel;
-  }
-
-  /* let OMPT know that we're returning to the callee task */
+static inline void __ompt_task_finish(kmp_task_t *task,
+                                      kmp_taskdata_t *resumed_task,
+                                      ompt_task_status_t status) {
   if (ompt_enabled.ompt_callback_task_schedule) {
+    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
+        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
+      status = ompt_task_cancel;
+    }
+
+    /* let OMPT know that we're returning to the callee task */
     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
         &(taskdata->ompt_task_info.task_data), status,
-        &((resumed_task ? resumed_task
-                        : (taskdata->ompt_task_info.scheduling_parent
-                               ? taskdata->ompt_task_info.scheduling_parent
-                               : taskdata->td_parent))
-              ->ompt_task_info.task_data));
+        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
   }
 }
 #endif
@@ -799,6 +797,10 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
 // gtid: global thread ID for calling thread
 // task: task to be finished
 // resumed_task: task to be resumed. (may be NULL if task is serialized)
+//
+// template<ompt>: effectively ompt_enabled.enabled!=0
+// the version with ompt=false is inlined, allowing to optimize away all ompt
+// code in this case
 template <bool ompt>
 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                               kmp_taskdata_t *resumed_task) {
@@ -845,10 +847,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
       return;
     }
   }
-#if OMPT_SUPPORT
-  if (ompt)
-    __ompt_task_finish(task, resumed_task);
-#endif
 
   // Check mutexinoutset dependencies, release locks
   kmp_depnode_t *node = taskdata->td_depnode;
@@ -861,7 +859,37 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
     }
   }
 
+  // bookkeeping for resuming task:
+  // GEH - note tasking_ser => task_serial
+  KMP_DEBUG_ASSERT(
+      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
+      taskdata->td_flags.task_serial);
+  if (taskdata->td_flags.task_serial) {
+    if (resumed_task == NULL) {
+      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
+      // task is the parent
+    }
+  } else {
+    KMP_DEBUG_ASSERT(resumed_task !=
+                     NULL); // verify that resumed task is passed as argument
+  }
+
+  /* If the tasks' destructor thunk flag has been set, we need to invoke the
+     destructor thunk that has been generated by the compiler. The code is
+     placed here, since at this point other tasks might have been released
+     hence overlapping the destructor invocations with some other work in the
+     released tasks. The OpenMP spec is not specific on when the destructors
+     are invoked, so we should be free to choose. */
+  if (taskdata->td_flags.destructors_thunk) {
+    kmp_routine_entry_t destr_thunk = task->data1.destructors;
+    KMP_ASSERT(destr_thunk);
+    destr_thunk(gtid, task);
+  }
+  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
+
   bool detach = false;
   if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
     if (taskdata->td_allow_completion_event.type ==
@@ -870,21 +898,41 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
       __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
       if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
+        // task finished execution
+        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
+        taskdata->td_flags.executing = 0; // suspend the finishing task
+
+#if OMPT_SUPPORT
+        // For a detached task, which is not completed, we switch back
+        // the omp_fulfill_event signals completion
+        // locking is necessary to avoid a race with ompt_task_late_fulfill
+        if (ompt)
+          __ompt_task_finish(task, resumed_task, ompt_task_detach);
+#endif
+
+        // no access to taskdata after this point!
+        // __kmp_fulfill_event might free taskdata at any time from now
+        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
         detach = true;
       }
       __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
     }
   }
-  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
-  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
   if (!detach) {
     taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPT_SUPPORT
+    // This is not a detached task, we are done here
+    if (ompt)
+      __ompt_task_finish(task, resumed_task, ompt_task_complete);
+#endif
+
     // Only need to keep track of count if team parallel and tasking not
-    // serialized
-    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+    // serialized, or task is detachable and event has already been fulfilled
+    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
+        taskdata->td_flags.detachable == TASK_DETACHABLE) {
       // Predecrement simulated by "- 1" calculation
       children =
           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
@@ -897,45 +945,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
       // with the proxy task as origin
       __kmp_release_deps(gtid, taskdata);
     }
+    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
+    // called. Othertwise, if a task is executed immediately from the
+    // release_deps code, the flag will be reset to 1 again by this same
+    // function
+    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
+    taskdata->td_flags.executing = 0; // suspend the finishing task
   }
-  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
-  // called. Othertwise, if a task is executed immediately from the release_deps
-  // code, the flag will be reset to 1 again by this same function
-  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
-  taskdata->td_flags.executing = 0; // suspend the finishing task
 
   KA_TRACE(
       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));
 
-  /* If the tasks' destructor thunk flag has been set, we need to invoke the
-     destructor thunk that has been generated by the compiler. The code is
-     placed here, since at this point other tasks might have been released
-     hence overlapping the destructor invokations with some other work in the
-     released tasks. The OpenMP spec is not specific on when the destructors
-     are invoked, so we should be free to choose. */
-  if (taskdata->td_flags.destructors_thunk) {
-    kmp_routine_entry_t destr_thunk = task->data1.destructors;
-    KMP_ASSERT(destr_thunk);
-    destr_thunk(gtid, task);
-  }
-
-  // bookkeeping for resuming task:
-  // GEH - note tasking_ser => task_serial
-  KMP_DEBUG_ASSERT(
-      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
-      taskdata->td_flags.task_serial);
-  if (taskdata->td_flags.task_serial) {
-    if (resumed_task == NULL) {
-      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
-      // task is the parent
-    }
-  } else {
-    KMP_DEBUG_ASSERT(resumed_task !=
-                     NULL); // verify that resumed task is passed as argument
-  }
-
   // Free this task and then ancestor tasks if they have no children.
   // Restore th_current_task first as suggested by John:
   // johnmc: if an asynchronous inquiry peers into the runtime system
@@ -1304,7 +1326,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
 
   taskdata->td_flags.task_serial =
       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
-       taskdata->td_flags.tasking_ser);
+       taskdata->td_flags.tasking_ser || flags->merged_if0);
 
   taskdata->td_flags.started = 0;
   taskdata->td_flags.executing = 0;
@@ -1411,7 +1433,7 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
 //
 // gtid: global thread ID of caller
 // task: the task to invoke
-// current_task: the task to resume after task invokation
+// current_task: the task to resume after task invocation
 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                               kmp_taskdata_t *current_task) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
@@ -2911,7 +2933,7 @@ static inline int __kmp_execute_tasks_template(
     // met, then return now, so that the barrier gather/release pattern can
     // proceed. If this thread is in the last spin loop in the barrier,
     // waiting to be released, we know that the termination condition will not
-    // be satisified, so don't waste any cycles checking it.
+    // be satisfied, so don't waste any cycles checking it.
     if (flag == NULL || (!final_spin && flag->done_check())) {
       KA_TRACE(
           15,
@@ -3096,7 +3118,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
 * to each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
-* counting mechanims, and is allocated by the master thread before calling
+* counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is release by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
@@ -3107,7 +3129,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
-* not settting the field minimizes the performance impact of tasking on
+* not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
@@ -3651,7 +3673,11 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
       return result;
 
     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
-    __kmp_realloc_task_deque(thread, thread_data);
+    if (TCR_4(thread_data->td.td_deque_ntasks) >=
+        TASK_DEQUE_SIZE(thread_data->td)) {
+      // expand deque to push the task which is not allowed to execute
+      __kmp_realloc_task_deque(thread, thread_data);
+    }
 
   } else {
@@ -3847,22 +3873,30 @@ void __kmp_fulfill_event(kmp_event_t *event) {
   bool detached = false;
   int gtid = __kmp_get_gtid();
 
+  // The associated task might have completed or could be completing at this
+  // point.
+  // We need to take the lock to avoid races
+  __kmp_acquire_tas_lock(&event->lock, gtid);
   if (taskdata->td_flags.proxy == TASK_PROXY) {
-    // The associated task code completed before this call and detached.
     detached = true;
-    event->type = KMP_EVENT_UNINITIALIZED;
   } else {
-    // The associated task has not completed but could be completing at this
-    // point.
-    // We need to take the lock to avoid races
-    __kmp_acquire_tas_lock(&event->lock, gtid);
-    if (taskdata->td_flags.proxy == TASK_PROXY)
-      detached = true;
-    event->type = KMP_EVENT_UNINITIALIZED;
-    __kmp_release_tas_lock(&event->lock, gtid);
+#if OMPT_SUPPORT
+    // The OMPT event must occur under mutual exclusion,
+    // otherwise the tool might access ptask after free
+    if (UNLIKELY(ompt_enabled.enabled))
+      __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
+#endif
   }
+  event->type = KMP_EVENT_UNINITIALIZED;
+  __kmp_release_tas_lock(&event->lock, gtid);
 
   if (detached) {
+#if OMPT_SUPPORT
+    // We free ptask afterwards and know the task is finished,
+    // so locking is not necessary
+    if (UNLIKELY(ompt_enabled.enabled))
+      __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
+#endif
    // If the task detached complete the proxy task
    if (gtid >= 0) {
      kmp_team_t *team = taskdata->td_team;
@@ -3888,14 +3922,13 @@ void __kmp_fulfill_event(kmp_event_t *event) {
 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
-  kmp_taskdata_t *taskdata_src;
-  kmp_taskdata_t *parent_task = thread->th.th_current_task;
+  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
+  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
   size_t shareds_offset;
   size_t task_size;
 
   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                 task_src));
-  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                    TASK_FULL); // it should not be proxy task
   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
@@ -3923,9 +3956,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
   }
   taskdata->td_alloc_thread = thread;
   taskdata->td_parent = parent_task;
-  taskdata->td_taskgroup =
-      parent_task
-          ->td_taskgroup; // task inherits the taskgroup from the parent task
+  // task inherits the taskgroup from the parent task
+  taskdata->td_taskgroup = parent_task->td_taskgroup;
+  // tied task needs to initialize the td_last_tied at creation,
+  // untied one does this when it is scheduled for execution
+  if (taskdata->td_flags.tiedness == TASK_TIED)
+    taskdata->td_last_tied = taskdata;
 
   // Only need to keep track of child task counts if team parallel and tasking
   // not serialized
@@ -4255,7 +4291,7 @@ int __kmp_taskloop_task(int gtid, void *ptask) {
 // grainsize Number of loop iterations per task
 // extras Number of chunks with grainsize+1 iterations
 // tc Iterations count
-// num_t_min Threashold to launch tasks recursively
+// num_t_min Threshold to launch tasks recursively
 // task_dup Tasks duplication routine
 // codeptr_ra Return address for OMPT events
 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
@@ -4267,7 +4303,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                           void *codeptr_ra,
 #endif
                           void *task_dup) {
-#if KMP_DEBUG
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   KMP_DEBUG_ASSERT(task != NULL);
   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
@@ -4275,7 +4310,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                 task_dup));
-#endif
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
   kmp_uint64 lower = *lb;
   kmp_info_t *thread = __kmp_threads[gtid];
@@ -4319,9 +4353,14 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
   *ub = ub0; // adjust upper bound for the 1st half
 
   // create auxiliary task for 2nd half of the loop
+  // make sure new task has same parent task as the pattern task
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  thread->th.th_current_task = taskdata->td_parent;
   kmp_task_t *new_task =
       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  // restore current task
+  thread->th.th_current_task = current_task;
   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
   p->task = next_task;
   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
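
The new comment above __kmp_task_finish documents the template<bool ompt> parameter: the ompt=false instantiation is inlined so all OMPT code folds away when no tool is attached. The standalone C++ sketch below is illustrative only and is not part of this commit or of the OpenMP runtime; the names demo::tool_enabled, finish_task, and finish are invented stand-ins (for ompt_enabled.enabled, __kmp_task_finish<ompt>, and its callers) used here only to show the same compile-time dispatch pattern.

// Illustrative sketch only -- not from the runtime. A bool template
// parameter decides, at compile time, whether tool-callback code exists
// in an instantiation at all, mirroring __kmp_task_finish<ompt>.
#include <cstdio>

namespace demo {

bool tool_enabled = false; // stand-in for ompt_enabled.enabled (assumed name)

template <bool ompt> static void finish_task(int task_id) {
  if (ompt) { // constant per instantiation; the dead branch is removed
    std::printf("tool callback: task %d finished\n", task_id);
  }
  // ... bookkeeping shared by both instantiations would go here ...
}

// Dispatch once on the runtime flag; each branch calls a fixed instantiation.
void finish(int task_id) {
  if (tool_enabled)
    finish_task<true>(task_id);
  else
    finish_task<false>(task_id);
}

} // namespace demo

int main() {
  demo::finish(1); // no callback emitted
  demo::tool_enabled = true;
  demo::finish(2); // callback emitted
  return 0;
}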