LLVM OpenMP* Runtime Library
kmp_csupport.cpp
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in the future, e.g. to implement openmp_strict library
26 // restrictions
27 
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37  // By default __kmpc_begin() is a no-op.
38  char *env;
39  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40  __kmp_str_match_true(env)) {
41  __kmp_middle_initialize();
42  __kmp_assign_root_init_mask();
43  KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44  } else if (__kmp_ignore_mppbeg() == FALSE) {
45  // By default __kmp_ignore_mppbeg() returns TRUE.
46  __kmp_internal_begin();
47  KC_TRACE(10, ("__kmpc_begin: called\n"));
48  }
49 }
50 
59 void __kmpc_end(ident_t *loc) {
60  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
61  // __kmpc_end() call a no-op. However, this can be overridden with the
62  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
63  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister
64  // this root (which can cause library shutdown).
65  if (__kmp_ignore_mppend() == FALSE) {
66  KC_TRACE(10, ("__kmpc_end: called\n"));
67  KA_TRACE(30, ("__kmpc_end\n"));
68 
69  __kmp_internal_end_thread(-1);
70  }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72  // Normal exit process on Windows does not allow worker threads of the final
73  // parallel region to finish reporting their events, so shutting down the
74  // library here fixes the issue at least for the cases where __kmpc_end() is
75  // placed properly.
76  if (ompt_enabled.enabled)
77  __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
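// Editorial note (not part of the original source): __kmpc_begin/__kmpc_end are
// entry points a compiler may emit to bracket a program that uses OpenMP. A
// minimal sketch of such generated code is shown below; the NULL location is a
// placeholder for the source-location descriptor a real compiler would pass.
#if 0 // illustrative only
int main(void) {
  ident_t *loc = NULL; // a real compiler passes a source-location descriptor
  __kmpc_begin(loc, 0); // flags are currently ignored (see comment above)
  /* ... program body, possibly containing parallel regions ... */
  __kmpc_end(loc);
  return 0;
}
#endif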
80 
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100  kmp_int32 gtid = __kmp_entry_gtid();
101 
102  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104  return gtid;
105 }
106 
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122  KC_TRACE(10,
123  ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125  return TCR_4(__kmp_all_nth);
126 }
127 
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136  return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147  return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159  return TRUE;
160 
161 #else
162 
163  const char *semi2;
164  const char *semi3;
165  int line_no;
166 
167  if (__kmp_par_range == 0) {
168  return TRUE;
169  }
170  semi2 = loc->psource;
171  if (semi2 == NULL) {
172  return TRUE;
173  }
174  semi2 = strchr(semi2, ';');
175  if (semi2 == NULL) {
176  return TRUE;
177  }
178  semi2 = strchr(semi2 + 1, ';');
179  if (semi2 == NULL) {
180  return TRUE;
181  }
182  if (__kmp_par_range_filename[0]) {
183  const char *name = semi2 - 1;
184  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185  name--;
186  }
187  if ((*name == '/') || (*name == ';')) {
188  name++;
189  }
190  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191  return __kmp_par_range < 0;
192  }
193  }
194  semi3 = strchr(semi2 + 1, ';');
195  if (__kmp_par_range_routine[0]) {
196  if ((semi3 != NULL) && (semi3 > semi2) &&
197  (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198  return __kmp_par_range < 0;
199  }
200  }
201  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203  return __kmp_par_range > 0;
204  }
205  return __kmp_par_range < 0;
206  }
207  return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219  return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232  kmp_int32 num_threads) {
233  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234  global_tid, num_threads));
235  __kmp_assert_valid_gtid(global_tid);
236  __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
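// Editorial sketch (not part of the original source): a num_threads clause is
// typically lowered to a __kmpc_push_num_threads call immediately before the
// matching __kmpc_fork_call (defined further below); the pushed value applies
// only to that next parallel region. The function name below is hypothetical.
#if 0 // illustrative only
void parallel_with_num_threads(ident_t *loc, kmpc_micro outlined_task) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_threads(loc, gtid, /*num_threads=*/4);
  __kmpc_fork_call(loc, /*argc=*/0, outlined_task);
}
#endif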
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241  /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245  kmp_int32 proc_bind) {
246  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247  proc_bind));
248  __kmp_assert_valid_gtid(global_tid);
249  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263  int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266  // If we were in a serial region, then stop the serial timer, record
267  // the event, and start the parallel region timer
268  stats_state_e previous_state = KMP_GET_THREAD_STATE();
269  if (previous_state == stats_state_e::SERIAL_REGION) {
270  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271  } else {
272  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273  }
274  int inParallel = __kmpc_in_parallel(loc);
275  if (inParallel) {
276  KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277  } else {
278  KMP_COUNT_BLOCK(OMP_PARALLEL);
279  }
280 #endif
281 
282  // maybe saving thr_state is enough here
283  {
284  va_list ap;
285  va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288  ompt_frame_t *ompt_frame;
289  if (ompt_enabled.enabled) {
290  kmp_info_t *master_th = __kmp_threads[gtid];
291  kmp_team_t *parent_team = master_th->th.th_team;
292  ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293  if (lwt)
294  ompt_frame = &(lwt->ompt_task_info.frame);
295  else {
296  int tid = __kmp_tid_from_gtid(gtid);
297  ompt_frame = &(
298  parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299  }
300  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301  }
302  OMPT_STORE_RETURN_ADDRESS(gtid);
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306  SSC_MARK_FORKING();
307 #endif
308  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311  kmp_va_addr_of(ap));
312 #if INCLUDE_SSC_MARKS
313  SSC_MARK_JOINING();
314 #endif
315  __kmp_join_call(loc, gtid
316 #if OMPT_SUPPORT
317  ,
318  fork_context_intel
319 #endif
320  );
321 
322  va_end(ap);
323  }
324 
325 #if KMP_STATS_ENABLED
326  if (previous_state == stats_state_e::SERIAL_REGION) {
327  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328  KMP_SET_THREAD_STATE(previous_state);
329  } else {
330  KMP_POP_PARTITIONED_TIMER();
331  }
332 #endif // KMP_STATS_ENABLED
333 }
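// Editorial sketch (not part of the original source): a compiler typically
// lowers `#pragma omp parallel` into an outlined "microtask" plus a call to
// __kmpc_fork_call. The names `outlined` and `parallel_example` below are
// hypothetical, and the exact outlining convention is compiler-specific.
#if 0 // illustrative only
static void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
  /* parallel region body; runs on every thread in the team, *x is shared */
}

void parallel_example(ident_t *loc, int x) {
  // argc counts the shared arguments forwarded to the microtask after the
  // gtid/bound_tid pointers
  __kmpc_fork_call(loc, /*argc=*/1, (kmpc_micro)outlined, &x);
}
#endif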
334 
346 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
347  kmp_int32 num_teams, kmp_int32 num_threads) {
348  KA_TRACE(20,
349  ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
350  global_tid, num_teams, num_threads));
351  __kmp_assert_valid_gtid(global_tid);
352  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
353 }
354 
371 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
372  kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
373  kmp_int32 num_threads) {
374  KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
375  " num_teams_ub=%d num_threads=%d\n",
376  global_tid, num_teams_lb, num_teams_ub, num_threads));
377  __kmp_assert_valid_gtid(global_tid);
378  __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
379  num_threads);
380 }
381 
392 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
393  ...) {
394  int gtid = __kmp_entry_gtid();
395  kmp_info_t *this_thr = __kmp_threads[gtid];
396  va_list ap;
397  va_start(ap, microtask);
398 
399 #if KMP_STATS_ENABLED
400  KMP_COUNT_BLOCK(OMP_TEAMS);
401  stats_state_e previous_state = KMP_GET_THREAD_STATE();
402  if (previous_state == stats_state_e::SERIAL_REGION) {
403  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
404  } else {
405  KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
406  }
407 #endif
408 
409  // remember teams entry point and nesting level
410  this_thr->th.th_teams_microtask = microtask;
411  this_thr->th.th_teams_level =
412  this_thr->th.th_team->t.t_level; // AC: can be >0 on host
413 
414 #if OMPT_SUPPORT
415  kmp_team_t *parent_team = this_thr->th.th_team;
416  int tid = __kmp_tid_from_gtid(gtid);
417  if (ompt_enabled.enabled) {
418  parent_team->t.t_implicit_task_taskdata[tid]
419  .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
420  }
421  OMPT_STORE_RETURN_ADDRESS(gtid);
422 #endif
423 
424  // Check whether __kmpc_push_num_teams was called; if not, set the default
425  // number of teams.
426  if (this_thr->th.th_teams_size.nteams == 0) {
427  __kmp_push_num_teams(loc, gtid, 0, 0);
428  }
429  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
430  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
431  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
432 
433  __kmp_fork_call(
434  loc, gtid, fork_context_intel, argc,
435  VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
436  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
437  __kmp_join_call(loc, gtid
438 #if OMPT_SUPPORT
439  ,
440  fork_context_intel
441 #endif
442  );
443 
444  // Pop current CG root off list
445  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
447  this_thr->th.th_cg_roots = tmp->up;
448  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
449  " to node %p. cg_nthreads was %d\n",
450  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
451  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
452  int i = tmp->cg_nthreads--;
453  if (i == 1) { // check if we are the last thread in the CG (not always the case)
454  __kmp_free(tmp);
455  }
456  // Restore current task's thread_limit from CG root
457  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
458  this_thr->th.th_current_task->td_icvs.thread_limit =
459  this_thr->th.th_cg_roots->cg_thread_limit;
460 
461  this_thr->th.th_teams_microtask = NULL;
462  this_thr->th.th_teams_level = 0;
463  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
464  va_end(ap);
465 #if KMP_STATS_ENABLED
466  if (previous_state == stats_state_e::SERIAL_REGION) {
467  KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
468  KMP_SET_THREAD_STATE(previous_state);
469  } else {
470  KMP_POP_PARTITIONED_TIMER();
471  }
472 #endif // KMP_STATS_ENABLED
473 }
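// Editorial sketch (not part of the original source): `#pragma omp teams
// num_teams(n) thread_limit(m)` is typically lowered to a __kmpc_push_num_teams
// call followed by __kmpc_fork_teams with an outlined teams region. The names
// below are hypothetical.
#if 0 // illustrative only
static void teams_outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid) {
  /* teams region body; executed once by the initial thread of each team */
}

void teams_example(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_teams(loc, gtid, /*num_teams=*/4, /*num_threads=*/8);
  __kmpc_fork_teams(loc, /*argc=*/0, (kmpc_micro)teams_outlined);
}
#endif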
474 
475 // I don't think this function should ever have been exported.
476 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
477 // openmp code ever called it, but it's been exported from the RTL for so
478 // long that I'm afraid to remove the definition.
479 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
480 
493 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494  // The implementation is now in kmp_runtime.cpp so that it can share static
495  // functions with kmp_fork_call since the tasks to be done are similar in
496  // each case.
497  __kmp_assert_valid_gtid(global_tid);
498 #if OMPT_SUPPORT
499  OMPT_STORE_RETURN_ADDRESS(global_tid);
500 #endif
501  __kmp_serialized_parallel(loc, global_tid);
502 }
503 
511 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
512  kmp_internal_control_t *top;
513  kmp_info_t *this_thr;
514  kmp_team_t *serial_team;
515 
516  KC_TRACE(10,
517  ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
518 
519  /* skip all this code for autopar serialized loops since it results in
520  unacceptable overhead */
521  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
522  return;
523 
524  // Not autopar code
525  __kmp_assert_valid_gtid(global_tid);
526  if (!TCR_4(__kmp_init_parallel))
527  __kmp_parallel_initialize();
528 
529  __kmp_resume_if_soft_paused();
530 
531  this_thr = __kmp_threads[global_tid];
532  serial_team = this_thr->th.th_serial_team;
533 
534  kmp_task_team_t *task_team = this_thr->th.th_task_team;
535  // we need to wait for the proxy tasks before finishing the thread
536  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
537  __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
538 
539  KMP_MB();
540  KMP_DEBUG_ASSERT(serial_team);
541  KMP_ASSERT(serial_team->t.t_serialized);
542  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
543  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
544  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
545  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
546 
547 #if OMPT_SUPPORT
548  if (ompt_enabled.enabled &&
549  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
550  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
551  if (ompt_enabled.ompt_callback_implicit_task) {
552  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
553  ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
554  OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
555  }
556 
557  // clear the task id only after unlinking the task
558  ompt_data_t *parent_task_data;
559  __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
560 
561  if (ompt_enabled.ompt_callback_parallel_end) {
562  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
563  &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
564  ompt_parallel_invoker_program | ompt_parallel_team,
565  OMPT_LOAD_RETURN_ADDRESS(global_tid));
566  }
567  __ompt_lw_taskteam_unlink(this_thr);
568  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
569  }
570 #endif
571 
572  /* If necessary, pop the internal control stack values and replace the team
573  * values */
574  top = serial_team->t.t_control_stack_top;
575  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
576  copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
577  serial_team->t.t_control_stack_top = top->next;
578  __kmp_free(top);
579  }
580 
581  /* pop dispatch buffers stack */
582  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
583  {
584  dispatch_private_info_t *disp_buffer =
585  serial_team->t.t_dispatch->th_disp_buffer;
586  serial_team->t.t_dispatch->th_disp_buffer =
587  serial_team->t.t_dispatch->th_disp_buffer->next;
588  __kmp_free(disp_buffer);
589  }
590  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
591 
592  --serial_team->t.t_serialized;
593  if (serial_team->t.t_serialized == 0) {
594 
595  /* return to the parallel section */
596 
597 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
598  if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
599  __kmp_clear_x87_fpu_status_word();
600  __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
601  __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
602  }
603 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
604 
605  __kmp_pop_current_task_from_thread(this_thr);
606 #if OMPD_SUPPORT
607  if (ompd_state & OMPD_ENABLE_BP)
608  ompd_bp_parallel_end();
609 #endif
610 
611  this_thr->th.th_team = serial_team->t.t_parent;
612  this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
613 
614  /* restore values cached in the thread */
615  this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
616  this_thr->th.th_team_master =
617  serial_team->t.t_parent->t.t_threads[0]; /* JPH */
618  this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
619 
620  /* TODO the below shouldn't need to be adjusted for serialized teams */
621  this_thr->th.th_dispatch =
622  &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
623 
624  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
625  this_thr->th.th_current_task->td_flags.executing = 1;
626 
627  if (__kmp_tasking_mode != tskm_immediate_exec) {
628  // Copy the task team from the new child / old parent team to the thread.
629  this_thr->th.th_task_team =
630  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
631  KA_TRACE(20,
632  ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
633  "team %p\n",
634  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
635  }
636  } else {
637  if (__kmp_tasking_mode != tskm_immediate_exec) {
638  KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
639  "depth of serial team %p to %d\n",
640  global_tid, serial_team, serial_team->t.t_serialized));
641  }
642  }
643 
644  serial_team->t.t_level--;
645  if (__kmp_env_consistency_check)
646  __kmp_pop_parallel(global_tid, NULL);
647 #if OMPT_SUPPORT
648  if (ompt_enabled.enabled)
649  this_thr->th.ompt_thread_info.state =
650  ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
651  : ompt_state_work_parallel);
652 #endif
653 }
654 
663 void __kmpc_flush(ident_t *loc) {
664  KC_TRACE(10, ("__kmpc_flush: called\n"));
665 
666  /* need an explicit __mf() here since we use volatile instead in the library */
667  KMP_MB(); /* Flush all pending memory write invalidates. */
668 
669 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
670 #if KMP_MIC
671 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
672 // We shouldn't need it, though, since the ABI rules require that
673 // * If the compiler generates NGO stores it also generates the fence
674 // * If users hand-code NGO stores they should insert the fence
675 // therefore no incomplete unordered stores should be visible.
676 #else
677  // C74404
678  // This is to address non-temporal store instructions (sfence needed).
679  // The clflush instruction is also addressed (mfence needed).
680  // Probably the non-temporal load movntdqa instruction should also be
681  // addressed.
682  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
683  if (!__kmp_cpuinfo.initialized) {
684  __kmp_query_cpuid(&__kmp_cpuinfo);
685  }
686  if (!__kmp_cpuinfo.sse2) {
687  // CPU cannot execute SSE2 instructions.
688  } else {
689 #if KMP_COMPILER_ICC
690  _mm_mfence();
691 #elif KMP_COMPILER_MSVC
692  MemoryBarrier();
693 #else
694  __sync_synchronize();
695 #endif // KMP_COMPILER_ICC
696  }
697 #endif // KMP_MIC
698 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
699  KMP_ARCH_RISCV64)
700 // Nothing to see here, move along.
701 #elif KMP_ARCH_PPC64
702 // Nothing needed here (we have a real MB above).
703 #else
704 #error Unknown or unsupported architecture
705 #endif
706 
707 #if OMPT_SUPPORT && OMPT_OPTIONAL
708  if (ompt_enabled.ompt_callback_flush) {
709  ompt_callbacks.ompt_callback(ompt_callback_flush)(
710  __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
711  }
712 #endif
713 }
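// Editorial sketch (not part of the original source): `#pragma omp flush`
// lowers to a single call of the form below; the function name is hypothetical.
#if 0 // illustrative only
void flush_example(ident_t *loc) {
  __kmpc_flush(loc); // loc is not otherwise inspected by the implementation above
}
#endif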
714 
715 /* -------------------------------------------------------------------------- */
723 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
724  KMP_COUNT_BLOCK(OMP_BARRIER);
725  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
726  __kmp_assert_valid_gtid(global_tid);
727 
728  if (!TCR_4(__kmp_init_parallel))
729  __kmp_parallel_initialize();
730 
731  __kmp_resume_if_soft_paused();
732 
733  if (__kmp_env_consistency_check) {
734  if (loc == 0) {
735  KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
736  }
737  __kmp_check_barrier(global_tid, ct_barrier, loc);
738  }
739 
740 #if OMPT_SUPPORT
741  ompt_frame_t *ompt_frame;
742  if (ompt_enabled.enabled) {
743  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
744  if (ompt_frame->enter_frame.ptr == NULL)
745  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
746  }
747  OMPT_STORE_RETURN_ADDRESS(global_tid);
748 #endif
749  __kmp_threads[global_tid]->th.th_ident = loc;
750  // TODO: explicit barrier_wait_id:
751  // this function is called when the 'barrier' directive is present or at the
752  // implicit barrier at the end of a worksharing construct.
753  // 1) better to add a per-thread barrier counter to a thread data structure
754  // 2) set to 0 when a new team is created
755  // 3) no sync is required
756 
757  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
758 #if OMPT_SUPPORT && OMPT_OPTIONAL
759  if (ompt_enabled.enabled) {
760  ompt_frame->enter_frame = ompt_data_none;
761  }
762 #endif
763 }
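// Editorial sketch (not part of the original source): an explicit
// `#pragma omp barrier` inside a parallel region lowers to a call like this;
// the wrapper name is hypothetical.
#if 0 // illustrative only
void barrier_example(ident_t *loc) {
  __kmpc_barrier(loc, __kmpc_global_thread_num(loc));
}
#endif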
764 
765 /* The BARRIER for a MASTER section is always explicit */
772 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
773  int status = 0;
774 
775  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
776  __kmp_assert_valid_gtid(global_tid);
777 
778  if (!TCR_4(__kmp_init_parallel))
779  __kmp_parallel_initialize();
780 
781  __kmp_resume_if_soft_paused();
782 
783  if (KMP_MASTER_GTID(global_tid)) {
784  KMP_COUNT_BLOCK(OMP_MASTER);
785  KMP_PUSH_PARTITIONED_TIMER(OMP_master);
786  status = 1;
787  }
788 
789 #if OMPT_SUPPORT && OMPT_OPTIONAL
790  if (status) {
791  if (ompt_enabled.ompt_callback_masked) {
792  kmp_info_t *this_thr = __kmp_threads[global_tid];
793  kmp_team_t *team = this_thr->th.th_team;
794 
795  int tid = __kmp_tid_from_gtid(global_tid);
796  ompt_callbacks.ompt_callback(ompt_callback_masked)(
797  ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
798  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
799  OMPT_GET_RETURN_ADDRESS(0));
800  }
801  }
802 #endif
803 
804  if (__kmp_env_consistency_check) {
805 #if KMP_USE_DYNAMIC_LOCK
806  if (status)
807  __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
808  else
809  __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
810 #else
811  if (status)
812  __kmp_push_sync(global_tid, ct_master, loc, NULL);
813  else
814  __kmp_check_sync(global_tid, ct_master, loc, NULL);
815 #endif
816  }
817 
818  return status;
819 }
820 
829 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
830  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
831  __kmp_assert_valid_gtid(global_tid);
832  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
833  KMP_POP_PARTITIONED_TIMER();
834 
835 #if OMPT_SUPPORT && OMPT_OPTIONAL
836  kmp_info_t *this_thr = __kmp_threads[global_tid];
837  kmp_team_t *team = this_thr->th.th_team;
838  if (ompt_enabled.ompt_callback_masked) {
839  int tid = __kmp_tid_from_gtid(global_tid);
840  ompt_callbacks.ompt_callback(ompt_callback_masked)(
841  ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
842  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
843  OMPT_GET_RETURN_ADDRESS(0));
844  }
845 #endif
846 
847  if (__kmp_env_consistency_check) {
848  if (KMP_MASTER_GTID(global_tid))
849  __kmp_pop_sync(global_tid, ct_master, loc);
850  }
851 }
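// Editorial sketch (not part of the original source): `#pragma omp master`
// lowers to a guarded call pair; only the thread for which __kmpc_master
// returns 1 executes the block and the matching __kmpc_end_master. The wrapper
// name below is hypothetical.
#if 0 // illustrative only
void master_example(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  if (__kmpc_master(loc, gtid)) {
    /* master region body */
    __kmpc_end_master(loc, gtid);
  }
}
#endif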
852 
861 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
862  int status = 0;
863  int tid;
864  KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
865  __kmp_assert_valid_gtid(global_tid);
866 
867  if (!TCR_4(__kmp_init_parallel))
868  __kmp_parallel_initialize();
869 
870  __kmp_resume_if_soft_paused();
871 
872  tid = __kmp_tid_from_gtid(global_tid);
873  if (tid == filter) {
874  KMP_COUNT_BLOCK(OMP_MASKED);
875  KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
876  status = 1;
877  }
878 
879 #if OMPT_SUPPORT && OMPT_OPTIONAL
880  if (status) {
881  if (ompt_enabled.ompt_callback_masked) {
882  kmp_info_t *this_thr = __kmp_threads[global_tid];
883  kmp_team_t *team = this_thr->th.th_team;
884  ompt_callbacks.ompt_callback(ompt_callback_masked)(
885  ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
886  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
887  OMPT_GET_RETURN_ADDRESS(0));
888  }
889  }
890 #endif
891 
892  if (__kmp_env_consistency_check) {
893 #if KMP_USE_DYNAMIC_LOCK
894  if (status)
895  __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
896  else
897  __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
898 #else
899  if (status)
900  __kmp_push_sync(global_tid, ct_masked, loc, NULL);
901  else
902  __kmp_check_sync(global_tid, ct_masked, loc, NULL);
903 #endif
904  }
905 
906  return status;
907 }
908 
917 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
918  KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
919  __kmp_assert_valid_gtid(global_tid);
920  KMP_POP_PARTITIONED_TIMER();
921 
922 #if OMPT_SUPPORT && OMPT_OPTIONAL
923  kmp_info_t *this_thr = __kmp_threads[global_tid];
924  kmp_team_t *team = this_thr->th.th_team;
925  if (ompt_enabled.ompt_callback_masked) {
926  int tid = __kmp_tid_from_gtid(global_tid);
927  ompt_callbacks.ompt_callback(ompt_callback_masked)(
928  ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
929  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
930  OMPT_GET_RETURN_ADDRESS(0));
931  }
932 #endif
933 
934  if (__kmp_env_consistency_check) {
935  __kmp_pop_sync(global_tid, ct_masked, loc);
936  }
937 }
938 
946 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
947  int cid = 0;
948  kmp_info_t *th;
949  KMP_DEBUG_ASSERT(__kmp_init_serial);
950 
951  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
952  __kmp_assert_valid_gtid(gtid);
953 
954  if (!TCR_4(__kmp_init_parallel))
955  __kmp_parallel_initialize();
956 
957  __kmp_resume_if_soft_paused();
958 
959 #if USE_ITT_BUILD
960  __kmp_itt_ordered_prep(gtid);
961 // TODO: ordered_wait_id
962 #endif /* USE_ITT_BUILD */
963 
964  th = __kmp_threads[gtid];
965 
966 #if OMPT_SUPPORT && OMPT_OPTIONAL
967  kmp_team_t *team;
968  ompt_wait_id_t lck;
969  void *codeptr_ra;
970  OMPT_STORE_RETURN_ADDRESS(gtid);
971  if (ompt_enabled.enabled) {
972  team = __kmp_team_from_gtid(gtid);
973  lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
974  /* OMPT state update */
975  th->th.ompt_thread_info.wait_id = lck;
976  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
977 
978  /* OMPT event callback */
979  codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
980  if (ompt_enabled.ompt_callback_mutex_acquire) {
981  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
982  ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
983  codeptr_ra);
984  }
985  }
986 #endif
987 
988  if (th->th.th_dispatch->th_deo_fcn != 0)
989  (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
990  else
991  __kmp_parallel_deo(&gtid, &cid, loc);
992 
993 #if OMPT_SUPPORT && OMPT_OPTIONAL
994  if (ompt_enabled.enabled) {
995  /* OMPT state update */
996  th->th.ompt_thread_info.state = ompt_state_work_parallel;
997  th->th.ompt_thread_info.wait_id = 0;
998 
999  /* OMPT event callback */
1000  if (ompt_enabled.ompt_callback_mutex_acquired) {
1001  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1002  ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1003  }
1004  }
1005 #endif
1006 
1007 #if USE_ITT_BUILD
1008  __kmp_itt_ordered_start(gtid);
1009 #endif /* USE_ITT_BUILD */
1010 }
1011 
1019 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1020  int cid = 0;
1021  kmp_info_t *th;
1022 
1023  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1024  __kmp_assert_valid_gtid(gtid);
1025 
1026 #if USE_ITT_BUILD
1027  __kmp_itt_ordered_end(gtid);
1028 // TODO: ordered_wait_id
1029 #endif /* USE_ITT_BUILD */
1030 
1031  th = __kmp_threads[gtid];
1032 
1033  if (th->th.th_dispatch->th_dxo_fcn != 0)
1034  (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1035  else
1036  __kmp_parallel_dxo(&gtid, &cid, loc);
1037 
1038 #if OMPT_SUPPORT && OMPT_OPTIONAL
1039  OMPT_STORE_RETURN_ADDRESS(gtid);
1040  if (ompt_enabled.ompt_callback_mutex_released) {
1041  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1042  ompt_mutex_ordered,
1043  (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1044  ->t.t_ordered.dt.t_value,
1045  OMPT_LOAD_RETURN_ADDRESS(gtid));
1046  }
1047 #endif
1048 }
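// Editorial sketch (not part of the original source): the body of
// `#pragma omp ordered` inside an ordered loop is bracketed by these two calls.
// The wrapper name is hypothetical.
#if 0 // illustrative only
void ordered_example(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_ordered(loc, gtid);
  /* ordered region body, executed in iteration order */
  __kmpc_end_ordered(loc, gtid);
}
#endif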
1049 
1050 #if KMP_USE_DYNAMIC_LOCK
1051 
1052 static __forceinline void
1053 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1054  kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1055  // Pointer to the allocated indirect lock is written to crit, while indexing
1056  // is ignored.
1057  void *idx;
1058  kmp_indirect_lock_t **lck;
1059  lck = (kmp_indirect_lock_t **)crit;
1060  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1061  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1062  KMP_SET_I_LOCK_LOCATION(ilk, loc);
1063  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1064  KA_TRACE(20,
1065  ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1066 #if USE_ITT_BUILD
1067  __kmp_itt_critical_creating(ilk->lock, loc);
1068 #endif
1069  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1070  if (status == 0) {
1071 #if USE_ITT_BUILD
1072  __kmp_itt_critical_destroyed(ilk->lock);
1073 #endif
1074  // We don't really need to destroy the unclaimed lock here since it will be
1075  // cleaned up at program exit.
1076  // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1077  }
1078  KMP_DEBUG_ASSERT(*lck != NULL);
1079 }
1080 
1081 // Fast-path acquire tas lock
1082 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
1083  { \
1084  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
1085  kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
1086  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
1087  if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
1088  !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
1089  kmp_uint32 spins; \
1090  KMP_FSYNC_PREPARE(l); \
1091  KMP_INIT_YIELD(spins); \
1092  kmp_backoff_t backoff = __kmp_spin_backoff_params; \
1093  do { \
1094  if (TCR_4(__kmp_nth) > \
1095  (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
1096  KMP_YIELD(TRUE); \
1097  } else { \
1098  KMP_YIELD_SPIN(spins); \
1099  } \
1100  __kmp_spin_backoff(&backoff); \
1101  } while ( \
1102  KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
1103  !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
1104  } \
1105  KMP_FSYNC_ACQUIRED(l); \
1106  }
1107 
1108 // Fast-path test tas lock
1109 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
1110  { \
1111  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
1112  kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
1113  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
1114  rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
1115  __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
1116  }
1117 
1118 // Fast-path release tas lock
1119 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
1120  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1121 
1122 #if KMP_USE_FUTEX
1123 
1124 #include <sys/syscall.h>
1125 #include <unistd.h>
1126 #ifndef FUTEX_WAIT
1127 #define FUTEX_WAIT 0
1128 #endif
1129 #ifndef FUTEX_WAKE
1130 #define FUTEX_WAKE 1
1131 #endif
1132 
1133 // Fast-path acquire futex lock
1134 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
1135  { \
1136  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1137  kmp_int32 gtid_code = (gtid + 1) << 1; \
1138  KMP_MB(); \
1139  KMP_FSYNC_PREPARE(ftx); \
1140  kmp_int32 poll_val; \
1141  while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
1142  &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1143  KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
1144  kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
1145  if (!cond) { \
1146  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
1147  poll_val | \
1148  KMP_LOCK_BUSY(1, futex))) { \
1149  continue; \
1150  } \
1151  poll_val |= KMP_LOCK_BUSY(1, futex); \
1152  } \
1153  kmp_int32 rc; \
1154  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
1155  NULL, NULL, 0)) != 0) { \
1156  continue; \
1157  } \
1158  gtid_code |= 1; \
1159  } \
1160  KMP_FSYNC_ACQUIRED(ftx); \
1161  }
1162 
1163 // Fast-path test futex lock
1164 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
1165  { \
1166  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1167  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1168  KMP_LOCK_BUSY((gtid + 1) << 1, futex))) { \
1169  KMP_FSYNC_ACQUIRED(ftx); \
1170  rc = TRUE; \
1171  } else { \
1172  rc = FALSE; \
1173  } \
1174  }
1175 
1176 // Fast-path release futex lock
1177 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
1178  { \
1179  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1180  KMP_MB(); \
1181  KMP_FSYNC_RELEASING(ftx); \
1182  kmp_int32 poll_val = \
1183  KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1184  if (KMP_LOCK_STRIP(poll_val) & 1) { \
1185  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
1186  KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1187  } \
1188  KMP_MB(); \
1189  KMP_YIELD_OVERSUB(); \
1190  }
1191 
1192 #endif // KMP_USE_FUTEX
1193 
1194 #else // KMP_USE_DYNAMIC_LOCK
1195 
1196 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1197  ident_t const *loc,
1198  kmp_int32 gtid) {
1199  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1200 
1201  // Because of the double-check, the following load doesn't need to be volatile
1202  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1203 
1204  if (lck == NULL) {
1205  void *idx;
1206 
1207  // Allocate & initialize the lock.
1208  // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1209  lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1210  __kmp_init_user_lock_with_checks(lck);
1211  __kmp_set_user_lock_location(lck, loc);
1212 #if USE_ITT_BUILD
1213  __kmp_itt_critical_creating(lck);
1214 // __kmp_itt_critical_creating() should be called *before* the first usage
1215 // of the underlying lock. It is the only place where we can guarantee it.
1216 // There is a chance the lock will be destroyed with no usage, but that is not
1217 // a problem, because this is not a real event seen by the user but rather sets
1218 // the name for the object (lock). See more details in kmp_itt.h.
1219 #endif /* USE_ITT_BUILD */
1220 
1221  // Use a cmpxchg instruction to slam the start of the critical section with
1222  // the lock pointer. If another thread beat us to it, deallocate the lock,
1223  // and use the lock that the other thread allocated.
1224  int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1225 
1226  if (status == 0) {
1227 // Deallocate the lock and reload the value.
1228 #if USE_ITT_BUILD
1229  __kmp_itt_critical_destroyed(lck);
1230 // Let ITT know the lock is destroyed and the same memory location may be reused
1231 // for another purpose.
1232 #endif /* USE_ITT_BUILD */
1233  __kmp_destroy_user_lock_with_checks(lck);
1234  __kmp_user_lock_free(&idx, gtid, lck);
1235  lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1236  KMP_DEBUG_ASSERT(lck != NULL);
1237  }
1238  }
1239  return lck;
1240 }
1241 
1242 #endif // KMP_USE_DYNAMIC_LOCK
1243 
1254 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1255  kmp_critical_name *crit) {
1256 #if KMP_USE_DYNAMIC_LOCK
1257 #if OMPT_SUPPORT && OMPT_OPTIONAL
1258  OMPT_STORE_RETURN_ADDRESS(global_tid);
1259 #endif // OMPT_SUPPORT
1260  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1261 #else
1262  KMP_COUNT_BLOCK(OMP_CRITICAL);
1263 #if OMPT_SUPPORT && OMPT_OPTIONAL
1264  ompt_state_t prev_state = ompt_state_undefined;
1265  ompt_thread_info_t ti;
1266 #endif
1267  kmp_user_lock_p lck;
1268 
1269  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1270  __kmp_assert_valid_gtid(global_tid);
1271 
1272  // TODO: add THR_OVHD_STATE
1273 
1274  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1275  KMP_CHECK_USER_LOCK_INIT();
1276 
1277  if ((__kmp_user_lock_kind == lk_tas) &&
1278  (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1279  lck = (kmp_user_lock_p)crit;
1280  }
1281 #if KMP_USE_FUTEX
1282  else if ((__kmp_user_lock_kind == lk_futex) &&
1283  (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1284  lck = (kmp_user_lock_p)crit;
1285  }
1286 #endif
1287  else { // ticket, queuing or drdpa
1288  lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1289  }
1290 
1291  if (__kmp_env_consistency_check)
1292  __kmp_push_sync(global_tid, ct_critical, loc, lck);
1293 
1294  // Since the critical directive binds to all threads, not just the current
1295  // team, we have to check this even if we are in a serialized team.
1296  // Also, even if we are the uber thread, we still have to acquire the lock,
1297  // as we have to contend with sibling threads.
1298 
1299 #if USE_ITT_BUILD
1300  __kmp_itt_critical_acquiring(lck);
1301 #endif /* USE_ITT_BUILD */
1302 #if OMPT_SUPPORT && OMPT_OPTIONAL
1303  OMPT_STORE_RETURN_ADDRESS(global_tid);
1304  void *codeptr_ra = NULL;
1305  if (ompt_enabled.enabled) {
1306  ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1307  /* OMPT state update */
1308  prev_state = ti.state;
1309  ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1310  ti.state = ompt_state_wait_critical;
1311 
1312  /* OMPT event callback */
1313  codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1314  if (ompt_enabled.ompt_callback_mutex_acquire) {
1315  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1316  ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1317  (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1318  }
1319  }
1320 #endif
1321  // The value of 'crit' should be usable as a critical_id for the critical
1322  // section directive.
1323  __kmp_acquire_user_lock_with_checks(lck, global_tid);
1324 
1325 #if USE_ITT_BUILD
1326  __kmp_itt_critical_acquired(lck);
1327 #endif /* USE_ITT_BUILD */
1328 #if OMPT_SUPPORT && OMPT_OPTIONAL
1329  if (ompt_enabled.enabled) {
1330  /* OMPT state update */
1331  ti.state = prev_state;
1332  ti.wait_id = 0;
1333 
1334  /* OMPT event callback */
1335  if (ompt_enabled.ompt_callback_mutex_acquired) {
1336  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1337  ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1338  }
1339  }
1340 #endif
1341  KMP_POP_PARTITIONED_TIMER();
1342 
1343  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1344  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 }
1347 
1348 #if KMP_USE_DYNAMIC_LOCK
1349 
1350 // Converts the given hint to an internal lock implementation
1351 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1352 #if KMP_USE_TSX
1353 #define KMP_TSX_LOCK(seq) lockseq_##seq
1354 #else
1355 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1356 #endif
1357 
1358 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1359 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1360 #else
1361 #define KMP_CPUINFO_RTM 0
1362 #endif
1363 
1364  // Hints that do not require further logic
1365  if (hint & kmp_lock_hint_hle)
1366  return KMP_TSX_LOCK(hle);
1367  if (hint & kmp_lock_hint_rtm)
1368  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1369  if (hint & kmp_lock_hint_adaptive)
1370  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1371 
1372  // Rule out conflicting hints first by returning the default lock
1373  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1374  return __kmp_user_lock_seq;
1375  if ((hint & omp_lock_hint_speculative) &&
1376  (hint & omp_lock_hint_nonspeculative))
1377  return __kmp_user_lock_seq;
1378 
1379  // Do not even consider speculation when it appears to be contended
1380  if (hint & omp_lock_hint_contended)
1381  return lockseq_queuing;
1382 
1383  // Uncontended lock without speculation
1384  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1385  return lockseq_tas;
1386 
1387  // Use RTM lock for speculation
1388  if (hint & omp_lock_hint_speculative)
1389  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1390 
1391  return __kmp_user_lock_seq;
1392 }
1393 
1394 #if OMPT_SUPPORT && OMPT_OPTIONAL
1395 #if KMP_USE_DYNAMIC_LOCK
1396 static kmp_mutex_impl_t
1397 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1398  if (user_lock) {
1399  switch (KMP_EXTRACT_D_TAG(user_lock)) {
1400  case 0:
1401  break;
1402 #if KMP_USE_FUTEX
1403  case locktag_futex:
1404  return kmp_mutex_impl_queuing;
1405 #endif
1406  case locktag_tas:
1407  return kmp_mutex_impl_spin;
1408 #if KMP_USE_TSX
1409  case locktag_hle:
1410  case locktag_rtm_spin:
1411  return kmp_mutex_impl_speculative;
1412 #endif
1413  default:
1414  return kmp_mutex_impl_none;
1415  }
1416  ilock = KMP_LOOKUP_I_LOCK(user_lock);
1417  }
1418  KMP_ASSERT(ilock);
1419  switch (ilock->type) {
1420 #if KMP_USE_TSX
1421  case locktag_adaptive:
1422  case locktag_rtm_queuing:
1423  return kmp_mutex_impl_speculative;
1424 #endif
1425  case locktag_nested_tas:
1426  return kmp_mutex_impl_spin;
1427 #if KMP_USE_FUTEX
1428  case locktag_nested_futex:
1429 #endif
1430  case locktag_ticket:
1431  case locktag_queuing:
1432  case locktag_drdpa:
1433  case locktag_nested_ticket:
1434  case locktag_nested_queuing:
1435  case locktag_nested_drdpa:
1436  return kmp_mutex_impl_queuing;
1437  default:
1438  return kmp_mutex_impl_none;
1439  }
1440 }
1441 #else
1442 // For locks without dynamic binding
1443 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1444  switch (__kmp_user_lock_kind) {
1445  case lk_tas:
1446  return kmp_mutex_impl_spin;
1447 #if KMP_USE_FUTEX
1448  case lk_futex:
1449 #endif
1450  case lk_ticket:
1451  case lk_queuing:
1452  case lk_drdpa:
1453  return kmp_mutex_impl_queuing;
1454 #if KMP_USE_TSX
1455  case lk_hle:
1456  case lk_rtm_queuing:
1457  case lk_rtm_spin:
1458  case lk_adaptive:
1459  return kmp_mutex_impl_speculative;
1460 #endif
1461  default:
1462  return kmp_mutex_impl_none;
1463  }
1464 }
1465 #endif // KMP_USE_DYNAMIC_LOCK
1466 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1467 
1481 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1482  kmp_critical_name *crit, uint32_t hint) {
1483  KMP_COUNT_BLOCK(OMP_CRITICAL);
1484  kmp_user_lock_p lck;
1485 #if OMPT_SUPPORT && OMPT_OPTIONAL
1486  ompt_state_t prev_state = ompt_state_undefined;
1487  ompt_thread_info_t ti;
1488  // This is the case if called from __kmpc_critical:
1489  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1490  if (!codeptr)
1491  codeptr = OMPT_GET_RETURN_ADDRESS(0);
1492 #endif
1493 
1494  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1495  __kmp_assert_valid_gtid(global_tid);
1496 
1497  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1498  // Check if it is initialized.
1499  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1500  kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1501  if (*lk == 0) {
1502  if (KMP_IS_D_LOCK(lockseq)) {
1503  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1504  KMP_GET_D_TAG(lockseq));
1505  } else {
1506  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1507  }
1508  }
1509  // Branch for accessing the actual lock object and set operation. This
1510  // branching is inevitable since this lock initialization does not follow the
1511  // normal dispatch path (lock table is not used).
1512  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1513  lck = (kmp_user_lock_p)lk;
1514  if (__kmp_env_consistency_check) {
1515  __kmp_push_sync(global_tid, ct_critical, loc, lck,
1516  __kmp_map_hint_to_lock(hint));
1517  }
1518 #if USE_ITT_BUILD
1519  __kmp_itt_critical_acquiring(lck);
1520 #endif
1521 #if OMPT_SUPPORT && OMPT_OPTIONAL
1522  if (ompt_enabled.enabled) {
1523  ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1524  /* OMPT state update */
1525  prev_state = ti.state;
1526  ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1527  ti.state = ompt_state_wait_critical;
1528 
1529  /* OMPT event callback */
1530  if (ompt_enabled.ompt_callback_mutex_acquire) {
1531  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1532  ompt_mutex_critical, (unsigned int)hint,
1533  __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1534  codeptr);
1535  }
1536  }
1537 #endif
1538 #if KMP_USE_INLINED_TAS
1539  if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1540  KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1541  } else
1542 #elif KMP_USE_INLINED_FUTEX
1543  if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1544  KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1545  } else
1546 #endif
1547  {
1548  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1549  }
1550  } else {
1551  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1552  lck = ilk->lock;
1553  if (__kmp_env_consistency_check) {
1554  __kmp_push_sync(global_tid, ct_critical, loc, lck,
1555  __kmp_map_hint_to_lock(hint));
1556  }
1557 #if USE_ITT_BUILD
1558  __kmp_itt_critical_acquiring(lck);
1559 #endif
1560 #if OMPT_SUPPORT && OMPT_OPTIONAL
1561  if (ompt_enabled.enabled) {
1562  ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1563  /* OMPT state update */
1564  prev_state = ti.state;
1565  ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1566  ti.state = ompt_state_wait_critical;
1567 
1568  /* OMPT event callback */
1569  if (ompt_enabled.ompt_callback_mutex_acquire) {
1570  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1571  ompt_mutex_critical, (unsigned int)hint,
1572  __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1573  codeptr);
1574  }
1575  }
1576 #endif
1577  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1578  }
1579  KMP_POP_PARTITIONED_TIMER();
1580 
1581 #if USE_ITT_BUILD
1582  __kmp_itt_critical_acquired(lck);
1583 #endif /* USE_ITT_BUILD */
1584 #if OMPT_SUPPORT && OMPT_OPTIONAL
1585  if (ompt_enabled.enabled) {
1586  /* OMPT state update */
1587  ti.state = prev_state;
1588  ti.wait_id = 0;
1589 
1590  /* OMPT event callback */
1591  if (ompt_enabled.ompt_callback_mutex_acquired) {
1592  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1593  ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1594  }
1595  }
1596 #endif
1597 
1598  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1599  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1600 } // __kmpc_critical_with_hint
1601 
1602 #endif // KMP_USE_DYNAMIC_LOCK
1603 
1613 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1614  kmp_critical_name *crit) {
1615  kmp_user_lock_p lck;
1616 
1617  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1618 
1619 #if KMP_USE_DYNAMIC_LOCK
1620  int locktag = KMP_EXTRACT_D_TAG(crit);
1621  if (locktag) {
1622  lck = (kmp_user_lock_p)crit;
1623  KMP_ASSERT(lck != NULL);
1624  if (__kmp_env_consistency_check) {
1625  __kmp_pop_sync(global_tid, ct_critical, loc);
1626  }
1627 #if USE_ITT_BUILD
1628  __kmp_itt_critical_releasing(lck);
1629 #endif
1630 #if KMP_USE_INLINED_TAS
1631  if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1632  KMP_RELEASE_TAS_LOCK(lck, global_tid);
1633  } else
1634 #elif KMP_USE_INLINED_FUTEX
1635  if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1636  KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1637  } else
1638 #endif
1639  {
1640  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1641  }
1642  } else {
1643  kmp_indirect_lock_t *ilk =
1644  (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1645  KMP_ASSERT(ilk != NULL);
1646  lck = ilk->lock;
1647  if (__kmp_env_consistency_check) {
1648  __kmp_pop_sync(global_tid, ct_critical, loc);
1649  }
1650 #if USE_ITT_BUILD
1651  __kmp_itt_critical_releasing(lck);
1652 #endif
1653  KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1654  }
1655 
1656 #else // KMP_USE_DYNAMIC_LOCK
1657 
1658  if ((__kmp_user_lock_kind == lk_tas) &&
1659  (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1660  lck = (kmp_user_lock_p)crit;
1661  }
1662 #if KMP_USE_FUTEX
1663  else if ((__kmp_user_lock_kind == lk_futex) &&
1664  (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1665  lck = (kmp_user_lock_p)crit;
1666  }
1667 #endif
1668  else { // ticket, queuing or drdpa
1669  lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1670  }
1671 
1672  KMP_ASSERT(lck != NULL);
1673 
1674  if (__kmp_env_consistency_check)
1675  __kmp_pop_sync(global_tid, ct_critical, loc);
1676 
1677 #if USE_ITT_BUILD
1678  __kmp_itt_critical_releasing(lck);
1679 #endif /* USE_ITT_BUILD */
1680  // The value of 'crit' should be usable as a critical_id for the critical
1681  // section directive.
1682  __kmp_release_user_lock_with_checks(lck, global_tid);
1683 
1684 #endif // KMP_USE_DYNAMIC_LOCK
1685 
1686 #if OMPT_SUPPORT && OMPT_OPTIONAL
1687  /* OMPT release event triggers after lock is released; place here to trigger
1688  * for all #if branches */
1689  OMPT_STORE_RETURN_ADDRESS(global_tid);
1690  if (ompt_enabled.ompt_callback_mutex_released) {
1691  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1692  ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1693  OMPT_LOAD_RETURN_ADDRESS(0));
1694  }
1695 #endif
1696 
1697  KMP_POP_PARTITIONED_TIMER();
1698  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1699 }
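// Editorial sketch (not part of the original source): `#pragma omp critical(name)`
// lowers to a call pair that shares a statically allocated, zero-initialized
// kmp_critical_name, which the runtime lazily associates with a lock as shown
// above. The identifiers below are hypothetical.
#if 0 // illustrative only
static kmp_critical_name crit_name; // zero-filled; the lock is created lazily

void critical_example(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_critical(loc, gtid, &crit_name);
  /* critical region body */
  __kmpc_end_critical(loc, gtid, &crit_name);
}
#endif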
1700 
1710 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1711  int status;
1712  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1713  __kmp_assert_valid_gtid(global_tid);
1714 
1715  if (!TCR_4(__kmp_init_parallel))
1716  __kmp_parallel_initialize();
1717 
1718  __kmp_resume_if_soft_paused();
1719 
1720  if (__kmp_env_consistency_check)
1721  __kmp_check_barrier(global_tid, ct_barrier, loc);
1722 
1723 #if OMPT_SUPPORT
1724  ompt_frame_t *ompt_frame;
1725  if (ompt_enabled.enabled) {
1726  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1727  if (ompt_frame->enter_frame.ptr == NULL)
1728  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1729  }
1730  OMPT_STORE_RETURN_ADDRESS(global_tid);
1731 #endif
1732 #if USE_ITT_NOTIFY
1733  __kmp_threads[global_tid]->th.th_ident = loc;
1734 #endif
1735  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1736 #if OMPT_SUPPORT && OMPT_OPTIONAL
1737  if (ompt_enabled.enabled) {
1738  ompt_frame->enter_frame = ompt_data_none;
1739  }
1740 #endif
1741 
1742  return (status != 0) ? 0 : 1;
1743 }
1744 
1754 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1755  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1756  __kmp_assert_valid_gtid(global_tid);
1757  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1758 }
1759 
1770 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1771  kmp_int32 ret;
1772  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1773  __kmp_assert_valid_gtid(global_tid);
1774 
1775  if (!TCR_4(__kmp_init_parallel))
1776  __kmp_parallel_initialize();
1777 
1778  __kmp_resume_if_soft_paused();
1779 
1780  if (__kmp_env_consistency_check) {
1781  if (loc == 0) {
1782  KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1783  }
1784  __kmp_check_barrier(global_tid, ct_barrier, loc);
1785  }
1786 
1787 #if OMPT_SUPPORT
1788  ompt_frame_t *ompt_frame;
1789  if (ompt_enabled.enabled) {
1790  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1791  if (ompt_frame->enter_frame.ptr == NULL)
1792  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1793  }
1794  OMPT_STORE_RETURN_ADDRESS(global_tid);
1795 #endif
1796 #if USE_ITT_NOTIFY
1797  __kmp_threads[global_tid]->th.th_ident = loc;
1798 #endif
1799  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1800 #if OMPT_SUPPORT && OMPT_OPTIONAL
1801  if (ompt_enabled.enabled) {
1802  ompt_frame->enter_frame = ompt_data_none;
1803  }
1804 #endif
1805 
1806  ret = __kmpc_master(loc, global_tid);
1807 
1808  if (__kmp_env_consistency_check) {
1809  /* there is no __kmpc_end_master call here, so the (stats) */
1810  /* actions of __kmpc_end_master are done here */
1811  if (ret) {
1812  /* only one thread should do the pop since only */
1813  /* one did the push (see __kmpc_master()) */
1814  __kmp_pop_sync(global_tid, ct_master, loc);
1815  }
1816  }
1817 
1818  return (ret);
1819 }
1820 
1821 /* The BARRIER for a SINGLE process section is always explicit */
1833 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1834  __kmp_assert_valid_gtid(global_tid);
1835  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1836 
1837  if (rc) {
1838  // We are going to execute the single statement, so we should count it.
1839  KMP_COUNT_BLOCK(OMP_SINGLE);
1840  KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1841  }
1842 
1843 #if OMPT_SUPPORT && OMPT_OPTIONAL
1844  kmp_info_t *this_thr = __kmp_threads[global_tid];
1845  kmp_team_t *team = this_thr->th.th_team;
1846  int tid = __kmp_tid_from_gtid(global_tid);
1847 
1848  if (ompt_enabled.enabled) {
1849  if (rc) {
1850  if (ompt_enabled.ompt_callback_work) {
1851  ompt_callbacks.ompt_callback(ompt_callback_work)(
1852  ompt_work_single_executor, ompt_scope_begin,
1853  &(team->t.ompt_team_info.parallel_data),
1854  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1855  1, OMPT_GET_RETURN_ADDRESS(0));
1856  }
1857  } else {
1858  if (ompt_enabled.ompt_callback_work) {
1859  ompt_callbacks.ompt_callback(ompt_callback_work)(
1860  ompt_work_single_other, ompt_scope_begin,
1861  &(team->t.ompt_team_info.parallel_data),
1862  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1863  1, OMPT_GET_RETURN_ADDRESS(0));
1864  ompt_callbacks.ompt_callback(ompt_callback_work)(
1865  ompt_work_single_other, ompt_scope_end,
1866  &(team->t.ompt_team_info.parallel_data),
1867  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1868  1, OMPT_GET_RETURN_ADDRESS(0));
1869  }
1870  }
1871  }
1872 #endif
1873 
1874  return rc;
1875 }
1876 
1886 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1887  __kmp_assert_valid_gtid(global_tid);
1888  __kmp_exit_single(global_tid);
1889  KMP_POP_PARTITIONED_TIMER();
1890 
1891 #if OMPT_SUPPORT && OMPT_OPTIONAL
1892  kmp_info_t *this_thr = __kmp_threads[global_tid];
1893  kmp_team_t *team = this_thr->th.th_team;
1894  int tid = __kmp_tid_from_gtid(global_tid);
1895 
1896  if (ompt_enabled.ompt_callback_work) {
1897  ompt_callbacks.ompt_callback(ompt_callback_work)(
1898  ompt_work_single_executor, ompt_scope_end,
1899  &(team->t.ompt_team_info.parallel_data),
1900  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1901  OMPT_GET_RETURN_ADDRESS(0));
1902  }
1903 #endif
1904 }
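// Editorial sketch (not part of the original source): `#pragma omp single`
// lowers to a guarded block much like the master case; per the comment above,
// the barrier at the end of the construct is emitted explicitly. The wrapper
// name below is hypothetical.
#if 0 // illustrative only
void single_example(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  if (__kmpc_single(loc, gtid)) {
    /* single region body, executed by exactly one thread */
    __kmpc_end_single(loc, gtid);
  }
  __kmpc_barrier(loc, gtid); // omitted when a nowait clause is present
}
#endif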
1905 
1913 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1914  KMP_POP_PARTITIONED_TIMER();
1915  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1916 
1917 #if OMPT_SUPPORT && OMPT_OPTIONAL
1918  if (ompt_enabled.ompt_callback_work) {
1919  ompt_work_t ompt_work_type = ompt_work_loop;
1920  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1921  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1922  // Determine workshare type
1923  if (loc != NULL) {
1924  if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1925  ompt_work_type = ompt_work_loop;
1926  } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1927  ompt_work_type = ompt_work_sections;
1928  } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1929  ompt_work_type = ompt_work_distribute;
1930  } else {
1931  // use default set above.
1932  // a warning about this case is provided in __kmpc_for_static_init
1933  }
1934  KMP_DEBUG_ASSERT(ompt_work_type);
1935  }
1936  ompt_callbacks.ompt_callback(ompt_callback_work)(
1937  ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1938  &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1939  }
1940 #endif
1941  if (__kmp_env_consistency_check)
1942  __kmp_pop_workshare(global_tid, ct_pdo, loc);
1943 }
1944 
1945 // User routines which take C-style arguments (call by value)
1946 // different from the Fortran equivalent routines
1947 
1948 void ompc_set_num_threads(int arg) {
1949  // !!!!! TODO: check the per-task binding
1950  __kmp_set_num_threads(arg, __kmp_entry_gtid());
1951 }
1952 
1953 void ompc_set_dynamic(int flag) {
1954  kmp_info_t *thread;
1955 
1956  /* For the thread-private implementation of the internal controls */
1957  thread = __kmp_entry_thread();
1958 
1959  __kmp_save_internal_controls(thread);
1960 
1961  set__dynamic(thread, flag ? true : false);
1962 }
1963 
1964 void ompc_set_nested(int flag) {
1965  kmp_info_t *thread;
1966 
1967  /* For the thread-private internal controls implementation */
1968  thread = __kmp_entry_thread();
1969 
1970  __kmp_save_internal_controls(thread);
1971 
1972  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1973 }
1974 
1975 void ompc_set_max_active_levels(int max_active_levels) {
1976  /* TO DO */
1977  /* we want per-task implementation of this internal control */
1978 
1979  /* For the per-thread internal controls implementation */
1980  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1981 }
1982 
1983 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1984  // !!!!! TODO: check the per-task binding
1985  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1986 }
1987 
1988 int ompc_get_ancestor_thread_num(int level) {
1989  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1990 }
1991 
1992 int ompc_get_team_size(int level) {
1993  return __kmp_get_team_size(__kmp_entry_gtid(), level);
1994 }
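// A user-level sketch (illustrative only) of the nesting queries backed by
// ompc_get_ancestor_thread_num()/ompc_get_team_size(): with two active
// levels, a thread in the inner region can ask about its outer team.
/*
  #include <omp.h>
  #include <stdio.h>

  int main(void) {
    omp_set_max_active_levels(2); // allow one level of nested parallelism
  #pragma omp parallel num_threads(2)
  #pragma omp parallel num_threads(2)
    {
      int level = omp_get_level(); // 2 inside the inner region
      printf("inner tid=%d outer tid=%d outer team size=%d\n",
             omp_get_thread_num(), omp_get_ancestor_thread_num(level - 1),
             omp_get_team_size(level - 1));
    }
    return 0;
  }
*/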
1995 
1996 /* OpenMP 5.0 Affinity Format API */
1997 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
1998  if (!__kmp_init_serial) {
1999  __kmp_serial_initialize();
2000  }
2001  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2002  format, KMP_STRLEN(format) + 1);
2003 }
2004 
2005 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2006  size_t format_size;
2007  if (!__kmp_init_serial) {
2008  __kmp_serial_initialize();
2009  }
2010  format_size = KMP_STRLEN(__kmp_affinity_format);
2011  if (buffer && size) {
2012  __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2013  format_size + 1);
2014  }
2015  return format_size;
2016 }
2017 
2018 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2019  int gtid;
2020  if (!TCR_4(__kmp_init_middle)) {
2021  __kmp_middle_initialize();
2022  }
2023  __kmp_assign_root_init_mask();
2024  gtid = __kmp_get_gtid();
2025  __kmp_aux_display_affinity(gtid, format);
2026 }
2027 
2028 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2029  char const *format) {
2030  int gtid;
2031  size_t num_required;
2032  kmp_str_buf_t capture_buf;
2033  if (!TCR_4(__kmp_init_middle)) {
2034  __kmp_middle_initialize();
2035  }
2036  __kmp_assign_root_init_mask();
2037  gtid = __kmp_get_gtid();
2038  __kmp_str_buf_init(&capture_buf);
2039  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2040  if (buffer && buf_size) {
2041  __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2042  capture_buf.used + 1);
2043  }
2044  __kmp_str_buf_free(&capture_buf);
2045  return num_required;
2046 }
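// A user-level sketch (illustrative only) of the OpenMP 5.0 affinity-format
// routines served by the ompc_* entry points above; the "%H", "%n" and "%A"
// field specifiers are the spec-defined short names for host, thread_num and
// thread_affinity.
/*
  #include <omp.h>
  #include <stdio.h>

  int main(void) {
    omp_set_affinity_format("host=%H tid=%n mask=%A");
  #pragma omp parallel num_threads(2)
    {
      char buf[128];
      omp_display_affinity(NULL); // NULL: use the stored format
      size_t n = omp_capture_affinity(buf, sizeof(buf), NULL);
      if (n < sizeof(buf))
        printf("captured: %s\n", buf);
    }
    return 0;
  }
*/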
2047 
2048 void kmpc_set_stacksize(int arg) {
2049  // __kmp_aux_set_stacksize initializes the library if needed
2050  __kmp_aux_set_stacksize(arg);
2051 }
2052 
2053 void kmpc_set_stacksize_s(size_t arg) {
2054  // __kmp_aux_set_stacksize initializes the library if needed
2055  __kmp_aux_set_stacksize(arg);
2056 }
2057 
2058 void kmpc_set_blocktime(int arg) {
2059  int gtid, tid;
2060  kmp_info_t *thread;
2061 
2062  gtid = __kmp_entry_gtid();
2063  tid = __kmp_tid_from_gtid(gtid);
2064  thread = __kmp_thread_from_gtid(gtid);
2065 
2066  __kmp_aux_set_blocktime(arg, thread, tid);
2067 }
2068 
2069 void kmpc_set_library(int arg) {
2070  // __kmp_user_set_library initializes the library if needed
2071  __kmp_user_set_library((enum library_type)arg);
2072 }
2073 
2074 void kmpc_set_defaults(char const *str) {
2075  // __kmp_aux_set_defaults initializes the library if needed
2076  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2077 }
2078 
2079 void kmpc_set_disp_num_buffers(int arg) {
2080  // ignore after initialization because some teams have already
2081  // allocated dispatch buffers
2082  if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2083  arg <= KMP_MAX_DISP_NUM_BUFF) {
2084  __kmp_dispatch_num_buffers = arg;
2085  }
2086 }
2087 
2088 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2089 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2090  return -1;
2091 #else
2092  if (!TCR_4(__kmp_init_middle)) {
2093  __kmp_middle_initialize();
2094  }
2095  __kmp_assign_root_init_mask();
2096  return __kmp_aux_set_affinity_mask_proc(proc, mask);
2097 #endif
2098 }
2099 
2100 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2101 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2102  return -1;
2103 #else
2104  if (!TCR_4(__kmp_init_middle)) {
2105  __kmp_middle_initialize();
2106  }
2107  __kmp_assign_root_init_mask();
2108  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2109 #endif
2110 }
2111 
2112 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2113 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2114  return -1;
2115 #else
2116  if (!TCR_4(__kmp_init_middle)) {
2117  __kmp_middle_initialize();
2118  }
2119  __kmp_assign_root_init_mask();
2120  return __kmp_aux_get_affinity_mask_proc(proc, mask);
2121 #endif
2122 }
2123 
2124 /* -------------------------------------------------------------------------- */
2169 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2170  void *cpy_data, void (*cpy_func)(void *, void *),
2171  kmp_int32 didit) {
2172  void **data_ptr;
2173  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2174  __kmp_assert_valid_gtid(gtid);
2175 
2176  KMP_MB();
2177 
2178  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2179 
2180  if (__kmp_env_consistency_check) {
2181  if (loc == 0) {
2182  KMP_WARNING(ConstructIdentInvalid);
2183  }
2184  }
2185 
2186  // ToDo: Optimize the following two barriers into some kind of split barrier
2187 
2188  if (didit)
2189  *data_ptr = cpy_data;
2190 
2191 #if OMPT_SUPPORT
2192  ompt_frame_t *ompt_frame;
2193  if (ompt_enabled.enabled) {
2194  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2195  if (ompt_frame->enter_frame.ptr == NULL)
2196  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2197  }
2198  OMPT_STORE_RETURN_ADDRESS(gtid);
2199 #endif
2200 /* This barrier is not a barrier region boundary */
2201 #if USE_ITT_NOTIFY
2202  __kmp_threads[gtid]->th.th_ident = loc;
2203 #endif
2204  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2205 
2206  if (!didit)
2207  (*cpy_func)(cpy_data, *data_ptr);
2208 
2209  // Consider next barrier a user-visible barrier for barrier region boundaries
2210  // Nesting checks are already handled by the single construct checks
2211  {
2212 #if OMPT_SUPPORT
2213  OMPT_STORE_RETURN_ADDRESS(gtid);
2214 #endif
2215 #if USE_ITT_NOTIFY
2216  // TODO: check if loc is still needed here (tasks can overwrite the location)
2217  __kmp_threads[gtid]->th.th_ident = loc;
2218 #endif
2219  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2220 #if OMPT_SUPPORT && OMPT_OPTIONAL
2221  if (ompt_enabled.enabled) {
2222  ompt_frame->enter_frame = ompt_data_none;
2223  }
2224 #endif
2225  }
2226 }
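// A user-level sketch (illustrative only) of the single+copyprivate pattern
// that a compiler typically lowers through __kmpc_single()/__kmpc_copyprivate();
// the copyprivate list item must be private in the enclosing parallel region.
/*
  #include <omp.h>
  #include <stdio.h>

  int main(void) {
    int token;
  #pragma omp parallel private(token)
    {
  #pragma omp single copyprivate(token)
      token = 42; // one thread produces the value ...
      // ... and after the broadcast every thread's private copy sees it
      printf("thread %d: token=%d\n", omp_get_thread_num(), token);
    }
    return 0;
  }
*/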
2227 
2228 /* -------------------------------------------------------------------------- */
2229 
2230 #define INIT_LOCK __kmp_init_user_lock_with_checks
2231 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2232 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2233 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2234 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2235 #define ACQUIRE_NESTED_LOCK_TIMED \
2236  __kmp_acquire_nested_user_lock_with_checks_timed
2237 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2238 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2239 #define TEST_LOCK __kmp_test_user_lock_with_checks
2240 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2241 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2242 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2243 
2244 // TODO: Make check abort messages use location info & pass it into
2245 // with_checks routines
2246 
2247 #if KMP_USE_DYNAMIC_LOCK
2248 
2249 // internal lock initializer
2250 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2251  kmp_dyna_lockseq_t seq) {
2252  if (KMP_IS_D_LOCK(seq)) {
2253  KMP_INIT_D_LOCK(lock, seq);
2254 #if USE_ITT_BUILD
2255  __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2256 #endif
2257  } else {
2258  KMP_INIT_I_LOCK(lock, seq);
2259 #if USE_ITT_BUILD
2260  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2261  __kmp_itt_lock_creating(ilk->lock, loc);
2262 #endif
2263  }
2264 }
2265 
2266 // internal nest lock initializer
2267 static __forceinline void
2268 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2269  kmp_dyna_lockseq_t seq) {
2270 #if KMP_USE_TSX
2271  // Don't have nested lock implementation for speculative locks
2272  if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2273  seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2274  seq = __kmp_user_lock_seq;
2275 #endif
2276  switch (seq) {
2277  case lockseq_tas:
2278  seq = lockseq_nested_tas;
2279  break;
2280 #if KMP_USE_FUTEX
2281  case lockseq_futex:
2282  seq = lockseq_nested_futex;
2283  break;
2284 #endif
2285  case lockseq_ticket:
2286  seq = lockseq_nested_ticket;
2287  break;
2288  case lockseq_queuing:
2289  seq = lockseq_nested_queuing;
2290  break;
2291  case lockseq_drdpa:
2292  seq = lockseq_nested_drdpa;
2293  break;
2294  default:
2295  seq = lockseq_nested_queuing;
2296  }
2297  KMP_INIT_I_LOCK(lock, seq);
2298 #if USE_ITT_BUILD
2299  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2300  __kmp_itt_lock_creating(ilk->lock, loc);
2301 #endif
2302 }
2303 
2304 /* initialize the lock with a hint */
2305 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2306  uintptr_t hint) {
2307  KMP_DEBUG_ASSERT(__kmp_init_serial);
2308  if (__kmp_env_consistency_check && user_lock == NULL) {
2309  KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2310  }
2311 
2312  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2313 
2314 #if OMPT_SUPPORT && OMPT_OPTIONAL
2315  // This is the case, if called from omp_init_lock_with_hint:
2316  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2317  if (!codeptr)
2318  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2319  if (ompt_enabled.ompt_callback_lock_init) {
2320  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2321  ompt_mutex_lock, (omp_lock_hint_t)hint,
2322  __ompt_get_mutex_impl_type(user_lock),
2323  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2324  }
2325 #endif
2326 }
2327 
2328 /* initialize the lock with a hint */
2329 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2330  void **user_lock, uintptr_t hint) {
2331  KMP_DEBUG_ASSERT(__kmp_init_serial);
2332  if (__kmp_env_consistency_check && user_lock == NULL) {
2333  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2334  }
2335 
2336  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2337 
2338 #if OMPT_SUPPORT && OMPT_OPTIONAL
2339  // This is the case, if called from omp_init_lock_with_hint:
2340  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2341  if (!codeptr)
2342  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2343  if (ompt_enabled.ompt_callback_lock_init) {
2344  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2345  ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2346  __ompt_get_mutex_impl_type(user_lock),
2347  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2348  }
2349 #endif
2350 }
2351 
2352 #endif // KMP_USE_DYNAMIC_LOCK
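// A user-level sketch (illustrative only) of omp_init_lock_with_hint(), which
// may be routed to __kmpc_init_lock_with_hint() above; the hint is advisory
// and __kmp_map_hint_to_lock() picks the concrete lock sequence.
/*
  #include <omp.h>

  int main(void) {
    omp_lock_t l;
    int counter = 0;
    omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
  #pragma omp parallel num_threads(4)
    {
      omp_set_lock(&l);
      ++counter; // short critical section: a reasonable fit for speculation
      omp_unset_lock(&l);
    }
    omp_destroy_lock(&l);
    return 0;
  }
*/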
2353 
2354 /* initialize the lock */
2355 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2356 #if KMP_USE_DYNAMIC_LOCK
2357 
2358  KMP_DEBUG_ASSERT(__kmp_init_serial);
2359  if (__kmp_env_consistency_check && user_lock == NULL) {
2360  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2361  }
2362  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2363 
2364 #if OMPT_SUPPORT && OMPT_OPTIONAL
2365  // This is the case, if called from omp_init_lock_with_hint:
2366  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2367  if (!codeptr)
2368  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2369  if (ompt_enabled.ompt_callback_lock_init) {
2370  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2371  ompt_mutex_lock, omp_lock_hint_none,
2372  __ompt_get_mutex_impl_type(user_lock),
2373  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2374  }
2375 #endif
2376 
2377 #else // KMP_USE_DYNAMIC_LOCK
2378 
2379  static char const *const func = "omp_init_lock";
2380  kmp_user_lock_p lck;
2381  KMP_DEBUG_ASSERT(__kmp_init_serial);
2382 
2383  if (__kmp_env_consistency_check) {
2384  if (user_lock == NULL) {
2385  KMP_FATAL(LockIsUninitialized, func);
2386  }
2387  }
2388 
2389  KMP_CHECK_USER_LOCK_INIT();
2390 
2391  if ((__kmp_user_lock_kind == lk_tas) &&
2392  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2393  lck = (kmp_user_lock_p)user_lock;
2394  }
2395 #if KMP_USE_FUTEX
2396  else if ((__kmp_user_lock_kind == lk_futex) &&
2397  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2398  lck = (kmp_user_lock_p)user_lock;
2399  }
2400 #endif
2401  else {
2402  lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2403  }
2404  INIT_LOCK(lck);
2405  __kmp_set_user_lock_location(lck, loc);
2406 
2407 #if OMPT_SUPPORT && OMPT_OPTIONAL
2408  // This is the case, if called from omp_init_lock_with_hint:
2409  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2410  if (!codeptr)
2411  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2412  if (ompt_enabled.ompt_callback_lock_init) {
2413  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2414  ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2415  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2416  }
2417 #endif
2418 
2419 #if USE_ITT_BUILD
2420  __kmp_itt_lock_creating(lck);
2421 #endif /* USE_ITT_BUILD */
2422 
2423 #endif // KMP_USE_DYNAMIC_LOCK
2424 } // __kmpc_init_lock
2425 
2426 /* initialize the lock */
2427 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2428 #if KMP_USE_DYNAMIC_LOCK
2429 
2430  KMP_DEBUG_ASSERT(__kmp_init_serial);
2431  if (__kmp_env_consistency_check && user_lock == NULL) {
2432  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2433  }
2434  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2435 
2436 #if OMPT_SUPPORT && OMPT_OPTIONAL
2437  // This is the case, if called from omp_init_lock_with_hint:
2438  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2439  if (!codeptr)
2440  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2441  if (ompt_enabled.ompt_callback_lock_init) {
2442  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2443  ompt_mutex_nest_lock, omp_lock_hint_none,
2444  __ompt_get_mutex_impl_type(user_lock),
2445  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2446  }
2447 #endif
2448 
2449 #else // KMP_USE_DYNAMIC_LOCK
2450 
2451  static char const *const func = "omp_init_nest_lock";
2452  kmp_user_lock_p lck;
2453  KMP_DEBUG_ASSERT(__kmp_init_serial);
2454 
2455  if (__kmp_env_consistency_check) {
2456  if (user_lock == NULL) {
2457  KMP_FATAL(LockIsUninitialized, func);
2458  }
2459  }
2460 
2461  KMP_CHECK_USER_LOCK_INIT();
2462 
2463  if ((__kmp_user_lock_kind == lk_tas) &&
2464  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2465  OMP_NEST_LOCK_T_SIZE)) {
2466  lck = (kmp_user_lock_p)user_lock;
2467  }
2468 #if KMP_USE_FUTEX
2469  else if ((__kmp_user_lock_kind == lk_futex) &&
2470  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2471  OMP_NEST_LOCK_T_SIZE)) {
2472  lck = (kmp_user_lock_p)user_lock;
2473  }
2474 #endif
2475  else {
2476  lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2477  }
2478 
2479  INIT_NESTED_LOCK(lck);
2480  __kmp_set_user_lock_location(lck, loc);
2481 
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
2483  // This is the case, if called from omp_init_lock_with_hint:
2484  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2485  if (!codeptr)
2486  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2487  if (ompt_enabled.ompt_callback_lock_init) {
2488  ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2489  ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2490  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2491  }
2492 #endif
2493 
2494 #if USE_ITT_BUILD
2495  __kmp_itt_lock_creating(lck);
2496 #endif /* USE_ITT_BUILD */
2497 
2498 #endif // KMP_USE_DYNAMIC_LOCK
2499 } // __kmpc_init_nest_lock
2500 
2501 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2502 #if KMP_USE_DYNAMIC_LOCK
2503 
2504 #if USE_ITT_BUILD
2505  kmp_user_lock_p lck;
2506  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2507  lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2508  } else {
2509  lck = (kmp_user_lock_p)user_lock;
2510  }
2511  __kmp_itt_lock_destroyed(lck);
2512 #endif
2513 #if OMPT_SUPPORT && OMPT_OPTIONAL
2514  // This is the case, if called from omp_init_lock_with_hint:
2515  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2516  if (!codeptr)
2517  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2518  if (ompt_enabled.ompt_callback_lock_destroy) {
2519  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2520  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2521  }
2522 #endif
2523  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2524 #else
2525  kmp_user_lock_p lck;
2526 
2527  if ((__kmp_user_lock_kind == lk_tas) &&
2528  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2529  lck = (kmp_user_lock_p)user_lock;
2530  }
2531 #if KMP_USE_FUTEX
2532  else if ((__kmp_user_lock_kind == lk_futex) &&
2533  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2534  lck = (kmp_user_lock_p)user_lock;
2535  }
2536 #endif
2537  else {
2538  lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2539  }
2540 
2541 #if OMPT_SUPPORT && OMPT_OPTIONAL
2542  // This is the case, if called from omp_init_lock_with_hint:
2543  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2544  if (!codeptr)
2545  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2546  if (ompt_enabled.ompt_callback_lock_destroy) {
2547  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2548  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2549  }
2550 #endif
2551 
2552 #if USE_ITT_BUILD
2553  __kmp_itt_lock_destroyed(lck);
2554 #endif /* USE_ITT_BUILD */
2555  DESTROY_LOCK(lck);
2556 
2557  if ((__kmp_user_lock_kind == lk_tas) &&
2558  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2559  ;
2560  }
2561 #if KMP_USE_FUTEX
2562  else if ((__kmp_user_lock_kind == lk_futex) &&
2563  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2564  ;
2565  }
2566 #endif
2567  else {
2568  __kmp_user_lock_free(user_lock, gtid, lck);
2569  }
2570 #endif // KMP_USE_DYNAMIC_LOCK
2571 } // __kmpc_destroy_lock
2572 
2573 /* destroy the lock */
2574 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2575 #if KMP_USE_DYNAMIC_LOCK
2576 
2577 #if USE_ITT_BUILD
2578  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2579  __kmp_itt_lock_destroyed(ilk->lock);
2580 #endif
2581 #if OMPT_SUPPORT && OMPT_OPTIONAL
2582  // This is the case, if called from omp_init_lock_with_hint:
2583  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2584  if (!codeptr)
2585  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2586  if (ompt_enabled.ompt_callback_lock_destroy) {
2587  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2588  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2589  }
2590 #endif
2591  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2592 
2593 #else // KMP_USE_DYNAMIC_LOCK
2594 
2595  kmp_user_lock_p lck;
2596 
2597  if ((__kmp_user_lock_kind == lk_tas) &&
2598  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2599  OMP_NEST_LOCK_T_SIZE)) {
2600  lck = (kmp_user_lock_p)user_lock;
2601  }
2602 #if KMP_USE_FUTEX
2603  else if ((__kmp_user_lock_kind == lk_futex) &&
2604  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2605  OMP_NEST_LOCK_T_SIZE)) {
2606  lck = (kmp_user_lock_p)user_lock;
2607  }
2608 #endif
2609  else {
2610  lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2611  }
2612 
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
2614  // This is the case, if called from omp_init_lock_with_hint:
2615  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2616  if (!codeptr)
2617  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2618  if (ompt_enabled.ompt_callback_lock_destroy) {
2619  ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2620  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2621  }
2622 #endif
2623 
2624 #if USE_ITT_BUILD
2625  __kmp_itt_lock_destroyed(lck);
2626 #endif /* USE_ITT_BUILD */
2627 
2628  DESTROY_NESTED_LOCK(lck);
2629 
2630  if ((__kmp_user_lock_kind == lk_tas) &&
2631  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2632  OMP_NEST_LOCK_T_SIZE)) {
2633  ;
2634  }
2635 #if KMP_USE_FUTEX
2636  else if ((__kmp_user_lock_kind == lk_futex) &&
2637  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2638  OMP_NEST_LOCK_T_SIZE)) {
2639  ;
2640  }
2641 #endif
2642  else {
2643  __kmp_user_lock_free(user_lock, gtid, lck);
2644  }
2645 #endif // KMP_USE_DYNAMIC_LOCK
2646 } // __kmpc_destroy_nest_lock
2647 
2648 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2649  KMP_COUNT_BLOCK(OMP_set_lock);
2650 #if KMP_USE_DYNAMIC_LOCK
2651  int tag = KMP_EXTRACT_D_TAG(user_lock);
2652 #if USE_ITT_BUILD
2653  __kmp_itt_lock_acquiring(
2654  (kmp_user_lock_p)
2655  user_lock); // itt function will get to the right lock object.
2656 #endif
2657 #if OMPT_SUPPORT && OMPT_OPTIONAL
2658  // This is the case, if called from omp_init_lock_with_hint:
2659  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2660  if (!codeptr)
2661  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2662  if (ompt_enabled.ompt_callback_mutex_acquire) {
2663  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2664  ompt_mutex_lock, omp_lock_hint_none,
2665  __ompt_get_mutex_impl_type(user_lock),
2666  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2667  }
2668 #endif
2669 #if KMP_USE_INLINED_TAS
2670  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2671  KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2672  } else
2673 #elif KMP_USE_INLINED_FUTEX
2674  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2675  KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2676  } else
2677 #endif
2678  {
2679  __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2680  }
2681 #if USE_ITT_BUILD
2682  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2683 #endif
2684 #if OMPT_SUPPORT && OMPT_OPTIONAL
2685  if (ompt_enabled.ompt_callback_mutex_acquired) {
2686  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2687  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2688  }
2689 #endif
2690 
2691 #else // KMP_USE_DYNAMIC_LOCK
2692 
2693  kmp_user_lock_p lck;
2694 
2695  if ((__kmp_user_lock_kind == lk_tas) &&
2696  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2697  lck = (kmp_user_lock_p)user_lock;
2698  }
2699 #if KMP_USE_FUTEX
2700  else if ((__kmp_user_lock_kind == lk_futex) &&
2701  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2702  lck = (kmp_user_lock_p)user_lock;
2703  }
2704 #endif
2705  else {
2706  lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2707  }
2708 
2709 #if USE_ITT_BUILD
2710  __kmp_itt_lock_acquiring(lck);
2711 #endif /* USE_ITT_BUILD */
2712 #if OMPT_SUPPORT && OMPT_OPTIONAL
2713  // This is the case, if called from omp_init_lock_with_hint:
2714  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2715  if (!codeptr)
2716  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2717  if (ompt_enabled.ompt_callback_mutex_acquire) {
2718  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2719  ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2720  (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2721  }
2722 #endif
2723 
2724  ACQUIRE_LOCK(lck, gtid);
2725 
2726 #if USE_ITT_BUILD
2727  __kmp_itt_lock_acquired(lck);
2728 #endif /* USE_ITT_BUILD */
2729 
2730 #if OMPT_SUPPORT && OMPT_OPTIONAL
2731  if (ompt_enabled.ompt_callback_mutex_acquired) {
2732  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2733  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2734  }
2735 #endif
2736 
2737 #endif // KMP_USE_DYNAMIC_LOCK
2738 }
2739 
2740 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2741 #if KMP_USE_DYNAMIC_LOCK
2742 
2743 #if USE_ITT_BUILD
2744  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2745 #endif
2746 #if OMPT_SUPPORT && OMPT_OPTIONAL
2747  // This is the case, if called from omp_init_lock_with_hint:
2748  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2749  if (!codeptr)
2750  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2751  if (ompt_enabled.enabled) {
2752  if (ompt_enabled.ompt_callback_mutex_acquire) {
2753  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2754  ompt_mutex_nest_lock, omp_lock_hint_none,
2755  __ompt_get_mutex_impl_type(user_lock),
2756  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2757  }
2758  }
2759 #endif
2760  int acquire_status =
2761  KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2762  (void)acquire_status;
2763 #if USE_ITT_BUILD
2764  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2765 #endif
2766 
2767 #if OMPT_SUPPORT && OMPT_OPTIONAL
2768  if (ompt_enabled.enabled) {
2769  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2770  if (ompt_enabled.ompt_callback_mutex_acquired) {
2771  // lock_first
2772  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2773  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2774  codeptr);
2775  }
2776  } else {
2777  if (ompt_enabled.ompt_callback_nest_lock) {
2778  // lock_next
2779  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2780  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2781  }
2782  }
2783  }
2784 #endif
2785 
2786 #else // KMP_USE_DYNAMIC_LOCK
2787  int acquire_status;
2788  kmp_user_lock_p lck;
2789 
2790  if ((__kmp_user_lock_kind == lk_tas) &&
2791  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2792  OMP_NEST_LOCK_T_SIZE)) {
2793  lck = (kmp_user_lock_p)user_lock;
2794  }
2795 #if KMP_USE_FUTEX
2796  else if ((__kmp_user_lock_kind == lk_futex) &&
2797  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2798  OMP_NEST_LOCK_T_SIZE)) {
2799  lck = (kmp_user_lock_p)user_lock;
2800  }
2801 #endif
2802  else {
2803  lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2804  }
2805 
2806 #if USE_ITT_BUILD
2807  __kmp_itt_lock_acquiring(lck);
2808 #endif /* USE_ITT_BUILD */
2809 #if OMPT_SUPPORT && OMPT_OPTIONAL
2810  // This is the case, if called from omp_init_lock_with_hint:
2811  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2812  if (!codeptr)
2813  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2814  if (ompt_enabled.enabled) {
2815  if (ompt_enabled.ompt_callback_mutex_acquire) {
2816  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2817  ompt_mutex_nest_lock, omp_lock_hint_none,
2818  __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2819  codeptr);
2820  }
2821  }
2822 #endif
2823 
2824  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2825 
2826 #if USE_ITT_BUILD
2827  __kmp_itt_lock_acquired(lck);
2828 #endif /* USE_ITT_BUILD */
2829 
2830 #if OMPT_SUPPORT && OMPT_OPTIONAL
2831  if (ompt_enabled.enabled) {
2832  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2833  if (ompt_enabled.ompt_callback_mutex_acquired) {
2834  // lock_first
2835  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2836  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2837  }
2838  } else {
2839  if (ompt_enabled.ompt_callback_nest_lock) {
2840  // lock_next
2841  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2842  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2843  }
2844  }
2845  }
2846 #endif
2847 
2848 #endif // KMP_USE_DYNAMIC_LOCK
2849 }
2850 
2851 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2852 #if KMP_USE_DYNAMIC_LOCK
2853 
2854  int tag = KMP_EXTRACT_D_TAG(user_lock);
2855 #if USE_ITT_BUILD
2856  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2857 #endif
2858 #if KMP_USE_INLINED_TAS
2859  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2860  KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2861  } else
2862 #elif KMP_USE_INLINED_FUTEX
2863  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2864  KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2865  } else
2866 #endif
2867  {
2868  __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2869  }
2870 
2871 #if OMPT_SUPPORT && OMPT_OPTIONAL
2872  // This is the case, if called from omp_init_lock_with_hint:
2873  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2874  if (!codeptr)
2875  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2876  if (ompt_enabled.ompt_callback_mutex_released) {
2877  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2878  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2879  }
2880 #endif
2881 
2882 #else // KMP_USE_DYNAMIC_LOCK
2883 
2884  kmp_user_lock_p lck;
2885 
2886  /* Can't use serial interval since not block structured */
2887  /* release the lock */
2888 
2889  if ((__kmp_user_lock_kind == lk_tas) &&
2890  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2891 #if KMP_OS_LINUX && \
2892  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2893 // "fast" path implemented to fix customer performance issue
2894 #if USE_ITT_BUILD
2895  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2896 #endif /* USE_ITT_BUILD */
2897  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2898  KMP_MB();
2899 
2900 #if OMPT_SUPPORT && OMPT_OPTIONAL
2901  // This is the case, if called from omp_init_lock_with_hint:
2902  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2903  if (!codeptr)
2904  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2905  if (ompt_enabled.ompt_callback_mutex_released) {
2906  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2907  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2908  }
2909 #endif
2910 
2911  return;
2912 #else
2913  lck = (kmp_user_lock_p)user_lock;
2914 #endif
2915  }
2916 #if KMP_USE_FUTEX
2917  else if ((__kmp_user_lock_kind == lk_futex) &&
2918  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2919  lck = (kmp_user_lock_p)user_lock;
2920  }
2921 #endif
2922  else {
2923  lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2924  }
2925 
2926 #if USE_ITT_BUILD
2927  __kmp_itt_lock_releasing(lck);
2928 #endif /* USE_ITT_BUILD */
2929 
2930  RELEASE_LOCK(lck, gtid);
2931 
2932 #if OMPT_SUPPORT && OMPT_OPTIONAL
2933  // This is the case, if called from omp_init_lock_with_hint:
2934  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2935  if (!codeptr)
2936  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2937  if (ompt_enabled.ompt_callback_mutex_released) {
2938  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2939  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2940  }
2941 #endif
2942 
2943 #endif // KMP_USE_DYNAMIC_LOCK
2944 }
2945 
2946 /* release the lock */
2947 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2948 #if KMP_USE_DYNAMIC_LOCK
2949 
2950 #if USE_ITT_BUILD
2951  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2952 #endif
2953  int release_status =
2954  KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2955  (void)release_status;
2956 
2957 #if OMPT_SUPPORT && OMPT_OPTIONAL
2958  // This is the case, if called from omp_init_lock_with_hint:
2959  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2960  if (!codeptr)
2961  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2962  if (ompt_enabled.enabled) {
2963  if (release_status == KMP_LOCK_RELEASED) {
2964  if (ompt_enabled.ompt_callback_mutex_released) {
2965  // release_lock_last
2966  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2967  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2968  codeptr);
2969  }
2970  } else if (ompt_enabled.ompt_callback_nest_lock) {
2971  // release_lock_prev
2972  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2973  ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2974  }
2975  }
2976 #endif
2977 
2978 #else // KMP_USE_DYNAMIC_LOCK
2979 
2980  kmp_user_lock_p lck;
2981 
2982  /* Can't use serial interval since not block structured */
2983 
2984  if ((__kmp_user_lock_kind == lk_tas) &&
2985  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2986  OMP_NEST_LOCK_T_SIZE)) {
2987 #if KMP_OS_LINUX && \
2988  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2989  // "fast" path implemented to fix customer performance issue
2990  kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2991 #if USE_ITT_BUILD
2992  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2993 #endif /* USE_ITT_BUILD */
2994 
2995 #if OMPT_SUPPORT && OMPT_OPTIONAL
2996  int release_status = KMP_LOCK_STILL_HELD;
2997 #endif
2998 
2999  if (--(tl->lk.depth_locked) == 0) {
3000  TCW_4(tl->lk.poll, 0);
3001 #if OMPT_SUPPORT && OMPT_OPTIONAL
3002  release_status = KMP_LOCK_RELEASED;
3003 #endif
3004  }
3005  KMP_MB();
3006 
3007 #if OMPT_SUPPORT && OMPT_OPTIONAL
3008  // This is the case, if called from omp_init_lock_with_hint:
3009  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3010  if (!codeptr)
3011  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3012  if (ompt_enabled.enabled) {
3013  if (release_status == KMP_LOCK_RELEASED) {
3014  if (ompt_enabled.ompt_callback_mutex_released) {
3015  // release_lock_last
3016  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3017  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3018  }
3019  } else if (ompt_enabled.ompt_callback_nest_lock) {
3020  // release_lock_previous
3021  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3022  ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3023  }
3024  }
3025 #endif
3026 
3027  return;
3028 #else
3029  lck = (kmp_user_lock_p)user_lock;
3030 #endif
3031  }
3032 #if KMP_USE_FUTEX
3033  else if ((__kmp_user_lock_kind == lk_futex) &&
3034  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3035  OMP_NEST_LOCK_T_SIZE)) {
3036  lck = (kmp_user_lock_p)user_lock;
3037  }
3038 #endif
3039  else {
3040  lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3041  }
3042 
3043 #if USE_ITT_BUILD
3044  __kmp_itt_lock_releasing(lck);
3045 #endif /* USE_ITT_BUILD */
3046 
3047  int release_status;
3048  release_status = RELEASE_NESTED_LOCK(lck, gtid);
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
3050  // This is the case, if called from omp_init_lock_with_hint:
3051  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3052  if (!codeptr)
3053  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3054  if (ompt_enabled.enabled) {
3055  if (release_status == KMP_LOCK_RELEASED) {
3056  if (ompt_enabled.ompt_callback_mutex_released) {
3057  // release_lock_last
3058  ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3059  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3060  }
3061  } else if (ompt_enabled.ompt_callback_nest_lock) {
3062  // release_lock_previous
3063  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3064  ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3065  }
3066  }
3067 #endif
3068 
3069 #endif // KMP_USE_DYNAMIC_LOCK
3070 }
3071 
3072 /* try to acquire the lock */
3073 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3074  KMP_COUNT_BLOCK(OMP_test_lock);
3075 
3076 #if KMP_USE_DYNAMIC_LOCK
3077  int rc;
3078  int tag = KMP_EXTRACT_D_TAG(user_lock);
3079 #if USE_ITT_BUILD
3080  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3081 #endif
3082 #if OMPT_SUPPORT && OMPT_OPTIONAL
3083  // This is the case, if called from omp_init_lock_with_hint:
3084  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3085  if (!codeptr)
3086  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3087  if (ompt_enabled.ompt_callback_mutex_acquire) {
3088  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3089  ompt_mutex_lock, omp_lock_hint_none,
3090  __ompt_get_mutex_impl_type(user_lock),
3091  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3092  }
3093 #endif
3094 #if KMP_USE_INLINED_TAS
3095  if (tag == locktag_tas && !__kmp_env_consistency_check) {
3096  KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3097  } else
3098 #elif KMP_USE_INLINED_FUTEX
3099  if (tag == locktag_futex && !__kmp_env_consistency_check) {
3100  KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3101  } else
3102 #endif
3103  {
3104  rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3105  }
3106  if (rc) {
3107 #if USE_ITT_BUILD
3108  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3109 #endif
3110 #if OMPT_SUPPORT && OMPT_OPTIONAL
3111  if (ompt_enabled.ompt_callback_mutex_acquired) {
3112  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3113  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3114  }
3115 #endif
3116  return FTN_TRUE;
3117  } else {
3118 #if USE_ITT_BUILD
3119  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3120 #endif
3121  return FTN_FALSE;
3122  }
3123 
3124 #else // KMP_USE_DYNAMIC_LOCK
3125 
3126  kmp_user_lock_p lck;
3127  int rc;
3128 
3129  if ((__kmp_user_lock_kind == lk_tas) &&
3130  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3131  lck = (kmp_user_lock_p)user_lock;
3132  }
3133 #if KMP_USE_FUTEX
3134  else if ((__kmp_user_lock_kind == lk_futex) &&
3135  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3136  lck = (kmp_user_lock_p)user_lock;
3137  }
3138 #endif
3139  else {
3140  lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3141  }
3142 
3143 #if USE_ITT_BUILD
3144  __kmp_itt_lock_acquiring(lck);
3145 #endif /* USE_ITT_BUILD */
3146 #if OMPT_SUPPORT && OMPT_OPTIONAL
3147  // This is the case, if called from omp_init_lock_with_hint:
3148  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3149  if (!codeptr)
3150  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3151  if (ompt_enabled.ompt_callback_mutex_acquire) {
3152  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3153  ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3154  (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3155  }
3156 #endif
3157 
3158  rc = TEST_LOCK(lck, gtid);
3159 #if USE_ITT_BUILD
3160  if (rc) {
3161  __kmp_itt_lock_acquired(lck);
3162  } else {
3163  __kmp_itt_lock_cancelled(lck);
3164  }
3165 #endif /* USE_ITT_BUILD */
3166 #if OMPT_SUPPORT && OMPT_OPTIONAL
3167  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3168  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3169  ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3170  }
3171 #endif
3172 
3173  return (rc ? FTN_TRUE : FTN_FALSE);
3174 
3175  /* Can't use serial interval since not block structured */
3176 
3177 #endif // KMP_USE_DYNAMIC_LOCK
3178 }
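// A user-level sketch (illustrative only) of the non-blocking test pattern
// behind __kmpc_test_lock(): omp_test_lock() returns nonzero only when the
// lock was actually acquired, so the thread can do other work instead of
// blocking. (Whether omp_test_lock() reaches this entry point is
// compiler-dependent.)
/*
  #include <omp.h>

  int main(void) {
    omp_lock_t l;
    int counter = 0;
    omp_init_lock(&l);
  #pragma omp parallel num_threads(4)
    {
      while (!omp_test_lock(&l)) {
        // lock busy: do something useful here instead of blocking
      }
      ++counter; // this thread owns the lock
      omp_unset_lock(&l);
    }
    omp_destroy_lock(&l);
    return 0;
  }
*/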
3179 
3180 /* try to acquire the lock */
3181 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3182 #if KMP_USE_DYNAMIC_LOCK
3183  int rc;
3184 #if USE_ITT_BUILD
3185  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3186 #endif
3187 #if OMPT_SUPPORT && OMPT_OPTIONAL
3188  // This is the case, if called from omp_init_lock_with_hint:
3189  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3190  if (!codeptr)
3191  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3192  if (ompt_enabled.ompt_callback_mutex_acquire) {
3193  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3194  ompt_mutex_nest_lock, omp_lock_hint_none,
3195  __ompt_get_mutex_impl_type(user_lock),
3196  (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3197  }
3198 #endif
3199  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3200 #if USE_ITT_BUILD
3201  if (rc) {
3202  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3203  } else {
3204  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3205  }
3206 #endif
3207 #if OMPT_SUPPORT && OMPT_OPTIONAL
3208  if (ompt_enabled.enabled && rc) {
3209  if (rc == 1) {
3210  if (ompt_enabled.ompt_callback_mutex_acquired) {
3211  // lock_first
3212  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3213  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3214  codeptr);
3215  }
3216  } else {
3217  if (ompt_enabled.ompt_callback_nest_lock) {
3218  // lock_next
3219  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3220  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3221  }
3222  }
3223  }
3224 #endif
3225  return rc;
3226 
3227 #else // KMP_USE_DYNAMIC_LOCK
3228 
3229  kmp_user_lock_p lck;
3230  int rc;
3231 
3232  if ((__kmp_user_lock_kind == lk_tas) &&
3233  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3234  OMP_NEST_LOCK_T_SIZE)) {
3235  lck = (kmp_user_lock_p)user_lock;
3236  }
3237 #if KMP_USE_FUTEX
3238  else if ((__kmp_user_lock_kind == lk_futex) &&
3239  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3240  OMP_NEST_LOCK_T_SIZE)) {
3241  lck = (kmp_user_lock_p)user_lock;
3242  }
3243 #endif
3244  else {
3245  lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3246  }
3247 
3248 #if USE_ITT_BUILD
3249  __kmp_itt_lock_acquiring(lck);
3250 #endif /* USE_ITT_BUILD */
3251 
3252 #if OMPT_SUPPORT && OMPT_OPTIONAL
3253  // This is the case, if called from omp_init_lock_with_hint:
3254  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3255  if (!codeptr)
3256  codeptr = OMPT_GET_RETURN_ADDRESS(0);
3257  if (ompt_enabled.enabled &&
3258      ompt_enabled.ompt_callback_mutex_acquire) {
3259  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3260  ompt_mutex_nest_lock, omp_lock_hint_none,
3261  __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3262  codeptr);
3263  }
3264 #endif
3265 
3266  rc = TEST_NESTED_LOCK(lck, gtid);
3267 #if USE_ITT_BUILD
3268  if (rc) {
3269  __kmp_itt_lock_acquired(lck);
3270  } else {
3271  __kmp_itt_lock_cancelled(lck);
3272  }
3273 #endif /* USE_ITT_BUILD */
3274 #if OMPT_SUPPORT && OMPT_OPTIONAL
3275  if (ompt_enabled.enabled && rc) {
3276  if (rc == 1) {
3277  if (ompt_enabled.ompt_callback_mutex_acquired) {
3278  // lock_first
3279  ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3280  ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3281  }
3282  } else {
3283  if (ompt_enabled.ompt_callback_nest_lock) {
3284  // lock_next
3285  ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3286  ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3287  }
3288  }
3289  }
3290 #endif
3291  return rc;
3292 
3293  /* Can't use serial interval since not block structured */
3294 
3295 #endif // KMP_USE_DYNAMIC_LOCK
3296 }
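// A user-level sketch (illustrative only) of nestable locks, the API served
// by __kmpc_set_nest_lock()/__kmpc_test_nest_lock(): on success
// omp_test_nest_lock() returns the new nesting depth, so the owning thread
// may re-acquire the lock.
/*
  #include <omp.h>

  static omp_nest_lock_t nl;

  static void inner(void) {
    omp_set_nest_lock(&nl); // same owner: nesting depth becomes 2
    // ... protected work ...
    omp_unset_nest_lock(&nl);
  }

  int main(void) {
    omp_init_nest_lock(&nl);
  #pragma omp parallel num_threads(2)
    {
      int depth = omp_test_nest_lock(&nl); // 0 if another thread owns it
      if (depth > 0) {
        inner();
        omp_unset_nest_lock(&nl); // undo the acquisition made by the test
      }
    }
    omp_destroy_nest_lock(&nl);
    return 0;
  }
*/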
3297 
3298 // Interface to fast scalable reduce methods routines
3299 
3300 // keep the selected method in a thread local structure for cross-function
3301 // usage: will be used in __kmpc_end_reduce* functions;
3302 // another solution: to re-determine the method one more time in
3303 // __kmpc_end_reduce* functions (new prototype required then)
3304 // AT: which solution is better?
3305 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3306  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3307 
3308 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3309  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3310 
3311 // description of the packed_reduction_method variable: look at the macros in
3312 // kmp.h
3313 
3314 // used in a critical section reduce block
3315 static __forceinline void
3316 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3317  kmp_critical_name *crit) {
3318 
3319  // this lock was visible to a customer and to the threading profile tool as a
3320  // serial overhead span (although it's used for an internal purpose only)
3321  // why was it visible in previous implementation?
3322  // should we keep it visible in new reduce block?
3323  kmp_user_lock_p lck;
3324 
3325 #if KMP_USE_DYNAMIC_LOCK
3326 
3327  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3328  // Check if it is initialized.
3329  if (*lk == 0) {
3330  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3331  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3332  KMP_GET_D_TAG(__kmp_user_lock_seq));
3333  } else {
3334  __kmp_init_indirect_csptr(crit, loc, global_tid,
3335  KMP_GET_I_TAG(__kmp_user_lock_seq));
3336  }
3337  }
3338  // Branch for accessing the actual lock object and set operation. This
3339  // branching is inevitable since this lock initialization does not follow the
3340  // normal dispatch path (lock table is not used).
3341  if (KMP_EXTRACT_D_TAG(lk) != 0) {
3342  lck = (kmp_user_lock_p)lk;
3343  KMP_DEBUG_ASSERT(lck != NULL);
3344  if (__kmp_env_consistency_check) {
3345  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3346  }
3347  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3348  } else {
3349  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3350  lck = ilk->lock;
3351  KMP_DEBUG_ASSERT(lck != NULL);
3352  if (__kmp_env_consistency_check) {
3353  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3354  }
3355  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3356  }
3357 
3358 #else // KMP_USE_DYNAMIC_LOCK
3359 
3360  // We know that the fast reduction code is only emitted by Intel compilers
3361  // with 32 byte critical sections. If there isn't enough space, then we
3362  // have to use a pointer.
3363  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3364  lck = (kmp_user_lock_p)crit;
3365  } else {
3366  lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3367  }
3368  KMP_DEBUG_ASSERT(lck != NULL);
3369 
3370  if (__kmp_env_consistency_check)
3371  __kmp_push_sync(global_tid, ct_critical, loc, lck);
3372 
3373  __kmp_acquire_user_lock_with_checks(lck, global_tid);
3374 
3375 #endif // KMP_USE_DYNAMIC_LOCK
3376 }
3377 
3378 // used in a critical section reduce block
3379 static __forceinline void
3380 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3381  kmp_critical_name *crit) {
3382 
3383  kmp_user_lock_p lck;
3384 
3385 #if KMP_USE_DYNAMIC_LOCK
3386 
3387  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3388  lck = (kmp_user_lock_p)crit;
3389  if (__kmp_env_consistency_check)
3390  __kmp_pop_sync(global_tid, ct_critical, loc);
3391  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3392  } else {
3393  kmp_indirect_lock_t *ilk =
3394  (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3395  if (__kmp_env_consistency_check)
3396  __kmp_pop_sync(global_tid, ct_critical, loc);
3397  KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3398  }
3399 
3400 #else // KMP_USE_DYNAMIC_LOCK
3401 
3402  // We know that the fast reduction code is only emitted by Intel compilers
3403  // with 32 byte critical sections. If there isn't enough space, then we have
3404  // to use a pointer.
3405  if (__kmp_base_user_lock_size > 32) {
3406  lck = *((kmp_user_lock_p *)crit);
3407  KMP_ASSERT(lck != NULL);
3408  } else {
3409  lck = (kmp_user_lock_p)crit;
3410  }
3411 
3412  if (__kmp_env_consistency_check)
3413  __kmp_pop_sync(global_tid, ct_critical, loc);
3414 
3415  __kmp_release_user_lock_with_checks(lck, global_tid);
3416 
3417 #endif // KMP_USE_DYNAMIC_LOCK
3418 } // __kmp_end_critical_section_reduce_block
3419 
3420 static __forceinline int
3421 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3422  int *task_state) {
3423  kmp_team_t *team;
3424 
3425  // Check if we are inside the teams construct?
3426  if (th->th.th_teams_microtask) {
3427  *team_p = team = th->th.th_team;
3428  if (team->t.t_level == th->th.th_teams_level) {
3429  // This is reduction at teams construct.
3430  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3431  // Let's swap teams temporarily for the reduction.
3432  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3433  th->th.th_team = team->t.t_parent;
3434  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3435  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3436  *task_state = th->th.th_task_state;
3437  th->th.th_task_state = 0;
3438 
3439  return 1;
3440  }
3441  }
3442  return 0;
3443 }
3444 
3445 static __forceinline void
3446 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3447  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3448  th->th.th_info.ds.ds_tid = 0;
3449  th->th.th_team = team;
3450  th->th.th_team_nproc = team->t.t_nproc;
3451  th->th.th_task_team = team->t.t_task_team[task_state];
3452  __kmp_type_convert(task_state, &(th->th.th_task_state));
3453 }
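// A user-level sketch (illustrative only) of a host teams reduction, the kind
// of construct that may exercise the team-swapping helpers above; it assumes
// OpenMP 5.0 support for a teams region outside of target.
/*
  #include <omp.h>
  #include <stdio.h>

  int main(void) {
    int total = 0;
  #pragma omp teams num_teams(4) reduction(+ : total)
    total += omp_get_team_num() + 1; // each team's initial thread contributes
    printf("total=%d\n", total);     // 10 if exactly 4 teams are created
    return 0;
  }
*/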
3454 
3455 /* 2.a.i. Reduce Block without a terminating barrier */
3471 kmp_int32
3472 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3473  size_t reduce_size, void *reduce_data,
3474  void (*reduce_func)(void *lhs_data, void *rhs_data),
3475  kmp_critical_name *lck) {
3476 
3477  KMP_COUNT_BLOCK(REDUCE_nowait);
3478  int retval = 0;
3479  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3480  kmp_info_t *th;
3481  kmp_team_t *team;
3482  int teams_swapped = 0, task_state;
3483  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3484  __kmp_assert_valid_gtid(global_tid);
3485 
3486  // why do we need this initialization here at all?
3487  // Reduction clause can not be used as a stand-alone directive.
3488 
3489  // do not call __kmp_serial_initialize(), it will be called by
3490  // __kmp_parallel_initialize() if needed
3491  // possible detection of false-positive race by the threadchecker ???
3492  if (!TCR_4(__kmp_init_parallel))
3493  __kmp_parallel_initialize();
3494 
3495  __kmp_resume_if_soft_paused();
3496 
3497 // check correctness of reduce block nesting
3498 #if KMP_USE_DYNAMIC_LOCK
3499  if (__kmp_env_consistency_check)
3500  __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3501 #else
3502  if (__kmp_env_consistency_check)
3503  __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3504 #endif
3505 
3506  th = __kmp_thread_from_gtid(global_tid);
3507  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3508 
3509  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3510  // the value should be kept in a variable
3511  // the variable should be either a construct-specific or thread-specific
3512  // property, not a team specific property
3513  // (a thread can reach the next reduce block on the next construct, reduce
3514  // method may differ on the next construct)
3515  // an ident_t "loc" parameter could be used as a construct-specific property
3516  // (what if loc == 0?)
3517  // (if both construct-specific and team-specific variables were shared,
3518  // then unnecessary extra syncs would be needed)
3519  // a thread-specific variable is better regarding two issues above (next
3520  // construct and extra syncs)
3521  // a thread-specific "th_local.reduction_method" variable is used currently
3522  // each thread executes the 'determine' and 'set' lines (no need to execute
3523  // them by a single thread, which would only add unnecessary extra syncs)
3524 
3525  packed_reduction_method = __kmp_determine_reduction_method(
3526  loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3527  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3528 
3529  OMPT_REDUCTION_DECL(th, global_tid);
3530  if (packed_reduction_method == critical_reduce_block) {
3531 
3532  OMPT_REDUCTION_BEGIN;
3533 
3534  __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3535  retval = 1;
3536 
3537  } else if (packed_reduction_method == empty_reduce_block) {
3538 
3539  OMPT_REDUCTION_BEGIN;
3540 
3541  // usage: if team size == 1, no synchronization is required ( Intel
3542  // platforms only )
3543  retval = 1;
3544 
3545  } else if (packed_reduction_method == atomic_reduce_block) {
3546 
3547  retval = 2;
3548 
3549  // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3550  // won't be called by the code gen)
3551  // (it's not quite good, because the checking block has been closed by
3552  // this 'pop',
3553  // but atomic operation has not been executed yet, will be executed
3554  // slightly later, literally on next instruction)
3555  if (__kmp_env_consistency_check)
3556  __kmp_pop_sync(global_tid, ct_reduce, loc);
3557 
3558  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3559  tree_reduce_block)) {
3560 
3561 // AT: performance issue: a real barrier here
3562 // AT: (if primary thread is slow, other threads are blocked here waiting for
3563 // the primary thread to come and release them)
3564 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3565 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3566 // be confusing to a customer)
3567 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3568 // might go faster and be more in line with sense of NOWAIT
3569 // AT: TO DO: do epcc test and compare times
3570 
3571 // this barrier should be invisible to a customer and to the threading profile
3572 // tool (it's neither a terminating barrier nor customer's code, it's
3573 // used for an internal purpose)
3574 #if OMPT_SUPPORT
3575  // JP: can this barrier potentially lead to task scheduling?
3576  // JP: as long as there is a barrier in the implementation, OMPT should and
3577  // will provide the barrier events
3578  // so we set-up the necessary frame/return addresses.
3579  ompt_frame_t *ompt_frame;
3580  if (ompt_enabled.enabled) {
3581  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3582  if (ompt_frame->enter_frame.ptr == NULL)
3583  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3584  }
3585  OMPT_STORE_RETURN_ADDRESS(global_tid);
3586 #endif
3587 #if USE_ITT_NOTIFY
3588  __kmp_threads[global_tid]->th.th_ident = loc;
3589 #endif
3590  retval =
3591  __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3592  global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3593  retval = (retval != 0) ? (0) : (1);
3594 #if OMPT_SUPPORT && OMPT_OPTIONAL
3595  if (ompt_enabled.enabled) {
3596  ompt_frame->enter_frame = ompt_data_none;
3597  }
3598 #endif
3599 
3600  // all other workers except primary thread should do this pop here
3601  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
3602  if (__kmp_env_consistency_check) {
3603  if (retval == 0) {
3604  __kmp_pop_sync(global_tid, ct_reduce, loc);
3605  }
3606  }
3607 
3608  } else {
3609 
3610  // should never reach this block
3611  KMP_ASSERT(0); // "unexpected method"
3612  }
3613  if (teams_swapped) {
3614  __kmp_restore_swapped_teams(th, team, task_state);
3615  }
3616  KA_TRACE(
3617  10,
3618  ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3619  global_tid, packed_reduction_method, retval));
3620 
3621  return retval;
3622 }
3623 
3632 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3633  kmp_critical_name *lck) {
3634 
3635  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3636 
3637  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3638  __kmp_assert_valid_gtid(global_tid);
3639 
3640  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3641 
3642  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3643 
3644  if (packed_reduction_method == critical_reduce_block) {
3645 
3646  __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3647  OMPT_REDUCTION_END;
3648 
3649  } else if (packed_reduction_method == empty_reduce_block) {
3650 
3651  // usage: if team size == 1, no synchronization is required ( on Intel
3652  // platforms only )
3653 
3654  OMPT_REDUCTION_END;
3655 
3656  } else if (packed_reduction_method == atomic_reduce_block) {
3657 
3658  // neither primary thread nor other workers should get here
3659  // (code gen does not generate this call in case 2: atomic reduce block)
3660  // actually it's better to remove this elseif at all;
3661  // after removal this value will be checked by the 'else' and will assert
3662 
3663  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3664  tree_reduce_block)) {
3665 
3666  // only primary thread gets here
3667  // OMPT: tree reduction is annotated in the barrier code
3668 
3669  } else {
3670 
3671  // should never reach this block
3672  KMP_ASSERT(0); // "unexpected method"
3673  }
3674 
3675  if (__kmp_env_consistency_check)
3676  __kmp_pop_sync(global_tid, ct_reduce, loc);
3677 
3678  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3679  global_tid, packed_reduction_method));
3680 
3681  return;
3682 }
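 
// Editor's illustrative sketch (not part of kmp_csupport.cpp): roughly how a
// compiler could lower the reduction epilogue of
//   #pragma omp parallel for reduction(+ : sum) nowait
// onto the entry points above. The helper names (example_red_func,
// example_reduce_nowait_epilogue, my_sum, lck) are hypothetical; only the
// __kmpc_* signatures and the 0/1/2 return-value contract come from this file.
static void example_red_func(void *lhs, void *rhs) {
  *(double *)lhs += *(double *)rhs; // combine two partial sums
}
static void example_reduce_nowait_epilogue(ident_t *loc, kmp_int32 gtid,
                                           double *sum, double my_sum,
                                           kmp_critical_name *lck) {
  switch (__kmpc_reduce_nowait(loc, gtid, /*num_vars=*/1, sizeof(double),
                               &my_sum, example_red_func, lck)) {
  case 1: // this thread holds the combined value (or the critical section)
    *sum += my_sum;
    __kmpc_end_reduce_nowait(loc, gtid, lck);
    break;
  case 2: // atomic path: each thread adds its partial sum atomically;
          // no __kmpc_end_reduce_nowait() call is generated in this case
    break;
  default: // 0: nothing to do on this thread
    break;
  }
}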
3683 
3684 /* 2.a.ii. Reduce Block with a terminating barrier */
3685 
3701 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3702  size_t reduce_size, void *reduce_data,
3703  void (*reduce_func)(void *lhs_data, void *rhs_data),
3704  kmp_critical_name *lck) {
3705  KMP_COUNT_BLOCK(REDUCE_wait);
3706  int retval = 0;
3707  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3708  kmp_info_t *th;
3709  kmp_team_t *team;
3710  int teams_swapped = 0, task_state;
3711 
3712  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3713  __kmp_assert_valid_gtid(global_tid);
3714 
3715  // why do we need this initialization here at all?
3716  // The reduction clause cannot be a stand-alone directive.
3717 
3718  // do not call __kmp_serial_initialize(); it will be called by
3719  // __kmp_parallel_initialize() if needed
3720  // possible false-positive race detection by the thread checker ???
3721  if (!TCR_4(__kmp_init_parallel))
3722  __kmp_parallel_initialize();
3723 
3724  __kmp_resume_if_soft_paused();
3725 
3726 // check correctness of reduce block nesting
3727 #if KMP_USE_DYNAMIC_LOCK
3728  if (__kmp_env_consistency_check)
3729  __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3730 #else
3731  if (__kmp_env_consistency_check)
3732  __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3733 #endif
3734 
3735  th = __kmp_thread_from_gtid(global_tid);
3736  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3737 
3738  packed_reduction_method = __kmp_determine_reduction_method(
3739  loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3740  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3741 
3742  OMPT_REDUCTION_DECL(th, global_tid);
3743 
3744  if (packed_reduction_method == critical_reduce_block) {
3745 
3746  OMPT_REDUCTION_BEGIN;
3747  __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3748  retval = 1;
3749 
3750  } else if (packed_reduction_method == empty_reduce_block) {
3751 
3752  OMPT_REDUCTION_BEGIN;
3753  // used when team size == 1: no synchronization is required (on Intel
3754  // platforms only)
3755  retval = 1;
3756 
3757  } else if (packed_reduction_method == atomic_reduce_block) {
3758 
3759  retval = 2;
3760 
3761  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3762  tree_reduce_block)) {
3763 
3764 // case tree_reduce_block:
3765 // this barrier should be visible to a customer and to the threading profile
3766 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3767 #if OMPT_SUPPORT
3768  ompt_frame_t *ompt_frame;
3769  if (ompt_enabled.enabled) {
3770  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3771  if (ompt_frame->enter_frame.ptr == NULL)
3772  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3773  }
3774  OMPT_STORE_RETURN_ADDRESS(global_tid);
3775 #endif
3776 #if USE_ITT_NOTIFY
3777  __kmp_threads[global_tid]->th.th_ident =
3778  loc; // needed for correct notification of frames
3779 #endif
3780  retval =
3781  __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3782  global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3783  retval = (retval != 0) ? (0) : (1);
3784 #if OMPT_SUPPORT && OMPT_OPTIONAL
3785  if (ompt_enabled.enabled) {
3786  ompt_frame->enter_frame = ompt_data_none;
3787  }
3788 #endif
3789 
3790  // all workers except the primary thread should do this pop here
3791  // (only the primary thread will enter __kmpc_end_reduce())
3792  if (__kmp_env_consistency_check) {
3793  if (retval == 0) { // 0: all other workers; 1: primary thread
3794  __kmp_pop_sync(global_tid, ct_reduce, loc);
3795  }
3796  }
3797 
3798  } else {
3799 
3800  // should never reach this block
3801  KMP_ASSERT(0); // "unexpected method"
3802  }
3803  if (teams_swapped) {
3804  __kmp_restore_swapped_teams(th, team, task_state);
3805  }
3806 
3807  KA_TRACE(10,
3808  ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3809  global_tid, packed_reduction_method, retval));
3810  return retval;
3811 }
3812 
3823 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3824  kmp_critical_name *lck) {
3825 
3826  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3827  kmp_info_t *th;
3828  kmp_team_t *team;
3829  int teams_swapped = 0, task_state;
3830 
3831  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3832  __kmp_assert_valid_gtid(global_tid);
3833 
3834  th = __kmp_thread_from_gtid(global_tid);
3835  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3836 
3837  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3838 
3839  // this barrier should be visible to a customer and to the threading profile
3840  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3841  OMPT_REDUCTION_DECL(th, global_tid);
3842 
3843  if (packed_reduction_method == critical_reduce_block) {
3844  __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3845 
3846  OMPT_REDUCTION_END;
3847 
3848 // TODO: implicit barrier: should be exposed
3849 #if OMPT_SUPPORT
3850  ompt_frame_t *ompt_frame;
3851  if (ompt_enabled.enabled) {
3852  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3853  if (ompt_frame->enter_frame.ptr == NULL)
3854  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3855  }
3856  OMPT_STORE_RETURN_ADDRESS(global_tid);
3857 #endif
3858 #if USE_ITT_NOTIFY
3859  __kmp_threads[global_tid]->th.th_ident = loc;
3860 #endif
3861  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3862 #if OMPT_SUPPORT && OMPT_OPTIONAL
3863  if (ompt_enabled.enabled) {
3864  ompt_frame->enter_frame = ompt_data_none;
3865  }
3866 #endif
3867 
3868  } else if (packed_reduction_method == empty_reduce_block) {
3869 
3870  OMPT_REDUCTION_END;
3871 
3872 // used when team size == 1: no synchronization is required (Intel platforms only)
3873 
3874 // TODO: implicit barrier: should be exposed
3875 #if OMPT_SUPPORT
3876  ompt_frame_t *ompt_frame;
3877  if (ompt_enabled.enabled) {
3878  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3879  if (ompt_frame->enter_frame.ptr == NULL)
3880  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3881  }
3882  OMPT_STORE_RETURN_ADDRESS(global_tid);
3883 #endif
3884 #if USE_ITT_NOTIFY
3885  __kmp_threads[global_tid]->th.th_ident = loc;
3886 #endif
3887  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3888 #if OMPT_SUPPORT && OMPT_OPTIONAL
3889  if (ompt_enabled.enabled) {
3890  ompt_frame->enter_frame = ompt_data_none;
3891  }
3892 #endif
3893 
3894  } else if (packed_reduction_method == atomic_reduce_block) {
3895 
3896 #if OMPT_SUPPORT
3897  ompt_frame_t *ompt_frame;
3898  if (ompt_enabled.enabled) {
3899  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3900  if (ompt_frame->enter_frame.ptr == NULL)
3901  ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3902  }
3903  OMPT_STORE_RETURN_ADDRESS(global_tid);
3904 #endif
3905 // TODO: implicit barrier: should be exposed
3906 #if USE_ITT_NOTIFY
3907  __kmp_threads[global_tid]->th.th_ident = loc;
3908 #endif
3909  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3910 #if OMPT_SUPPORT && OMPT_OPTIONAL
3911  if (ompt_enabled.enabled) {
3912  ompt_frame->enter_frame = ompt_data_none;
3913  }
3914 #endif
3915 
3916  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3917  tree_reduce_block)) {
3918 
3919  // only primary thread executes here (primary releases all other workers)
3920  __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3921  global_tid);
3922 
3923  } else {
3924 
3925  // should never reach this block
3926  KMP_ASSERT(0); // "unexpected method"
3927  }
3928  if (teams_swapped) {
3929  __kmp_restore_swapped_teams(th, team, task_state);
3930  }
3931 
3932  if (__kmp_env_consistency_check)
3933  __kmp_pop_sync(global_tid, ct_reduce, loc);
3934 
3935  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3936  global_tid, packed_reduction_method));
3937 
3938  return;
3939 }
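 
// Editor's illustrative sketch (not part of kmp_csupport.cpp): the blocking
// counterpart of the nowait epilogue shown after __kmpc_end_reduce_nowait(),
// reusing example_red_func from that sketch. The assumption here is that
// __kmpc_end_reduce() is also called on the atomic path (return value 2) so
// that the terminating barrier above is executed; all names other than the
// __kmpc_* entry points are hypothetical.
static void example_reduce_epilogue(ident_t *loc, kmp_int32 gtid, double *sum,
                                    double my_sum, kmp_critical_name *lck) {
  int ret = __kmpc_reduce(loc, gtid, /*num_vars=*/1, sizeof(double), &my_sum,
                          example_red_func, lck);
  if (ret == 1) { // combined value (tree) or serialized update (critical)
    *sum += my_sum;
    __kmpc_end_reduce(loc, gtid, lck);
  } else if (ret == 2) {
    // the compiler would emit an atomic add of my_sum into *sum here, then:
    __kmpc_end_reduce(loc, gtid, lck);
  } // ret == 0: the other workers do nothing here
}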
3940 
3941 #undef __KMP_GET_REDUCTION_METHOD
3942 #undef __KMP_SET_REDUCTION_METHOD
3943 
3944 /* end of interface to fast scalable reduce routines */
3945 
3946 kmp_uint64 __kmpc_get_taskid() {
3947 
3948  kmp_int32 gtid;
3949  kmp_info_t *thread;
3950 
3951  gtid = __kmp_get_gtid();
3952  if (gtid < 0) {
3953  return 0;
3954  }
3955  thread = __kmp_thread_from_gtid(gtid);
3956  return thread->th.th_current_task->td_task_id;
3957 
3958 } // __kmpc_get_taskid
3959 
3960 kmp_uint64 __kmpc_get_parent_taskid() {
3961 
3962  kmp_int32 gtid;
3963  kmp_info_t *thread;
3964  kmp_taskdata_t *parent_task;
3965 
3966  gtid = __kmp_get_gtid();
3967  if (gtid < 0) {
3968  return 0;
3969  }
3970  thread = __kmp_thread_from_gtid(gtid);
3971  parent_task = thread->th.th_current_task->td_parent;
3972  return (parent_task == NULL ? 0 : parent_task->td_task_id);
3973 
3974 } // __kmpc_get_parent_taskid
3975 
3987 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3988  const struct kmp_dim *dims) {
3989  __kmp_assert_valid_gtid(gtid);
3990  int j, idx;
3991  kmp_int64 last, trace_count;
3992  kmp_info_t *th = __kmp_threads[gtid];
3993  kmp_team_t *team = th->th.th_team;
3994  kmp_uint32 *flags;
3995  kmp_disp_t *pr_buf = th->th.th_dispatch;
3996  dispatch_shared_info_t *sh_buf;
3997 
3998  KA_TRACE(
3999  20,
4000  ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4001  gtid, num_dims, !team->t.t_serialized));
4002  KMP_DEBUG_ASSERT(dims != NULL);
4003  KMP_DEBUG_ASSERT(num_dims > 0);
4004 
4005  if (team->t.t_serialized) {
4006  KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4007  return; // no dependencies if team is serialized
4008  }
4009  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4010  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4011  // the next loop
4012  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4013 
4014  // Save bounds info into allocated private buffer
4015  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4016  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4017  th, sizeof(kmp_int64) * (4 * num_dims + 1));
4018  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4019  pr_buf->th_doacross_info[0] =
4020  (kmp_int64)num_dims; // first element is number of dimensions
4021  // Save also address of num_done in order to access it later without knowing
4022  // the buffer index
4023  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4024  pr_buf->th_doacross_info[2] = dims[0].lo;
4025  pr_buf->th_doacross_info[3] = dims[0].up;
4026  pr_buf->th_doacross_info[4] = dims[0].st;
4027  last = 5;
4028  for (j = 1; j < num_dims; ++j) {
4029  kmp_int64
4030  range_length; // To keep ranges of all dimensions but the first dims[0]
4031  if (dims[j].st == 1) { // most common case
4032  // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4033  range_length = dims[j].up - dims[j].lo + 1;
4034  } else {
4035  if (dims[j].st > 0) {
4036  KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4037  range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4038  } else { // negative increment
4039  KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4040  range_length =
4041  (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4042  }
4043  }
4044  pr_buf->th_doacross_info[last++] = range_length;
4045  pr_buf->th_doacross_info[last++] = dims[j].lo;
4046  pr_buf->th_doacross_info[last++] = dims[j].up;
4047  pr_buf->th_doacross_info[last++] = dims[j].st;
4048  }
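  // Resulting layout of th_doacross_info (kmp_int64 elements):
  //   [0] num_dims, [1] address of sh_buf->doacross_num_done,
  //   [2..4] lo/up/st of dimension 0, and for each dimension j >= 1:
  //   [4*j+1] range length, [4*j+2..4*j+4] lo/up/st.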
4049 
4050  // Compute total trip count.
4051  // Start with range of dims[0] which we don't need to keep in the buffer.
4052  if (dims[0].st == 1) { // most common case
4053  trace_count = dims[0].up - dims[0].lo + 1;
4054  } else if (dims[0].st > 0) {
4055  KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4056  trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4057  } else { // negative increment
4058  KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4059  trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4060  }
4061  for (j = 1; j < num_dims; ++j) {
4062  trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4063  }
4064  KMP_DEBUG_ASSERT(trace_count > 0);
4065 
4066  // Check whether the shared buffer is still occupied by a previous loop
4067  // (then sh_buf->doacross_buf_idx == idx - __kmp_dispatch_num_buffers)
4068  if (idx != sh_buf->doacross_buf_idx) {
4069  // Shared buffer is occupied, wait for it to be free
4070  __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4071  __kmp_eq_4, NULL);
4072  }
4073 #if KMP_32_BIT_ARCH
4074  // Check if we are the first thread. After the CAS the first thread gets 0,
4075  // others get 1 if initialization is in progress, allocated pointer otherwise.
4076  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4077  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4078  (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4079 #else
4080  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4081  (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4082 #endif
4083  if (flags == NULL) {
4084  // we are the first thread, allocate the array of flags
4085  size_t size =
4086  (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4087  flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4088  KMP_MB();
4089  sh_buf->doacross_flags = flags;
4090  } else if (flags == (kmp_uint32 *)1) {
4091 #if KMP_32_BIT_ARCH
4092  // initialization is still in progress, need to wait
4093  while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4094 #else
4095  while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4096 #endif
4097  KMP_YIELD(TRUE);
4098  KMP_MB();
4099  } else {
4100  KMP_MB();
4101  }
4102  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4103  pr_buf->th_doacross_flags =
4104  sh_buf->doacross_flags; // save private copy in order to not
4105  // touch shared buffer on each iteration
4106  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4107 }
4108 
4109 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4110  __kmp_assert_valid_gtid(gtid);
4111  kmp_int64 shft;
4112  size_t num_dims, i;
4113  kmp_uint32 flag;
4114  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4115  kmp_info_t *th = __kmp_threads[gtid];
4116  kmp_team_t *team = th->th.th_team;
4117  kmp_disp_t *pr_buf;
4118  kmp_int64 lo, up, st;
4119 
4120  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4121  if (team->t.t_serialized) {
4122  KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4123  return; // no dependencies if team is serialized
4124  }
4125 
4126  // calculate sequential iteration number and check out-of-bounds condition
4127  pr_buf = th->th.th_dispatch;
4128  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4129  num_dims = (size_t)pr_buf->th_doacross_info[0];
4130  lo = pr_buf->th_doacross_info[2];
4131  up = pr_buf->th_doacross_info[3];
4132  st = pr_buf->th_doacross_info[4];
4133 #if OMPT_SUPPORT && OMPT_OPTIONAL
4134  ompt_dependence_t deps[num_dims];
4135 #endif
4136  if (st == 1) { // most common case
4137  if (vec[0] < lo || vec[0] > up) {
4138  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4139  "bounds [%lld,%lld]\n",
4140  gtid, vec[0], lo, up));
4141  return;
4142  }
4143  iter_number = vec[0] - lo;
4144  } else if (st > 0) {
4145  if (vec[0] < lo || vec[0] > up) {
4146  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4147  "bounds [%lld,%lld]\n",
4148  gtid, vec[0], lo, up));
4149  return;
4150  }
4151  iter_number = (kmp_uint64)(vec[0] - lo) / st;
4152  } else { // negative increment
4153  if (vec[0] > lo || vec[0] < up) {
4154  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4155  "bounds [%lld,%lld]\n",
4156  gtid, vec[0], lo, up));
4157  return;
4158  }
4159  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4160  }
4161 #if OMPT_SUPPORT && OMPT_OPTIONAL
4162  deps[0].variable.value = iter_number;
4163  deps[0].dependence_type = ompt_dependence_type_sink;
4164 #endif
4165  for (i = 1; i < num_dims; ++i) {
4166  kmp_int64 iter, ln;
4167  size_t j = i * 4;
4168  ln = pr_buf->th_doacross_info[j + 1];
4169  lo = pr_buf->th_doacross_info[j + 2];
4170  up = pr_buf->th_doacross_info[j + 3];
4171  st = pr_buf->th_doacross_info[j + 4];
4172  if (st == 1) {
4173  if (vec[i] < lo || vec[i] > up) {
4174  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4175  "bounds [%lld,%lld]\n",
4176  gtid, vec[i], lo, up));
4177  return;
4178  }
4179  iter = vec[i] - lo;
4180  } else if (st > 0) {
4181  if (vec[i] < lo || vec[i] > up) {
4182  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4183  "bounds [%lld,%lld]\n",
4184  gtid, vec[i], lo, up));
4185  return;
4186  }
4187  iter = (kmp_uint64)(vec[i] - lo) / st;
4188  } else { // st < 0
4189  if (vec[i] > lo || vec[i] < up) {
4190  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4191  "bounds [%lld,%lld]\n",
4192  gtid, vec[i], lo, up));
4193  return;
4194  }
4195  iter = (kmp_uint64)(lo - vec[i]) / (-st);
4196  }
4197  iter_number = iter + ln * iter_number;
4198 #if OMPT_SUPPORT && OMPT_OPTIONAL
4199  deps[i].variable.value = iter;
4200  deps[i].dependence_type = ompt_dependence_type_sink;
4201 #endif
4202  }
4203  shft = iter_number % 32; // use 32-bit granularity
4204  iter_number >>= 5; // divided by 32
4205  flag = 1 << shft;
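  // e.g. collapsed iteration 70 maps to flags word 70 >> 5 == 2 and bit
  // 70 % 32 == 6, so this thread spins below until the posting thread has
  // set that bit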
4206  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4207  KMP_YIELD(TRUE);
4208  }
4209  KMP_MB();
4210 #if OMPT_SUPPORT && OMPT_OPTIONAL
4211  if (ompt_enabled.ompt_callback_dependences) {
4212  ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4213  &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4214  }
4215 #endif
4216  KA_TRACE(20,
4217  ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4218  gtid, (iter_number << 5) + shft));
4219 }
4220 
4221 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4222  __kmp_assert_valid_gtid(gtid);
4223  kmp_int64 shft;
4224  size_t num_dims, i;
4225  kmp_uint32 flag;
4226  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4227  kmp_info_t *th = __kmp_threads[gtid];
4228  kmp_team_t *team = th->th.th_team;
4229  kmp_disp_t *pr_buf;
4230  kmp_int64 lo, st;
4231 
4232  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4233  if (team->t.t_serialized) {
4234  KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4235  return; // no dependencies if team is serialized
4236  }
4237 
4238  // calculate sequential iteration number (same as in "wait" but no
4239  // out-of-bounds checks)
4240  pr_buf = th->th.th_dispatch;
4241  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4242  num_dims = (size_t)pr_buf->th_doacross_info[0];
4243  lo = pr_buf->th_doacross_info[2];
4244  st = pr_buf->th_doacross_info[4];
4245 #if OMPT_SUPPORT && OMPT_OPTIONAL
4246  ompt_dependence_t deps[num_dims];
4247 #endif
4248  if (st == 1) { // most common case
4249  iter_number = vec[0] - lo;
4250  } else if (st > 0) {
4251  iter_number = (kmp_uint64)(vec[0] - lo) / st;
4252  } else { // negative increment
4253  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4254  }
4255 #if OMPT_SUPPORT && OMPT_OPTIONAL
4256  deps[0].variable.value = iter_number;
4257  deps[0].dependence_type = ompt_dependence_type_source;
4258 #endif
4259  for (i = 1; i < num_dims; ++i) {
4260  kmp_int64 iter, ln;
4261  size_t j = i * 4;
4262  ln = pr_buf->th_doacross_info[j + 1];
4263  lo = pr_buf->th_doacross_info[j + 2];
4264  st = pr_buf->th_doacross_info[j + 4];
4265  if (st == 1) {
4266  iter = vec[i] - lo;
4267  } else if (st > 0) {
4268  iter = (kmp_uint64)(vec[i] - lo) / st;
4269  } else { // st < 0
4270  iter = (kmp_uint64)(lo - vec[i]) / (-st);
4271  }
4272  iter_number = iter + ln * iter_number;
4273 #if OMPT_SUPPORT && OMPT_OPTIONAL
4274  deps[i].variable.value = iter;
4275  deps[i].dependence_type = ompt_dependence_type_source;
4276 #endif
4277  }
4278 #if OMPT_SUPPORT && OMPT_OPTIONAL
4279  if (ompt_enabled.ompt_callback_dependences) {
4280  ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4281  &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4282  }
4283 #endif
4284  shft = iter_number % 32; // use 32-bit granularity
4285  iter_number >>= 5; // divided by 32
4286  flag = 1 << shft;
4287  KMP_MB();
4288  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4289  KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4290  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4291  (iter_number << 5) + shft));
4292 }
4293 
4294 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4295  __kmp_assert_valid_gtid(gtid);
4296  kmp_int32 num_done;
4297  kmp_info_t *th = __kmp_threads[gtid];
4298  kmp_team_t *team = th->th.th_team;
4299  kmp_disp_t *pr_buf = th->th.th_dispatch;
4300 
4301  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4302  if (team->t.t_serialized) {
4303  KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4304  return; // nothing to do
4305  }
4306  num_done =
4307  KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4308  if (num_done == th->th.th_team_nproc) {
4309  // we are the last thread, need to free shared resources
4310  int idx = pr_buf->th_doacross_buf_idx - 1;
4311  dispatch_shared_info_t *sh_buf =
4312  &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4313  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4314  (kmp_int64)&sh_buf->doacross_num_done);
4315  KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4316  KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4317  __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4318  sh_buf->doacross_flags = NULL;
4319  sh_buf->doacross_num_done = 0;
4320  sh_buf->doacross_buf_idx +=
4321  __kmp_dispatch_num_buffers; // free buffer for future re-use
4322  }
4323  // free private resources (need to keep buffer index forever)
4324  pr_buf->th_doacross_flags = NULL;
4325  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4326  pr_buf->th_doacross_info = NULL;
4327  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4328 }
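 
// Editor's illustrative sketch (not part of kmp_csupport.cpp): roughly how a
// compiler could lower a one-dimensional doacross loop such as
//   #pragma omp for ordered(1)
//   for (i = 0; i < n; ++i) {
//     #pragma omp ordered depend(sink : i - 1)
//     ... use data produced by iteration i - 1 ...
//     #pragma omp ordered depend(source)
//   }
// onto the init/wait/post/fini entry points above. Chunk assignment via the
// usual worksharing machinery is omitted; lb/ub stand for this thread's chunk
// and the helper name example_doacross_1d is hypothetical.
static void example_doacross_1d(ident_t *loc, kmp_int32 gtid, kmp_int64 lb,
                                kmp_int64 ub, kmp_int64 n) {
  struct kmp_dim dim; // bounds of the whole (uncollapsed) loop
  dim.lo = 0;
  dim.up = n - 1;
  dim.st = 1;
  __kmpc_doacross_init(loc, gtid, /*num_dims=*/1, &dim);
  for (kmp_int64 i = lb; i <= ub; ++i) {
    kmp_int64 sink = i - 1;
    __kmpc_doacross_wait(loc, gtid, &sink); // no-op if i - 1 is out of bounds
    // ... loop body that consumes data produced by iteration i - 1 ...
    kmp_int64 src = i;
    __kmpc_doacross_post(loc, gtid, &src); // publish completion of iteration i
  }
  __kmpc_doacross_fini(loc, gtid);
}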
4329 
4330 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4331 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4332  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4333 }
4334 
4335 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4336  return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4337 }
4338 
4339 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4340  omp_allocator_handle_t free_allocator) {
4341  return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4342  free_allocator);
4343 }
4344 
4345 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4346  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4347 }
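 
// Editor's illustrative sketch (not part of kmp_csupport.cpp): plain user-side
// usage of the thin wrappers above. The predefined handles
// omp_default_mem_allocator and omp_high_bw_mem_allocator come from <omp.h>;
// the function name example_alloc_usage is hypothetical.
static void example_alloc_usage(void) {
  double *a = (double *)omp_alloc(1024 * sizeof(double),
                                  omp_default_mem_allocator);
  double *b = (double *)omp_calloc(1024, sizeof(double),
                                   omp_high_bw_mem_allocator);
  a = (double *)omp_realloc(a, 2048 * sizeof(double),
                            omp_default_mem_allocator,  // allocator for new block
                            omp_default_mem_allocator); // allocator that owns a
  omp_free(a, omp_default_mem_allocator);
  omp_free(b, omp_high_bw_mem_allocator);
}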
4348 
4349 int __kmpc_get_target_offload(void) {
4350  if (!__kmp_init_serial) {
4351  __kmp_serial_initialize();
4352  }
4353  return __kmp_target_offload;
4354 }
4355 
4356 int __kmpc_pause_resource(kmp_pause_status_t level) {
4357  if (!__kmp_init_serial) {
4358  return 1; // Can't pause if runtime is not initialized
4359  }
4360  return __kmp_pause_resource(level);
4361 }
4362 
4363 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4364  if (!__kmp_init_serial)
4365  __kmp_serial_initialize();
4366 
4367  KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4368 
4369 #if OMPT_SUPPORT
4370  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4371  ompt_callbacks.ompt_callback(ompt_callback_error)(
4372  (ompt_severity_t)severity, message, KMP_STRLEN(message),
4373  OMPT_GET_RETURN_ADDRESS(0));
4374  }
4375 #endif // OMPT_SUPPORT
4376 
4377  char *src_loc;
4378  if (loc && loc->psource) {
4379  kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4380  src_loc =
4381  __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4382  __kmp_str_loc_free(&str_loc);
4383  } else {
4384  src_loc = __kmp_str_format("unknown");
4385  }
4386 
4387  if (severity == severity_warning)
4388  KMP_WARNING(UserDirectedWarning, src_loc, message);
4389  else
4390  KMP_FATAL(UserDirectedError, src_loc, message);
4391 
4392  __kmp_str_free(&src_loc);
4393 }
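 
// Editor's illustrative sketch (not part of kmp_csupport.cpp): a front end
// would typically lower an execution-time error directive such as
//   #pragma omp error at(execution) severity(warning) message("check input")
// into a call of this shape; example_emit_omp_error is a hypothetical name and
// severity_warning/severity_fatal are the values asserted on above.
static void example_emit_omp_error(ident_t *loc) {
  __kmpc_error(loc, severity_warning, "check input");
}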
4394 
4395 // Mark begin of scope directive.
4396 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4397 // reserved is for extension of scope directive and not used.
4398 #if OMPT_SUPPORT && OMPT_OPTIONAL
4399  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4400  kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4401  int tid = __kmp_tid_from_gtid(gtid);
4402  ompt_callbacks.ompt_callback(ompt_callback_work)(
4403  ompt_work_scope, ompt_scope_begin,
4404  &(team->t.ompt_team_info.parallel_data),
4405  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4406  OMPT_GET_RETURN_ADDRESS(0));
4407  }
4408 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4409 }
4410 
4411 // Mark end of scope directive
4412 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4413 // reserved is for extension of scope directive and not used.
4414 #if OMPT_SUPPORT && OMPT_OPTIONAL
4415  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4416  kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4417  int tid = __kmp_tid_from_gtid(gtid);
4418  ompt_callbacks.ompt_callback(ompt_callback_work)(
4419  ompt_work_scope, ompt_scope_end,
4420  &(team->t.ompt_team_info.parallel_data),
4421  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4422  OMPT_GET_RETURN_ADDRESS(0));
4423  }
4424 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4425 }
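 
// Editor's illustrative sketch (not part of kmp_csupport.cpp): a front end
// that wants the OMPT scope-begin/scope-end work events could bracket a
//   #pragma omp scope
// construct as below. Whether a trailing __kmpc_barrier() is emitted when the
// nowait clause is absent is an assumption here, not taken from this file;
// example_scope is a hypothetical name.
static void example_scope(ident_t *loc, kmp_int32 gtid) {
  __kmpc_scope(loc, gtid, /*reserved=*/NULL);
  // ... structured block of the scope construct ...
  __kmpc_end_scope(loc, gtid, /*reserved=*/NULL);
  __kmpc_barrier(loc, gtid); // omitted when nowait is specified
}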
4426 
4427 #ifdef KMP_USE_VERSION_SYMBOLS
4428 // For GOMP compatibility there are two versions of each omp_* API.
4429 // One is the plain C symbol and one is the Fortran symbol with an appended
4430 // underscore. When we implement a specific ompc_* version of an omp_*
4431 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4432 // instead of the Fortran versions in kmp_ftn_entry.h
4433 extern "C" {
4434 // Have to undef these from omp.h so they aren't translated into
4435 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4436 #ifdef omp_set_affinity_format
4437 #undef omp_set_affinity_format
4438 #endif
4439 #ifdef omp_get_affinity_format
4440 #undef omp_get_affinity_format
4441 #endif
4442 #ifdef omp_display_affinity
4443 #undef omp_display_affinity
4444 #endif
4445 #ifdef omp_capture_affinity
4446 #undef omp_capture_affinity
4447 #endif
4448 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4449  "OMP_5.0");
4450 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4451  "OMP_5.0");
4452 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4453  "OMP_5.0");
4454 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4455  "OMP_5.0");
4456 } // extern "C"
4457 #endif