LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files since it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, this was made to return KMP_GTID_DNE to force serial_initialize
132  by the caller. KMP_GTID_DNE then had to be handled at all call-sites, or else
133  __kmp_init_gtid guaranteed, for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* get specific to try and determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
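/* Illustrative sketch (not part of the runtime): the internal algorithm above
   recovers a gtid by testing whether a local address lies inside a registered
   thread's stack window [stackbase - stacksize, stackbase], since stacks grow
   downward. All names below are hypothetical.

     static bool example_addr_in_stack(const char *addr, const char *stack_base,
                                       size_t stack_size) {
       // stack grows down, so the live region is (stack_base - stack_size, stack_base]
       return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
     }
*/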
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
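/* Illustrative sketch (assumptions, not runtime code) of the
   check-then-lock-then-recheck pattern used above for lazy root registration.
   The names below are hypothetical stand-ins for __kmp_gtid_get_specific,
   __kmp_initz_lock, __kmp_init_serial, __kmp_do_serial_initialize and
   __kmp_register_root.

     static int example_lazy_register(void) {
       int id = example_fast_gtid();             // hypothetical fast lookup
       if (id != EXAMPLE_ID_NONE)
         return id;                              // already registered
       example_lock(&example_init_lock);         // serialize initialization
       if (!example_initialized)                 // re-check under the lock
         id = example_serial_initialize();       // first caller boots the library
       else
         id = example_register_root();           // otherwise just add a new root
       example_unlock(&example_init_lock);
       return id;
     }
*/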
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
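/* Illustrative sketch (not runtime code): the overlap test above is a plain
   interval intersection check on stack ranges. Names are hypothetical.

     static bool example_stacks_overlap(const char *beg_a, const char *end_a,
                                        const char *beg_b, const char *end_b) {
       // true if either endpoint of stack A lies strictly inside stack B
       return (beg_a > beg_b && beg_a < end_b) || (end_a > beg_b && end_a < end_b);
     }
*/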
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
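/* Illustrative sketch (not runtime code) of the page-rounding arithmetic used
   in the data-placement branch above; it assumes page_size is a power of two,
   so clearing the low-order bits rounds an address down to its page boundary.

     static void *example_page_floor(void *p, size_t page_size) {
       return (void *)((size_t)p & ~(page_size - 1)); // round down to page start
     }
*/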
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for a DLL, but it is a problem for
445  a static OpenMP RTL). SetErrorMode (and so the timelimit utility) does not
446  help, at least in some versions of the MS C RTL.
447 
448  The following sequence seems to be the only way to simulate abort() and
449  avoid the pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort just call abort(), it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new siblings all the time, call
592  * __kmp_get_gtid() here */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
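/* Illustrative sketch (not runtime code): together, __kmp_parallel_deo and
   __kmp_parallel_dxo implement a ticket-style hand-off for the ordered
   construct. Each thread waits until the shared counter equals its tid, does
   its ordered work, then stores (tid + 1) % nproc to release the next thread.
   Names below are hypothetical; the runtime uses KMP_WAIT rather than a raw
   spin.

     static void example_ordered(volatile int *turn, int tid, int nproc) {
       while (*turn != tid) {  // wait for our turn in tid order
       }
       // ... ordered work for this iteration ...
       *turn = (tid + 1) % nproc; // release the next thread on the team
     }
*/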
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
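/* Illustrative sketch (not runtime code): the SINGLE winner selection in
   __kmp_enter_single is a compare-and-swap race. Every thread advances its
   private construct counter; the one that successfully advances the shared
   team counter executes the block. Sketch using standard atomics instead of
   the runtime's KMP primitives:

     #include <atomic>
     static bool example_enter_single(std::atomic<int> &team_count,
                                      int &my_count) {
       int expected = my_count++; // value the team counter should still hold
       // only the first thread to arrive swaps the counter forward and "wins"
       return team_count.compare_exchange_strong(expected, my_count);
     }
*/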
721 
722 /* determine if we can go parallel or must use a serialized parallel region and
723  * how many threads we can use
724  * set_nthreads is the number of threads requested for the team
725  * returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
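/* Illustrative sketch (assumptions, not runtime code): conceptually,
   __kmp_reserve_threads clamps the requested team size against a series of
   independent ceilings (dynamic adjustment, device thread limit, contention
   group limit, thread-array capacity), warning once if dyn-var is false. The
   clamping step looks roughly like this, with hypothetical names:

     static int example_clamp_request(int requested, int in_use, int limit) {
       int available = limit - in_use; // head-room left under this limit
       if (available < 1)
         available = 1;                // always allow at least the master
       return requested < available ? requested : available;
     }
*/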
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked on that
915  earlier within the forkjoin critical section */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's setup the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014  }
1015 
1016  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017  for (i = 0; i < team->t.t_nproc; i++) {
1018  kmp_info_t *thr = team->t.t_threads[i];
1019  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020  thr->th.th_prev_level != team->t.t_level) {
1021  team->t.t_display_affinity = 1;
1022  break;
1023  }
1024  }
1025  }
1026 
1027  KMP_MB();
1028 }
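/* Illustrative sketch (not runtime code): when a worker is installed into a
   team above, its per-barrier arrived counters are copied from the team so the
   newcomer starts in the same barrier generation as the rest of the team.
   Names below are hypothetical.

     static void example_align_barriers(kmp_uint64 *thr_arrived,
                                        const kmp_uint64 *team_arrived,
                                        int nbarriers) {
       for (int b = 0; b < nbarriers; ++b)
         thr_arrived[b] = team_arrived[b]; // adopt the team's current generation
     }
*/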
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035  if (__kmp_inherit_fp_control) {
1036  kmp_int16 x87_fpu_control_word;
1037  kmp_uint32 mxcsr;
1038 
1039  // Get primary thread's values of FPU control flags (both X87 and vector)
1040  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041  __kmp_store_mxcsr(&mxcsr);
1042  mxcsr &= KMP_X86_MXCSR_MASK;
1043 
1044  // There is no point looking at t_fp_control_saved here.
1045  // If it is TRUE, we still have to update the values if they are different
1046  // from those we now have. If it is FALSE we didn't save anything yet, but
1047  // our objective is the same. We have to ensure that the values in the team
1048  // are the same as those we have.
1049  // So, this code achieves what we need whether or not t_fp_control_saved is
1050  // true. By checking whether the value needs updating we avoid unnecessary
1051  // writes that would put the cache-line into a written state, causing all
1052  // threads in the team to have to read it again.
1053  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055  // Although we don't use this value, other code in the runtime wants to know
1056  // whether it should restore them. So we must ensure it is correct.
1057  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058  } else {
1059  // Similarly here. Don't write to this cache-line in the team structure
1060  // unless we have to.
1061  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062  }
1063 }
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069  // Only reset the fp control regs if they have been changed in the team during
1070  // the parallel region that we are exiting.
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074  __kmp_store_mxcsr(&mxcsr);
1075  mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078  __kmp_clear_x87_fpu_status_word();
1079  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080  }
1081 
1082  if (team->t.t_mxcsr != mxcsr) {
1083  __kmp_load_mxcsr(&team->t.t_mxcsr);
1084  }
1085  }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
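/* Illustrative sketch (not runtime code): KMP_CHECK_UPDATE above exists to
   avoid dirtying a shared cache line when the stored value is already correct.
   The same idiom with a hypothetical template helper:

     template <typename T> static void example_check_update(T &dst, T val) {
       if (dst != val) // write only when needed so readers keep a clean line
         dst = val;
     }
*/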
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093  int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so runs only in a team of the
1096  single primary thread. */
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098  kmp_info_t *this_thr;
1099  kmp_team_t *serial_team;
1100 
1101  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103  /* Skip all this code for autopar serialized loops since it results in
1104  unacceptable overhead */
1105  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106  return;
1107 
1108  if (!TCR_4(__kmp_init_parallel))
1109  __kmp_parallel_initialize();
1110  __kmp_resume_if_soft_paused();
1111 
1112  this_thr = __kmp_threads[global_tid];
1113  serial_team = this_thr->th.th_serial_team;
1114 
1115  /* utilize the serialized team held by this thread */
1116  KMP_DEBUG_ASSERT(serial_team);
1117  KMP_MB();
1118 
1119  if (__kmp_tasking_mode != tskm_immediate_exec) {
1120  KMP_DEBUG_ASSERT(
1121  this_thr->th.th_task_team ==
1122  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124  NULL);
1125  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126  "team %p, new task_team = NULL\n",
1127  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128  this_thr->th.th_task_team = NULL;
1129  }
1130 
1131  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133  proc_bind = proc_bind_false;
1134  } else if (proc_bind == proc_bind_default) {
1135  // No proc_bind clause was specified, so use the current value
1136  // of proc-bind-var for this parallel region.
1137  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138  }
1139  // Reset for next parallel region
1140  this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143  ompt_data_t ompt_parallel_data = ompt_data_none;
1144  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145  if (ompt_enabled.enabled &&
1146  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148  ompt_task_info_t *parent_task_info;
1149  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152  if (ompt_enabled.ompt_callback_parallel_begin) {
1153  int team_size = 1;
1154 
1155  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156  &(parent_task_info->task_data), &(parent_task_info->frame),
1157  &ompt_parallel_data, team_size,
1158  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159  }
1160  }
1161 #endif // OMPT_SUPPORT
1162 
1163  if (this_thr->th.th_team != serial_team) {
1164  // Nested level will be an index in the nested nthreads array
1165  int level = this_thr->th.th_team->t.t_level;
1166 
1167  if (serial_team->t.t_serialized) {
1168  /* this serial team was already used
1169  TODO: increase performance by making these locks more specific */
1170  kmp_team_t *new_team;
1171 
1172  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174  new_team =
1175  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177  ompt_parallel_data,
1178 #endif
1179  proc_bind, &this_thr->th.th_current_task->td_icvs,
1180  0 USE_NESTED_HOT_ARG(NULL));
1181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182  KMP_ASSERT(new_team);
1183 
1184  /* setup new serialized team and install it */
1185  new_team->t.t_threads[0] = this_thr;
1186  new_team->t.t_parent = this_thr->th.th_team;
1187  serial_team = new_team;
1188  this_thr->th.th_serial_team = serial_team;
1189 
1190  KF_TRACE(
1191  10,
1192  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193  global_tid, serial_team));
1194 
1195  /* TODO the above breaks the requirement that if we run out of resources,
1196  then we can still guarantee that serialized teams are ok, since we may
1197  need to allocate a new one */
1198  } else {
1199  KF_TRACE(
1200  10,
1201  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202  global_tid, serial_team));
1203  }
1204 
1205  /* we have to initialize this serial team */
1206  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209  serial_team->t.t_ident = loc;
1210  serial_team->t.t_serialized = 1;
1211  serial_team->t.t_nproc = 1;
1212  serial_team->t.t_parent = this_thr->th.th_team;
1213  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214  this_thr->th.th_team = serial_team;
1215  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218  this_thr->th.th_current_task));
1219  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220  this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225  implicit task for each serialized task represented by
1226  team->t.t_serialized? */
1227  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228  &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230  // Thread value exists in the nested nthreads array for the next nested
1231  // level
1232  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233  this_thr->th.th_current_task->td_icvs.nproc =
1234  __kmp_nested_nth.nth[level + 1];
1235  }
1236 
1237  if (__kmp_nested_proc_bind.used &&
1238  (level + 1 < __kmp_nested_proc_bind.used)) {
1239  this_thr->th.th_current_task->td_icvs.proc_bind =
1240  __kmp_nested_proc_bind.bind_types[level + 1];
1241  }
1242 
1243 #if USE_DEBUGGER
1244  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246  this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248  /* set thread cache values */
1249  this_thr->th.th_team_nproc = 1;
1250  this_thr->th.th_team_master = this_thr;
1251  this_thr->th.th_team_serialized = 1;
1252 
1253  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257  propagateFPControl(serial_team);
1258 
1259  /* check if we need to allocate dispatch buffers stack */
1260  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262  serial_team->t.t_dispatch->th_disp_buffer =
1263  (dispatch_private_info_t *)__kmp_allocate(
1264  sizeof(dispatch_private_info_t));
1265  }
1266  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268  KMP_MB();
1269 
1270  } else {
1271  /* this serialized team is already being used,
1272  * that's fine, just add another nested level */
1273  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276  ++serial_team->t.t_serialized;
1277  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279  // Nested level will be an index in the nested nthreads array
1280  int level = this_thr->th.th_team->t.t_level;
1281  // Thread value exists in the nested nthreads array for the next nested
1282  // level
1283  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284  this_thr->th.th_current_task->td_icvs.nproc =
1285  __kmp_nested_nth.nth[level + 1];
1286  }
1287  serial_team->t.t_level++;
1288  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289  "of serial team %p to %d\n",
1290  global_tid, serial_team, serial_team->t.t_level));
1291 
1292  /* allocate/push dispatch buffers stack */
1293  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294  {
1295  dispatch_private_info_t *disp_buffer =
1296  (dispatch_private_info_t *)__kmp_allocate(
1297  sizeof(dispatch_private_info_t));
1298  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300  }
1301  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303  KMP_MB();
1304  }
1305  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307  // Perform the display affinity functionality for
1308  // serialized parallel regions
1309  if (__kmp_display_affinity) {
1310  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311  this_thr->th.th_prev_num_threads != 1) {
1312  // NULL means use the affinity-format-var ICV
1313  __kmp_aux_display_affinity(global_tid, NULL);
1314  this_thr->th.th_prev_level = serial_team->t.t_level;
1315  this_thr->th.th_prev_num_threads = 1;
1316  }
1317  }
1318 
1319  if (__kmp_env_consistency_check)
1320  __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322  serial_team->t.ompt_team_info.master_return_address = codeptr;
1323  if (ompt_enabled.enabled &&
1324  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326  OMPT_GET_FRAME_ADDRESS(0);
1327 
1328  ompt_lw_taskteam_t lw_taskteam;
1329  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330  &ompt_parallel_data, codeptr);
1331 
1332  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333  // don't use lw_taskteam after linking; its content was swapped
1334 
1335  /* OMPT implicit task begin */
1336  if (ompt_enabled.ompt_callback_implicit_task) {
1337  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342  __kmp_tid_from_gtid(global_tid);
1343  }
1344 
1345  /* OMPT state */
1346  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348  OMPT_GET_FRAME_ADDRESS(0);
1349  }
1350 #endif
1351 }
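/* Illustrative sketch (not runtime code): each additional serialized nesting
   level above pushes a fresh dispatch_private_info_t onto a singly linked
   stack hanging off the serial team's dispatch structure, and the matching
   end-of-serialized-parallel path pops it again. The push, with hypothetical
   types:

     struct example_buf { struct example_buf *next; };
     static void example_push(struct example_buf **top, struct example_buf *b) {
       b->next = *top; // new buffer points at the previous top of the stack
       *top = b;       // and becomes the new top
     }
*/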
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356  enum fork_context_e call_context, // Intel, GNU, ...
1357  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358  kmp_va_list ap) {
1359  void **argv;
1360  int i;
1361  int master_tid;
1362  int master_this_cons;
1363  kmp_team_t *team;
1364  kmp_team_t *parent_team;
1365  kmp_info_t *master_th;
1366  kmp_root_t *root;
1367  int nthreads;
1368  int master_active;
1369  int master_set_numthreads;
1370  int level;
1371  int active_level;
1372  int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374  kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376  { // KMP_TIME_BLOCK
1377  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382  /* Some systems prefer the stack for the root thread(s) to start with */
1383  /* some gap from the parent stack to prevent false sharing. */
1384  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385  /* These 2 lines below are so this does not get optimized out */
1386  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387  __kmp_stkpadding += (short)((kmp_int64)dummy);
1388  }
1389 
1390  /* initialize if needed */
1391  KMP_DEBUG_ASSERT(
1392  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393  if (!TCR_4(__kmp_init_parallel))
1394  __kmp_parallel_initialize();
1395  __kmp_resume_if_soft_paused();
1396 
1397  /* setup current data */
1398  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399  // shutdown
1400  parent_team = master_th->th.th_team;
1401  master_tid = master_th->th.th_info.ds.ds_tid;
1402  master_this_cons = master_th->th.th_local.this_construct;
1403  root = master_th->th.th_root;
1404  master_active = root->r.r_active;
1405  master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408  ompt_data_t ompt_parallel_data = ompt_data_none;
1409  ompt_data_t *parent_task_data;
1410  ompt_frame_t *ompt_frame;
1411  ompt_data_t *implicit_task_data;
1412  void *return_address = NULL;
1413 
1414  if (ompt_enabled.enabled) {
1415  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416  NULL, NULL);
1417  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418  }
1419 #endif
1420 
1421  // Assign affinity to root thread if it hasn't happened yet
1422  __kmp_assign_root_init_mask();
1423 
1424  // Nested level will be an index in the nested nthreads array
1425  level = parent_team->t.t_level;
1426  // used to launch non-serial teams even if nested is not allowed
1427  active_level = parent_team->t.t_active_level;
1428  // needed to check nesting inside the teams
1429  teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431  p_hot_teams = &master_th->th.th_hot_teams;
1432  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436  // it is either actual or not needed (when active_level > 0)
1437  (*p_hot_teams)[0].hot_team_nth = 1;
1438  }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442  if (ompt_enabled.enabled) {
1443  if (ompt_enabled.ompt_callback_parallel_begin) {
1444  int team_size = master_set_numthreads
1445  ? master_set_numthreads
1446  : get__nproc_2(parent_team, master_tid);
1447  int flags = OMPT_INVOKER(call_context) |
1448  ((microtask == (microtask_t)__kmp_teams_master)
1449  ? ompt_parallel_league
1450  : ompt_parallel_team);
1451  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453  return_address);
1454  }
1455  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456  }
1457 #endif
1458 
1459  master_th->th.th_ident = loc;
1460 
1461  if (master_th->th.th_teams_microtask && ap &&
1462  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463  // AC: This is start of parallel that is nested inside teams construct.
1464  // The team is actual (hot), all workers are ready at the fork barrier.
1465  // No lock needed to initialize the team a bit, then free workers.
1466  parent_team->t.t_ident = loc;
1467  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468  parent_team->t.t_argc = argc;
1469  argv = (void **)parent_team->t.t_argv;
1470  for (i = argc - 1; i >= 0; --i)
1471  *argv++ = va_arg(kmp_va_deref(ap), void *);
1472  // Increment our nested depth level, but do not increase the serialization
1473  if (parent_team == master_th->th.th_serial_team) {
1474  // AC: we are in serialized parallel
1475  __kmpc_serialized_parallel(loc, gtid);
1476  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478  if (call_context == fork_context_gnu) {
1479  // AC: need to decrement t_serialized for enquiry functions to work
1480  // correctly, will restore at join time
1481  parent_team->t.t_serialized--;
1482  return TRUE;
1483  }
1484 
1485 #if OMPD_SUPPORT
1486  parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490  void *dummy;
1491  void **exit_frame_p;
1492 
1493  ompt_lw_taskteam_t lw_taskteam;
1494 
1495  if (ompt_enabled.enabled) {
1496  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497  &ompt_parallel_data, return_address);
1498  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501  // don't use lw_taskteam after linking. content was swaped
1502 
1503  /* OMPT implicit task begin */
1504  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505  if (ompt_enabled.ompt_callback_implicit_task) {
1506  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507  __kmp_tid_from_gtid(gtid);
1508  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510  implicit_task_data, 1,
1511  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512  }
1513 
1514  /* OMPT state */
1515  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516  } else {
1517  exit_frame_p = &dummy;
1518  }
1519 #endif
1520  // AC: need to decrement t_serialized for enquiry functions to work
1521  // correctly, will restore at join time
1522  parent_team->t.t_serialized--;
1523 
1524  {
1525  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529  ,
1530  exit_frame_p
1531 #endif
1532  );
1533  }
1534 
1535 #if OMPT_SUPPORT
1536  if (ompt_enabled.enabled) {
1537  *exit_frame_p = NULL;
1538  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539  if (ompt_enabled.ompt_callback_implicit_task) {
1540  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541  ompt_scope_end, NULL, implicit_task_data, 1,
1542  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543  }
1544  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545  __ompt_lw_taskteam_unlink(master_th);
1546  if (ompt_enabled.ompt_callback_parallel_end) {
1547  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549  OMPT_INVOKER(call_context) | ompt_parallel_team,
1550  return_address);
1551  }
1552  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553  }
1554 #endif
1555  return TRUE;
1556  }
1557 
1558  parent_team->t.t_pkfn = microtask;
1559  parent_team->t.t_invoke = invoker;
1560  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561  parent_team->t.t_active_level++;
1562  parent_team->t.t_level++;
1563  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566  if (ompt_enabled.enabled) {
1567  ompt_lw_taskteam_t lw_taskteam;
1568  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569  &ompt_parallel_data, return_address);
1570  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571  }
1572 #endif
1573 
1574  /* Change number of threads in the team if requested */
1575  if (master_set_numthreads) { // The parallel has num_threads clause
1576  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577  // AC: we can only reduce the number of threads dynamically, not increase it
1578  kmp_info_t **other_threads = parent_team->t.t_threads;
1579  // NOTE: if using distributed barrier, we need to run this code block
1580  // even when the team size appears not to have changed from the max.
1581  int old_proc = master_th->th.th_teams_size.nth;
1582  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583  bp_dist_bar) {
1584  __kmp_resize_dist_barrier(parent_team, old_proc,
1585  master_set_numthreads);
1586  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587  }
1588  parent_team->t.t_nproc = master_set_numthreads;
1589  for (i = 0; i < master_set_numthreads; ++i) {
1590  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591  }
1592  }
1593  // Keep extra threads hot in the team for possible next parallels
1594  master_th->th.th_set_nproc = 0;
1595  }
1596 
1597 #if USE_DEBUGGER
1598  if (__kmp_debugging) { // Let debugger override number of threads.
1599  int nth = __kmp_omp_num_threads(loc);
1600  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601  master_set_numthreads = nth;
1602  }
1603  }
1604 #endif
1605 
1606  // Figure out the proc_bind policy for the nested parallel within teams
1607  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608  // proc_bind_default means don't update
1609  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611  proc_bind = proc_bind_false;
1612  } else {
1613  // No proc_bind clause specified; use current proc-bind-var
1614  if (proc_bind == proc_bind_default) {
1615  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616  }
1617  /* else: The proc_bind policy was specified explicitly on parallel
1618  clause.
1619  This overrides proc-bind-var for this parallel region, but does not
1620  change proc-bind-var. */
1621  // Figure the value of proc-bind-var for the child threads.
1622  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624  master_th->th.th_current_task->td_icvs.proc_bind)) {
1625  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626  }
1627  }
1628  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629  // Need to change the bind-var ICV to correct value for each implicit task
1630  if (proc_bind_icv != proc_bind_default &&
1631  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632  kmp_info_t **other_threads = parent_team->t.t_threads;
1633  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634  other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635  proc_bind_icv;
1636  }
1637  }
1638  // Reset for next parallel region
1639  master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643  KMP_ITT_DEBUG) &&
1644  __kmp_forkjoin_frames_mode == 3 &&
1645  parent_team->t.t_active_level == 1 // only report frames at level 1
1646  && master_th->th.th_teams_size.nteams == 1) {
1647  kmp_uint64 tmp_time = __itt_get_timestamp();
1648  master_th->th.th_frame_time = tmp_time;
1649  parent_team->t.t_region_time = tmp_time;
1650  }
1651  if (__itt_stack_caller_create_ptr) {
1652  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653  // create new stack stitching id before entering fork barrier
1654  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655  }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658  __kmp_partition_places(parent_team);
1659 #endif
1660 
1661  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662  "master_th=%p, gtid=%d\n",
1663  root, parent_team, master_th, gtid));
1664  __kmp_internal_fork(loc, gtid, parent_team);
1665  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666  "master_th=%p, gtid=%d\n",
1667  root, parent_team, master_th, gtid));
1668 
1669  if (call_context == fork_context_gnu)
1670  return TRUE;
1671 
1672  /* Invoke microtask for PRIMARY thread */
1673  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674  parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676  if (!parent_team->t.t_invoke(gtid)) {
1677  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678  }
1679  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680  parent_team->t.t_id, parent_team->t.t_pkfn));
1681  KMP_MB(); /* Flush all pending memory write invalidates. */
1682 
1683  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685  return TRUE;
1686  } // Parallel closely nested in teams construct
1687 
1688 #if KMP_DEBUG
1689  if (__kmp_tasking_mode != tskm_immediate_exec) {
1690  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691  parent_team->t.t_task_team[master_th->th.th_task_state]);
1692  }
1693 #endif
1694 
1695  // Need this to happen before we determine the number of threads, not while
1696  // we are allocating the team
1697  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698  int enter_teams = 0;
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703  enter_teams = ((ap == NULL && active_level == 0) ||
1704  (ap && teams_level > 0 && teams_level == level));
1705  nthreads = master_set_numthreads
1706  ? master_set_numthreads
1707  // TODO: get nproc directly from current task
1708  : get__nproc_2(parent_team, master_tid);
1709  // Check whether we need to take the forkjoin lock (not needed for a
1710  // serialized parallel outside of a teams construct). This code was moved
1711  // here from __kmp_reserve_threads() to speed up nested serialized parallels.
1712  if (nthreads > 1) {
1713  if ((get__max_active_levels(master_th) == 1 &&
1714  (root->r.r_in_parallel && !enter_teams)) ||
1715  (__kmp_library == library_serial)) {
1716  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717  " threads\n",
1718  gtid, nthreads));
1719  nthreads = 1;
1720  }
1721  }
1722  if (nthreads > 1) {
1723  /* determine how many new threads we can use */
1724  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725  /* AC: If we execute teams from parallel region (on host), then teams
1726  should be created but each can only have 1 thread if nesting is
1727  disabled. If teams called from serial region, then teams and their
1728  threads should be created regardless of the nesting setting. */
1729  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730  nthreads, enter_teams);
1731  if (nthreads == 1) {
1732  // Free lock for single thread execution here; for multi-thread
1733  // execution it will be freed later after team of threads created
1734  // and initialized
1735  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736  }
1737  }
1738  }
1739  KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741  // If we temporarily changed the set number of threads then restore it now
1742  master_th->th.th_set_nproc = 0;
1743 
1744  /* create a serialized parallel region? */
1745  if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
1747 #if KMP_OS_LINUX && \
1748  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749  void *args[argc];
1750 #else
1751  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753  KMP_ARCH_AARCH64) */
1754 
1755  KA_TRACE(20,
1756  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758  __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761  master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764  if (call_context == fork_context_intel) {
1765  /* TODO this sucks, use the compiler itself to pass args! :) */
1766  master_th->th.th_serial_team->t.t_ident = loc;
1767  if (!ap) {
1768  // revert change made in __kmpc_serialized_parallel()
1769  master_th->th.th_serial_team->t.t_level--;
1770  // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773  void *dummy;
1774  void **exit_frame_p;
1775  ompt_task_info_t *task_info;
1776 
1777  ompt_lw_taskteam_t lw_taskteam;
1778 
1779  if (ompt_enabled.enabled) {
1780  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781  &ompt_parallel_data, return_address);
1782 
1783  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784  // don't use lw_taskteam after linking. content was swapped
1785 
1786  task_info = OMPT_CUR_TASK_INFO(master_th);
1787  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788  if (ompt_enabled.ompt_callback_implicit_task) {
1789  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790  __kmp_tid_from_gtid(gtid);
1791  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793  &(task_info->task_data), 1,
1794  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795  ompt_task_implicit);
1796  }
1797 
1798  /* OMPT state */
1799  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800  } else {
1801  exit_frame_p = &dummy;
1802  }
1803 #endif
1804 
1805  {
1806  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809  parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811  ,
1812  exit_frame_p
1813 #endif
1814  );
1815  }
1816 
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  *exit_frame_p = NULL;
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 1,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824  ompt_task_implicit);
1825  }
1826  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827  __ompt_lw_taskteam_unlink(master_th);
1828  if (ompt_enabled.ompt_callback_parallel_end) {
1829  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830  &ompt_parallel_data, parent_task_data,
1831  OMPT_INVOKER(call_context) | ompt_parallel_team,
1832  return_address);
1833  }
1834  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835  }
1836 #endif
1837  } else if (microtask == (microtask_t)__kmp_teams_master) {
1838  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839  master_th->th.th_serial_team);
1840  team = master_th->th.th_team;
1841  // team->t.t_pkfn = microtask;
1842  team->t.t_invoke = invoker;
1843  __kmp_alloc_argv_entries(argc, team, TRUE);
1844  team->t.t_argc = argc;
1845  argv = (void **)team->t.t_argv;
1846  if (ap) {
1847  for (i = argc - 1; i >= 0; --i)
1848  *argv++ = va_arg(kmp_va_deref(ap), void *);
1849  } else {
1850  for (i = 0; i < argc; ++i)
1851  // Get args from parent team for teams construct
1852  argv[i] = parent_team->t.t_argv[i];
1853  }
1854  // AC: revert change made in __kmpc_serialized_parallel()
1855  // because initial code in teams should have level=0
1856  team->t.t_level--;
1857  // AC: call special invoker for outer "parallel" of teams construct
1858  invoker(gtid);
1859 #if OMPT_SUPPORT
1860  if (ompt_enabled.enabled) {
1861  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862  if (ompt_enabled.ompt_callback_implicit_task) {
1863  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864  ompt_scope_end, NULL, &(task_info->task_data), 0,
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866  }
1867  if (ompt_enabled.ompt_callback_parallel_end) {
1868  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869  &ompt_parallel_data, parent_task_data,
1870  OMPT_INVOKER(call_context) | ompt_parallel_league,
1871  return_address);
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876  } else {
1877  argv = args;
1878  for (i = argc - 1; i >= 0; --i)
1879  *argv++ = va_arg(kmp_va_deref(ap), void *);
1880  KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883  void *dummy;
1884  void **exit_frame_p;
1885  ompt_task_info_t *task_info;
1886 
1887  ompt_lw_taskteam_t lw_taskteam;
1888 
1889  if (ompt_enabled.enabled) {
1890  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891  &ompt_parallel_data, return_address);
1892  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893  // don't use lw_taskteam after linking. content was swapped
1894  task_info = OMPT_CUR_TASK_INFO(master_th);
1895  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897  /* OMPT implicit task begin */
1898  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899  if (ompt_enabled.ompt_callback_implicit_task) {
1900  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903  ompt_task_implicit);
1904  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905  __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_frame_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_frame_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_frame_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933  ompt_task_implicit);
1934  }
1935 
1936  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937  __ompt_lw_taskteam_unlink(master_th);
1938  if (ompt_enabled.ompt_callback_parallel_end) {
1939  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940  &ompt_parallel_data, parent_task_data,
1941  OMPT_INVOKER(call_context) | ompt_parallel_team,
1942  return_address);
1943  }
1944  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945  }
1946 #endif
1947  }
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. content was swapped
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the case when not serialized
1973  // serialized case is handled in kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984  if (!master_th->th.th_teams_microtask || level > teams_level) {
1985  /* Increment our nested depth level */
1986  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987  }
1988 
1989  // See if we need to make a copy of the ICVs.
1990  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991  if ((level + 1 < __kmp_nested_nth.used) &&
1992  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994  } else {
1995  nthreads_icv = 0; // don't update
1996  }
1997 
1998  // Figure out the proc_bind_policy for the new team.
1999  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000  // proc_bind_default means don't update
2001  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003  proc_bind = proc_bind_false;
2004  } else {
2005  // No proc_bind clause specified; use current proc-bind-var for this
2006  // parallel region
2007  if (proc_bind == proc_bind_default) {
2008  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009  }
2010  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011  if (master_th->th.th_teams_microtask &&
2012  microtask == (microtask_t)__kmp_teams_master) {
2013  proc_bind = __kmp_teams_proc_bind;
2014  }
2015  /* else: The proc_bind policy was specified explicitly on parallel clause.
2016  This overrides proc-bind-var for this parallel region, but does not
2017  change proc-bind-var. */
2018  // Figure the value of proc-bind-var for the child threads.
2019  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021  master_th->th.th_current_task->td_icvs.proc_bind)) {
2022  // Do not modify the proc bind icv for the two teams construct forks
2023  // They just let the proc bind icv pass through
2024  if (!master_th->th.th_teams_microtask ||
2025  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027  }
2028  }
2029 
2030  // Reset for next parallel region
2031  master_th->th.th_set_proc_bind = proc_bind_default;
2032 
2033  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040  if (proc_bind_icv != proc_bind_default) {
2041  new_icvs.proc_bind = proc_bind_icv;
2042  }
2043 
2044  /* allocate a new parallel team */
2045  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046  team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048  ompt_parallel_data,
2049 #endif
2050  proc_bind, &new_icvs,
2051  argc USE_NESTED_HOT_ARG(master_th));
2052  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054  } else {
2055  /* allocate a new parallel team */
2056  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057  team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059  ompt_parallel_data,
2060 #endif
2061  proc_bind,
2062  &master_th->th.th_current_task->td_icvs,
2063  argc USE_NESTED_HOT_ARG(master_th));
2064  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066  &master_th->th.th_current_task->td_icvs);
2067  }
2068  KF_TRACE(
2069  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071  /* setup the new team */
2072  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079  return_address);
2080 #endif
2081  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082  // TODO: parent_team->t.t_level == INT_MAX ???
2083  if (!master_th->th.th_teams_microtask || level > teams_level) {
2084  int new_level = parent_team->t.t_level + 1;
2085  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086  new_level = parent_team->t.t_active_level + 1;
2087  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088  } else {
2089  // AC: Do not increase parallel level at start of the teams construct
2090  int new_level = parent_team->t.t_level;
2091  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092  new_level = parent_team->t.t_active_level;
2093  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094  }
2095  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096  // set primary thread's schedule as new run-time schedule
2097  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102  // Update the floating point rounding in the team if required.
2103  propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105  if (ompd_state & OMPD_ENABLE_BP)
2106  ompd_bp_parallel_begin();
2107 #endif
2108 
2109  if (__kmp_tasking_mode != tskm_immediate_exec) {
2110  // Set the primary thread's task team to the team's task team. Unless this is
2111  // a hot team, it should be NULL.
2112  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113  parent_team->t.t_task_team[master_th->th.th_task_state]);
2114  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115  "%p, new task_team %p / team %p\n",
2116  __kmp_gtid_from_thread(master_th),
2117  master_th->th.th_task_team, parent_team,
2118  team->t.t_task_team[master_th->th.th_task_state], team));
2119 
2120  if (active_level || master_th->th.th_task_team) {
2121  // Take a memo of primary thread's task_state
2122  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123  if (master_th->th.th_task_state_top >=
2124  master_th->th.th_task_state_stack_sz) { // increase size
2125  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126  kmp_uint8 *old_stack, *new_stack;
2127  kmp_uint32 i;
2128  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131  }
2132  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133  ++i) { // zero-init rest of stack
2134  new_stack[i] = 0;
2135  }
2136  old_stack = master_th->th.th_task_state_memo_stack;
2137  master_th->th.th_task_state_memo_stack = new_stack;
2138  master_th->th.th_task_state_stack_sz = new_size;
2139  __kmp_free(old_stack);
2140  }
2141  // Store primary thread's task_state on stack
2142  master_th->th
2143  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144  master_th->th.th_task_state;
2145  master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147  if (master_th->th.th_hot_teams &&
2148  active_level < __kmp_hot_teams_max_level &&
2149  team == master_th->th.th_hot_teams[active_level].hot_team) {
2150  // Restore primary thread's nested state if nested hot team
2151  master_th->th.th_task_state =
2152  master_th->th
2153  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154  } else {
2155 #endif
2156  master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158  }
2159 #endif
2160  }
2161 #if !KMP_NESTED_HOT_TEAMS
2162  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163  (team == root->r.r_hot_team));
2164 #endif
2165  }
2166 
2167  KA_TRACE(
2168  20,
2169  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171  team->t.t_nproc));
2172  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173  (team->t.t_master_tid == 0 &&
2174  (team->t.t_parent == root->r.r_root_team ||
2175  team->t.t_parent->t.t_serialized)));
2176  KMP_MB();
2177 
2178  /* now, setup the arguments */
2179  argv = (void **)team->t.t_argv;
2180  if (ap) {
2181  for (i = argc - 1; i >= 0; --i) {
2182  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183  KMP_CHECK_UPDATE(*argv, new_argv);
2184  argv++;
2185  }
2186  } else {
2187  for (i = 0; i < argc; ++i) {
2188  // Get args from parent team for teams construct
2189  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190  }
2191  }
2192 
2193  /* now actually fork the threads */
2194  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196  root->r.r_active = TRUE;
2197 
2198  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199  __kmp_setup_icv_copy(team, nthreads,
2200  &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209  if (team->t.t_active_level == 1 // only report frames at level 1
2210  && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213  (__kmp_forkjoin_frames_mode == 3 ||
2214  __kmp_forkjoin_frames_mode == 1)) {
2215  kmp_uint64 tmp_time = 0;
2216  if (__itt_get_timestamp_ptr)
2217  tmp_time = __itt_get_timestamp();
2218  // Internal fork - report frame begin
2219  master_th->th.th_frame_time = tmp_time;
2220  if (__kmp_forkjoin_frames_mode == 3)
2221  team->t.t_region_time = tmp_time;
2222  } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229  }
2230  }
2231 #endif /* USE_ITT_BUILD */
2232 
2233  /* now go on and do the work */
2234  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235  KMP_MB();
2236  KF_TRACE(10,
2237  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241  if (__itt_stack_caller_create_ptr) {
2242  // create new stack stitching id before entering fork barrier
2243  if (!enter_teams) {
2244  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246  } else if (parent_team->t.t_serialized) {
2247  // keep stack stitching id in the serialized parent_team;
2248  // current team will be used for parallel inside the teams;
2249  // if parent_team is active, then it already keeps stack stitching id
2250  // for the league of teams
2251  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253  }
2254  }
2255 #endif /* USE_ITT_BUILD */
2256 
2257  // AC: skip __kmp_internal_fork at teams construct, let only primary
2258  // threads execute
2259  if (ap) {
2260  __kmp_internal_fork(loc, gtid, team);
2261  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262  "master_th=%p, gtid=%d\n",
2263  root, team, master_th, gtid));
2264  }
2265 
2266  if (call_context == fork_context_gnu) {
2267  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268  return TRUE;
2269  }
2270 
2271  /* Invoke microtask for PRIMARY thread */
2272  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273  team->t.t_id, team->t.t_pkfn));
2274  } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277  // If beginning a teams construct, then change thread state
2278  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279  if (!ap) {
2280  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281  }
2282 #endif
2283 
2284  if (!team->t.t_invoke(gtid)) {
2285  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286  }
2287 
2288 #if KMP_STATS_ENABLED
2289  // If was beginning of a teams construct, then reset thread state
2290  if (!ap) {
2291  KMP_SET_THREAD_STATE(previous_state);
2292  }
2293 #endif
2294 
2295  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296  team->t.t_id, team->t.t_pkfn));
2297  KMP_MB(); /* Flush all pending memory write invalidates. */
2298 
2299  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301  if (ompt_enabled.enabled) {
2302  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303  }
2304 #endif
2305 
2306  return TRUE;
2307 }
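
// A rough, illustrative sketch of how this path is reached (the outlined
// function name and the ident_t below are made up; the real ones are emitted
// by the compiler): "#pragma omp parallel" is lowered into an outlined
// microtask plus a call to the entry point __kmpc_fork_call(), which in turn
// drives __kmp_fork_call() above.
//
//   static void outlined_body(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
//     // body of the parallel region, executed by every thread in the team
//   }
//   void user_func(int x) {
//     static ident_t loc = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};
//     __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined_body, &x);
//   }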
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311  kmp_team_t *team) {
2312  // restore state outside the region
2313  thread->th.ompt_thread_info.state =
2314  ((team->t.t_serialized) ? ompt_state_work_serial
2315  : ompt_state_work_parallel);
2316 }
2317 
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319  kmp_team_t *team, ompt_data_t *parallel_data,
2320  int flags, void *codeptr) {
2321  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322  if (ompt_enabled.ompt_callback_parallel_end) {
2323  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324  parallel_data, &(task_info->task_data), flags, codeptr);
2325  }
2326 
2327  task_info->frame.enter_frame = ompt_data_none;
2328  __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334  ,
2335  enum fork_context_e fork_context
2336 #endif
2337  ,
2338  int exit_teams) {
2339  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340  kmp_team_t *team;
2341  kmp_team_t *parent_team;
2342  kmp_info_t *master_th;
2343  kmp_root_t *root;
2344  int master_active;
2345 
2346  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348  /* setup current data */
2349  master_th = __kmp_threads[gtid];
2350  root = master_th->th.th_root;
2351  team = master_th->th.th_team;
2352  parent_team = team->t.t_parent;
2353 
2354  master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357  void *team_microtask = (void *)team->t.t_pkfn;
2358  // For the GOMP interface with a serialized parallel region, we need
2359  // __kmpc_end_serialized_parallel to invoke the OMPT end-implicit-task
2360  // and end-parallel hooks.
2361  if (ompt_enabled.enabled &&
2362  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364  }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370  "th_task_team = %p\n",
2371  __kmp_gtid_from_thread(master_th), team,
2372  team->t.t_task_team[master_th->th.th_task_state],
2373  master_th->th.th_task_team));
2374  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375  team->t.t_task_team[master_th->th.th_task_state]);
2376  }
2377 #endif
2378 
2379  if (team->t.t_serialized) {
2380  if (master_th->th.th_teams_microtask) {
2381  // We are in teams construct
2382  int level = team->t.t_level;
2383  int tlevel = master_th->th.th_teams_level;
2384  if (level == tlevel) {
2385  // AC: we haven't incremented it earlier at start of teams construct,
2386  // so do it here - at the end of teams construct
2387  team->t.t_level++;
2388  } else if (level == tlevel + 1) {
2389  // AC: we are exiting parallel inside teams, need to increment
2390  // serialization in order to restore it in the next call to
2391  // __kmpc_end_serialized_parallel
2392  team->t.t_serialized++;
2393  }
2394  }
2395  __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398  if (ompt_enabled.enabled) {
2399  __kmp_join_restore_state(master_th, parent_team);
2400  }
2401 #endif
2402 
2403  return;
2404  }
2405 
2406  master_active = team->t.t_master_active;
2407 
2408  if (!exit_teams) {
2409  // AC: No barrier for internal teams at exit from teams construct.
2410  // But there is barrier for external team (league).
2411  __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413  if (__itt_stack_caller_create_ptr) {
2414  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415  // destroy the stack stitching id after join barrier
2416  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417  team->t.t_stack_id = NULL;
2418  }
2419 #endif
2420  } else {
2421  master_th->th.th_task_state =
2422  0; // AC: no tasking in teams (out of any parallel)
2423 #if USE_ITT_BUILD
2424  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426  // destroy the stack stitching id on exit from the teams construct
2427  // if parent_team is active, then the id will be destroyed later on
2428  // by master of the league of teams
2429  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430  parent_team->t.t_stack_id = NULL;
2431  }
2432 #endif
2433 
2434  if (team->t.t_nproc > 1 &&
2435  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436  team->t.b->update_num_threads(team->t.t_nproc);
2437  __kmp_add_threads_to_team(team, team->t.t_nproc);
2438  }
2439  }
2440 
2441  KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445  void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450  if (team->t.t_active_level == 1 &&
2451  (!master_th->th.th_teams_microtask || /* not in teams construct */
2452  master_th->th.th_teams_size.nteams == 1)) {
2453  master_th->th.th_ident = loc;
2454  // only one notification scheme (either "submit" or "forking/joined", not
2455  // both)
2456  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457  __kmp_forkjoin_frames_mode == 3)
2458  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459  master_th->th.th_frame_time, 0, loc,
2460  master_th->th.th_team_nproc, 1);
2461  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463  __kmp_itt_region_joined(gtid);
2464  } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468  if (!exit_teams) {
2469  // Restore master thread's partition.
2470  master_th->th.th_first_place = team->t.t_first_place;
2471  master_th->th.th_last_place = team->t.t_last_place;
2472  }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475  if (master_th->th.th_teams_microtask && !exit_teams &&
2476  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477  team->t.t_level == master_th->th.th_teams_level + 1) {
2478 // AC: We need to leave the team structure intact at the end of parallel
2479 // inside the teams construct, so that at the next parallel same (hot) team
2480 // works, only adjust nesting levels
2481 #if OMPT_SUPPORT
2482  ompt_data_t ompt_parallel_data = ompt_data_none;
2483  if (ompt_enabled.enabled) {
2484  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485  if (ompt_enabled.ompt_callback_implicit_task) {
2486  int ompt_team_size = team->t.t_nproc;
2487  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490  }
2491  task_info->frame.exit_frame = ompt_data_none;
2492  task_info->task_data = ompt_data_none;
2493  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494  __ompt_lw_taskteam_unlink(master_th);
2495  }
2496 #endif
2497  /* Decrement our nested depth level */
2498  team->t.t_level--;
2499  team->t.t_active_level--;
2500  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502  // Restore number of threads in the team if needed. This code relies on
2503  // the proper adjustment of th_teams_size.nth after the fork in
2504  // __kmp_teams_master on each teams primary thread in the case that
2505  // __kmp_reserve_threads reduced it.
2506  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507  int old_num = master_th->th.th_team_nproc;
2508  int new_num = master_th->th.th_teams_size.nth;
2509  kmp_info_t **other_threads = team->t.t_threads;
2510  team->t.t_nproc = new_num;
2511  for (int i = 0; i < old_num; ++i) {
2512  other_threads[i]->th.th_team_nproc = new_num;
2513  }
2514  // Adjust states of non-used threads of the team
2515  for (int i = old_num; i < new_num; ++i) {
2516  // Re-initialize thread's barrier data.
2517  KMP_DEBUG_ASSERT(other_threads[i]);
2518  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519  for (int b = 0; b < bs_last_barrier; ++b) {
2520  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525  }
2526  if (__kmp_tasking_mode != tskm_immediate_exec) {
2527  // Synchronize thread's task state
2528  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529  }
2530  }
2531  }
2532 
2533 #if OMPT_SUPPORT
2534  if (ompt_enabled.enabled) {
2535  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537  }
2538 #endif
2539 
2540  return;
2541  }
2542 
2543  /* do cleanup and restore the parent team */
2544  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
2547  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549  /* jc: The following lock has instructions with REL and ACQ semantics,
2550  separating the parallel user code called in this parallel region
2551  from the serial user code called after this function returns. */
2552  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554  if (!master_th->th.th_teams_microtask ||
2555  team->t.t_level > master_th->th.th_teams_level) {
2556  /* Decrement our nested depth level */
2557  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558  }
2559  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562  if (ompt_enabled.enabled) {
2563  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564  if (ompt_enabled.ompt_callback_implicit_task) {
2565  int flags = (team_microtask == (void *)__kmp_teams_master)
2566  ? ompt_task_initial
2567  : ompt_task_implicit;
2568  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572  }
2573  task_info->frame.exit_frame = ompt_data_none;
2574  task_info->task_data = ompt_data_none;
2575  }
2576 #endif
2577 
2578  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579  master_th, team));
2580  __kmp_pop_current_task_from_thread(master_th);
2581 
2582  master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585  if (ompd_state & OMPD_ENABLE_BP)
2586  ompd_bp_parallel_end();
2587 #endif
2588  updateHWFPControl(team);
2589 
2590  if (root->r.r_active != master_active)
2591  root->r.r_active = master_active;
2592 
2593  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594  master_th)); // this will free worker threads
2595 
2596  /* this race was fun to find. make sure the following is in the critical
2597  region otherwise assertions may fail occasionally since the old team may be
2598  reallocated and the hierarchy appears inconsistent. it is actually safe to
2599  run and won't cause any bugs, but will cause those assertion failures. it's
2600  only one deref&assign so might as well put this in the critical region */
2601  master_th->th.th_team = parent_team;
2602  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603  master_th->th.th_team_master = parent_team->t.t_threads[0];
2604  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606  /* restore serialized team, if need be */
2607  if (parent_team->t.t_serialized &&
2608  parent_team != master_th->th.th_serial_team &&
2609  parent_team != root->r.r_root_team) {
2610  __kmp_free_team(root,
2611  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612  master_th->th.th_serial_team = parent_team;
2613  }
2614 
2615  if (__kmp_tasking_mode != tskm_immediate_exec) {
2616  if (master_th->th.th_task_state_top >
2617  0) { // Restore task state from memo stack
2618  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619  // Remember primary thread's state if we re-use this nested hot team
2620  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621  master_th->th.th_task_state;
2622  --master_th->th.th_task_state_top; // pop
2623  // Now restore state at this level
2624  master_th->th.th_task_state =
2625  master_th->th
2626  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627  }
2628  // Copy the task team from the parent team to the primary thread
2629  master_th->th.th_task_team =
2630  parent_team->t.t_task_team[master_th->th.th_task_state];
2631  KA_TRACE(20,
2632  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634  parent_team));
2635  }
2636 
2637  // TODO: GEH - cannot do this assertion because root thread not set up as
2638  // executing
2639  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640  master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if OMPT_SUPPORT
2645  int flags =
2646  OMPT_INVOKER(fork_context) |
2647  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2648  : ompt_parallel_team);
2649  if (ompt_enabled.enabled) {
2650  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2651  codeptr);
2652  }
2653 #endif
2654 
2655  KMP_MB();
2656  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2657 }
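
// Illustrative user-level view of the serialized path above: when an inner
// parallel region is serialized (e.g. nesting not enabled, or an if(0)
// clause), team->t.t_serialized is set and the join is delegated to
// __kmpc_end_serialized_parallel() instead of running the full join barrier.
//
//   #include <omp.h>
//   void nested_join_sketch(void) {
//   #pragma omp parallel num_threads(4)
//     {
//   #pragma omp parallel if(0) // always serialized: team of 1 per outer thread
//       { /* joins through the serialized branch of __kmp_join_call */ }
//     }
//   }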
2658 
2659 /* Check whether we should push an internal control record onto the
2660  serial team stack. If so, do it. */
2661 void __kmp_save_internal_controls(kmp_info_t *thread) {
2662 
2663  if (thread->th.th_team != thread->th.th_serial_team) {
2664  return;
2665  }
2666  if (thread->th.th_team->t.t_serialized > 1) {
2667  int push = 0;
2668 
2669  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2670  push = 1;
2671  } else {
2672  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2673  thread->th.th_team->t.t_serialized) {
2674  push = 1;
2675  }
2676  }
2677  if (push) { /* push a record on the serial team's stack */
2678  kmp_internal_control_t *control =
2679  (kmp_internal_control_t *)__kmp_allocate(
2680  sizeof(kmp_internal_control_t));
2681 
2682  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2683 
2684  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2685 
2686  control->next = thread->th.th_team->t.t_control_stack_top;
2687  thread->th.th_team->t.t_control_stack_top = control;
2688  }
2689  }
2690 }
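
// Minimal sketch of what the control stack above provides (user-level view):
// an ICV change made inside a serialized nested parallel region is scoped to
// that region; __kmpc_end_serialized_parallel() pops the saved record and
// restores the enclosing values.
//
//   #include <omp.h>
//   void icv_scope_sketch(void) {
//   #pragma omp parallel num_threads(2)
//     {
//   #pragma omp parallel if(0) // serialized inner region
//       {
//         omp_set_num_threads(3); // change is local to this data environment
//       }
//       // back here, omp_get_max_threads() reflects the outer setting again
//     }
//   }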
2691 
2692 /* Changes set_nproc */
2693 void __kmp_set_num_threads(int new_nth, int gtid) {
2694  kmp_info_t *thread;
2695  kmp_root_t *root;
2696 
2697  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2698  KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700  if (new_nth < 1)
2701  new_nth = 1;
2702  else if (new_nth > __kmp_max_nth)
2703  new_nth = __kmp_max_nth;
2704 
2705  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2706  thread = __kmp_threads[gtid];
2707  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2708  return; // nothing to do
2709 
2710  __kmp_save_internal_controls(thread);
2711 
2712  set__nproc(thread, new_nth);
2713 
2714  // If this omp_set_num_threads() call will cause the hot team size to be
2715  // reduced (in the absence of a num_threads clause), then reduce it now,
2716  // rather than waiting for the next parallel region.
2717  root = thread->th.th_root;
2718  if (__kmp_init_parallel && (!root->r.r_active) &&
2719  (root->r.r_hot_team->t.t_nproc > new_nth)
2720 #if KMP_NESTED_HOT_TEAMS
2721  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2722 #endif
2723  ) {
2724  kmp_team_t *hot_team = root->r.r_hot_team;
2725  int f;
2726 
2727  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2728 
2729  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2730  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2731  }
2732  // Release the extra threads we don't need any more.
2733  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2734  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2735  if (__kmp_tasking_mode != tskm_immediate_exec) {
2736  // When decreasing team size, threads no longer in the team should unref
2737  // task team.
2738  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2739  }
2740  __kmp_free_thread(hot_team->t.t_threads[f]);
2741  hot_team->t.t_threads[f] = NULL;
2742  }
2743  hot_team->t.t_nproc = new_nth;
2744 #if KMP_NESTED_HOT_TEAMS
2745  if (thread->th.th_hot_teams) {
2746  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2747  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2748  }
2749 #endif
2750 
2751  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2752  hot_team->t.b->update_num_threads(new_nth);
2753  __kmp_add_threads_to_team(hot_team, new_nth);
2754  }
2755 
2756  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2757 
2758  // Update the t_nproc field in the threads that are still active.
2759  for (f = 0; f < new_nth; f++) {
2760  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2761  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2762  }
2763  // Special flag marking that the size change came from an omp_set_num_threads() call
2764  hot_team->t.t_size_changed = -1;
2765  }
2766 }
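
// User-level sketch: omp_set_num_threads() routes here via the calling
// thread's gtid, so reducing the value can shrink the hot team immediately
// (subject to the conditions checked above) rather than at the next fork.
//
//   #include <omp.h>
//   void shrink_hot_team_sketch(void) {
//   #pragma omp parallel num_threads(8)
//     { /* a hot team of 8 threads is created */ }
//     omp_set_num_threads(2); // may release six hot-team threads right away
//   #pragma omp parallel
//     { /* runs with 2 threads */ }
//   }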
2767 
2768 /* Changes max_active_levels */
2769 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2770  kmp_info_t *thread;
2771 
2772  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2773  "%d = (%d)\n",
2774  gtid, max_active_levels));
2775  KMP_DEBUG_ASSERT(__kmp_init_serial);
2776 
2777  // validate max_active_levels
2778  if (max_active_levels < 0) {
2779  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2780  // We ignore this call if the user has specified a negative value.
2781  // The current setting won't be changed. The last valid setting will be
2782  // used. A warning will be issued (if warnings are allowed as controlled by
2783  // the KMP_WARNINGS env var).
2784  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2785  "max_active_levels for thread %d = (%d)\n",
2786  gtid, max_active_levels));
2787  return;
2788  }
2789  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2790  // it's OK, the max_active_levels is within the valid range: [ 0;
2791  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2792  // We allow a zero value. (implementation defined behavior)
2793  } else {
2794  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2795  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2796  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2797  // Current upper limit is MAX_INT. (implementation defined behavior)
2798  // If the input exceeds the upper limit, we correct the input to be the
2799  // upper limit. (implementation defined behavior)
2800  // In practice, control should never reach here while the limit is MAX_INT.
2801  }
2802  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2803  "max_active_levels for thread %d = (%d)\n",
2804  gtid, max_active_levels));
2805 
2806  thread = __kmp_threads[gtid];
2807 
2808  __kmp_save_internal_controls(thread);
2809 
2810  set__max_active_levels(thread, max_active_levels);
2811 }
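
// User-level sketch: omp_set_max_active_levels() funnels into this routine;
// negative values are ignored with a warning and values above
// KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped, as handled above.
//
//   #include <omp.h>
//   void nesting_sketch(void) {
//     omp_set_max_active_levels(2); // permit two active levels of parallelism
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(2) // inner regions now get real teams
//     { /* up to 4 implicit tasks at the inner level */ }
//   }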
2812 
2813 /* Gets max_active_levels */
2814 int __kmp_get_max_active_levels(int gtid) {
2815  kmp_info_t *thread;
2816 
2817  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2818  KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820  thread = __kmp_threads[gtid];
2821  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2822  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2823  "curtask_maxaclevel=%d\n",
2824  gtid, thread->th.th_current_task,
2825  thread->th.th_current_task->td_icvs.max_active_levels));
2826  return thread->th.th_current_task->td_icvs.max_active_levels;
2827 }
2828 
2829 // nteams-var per-device ICV
2830 void __kmp_set_num_teams(int num_teams) {
2831  if (num_teams > 0)
2832  __kmp_nteams = num_teams;
2833 }
2834 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2835 // teams-thread-limit-var per-device ICV
2836 void __kmp_set_teams_thread_limit(int limit) {
2837  if (limit > 0)
2838  __kmp_teams_thread_limit = limit;
2839 }
2840 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
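
// User-level sketch (OpenMP 5.1 API): omp_set_num_teams() and
// omp_set_teams_thread_limit() update the per-device ICVs above, which bound
// a later "teams" construct that carries no explicit clauses.
//
//   #include <omp.h>
//   void teams_icv_sketch(void) {
//     omp_set_num_teams(4);          // nteams-var
//     omp_set_teams_thread_limit(8); // teams-thread-limit-var
//   #pragma omp teams
//     { /* at most 4 teams, each limited to 8 threads */ }
//   }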
2841 
2842 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2843 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2844 
2845 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2846 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2847  kmp_info_t *thread;
2848  kmp_sched_t orig_kind;
2849  // kmp_team_t *team;
2850 
2851  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2852  gtid, (int)kind, chunk));
2853  KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855  // Check if the kind parameter is valid, correct if needed.
2856  // Valid parameters should fit in one of two intervals - standard or extended:
2857  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2858  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2859  orig_kind = kind;
2860  kind = __kmp_sched_without_mods(kind);
2861 
2862  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2863  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2864  // TODO: Hint needs attention in case we change the default schedule.
2865  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2866  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2867  __kmp_msg_null);
2868  kind = kmp_sched_default;
2869  chunk = 0; // ignore chunk value in case of bad kind
2870  }
2871 
2872  thread = __kmp_threads[gtid];
2873 
2874  __kmp_save_internal_controls(thread);
2875 
2876  if (kind < kmp_sched_upper_std) {
2877  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2878  // distinguish static chunked vs. unchunked: an invalid chunk indicates the
2879  // unchunked schedule (which is the default)
2880  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2881  } else {
2882  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2883  __kmp_sch_map[kind - kmp_sched_lower - 1];
2884  }
2885  } else {
2886  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2887  // kmp_sched_lower - 2 ];
2888  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2889  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2890  kmp_sched_lower - 2];
2891  }
2892  __kmp_sched_apply_mods_intkind(
2893  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2894  if (kind == kmp_sched_auto || chunk < 1) {
2895  // ignore parameter chunk for schedule auto
2896  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2897  } else {
2898  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2899  }
2900 }
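
// User-level sketch: omp_set_schedule() maps the portable omp_sched_t kind
// onto the internal kmp_sched_t handled above; an out-of-range kind falls
// back to the default schedule with a warning.
//
//   #include <omp.h>
//   void set_schedule_sketch(void) {
//     omp_set_schedule(omp_sched_dynamic, 16); // run-time schedule: dynamic,16
//   #pragma omp parallel for schedule(runtime)
//     for (int i = 0; i < 1000; ++i) { /* dispatched in chunks of 16 */ }
//   }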
2901 
2902 /* Gets def_sched_var ICV values */
2903 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2904  kmp_info_t *thread;
2905  enum sched_type th_type;
2906 
2907  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2908  KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910  thread = __kmp_threads[gtid];
2911 
2912  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2913  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2914  case kmp_sch_static:
2915  case kmp_sch_static_greedy:
2916  case kmp_sch_static_balanced:
2917  *kind = kmp_sched_static;
2918  __kmp_sched_apply_mods_stdkind(kind, th_type);
2919  *chunk = 0; // chunk was not set; signal this with a zero value
2920  return;
2921  case kmp_sch_static_chunked:
2922  *kind = kmp_sched_static;
2923  break;
2924  case kmp_sch_dynamic_chunked:
2925  *kind = kmp_sched_dynamic;
2926  break;
2927  case kmp_sch_guided_chunked:
2928  case kmp_sch_guided_iterative_chunked:
2929  case kmp_sch_guided_analytical_chunked:
2930  *kind = kmp_sched_guided;
2931  break;
2932  case kmp_sch_auto:
2933  *kind = kmp_sched_auto;
2934  break;
2935  case kmp_sch_trapezoidal:
2936  *kind = kmp_sched_trapezoidal;
2937  break;
2938 #if KMP_STATIC_STEAL_ENABLED
2939  case kmp_sch_static_steal:
2940  *kind = kmp_sched_static_steal;
2941  break;
2942 #endif
2943  default:
2944  KMP_FATAL(UnknownSchedulingType, th_type);
2945  }
2946 
2947  __kmp_sched_apply_mods_stdkind(kind, th_type);
2948  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2949 }
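
// User-level sketch: omp_get_schedule() reads back the (kind, chunk) pair
// decoded above; note that an unchunked static schedule is reported with
// chunk == 0.
//
//   #include <omp.h>
//   #include <stdio.h>
//   void get_schedule_sketch(void) {
//     omp_sched_t kind;
//     int chunk;
//     omp_get_schedule(&kind, &chunk);
//     printf("run-time schedule: kind=%d chunk=%d\n", (int)kind, chunk);
//   }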
2950 
2951 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2952 
2953  int ii, dd;
2954  kmp_team_t *team;
2955  kmp_info_t *thr;
2956 
2957  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2958  KMP_DEBUG_ASSERT(__kmp_init_serial);
2959 
2960  // validate level
2961  if (level == 0)
2962  return 0;
2963  if (level < 0)
2964  return -1;
2965  thr = __kmp_threads[gtid];
2966  team = thr->th.th_team;
2967  ii = team->t.t_level;
2968  if (level > ii)
2969  return -1;
2970 
2971  if (thr->th.th_teams_microtask) {
2972  // AC: we are in teams region where multiple nested teams have same level
2973  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2974  if (level <=
2975  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2976  KMP_DEBUG_ASSERT(ii >= tlevel);
2977  // AC: As we need to pass by the teams league, we need to artificially
2978  // increase ii
2979  if (ii == tlevel) {
2980  ii += 2; // three teams have same level
2981  } else {
2982  ii++; // two teams have same level
2983  }
2984  }
2985  }
2986 
2987  if (ii == level)
2988  return __kmp_tid_from_gtid(gtid);
2989 
2990  dd = team->t.t_serialized;
2991  level++;
2992  while (ii > level) {
2993  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2994  }
2995  if ((team->t.t_serialized) && (!dd)) {
2996  team = team->t.t_parent;
2997  continue;
2998  }
2999  if (ii > level) {
3000  team = team->t.t_parent;
3001  dd = team->t.t_serialized;
3002  ii--;
3003  }
3004  }
3005 
3006  return (dd > 1) ? (0) : (team->t.t_master_tid);
3007 }
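
// User-level sketch: omp_get_ancestor_thread_num(level) resolves through the
// team-hierarchy walk above; level 0 is the initial task, and the level
// returned by omp_get_level() is the calling thread itself.
//
//   #include <omp.h>
//   void ancestor_sketch(void) {
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(2)
//     {
//       int outer = omp_get_ancestor_thread_num(1); // tid within outer team
//       int self = omp_get_ancestor_thread_num(omp_get_level());
//       // self == omp_get_thread_num()
//       (void)outer;
//       (void)self;
//     }
//   }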
3008 
3009 int __kmp_get_team_size(int gtid, int level) {
3010 
3011  int ii, dd;
3012  kmp_team_t *team;
3013  kmp_info_t *thr;
3014 
3015  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3016  KMP_DEBUG_ASSERT(__kmp_init_serial);
3017 
3018  // validate level
3019  if (level == 0)
3020  return 1;
3021  if (level < 0)
3022  return -1;
3023  thr = __kmp_threads[gtid];
3024  team = thr->th.th_team;
3025  ii = team->t.t_level;
3026  if (level > ii)
3027  return -1;
3028 
3029  if (thr->th.th_teams_microtask) {
3030  // AC: we are in teams region where multiple nested teams have same level
3031  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3032  if (level <=
3033  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3034  KMP_DEBUG_ASSERT(ii >= tlevel);
3035  // AC: As we need to pass by the teams league, we need to artificially
3036  // increase ii
3037  if (ii == tlevel) {
3038  ii += 2; // three teams have same level
3039  } else {
3040  ii++; // two teams have same level
3041  }
3042  }
3043  }
3044 
3045  while (ii > level) {
3046  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3047  }
3048  if (team->t.t_serialized && (!dd)) {
3049  team = team->t.t_parent;
3050  continue;
3051  }
3052  if (ii > level) {
3053  team = team->t.t_parent;
3054  ii--;
3055  }
3056  }
3057 
3058  return team->t.t_nproc;
3059 }
3060 
3061 kmp_r_sched_t __kmp_get_schedule_global() {
3062  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3063  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3064  // independently, so the updated schedule can be obtained here.
3065 
3066  kmp_r_sched_t r_sched;
3067 
3068  // create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3069  // __kmp_guided. __kmp_sched should keep its original value, so that the user
3070  // can set KMP_SCHEDULE multiple times and thus have different run-time
3071  // schedules in different roots (even in OMP 2.5)
3072  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3073  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3074  if (s == kmp_sch_static) {
3075  // replace STATIC with more detailed schedule (balanced or greedy)
3076  r_sched.r_sched_type = __kmp_static;
3077  } else if (s == kmp_sch_guided_chunked) {
3078  // replace GUIDED with more detailed schedule (iterative or analytical)
3079  r_sched.r_sched_type = __kmp_guided;
3080  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3081  r_sched.r_sched_type = __kmp_sched;
3082  }
3083  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3084 
3085  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3086  // __kmp_chunk may be wrong here (if it was not ever set)
3087  r_sched.chunk = KMP_DEFAULT_CHUNK;
3088  } else {
3089  r_sched.chunk = __kmp_chunk;
3090  }
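  // Example (editorial, illustrative): if __kmp_sched is plain kmp_sch_static
  // and no chunk was ever set, the returned r_sched_type is the more specific
  // __kmp_static variant (balanced or greedy) and r_sched.chunk falls back to
  // KMP_DEFAULT_CHUNK, per the substitutions above.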
3091 
3092  return r_sched;
3093 }
3094 
3095 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3096  at least argc number of *t_argv entries for the requested team. */
3097 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3098 
3099  KMP_DEBUG_ASSERT(team);
3100  if (!realloc || argc > team->t.t_max_argc) {
3101 
3102  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3103  "current entries=%d\n",
3104  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3105  /* if previously allocated heap space for args, free them */
3106  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3107  __kmp_free((void *)team->t.t_argv);
3108 
3109  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3110  /* use unused space in the cache line for arguments */
3111  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3112  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3113  "argv entries\n",
3114  team->t.t_id, team->t.t_max_argc));
3115  team->t.t_argv = &team->t.t_inline_argv[0];
3116  if (__kmp_storage_map) {
3117  __kmp_print_storage_map_gtid(
3118  -1, &team->t.t_inline_argv[0],
3119  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3120  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3121  team->t.t_id);
3122  }
3123  } else {
3124  /* allocate space for arguments in the heap */
3125  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3126  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3127  : 2 * argc;
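  // Example (editorial, hypothetical numbers): if KMP_MIN_MALLOC_ARGV_ENTRIES
  // were 100, an argc of 40 would still reserve 100 entries, while an argc of
  // 300 would reserve 600 -- over-allocating so that later regions with a
  // similar argument count can reuse the buffer without reallocating.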
3128  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3129  "argv entries\n",
3130  team->t.t_id, team->t.t_max_argc));
3131  team->t.t_argv =
3132  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3133  if (__kmp_storage_map) {
3134  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3135  &team->t.t_argv[team->t.t_max_argc],
3136  sizeof(void *) * team->t.t_max_argc,
3137  "team_%d.t_argv", team->t.t_id);
3138  }
3139  }
3140  }
3141 }
3142 
3143 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3144  int i;
3145  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3146  team->t.t_threads =
3147  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3148  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3149  sizeof(dispatch_shared_info_t) * num_disp_buff);
3150  team->t.t_dispatch =
3151  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3152  team->t.t_implicit_task_taskdata =
3153  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3154  team->t.t_max_nproc = max_nth;
3155 
3156  /* setup dispatch buffers */
3157  for (i = 0; i < num_disp_buff; ++i) {
3158  team->t.t_disp_buffer[i].buffer_index = i;
3159  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3160  }
3161 }
3162 
3163 static void __kmp_free_team_arrays(kmp_team_t *team) {
3164  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3165  int i;
3166  for (i = 0; i < team->t.t_max_nproc; ++i) {
3167  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3168  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3169  team->t.t_dispatch[i].th_disp_buffer = NULL;
3170  }
3171  }
3172 #if KMP_USE_HIER_SCHED
3173  __kmp_dispatch_free_hierarchies(team);
3174 #endif
3175  __kmp_free(team->t.t_threads);
3176  __kmp_free(team->t.t_disp_buffer);
3177  __kmp_free(team->t.t_dispatch);
3178  __kmp_free(team->t.t_implicit_task_taskdata);
3179  team->t.t_threads = NULL;
3180  team->t.t_disp_buffer = NULL;
3181  team->t.t_dispatch = NULL;
3182  team->t.t_implicit_task_taskdata = 0;
3183 }
3184 
3185 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3186  kmp_info_t **oldThreads = team->t.t_threads;
3187 
3188  __kmp_free(team->t.t_disp_buffer);
3189  __kmp_free(team->t.t_dispatch);
3190  __kmp_free(team->t.t_implicit_task_taskdata);
3191  __kmp_allocate_team_arrays(team, max_nth);
3192 
3193  KMP_MEMCPY(team->t.t_threads, oldThreads,
3194  team->t.t_nproc * sizeof(kmp_info_t *));
3195 
3196  __kmp_free(oldThreads);
3197 }
3198 
3199 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3200 
3201  kmp_r_sched_t r_sched =
3202  __kmp_get_schedule_global(); // get current state of scheduling globals
3203 
3204  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3205 
3206  kmp_internal_control_t g_icvs = {
3207  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3208  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3209  // adjustment of threads (per thread)
3210  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3211  // whether blocktime is explicitly set
3212  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3213 #if KMP_USE_MONITOR
3214  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3215 // intervals
3216 #endif
3217  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3218  // next parallel region (per thread)
3219  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3220  __kmp_cg_max_nth, // int thread_limit;
3221  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3222  // for max_active_levels
3223  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3224  // {sched,chunk} pair
3225  __kmp_nested_proc_bind.bind_types[0],
3226  __kmp_default_device,
3227  NULL // struct kmp_internal_control *next;
3228  };
3229 
3230  return g_icvs;
3231 }
3232 
3233 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3234 
3235  kmp_internal_control_t gx_icvs;
3236  gx_icvs.serial_nesting_level =
3237  0; // probably =team->t.t_serial like in save_inter_controls
3238  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3239  gx_icvs.next = NULL;
3240 
3241  return gx_icvs;
3242 }
3243 
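// Note (editorial): each root gets two teams below -- a serialized "root team"
// of size 1 that hosts the initial (sequential) task, and a "hot team" that is
// kept alive between parallel regions so this root's outer-level parallel
// regions can reuse their worker threads instead of re-forking them each time.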
3244 static void __kmp_initialize_root(kmp_root_t *root) {
3245  int f;
3246  kmp_team_t *root_team;
3247  kmp_team_t *hot_team;
3248  int hot_team_max_nth;
3249  kmp_r_sched_t r_sched =
3250  __kmp_get_schedule_global(); // get current state of scheduling globals
3251  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3252  KMP_DEBUG_ASSERT(root);
3253  KMP_ASSERT(!root->r.r_begin);
3254 
3255  /* setup the root state structure */
3256  __kmp_init_lock(&root->r.r_begin_lock);
3257  root->r.r_begin = FALSE;
3258  root->r.r_active = FALSE;
3259  root->r.r_in_parallel = 0;
3260  root->r.r_blocktime = __kmp_dflt_blocktime;
3261 #if KMP_AFFINITY_SUPPORTED
3262  root->r.r_affinity_assigned = FALSE;
3263 #endif
3264 
3265  /* setup the root team for this task */
3266  /* allocate the root team structure */
3267  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3268 
3269  root_team =
3270  __kmp_allocate_team(root,
3271  1, // new_nproc
3272  1, // max_nproc
3273 #if OMPT_SUPPORT
3274  ompt_data_none, // root parallel id
3275 #endif
3276  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3277  0 // argc
3278  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3279  );
3280 #if USE_DEBUGGER
3281  // Non-NULL value should be assigned to make the debugger display the root
3282  // team.
3283  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3284 #endif
3285 
3286  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3287 
3288  root->r.r_root_team = root_team;
3289  root_team->t.t_control_stack_top = NULL;
3290 
3291  /* initialize root team */
3292  root_team->t.t_threads[0] = NULL;
3293  root_team->t.t_nproc = 1;
3294  root_team->t.t_serialized = 1;
3295  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3296  root_team->t.t_sched.sched = r_sched.sched;
3297  KA_TRACE(
3298  20,
3299  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3300  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3301 
3302  /* setup the hot team for this task */
3303  /* allocate the hot team structure */
3304  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3305 
3306  hot_team =
3307  __kmp_allocate_team(root,
3308  1, // new_nproc
3309  __kmp_dflt_team_nth_ub * 2, // max_nproc
3310 #if OMPT_SUPPORT
3311  ompt_data_none, // root parallel id
3312 #endif
3313  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3314  0 // argc
3315  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3316  );
3317  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3318 
3319  root->r.r_hot_team = hot_team;
3320  root_team->t.t_control_stack_top = NULL;
3321 
3322  /* first-time initialization */
3323  hot_team->t.t_parent = root_team;
3324 
3325  /* initialize hot team */
3326  hot_team_max_nth = hot_team->t.t_max_nproc;
3327  for (f = 0; f < hot_team_max_nth; ++f) {
3328  hot_team->t.t_threads[f] = NULL;
3329  }
3330  hot_team->t.t_nproc = 1;
3331  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3332  hot_team->t.t_sched.sched = r_sched.sched;
3333  hot_team->t.t_size_changed = 0;
3334 }
3335 
3336 #ifdef KMP_DEBUG
3337 
3338 typedef struct kmp_team_list_item {
3339  kmp_team_p const *entry;
3340  struct kmp_team_list_item *next;
3341 } kmp_team_list_item_t;
3342 typedef kmp_team_list_item_t *kmp_team_list_t;
3343 
3344 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3345  kmp_team_list_t list, // List of teams.
3346  kmp_team_p const *team // Team to add.
3347 ) {
3348 
3349  // List must terminate with item where both entry and next are NULL.
3350  // Team is added to the list only once.
3351  // List is sorted in ascending order by team id.
3352  // Team id is *not* a key.
3353 
3354  kmp_team_list_t l;
3355 
3356  KMP_DEBUG_ASSERT(list != NULL);
3357  if (team == NULL) {
3358  return;
3359  }
3360 
3361  __kmp_print_structure_team_accum(list, team->t.t_parent);
3362  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3363 
3364  // Search list for the team.
3365  l = list;
3366  while (l->next != NULL && l->entry != team) {
3367  l = l->next;
3368  }
3369  if (l->next != NULL) {
3370  return; // Team has been added before, exit.
3371  }
3372 
3373  // Team is not found. Search list again for insertion point.
3374  l = list;
3375  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3376  l = l->next;
3377  }
3378 
3379  // Insert team.
3380  {
3381  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3382  sizeof(kmp_team_list_item_t));
3383  *item = *l;
3384  l->entry = team;
3385  l->next = item;
3386  }
3387 }
3388 
3389 static void __kmp_print_structure_team(char const *title,
3390  kmp_team_p const *team) {
3392  __kmp_printf("%s", title);
3393  if (team != NULL) {
3394  __kmp_printf("%2x %p\n", team->t.t_id, team);
3395  } else {
3396  __kmp_printf(" - (nil)\n");
3397  }
3398 }
3399 
3400 static void __kmp_print_structure_thread(char const *title,
3401  kmp_info_p const *thread) {
3402  __kmp_printf("%s", title);
3403  if (thread != NULL) {
3404  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3405  } else {
3406  __kmp_printf(" - (nil)\n");
3407  }
3408 }
3409 
3410 void __kmp_print_structure(void) {
3411 
3412  kmp_team_list_t list;
3413 
3414  // Initialize list of teams.
3415  list =
3416  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3417  list->entry = NULL;
3418  list->next = NULL;
3419 
3420  __kmp_printf("\n------------------------------\nGlobal Thread "
3421  "Table\n------------------------------\n");
3422  {
3423  int gtid;
3424  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3425  __kmp_printf("%2d", gtid);
3426  if (__kmp_threads != NULL) {
3427  __kmp_printf(" %p", __kmp_threads[gtid]);
3428  }
3429  if (__kmp_root != NULL) {
3430  __kmp_printf(" %p", __kmp_root[gtid]);
3431  }
3432  __kmp_printf("\n");
3433  }
3434  }
3435 
3436  // Print out __kmp_threads array.
3437  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3438  "----------\n");
3439  if (__kmp_threads != NULL) {
3440  int gtid;
3441  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3442  kmp_info_t const *thread = __kmp_threads[gtid];
3443  if (thread != NULL) {
3444  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3445  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3446  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3447  __kmp_print_structure_team(" Serial Team: ",
3448  thread->th.th_serial_team);
3449  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3450  __kmp_print_structure_thread(" Primary: ",
3451  thread->th.th_team_master);
3452  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3453  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3454  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3455  __kmp_print_structure_thread(" Next in pool: ",
3456  thread->th.th_next_pool);
3457  __kmp_printf("\n");
3458  __kmp_print_structure_team_accum(list, thread->th.th_team);
3459  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3460  }
3461  }
3462  } else {
3463  __kmp_printf("Threads array is not allocated.\n");
3464  }
3465 
3466  // Print out __kmp_root array.
3467  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3468  "--------\n");
3469  if (__kmp_root != NULL) {
3470  int gtid;
3471  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3472  kmp_root_t const *root = __kmp_root[gtid];
3473  if (root != NULL) {
3474  __kmp_printf("GTID %2d %p:\n", gtid, root);
3475  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3476  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3477  __kmp_print_structure_thread(" Uber Thread: ",
3478  root->r.r_uber_thread);
3479  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3480  __kmp_printf(" In Parallel: %2d\n",
3481  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3482  __kmp_printf("\n");
3483  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3484  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3485  }
3486  }
3487  } else {
3488  __kmp_printf("Ubers array is not allocated.\n");
3489  }
3490 
3491  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3492  "--------\n");
3493  while (list->next != NULL) {
3494  kmp_team_p const *team = list->entry;
3495  int i;
3496  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3497  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3498  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3499  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3500  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3501  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3502  for (i = 0; i < team->t.t_nproc; ++i) {
3503  __kmp_printf(" Thread %2d: ", i);
3504  __kmp_print_structure_thread("", team->t.t_threads[i]);
3505  }
3506  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3507  __kmp_printf("\n");
3508  list = list->next;
3509  }
3510 
3511  // Print out __kmp_thread_pool and __kmp_team_pool.
3512  __kmp_printf("\n------------------------------\nPools\n----------------------"
3513  "--------\n");
3514  __kmp_print_structure_thread("Thread pool: ",
3515  CCAST(kmp_info_t *, __kmp_thread_pool));
3516  __kmp_print_structure_team("Team pool: ",
3517  CCAST(kmp_team_t *, __kmp_team_pool));
3518  __kmp_printf("\n");
3519 
3520  // Free team list.
3521  while (list != NULL) {
3522  kmp_team_list_item_t *item = list;
3523  list = list->next;
3524  KMP_INTERNAL_FREE(item);
3525  }
3526 }
3527 
3528 #endif
3529 
3530 //---------------------------------------------------------------------------
3531 // Stuff for per-thread fast random number generator
3532 // Table of primes
3533 static const unsigned __kmp_primes[] = {
3534  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3535  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3536  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3537  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3538  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3539  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3540  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3541  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3542  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3543  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3544  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3545 
3546 //---------------------------------------------------------------------------
3547 // __kmp_get_random: Get a random number using a linear congruential method.
3548 unsigned short __kmp_get_random(kmp_info_t *thread) {
3549  unsigned x = thread->th.th_x;
3550  unsigned short r = (unsigned short)(x >> 16);
3551 
3552  thread->th.th_x = x * thread->th.th_a + 1;
3553 
3554  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3555  thread->th.th_info.ds.ds_tid, r));
3556 
3557  return r;
3558 }
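// Note (editorial, illustrative): the update above is a plain linear
// congruential step, x_{n+1} = a * x_n + 1 (mod 2^32), with a per-thread
// multiplier a drawn from __kmp_primes. Only the high 16 bits are returned,
// since the low-order bits of a power-of-two-modulus LCG have short periods.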
3559 //--------------------------------------------------------
3560 // __kmp_init_random: Initialize a random number generator
3561 void __kmp_init_random(kmp_info_t *thread) {
3562  unsigned seed = thread->th.th_info.ds.ds_tid;
3563 
3564  thread->th.th_a =
3565  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3566  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3567  KA_TRACE(30,
3568  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3569 }
3570 
3571 #if KMP_OS_WINDOWS
3572 /* reclaim array entries for root threads that are already dead, returns number
3573  * reclaimed */
3574 static int __kmp_reclaim_dead_roots(void) {
3575  int i, r = 0;
3576 
3577  for (i = 0; i < __kmp_threads_capacity; ++i) {
3578  if (KMP_UBER_GTID(i) &&
3579  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3580  !__kmp_root[i]
3581  ->r.r_active) { // AC: reclaim only roots died in non-active state
3582  r += __kmp_unregister_root_other_thread(i);
3583  }
3584  }
3585  return r;
3586 }
3587 #endif
3588 
3589 /* This function attempts to create free entries in __kmp_threads and
3590  __kmp_root, and returns the number of free entries generated.
3591 
3592  For Windows* OS static library, the first mechanism used is to reclaim array
3593  entries for root threads that are already dead.
3594 
3595  On all platforms, expansion is attempted on the arrays __kmp_threads and
3596  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3597  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3598  threadprivate cache array has been created. Synchronization with
3599  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3600 
3601  After any dead root reclamation, if the clipping value allows array expansion
3602  to result in the generation of a total of nNeed free slots, the function does
3603  that expansion. If not, nothing is done beyond the possible initial root
3604  thread reclamation.
3605 
3606  If any argument is negative, the behavior is undefined. */
3607 static int __kmp_expand_threads(int nNeed) {
3608  int added = 0;
3609  int minimumRequiredCapacity;
3610  int newCapacity;
3611  kmp_info_t **newThreads;
3612  kmp_root_t **newRoot;
3613 
3614  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3615  // resizing __kmp_threads does not need additional protection if foreign
3616  // threads are present
3617 
3618 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3619  /* only for Windows static library */
3620  /* reclaim array entries for root threads that are already dead */
3621  added = __kmp_reclaim_dead_roots();
3622 
3623  if (nNeed) {
3624  nNeed -= added;
3625  if (nNeed < 0)
3626  nNeed = 0;
3627  }
3628 #endif
3629  if (nNeed <= 0)
3630  return added;
3631 
3632  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3633  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3634  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3635  // > __kmp_max_nth in one of two ways:
3636  //
3637  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3638  // may not be reused by another thread, so we may need to increase
3639  // __kmp_threads_capacity to __kmp_max_nth + 1.
3640  //
3641  // 2) New foreign root(s) are encountered. We always register new foreign
3642  // roots. This may cause a smaller # of threads to be allocated at
3643  // subsequent parallel regions, but the worker threads hang around (and
3644  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3645  //
3646  // Anyway, that is the reason for moving the check to see if
3647  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3648  // instead of having it performed here. -BB
3649 
3650  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3651 
3652  /* compute expansion headroom to check if we can expand */
3653  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3654  /* possible expansion too small -- give up */
3655  return added;
3656  }
3657  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3658 
3659  newCapacity = __kmp_threads_capacity;
3660  do {
3661  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3662  : __kmp_sys_max_nth;
3663  } while (newCapacity < minimumRequiredCapacity);
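  // Example (editorial, illustrative): with a current capacity of 32,
  // __kmp_sys_max_nth of 1024, and nNeed of 40, minimumRequiredCapacity is 72
  // and the loop above doubles 32 -> 64 -> 128, so 128 slots are allocated.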
3664  newThreads = (kmp_info_t **)__kmp_allocate(
3665  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3666  newRoot =
3667  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3668  KMP_MEMCPY(newThreads, __kmp_threads,
3669  __kmp_threads_capacity * sizeof(kmp_info_t *));
3670  KMP_MEMCPY(newRoot, __kmp_root,
3671  __kmp_threads_capacity * sizeof(kmp_root_t *));
3672 
3673  kmp_info_t **temp_threads = __kmp_threads;
3674  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3675  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3676  __kmp_free(temp_threads);
3677  added += newCapacity - __kmp_threads_capacity;
3678  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3679 
3680  if (newCapacity > __kmp_tp_capacity) {
3681  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3682  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3683  __kmp_threadprivate_resize_cache(newCapacity);
3684  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3685  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3686  }
3687  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3688  }
3689 
3690  return added;
3691 }
3692 
3693 /* Register the current thread as a root thread and obtain our gtid. We must
3694  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3695  thread that calls from __kmp_do_serial_initialize() */
3696 int __kmp_register_root(int initial_thread) {
3697  kmp_info_t *root_thread;
3698  kmp_root_t *root;
3699  int gtid;
3700  int capacity;
3701  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3702  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3703  KMP_MB();
3704 
3705  /* 2007-03-02:
3706  If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3707  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3708  does not work as expected -- it may return false (meaning there is at least
3709  one empty slot in the __kmp_threads array), but it is possible that the only
3710  free slot is #0, which is reserved for the initial thread and so cannot be
3711  used for this one. The following code works around this bug.
3712 
3713  However, the right solution seems to be not reserving slot #0 for the
3714  initial thread because:
3715  (1) there is no magic in slot #0,
3716  (2) we cannot detect the initial thread reliably (the first thread that
3717  performs serial initialization may not be the real initial thread).
3718  */
3719  capacity = __kmp_threads_capacity;
3720  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3721  --capacity;
3722  }
3723 
3724  // If it is not for initializing the hidden helper team, we need to take
3725  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3726  // in __kmp_threads_capacity.
3727  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3728  capacity -= __kmp_hidden_helper_threads_num;
3729  }
3730 
3731  /* see if there are too many threads */
3732  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3733  if (__kmp_tp_cached) {
3734  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3735  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3736  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3737  } else {
3738  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3739  __kmp_msg_null);
3740  }
3741  }
3742 
3743  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3744  // 0: initial thread, also a regular OpenMP thread.
3745  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3746  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3747  // regular OpenMP threads.
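  // Example (editorial, illustrative): with __kmp_hidden_helper_threads_num
  // set to 8, gtids 1..8 are reserved for hidden helper threads and the first
  // regular (non-initial) root registered here would receive gtid 9.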
3748  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3749  // Find an available thread slot for hidden helper thread. Slots for hidden
3750  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3751  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3752  gtid <= __kmp_hidden_helper_threads_num;
3753  gtid++)
3754  ;
3755  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3756  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3757  "hidden helper thread: T#%d\n",
3758  gtid));
3759  } else {
3760  /* find an available thread slot */
3761  // Don't reassign the zero slot since we need that to only be used by
3762  // initial thread. Slots for hidden helper threads should also be skipped.
3763  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3764  gtid = 0;
3765  } else {
3766  for (gtid = __kmp_hidden_helper_threads_num + 1;
3767  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3768  ;
3769  }
3770  KA_TRACE(
3771  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3772  KMP_ASSERT(gtid < __kmp_threads_capacity);
3773  }
3774 
3775  /* update global accounting */
3776  __kmp_all_nth++;
3777  TCW_4(__kmp_nth, __kmp_nth + 1);
3778 
3779  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3780  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3781  if (__kmp_adjust_gtid_mode) {
3782  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3783  if (TCR_4(__kmp_gtid_mode) != 2) {
3784  TCW_4(__kmp_gtid_mode, 2);
3785  }
3786  } else {
3787  if (TCR_4(__kmp_gtid_mode) != 1) {
3788  TCW_4(__kmp_gtid_mode, 1);
3789  }
3790  }
3791  }
3792 
3793 #ifdef KMP_ADJUST_BLOCKTIME
3794  /* Adjust blocktime to zero if necessary */
3795  /* Middle initialization might not have occurred yet */
3796  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3797  if (__kmp_nth > __kmp_avail_proc) {
3798  __kmp_zero_bt = TRUE;
3799  }
3800  }
3801 #endif /* KMP_ADJUST_BLOCKTIME */
3802 
3803  /* setup this new hierarchy */
3804  if (!(root = __kmp_root[gtid])) {
3805  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3806  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3807  }
3808 
3809 #if KMP_STATS_ENABLED
3810  // Initialize stats as soon as possible (right after gtid assignment).
3811  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3812  __kmp_stats_thread_ptr->startLife();
3813  KMP_SET_THREAD_STATE(SERIAL_REGION);
3814  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3815 #endif
3816  __kmp_initialize_root(root);
3817 
3818  /* setup new root thread structure */
3819  if (root->r.r_uber_thread) {
3820  root_thread = root->r.r_uber_thread;
3821  } else {
3822  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3823  if (__kmp_storage_map) {
3824  __kmp_print_thread_storage_map(root_thread, gtid);
3825  }
3826  root_thread->th.th_info.ds.ds_gtid = gtid;
3827 #if OMPT_SUPPORT
3828  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3829 #endif
3830  root_thread->th.th_root = root;
3831  if (__kmp_env_consistency_check) {
3832  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3833  }
3834 #if USE_FAST_MEMORY
3835  __kmp_initialize_fast_memory(root_thread);
3836 #endif /* USE_FAST_MEMORY */
3837 
3838 #if KMP_USE_BGET
3839  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3840  __kmp_initialize_bget(root_thread);
3841 #endif
3842  __kmp_init_random(root_thread); // Initialize random number generator
3843  }
3844 
3845  /* setup the serial team held in reserve by the root thread */
3846  if (!root_thread->th.th_serial_team) {
3847  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3848  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3849  root_thread->th.th_serial_team = __kmp_allocate_team(
3850  root, 1, 1,
3851 #if OMPT_SUPPORT
3852  ompt_data_none, // root parallel id
3853 #endif
3854  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3855  }
3856  KMP_ASSERT(root_thread->th.th_serial_team);
3857  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3858  root_thread->th.th_serial_team));
3859 
3860  /* drop root_thread into place */
3861  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3862 
3863  root->r.r_root_team->t.t_threads[0] = root_thread;
3864  root->r.r_hot_team->t.t_threads[0] = root_thread;
3865  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3866  // AC: the team is created in reserve, not for execution (it is unused for now).
3867  root_thread->th.th_serial_team->t.t_serialized = 0;
3868  root->r.r_uber_thread = root_thread;
3869 
3870  /* initialize the thread, get it ready to go */
3871  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3872  TCW_4(__kmp_init_gtid, TRUE);
3873 
3874  /* prepare the primary thread for get_gtid() */
3875  __kmp_gtid_set_specific(gtid);
3876 
3877 #if USE_ITT_BUILD
3878  __kmp_itt_thread_name(gtid);
3879 #endif /* USE_ITT_BUILD */
3880 
3881 #ifdef KMP_TDATA_GTID
3882  __kmp_gtid = gtid;
3883 #endif
3884  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3885  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3886 
3887  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3888  "plain=%u\n",
3889  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3890  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3891  KMP_INIT_BARRIER_STATE));
3892  { // Initialize barrier data.
3893  int b;
3894  for (b = 0; b < bs_last_barrier; ++b) {
3895  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3896 #if USE_DEBUGGER
3897  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3898 #endif
3899  }
3900  }
3901  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3902  KMP_INIT_BARRIER_STATE);
3903 
3904 #if KMP_AFFINITY_SUPPORTED
3905  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3906  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3907  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3908  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3909 #endif /* KMP_AFFINITY_SUPPORTED */
3910  root_thread->th.th_def_allocator = __kmp_def_allocator;
3911  root_thread->th.th_prev_level = 0;
3912  root_thread->th.th_prev_num_threads = 1;
3913 
3914  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3915  tmp->cg_root = root_thread;
3916  tmp->cg_thread_limit = __kmp_cg_max_nth;
3917  tmp->cg_nthreads = 1;
3918  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3919  " cg_nthreads init to 1\n",
3920  root_thread, tmp));
3921  tmp->up = NULL;
3922  root_thread->th.th_cg_roots = tmp;
3923 
3924  __kmp_root_counter++;
3925 
3926 #if OMPT_SUPPORT
3927  if (!initial_thread && ompt_enabled.enabled) {
3928 
3929  kmp_info_t *root_thread = ompt_get_thread();
3930 
3931  ompt_set_thread_state(root_thread, ompt_state_overhead);
3932 
3933  if (ompt_enabled.ompt_callback_thread_begin) {
3934  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3935  ompt_thread_initial, __ompt_get_thread_data_internal());
3936  }
3937  ompt_data_t *task_data;
3938  ompt_data_t *parallel_data;
3939  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3940  NULL);
3941  if (ompt_enabled.ompt_callback_implicit_task) {
3942  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3943  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3944  }
3945 
3946  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3947  }
3948 #endif
3949 #if OMPD_SUPPORT
3950  if (ompd_state & OMPD_ENABLE_BP)
3951  ompd_bp_thread_begin();
3952 #endif
3953 
3954  KMP_MB();
3955  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3956 
3957  return gtid;
3958 }
3959 
3960 #if KMP_NESTED_HOT_TEAMS
3961 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3962  const int max_level) {
3963  int i, n, nth;
3964  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3965  if (!hot_teams || !hot_teams[level].hot_team) {
3966  return 0;
3967  }
3968  KMP_DEBUG_ASSERT(level < max_level);
3969  kmp_team_t *team = hot_teams[level].hot_team;
3970  nth = hot_teams[level].hot_team_nth;
3971  n = nth - 1; // primary thread is not freed
3972  if (level < max_level - 1) {
3973  for (i = 0; i < nth; ++i) {
3974  kmp_info_t *th = team->t.t_threads[i];
3975  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3976  if (i > 0 && th->th.th_hot_teams) {
3977  __kmp_free(th->th.th_hot_teams);
3978  th->th.th_hot_teams = NULL;
3979  }
3980  }
3981  }
3982  __kmp_free_team(root, team, NULL);
3983  return n;
3984 }
3985 #endif
3986 
3987 // Resets a root thread and clears its root and hot teams.
3988 // Returns the number of __kmp_threads entries directly and indirectly freed.
3989 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3990  kmp_team_t *root_team = root->r.r_root_team;
3991  kmp_team_t *hot_team = root->r.r_hot_team;
3992  int n = hot_team->t.t_nproc;
3993  int i;
3994 
3995  KMP_DEBUG_ASSERT(!root->r.r_active);
3996 
3997  root->r.r_root_team = NULL;
3998  root->r.r_hot_team = NULL;
3999  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4000  // before call to __kmp_free_team().
4001  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4002 #if KMP_NESTED_HOT_TEAMS
4003  if (__kmp_hot_teams_max_level >
4004  0) { // need to free nested hot teams and their threads if any
4005  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4006  kmp_info_t *th = hot_team->t.t_threads[i];
4007  if (__kmp_hot_teams_max_level > 1) {
4008  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4009  }
4010  if (th->th.th_hot_teams) {
4011  __kmp_free(th->th.th_hot_teams);
4012  th->th.th_hot_teams = NULL;
4013  }
4014  }
4015  }
4016 #endif
4017  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4018 
4019  // Before we can reap the thread, we need to make certain that all other
4020  // threads in the teams that had this root as ancestor have stopped trying to
4021  // steal tasks.
4022  if (__kmp_tasking_mode != tskm_immediate_exec) {
4023  __kmp_wait_to_unref_task_teams();
4024  }
4025 
4026 #if KMP_OS_WINDOWS
4027  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4028  KA_TRACE(
4029  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4030  "\n",
4031  (LPVOID) & (root->r.r_uber_thread->th),
4032  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4033  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4034 #endif /* KMP_OS_WINDOWS */
4035 
4036 #if OMPD_SUPPORT
4037  if (ompd_state & OMPD_ENABLE_BP)
4038  ompd_bp_thread_end();
4039 #endif
4040 
4041 #if OMPT_SUPPORT
4042  ompt_data_t *task_data;
4043  ompt_data_t *parallel_data;
4044  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4045  NULL);
4046  if (ompt_enabled.ompt_callback_implicit_task) {
4047  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4048  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4049  }
4050  if (ompt_enabled.ompt_callback_thread_end) {
4051  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4052  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4053  }
4054 #endif
4055 
4056  TCW_4(__kmp_nth,
4057  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4058  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4059  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4060  " to %d\n",
4061  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4062  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
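  // Note (editorial): the post-decrement above leaves i holding the old count,
  // so i == 1 below means this uber thread was the last member of its
  // contention group and the kmp_cg_root_t node can be freed.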
4063  if (i == 1) {
4064  // need to free contention group structure
4065  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4066  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4067  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4068  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4069  root->r.r_uber_thread->th.th_cg_roots = NULL;
4070  }
4071  __kmp_reap_thread(root->r.r_uber_thread, 1);
4072 
4073  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4074  // it instead of freeing it.
4075  root->r.r_uber_thread = NULL;
4076  /* mark root as no longer in use */
4077  root->r.r_begin = FALSE;
4078 
4079  return n;
4080 }
4081 
4082 void __kmp_unregister_root_current_thread(int gtid) {
4083  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4084  /* This lock should be ok, since unregister_root_current_thread is never
4085  called during an abort, only during a normal close. Furthermore, if you
4086  have the forkjoin lock, you should never try to get the initz lock. */
4087  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4088  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4089  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4090  "exiting T#%d\n",
4091  gtid));
4092  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4093  return;
4094  }
4095  kmp_root_t *root = __kmp_root[gtid];
4096 
4097  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4098  KMP_ASSERT(KMP_UBER_GTID(gtid));
4099  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4100  KMP_ASSERT(root->r.r_active == FALSE);
4101 
4102  KMP_MB();
4103 
4104  kmp_info_t *thread = __kmp_threads[gtid];
4105  kmp_team_t *team = thread->th.th_team;
4106  kmp_task_team_t *task_team = thread->th.th_task_team;
4107 
4108  // we need to wait for the proxy tasks before finishing the thread
4109  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4110  task_team->tt.tt_hidden_helper_task_encountered)) {
4111 #if OMPT_SUPPORT
4112  // the runtime is shutting down so we won't report any events
4113  thread->th.ompt_thread_info.state = ompt_state_undefined;
4114 #endif
4115  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4116  }
4117 
4118  __kmp_reset_root(gtid, root);
4119 
4120  KMP_MB();
4121  KC_TRACE(10,
4122  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4123 
4124  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4125 }
4126 
4127 #if KMP_OS_WINDOWS
4128 /* __kmp_forkjoin_lock must be already held
4129  Unregisters a root thread that is not the current thread. Returns the number
4130  of __kmp_threads entries freed as a result. */
4131 static int __kmp_unregister_root_other_thread(int gtid) {
4132  kmp_root_t *root = __kmp_root[gtid];
4133  int r;
4134 
4135  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4136  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4137  KMP_ASSERT(KMP_UBER_GTID(gtid));
4138  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4139  KMP_ASSERT(root->r.r_active == FALSE);
4140 
4141  r = __kmp_reset_root(gtid, root);
4142  KC_TRACE(10,
4143  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4144  return r;
4145 }
4146 #endif
4147 
4148 #if KMP_DEBUG
4149 void __kmp_task_info() {
4150 
4151  kmp_int32 gtid = __kmp_entry_gtid();
4152  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4153  kmp_info_t *this_thr = __kmp_threads[gtid];
4154  kmp_team_t *steam = this_thr->th.th_serial_team;
4155  kmp_team_t *team = this_thr->th.th_team;
4156 
4157  __kmp_printf(
4158  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4159  "ptask=%p\n",
4160  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4161  team->t.t_implicit_task_taskdata[tid].td_parent);
4162 }
4163 #endif // KMP_DEBUG
4164 
4165 /* TODO optimize with one big memclr, take out what isn't needed, split
4166  responsibility to workers as much as possible, and delay initialization of
4167  features as much as possible */
4168 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4169  int tid, int gtid) {
4170  /* this_thr->th.th_info.ds.ds_gtid is setup in
4171  kmp_allocate_thread/create_worker.
4172  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4173  KMP_DEBUG_ASSERT(this_thr != NULL);
4174  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4175  KMP_DEBUG_ASSERT(team);
4176  KMP_DEBUG_ASSERT(team->t.t_threads);
4177  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4178  kmp_info_t *master = team->t.t_threads[0];
4179  KMP_DEBUG_ASSERT(master);
4180  KMP_DEBUG_ASSERT(master->th.th_root);
4181 
4182  KMP_MB();
4183 
4184  TCW_SYNC_PTR(this_thr->th.th_team, team);
4185 
4186  this_thr->th.th_info.ds.ds_tid = tid;
4187  this_thr->th.th_set_nproc = 0;
4188  if (__kmp_tasking_mode != tskm_immediate_exec)
4189  // When tasking is possible, threads are not safe to reap until they are
4190  // done tasking; this will be set when tasking code is exited in wait
4191  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4192  else // no tasking --> always safe to reap
4193  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4194  this_thr->th.th_set_proc_bind = proc_bind_default;
4195 #if KMP_AFFINITY_SUPPORTED
4196  this_thr->th.th_new_place = this_thr->th.th_current_place;
4197 #endif
4198  this_thr->th.th_root = master->th.th_root;
4199 
4200  /* setup the thread's cache of the team structure */
4201  this_thr->th.th_team_nproc = team->t.t_nproc;
4202  this_thr->th.th_team_master = master;
4203  this_thr->th.th_team_serialized = team->t.t_serialized;
4204 
4205  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4206 
4207  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4208  tid, gtid, this_thr, this_thr->th.th_current_task));
4209 
4210  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4211  team, tid, TRUE);
4212 
4213  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4214  tid, gtid, this_thr, this_thr->th.th_current_task));
4215  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4216  // __kmp_initialize_team()?
4217 
4218  /* TODO no worksharing in speculative threads */
4219  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4220 
4221  this_thr->th.th_local.this_construct = 0;
4222 
4223  if (!this_thr->th.th_pri_common) {
4224  this_thr->th.th_pri_common =
4225  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4226  if (__kmp_storage_map) {
4227  __kmp_print_storage_map_gtid(
4228  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4229  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4230  }
4231  this_thr->th.th_pri_head = NULL;
4232  }
4233 
4234  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4235  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4236  // Make new thread's CG root same as primary thread's
4237  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4238  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4239  if (tmp) {
4240  // worker changes CG, need to check if old CG should be freed
4241  int i = tmp->cg_nthreads--;
4242  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4243  " on node %p of thread %p to %d\n",
4244  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4245  if (i == 1) {
4246  __kmp_free(tmp); // last thread left CG --> free it
4247  }
4248  }
4249  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4250  // Increment new thread's CG root's counter to add the new thread
4251  this_thr->th.th_cg_roots->cg_nthreads++;
4252  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4253  " node %p of thread %p to %d\n",
4254  this_thr, this_thr->th.th_cg_roots,
4255  this_thr->th.th_cg_roots->cg_root,
4256  this_thr->th.th_cg_roots->cg_nthreads));
4257  this_thr->th.th_current_task->td_icvs.thread_limit =
4258  this_thr->th.th_cg_roots->cg_thread_limit;
4259  }
4260 
4261  /* Initialize dynamic dispatch */
4262  {
4263  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4264  // Use team max_nproc since this will never change for the team.
4265  size_t disp_size =
4266  sizeof(dispatch_private_info_t) *
4267  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
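    // Note (editorial): a team whose maximum size is 1 only ever needs a
    // single private dispatch buffer; larger teams get
    // __kmp_dispatch_num_buffers of them so consecutive worksharing constructs
    // can use a fresh buffer while stragglers still reference the previous one.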
4268  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4269  team->t.t_max_nproc));
4270  KMP_ASSERT(dispatch);
4271  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4272  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4273 
4274  dispatch->th_disp_index = 0;
4275  dispatch->th_doacross_buf_idx = 0;
4276  if (!dispatch->th_disp_buffer) {
4277  dispatch->th_disp_buffer =
4278  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4279 
4280  if (__kmp_storage_map) {
4281  __kmp_print_storage_map_gtid(
4282  gtid, &dispatch->th_disp_buffer[0],
4283  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4284  ? 1
4285  : __kmp_dispatch_num_buffers],
4286  disp_size,
4287  "th_%d.th_dispatch.th_disp_buffer "
4288  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4289  gtid, team->t.t_id, gtid);
4290  }
4291  } else {
4292  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4293  }
4294 
4295  dispatch->th_dispatch_pr_current = 0;
4296  dispatch->th_dispatch_sh_current = 0;
4297 
4298  dispatch->th_deo_fcn = 0; /* ORDERED */
4299  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4300  }
4301 
4302  this_thr->th.th_next_pool = NULL;
4303 
4304  if (!this_thr->th.th_task_state_memo_stack) {
4305  size_t i;
4306  this_thr->th.th_task_state_memo_stack =
4307  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4308  this_thr->th.th_task_state_top = 0;
4309  this_thr->th.th_task_state_stack_sz = 4;
4310  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4311  ++i) // zero init the stack
4312  this_thr->th.th_task_state_memo_stack[i] = 0;
4313  }
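  // Note (editorial): the memo stack starts with room for 4 nested task-state
  // entries; if nesting goes deeper, the runtime is expected to grow it on
  // demand -- this routine only provides the initial, zeroed storage.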
4314 
4315  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4316  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4317 
4318  KMP_MB();
4319 }
4320 
4321 /* Allocate a new thread for the requesting team. This is only called from
4322  within a forkjoin critical section. We first try to get an available thread
4323  from the thread pool; if none is available, we fork a new one, assuming we
4324  are able to create one. This should be assured, as the caller is expected
4325  to have checked this first. */
4326 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4327  int new_tid) {
4328  kmp_team_t *serial_team;
4329  kmp_info_t *new_thr;
4330  int new_gtid;
4331 
4332  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4333  KMP_DEBUG_ASSERT(root && team);
4334 #if !KMP_NESTED_HOT_TEAMS
4335  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4336 #endif
4337  KMP_MB();
4338 
4339  /* first, try to get one from the thread pool */
4340  if (__kmp_thread_pool) {
4341  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4342  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4343  if (new_thr == __kmp_thread_pool_insert_pt) {
4344  __kmp_thread_pool_insert_pt = NULL;
4345  }
4346  TCW_4(new_thr->th.th_in_pool, FALSE);
4347  __kmp_suspend_initialize_thread(new_thr);
4348  __kmp_lock_suspend_mx(new_thr);
4349  if (new_thr->th.th_active_in_pool == TRUE) {
4350  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4351  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4352  new_thr->th.th_active_in_pool = FALSE;
4353  }
4354  __kmp_unlock_suspend_mx(new_thr);
4355 
4356  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4357  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4358  KMP_ASSERT(!new_thr->th.th_team);
4359  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4360 
4361  /* setup the thread structure */
4362  __kmp_initialize_info(new_thr, team, new_tid,
4363  new_thr->th.th_info.ds.ds_gtid);
4364  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4365 
4366  TCW_4(__kmp_nth, __kmp_nth + 1);
4367 
4368  new_thr->th.th_task_state = 0;
4369  new_thr->th.th_task_state_top = 0;
4370  new_thr->th.th_task_state_stack_sz = 4;
4371 
4372  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4373  // Make sure pool thread has transitioned to waiting on own thread struct
4374  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4375  // Thread activated in __kmp_allocate_team when increasing team size
4376  }
4377 
4378 #ifdef KMP_ADJUST_BLOCKTIME
4379  /* Adjust blocktime back to zero if necessary */
4380  /* Middle initialization might not have occurred yet */
4381  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4382  if (__kmp_nth > __kmp_avail_proc) {
4383  __kmp_zero_bt = TRUE;
4384  }
4385  }
4386 #endif /* KMP_ADJUST_BLOCKTIME */
4387 
4388 #if KMP_DEBUG
4389  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4390  // KMP_BARRIER_PARENT_FLAG.
4391  int b;
4392  kmp_balign_t *balign = new_thr->th.th_bar;
4393  for (b = 0; b < bs_last_barrier; ++b)
4394  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4395 #endif
4396 
4397  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4398  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4399 
4400  KMP_MB();
4401  return new_thr;
4402  }
4403 
4404  /* no, we'll fork a new one */
4405  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4406  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4407 
4408 #if KMP_USE_MONITOR
4409  // If this is the first worker thread the RTL is creating, then also
4410  // launch the monitor thread. We try to do this as early as possible.
4411  if (!TCR_4(__kmp_init_monitor)) {
4412  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4413  if (!TCR_4(__kmp_init_monitor)) {
4414  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4415  TCW_4(__kmp_init_monitor, 1);
4416  __kmp_create_monitor(&__kmp_monitor);
4417  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4418 #if KMP_OS_WINDOWS
4419  // AC: wait until the monitor has started. This is a fix for CQ232808.
4420  // The reason is that if the library is loaded/unloaded in a loop with
4421  // small (parallel) work in between, then there is a high probability that
4422  // the monitor thread starts after the library has shut down. At shutdown it
4423  // is too late to cope with the problem, because when the primary thread is
4424  // in DllMain (process detach) the monitor has no chance to start (it is
4425  // blocked), and the primary thread has no means to inform the monitor that
4426  // the library has gone, because all the memory the monitor can access is
4427  // going to be released/reset.
4428  while (TCR_4(__kmp_init_monitor) < 2) {
4429  KMP_YIELD(TRUE);
4430  }
4431  KF_TRACE(10, ("after monitor thread has started\n"));
4432 #endif
4433  }
4434  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4435  }
4436 #endif
4437 
4438  KMP_MB();
4439 
4440  {
4441  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4442  ? 1
4443  : __kmp_hidden_helper_threads_num + 1;
4444 
4445  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4446  ++new_gtid) {
4447  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4448  }
4449 
4450  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4451  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4452  }
4453  }
4454 
4455  /* allocate space for it. */
4456  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4457 
4458  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4459 
4460 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4461  // suppress race conditions detection on synchronization flags in debug mode
4462  // this helps to analyze library internals eliminating false positives
4463  __itt_suppress_mark_range(
4464  __itt_suppress_range, __itt_suppress_threading_errors,
4465  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4466  __itt_suppress_mark_range(
4467  __itt_suppress_range, __itt_suppress_threading_errors,
4468  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4469 #if KMP_OS_WINDOWS
4470  __itt_suppress_mark_range(
4471  __itt_suppress_range, __itt_suppress_threading_errors,
4472  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4473 #else
4474  __itt_suppress_mark_range(__itt_suppress_range,
4475  __itt_suppress_threading_errors,
4476  &new_thr->th.th_suspend_init_count,
4477  sizeof(new_thr->th.th_suspend_init_count));
4478 #endif
4479  // TODO: check if we need to also suppress b_arrived flags
4480  __itt_suppress_mark_range(__itt_suppress_range,
4481  __itt_suppress_threading_errors,
4482  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4483  sizeof(new_thr->th.th_bar[0].bb.b_go));
4484  __itt_suppress_mark_range(__itt_suppress_range,
4485  __itt_suppress_threading_errors,
4486  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4487  sizeof(new_thr->th.th_bar[1].bb.b_go));
4488  __itt_suppress_mark_range(__itt_suppress_range,
4489  __itt_suppress_threading_errors,
4490  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4491  sizeof(new_thr->th.th_bar[2].bb.b_go));
4492 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4493  if (__kmp_storage_map) {
4494  __kmp_print_thread_storage_map(new_thr, new_gtid);
4495  }
4496 
4497  // add the reserve serialized team, initialized from the team's primary thread
4498  {
4499  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4500  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4501  new_thr->th.th_serial_team = serial_team =
4502  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4503 #if OMPT_SUPPORT
4504  ompt_data_none, // root parallel id
4505 #endif
4506  proc_bind_default, &r_icvs,
4507  0 USE_NESTED_HOT_ARG(NULL));
4508  }
4509  KMP_ASSERT(serial_team);
4510  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4511  // for execution (it is unused for now).
4512  serial_team->t.t_threads[0] = new_thr;
4513  KF_TRACE(10,
4514  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4515  new_thr));
4516 
4517  /* setup the thread structures */
4518  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4519 
4520 #if USE_FAST_MEMORY
4521  __kmp_initialize_fast_memory(new_thr);
4522 #endif /* USE_FAST_MEMORY */
4523 
4524 #if KMP_USE_BGET
4525  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4526  __kmp_initialize_bget(new_thr);
4527 #endif
4528 
4529  __kmp_init_random(new_thr); // Initialize random number generator
4530 
4531  /* Initialize these only once when thread is grabbed for a team allocation */
4532  KA_TRACE(20,
4533  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4534  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4535 
4536  int b;
4537  kmp_balign_t *balign = new_thr->th.th_bar;
4538  for (b = 0; b < bs_last_barrier; ++b) {
4539  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4540  balign[b].bb.team = NULL;
4541  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4542  balign[b].bb.use_oncore_barrier = 0;
4543  }
4544 
4545  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4546  new_thr->th.th_sleep_loc_type = flag_unset;
4547 
4548  new_thr->th.th_spin_here = FALSE;
4549  new_thr->th.th_next_waiting = 0;
4550 #if KMP_OS_UNIX
4551  new_thr->th.th_blocking = false;
4552 #endif
4553 
4554 #if KMP_AFFINITY_SUPPORTED
4555  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4556  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4557  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4558  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4559 #endif
4560  new_thr->th.th_def_allocator = __kmp_def_allocator;
4561  new_thr->th.th_prev_level = 0;
4562  new_thr->th.th_prev_num_threads = 1;
4563 
4564  TCW_4(new_thr->th.th_in_pool, FALSE);
4565  new_thr->th.th_active_in_pool = FALSE;
4566  TCW_4(new_thr->th.th_active, TRUE);
4567 
4568  /* adjust the global counters */
4569  __kmp_all_nth++;
4570  __kmp_nth++;
4571 
4572  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4573  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4574  if (__kmp_adjust_gtid_mode) {
4575  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4576  if (TCR_4(__kmp_gtid_mode) != 2) {
4577  TCW_4(__kmp_gtid_mode, 2);
4578  }
4579  } else {
4580  if (TCR_4(__kmp_gtid_mode) != 1) {
4581  TCW_4(__kmp_gtid_mode, 1);
4582  }
4583  }
4584  }
4585 
4586 #ifdef KMP_ADJUST_BLOCKTIME
4587  /* Adjust blocktime back to zero if necessary */
4588  /* Middle initialization might not have occurred yet */
4589  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4590  if (__kmp_nth > __kmp_avail_proc) {
4591  __kmp_zero_bt = TRUE;
4592  }
4593  }
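  // Informal illustration of the check above: if KMP_BLOCKTIME was not set
  // explicitly and this new thread pushes __kmp_nth past __kmp_avail_proc,
  // the process is oversubscribed, so the effective blocktime is treated as
  // zero and idle workers go to sleep right away instead of spin-waiting and
  // competing with runnable threads for cores.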
4594 #endif /* KMP_ADJUST_BLOCKTIME */
4595 
4596  /* actually fork it and create the new worker thread */
4597  KF_TRACE(
4598  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4599  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4600  KF_TRACE(10,
4601  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4602 
4603  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4604  new_gtid));
4605  KMP_MB();
4606  return new_thr;
4607 }
4608 
4609 /* Reinitialize team for reuse.
4610  The hot team code calls this routine at every fork barrier, so the EPCC
4611  barrier tests are extremely sensitive to changes in it, especially writes to
4612  the team struct, which cause a cache invalidation in all threads.
4613  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4614 static void __kmp_reinitialize_team(kmp_team_t *team,
4615  kmp_internal_control_t *new_icvs,
4616  ident_t *loc) {
4617  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4618  team->t.t_threads[0], team));
4619  KMP_DEBUG_ASSERT(team && new_icvs);
4620  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4621  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4622 
4623  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4624  // Copy ICVs to the primary thread's implicit taskdata
4625  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4626  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4627 
4628  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4629  team->t.t_threads[0], team));
4630 }
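/* A minimal sketch of the conditional-update idiom this routine relies on
   (illustration only; KMP_CHECK_UPDATE is the macro the runtime actually uses,
   and SKETCH_CHECK_UPDATE below is a hypothetical stand-in):

     #define SKETCH_CHECK_UPDATE(lhs, rhs)                                     \
       if ((lhs) != (rhs))                                                     \
         (lhs) = (rhs) // store only when the value really changes

   Guarding each store this way keeps the team struct's cache lines clean on
   the common path, so threads spinning on them are not invalidated
   needlessly. */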
4631 
4632 /* Initialize the team data structure.
4633  This assumes the t_threads and t_max_nproc are already set.
4634  Also, we don't touch the arguments */
4635 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4636  kmp_internal_control_t *new_icvs,
4637  ident_t *loc) {
4638  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4639 
4640  /* verify */
4641  KMP_DEBUG_ASSERT(team);
4642  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4643  KMP_DEBUG_ASSERT(team->t.t_threads);
4644  KMP_MB();
4645 
4646  team->t.t_master_tid = 0; /* not needed */
4647  /* team->t.t_master_bar; not needed */
4648  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4649  team->t.t_nproc = new_nproc;
4650 
4651  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4652  team->t.t_next_pool = NULL;
4653  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4654  * up hot team */
4655 
4656  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4657  team->t.t_invoke = NULL; /* not needed */
4658 
4659  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4660  team->t.t_sched.sched = new_icvs->sched.sched;
4661 
4662 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4663  team->t.t_fp_control_saved = FALSE; /* not needed */
4664  team->t.t_x87_fpu_control_word = 0; /* not needed */
4665  team->t.t_mxcsr = 0; /* not needed */
4666 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4667 
4668  team->t.t_construct = 0;
4669 
4670  team->t.t_ordered.dt.t_value = 0;
4671  team->t.t_master_active = FALSE;
4672 
4673 #ifdef KMP_DEBUG
4674  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4675 #endif
4676 #if KMP_OS_WINDOWS
4677  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4678 #endif
4679 
4680  team->t.t_control_stack_top = NULL;
4681 
4682  __kmp_reinitialize_team(team, new_icvs, loc);
4683 
4684  KMP_MB();
4685  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4686 }
4687 
4688 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4689 /* Sets the full mask for the thread; stores the old mask in *old_mask. No changes to structures. */
4690 static void
4691 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4692  if (KMP_AFFINITY_CAPABLE()) {
4693  int status;
4694  if (old_mask != NULL) {
4695  status = __kmp_get_system_affinity(old_mask, TRUE);
4696  int error = errno;
4697  if (status != 0) {
4698  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4699  __kmp_msg_null);
4700  }
4701  }
4702  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4703  }
4704 }
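// Callers typically pair this with a mask obtained via KMP_CPU_ALLOC and, once
// the workers exist, restore it with __kmp_set_system_affinity(old_mask, TRUE)
// followed by KMP_CPU_FREE (see the hot-team growth path in
// __kmp_allocate_team below).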
4705 #endif
4706 
4707 #if KMP_AFFINITY_SUPPORTED
4708 
4709 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4710 // It calculates the worker + primary thread's partition based upon the parent
4711 // thread's partition, and binds each worker to a place in its partition.
4712 // The primary thread's partition should already include its current binding.
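// As a rough illustration (assuming, say, 4 workers and a parent partition of
// 8 places): proc_bind_primary places every worker on the primary thread's
// place; proc_bind_close packs workers onto consecutive places starting from
// the primary's place; proc_bind_spread carves the partition into per-thread
// sub-partitions and pins each thread to the first place of its own
// sub-partition.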
4713 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4714  // Do not partition places for the hidden helper team
4715  if (KMP_HIDDEN_HELPER_TEAM(team))
4716  return;
4717  // Copy the primary thread's place partition to the team struct
4718  kmp_info_t *master_th = team->t.t_threads[0];
4719  KMP_DEBUG_ASSERT(master_th != NULL);
4720  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4721  int first_place = master_th->th.th_first_place;
4722  int last_place = master_th->th.th_last_place;
4723  int masters_place = master_th->th.th_current_place;
4724  team->t.t_first_place = first_place;
4725  team->t.t_last_place = last_place;
4726 
4727  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4728  "bound to place %d partition = [%d,%d]\n",
4729  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4730  team->t.t_id, masters_place, first_place, last_place));
4731 
4732  switch (proc_bind) {
4733 
4734  case proc_bind_default:
4735  // Serial teams might have the proc_bind policy set to proc_bind_default.
4736  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4737  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4738  break;
4739 
4740  case proc_bind_primary: {
4741  int f;
4742  int n_th = team->t.t_nproc;
4743  for (f = 1; f < n_th; f++) {
4744  kmp_info_t *th = team->t.t_threads[f];
4745  KMP_DEBUG_ASSERT(th != NULL);
4746  th->th.th_first_place = first_place;
4747  th->th.th_last_place = last_place;
4748  th->th.th_new_place = masters_place;
4749  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4750  team->t.t_display_affinity != 1) {
4751  team->t.t_display_affinity = 1;
4752  }
4753 
4754  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4755  "partition = [%d,%d]\n",
4756  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4757  f, masters_place, first_place, last_place));
4758  }
4759  } break;
4760 
4761  case proc_bind_close: {
4762  int f;
4763  int n_th = team->t.t_nproc;
4764  int n_places;
4765  if (first_place <= last_place) {
4766  n_places = last_place - first_place + 1;
4767  } else {
4768  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4769  }
4770  if (n_th <= n_places) {
4771  int place = masters_place;
4772  for (f = 1; f < n_th; f++) {
4773  kmp_info_t *th = team->t.t_threads[f];
4774  KMP_DEBUG_ASSERT(th != NULL);
4775 
4776  if (place == last_place) {
4777  place = first_place;
4778  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4779  place = 0;
4780  } else {
4781  place++;
4782  }
4783  th->th.th_first_place = first_place;
4784  th->th.th_last_place = last_place;
4785  th->th.th_new_place = place;
4786  if (__kmp_display_affinity && place != th->th.th_current_place &&
4787  team->t.t_display_affinity != 1) {
4788  team->t.t_display_affinity = 1;
4789  }
4790 
4791  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4792  "partition = [%d,%d]\n",
4793  __kmp_gtid_from_thread(team->t.t_threads[f]),
4794  team->t.t_id, f, place, first_place, last_place));
4795  }
4796  } else {
4797  int S, rem, gap, s_count;
4798  S = n_th / n_places;
4799  s_count = 0;
4800  rem = n_th - (S * n_places);
4801  gap = rem > 0 ? n_places / rem : n_places;
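      // Worked example (illustrative): n_th = 7 threads over n_places = 3
      // places gives S = 2, rem = 1, gap = 3; the loop below then puts
      // S + 1 = 3 threads on the primary's place and S = 2 threads on each of
      // the other two places, and `place` wraps back around to masters_place
      // by the time the loop finishes (see the assertion below).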
4802  int place = masters_place;
4803  int gap_ct = gap;
4804  for (f = 0; f < n_th; f++) {
4805  kmp_info_t *th = team->t.t_threads[f];
4806  KMP_DEBUG_ASSERT(th != NULL);
4807 
4808  th->th.th_first_place = first_place;
4809  th->th.th_last_place = last_place;
4810  th->th.th_new_place = place;
4811  if (__kmp_display_affinity && place != th->th.th_current_place &&
4812  team->t.t_display_affinity != 1) {
4813  team->t.t_display_affinity = 1;
4814  }
4815  s_count++;
4816 
4817  if ((s_count == S) && rem && (gap_ct == gap)) {
4818  // do nothing, add an extra thread to place on next iteration
4819  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4820  // we added an extra thread to this place; move to next place
4821  if (place == last_place) {
4822  place = first_place;
4823  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4824  place = 0;
4825  } else {
4826  place++;
4827  }
4828  s_count = 0;
4829  gap_ct = 1;
4830  rem--;
4831  } else if (s_count == S) { // place full; don't add extra
4832  if (place == last_place) {
4833  place = first_place;
4834  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4835  place = 0;
4836  } else {
4837  place++;
4838  }
4839  gap_ct++;
4840  s_count = 0;
4841  }
4842 
4843  KA_TRACE(100,
4844  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4845  "partition = [%d,%d]\n",
4846  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4847  th->th.th_new_place, first_place, last_place));
4848  }
4849  KMP_DEBUG_ASSERT(place == masters_place);
4850  }
4851  } break;
4852 
4853  case proc_bind_spread: {
4854  int f;
4855  int n_th = team->t.t_nproc;
4856  int n_places;
4857  int thidx;
4858  if (first_place <= last_place) {
4859  n_places = last_place - first_place + 1;
4860  } else {
4861  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4862  }
4863  if (n_th <= n_places) {
4864  int place = -1;
4865 
4866  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4867  int S = n_places / n_th;
4868  int s_count, rem, gap, gap_ct;
4869 
4870  place = masters_place;
4871  rem = n_places - n_th * S;
4872  gap = rem ? n_th / rem : 1;
4873  gap_ct = gap;
4874  thidx = n_th;
4875  if (update_master_only == 1)
4876  thidx = 1;
4877  for (f = 0; f < thidx; f++) {
4878  kmp_info_t *th = team->t.t_threads[f];
4879  KMP_DEBUG_ASSERT(th != NULL);
4880 
4881  th->th.th_first_place = place;
4882  th->th.th_new_place = place;
4883  if (__kmp_display_affinity && place != th->th.th_current_place &&
4884  team->t.t_display_affinity != 1) {
4885  team->t.t_display_affinity = 1;
4886  }
4887  s_count = 1;
4888  while (s_count < S) {
4889  if (place == last_place) {
4890  place = first_place;
4891  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4892  place = 0;
4893  } else {
4894  place++;
4895  }
4896  s_count++;
4897  }
4898  if (rem && (gap_ct == gap)) {
4899  if (place == last_place) {
4900  place = first_place;
4901  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4902  place = 0;
4903  } else {
4904  place++;
4905  }
4906  rem--;
4907  gap_ct = 0;
4908  }
4909  th->th.th_last_place = place;
4910  gap_ct++;
4911 
4912  if (place == last_place) {
4913  place = first_place;
4914  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4915  place = 0;
4916  } else {
4917  place++;
4918  }
4919 
4920  KA_TRACE(100,
4921  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4922  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4923  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4924  f, th->th.th_new_place, th->th.th_first_place,
4925  th->th.th_last_place, __kmp_affinity_num_masks));
4926  }
4927  } else {
4928  /* Given a uniform space of available computation places, we can create
4929  T partitions of roughly P/T places each and put each thread into the
4930  first place of its partition. */
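        /* Worked example (illustrative): with n_places = 8, n_th = 3 and the
           primary thread on place 0, spacing = (8 + 1) / 3 = 3.0, so the
           partitions become [0,2], [3,5] and [6,7] (the last one clipped to
           n_places - 1), and each thread is bound to the first place of its
           partition. */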
4931  double current = static_cast<double>(masters_place);
4932  double spacing =
4933  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4934  int first, last;
4935  kmp_info_t *th;
4936 
4937  thidx = n_th + 1;
4938  if (update_master_only == 1)
4939  thidx = 1;
4940  for (f = 0; f < thidx; f++) {
4941  first = static_cast<int>(current);
4942  last = static_cast<int>(current + spacing) - 1;
4943  KMP_DEBUG_ASSERT(last >= first);
4944  if (first >= n_places) {
4945  if (masters_place) {
4946  first -= n_places;
4947  last -= n_places;
4948  if (first == (masters_place + 1)) {
4949  KMP_DEBUG_ASSERT(f == n_th);
4950  first--;
4951  }
4952  if (last == masters_place) {
4953  KMP_DEBUG_ASSERT(f == (n_th - 1));
4954  last--;
4955  }
4956  } else {
4957  KMP_DEBUG_ASSERT(f == n_th);
4958  first = 0;
4959  last = 0;
4960  }
4961  }
4962  if (last >= n_places) {
4963  last = (n_places - 1);
4964  }
4965  place = first;
4966  current += spacing;
4967  if (f < n_th) {
4968  KMP_DEBUG_ASSERT(0 <= first);
4969  KMP_DEBUG_ASSERT(n_places > first);
4970  KMP_DEBUG_ASSERT(0 <= last);
4971  KMP_DEBUG_ASSERT(n_places > last);
4972  KMP_DEBUG_ASSERT(last_place >= first_place);
4973  th = team->t.t_threads[f];
4974  KMP_DEBUG_ASSERT(th);
4975  th->th.th_first_place = first;
4976  th->th.th_new_place = place;
4977  th->th.th_last_place = last;
4978  if (__kmp_display_affinity && place != th->th.th_current_place &&
4979  team->t.t_display_affinity != 1) {
4980  team->t.t_display_affinity = 1;
4981  }
4982  KA_TRACE(100,
4983  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4984  "partition = [%d,%d], spacing = %.4f\n",
4985  __kmp_gtid_from_thread(team->t.t_threads[f]),
4986  team->t.t_id, f, th->th.th_new_place,
4987  th->th.th_first_place, th->th.th_last_place, spacing));
4988  }
4989  }
4990  }
4991  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4992  } else {
4993  int S, rem, gap, s_count;
4994  S = n_th / n_places;
4995  s_count = 0;
4996  rem = n_th - (S * n_places);
4997  gap = rem > 0 ? n_places / rem : n_places;
4998  int place = masters_place;
4999  int gap_ct = gap;
5000  thidx = n_th;
5001  if (update_master_only == 1)
5002  thidx = 1;
5003  for (f = 0; f < thidx; f++) {
5004  kmp_info_t *th = team->t.t_threads[f];
5005  KMP_DEBUG_ASSERT(th != NULL);
5006 
5007  th->th.th_first_place = place;
5008  th->th.th_last_place = place;
5009  th->th.th_new_place = place;
5010  if (__kmp_display_affinity && place != th->th.th_current_place &&
5011  team->t.t_display_affinity != 1) {
5012  team->t.t_display_affinity = 1;
5013  }
5014  s_count++;
5015 
5016  if ((s_count == S) && rem && (gap_ct == gap)) {
5017  // do nothing, add an extra thread to place on next iteration
5018  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5019  // we added an extra thread to this place; move on to next place
5020  if (place == last_place) {
5021  place = first_place;
5022  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5023  place = 0;
5024  } else {
5025  place++;
5026  }
5027  s_count = 0;
5028  gap_ct = 1;
5029  rem--;
5030  } else if (s_count == S) { // place is full; don't add extra thread
5031  if (place == last_place) {
5032  place = first_place;
5033  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5034  place = 0;
5035  } else {
5036  place++;
5037  }
5038  gap_ct++;
5039  s_count = 0;
5040  }
5041 
5042  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5043  "partition = [%d,%d]\n",
5044  __kmp_gtid_from_thread(team->t.t_threads[f]),
5045  team->t.t_id, f, th->th.th_new_place,
5046  th->th.th_first_place, th->th.th_last_place));
5047  }
5048  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5049  }
5050  } break;
5051 
5052  default:
5053  break;
5054  }
5055 
5056  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5057 }
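#if 0
// Usage sketch (illustrative only, never compiled): the partitioning above is
// what user code exercises through OMP_PLACES and the proc_bind clause.
// Assuming an OpenMP-enabled compiler and, e.g., OMP_PLACES=cores:
#include <omp.h>
#include <stdio.h>
int main(void) {
#pragma omp parallel proc_bind(spread) num_threads(4)
  printf("thread %d on place %d of %d\n", omp_get_thread_num(),
         omp_get_place_num(), omp_get_num_places());
  return 0;
}
#endif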
5058 
5059 #endif // KMP_AFFINITY_SUPPORTED
5060 
5061 /* allocate a new team data structure to use. take one off of the free pool if
5062  available */
5063 kmp_team_t *
5064 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5065 #if OMPT_SUPPORT
5066  ompt_data_t ompt_parallel_data,
5067 #endif
5068  kmp_proc_bind_t new_proc_bind,
5069  kmp_internal_control_t *new_icvs,
5070  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5071  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5072  int f;
5073  kmp_team_t *team;
5074  int use_hot_team = !root->r.r_active;
5075  int level = 0;
5076  int do_place_partition = 1;
5077 
5078  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5079  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5080  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5081  KMP_MB();
5082 
5083 #if KMP_NESTED_HOT_TEAMS
5084  kmp_hot_team_ptr_t *hot_teams;
5085  if (master) {
5086  team = master->th.th_team;
5087  level = team->t.t_active_level;
5088  if (master->th.th_teams_microtask) { // in teams construct?
5089  if (master->th.th_teams_size.nteams > 1 &&
5090  ( // #teams > 1
5091  team->t.t_pkfn ==
5092  (microtask_t)__kmp_teams_master || // inner fork of the teams
5093  master->th.th_teams_level <
5094  team->t.t_level)) { // or nested parallel inside the teams
5095  ++level; // do not increment if #teams==1 or for the outer fork of the
5096  // teams; increment otherwise
5097  }
5098  // Do not perform the place partition for the inner fork of the teams;
5099  // wait until a nested parallel region is encountered inside the teams construct
5100  if ((master->th.th_teams_size.nteams == 1 &&
5101  master->th.th_teams_level >= team->t.t_level) ||
5102  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5103  do_place_partition = 0;
5104  }
5105  hot_teams = master->th.th_hot_teams;
5106  if (level < __kmp_hot_teams_max_level && hot_teams &&
5107  hot_teams[level].hot_team) {
5108  // hot team has already been allocated for given level
5109  use_hot_team = 1;
5110  } else {
5111  use_hot_team = 0;
5112  }
5113  } else {
5114  // check we won't access uninitialized hot_teams, just in case
5115  KMP_DEBUG_ASSERT(new_nproc == 1);
5116  }
5117 #endif
5118  // Optimization to use a "hot" team
5119  if (use_hot_team && new_nproc > 1) {
5120  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5121 #if KMP_NESTED_HOT_TEAMS
5122  team = hot_teams[level].hot_team;
5123 #else
5124  team = root->r.r_hot_team;
5125 #endif
5126 #if KMP_DEBUG
5127  if (__kmp_tasking_mode != tskm_immediate_exec) {
5128  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5129  "task_team[1] = %p before reinit\n",
5130  team->t.t_task_team[0], team->t.t_task_team[1]));
5131  }
5132 #endif
5133 
5134  if (team->t.t_nproc != new_nproc &&
5135  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5136  // Distributed barrier may need a resize
5137  int old_nthr = team->t.t_nproc;
5138  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5139  }
5140 
5141  // If not doing the place partition, then reset the team's proc bind
5142  // to indicate that partitioning of all threads still needs to take place
5143  if (do_place_partition == 0)
5144  team->t.t_proc_bind = proc_bind_default;
5145  // Has the number of threads changed?
5146  /* Let's assume the most common case is that the number of threads is
5147  unchanged, and put that case first. */
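    /* For instance, an application that repeatedly opens "#pragma omp
       parallel" with the same number of threads takes the first branch every
       time, and the reuse path below only has to refresh the schedule, the
       ICVs and (possibly) the place partition. */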
5148  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5149  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5150  // This case can mean that omp_set_num_threads() was called and the hot
5151  // team size was already reduced, so we check the special flag
5152  if (team->t.t_size_changed == -1) {
5153  team->t.t_size_changed = 1;
5154  } else {
5155  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5156  }
5157 
5158  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5159  kmp_r_sched_t new_sched = new_icvs->sched;
5160  // set primary thread's schedule as new run-time schedule
5161  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5162 
5163  __kmp_reinitialize_team(team, new_icvs,
5164  root->r.r_uber_thread->th.th_ident);
5165 
5166  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5167  team->t.t_threads[0], team));
5168  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5169 
5170 #if KMP_AFFINITY_SUPPORTED
5171  if ((team->t.t_size_changed == 0) &&
5172  (team->t.t_proc_bind == new_proc_bind)) {
5173  if (new_proc_bind == proc_bind_spread) {
5174  if (do_place_partition) {
5175  // add flag to update only master for spread
5176  __kmp_partition_places(team, 1);
5177  }
5178  }
5179  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5180  "proc_bind = %d, partition = [%d,%d]\n",
5181  team->t.t_id, new_proc_bind, team->t.t_first_place,
5182  team->t.t_last_place));
5183  } else {
5184  if (do_place_partition) {
5185  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5186  __kmp_partition_places(team);
5187  }
5188  }
5189 #else
5190  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5191 #endif /* KMP_AFFINITY_SUPPORTED */
5192  } else if (team->t.t_nproc > new_nproc) {
5193  KA_TRACE(20,
5194  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5195  new_nproc));
5196 
5197  team->t.t_size_changed = 1;
5198  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5199  // Barrier size already reduced earlier in this function
5200  // Activate team threads via th_used_in_team
5201  __kmp_add_threads_to_team(team, new_nproc);
5202  }
5203 #if KMP_NESTED_HOT_TEAMS
5204  if (__kmp_hot_teams_mode == 0) {
5205  // AC: the saved number of threads should correspond to the team's value in
5206  // this mode; it can be bigger in mode 1 when the hot team has threads in reserve
5207  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5208  hot_teams[level].hot_team_nth = new_nproc;
5209 #endif // KMP_NESTED_HOT_TEAMS
5210  /* release the extra threads we don't need any more */
5211  for (f = new_nproc; f < team->t.t_nproc; f++) {
5212  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5213  if (__kmp_tasking_mode != tskm_immediate_exec) {
5214  // When decreasing team size, threads no longer in the team should
5215  // unref task team.
5216  team->t.t_threads[f]->th.th_task_team = NULL;
5217  }
5218  __kmp_free_thread(team->t.t_threads[f]);
5219  team->t.t_threads[f] = NULL;
5220  }
5221 #if KMP_NESTED_HOT_TEAMS
5222  } // (__kmp_hot_teams_mode == 0)
5223  else {
5224  // When keeping extra threads in team, switch threads to wait on own
5225  // b_go flag
5226  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5227  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5228  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5229  for (int b = 0; b < bs_last_barrier; ++b) {
5230  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5231  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5232  }
5233  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5234  }
5235  }
5236  }
5237 #endif // KMP_NESTED_HOT_TEAMS
5238  team->t.t_nproc = new_nproc;
5239  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5240  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5241  __kmp_reinitialize_team(team, new_icvs,
5242  root->r.r_uber_thread->th.th_ident);
5243 
5244  // Update remaining threads
5245  for (f = 0; f < new_nproc; ++f) {
5246  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5247  }
5248 
5249  // restore the current task state of the primary thread: should be the
5250  // implicit task
5251  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5252  team->t.t_threads[0], team));
5253 
5254  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5255 
5256 #ifdef KMP_DEBUG
5257  for (f = 0; f < team->t.t_nproc; f++) {
5258  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5259  team->t.t_threads[f]->th.th_team_nproc ==
5260  team->t.t_nproc);
5261  }
5262 #endif
5263 
5264  if (do_place_partition) {
5265  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5266 #if KMP_AFFINITY_SUPPORTED
5267  __kmp_partition_places(team);
5268 #endif
5269  }
5270  } else { // team->t.t_nproc < new_nproc
5271 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5272  kmp_affin_mask_t *old_mask;
5273  if (KMP_AFFINITY_CAPABLE()) {
5274  KMP_CPU_ALLOC(old_mask);
5275  }
5276 #endif
5277 
5278  KA_TRACE(20,
5279  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5280  new_nproc));
5281  int old_nproc = team->t.t_nproc; // save old value and use to update only
5282  team->t.t_size_changed = 1;
5283 
5284 #if KMP_NESTED_HOT_TEAMS
5285  int avail_threads = hot_teams[level].hot_team_nth;
5286  if (new_nproc < avail_threads)
5287  avail_threads = new_nproc;
5288  kmp_info_t **other_threads = team->t.t_threads;
5289  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5290  // Adjust barrier data of reserved threads (if any) of the team
5291  // Other data will be set in __kmp_initialize_info() below.
5292  int b;
5293  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5294  for (b = 0; b < bs_last_barrier; ++b) {
5295  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5296  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5297 #if USE_DEBUGGER
5298  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5299 #endif
5300  }
5301  }
5302  if (hot_teams[level].hot_team_nth >= new_nproc) {
5303  // we have all needed threads in reserve, no need to allocate any
5304  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5305  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5306  team->t.t_nproc = new_nproc; // just get reserved threads involved
5307  } else {
5308  // We may have some threads in reserve, but not enough;
5309  // get reserved threads involved if any.
5310  team->t.t_nproc = hot_teams[level].hot_team_nth;
5311  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5312 #endif // KMP_NESTED_HOT_TEAMS
5313  if (team->t.t_max_nproc < new_nproc) {
5314  /* reallocate larger arrays */
5315  __kmp_reallocate_team_arrays(team, new_nproc);
5316  __kmp_reinitialize_team(team, new_icvs, NULL);
5317  }
5318 
5319 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5320  /* Temporarily set full mask for primary thread before creation of
5321  workers. The reason is that workers inherit the affinity from the
5322  primary thread, so if a lot of workers are created on the single
5323  core quickly, they don't get a chance to set their own affinity for
5324  a long time. */
5325  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5326 #endif
5327 
5328  /* allocate new threads for the hot team */
5329  for (f = team->t.t_nproc; f < new_nproc; f++) {
5330  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5331  KMP_DEBUG_ASSERT(new_worker);
5332  team->t.t_threads[f] = new_worker;
5333 
5334  KA_TRACE(20,
5335  ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5336  "join=%llu, plain=%llu\n",
5337  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5338  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5339  team->t.t_bar[bs_plain_barrier].b_arrived));
5340 
5341  { // Initialize barrier data for new threads.
5342  int b;
5343  kmp_balign_t *balign = new_worker->th.th_bar;
5344  for (b = 0; b < bs_last_barrier; ++b) {
5345  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5346  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5347  KMP_BARRIER_PARENT_FLAG);
5348 #if USE_DEBUGGER
5349  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5350 #endif
5351  }
5352  }
5353  }
5354 
5355 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5356  if (KMP_AFFINITY_CAPABLE()) {
5357  /* Restore initial primary thread's affinity mask */
5358  __kmp_set_system_affinity(old_mask, TRUE);
5359  KMP_CPU_FREE(old_mask);
5360  }
5361 #endif
5362 #if KMP_NESTED_HOT_TEAMS
5363  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5364 #endif // KMP_NESTED_HOT_TEAMS
5365  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5366  // Barrier size already increased earlier in this function
5367  // Activate team threads via th_used_in_team
5368  __kmp_add_threads_to_team(team, new_nproc);
5369  }
5370  /* make sure everyone is synchronized */
5371  // new threads below
5372  __kmp_initialize_team(team, new_nproc, new_icvs,
5373  root->r.r_uber_thread->th.th_ident);
5374 
5375  /* reinitialize the threads */
5376  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5377  for (f = 0; f < team->t.t_nproc; ++f)
5378  __kmp_initialize_info(team->t.t_threads[f], team, f,
5379  __kmp_gtid_from_tid(f, team));
5380 
5381  if (level) { // set th_task_state for new threads in nested hot team
5382  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5383  // only need to set the th_task_state for the new threads. th_task_state
5384  // for primary thread will not be accurate until after this in
5385  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5386  // get the correct value.
5387  for (f = old_nproc; f < team->t.t_nproc; ++f)
5388  team->t.t_threads[f]->th.th_task_state =
5389  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5390  } else { // set th_task_state for new threads in non-nested hot team
5391  // copy primary thread's state
5392  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5393  for (f = old_nproc; f < team->t.t_nproc; ++f)
5394  team->t.t_threads[f]->th.th_task_state = old_state;
5395  }
5396 
5397 #ifdef KMP_DEBUG
5398  for (f = 0; f < team->t.t_nproc; ++f) {
5399  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5400  team->t.t_threads[f]->th.th_team_nproc ==
5401  team->t.t_nproc);
5402  }
5403 #endif
5404 
5405  if (do_place_partition) {
5406  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5407 #if KMP_AFFINITY_SUPPORTED
5408  __kmp_partition_places(team);
5409 #endif
5410  }
5411  } // Check changes in number of threads
5412 
5413  kmp_info_t *master = team->t.t_threads[0];
5414  if (master->th.th_teams_microtask) {
5415  for (f = 1; f < new_nproc; ++f) {
5416  // propagate teams construct specific info to workers
5417  kmp_info_t *thr = team->t.t_threads[f];
5418  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5419  thr->th.th_teams_level = master->th.th_teams_level;
5420  thr->th.th_teams_size = master->th.th_teams_size;
5421  }
5422  }
5423 #if KMP_NESTED_HOT_TEAMS
5424  if (level) {
5425  // Sync barrier state for nested hot teams, not needed for outermost hot
5426  // team.
5427  for (f = 1; f < new_nproc; ++f) {
5428  kmp_info_t *thr = team->t.t_threads[f];
5429  int b;
5430  kmp_balign_t *balign = thr->th.th_bar;
5431  for (b = 0; b < bs_last_barrier; ++b) {
5432  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5433  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5434 #if USE_DEBUGGER
5435  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5436 #endif
5437  }
5438  }
5439  }
5440 #endif // KMP_NESTED_HOT_TEAMS
5441 
5442  /* reallocate space for arguments if necessary */
5443  __kmp_alloc_argv_entries(argc, team, TRUE);
5444  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5445  // The hot team re-uses the previous task team,
5446  // if untouched during the previous release->gather phase.
5447 
5448  KF_TRACE(10, (" hot_team = %p\n", team));
5449 
5450 #if KMP_DEBUG
5451  if (__kmp_tasking_mode != tskm_immediate_exec) {
5452  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5453  "task_team[1] = %p after reinit\n",
5454  team->t.t_task_team[0], team->t.t_task_team[1]));
5455  }
5456 #endif
5457 
5458 #if OMPT_SUPPORT
5459  __ompt_team_assign_id(team, ompt_parallel_data);
5460 #endif
5461 
5462  KMP_MB();
5463 
5464  return team;
5465  }
5466 
5467  /* next, let's try to take one from the team pool */
5468  KMP_MB();
5469  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5470  /* TODO: consider resizing undersized teams instead of reaping them, now
5471  that we have a resizing mechanism */
5472  if (team->t.t_max_nproc >= max_nproc) {
5473  /* take this team from the team pool */
5474  __kmp_team_pool = team->t.t_next_pool;
5475 
5476  if (max_nproc > 1 &&
5477  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5478  if (!team->t.b) { // Allocate barrier structure
5479  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5480  }
5481  }
5482 
5483  /* setup the team for fresh use */
5484  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5485 
5486  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5487  "task_team[1] %p to NULL\n",
5488  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5489  team->t.t_task_team[0] = NULL;
5490  team->t.t_task_team[1] = NULL;
5491 
5492  /* reallocate space for arguments if necessary */
5493  __kmp_alloc_argv_entries(argc, team, TRUE);
5494  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5495 
5496  KA_TRACE(
5497  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5498  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5499  { // Initialize barrier data.
5500  int b;
5501  for (b = 0; b < bs_last_barrier; ++b) {
5502  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5503 #if USE_DEBUGGER
5504  team->t.t_bar[b].b_master_arrived = 0;
5505  team->t.t_bar[b].b_team_arrived = 0;
5506 #endif
5507  }
5508  }
5509 
5510  team->t.t_proc_bind = new_proc_bind;
5511 
5512  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5513  team->t.t_id));
5514 
5515 #if OMPT_SUPPORT
5516  __ompt_team_assign_id(team, ompt_parallel_data);
5517 #endif
5518 
5519  KMP_MB();
5520 
5521  return team;
5522  }
5523 
5524  /* reap team if it is too small, then loop back and check the next one */
5525  // not sure if this is wise, but it will be redone during the hot-teams
5526  // rewrite.
5527  /* TODO: Use technique to find the right size hot-team, don't reap them */
5528  team = __kmp_reap_team(team);
5529  __kmp_team_pool = team;
5530  }
5531 
5532  /* nothing available in the pool, no matter, make a new team! */
5533  KMP_MB();
5534  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5535 
5536  /* and set it up */
5537  team->t.t_max_nproc = max_nproc;
5538  if (max_nproc > 1 &&
5539  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5540  // Allocate barrier structure
5541  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5542  }
5543 
5544  /* NOTE well: for some reason, allocating one big buffer and dividing it up
5545  seems to hurt performance significantly on the P4, so let's not use this. */
5546  __kmp_allocate_team_arrays(team, max_nproc);
5547 
5548  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5549  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5550 
5551  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5552  "%p to NULL\n",
5553  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5554  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5555  // memory, no need to duplicate
5556  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5557  // memory, no need to duplicate
5558 
5559  if (__kmp_storage_map) {
5560  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5561  }
5562 
5563  /* allocate space for arguments */
5564  __kmp_alloc_argv_entries(argc, team, FALSE);
5565  team->t.t_argc = argc;
5566 
5567  KA_TRACE(20,
5568  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5569  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5570  { // Initialize barrier data.
5571  int b;
5572  for (b = 0; b < bs_last_barrier; ++b) {
5573  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5574 #if USE_DEBUGGER
5575  team->t.t_bar[b].b_master_arrived = 0;
5576  team->t.t_bar[b].b_team_arrived = 0;
5577 #endif
5578  }
5579  }
5580 
5581  team->t.t_proc_bind = new_proc_bind;
5582 
5583 #if OMPT_SUPPORT
5584  __ompt_team_assign_id(team, ompt_parallel_data);
5585  team->t.ompt_serialized_team_info = NULL;
5586 #endif
5587 
5588  KMP_MB();
5589 
5590  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5591  team->t.t_id));
5592 
5593  return team;
5594 }
5595 
5596 /* TODO implement hot-teams at all levels */
5597 /* TODO implement lazy thread release on demand (disband request) */
5598 
5599 /* free the team. return it to the team pool. release all the threads
5600  * associated with it */
5601 void __kmp_free_team(kmp_root_t *root,
5602  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5603  int f;
5604  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5605  team->t.t_id));
5606 
5607  /* verify state */
5608  KMP_DEBUG_ASSERT(root);
5609  KMP_DEBUG_ASSERT(team);
5610  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5611  KMP_DEBUG_ASSERT(team->t.t_threads);
5612 
5613  int use_hot_team = team == root->r.r_hot_team;
5614 #if KMP_NESTED_HOT_TEAMS
5615  int level;
5616  if (master) {
5617  level = team->t.t_active_level - 1;
5618  if (master->th.th_teams_microtask) { // in teams construct?
5619  if (master->th.th_teams_size.nteams > 1) {
5620  ++level; // level was not increased in teams construct for
5621  // team_of_masters
5622  }
5623  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5624  master->th.th_teams_level == team->t.t_level) {
5625  ++level; // level was not increased in teams construct for
5626  // team_of_workers before the parallel
5627  } // team->t.t_level will be increased inside parallel
5628  }
5629 #if KMP_DEBUG
5630  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5631 #endif
5632  if (level < __kmp_hot_teams_max_level) {
5633  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5634  use_hot_team = 1;
5635  }
5636  }
5637 #endif // KMP_NESTED_HOT_TEAMS
5638 
5639  /* team is done working */
5640  TCW_SYNC_PTR(team->t.t_pkfn,
5641  NULL); // Important for Debugging Support Library.
5642 #if KMP_OS_WINDOWS
5643  team->t.t_copyin_counter = 0; // init counter for possible reuse
5644 #endif
5645  // Do not reset pointer to parent team to NULL for hot teams.
5646 
5647  /* if we are non-hot team, release our threads */
5648  if (!use_hot_team) {
5649  if (__kmp_tasking_mode != tskm_immediate_exec) {
5650  // Wait for threads to reach reapable state
5651  for (f = 1; f < team->t.t_nproc; ++f) {
5652  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5653  kmp_info_t *th = team->t.t_threads[f];
5654  volatile kmp_uint32 *state = &th->th.th_reap_state;
5655  while (*state != KMP_SAFE_TO_REAP) {
5656 #if KMP_OS_WINDOWS
5657  // On Windows a thread can be killed at any time, check this
5658  DWORD ecode;
5659  if (!__kmp_is_thread_alive(th, &ecode)) {
5660  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5661  break;
5662  }
5663 #endif
5664  // first check if thread is sleeping
5665  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5666  if (fl.is_sleeping())
5667  fl.resume(__kmp_gtid_from_thread(th));
5668  KMP_CPU_PAUSE();
5669  }
5670  }
5671 
5672  // Delete task teams
5673  int tt_idx;
5674  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5675  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5676  if (task_team != NULL) {
5677  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5678  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5679  team->t.t_threads[f]->th.th_task_team = NULL;
5680  }
5681  KA_TRACE(
5682  20,
5683  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5684  __kmp_get_gtid(), task_team, team->t.t_id));
5685 #if KMP_NESTED_HOT_TEAMS
5686  __kmp_free_task_team(master, task_team);
5687 #endif
5688  team->t.t_task_team[tt_idx] = NULL;
5689  }
5690  }
5691  }
5692 
5693  // Reset pointer to parent team only for non-hot teams.
5694  team->t.t_parent = NULL;
5695  team->t.t_level = 0;
5696  team->t.t_active_level = 0;
5697 
5698  /* free the worker threads */
5699  for (f = 1; f < team->t.t_nproc; ++f) {
5700  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5701  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5702  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5703  1, 2);
5704  }
5705  __kmp_free_thread(team->t.t_threads[f]);
5706  }
5707 
5708  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5709  if (team->t.b) {
5710  // wake up thread at old location
5711  team->t.b->go_release();
5712  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5713  for (f = 1; f < team->t.t_nproc; ++f) {
5714  if (team->t.b->sleep[f].sleep) {
5715  __kmp_atomic_resume_64(
5716  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5717  (kmp_atomic_flag_64<> *)NULL);
5718  }
5719  }
5720  }
5721  // Wait for threads to be removed from team
5722  for (int f = 1; f < team->t.t_nproc; ++f) {
5723  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5724  KMP_CPU_PAUSE();
5725  }
5726  }
5727  }
5728 
5729  for (f = 1; f < team->t.t_nproc; ++f) {
5730  team->t.t_threads[f] = NULL;
5731  }
5732 
5733  if (team->t.t_max_nproc > 1 &&
5734  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5735  distributedBarrier::deallocate(team->t.b);
5736  team->t.b = NULL;
5737  }
5738  /* put the team back in the team pool */
5739  /* TODO limit size of team pool, call reap_team if pool too large */
5740  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5741  __kmp_team_pool = (volatile kmp_team_t *)team;
5742  } else { // Check if team was created for primary threads in teams construct
5743  // See if first worker is a CG root
5744  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5745  team->t.t_threads[1]->th.th_cg_roots);
5746  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5747  // Clean up the CG root nodes on workers so that this team can be re-used
5748  for (f = 1; f < team->t.t_nproc; ++f) {
5749  kmp_info_t *thr = team->t.t_threads[f];
5750  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5751  thr->th.th_cg_roots->cg_root == thr);
5752  // Pop current CG root off list
5753  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5754  thr->th.th_cg_roots = tmp->up;
5755  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5756  " up to node %p. cg_nthreads was %d\n",
5757  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5758  int i = tmp->cg_nthreads--;
5759  if (i == 1) {
5760  __kmp_free(tmp); // free CG if we are the last thread in it
5761  }
5762  // Restore current task's thread_limit from CG root
5763  if (thr->th.th_cg_roots)
5764  thr->th.th_current_task->td_icvs.thread_limit =
5765  thr->th.th_cg_roots->cg_thread_limit;
5766  }
5767  }
5768  }
5769 
5770  KMP_MB();
5771 }
5772 
5773 /* reap the team. destroy it, reclaim all its resources and free its memory */
5774 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5775  kmp_team_t *next_pool = team->t.t_next_pool;
5776 
5777  KMP_DEBUG_ASSERT(team);
5778  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5779  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5780  KMP_DEBUG_ASSERT(team->t.t_threads);
5781  KMP_DEBUG_ASSERT(team->t.t_argv);
5782 
5783  /* TODO clean the threads that are a part of this? */
5784 
5785  /* free stuff */
5786  __kmp_free_team_arrays(team);
5787  if (team->t.t_argv != &team->t.t_inline_argv[0])
5788  __kmp_free((void *)team->t.t_argv);
5789  __kmp_free(team);
5790 
5791  KMP_MB();
5792  return next_pool;
5793 }
5794 
5795 // Free the thread. Don't reap it, just place it on the pool of available
5796 // threads.
5797 //
5798 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5799 // binding for the affinity mechanism to be useful.
5800 //
5801 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5802 // However, we want to avoid the potential performance problem of always
5803 // scanning through the list to find the correct point at which to insert
5804 // the thread (potential N**2 behavior). To do this we keep track of the
5805 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5806 // With single-level parallelism, threads will always be added to the tail
5807 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5808 // parallelism, all bets are off and we may need to scan through the entire
5809 // free list.
5810 //
5811 // This change also has a potentially large performance benefit, for some
5812 // applications. Previously, as threads were freed from the hot team, they
5813 // would be placed back on the free list in inverse order. If the hot team
5814 // grew back to its original size, then the freed threads would be placed
5815 // back on the hot team in reverse order. This could cause bad cache
5816 // locality problems in programs where the size of the hot team regularly
5817 // grew and shrank.
5818 //
5819 // Now, for single-level parallelism, the OMP tid is always == gtid.
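// Illustration of the insert-point caching (informal): freeing threads with
// gtids 5, 3 and 4, in that order, yields the sorted pool {3, 4, 5}; the
// insertion of 4 reuses the cached insert point (the node for gtid 3) without
// a rescan, whereas a later free of gtid 2 resets __kmp_thread_pool_insert_pt
// and rescans from the head, because the cached node's gtid (4) exceeds 2.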
5820 void __kmp_free_thread(kmp_info_t *this_th) {
5821  int gtid;
5822  kmp_info_t **scan;
5823 
5824  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5825  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5826 
5827  KMP_DEBUG_ASSERT(this_th);
5828 
5829  // When moving the thread to the pool, switch it to wait on its own b_go
5830  // flag and set its barrier team pointers to NULL (uninitialized team).
5831  int b;
5832  kmp_balign_t *balign = this_th->th.th_bar;
5833  for (b = 0; b < bs_last_barrier; ++b) {
5834  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5835  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5836  balign[b].bb.team = NULL;
5837  balign[b].bb.leaf_kids = 0;
5838  }
5839  this_th->th.th_task_state = 0;
5840  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5841 
5842  /* put thread back on the free pool */
5843  TCW_PTR(this_th->th.th_team, NULL);
5844  TCW_PTR(this_th->th.th_root, NULL);
5845  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5846 
5847  while (this_th->th.th_cg_roots) {
5848  this_th->th.th_cg_roots->cg_nthreads--;
5849  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5850  " %p of thread %p to %d\n",
5851  this_th, this_th->th.th_cg_roots,
5852  this_th->th.th_cg_roots->cg_root,
5853  this_th->th.th_cg_roots->cg_nthreads));
5854  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5855  if (tmp->cg_root == this_th) { // Thread is a cg_root
5856  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5857  KA_TRACE(
5858  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5859  this_th->th.th_cg_roots = tmp->up;
5860  __kmp_free(tmp);
5861  } else { // Worker thread
5862  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5863  __kmp_free(tmp);
5864  }
5865  this_th->th.th_cg_roots = NULL;
5866  break;
5867  }
5868  }
5869 
5870  /* If the implicit task assigned to this thread can be used by other threads,
5871  * multiple threads can share the data and try to free the task in
5872  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5873  * likely when the hot team is disabled, but it can occur even when the
5874  * hot team is enabled. */
5875  __kmp_free_implicit_task(this_th);
5876  this_th->th.th_current_task = NULL;
5877 
5878  // If the __kmp_thread_pool_insert_pt is already past the new insert
5879  // point, then we need to re-scan the entire list.
5880  gtid = this_th->th.th_info.ds.ds_gtid;
5881  if (__kmp_thread_pool_insert_pt != NULL) {
5882  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5883  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5884  __kmp_thread_pool_insert_pt = NULL;
5885  }
5886  }
5887 
5888  // Scan down the list to find the place to insert the thread.
5889  // scan is the address of a link in the list, possibly the address of
5890  // __kmp_thread_pool itself.
5891  //
5892  // In the absence of nested parallelism, the for loop will have 0 iterations.
5893  if (__kmp_thread_pool_insert_pt != NULL) {
5894  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5895  } else {
5896  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5897  }
5898  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5899  scan = &((*scan)->th.th_next_pool))
5900  ;
5901 
5902  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5903  // to its address.
5904  TCW_PTR(this_th->th.th_next_pool, *scan);
5905  __kmp_thread_pool_insert_pt = *scan = this_th;
5906  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5907  (this_th->th.th_info.ds.ds_gtid <
5908  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5909  TCW_4(this_th->th.th_in_pool, TRUE);
5910  __kmp_suspend_initialize_thread(this_th);
5911  __kmp_lock_suspend_mx(this_th);
5912  if (this_th->th.th_active == TRUE) {
5913  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5914  this_th->th.th_active_in_pool = TRUE;
5915  }
5916 #if KMP_DEBUG
5917  else {
5918  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5919  }
5920 #endif
5921  __kmp_unlock_suspend_mx(this_th);
5922 
5923  TCW_4(__kmp_nth, __kmp_nth - 1);
5924 
5925 #ifdef KMP_ADJUST_BLOCKTIME
5926  /* Adjust blocktime back to user setting or default if necessary */
5927  /* Middle initialization might never have occurred */
5928  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5929  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5930  if (__kmp_nth <= __kmp_avail_proc) {
5931  __kmp_zero_bt = FALSE;
5932  }
5933  }
5934 #endif /* KMP_ADJUST_BLOCKTIME */
5935 
5936  KMP_MB();
5937 }
5938 
5939 /* ------------------------------------------------------------------------ */
5940 
5941 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5942 #if OMP_PROFILING_SUPPORT
5943  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5944  // TODO: add a configuration option for time granularity
5945  if (ProfileTraceFile)
5946  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5947 #endif
5948 
5949  int gtid = this_thr->th.th_info.ds.ds_gtid;
5950  /* void *stack_data;*/
5951  kmp_team_t **volatile pteam;
5952 
5953  KMP_MB();
5954  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5955 
5956  if (__kmp_env_consistency_check) {
5957  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5958  }
5959 
5960 #if OMPD_SUPPORT
5961  if (ompd_state & OMPD_ENABLE_BP)
5962  ompd_bp_thread_begin();
5963 #endif
5964 
5965 #if OMPT_SUPPORT
5966  ompt_data_t *thread_data = nullptr;
5967  if (ompt_enabled.enabled) {
5968  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5969  *thread_data = ompt_data_none;
5970 
5971  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5972  this_thr->th.ompt_thread_info.wait_id = 0;
5973  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5974  this_thr->th.ompt_thread_info.parallel_flags = 0;
5975  if (ompt_enabled.ompt_callback_thread_begin) {
5976  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5977  ompt_thread_worker, thread_data);
5978  }
5979  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5980  }
5981 #endif
5982 
5983  /* This is the place where threads wait for work */
5984  while (!TCR_4(__kmp_global.g.g_done)) {
5985  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5986  KMP_MB();
5987 
5988  /* wait for work to do */
5989  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5990 
5991  /* No tid yet since not part of a team */
5992  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5993 
5994 #if OMPT_SUPPORT
5995  if (ompt_enabled.enabled) {
5996  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5997  }
5998 #endif
5999 
6000  pteam = &this_thr->th.th_team;
6001 
6002  /* have we been allocated? */
6003  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6004  /* we were just woken up, so run our new task */
6005  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6006  int rc;
6007  KA_TRACE(20,
6008  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6009  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6010  (*pteam)->t.t_pkfn));
6011 
6012  updateHWFPControl(*pteam);
6013 
6014 #if OMPT_SUPPORT
6015  if (ompt_enabled.enabled) {
6016  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6017  }
6018 #endif
6019 
6020  rc = (*pteam)->t.t_invoke(gtid);
6021  KMP_ASSERT(rc);
6022 
6023  KMP_MB();
6024  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6025  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6026  (*pteam)->t.t_pkfn));
6027  }
6028 #if OMPT_SUPPORT
6029  if (ompt_enabled.enabled) {
6030  /* no frame set while outside task */
6031  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6032 
6033  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6034  }
6035 #endif
6036  /* join barrier after parallel region */
6037  __kmp_join_barrier(gtid);
6038  }
6039  }
6040  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6041 
6042 #if OMPD_SUPPORT
6043  if (ompd_state & OMPD_ENABLE_BP)
6044  ompd_bp_thread_end();
6045 #endif
6046 
6047 #if OMPT_SUPPORT
6048  if (ompt_enabled.ompt_callback_thread_end) {
6049  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6050  }
6051 #endif
6052 
6053  this_thr->th.th_task_team = NULL;
6054  /* run the destructors for the threadprivate data for this thread */
6055  __kmp_common_destroy_gtid(gtid);
6056 
6057  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6058  KMP_MB();
6059 
6060 #if OMP_PROFILING_SUPPORT
6061  llvm::timeTraceProfilerFinishThread();
6062 #endif
6063  return this_thr;
6064 }
6065 
6066 /* ------------------------------------------------------------------------ */
6067 
6068 void __kmp_internal_end_dest(void *specific_gtid) {
6069  // Make sure no significant bits are lost
6070  int gtid;
6071  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6072 
6073  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6074  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6075  * this is because 0 is reserved for the nothing-stored case */
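  /* Illustrative sketch only (not executed here): the setter side,
     __kmp_gtid_set_specific(), stores (gtid + 1) in thread-local storage so
     that a stored value of 0 can only mean "nothing stored". For a
     hypothetical gtid of 5 the round trip is:
       stored value  = 5 + 1 = 6
       decoded above = 6 - 1 = 5 */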
6076 
6077  __kmp_internal_end_thread(gtid);
6078 }
6079 
6080 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6081 
6082 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6083  __kmp_internal_end_atexit();
6084 }
6085 
6086 #endif
6087 
6088 /* [Windows] josh: when the atexit handler is called, there may still be more
6089  than one thread alive */
6090 void __kmp_internal_end_atexit(void) {
6091  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6092  /* [Windows]
6093  josh: ideally, we want to completely shut down the library in this atexit
6094  handler, but stat code that depends on thread-specific data for gtid fails
6095  because that data becomes unavailable at some point during the shutdown, so
6096  we call __kmp_internal_end_thread instead. We should eventually remove the
6097  dependency on __kmp_get_specific_gtid in the stat code and use
6098  __kmp_internal_end_library to cleanly shut down the library.
6099 
6100  // TODO: Can some of this comment about GVS be removed?
6101  I suspect that the offending stat code is executed when the calling thread
6102  tries to clean up a dead root thread's data structures, resulting in GVS
6103  code trying to close the GVS structures for that thread, but since the stat
6104  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6105  the calling thread is cleaning up itself instead of another thread, it gets
6106  confused. This happens because allowing a thread to unregister and clean up
6107  another thread is a recent modification for addressing an issue.
6108  Based on the current design (20050722), a thread may end up
6109  trying to unregister another thread only if thread death does not trigger
6110  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6111  thread specific data destructor function to detect thread death. For
6112  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6113  is nothing. Thus, the workaround is applicable only for Windows static
6114  stat library. */
6115  __kmp_internal_end_library(-1);
6116 #if KMP_OS_WINDOWS
6117  __kmp_close_console();
6118 #endif
6119 }
6120 
6121 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6122  // It is assumed __kmp_forkjoin_lock is acquired.
6123 
6124  int gtid;
6125 
6126  KMP_DEBUG_ASSERT(thread != NULL);
6127 
6128  gtid = thread->th.th_info.ds.ds_gtid;
6129 
6130  if (!is_root) {
6131  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6132  /* Assume the threads are at the fork barrier here */
6133  KA_TRACE(
6134  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6135  gtid));
6136  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6137  while (
6138  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6139  KMP_CPU_PAUSE();
6140  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6141  } else {
6142  /* Need release fence here to prevent seg faults for tree forkjoin
6143  barrier (GEH) */
6144  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6145  thread);
6146  __kmp_release_64(&flag);
6147  }
6148  }
6149 
6150  // Terminate OS thread.
6151  __kmp_reap_worker(thread);
6152 
6153  // The thread was killed asynchronously. If it was actively
6154  // spinning in the thread pool, decrement the global count.
6155  //
6156  // There is a small timing hole here - if the worker thread was just waking
6157  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6158  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6159  // the global counter might not get updated.
6160  //
6161  // Currently, this can only happen as the library is unloaded,
6162  // so there are no harmful side effects.
6163  if (thread->th.th_active_in_pool) {
6164  thread->th.th_active_in_pool = FALSE;
6165  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6166  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6167  }
6168  }
6169 
6170  __kmp_free_implicit_task(thread);
6171 
6172 // Free the fast memory for tasking
6173 #if USE_FAST_MEMORY
6174  __kmp_free_fast_memory(thread);
6175 #endif /* USE_FAST_MEMORY */
6176 
6177  __kmp_suspend_uninitialize_thread(thread);
6178 
6179  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6180  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6181 
6182  --__kmp_all_nth;
6183  // __kmp_nth was decremented when thread is added to the pool.
6184 
6185 #ifdef KMP_ADJUST_BLOCKTIME
6186  /* Adjust blocktime back to user setting or default if necessary */
6187  /* Middle initialization might never have occurred */
6188  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6189  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6190  if (__kmp_nth <= __kmp_avail_proc) {
6191  __kmp_zero_bt = FALSE;
6192  }
6193  }
6194 #endif /* KMP_ADJUST_BLOCKTIME */
6195 
6196  /* free the memory being used */
6197  if (__kmp_env_consistency_check) {
6198  if (thread->th.th_cons) {
6199  __kmp_free_cons_stack(thread->th.th_cons);
6200  thread->th.th_cons = NULL;
6201  }
6202  }
6203 
6204  if (thread->th.th_pri_common != NULL) {
6205  __kmp_free(thread->th.th_pri_common);
6206  thread->th.th_pri_common = NULL;
6207  }
6208 
6209  if (thread->th.th_task_state_memo_stack != NULL) {
6210  __kmp_free(thread->th.th_task_state_memo_stack);
6211  thread->th.th_task_state_memo_stack = NULL;
6212  }
6213 
6214 #if KMP_USE_BGET
6215  if (thread->th.th_local.bget_data != NULL) {
6216  __kmp_finalize_bget(thread);
6217  }
6218 #endif
6219 
6220 #if KMP_AFFINITY_SUPPORTED
6221  if (thread->th.th_affin_mask != NULL) {
6222  KMP_CPU_FREE(thread->th.th_affin_mask);
6223  thread->th.th_affin_mask = NULL;
6224  }
6225 #endif /* KMP_AFFINITY_SUPPORTED */
6226 
6227 #if KMP_USE_HIER_SCHED
6228  if (thread->th.th_hier_bar_data != NULL) {
6229  __kmp_free(thread->th.th_hier_bar_data);
6230  thread->th.th_hier_bar_data = NULL;
6231  }
6232 #endif
6233 
6234  __kmp_reap_team(thread->th.th_serial_team);
6235  thread->th.th_serial_team = NULL;
6236  __kmp_free(thread);
6237 
6238  KMP_MB();
6239 
6240 } // __kmp_reap_thread
6241 
6242 static void __kmp_itthash_clean(kmp_info_t *th) {
6243 #if USE_ITT_NOTIFY
6244  if (__kmp_itt_region_domains.count > 0) {
6245  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6246  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6247  while (bucket) {
6248  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6249  __kmp_thread_free(th, bucket);
6250  bucket = next;
6251  }
6252  }
6253  }
6254  if (__kmp_itt_barrier_domains.count > 0) {
6255  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6256  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6257  while (bucket) {
6258  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6259  __kmp_thread_free(th, bucket);
6260  bucket = next;
6261  }
6262  }
6263  }
6264 #endif
6265 }
6266 
6267 static void __kmp_internal_end(void) {
6268  int i;
6269 
6270  /* First, unregister the library */
6271  __kmp_unregister_library();
6272 
6273 #if KMP_OS_WINDOWS
6274  /* In Win static library, we can't tell when a root actually dies, so we
6275  reclaim the data structures for any root threads that have died but not
6276  unregistered themselves, in order to shut down cleanly.
6277  In Win dynamic library we also can't tell when a thread dies. */
6278  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6279 // dead roots
6280 #endif
6281 
6282  for (i = 0; i < __kmp_threads_capacity; i++)
6283  if (__kmp_root[i])
6284  if (__kmp_root[i]->r.r_active)
6285  break;
6286  KMP_MB(); /* Flush all pending memory write invalidates. */
6287  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6288 
6289  if (i < __kmp_threads_capacity) {
6290 #if KMP_USE_MONITOR
6291  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6292  KMP_MB(); /* Flush all pending memory write invalidates. */
6293 
6294  // Need to check that monitor was initialized before reaping it. If we are
6295  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6296  // __kmp_monitor will appear to contain valid data, but it is only valid in
6297  // the parent process, not the child.
6298  // New behavior (201008): instead of keying off of the flag
6299  // __kmp_init_parallel, the monitor thread creation is keyed off
6300  // of the new flag __kmp_init_monitor.
6301  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6302  if (TCR_4(__kmp_init_monitor)) {
6303  __kmp_reap_monitor(&__kmp_monitor);
6304  TCW_4(__kmp_init_monitor, 0);
6305  }
6306  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6307  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6308 #endif // KMP_USE_MONITOR
6309  } else {
6310 /* TODO move this to cleanup code */
6311 #ifdef KMP_DEBUG
6312  /* make sure that everything has properly ended */
6313  for (i = 0; i < __kmp_threads_capacity; i++) {
6314  if (__kmp_root[i]) {
6315  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6316  // there can be uber threads alive here
6317  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6318  }
6319  }
6320 #endif
6321 
6322  KMP_MB();
6323 
6324  // Reap the worker threads.
6325  // This is valid for now, but be careful if threads are reaped sooner.
6326  while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6327  // Get the next thread from the pool.
6328  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6329  __kmp_thread_pool = thread->th.th_next_pool;
6330  // Reap it.
6331  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6332  thread->th.th_next_pool = NULL;
6333  thread->th.th_in_pool = FALSE;
6334  __kmp_reap_thread(thread, 0);
6335  }
6336  __kmp_thread_pool_insert_pt = NULL;
6337 
6338  // Reap teams.
6339  while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6340  // Get the next team from the pool.
6341  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6342  __kmp_team_pool = team->t.t_next_pool;
6343  // Reap it.
6344  team->t.t_next_pool = NULL;
6345  __kmp_reap_team(team);
6346  }
6347 
6348  __kmp_reap_task_teams();
6349 
6350 #if KMP_OS_UNIX
6351  // Threads that are not reaped should not access any resources since they
6352  // are going to be deallocated soon, so the shutdown sequence should wait
6353  // until all threads either exit the final spin-waiting loop or begin
6354  // sleeping after the given blocktime.
6355  for (i = 0; i < __kmp_threads_capacity; i++) {
6356  kmp_info_t *thr = __kmp_threads[i];
6357  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6358  KMP_CPU_PAUSE();
6359  }
6360 #endif
6361 
6362  for (i = 0; i < __kmp_threads_capacity; ++i) {
6363  // TBD: Add some checking...
6364  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6365  }
6366 
6367  /* Make sure all threadprivate destructors get run by joining with all
6368  worker threads before resetting this flag */
6369  TCW_SYNC_4(__kmp_init_common, FALSE);
6370 
6371  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6372  KMP_MB();
6373 
6374 #if KMP_USE_MONITOR
6375  // See note above: One of the possible fixes for CQ138434 / CQ140126
6376  //
6377  // FIXME: push both code fragments down and CSE them?
6378  // push them into __kmp_cleanup() ?
6379  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6380  if (TCR_4(__kmp_init_monitor)) {
6381  __kmp_reap_monitor(&__kmp_monitor);
6382  TCW_4(__kmp_init_monitor, 0);
6383  }
6384  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6385  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6386 #endif
6387  } /* else !__kmp_global.t_active */
6388  TCW_4(__kmp_init_gtid, FALSE);
6389  KMP_MB(); /* Flush all pending memory write invalidates. */
6390 
6391  __kmp_cleanup();
6392 #if OMPT_SUPPORT
6393  ompt_fini();
6394 #endif
6395 }
6396 
6397 void __kmp_internal_end_library(int gtid_req) {
6398  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6399  /* this shouldn't be a race condition because __kmp_internal_end() is the
6400  only place to clear __kmp_serial_init */
6401  /* we'll check this later too, after we get the lock */
6402  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6403  // redundant, because the next check will work in any case.
6404  if (__kmp_global.g.g_abort) {
6405  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6406  /* TODO abort? */
6407  return;
6408  }
6409  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6410  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6411  return;
6412  }
6413 
6414  // If hidden helper team has been initialized, we need to deinit it
6415  if (TCR_4(__kmp_init_hidden_helper) &&
6416  !TCR_4(__kmp_hidden_helper_team_done)) {
6417  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6418  // First release the main thread to let it continue its work
6419  __kmp_hidden_helper_main_thread_release();
6420  // Wait until the hidden helper team has been destroyed
6421  __kmp_hidden_helper_threads_deinitz_wait();
6422  }
6423 
6424  KMP_MB(); /* Flush all pending memory write invalidates. */
6425  /* find out who we are and what we should do */
6426  {
6427  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6428  KA_TRACE(
6429  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6430  if (gtid == KMP_GTID_SHUTDOWN) {
6431  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6432  "already shutdown\n"));
6433  return;
6434  } else if (gtid == KMP_GTID_MONITOR) {
6435  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6436  "registered, or system shutdown\n"));
6437  return;
6438  } else if (gtid == KMP_GTID_DNE) {
6439  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6440  "shutdown\n"));
6441  /* we don't know who we are, but we may still shut down the library */
6442  } else if (KMP_UBER_GTID(gtid)) {
6443  /* unregister ourselves as an uber thread. gtid is no longer valid */
6444  if (__kmp_root[gtid]->r.r_active) {
6445  __kmp_global.g.g_abort = -1;
6446  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6447  __kmp_unregister_library();
6448  KA_TRACE(10,
6449  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6450  gtid));
6451  return;
6452  } else {
6453  __kmp_itthash_clean(__kmp_threads[gtid]);
6454  KA_TRACE(
6455  10,
6456  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6457  __kmp_unregister_root_current_thread(gtid);
6458  }
6459  } else {
6460 /* worker threads may call this function through the atexit handler, if they
6461  * call exit() */
6462 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6463  TODO: do a thorough shutdown instead */
6464 #ifdef DUMP_DEBUG_ON_EXIT
6465  if (__kmp_debug_buf)
6466  __kmp_dump_debug_buffer();
6467 #endif
6468  // The unregister-library call was added here when we switched to shared
6469  // memory on Linux; without it, stale files accumulate in /dev/shm.
6470  // Clean up the shared memory file before exiting.
6471  __kmp_unregister_library();
6472  return;
6473  }
6474  }
6475  /* synchronize the termination process */
6476  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6477 
6478  /* have we already finished */
6479  if (__kmp_global.g.g_abort) {
6480  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6481  /* TODO abort? */
6482  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6483  return;
6484  }
6485  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6486  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6487  return;
6488  }
6489 
6490  /* We need this lock to enforce mutual exclusion between this reading of
6491  __kmp_threads_capacity and the writing by __kmp_register_root.
6492  Alternatively, we can use a counter of roots that is atomically updated by
6493  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6494  __kmp_internal_end_*. */
6495  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6496 
6497  /* now we can safely conduct the actual termination */
6498  __kmp_internal_end();
6499 
6500  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6501  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6502 
6503  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6504 
6505 #ifdef DUMP_DEBUG_ON_EXIT
6506  if (__kmp_debug_buf)
6507  __kmp_dump_debug_buffer();
6508 #endif
6509 
6510 #if KMP_OS_WINDOWS
6511  __kmp_close_console();
6512 #endif
6513 
6514  __kmp_fini_allocator();
6515 
6516 } // __kmp_internal_end_library
6517 
6518 void __kmp_internal_end_thread(int gtid_req) {
6519  int i;
6520 
6521  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6522  /* this shouldn't be a race condition because __kmp_internal_end() is the
6523  * only place to clear __kmp_serial_init */
6524  /* we'll check this later too, after we get the lock */
6525  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6526  // redundant, because the next check will work in any case.
6527  if (__kmp_global.g.g_abort) {
6528  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6529  /* TODO abort? */
6530  return;
6531  }
6532  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6533  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6534  return;
6535  }
6536 
6537  // If hidden helper team has been initialized, we need to deinit it
6538  if (TCR_4(__kmp_init_hidden_helper) &&
6539  !TCR_4(__kmp_hidden_helper_team_done)) {
6540  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6541  // First release the main thread to let it continue its work
6542  __kmp_hidden_helper_main_thread_release();
6543  // Wait until the hidden helper team has been destroyed
6544  __kmp_hidden_helper_threads_deinitz_wait();
6545  }
6546 
6547  KMP_MB(); /* Flush all pending memory write invalidates. */
6548 
6549  /* find out who we are and what we should do */
6550  {
6551  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6552  KA_TRACE(10,
6553  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6554  if (gtid == KMP_GTID_SHUTDOWN) {
6555  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6556  "already shutdown\n"));
6557  return;
6558  } else if (gtid == KMP_GTID_MONITOR) {
6559  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6560  "registered, or system shutdown\n"));
6561  return;
6562  } else if (gtid == KMP_GTID_DNE) {
6563  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6564  "shutdown\n"));
6565  return;
6566  /* we don't know who we are */
6567  } else if (KMP_UBER_GTID(gtid)) {
6568  /* unregister ourselves as an uber thread. gtid is no longer valid */
6569  if (__kmp_root[gtid]->r.r_active) {
6570  __kmp_global.g.g_abort = -1;
6571  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6572  KA_TRACE(10,
6573  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6574  gtid));
6575  return;
6576  } else {
6577  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6578  gtid));
6579  __kmp_unregister_root_current_thread(gtid);
6580  }
6581  } else {
6582  /* just a worker thread, let's leave */
6583  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6584 
6585  if (gtid >= 0) {
6586  __kmp_threads[gtid]->th.th_task_team = NULL;
6587  }
6588 
6589  KA_TRACE(10,
6590  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6591  gtid));
6592  return;
6593  }
6594  }
6595 #if KMP_DYNAMIC_LIB
6596  if (__kmp_pause_status != kmp_hard_paused)
6597  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6598  // because it is better to shut down later in the library destructor.
6599  {
6600  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6601  return;
6602  }
6603 #endif
6604  /* synchronize the termination process */
6605  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6606 
6607  /* have we already finished */
6608  if (__kmp_global.g.g_abort) {
6609  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6610  /* TODO abort? */
6611  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6612  return;
6613  }
6614  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6615  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6616  return;
6617  }
6618 
6619  /* We need this lock to enforce mutual exclusion between this reading of
6620  __kmp_threads_capacity and the writing by __kmp_register_root.
6621  Alternatively, we can use a counter of roots that is atomically updated by
6622  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6623  __kmp_internal_end_*. */
6624 
6625  /* should we finish the run-time? are all siblings done? */
6626  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6627 
6628  for (i = 0; i < __kmp_threads_capacity; ++i) {
6629  if (KMP_UBER_GTID(i)) {
6630  KA_TRACE(
6631  10,
6632  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6633  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6634  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6635  return;
6636  }
6637  }
6638 
6639  /* now we can safely conduct the actual termination */
6640 
6641  __kmp_internal_end();
6642 
6643  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6644  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6645 
6646  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6647 
6648 #ifdef DUMP_DEBUG_ON_EXIT
6649  if (__kmp_debug_buf)
6650  __kmp_dump_debug_buffer();
6651 #endif
6652 } // __kmp_internal_end_thread
6653 
6654 // -----------------------------------------------------------------------------
6655 // Library registration stuff.
6656 
6657 static long __kmp_registration_flag = 0;
6658 // Random value used to indicate library initialization.
6659 static char *__kmp_registration_str = NULL;
6660 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6661 
6662 static inline char *__kmp_reg_status_name() {
6663 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6664  each thread. If registration and unregistration go in different threads
6665  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6666  env var cannot be found, because the name will contain a different pid. */
6667 // macOS* complains about name being too long with additional getuid()
6668 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6669  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6670  (int)getuid());
6671 #else
6672  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6673 #endif
6674 } // __kmp_reg_status_name
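// For illustration only (hypothetical pid/uid): the Unix dynamic-library
// branch above produces a name such as "__KMP_REGISTERED_LIB_12345_1000",
// while the fallback branch produces "__KMP_REGISTERED_LIB_12345".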
6675 
6676 void __kmp_register_library_startup(void) {
6677 
6678  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6679  int done = 0;
6680  union {
6681  double dtime;
6682  long ltime;
6683  } time;
6684 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6685  __kmp_initialize_system_tick();
6686 #endif
6687  __kmp_read_system_time(&time.dtime);
6688  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6689  __kmp_registration_str =
6690  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6691  __kmp_registration_flag, KMP_LIBRARY_FILE);
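  // For illustration only (hypothetical address and flag values): the string
  // built above has the shape "<flag address>-<flag value>-<library file>",
  // e.g. "0x7f2a5c0021c0-cafe1234-libomp.so"; the address/value pair is what
  // lets a later check decide whether the registering copy is still alive.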
6692 
6693  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6694  __kmp_registration_str));
6695 
6696  while (!done) {
6697 
6698  char *value = NULL; // Actual value of the environment variable.
6699 
6700 #if defined(KMP_USE_SHM)
6701  char *shm_name = __kmp_str_format("/%s", name);
6702  int shm_preexist = 0;
6703  char *data1;
6704  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6705  if ((fd1 == -1) && (errno == EEXIST)) {
6706  // file didn't open because it already exists.
6707  // try opening existing file
6708  fd1 = shm_open(shm_name, O_RDWR, 0666);
6709  if (fd1 == -1) { // file didn't open
6710  // error out here
6711  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6712  __kmp_msg_null);
6713  } else {
6714  // able to open existing file
6715  shm_preexist = 1;
6716  }
6717  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6718  // "already exists".
6719  // error out here.
6720  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6721  __kmp_msg_null);
6722  }
6723  if (shm_preexist == 0) {
6724  // we created the SHM; now set its size
6725  if (ftruncate(fd1, SHM_SIZE) == -1) {
6726  // error occurred while setting the size;
6727  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6728  KMP_ERR(errno), __kmp_msg_null);
6729  }
6730  }
6731  data1 =
6732  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6733  if (data1 == MAP_FAILED) {
6734  // failed to map shared memory
6735  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6736  __kmp_msg_null);
6737  }
6738  if (shm_preexist == 0) { // set data to SHM, set value
6739  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6740  }
6741  // Read value from either what we just wrote or existing file.
6742  value = __kmp_str_format("%s", data1); // read value from SHM
6743  munmap(data1, SHM_SIZE);
6744  close(fd1);
6745 #else // Windows and Unix with static library
6746  // Set the environment variable, but do not overwrite it if it already exists.
6747  __kmp_env_set(name, __kmp_registration_str, 0);
6748  // read value to see if it got set
6749  value = __kmp_env_get(name);
6750 #endif
6751 
6752  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6753  done = 1; // Ok, environment variable set successfully, exit the loop.
6754  } else {
6755  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6756  // Check whether it is alive or dead.
6757  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6758  char *tail = value;
6759  char *flag_addr_str = NULL;
6760  char *flag_val_str = NULL;
6761  char const *file_name = NULL;
6762  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6763  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6764  file_name = tail;
6765  if (tail != NULL) {
6766  unsigned long *flag_addr = 0;
6767  unsigned long flag_val = 0;
6768  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6769  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6770  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6771  // First, check whether environment-encoded address is mapped into
6772  // addr space.
6773  // If so, dereference it to see if it still has the right value.
6774  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6775  neighbor = 1;
6776  } else {
6777  // If not, then we know the other copy of the library is no longer
6778  // running.
6779  neighbor = 2;
6780  }
6781  }
6782  }
6783  switch (neighbor) {
6784  case 0: // Cannot parse environment variable -- neighbor status unknown.
6785  // Assume it is an incompatible format from a future version of the
6786  // library. Assume the other library is alive.
6787  // WARN( ... ); // TODO: Issue a warning.
6788  file_name = "unknown library";
6789  KMP_FALLTHROUGH();
6790  // Attention! Falling through to the next case. That's intentional.
6791  case 1: { // Neighbor is alive.
6792  // Check it is allowed.
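  // From the user's side, e.g. running with KMP_DUPLICATE_LIB_OK=TRUE (any
  // value accepted by __kmp_str_match_true) skips the fatal DuplicateLibrary
  // error below and lets this copy of the runtime proceed.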
6793  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6794  if (!__kmp_str_match_true(duplicate_ok)) {
6795  // That's not allowed. Issue fatal error.
6796  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6797  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6798  }
6799  KMP_INTERNAL_FREE(duplicate_ok);
6800  __kmp_duplicate_library_ok = 1;
6801  done = 1; // Exit the loop.
6802  } break;
6803  case 2: { // Neighbor is dead.
6804 
6805 #if defined(KMP_USE_SHM)
6806  // close shared memory.
6807  shm_unlink(shm_name); // this removes file in /dev/shm
6808 #else
6809  // Clear the variable and try to register library again.
6810  __kmp_env_unset(name);
6811 #endif
6812  } break;
6813  default: {
6814  KMP_DEBUG_ASSERT(0);
6815  } break;
6816  }
6817  }
6818  KMP_INTERNAL_FREE((void *)value);
6819 #if defined(KMP_USE_SHM)
6820  KMP_INTERNAL_FREE((void *)shm_name);
6821 #endif
6822  } // while
6823  KMP_INTERNAL_FREE((void *)name);
6824 
6825 } // func __kmp_register_library_startup
6826 
6827 void __kmp_unregister_library(void) {
6828 
6829  char *name = __kmp_reg_status_name();
6830  char *value = NULL;
6831 
6832 #if defined(KMP_USE_SHM)
6833  char *shm_name = __kmp_str_format("/%s", name);
6834  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6835  if (fd1 == -1) {
6836  // file did not open. return.
6837  return;
6838  }
6839  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6840  if (data1 != MAP_FAILED) {
6841  value = __kmp_str_format("%s", data1); // read value from SHM
6842  munmap(data1, SHM_SIZE);
6843  }
6844  close(fd1);
6845 #else
6846  value = __kmp_env_get(name);
6847 #endif
6848 
6849  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6850  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6851  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6852 // Ok, this is our variable. Delete it.
6853 #if defined(KMP_USE_SHM)
6854  shm_unlink(shm_name); // this removes file in /dev/shm
6855 #else
6856  __kmp_env_unset(name);
6857 #endif
6858  }
6859 
6860 #if defined(KMP_USE_SHM)
6861  KMP_INTERNAL_FREE(shm_name);
6862 #endif
6863 
6864  KMP_INTERNAL_FREE(__kmp_registration_str);
6865  KMP_INTERNAL_FREE(value);
6866  KMP_INTERNAL_FREE(name);
6867 
6868  __kmp_registration_flag = 0;
6869  __kmp_registration_str = NULL;
6870 
6871 } // __kmp_unregister_library
6872 
6873 // End of Library registration stuff.
6874 // -----------------------------------------------------------------------------
6875 
6876 #if KMP_MIC_SUPPORTED
6877 
6878 static void __kmp_check_mic_type() {
6879  kmp_cpuid_t cpuid_state = {0};
6880  kmp_cpuid_t *cs_p = &cpuid_state;
6881  __kmp_x86_cpuid(1, 0, cs_p);
6882  // We don't support mic1 at the moment
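  // The masks below select the family/model fields of CPUID leaf 1 EAX
  // (stepping in bits 3:0, model in 7:4, family in 11:8, extended model in
  // 19:16); 0xB10 corresponds to the KNC generation (mic2), 0x50670 to mic3.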
6883  if ((cs_p->eax & 0xff0) == 0xB10) {
6884  __kmp_mic_type = mic2;
6885  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6886  __kmp_mic_type = mic3;
6887  } else {
6888  __kmp_mic_type = non_mic;
6889  }
6890 }
6891 
6892 #endif /* KMP_MIC_SUPPORTED */
6893 
6894 #if KMP_HAVE_UMWAIT
6895 static void __kmp_user_level_mwait_init() {
6896  struct kmp_cpuid buf;
6897  __kmp_x86_cpuid(7, 0, &buf);
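  // ECX bit 5 of CPUID leaf 7 (sub-leaf 0) is the WAITPKG feature flag, i.e.
  // hardware support for UMONITOR/UMWAIT/TPAUSE; user-level mwait is enabled
  // only when both that bit and __kmp_user_level_mwait are set.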
6898  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6899  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6900  __kmp_umwait_enabled));
6901 }
6902 #elif KMP_HAVE_MWAIT
6903 #ifndef AT_INTELPHIUSERMWAIT
6904 // Spurious, non-existent value that should always fail to return anything.
6905 // Will be replaced with the correct value when we know that.
6906 #define AT_INTELPHIUSERMWAIT 10000
6907 #endif
6908 // The getauxval() function is available in RHEL7 and SLES12. If a system with
6909 // an earlier OS is used to build the RTL, we'll use the following internal
6910 // function when the entry is not found.
6911 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6912 unsigned long getauxval(unsigned long) { return 0; }
6913 
6914 static void __kmp_user_level_mwait_init() {
6915  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are available,
6916  // use them to find out whether user-level mwait is enabled. Otherwise, forcibly
6917  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6918  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6919  if (__kmp_mic_type == mic3) {
6920  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6921  if ((res & 0x1) || __kmp_user_level_mwait) {
6922  __kmp_mwait_enabled = TRUE;
6923  if (__kmp_user_level_mwait) {
6924  KMP_INFORM(EnvMwaitWarn);
6925  }
6926  } else {
6927  __kmp_mwait_enabled = FALSE;
6928  }
6929  }
6930  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6931  "__kmp_mwait_enabled = %d\n",
6932  __kmp_mic_type, __kmp_mwait_enabled));
6933 }
6934 #endif /* KMP_HAVE_UMWAIT */
6935 
6936 static void __kmp_do_serial_initialize(void) {
6937  int i, gtid;
6938  size_t size;
6939 
6940  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6941 
6942  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6943  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6944  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6945  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6946  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6947 
6948 #if OMPT_SUPPORT
6949  ompt_pre_init();
6950 #endif
6951 #if OMPD_SUPPORT
6952  __kmp_env_dump();
6953  ompd_init();
6954 #endif
6955 
6956  __kmp_validate_locks();
6957 
6958  /* Initialize internal memory allocator */
6959  __kmp_init_allocator();
6960 
6961  /* Register the library startup via an environment variable and check to see
6962  whether another copy of the library is already registered. */
6963 
6964  __kmp_register_library_startup();
6965 
6966  /* TODO reinitialization of library */
6967  if (TCR_4(__kmp_global.g.g_done)) {
6968  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6969  }
6970 
6971  __kmp_global.g.g_abort = 0;
6972  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6973 
6974 /* initialize the locks */
6975 #if KMP_USE_ADAPTIVE_LOCKS
6976 #if KMP_DEBUG_ADAPTIVE_LOCKS
6977  __kmp_init_speculative_stats();
6978 #endif
6979 #endif
6980 #if KMP_STATS_ENABLED
6981  __kmp_stats_init();
6982 #endif
6983  __kmp_init_lock(&__kmp_global_lock);
6984  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6985  __kmp_init_lock(&__kmp_debug_lock);
6986  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6987  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6988  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6989  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6990  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6991  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6992  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6993  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6994  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6995  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6996  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6997  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6998  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6999  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7000  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7001 #if KMP_USE_MONITOR
7002  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7003 #endif
7004  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7005 
7006  /* conduct initialization and initial setup of configuration */
7007 
7008  __kmp_runtime_initialize();
7009 
7010 #if KMP_MIC_SUPPORTED
7011  __kmp_check_mic_type();
7012 #endif
7013 
7014 // Some global variable initialization moved here from kmp_env_initialize()
7015 #ifdef KMP_DEBUG
7016  kmp_diag = 0;
7017 #endif
7018  __kmp_abort_delay = 0;
7019 
7020  // From __kmp_init_dflt_team_nth()
7021  /* assume the entire machine will be used */
7022  __kmp_dflt_team_nth_ub = __kmp_xproc;
7023  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7024  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7025  }
7026  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7027  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7028  }
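  // Worked example (hypothetical machine): with __kmp_xproc == 8 the upper
  // bound starts at 8; it is raised to KMP_MIN_NTH only if the machine
  // reports fewer procs than that minimum, and clamped down only if 8 were
  // to exceed __kmp_sys_max_nth.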
7029  __kmp_max_nth = __kmp_sys_max_nth;
7030  __kmp_cg_max_nth = __kmp_sys_max_nth;
7031  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7032  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7033  __kmp_teams_max_nth = __kmp_sys_max_nth;
7034  }
7035 
7036  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7037  // part
7038  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7039 #if KMP_USE_MONITOR
7040  __kmp_monitor_wakeups =
7041  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7042  __kmp_bt_intervals =
7043  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7044 #endif
7045  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7046  __kmp_library = library_throughput;
7047  // From KMP_SCHEDULE initialization
7048  __kmp_static = kmp_sch_static_balanced;
7049 // AC: do not use analytical here, because it is non-monotonic
7050 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7051 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7052 // need to repeat assignment
7053 // Barrier initialization. Moved here from __kmp_env_initialize(): barrier
7054 // branch bit control and barrier method control parts.
7055 #if KMP_FAST_REDUCTION_BARRIER
7056 #define kmp_reduction_barrier_gather_bb ((int)1)
7057 #define kmp_reduction_barrier_release_bb ((int)1)
7058 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7059 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7060 #endif // KMP_FAST_REDUCTION_BARRIER
7061  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7062  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7063  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7064  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7065  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7066 #if KMP_FAST_REDUCTION_BARRIER
7067  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7068  // lin_64 ): hyper,1
7069  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7070  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7071  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7072  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7073  }
7074 #endif // KMP_FAST_REDUCTION_BARRIER
7075  }
7076 #if KMP_FAST_REDUCTION_BARRIER
7077 #undef kmp_reduction_barrier_release_pat
7078 #undef kmp_reduction_barrier_gather_pat
7079 #undef kmp_reduction_barrier_release_bb
7080 #undef kmp_reduction_barrier_gather_bb
7081 #endif // KMP_FAST_REDUCTION_BARRIER
7082 #if KMP_MIC_SUPPORTED
7083  if (__kmp_mic_type == mic2) { // KNC
7084  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7085  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7086  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7087  1; // forkjoin release
7088  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7089  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7090  }
7091 #if KMP_FAST_REDUCTION_BARRIER
7092  if (__kmp_mic_type == mic2) { // KNC
7093  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7094  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7095  }
7096 #endif // KMP_FAST_REDUCTION_BARRIER
7097 #endif // KMP_MIC_SUPPORTED
7098 
7099 // From KMP_CHECKS initialization
7100 #ifdef KMP_DEBUG
7101  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7102 #else
7103  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7104 #endif
7105 
7106  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7107  __kmp_foreign_tp = TRUE;
7108 
7109  __kmp_global.g.g_dynamic = FALSE;
7110  __kmp_global.g.g_dynamic_mode = dynamic_default;
7111 
7112  __kmp_init_nesting_mode();
7113 
7114  __kmp_env_initialize(NULL);
7115 
7116 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7117  __kmp_user_level_mwait_init();
7118 #endif
7119 // Print all messages in message catalog for testing purposes.
7120 #ifdef KMP_DEBUG
7121  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7122  if (__kmp_str_match_true(val)) {
7123  kmp_str_buf_t buffer;
7124  __kmp_str_buf_init(&buffer);
7125  __kmp_i18n_dump_catalog(&buffer);
7126  __kmp_printf("%s", buffer.str);
7127  __kmp_str_buf_free(&buffer);
7128  }
7129  __kmp_env_free(&val);
7130 #endif
7131 
7132  __kmp_threads_capacity =
7133  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7134  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7135  __kmp_tp_capacity = __kmp_default_tp_capacity(
7136  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7137 
7138  // If the library is shut down properly, both pools must be NULL. Just in
7139  // case, set them to NULL -- some memory may leak, but subsequent code will
7140  // work even if pools are not freed.
7141  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7142  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7143  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7144  __kmp_thread_pool = NULL;
7145  __kmp_thread_pool_insert_pt = NULL;
7146  __kmp_team_pool = NULL;
7147 
7148  /* Allocate all of the variable sized records */
7149  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7150  * expandable */
7151  /* Since allocation is cache-aligned, just add extra padding at the end */
7152  size =
7153  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7154  CACHE_LINE;
7155  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7156  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7157  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7158 
7159  /* init thread counts */
7160  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7161  0); // Asserts fail if the library is reinitializing and
7162  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7163  __kmp_all_nth = 0;
7164  __kmp_nth = 0;
7165 
7166  /* setup the uber master thread and hierarchy */
7167  gtid = __kmp_register_root(TRUE);
7168  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7169  KMP_ASSERT(KMP_UBER_GTID(gtid));
7170  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7171 
7172  KMP_MB(); /* Flush all pending memory write invalidates. */
7173 
7174  __kmp_common_initialize();
7175 
7176 #if KMP_OS_UNIX
7177  /* invoke the child fork handler */
7178  __kmp_register_atfork();
7179 #endif
7180 
7181 #if !KMP_DYNAMIC_LIB
7182  {
7183  /* Invoke the exit handler when the program finishes, only for static
7184  library. For dynamic library, we already have _fini and DllMain. */
7185  int rc = atexit(__kmp_internal_end_atexit);
7186  if (rc != 0) {
7187  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7188  __kmp_msg_null);
7189  }
7190  }
7191 #endif
7192 
7193 #if KMP_HANDLE_SIGNALS
7194 #if KMP_OS_UNIX
7195  /* NOTE: make sure that this is called before the user installs their own
7196  signal handlers so that the user handlers are called first. This way they
7197  can return false, not call our handler, avoid terminating the library, and
7198  continue execution where they left off. */
7199  __kmp_install_signals(FALSE);
7200 #endif /* KMP_OS_UNIX */
7201 #if KMP_OS_WINDOWS
7202  __kmp_install_signals(TRUE);
7203 #endif /* KMP_OS_WINDOWS */
7204 #endif
7205 
7206  /* we have finished the serial initialization */
7207  __kmp_init_counter++;
7208 
7209  __kmp_init_serial = TRUE;
7210 
7211  if (__kmp_settings) {
7212  __kmp_env_print();
7213  }
7214 
7215  if (__kmp_display_env || __kmp_display_env_verbose) {
7216  __kmp_env_print_2();
7217  }
7218 
7219 #if OMPT_SUPPORT
7220  ompt_post_init();
7221 #endif
7222 
7223  KMP_MB();
7224 
7225  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7226 }
7227 
7228 void __kmp_serial_initialize(void) {
7229  if (__kmp_init_serial) {
7230  return;
7231  }
7232  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7233  if (__kmp_init_serial) {
7234  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7235  return;
7236  }
7237  __kmp_do_serial_initialize();
7238  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7239 }
7240 
7241 static void __kmp_do_middle_initialize(void) {
7242  int i, j;
7243  int prev_dflt_team_nth;
7244 
7245  if (!__kmp_init_serial) {
7246  __kmp_do_serial_initialize();
7247  }
7248 
7249  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7250 
7251  // Save the previous value for the __kmp_dflt_team_nth so that
7252  // we can avoid some reinitialization if it hasn't changed.
7253  prev_dflt_team_nth = __kmp_dflt_team_nth;
7254 
7255 #if KMP_AFFINITY_SUPPORTED
7256  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7257  // number of cores on the machine.
7258  __kmp_affinity_initialize();
7259 
7260 #endif /* KMP_AFFINITY_SUPPORTED */
7261 
7262  KMP_ASSERT(__kmp_xproc > 0);
7263  if (__kmp_avail_proc == 0) {
7264  __kmp_avail_proc = __kmp_xproc;
7265  }
7266 
7267  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7268  // correct them now
7269  j = 0;
7270  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7271  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7272  __kmp_avail_proc;
7273  j++;
7274  }
7275 
7276  if (__kmp_dflt_team_nth == 0) {
7277 #ifdef KMP_DFLT_NTH_CORES
7278  // Default #threads = #cores
7279  __kmp_dflt_team_nth = __kmp_ncores;
7280  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7281  "__kmp_ncores (%d)\n",
7282  __kmp_dflt_team_nth));
7283 #else
7284  // Default #threads = #available OS procs
7285  __kmp_dflt_team_nth = __kmp_avail_proc;
7286  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7287  "__kmp_avail_proc(%d)\n",
7288  __kmp_dflt_team_nth));
7289 #endif /* KMP_DFLT_NTH_CORES */
7290  }
7291 
7292  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7293  __kmp_dflt_team_nth = KMP_MIN_NTH;
7294  }
7295  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7296  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7297  }
7298 
7299  if (__kmp_nesting_mode > 0)
7300  __kmp_set_nesting_mode_threads();
7301 
7302  // There's no harm in continuing if the following check fails,
7303  // but it indicates an error in the previous logic.
7304  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7305 
7306  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7307  // Run through the __kmp_threads array and set the num threads icv for each
7308  // root thread that is currently registered with the RTL (which has not
7309  // already explicitly set its nthreads-var with a call to
7310  // omp_set_num_threads()).
7311  for (i = 0; i < __kmp_threads_capacity; i++) {
7312  kmp_info_t *thread = __kmp_threads[i];
7313  if (thread == NULL)
7314  continue;
7315  if (thread->th.th_current_task->td_icvs.nproc != 0)
7316  continue;
7317 
7318  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7319  }
7320  }
7321  KA_TRACE(
7322  20,
7323  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7324  __kmp_dflt_team_nth));
7325 
7326 #ifdef KMP_ADJUST_BLOCKTIME
7327  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7328  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7329  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7330  if (__kmp_nth > __kmp_avail_proc) {
7331  __kmp_zero_bt = TRUE;
7332  }
7333  }
7334 #endif /* KMP_ADJUST_BLOCKTIME */
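  // For example (hypothetical counts): with __kmp_avail_proc == 8 and more
  // than 8 threads registered, __kmp_zero_bt is set so that oversubscribed
  // idle threads stop spin-waiting immediately (an effective blocktime of
  // zero) instead of burning CPU for the full blocktime interval.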
7335 
7336  /* we have finished middle initialization */
7337  TCW_SYNC_4(__kmp_init_middle, TRUE);
7338 
7339  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7340 }
7341 
7342 void __kmp_middle_initialize(void) {
7343  if (__kmp_init_middle) {
7344  return;
7345  }
7346  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7347  if (__kmp_init_middle) {
7348  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7349  return;
7350  }
7351  __kmp_do_middle_initialize();
7352  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7353 }
7354 
7355 void __kmp_parallel_initialize(void) {
7356  int gtid = __kmp_entry_gtid(); // this might be a new root
7357 
7358  /* synchronize parallel initialization (for sibling) */
7359  if (TCR_4(__kmp_init_parallel))
7360  return;
7361  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7362  if (TCR_4(__kmp_init_parallel)) {
7363  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7364  return;
7365  }
7366 
7367  /* TODO reinitialization after we have already shut down */
7368  if (TCR_4(__kmp_global.g.g_done)) {
7369  KA_TRACE(
7370  10,
7371  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7372  __kmp_infinite_loop();
7373  }
7374 
7375  /* jc: The lock __kmp_initz_lock is already held, so calling
7376  __kmp_serial_initialize would cause a deadlock. So we call
7377  __kmp_do_serial_initialize directly. */
7378  if (!__kmp_init_middle) {
7379  __kmp_do_middle_initialize();
7380  }
7381  __kmp_assign_root_init_mask();
7382  __kmp_resume_if_hard_paused();
7383 
7384  /* begin initialization */
7385  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7386  KMP_ASSERT(KMP_UBER_GTID(gtid));
7387 
7388 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7389  // Save the FP control regs.
7390  // Worker threads will set theirs to these values at thread startup.
7391  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7392  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7393  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7394 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7395 
7396 #if KMP_OS_UNIX
7397 #if KMP_HANDLE_SIGNALS
7398  /* must be after __kmp_serial_initialize */
7399  __kmp_install_signals(TRUE);
7400 #endif
7401 #endif
7402 
7403  __kmp_suspend_initialize();
7404 
7405 #if defined(USE_LOAD_BALANCE)
7406  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7407  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7408  }
7409 #else
7410  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7411  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7412  }
7413 #endif
7414 
7415  if (__kmp_version) {
7416  __kmp_print_version_2();
7417  }
7418 
7419  /* we have finished parallel initialization */
7420  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7421 
7422  KMP_MB();
7423  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7424 
7425  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7426 }
7427 
7428 void __kmp_hidden_helper_initialize() {
7429  if (TCR_4(__kmp_init_hidden_helper))
7430  return;
7431 
7432  // __kmp_parallel_initialize is required before we initialize hidden helper
7433  if (!TCR_4(__kmp_init_parallel))
7434  __kmp_parallel_initialize();
7435 
7436  // Double check. Note that this double check should not be placed before
7437  // __kmp_parallel_initialize as it will cause a deadlock.
7438  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7439  if (TCR_4(__kmp_init_hidden_helper)) {
7440  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7441  return;
7442  }
7443 
7444  // Set the count of hidden helper tasks to be executed to zero
7445  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7446 
7447  // Set the global variable indicating that we're initializing hidden helper
7448  // team/threads
7449  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7450 
7451  // Platform independent initialization
7452  __kmp_do_initialize_hidden_helper_threads();
7453 
7454  // Wait here for the finish of initialization of hidden helper teams
7455  __kmp_hidden_helper_threads_initz_wait();
7456 
7457  // We have finished hidden helper initialization
7458  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7459 
7460  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7461 }
7462 
7463 /* ------------------------------------------------------------------------ */
7464 
7465 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7466  kmp_team_t *team) {
7467  kmp_disp_t *dispatch;
7468 
7469  KMP_MB();
7470 
7471  /* none of the threads have encountered any constructs, yet. */
7472  this_thr->th.th_local.this_construct = 0;
7473 #if KMP_CACHE_MANAGE
7474  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7475 #endif /* KMP_CACHE_MANAGE */
7476  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7477  KMP_DEBUG_ASSERT(dispatch);
7478  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7479  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7480  // this_thr->th.th_info.ds.ds_tid ] );
7481 
7482  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7483  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7484  if (__kmp_env_consistency_check)
7485  __kmp_push_parallel(gtid, team->t.t_ident);
7486 
7487  KMP_MB(); /* Flush all pending memory write invalidates. */
7488 }
7489 
7490 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7491  kmp_team_t *team) {
7492  if (__kmp_env_consistency_check)
7493  __kmp_pop_parallel(gtid, team->t.t_ident);
7494 
7495  __kmp_finish_implicit_task(this_thr);
7496 }
7497 
7498 int __kmp_invoke_task_func(int gtid) {
7499  int rc;
7500  int tid = __kmp_tid_from_gtid(gtid);
7501  kmp_info_t *this_thr = __kmp_threads[gtid];
7502  kmp_team_t *team = this_thr->th.th_team;
7503 
7504  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7505 #if USE_ITT_BUILD
7506  if (__itt_stack_caller_create_ptr) {
7507  // inform ittnotify about entering user's code
7508  if (team->t.t_stack_id != NULL) {
7509  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7510  } else {
7511  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7512  __kmp_itt_stack_callee_enter(
7513  (__itt_caller)team->t.t_parent->t.t_stack_id);
7514  }
7515  }
7516 #endif /* USE_ITT_BUILD */
7517 #if INCLUDE_SSC_MARKS
7518  SSC_MARK_INVOKING();
7519 #endif
7520 
7521 #if OMPT_SUPPORT
7522  void *dummy;
7523  void **exit_frame_p;
7524  ompt_data_t *my_task_data;
7525  ompt_data_t *my_parallel_data;
7526  int ompt_team_size;
7527 
7528  if (ompt_enabled.enabled) {
7529  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7530  .ompt_task_info.frame.exit_frame.ptr);
7531  } else {
7532  exit_frame_p = &dummy;
7533  }
7534 
7535  my_task_data =
7536  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7537  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7538  if (ompt_enabled.ompt_callback_implicit_task) {
7539  ompt_team_size = team->t.t_nproc;
7540  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7541  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7542  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7543  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7544  }
7545 #endif
7546 
7547 #if KMP_STATS_ENABLED
7548  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7549  if (previous_state == stats_state_e::TEAMS_REGION) {
7550  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7551  } else {
7552  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7553  }
7554  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7555 #endif
7556 
7557  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7558  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7559 #if OMPT_SUPPORT
7560  ,
7561  exit_frame_p
7562 #endif
7563  );
7564 #if OMPT_SUPPORT
7565  *exit_frame_p = NULL;
7566  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7567 #endif
7568 
7569 #if KMP_STATS_ENABLED
7570  if (previous_state == stats_state_e::TEAMS_REGION) {
7571  KMP_SET_THREAD_STATE(previous_state);
7572  }
7573  KMP_POP_PARTITIONED_TIMER();
7574 #endif
7575 
7576 #if USE_ITT_BUILD
7577  if (__itt_stack_caller_create_ptr) {
7578  // inform ittnotify about leaving user's code
7579  if (team->t.t_stack_id != NULL) {
7580  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7581  } else {
7582  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7583  __kmp_itt_stack_callee_leave(
7584  (__itt_caller)team->t.t_parent->t.t_stack_id);
7585  }
7586  }
7587 #endif /* USE_ITT_BUILD */
7588  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7589 
7590  return rc;
7591 }
7592 
7593 void __kmp_teams_master(int gtid) {
7594  // This routine is called by all primary threads in a teams construct
7595  kmp_info_t *thr = __kmp_threads[gtid];
7596  kmp_team_t *team = thr->th.th_team;
7597  ident_t *loc = team->t.t_ident;
7598  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7599  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7600  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7601  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7602  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7603 
7604  // This thread is a new CG root. Set up the proper variables.
7605  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7606  tmp->cg_root = thr; // Make thr the CG root
7607  // Init to thread limit stored when league primary threads were forked
7608  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7609  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7610  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7611  " cg_nthreads to 1\n",
7612  thr, tmp));
7613  tmp->up = thr->th.th_cg_roots;
7614  thr->th.th_cg_roots = tmp;
7615 
7616 // Launch the league of teams now, but do not let the workers execute
7617 // (they hang on the fork barrier until the next parallel region)
7618 #if INCLUDE_SSC_MARKS
7619  SSC_MARK_FORKING();
7620 #endif
7621  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7622  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7623  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7624 #if INCLUDE_SSC_MARKS
7625  SSC_MARK_JOINING();
7626 #endif
7627  // If the team size was reduced from the limit, set it to the new size
7628  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7629  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7630  // AC: last parameter "1" eliminates join barrier which won't work because
7631  // worker threads are in a fork barrier waiting for more parallel regions
7632  __kmp_join_call(loc, gtid
7633 #if OMPT_SUPPORT
7634  ,
7635  fork_context_intel
7636 #endif
7637  ,
7638  1);
7639 }
7640 
7641 int __kmp_invoke_teams_master(int gtid) {
7642  kmp_info_t *this_thr = __kmp_threads[gtid];
7643  kmp_team_t *team = this_thr->th.th_team;
7644 #if KMP_DEBUG
7645  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7646  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7647  (void *)__kmp_teams_master);
7648 #endif
7649  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7650 #if OMPT_SUPPORT
7651  int tid = __kmp_tid_from_gtid(gtid);
7652  ompt_data_t *task_data =
7653  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7654  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7655  if (ompt_enabled.ompt_callback_implicit_task) {
7656  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7657  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7658  ompt_task_initial);
7659  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7660  }
7661 #endif
7662  __kmp_teams_master(gtid);
7663 #if OMPT_SUPPORT
7664  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7665 #endif
7666  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7667  return 1;
7668 }
7669 
7670 /* this sets the requested number of threads for the next parallel region
7671  encountered by this team. since this should be enclosed in the forkjoin
7672  critical section it should avoid race conditions with asymmetrical nested
7673  parallelism */
7674 
7675 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7676  kmp_info_t *thr = __kmp_threads[gtid];
7677 
7678  if (num_threads > 0)
7679  thr->th.th_set_nproc = num_threads;
7680 }
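/* [Editorial example] A minimal user-side sketch (not part of this file) of how
   a num_threads clause reaches __kmp_push_num_threads: the compiler typically
   emits a call to the __kmpc_push_num_threads entry point (kmp_csupport.cpp)
   right before the fork of the parallel region. Guarded out of the build. */
#if 0
#include <omp.h>
#include <cstdio>
int main() {
  // The clause value is recorded in th_set_nproc for the encountering thread
  // and consumed when the next parallel region forks.
  #pragma omp parallel num_threads(4)
  std::printf("thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
  return 0;
}
#endif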
7681 
7682 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7683  int num_threads) {
7684  KMP_DEBUG_ASSERT(thr);
7685  // Remember the number of threads for inner parallel regions
7686  if (!TCR_4(__kmp_init_middle))
7687  __kmp_middle_initialize(); // get internal globals calculated
7688  __kmp_assign_root_init_mask();
7689  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7690  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7691 
7692  if (num_threads == 0) {
7693  if (__kmp_teams_thread_limit > 0) {
7694  num_threads = __kmp_teams_thread_limit;
7695  } else {
7696  num_threads = __kmp_avail_proc / num_teams;
7697  }
7698  // adjust num_threads w/o warning as it is not user setting
7699  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7700  // no thread_limit clause specified - do not change thread-limit-var ICV
7701  if (num_threads > __kmp_dflt_team_nth) {
7702  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7703  }
7704  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7705  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7706  } // prevent team size from exceeding thread-limit-var
7707  if (num_teams * num_threads > __kmp_teams_max_nth) {
7708  num_threads = __kmp_teams_max_nth / num_teams;
7709  }
7710  if (num_threads == 0) {
7711  num_threads = 1;
7712  }
7713  } else {
7714  if (num_threads < 0) {
7715  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7716  __kmp_msg_null);
7717  num_threads = 1;
7718  }
7719  // This thread will be the primary thread of the league's primary threads
7720  // Store new thread limit; old limit is saved in th_cg_roots list
7721  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7722  // num_threads = min(num_threads, nthreads-var)
7723  if (num_threads > __kmp_dflt_team_nth) {
7724  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7725  }
7726  if (num_teams * num_threads > __kmp_teams_max_nth) {
7727  int new_threads = __kmp_teams_max_nth / num_teams;
7728  if (new_threads == 0) {
7729  new_threads = 1;
7730  }
7731  if (new_threads != num_threads) {
7732  if (!__kmp_reserve_warn) { // user asked for too many threads
7733  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7734  __kmp_msg(kmp_ms_warning,
7735  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7736  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7737  }
7738  }
7739  num_threads = new_threads;
7740  }
7741  }
7742  thr->th.th_teams_size.nth = num_threads;
7743 }
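/* [Editorial example] A standalone model of the clamping performed above when no
   thread_limit clause is present. The function name and parameters are
   hypothetical; they only mirror the globals/ICVs used by __kmp_push_thread_limit
   to make the min-chain explicit. Guarded out of the build. */
#if 0
#include <algorithm>
#include <cstdio>
static int model_teams_thread_limit(int num_teams, int avail_proc,
                                    int teams_thread_limit, // KMP_TEAMS_THREAD_LIMIT (0 if unset)
                                    int nthreads_var,       // __kmp_dflt_team_nth
                                    int thread_limit_var,   // td_icvs.thread_limit
                                    int teams_max_nth) {    // __kmp_teams_max_nth
  int n = teams_thread_limit > 0 ? teams_thread_limit : avail_proc / num_teams;
  n = std::min(n, nthreads_var);     // honor nthreads-var ICV
  n = std::min(n, thread_limit_var); // do not exceed thread-limit-var
  if (num_teams * n > teams_max_nth) // cap the total thread count across teams
    n = teams_max_nth / num_teams;
  return n > 0 ? n : 1;              // never report fewer than one thread
}
int main() {
  // 4 teams on a 64-proc machine with permissive caps -> 16 threads per team
  std::printf("%d\n", model_teams_thread_limit(4, 64, 0, 64, 64, 256));
  return 0;
}
#endif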
7744 
7745 /* this sets the requested number of teams for the teams region and/or
7746  the number of threads for the next parallel region encountered */
7747 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7748  int num_threads) {
7749  kmp_info_t *thr = __kmp_threads[gtid];
7750  if (num_teams < 0) {
7751  // OpenMP specification requires requested values to be positive,
7752  // but people can send us any value, so we'd better check
7753  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7754  __kmp_msg_null);
7755  num_teams = 1;
7756  }
7757  if (num_teams == 0) {
7758  if (__kmp_nteams > 0) {
7759  num_teams = __kmp_nteams;
7760  } else {
7761  num_teams = 1; // default number of teams is 1.
7762  }
7763  }
7764  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7765  if (!__kmp_reserve_warn) {
7766  __kmp_reserve_warn = 1;
7767  __kmp_msg(kmp_ms_warning,
7768  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7769  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7770  }
7771  num_teams = __kmp_teams_max_nth;
7772  }
7773  // Set number of teams (number of threads in the outer "parallel" of the
7774  // teams)
7775  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7776 
7777  __kmp_push_thread_limit(thr, num_teams, num_threads);
7778 }
7779 
7780 /* This sets the requested number of teams for the teams region and/or
7781  the number of threads for the next parallel region encountered */
7782 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7783  int num_teams_ub, int num_threads) {
7784  kmp_info_t *thr = __kmp_threads[gtid];
7785  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7786  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7787  KMP_DEBUG_ASSERT(num_threads >= 0);
7788 
7789  if (num_teams_lb > num_teams_ub) {
7790  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7791  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7792  }
7793 
7794  int num_teams = 1; // default number of teams is 1.
7795 
7796  if (num_teams_lb == 0 && num_teams_ub > 0)
7797  num_teams_lb = num_teams_ub;
7798 
7799  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7800  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7801  if (num_teams > __kmp_teams_max_nth) {
7802  if (!__kmp_reserve_warn) {
7803  __kmp_reserve_warn = 1;
7804  __kmp_msg(kmp_ms_warning,
7805  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7806  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7807  }
7808  num_teams = __kmp_teams_max_nth;
7809  }
7810  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7811  num_teams = num_teams_ub;
7812  } else { // num_teams_lb <= num_teams <= num_teams_ub
7813  if (num_threads <= 0) {
7814  if (num_teams_ub > __kmp_teams_max_nth) {
7815  num_teams = num_teams_lb;
7816  } else {
7817  num_teams = num_teams_ub;
7818  }
7819  } else {
7820  num_teams = (num_threads > __kmp_teams_max_nth)
7821  ? num_teams
7822  : __kmp_teams_max_nth / num_threads;
7823  if (num_teams < num_teams_lb) {
7824  num_teams = num_teams_lb;
7825  } else if (num_teams > num_teams_ub) {
7826  num_teams = num_teams_ub;
7827  }
7828  }
7829  }
7830  // Set number of teams (number of threads in the outer "parallel" of the
7831  // teams)
7832  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7833 
7834  __kmp_push_thread_limit(thr, num_teams, num_threads);
7835 }
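/* [Editorial example] User-side sketch (OpenMP 5.1, not part of this file) of the
   num_teams lower:upper form handled by __kmp_push_num_teams_51: the runtime
   settles on a team count in [lb, ub], subject to __kmp_teams_max_nth and the
   thread_limit clause. Guarded out of the build. */
#if 0
#include <omp.h>
#include <cstdio>
int main() {
  #pragma omp teams num_teams(2 : 8) thread_limit(4)
  {
    if (omp_get_team_num() == 0)
      std::printf("league created with %d teams\n", omp_get_num_teams());
  }
  return 0;
}
#endif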
7836 
7837 // Set the proc_bind var to use in the following parallel region.
7838 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7839  kmp_info_t *thr = __kmp_threads[gtid];
7840  thr->th.th_set_proc_bind = proc_bind;
7841 }
7842 
7843 /* Launch the worker threads into the microtask. */
7844 
7845 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7846  kmp_info_t *this_thr = __kmp_threads[gtid];
7847 
7848 #ifdef KMP_DEBUG
7849  int f;
7850 #endif /* KMP_DEBUG */
7851 
7852  KMP_DEBUG_ASSERT(team);
7853  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7854  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7855  KMP_MB(); /* Flush all pending memory write invalidates. */
7856 
7857  team->t.t_construct = 0; /* no single directives seen yet */
7858  team->t.t_ordered.dt.t_value =
7859  0; /* thread 0 enters the ordered section first */
7860 
7861  /* Reset the identifiers on the dispatch buffer */
7862  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7863  if (team->t.t_max_nproc > 1) {
7864  int i;
7865  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7866  team->t.t_disp_buffer[i].buffer_index = i;
7867  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7868  }
7869  } else {
7870  team->t.t_disp_buffer[0].buffer_index = 0;
7871  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7872  }
7873 
7874  KMP_MB(); /* Flush all pending memory write invalidates. */
7875  KMP_ASSERT(this_thr->th.th_team == team);
7876 
7877 #ifdef KMP_DEBUG
7878  for (f = 0; f < team->t.t_nproc; f++) {
7879  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7880  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7881  }
7882 #endif /* KMP_DEBUG */
7883 
7884  /* release the worker threads so they may begin working */
7885  __kmp_fork_barrier(gtid, 0);
7886 }
7887 
7888 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7889  kmp_info_t *this_thr = __kmp_threads[gtid];
7890 
7891  KMP_DEBUG_ASSERT(team);
7892  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7893  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7894  KMP_MB(); /* Flush all pending memory write invalidates. */
7895 
7896  /* Join barrier after fork */
7897 
7898 #ifdef KMP_DEBUG
7899  if (__kmp_threads[gtid] &&
7900  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7901  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7902  __kmp_threads[gtid]);
7903  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7904  "team->t.t_nproc=%d\n",
7905  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7906  team->t.t_nproc);
7907  __kmp_print_structure();
7908  }
7909  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7910  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7911 #endif /* KMP_DEBUG */
7912 
7913  __kmp_join_barrier(gtid); /* wait for everyone */
7914 #if OMPT_SUPPORT
7915  if (ompt_enabled.enabled &&
7916  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7917  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7918  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7919  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7920 #if OMPT_OPTIONAL
7921  void *codeptr = NULL;
7922  if (KMP_MASTER_TID(ds_tid) &&
7923  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7924  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7925  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7926 
7927  if (ompt_enabled.ompt_callback_sync_region_wait) {
7928  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7929  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7930  codeptr);
7931  }
7932  if (ompt_enabled.ompt_callback_sync_region) {
7933  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7934  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7935  codeptr);
7936  }
7937 #endif
7938  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7939  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7940  ompt_scope_end, NULL, task_data, 0, ds_tid,
7941  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7942  }
7943  }
7944 #endif
7945 
7946  KMP_MB(); /* Flush all pending memory write invalidates. */
7947  KMP_ASSERT(this_thr->th.th_team == team);
7948 }
7949 
7950 /* ------------------------------------------------------------------------ */
7951 
7952 #ifdef USE_LOAD_BALANCE
7953 
7954 // Return the number of worker threads actively spinning in the hot team,
7955 // if we are at the outermost level of parallelism. Otherwise, return 0.
7956 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7957  int i;
7958  int retval;
7959  kmp_team_t *hot_team;
7960 
7961  if (root->r.r_active) {
7962  return 0;
7963  }
7964  hot_team = root->r.r_hot_team;
7965  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7966  return hot_team->t.t_nproc - 1; // Don't count primary thread
7967  }
7968 
7969  // Skip the primary thread - it is accounted for elsewhere.
7970  retval = 0;
7971  for (i = 1; i < hot_team->t.t_nproc; i++) {
7972  if (hot_team->t.t_threads[i]->th.th_active) {
7973  retval++;
7974  }
7975  }
7976  return retval;
7977 }
7978 
7979 // Perform an automatic adjustment to the number of
7980 // threads used by the next parallel region.
7981 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7982  int retval;
7983  int pool_active;
7984  int hot_team_active;
7985  int team_curr_active;
7986  int system_active;
7987 
7988  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7989  set_nproc));
7990  KMP_DEBUG_ASSERT(root);
7991  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7992  ->th.th_current_task->td_icvs.dynamic == TRUE);
7993  KMP_DEBUG_ASSERT(set_nproc > 1);
7994 
7995  if (set_nproc == 1) {
7996  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7997  return 1;
7998  }
7999 
8000  // Threads that are active in the thread pool, active in the hot team for this
8001  // particular root (if we are at the outer par level), and the currently
8002  // executing thread (to become the primary thread) are available to add to the
8003  // new team, but are currently contributing to the system load, and must be
8004  // accounted for.
8005  pool_active = __kmp_thread_pool_active_nth;
8006  hot_team_active = __kmp_active_hot_team_nproc(root);
8007  team_curr_active = pool_active + hot_team_active + 1;
8008 
8009  // Check the system load.
8010  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8011  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8012  "hot team active = %d\n",
8013  system_active, pool_active, hot_team_active));
8014 
8015  if (system_active < 0) {
8016  // There was an error reading the necessary info from /proc, so use the
8017  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8018  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8019  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8020  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8021 
8022  // Make this call behave like the thread limit algorithm.
8023  retval = __kmp_avail_proc - __kmp_nth +
8024  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8025  if (retval > set_nproc) {
8026  retval = set_nproc;
8027  }
8028  if (retval < KMP_MIN_NTH) {
8029  retval = KMP_MIN_NTH;
8030  }
8031 
8032  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8033  retval));
8034  return retval;
8035  }
8036 
8037  // There is a slight delay in the load balance algorithm in detecting new
8038  // running procs. The real system load at this instant should be at least as
8039  // large as the number of active OpenMP threads available to add to the team.
8040  if (system_active < team_curr_active) {
8041  system_active = team_curr_active;
8042  }
8043  retval = __kmp_avail_proc - system_active + team_curr_active;
8044  if (retval > set_nproc) {
8045  retval = set_nproc;
8046  }
8047  if (retval < KMP_MIN_NTH) {
8048  retval = KMP_MIN_NTH;
8049  }
8050 
8051  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8052  return retval;
8053 } // __kmp_load_balance_nproc()
8054 
8055 #endif /* USE_LOAD_BALANCE */
8056 
8057 /* ------------------------------------------------------------------------ */
8058 
8059 /* NOTE: this is called with the __kmp_init_lock held */
8060 void __kmp_cleanup(void) {
8061  int f;
8062 
8063  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8064 
8065  if (TCR_4(__kmp_init_parallel)) {
8066 #if KMP_HANDLE_SIGNALS
8067  __kmp_remove_signals();
8068 #endif
8069  TCW_4(__kmp_init_parallel, FALSE);
8070  }
8071 
8072  if (TCR_4(__kmp_init_middle)) {
8073 #if KMP_AFFINITY_SUPPORTED
8074  __kmp_affinity_uninitialize();
8075 #endif /* KMP_AFFINITY_SUPPORTED */
8076  __kmp_cleanup_hierarchy();
8077  TCW_4(__kmp_init_middle, FALSE);
8078  }
8079 
8080  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8081 
8082  if (__kmp_init_serial) {
8083  __kmp_runtime_destroy();
8084  __kmp_init_serial = FALSE;
8085  }
8086 
8087  __kmp_cleanup_threadprivate_caches();
8088 
8089  for (f = 0; f < __kmp_threads_capacity; f++) {
8090  if (__kmp_root[f] != NULL) {
8091  __kmp_free(__kmp_root[f]);
8092  __kmp_root[f] = NULL;
8093  }
8094  }
8095  __kmp_free(__kmp_threads);
8096  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8097  // there is no need to free __kmp_root separately.
8098  __kmp_threads = NULL;
8099  __kmp_root = NULL;
8100  __kmp_threads_capacity = 0;
8101 
8102 #if KMP_USE_DYNAMIC_LOCK
8103  __kmp_cleanup_indirect_user_locks();
8104 #else
8105  __kmp_cleanup_user_locks();
8106 #endif
8107 #if OMPD_SUPPORT
8108  if (ompd_state) {
8109  __kmp_free(ompd_env_block);
8110  ompd_env_block = NULL;
8111  ompd_env_block_size = 0;
8112  }
8113 #endif
8114 
8115 #if KMP_AFFINITY_SUPPORTED
8116  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8117  __kmp_cpuinfo_file = NULL;
8118 #endif /* KMP_AFFINITY_SUPPORTED */
8119 
8120 #if KMP_USE_ADAPTIVE_LOCKS
8121 #if KMP_DEBUG_ADAPTIVE_LOCKS
8122  __kmp_print_speculative_stats();
8123 #endif
8124 #endif
8125  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8126  __kmp_nested_nth.nth = NULL;
8127  __kmp_nested_nth.size = 0;
8128  __kmp_nested_nth.used = 0;
8129  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8130  __kmp_nested_proc_bind.bind_types = NULL;
8131  __kmp_nested_proc_bind.size = 0;
8132  __kmp_nested_proc_bind.used = 0;
8133  if (__kmp_affinity_format) {
8134  KMP_INTERNAL_FREE(__kmp_affinity_format);
8135  __kmp_affinity_format = NULL;
8136  }
8137 
8138  __kmp_i18n_catclose();
8139 
8140 #if KMP_USE_HIER_SCHED
8141  __kmp_hier_scheds.deallocate();
8142 #endif
8143 
8144 #if KMP_STATS_ENABLED
8145  __kmp_stats_fini();
8146 #endif
8147 
8148  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8149 }
8150 
8151 /* ------------------------------------------------------------------------ */
8152 
8153 int __kmp_ignore_mppbeg(void) {
8154  char *env;
8155 
8156  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8157  if (__kmp_str_match_false(env))
8158  return FALSE;
8159  }
8160  // By default __kmpc_begin() is no-op.
8161  return TRUE;
8162 }
8163 
8164 int __kmp_ignore_mppend(void) {
8165  char *env;
8166 
8167  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8168  if (__kmp_str_match_false(env))
8169  return FALSE;
8170  }
8171  // By default __kmpc_end() is no-op.
8172  return TRUE;
8173 }
8174 
8175 void __kmp_internal_begin(void) {
8176  int gtid;
8177  kmp_root_t *root;
8178 
8179  /* this is a very important step as it will register new sibling threads
8180  and assign these new uber threads a new gtid */
8181  gtid = __kmp_entry_gtid();
8182  root = __kmp_threads[gtid]->th.th_root;
8183  KMP_ASSERT(KMP_UBER_GTID(gtid));
8184 
8185  if (root->r.r_begin)
8186  return;
8187  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8188  if (root->r.r_begin) {
8189  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8190  return;
8191  }
8192 
8193  root->r.r_begin = TRUE;
8194 
8195  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8196 }
8197 
8198 /* ------------------------------------------------------------------------ */
8199 
8200 void __kmp_user_set_library(enum library_type arg) {
8201  int gtid;
8202  kmp_root_t *root;
8203  kmp_info_t *thread;
8204 
8205  /* first, make sure we are initialized so we can get our gtid */
8206 
8207  gtid = __kmp_entry_gtid();
8208  thread = __kmp_threads[gtid];
8209 
8210  root = thread->th.th_root;
8211 
8212  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8213  library_serial));
8214  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8215  thread */
8216  KMP_WARNING(SetLibraryIncorrectCall);
8217  return;
8218  }
8219 
8220  switch (arg) {
8221  case library_serial:
8222  thread->th.th_set_nproc = 0;
8223  set__nproc(thread, 1);
8224  break;
8225  case library_turnaround:
8226  thread->th.th_set_nproc = 0;
8227  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8228  : __kmp_dflt_team_nth_ub);
8229  break;
8230  case library_throughput:
8231  thread->th.th_set_nproc = 0;
8232  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8233  : __kmp_dflt_team_nth_ub);
8234  break;
8235  default:
8236  KMP_FATAL(UnknownLibraryType, arg);
8237  }
8238 
8239  __kmp_aux_set_library(arg);
8240 }
8241 
8242 void __kmp_aux_set_stacksize(size_t arg) {
8243  if (!__kmp_init_serial)
8244  __kmp_serial_initialize();
8245 
8246 #if KMP_OS_DARWIN
8247  if (arg & (0x1000 - 1)) {
8248  arg &= ~(0x1000 - 1);
8249  if (arg + 0x1000) /* check for overflow if we round up */
8250  arg += 0x1000;
8251  }
8252 #endif
8253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8254 
8255  /* only change the default stacksize before the first parallel region */
8256  if (!TCR_4(__kmp_init_parallel)) {
8257  size_t value = arg; /* argument is in bytes */
8258 
8259  if (value < __kmp_sys_min_stksize)
8260  value = __kmp_sys_min_stksize;
8261  else if (value > KMP_MAX_STKSIZE)
8262  value = KMP_MAX_STKSIZE;
8263 
8264  __kmp_stksize = value;
8265 
8266  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8267  }
8268 
8269  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8270 }
8271 
8272 /* set the behaviour of the runtime library */
8273 /* TODO this can cause some odd behaviour with sibling parallelism... */
8274 void __kmp_aux_set_library(enum library_type arg) {
8275  __kmp_library = arg;
8276 
8277  switch (__kmp_library) {
8278  case library_serial: {
8279  KMP_INFORM(LibraryIsSerial);
8280  } break;
8281  case library_turnaround:
8282  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8283  __kmp_use_yield = 2; // only yield when oversubscribed
8284  break;
8285  case library_throughput:
8286  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8287  __kmp_dflt_blocktime = 200;
8288  break;
8289  default:
8290  KMP_FATAL(UnknownLibraryType, arg);
8291  }
8292 }
8293 
8294 /* Getting team information common for all team API */
8295 // Returns NULL if not in teams construct
8296 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8297  kmp_info_t *thr = __kmp_entry_thread();
8298  teams_serialized = 0;
8299  if (thr->th.th_teams_microtask) {
8300  kmp_team_t *team = thr->th.th_team;
8301  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8302  int ii = team->t.t_level;
8303  teams_serialized = team->t.t_serialized;
8304  int level = tlevel + 1;
8305  KMP_DEBUG_ASSERT(ii >= tlevel);
8306  while (ii > level) {
8307  for (teams_serialized = team->t.t_serialized;
8308  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8309  }
8310  if (team->t.t_serialized && (!teams_serialized)) {
8311  team = team->t.t_parent;
8312  continue;
8313  }
8314  if (ii > level) {
8315  team = team->t.t_parent;
8316  ii--;
8317  }
8318  }
8319  return team;
8320  }
8321  return NULL;
8322 }
8323 
8324 int __kmp_aux_get_team_num() {
8325  int serialized;
8326  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8327  if (team) {
8328  if (serialized > 1) {
8329  return 0; // teams region is serialized ( 1 team of 1 thread ).
8330  } else {
8331  return team->t.t_master_tid;
8332  }
8333  }
8334  return 0;
8335 }
8336 
8337 int __kmp_aux_get_num_teams() {
8338  int serialized;
8339  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8340  if (team) {
8341  if (serialized > 1) {
8342  return 1;
8343  } else {
8344  return team->t.t_parent->t.t_nproc;
8345  }
8346  }
8347  return 1;
8348 }
8349 
8350 /* ------------------------------------------------------------------------ */
8351 
8352 /*
8353  * Affinity Format Parser
8354  *
8355  * Field is in form of: %[[[0].]size]type
8356  * % and type are required (%% means print a literal '%')
8357  * type is either single char or long name surrounded by {},
8358  * e.g., N or {num_threads}
8359  * 0 => leading zeros
8360  * . => right justified when size is specified
8361  * by default output is left justified
8362  * size is the *minimum* field length
8363  * All other characters are printed as is
8364  *
8365  * Available field types:
8366  * t {team_num}          - omp_get_team_num()
8367  * T {num_teams}         - omp_get_num_teams()
8368  * L {nesting_level}     - omp_get_level()
8369  * n {thread_num}        - omp_get_thread_num()
8370  * N {num_threads}       - omp_get_num_threads()
8371  * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8372  * H {host}              - name of host machine
8373  * P {process_id}, i {native_thread_id} - process id / native thread id (integer)
8374  * A {thread_affinity}   - comma separated list of integers or integer ranges (values of affinity mask)
8375  *
8376  * Implementation-specific field types can be added
8377  * If a type is unknown, print "undefined"
8378  */
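/* [Editorial example] User-side sketch exercising the format described above via
   the OpenMP 5.0 affinity-format API (omp_set_affinity_format /
   omp_display_affinity); the OMP_AFFINITY_FORMAT environment variable goes
   through the same parser. Guarded out of the build. */
#if 0
#include <omp.h>
int main() {
  // Short and long field names are interchangeable; "%0.3n" prints the thread
  // number right-justified in a 3-character field padded with leading zeros.
  omp_set_affinity_format("host=%H pid=%P thread %0.3n/%N binds to %{thread_affinity}");
  #pragma omp parallel
  omp_display_affinity(NULL); // NULL/empty format: fall back to affinity-format-var
  return 0;
}
#endif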
8379 
8380 // Structure holding the short name, long name, and corresponding data type
8381 // for snprintf. A table of these will represent the entire valid keyword
8382 // field types.
8383 typedef struct kmp_affinity_format_field_t {
8384  char short_name; // from spec e.g., L -> thread level
8385  const char *long_name; // from spec thread_level -> thread level
8386  char field_format; // data type for snprintf (typically 'd' or 's'
8387  // for integer or string)
8388 } kmp_affinity_format_field_t;
8389 
8390 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8391 #if KMP_AFFINITY_SUPPORTED
8392  {'A', "thread_affinity", 's'},
8393 #endif
8394  {'t', "team_num", 'd'},
8395  {'T', "num_teams", 'd'},
8396  {'L', "nesting_level", 'd'},
8397  {'n', "thread_num", 'd'},
8398  {'N', "num_threads", 'd'},
8399  {'a', "ancestor_tnum", 'd'},
8400  {'H', "host", 's'},
8401  {'P', "process_id", 'd'},
8402  {'i', "native_thread_id", 'd'}};
8403 
8404 // Return the number of characters it takes to hold the field
8405 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8406  const char **ptr,
8407  kmp_str_buf_t *field_buffer) {
8408  int rc, format_index, field_value;
8409  const char *width_left, *width_right;
8410  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8411  static const int FORMAT_SIZE = 20;
8412  char format[FORMAT_SIZE] = {0};
8413  char absolute_short_name = 0;
8414 
8415  KMP_DEBUG_ASSERT(gtid >= 0);
8416  KMP_DEBUG_ASSERT(th);
8417  KMP_DEBUG_ASSERT(**ptr == '%');
8418  KMP_DEBUG_ASSERT(field_buffer);
8419 
8420  __kmp_str_buf_clear(field_buffer);
8421 
8422  // Skip the initial %
8423  (*ptr)++;
8424 
8425  // Check for %% first
8426  if (**ptr == '%') {
8427  __kmp_str_buf_cat(field_buffer, "%", 1);
8428  (*ptr)++; // skip over the second %
8429  return 1;
8430  }
8431 
8432  // Parse field modifiers if they are present
8433  pad_zeros = false;
8434  if (**ptr == '0') {
8435  pad_zeros = true;
8436  (*ptr)++; // skip over 0
8437  }
8438  right_justify = false;
8439  if (**ptr == '.') {
8440  right_justify = true;
8441  (*ptr)++; // skip over .
8442  }
8443  // Parse width of field: [width_left, width_right)
8444  width_left = width_right = NULL;
8445  if (**ptr >= '0' && **ptr <= '9') {
8446  width_left = *ptr;
8447  SKIP_DIGITS(*ptr);
8448  width_right = *ptr;
8449  }
8450 
8451  // Create the format for KMP_SNPRINTF based on flags parsed above
8452  format_index = 0;
8453  format[format_index++] = '%';
8454  if (!right_justify)
8455  format[format_index++] = '-';
8456  if (pad_zeros)
8457  format[format_index++] = '0';
8458  if (width_left && width_right) {
8459  int i = 0;
8460  // Only allow 8 digit number widths.
8461  // This also prevents overflowing format variable
8462  while (i < 8 && width_left < width_right) {
8463  format[format_index++] = *width_left;
8464  width_left++;
8465  i++;
8466  }
8467  }
8468 
8469  // Parse a name (long or short)
8470  // Canonicalize the name into absolute_short_name
8471  found_valid_name = false;
8472  parse_long_name = (**ptr == '{');
8473  if (parse_long_name)
8474  (*ptr)++; // skip initial left brace
8475  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8476  sizeof(__kmp_affinity_format_table[0]);
8477  ++i) {
8478  char short_name = __kmp_affinity_format_table[i].short_name;
8479  const char *long_name = __kmp_affinity_format_table[i].long_name;
8480  char field_format = __kmp_affinity_format_table[i].field_format;
8481  if (parse_long_name) {
8482  size_t length = KMP_STRLEN(long_name);
8483  if (strncmp(*ptr, long_name, length) == 0) {
8484  found_valid_name = true;
8485  (*ptr) += length; // skip the long name
8486  }
8487  } else if (**ptr == short_name) {
8488  found_valid_name = true;
8489  (*ptr)++; // skip the short name
8490  }
8491  if (found_valid_name) {
8492  format[format_index++] = field_format;
8493  format[format_index++] = '\0';
8494  absolute_short_name = short_name;
8495  break;
8496  }
8497  }
8498  if (parse_long_name) {
8499  if (**ptr != '}') {
8500  absolute_short_name = 0;
8501  } else {
8502  (*ptr)++; // skip over the right brace
8503  }
8504  }
8505 
8506  // Attempt to fill the buffer with the requested
8507  // value using snprintf within __kmp_str_buf_print()
8508  switch (absolute_short_name) {
8509  case 't':
8510  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8511  break;
8512  case 'T':
8513  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8514  break;
8515  case 'L':
8516  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8517  break;
8518  case 'n':
8519  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8520  break;
8521  case 'H': {
8522  static const int BUFFER_SIZE = 256;
8523  char buf[BUFFER_SIZE];
8524  __kmp_expand_host_name(buf, BUFFER_SIZE);
8525  rc = __kmp_str_buf_print(field_buffer, format, buf);
8526  } break;
8527  case 'P':
8528  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8529  break;
8530  case 'i':
8531  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8532  break;
8533  case 'N':
8534  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8535  break;
8536  case 'a':
8537  field_value =
8538  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8539  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8540  break;
8541 #if KMP_AFFINITY_SUPPORTED
8542  case 'A': {
8543  kmp_str_buf_t buf;
8544  __kmp_str_buf_init(&buf);
8545  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8546  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8547  __kmp_str_buf_free(&buf);
8548  } break;
8549 #endif
8550  default:
8551  // According to the spec, if an implementation does not have info for field
8552  // type, then "undefined" is printed
8553  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8554  // Skip the field
8555  if (parse_long_name) {
8556  SKIP_TOKEN(*ptr);
8557  if (**ptr == '}')
8558  (*ptr)++;
8559  } else {
8560  (*ptr)++;
8561  }
8562  }
8563 
8564  KMP_ASSERT(format_index <= FORMAT_SIZE);
8565  return rc;
8566 }
8567 
8568 /*
8569  * Return the number of characters needed to hold the affinity string
8570  * (not including the terminating null byte).
8571  * The resulting string is written to buffer, which the caller can then
8572  * handle afterwards.
8573  */
8574 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8575  kmp_str_buf_t *buffer) {
8576  const char *parse_ptr;
8577  size_t retval;
8578  const kmp_info_t *th;
8579  kmp_str_buf_t field;
8580 
8581  KMP_DEBUG_ASSERT(buffer);
8582  KMP_DEBUG_ASSERT(gtid >= 0);
8583 
8584  __kmp_str_buf_init(&field);
8585  __kmp_str_buf_clear(buffer);
8586 
8587  th = __kmp_threads[gtid];
8588  retval = 0;
8589 
8590  // If format is NULL or zero-length string, then we use
8591  // affinity-format-var ICV
8592  parse_ptr = format;
8593  if (parse_ptr == NULL || *parse_ptr == '\0') {
8594  parse_ptr = __kmp_affinity_format;
8595  }
8596  KMP_DEBUG_ASSERT(parse_ptr);
8597 
8598  while (*parse_ptr != '\0') {
8599  // Parse a field
8600  if (*parse_ptr == '%') {
8601  // Put field in the buffer
8602  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8603  __kmp_str_buf_catbuf(buffer, &field);
8604  retval += rc;
8605  } else {
8606  // Put literal character in buffer
8607  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8608  retval++;
8609  parse_ptr++;
8610  }
8611  }
8612  __kmp_str_buf_free(&field);
8613  return retval;
8614 }
8615 
8616 // Displays the affinity string to stdout
8617 void __kmp_aux_display_affinity(int gtid, const char *format) {
8618  kmp_str_buf_t buf;
8619  __kmp_str_buf_init(&buf);
8620  __kmp_aux_capture_affinity(gtid, format, &buf);
8621  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8622  __kmp_str_buf_free(&buf);
8623 }
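/* [Editorial example] User-side sketch of omp_capture_affinity(), the spec-level
   routine that lands in __kmp_aux_capture_affinity above. As documented there,
   the return value is the number of characters needed (excluding the null
   terminator), so callers can detect truncation and retry. Guarded out of the
   build. */
#if 0
#include <omp.h>
#include <cstdio>
#include <vector>
int main() {
  #pragma omp parallel
  {
    char small_buf[8];
    size_t needed = omp_capture_affinity(small_buf, sizeof(small_buf), "%A");
    if (needed >= sizeof(small_buf)) { // output did not fit; grow and retry
      std::vector<char> big(needed + 1);
      omp_capture_affinity(big.data(), big.size(), "%A");
      std::printf("T#%d affinity: %s\n", omp_get_thread_num(), big.data());
    } else {
      std::printf("T#%d affinity: %s\n", omp_get_thread_num(), small_buf);
    }
  }
  return 0;
}
#endif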
8624 
8625 /* ------------------------------------------------------------------------ */
8626 
8627 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8628  int blocktime = arg; /* argument is in milliseconds */
8629 #if KMP_USE_MONITOR
8630  int bt_intervals;
8631 #endif
8632  kmp_int8 bt_set;
8633 
8634  __kmp_save_internal_controls(thread);
8635 
8636  /* Normalize and set blocktime for the teams */
8637  if (blocktime < KMP_MIN_BLOCKTIME)
8638  blocktime = KMP_MIN_BLOCKTIME;
8639  else if (blocktime > KMP_MAX_BLOCKTIME)
8640  blocktime = KMP_MAX_BLOCKTIME;
8641 
8642  set__blocktime_team(thread->th.th_team, tid, blocktime);
8643  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8644 
8645 #if KMP_USE_MONITOR
8646  /* Calculate and set blocktime intervals for the teams */
8647  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8648 
8649  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8650  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8651 #endif
8652 
8653  /* Set whether blocktime has been set to "TRUE" */
8654  bt_set = TRUE;
8655 
8656  set__bt_set_team(thread->th.th_team, tid, bt_set);
8657  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8658 #if KMP_USE_MONITOR
8659  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8660  "bt_intervals=%d, monitor_updates=%d\n",
8661  __kmp_gtid_from_tid(tid, thread->th.th_team),
8662  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8663  __kmp_monitor_wakeups));
8664 #else
8665  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8666  __kmp_gtid_from_tid(tid, thread->th.th_team),
8667  thread->th.th_team->t.t_id, tid, blocktime));
8668 #endif
8669 }
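/* [Editorial example] User-side sketch: the kmp_set_blocktime() extension routine
   (note the trace tag above) is the usual way to reach __kmp_aux_set_blocktime;
   the KMP_BLOCKTIME environment variable sets the same control globally. The
   argument is in milliseconds and is clamped to
   [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] as shown above. Guarded out of the
   build. */
#if 0
#include <omp.h>
int main() {
  kmp_set_blocktime(0); // idle workers go to sleep immediately instead of spin-waiting
  #pragma omp parallel
  {
    // ... work ...
  }
  return 0;
}
#endif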
8670 
8671 void __kmp_aux_set_defaults(char const *str, size_t len) {
8672  if (!__kmp_init_serial) {
8673  __kmp_serial_initialize();
8674  }
8675  __kmp_env_initialize(str);
8676 
8677  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8678  __kmp_env_print();
8679  }
8680 } // __kmp_aux_set_defaults
8681 
8682 /* ------------------------------------------------------------------------ */
8683 /* internal fast reduction routines */
8684 
8685 PACKED_REDUCTION_METHOD_T
8686 __kmp_determine_reduction_method(
8687  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8688  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8689  kmp_critical_name *lck) {
8690 
8691  // Default reduction method: critical construct ( lck != NULL, like in current
8692  // PAROPT )
8693  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8694  // can be selected by RTL
8695  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8696  // can be selected by RTL
8697  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8698  // among generated by PAROPT.
8699 
8700  PACKED_REDUCTION_METHOD_T retval;
8701 
8702  int team_size;
8703 
8704  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8705  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8706 
8707 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8708  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8709 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8710 
8711  retval = critical_reduce_block;
8712 
8713  // another way of getting the team size (with 1 dynamic dereference) is slower
8714  team_size = __kmp_get_team_num_threads(global_tid);
8715  if (team_size == 1) {
8716 
8717  retval = empty_reduce_block;
8718 
8719  } else {
8720 
8721  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8722 
8723 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8724  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8725 
8726 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8727  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8728 
8729  int teamsize_cutoff = 4;
8730 
8731 #if KMP_MIC_SUPPORTED
8732  if (__kmp_mic_type != non_mic) {
8733  teamsize_cutoff = 8;
8734  }
8735 #endif
8736  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8737  if (tree_available) {
8738  if (team_size <= teamsize_cutoff) {
8739  if (atomic_available) {
8740  retval = atomic_reduce_block;
8741  }
8742  } else {
8743  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8744  }
8745  } else if (atomic_available) {
8746  retval = atomic_reduce_block;
8747  }
8748 #else
8749 #error "Unknown or unsupported OS"
8750 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8751  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8752 
8753 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8754 
8755 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8756 
8757  // basic tuning
8758 
8759  if (atomic_available) {
8760  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8761  retval = atomic_reduce_block;
8762  }
8763  } // otherwise: use critical section
8764 
8765 #elif KMP_OS_DARWIN
8766 
8767  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8768  if (atomic_available && (num_vars <= 3)) {
8769  retval = atomic_reduce_block;
8770  } else if (tree_available) {
8771  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8772  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8773  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8774  }
8775  } // otherwise: use critical section
8776 
8777 #else
8778 #error "Unknown or unsupported OS"
8779 #endif
8780 
8781 #else
8782 #error "Unknown or unsupported architecture"
8783 #endif
8784  }
8785 
8786  // KMP_FORCE_REDUCTION
8787 
8788  // If the team is serialized (team_size == 1), ignore the forced reduction
8789  // method and stay with the unsynchronized method (empty_reduce_block)
8790  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8791  team_size != 1) {
8792 
8793  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8794 
8795  int atomic_available, tree_available;
8796 
8797  switch ((forced_retval = __kmp_force_reduction_method)) {
8798  case critical_reduce_block:
8799  KMP_ASSERT(lck); // lck should be != 0
8800  break;
8801 
8802  case atomic_reduce_block:
8803  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8804  if (!atomic_available) {
8805  KMP_WARNING(RedMethodNotSupported, "atomic");
8806  forced_retval = critical_reduce_block;
8807  }
8808  break;
8809 
8810  case tree_reduce_block:
8811  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8812  if (!tree_available) {
8813  KMP_WARNING(RedMethodNotSupported, "tree");
8814  forced_retval = critical_reduce_block;
8815  } else {
8816 #if KMP_FAST_REDUCTION_BARRIER
8817  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8818 #endif
8819  }
8820  break;
8821 
8822  default:
8823  KMP_ASSERT(0); // "unsupported method specified"
8824  }
8825 
8826  retval = forced_retval;
8827  }
8828 
8829  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8830 
8831 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8832 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8833 
8834  return (retval);
8835 }
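/* [Editorial example] A standalone model of the x86_64/Linux branch of the
   selection logic above (teamsize_cutoff taken as 4, i.e., non-MIC hardware).
   The enum and function are hypothetical and simply restate the decision tree
   for readability. Guarded out of the build. */
#if 0
#include <cstdio>
enum model_method { model_empty, model_critical, model_atomic, model_tree };
static model_method model_reduction_method(int team_size, bool atomic_available,
                                           bool tree_available) {
  const int teamsize_cutoff = 4; // raised to 8 on MIC (see KMP_MIC_SUPPORTED above)
  if (team_size == 1)
    return model_empty; // serialized team: no synchronization needed
  if (tree_available)
    return (team_size <= teamsize_cutoff)
               ? (atomic_available ? model_atomic : model_critical)
               : model_tree; // tree reduce with reduction barrier
  return atomic_available ? model_atomic : model_critical;
}
int main() {
  std::printf("%d %d\n", model_reduction_method(4, true, true),  // -> atomic
              model_reduction_method(16, true, true));           // -> tree
  return 0;
}
#endif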
8836 // this function is for testing set/get/determine reduce method
8837 kmp_int32 __kmp_get_reduce_method(void) {
8838  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8839 }
8840 
8841 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8842 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8843 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8844 
8845 // Hard pause shuts down the runtime completely. Resume happens naturally when
8846 // OpenMP is used subsequently.
8847 void __kmp_hard_pause() {
8848  __kmp_pause_status = kmp_hard_paused;
8849  __kmp_internal_end_thread(-1);
8850 }
8851 
8852 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8853 void __kmp_resume_if_soft_paused() {
8854  if (__kmp_pause_status == kmp_soft_paused) {
8855  __kmp_pause_status = kmp_not_paused;
8856 
8857  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8858  kmp_info_t *thread = __kmp_threads[gtid];
8859  if (thread) { // Wake it if sleeping
8860  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8861  thread);
8862  if (fl.is_sleeping())
8863  fl.resume(gtid);
8864  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8865  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8866  } else { // thread holds the lock and may sleep soon
8867  do { // until either the thread sleeps, or we can get the lock
8868  if (fl.is_sleeping()) {
8869  fl.resume(gtid);
8870  break;
8871  } else if (__kmp_try_suspend_mx(thread)) {
8872  __kmp_unlock_suspend_mx(thread);
8873  break;
8874  }
8875  } while (1);
8876  }
8877  }
8878  }
8879  }
8880 }
8881 
8882 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8883 // TODO: add warning messages
8884 int __kmp_pause_resource(kmp_pause_status_t level) {
8885  if (level == kmp_not_paused) { // requesting resume
8886  if (__kmp_pause_status == kmp_not_paused) {
8887  // error message about runtime not being paused, so can't resume
8888  return 1;
8889  } else {
8890  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8891  __kmp_pause_status == kmp_hard_paused);
8892  __kmp_pause_status = kmp_not_paused;
8893  return 0;
8894  }
8895  } else if (level == kmp_soft_paused) { // requesting soft pause
8896  if (__kmp_pause_status != kmp_not_paused) {
8897  // error message about already being paused
8898  return 1;
8899  } else {
8900  __kmp_soft_pause();
8901  return 0;
8902  }
8903  } else if (level == kmp_hard_paused) { // requesting hard pause
8904  if (__kmp_pause_status != kmp_not_paused) {
8905  // error message about already being paused
8906  return 1;
8907  } else {
8908  __kmp_hard_pause();
8909  return 0;
8910  }
8911  } else {
8912  // error message about invalid level
8913  return 1;
8914  }
8915 }
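/* [Editorial example] User-side sketch: omp_pause_resource_all() (OpenMP 5.0) is
   the usual way to reach this code path via __kmpc_pause_resource; like
   __kmp_pause_resource it returns 0 on success and nonzero otherwise. Guarded
   out of the build. */
#if 0
#include <omp.h>
#include <cstdio>
int main() {
  #pragma omp parallel
  {
    // ... first phase of work ...
  }
  // Soft pause: workers ignore blocktime and go to sleep until the next
  // parallel region wakes them (__kmp_soft_pause / __kmp_resume_if_soft_paused).
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    std::printf("pause request was rejected\n");
  #pragma omp parallel
  {
    // ... the runtime resumes transparently ...
  }
  return 0;
}
#endif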
8916 
8917 void __kmp_omp_display_env(int verbose) {
8918  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8919  if (__kmp_init_serial == 0)
8920  __kmp_do_serial_initialize();
8921  __kmp_display_env_impl(!verbose, verbose);
8922  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8923 }
8924 
8925 // The team size is changing, so distributed barrier must be modified
8926 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8927  int new_nthreads) {
8928  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8929  bp_dist_bar);
8930  kmp_info_t **other_threads = team->t.t_threads;
8931 
8932  // We want all the workers to stop waiting on the barrier while we adjust the
8933  // size of the team.
8934  for (int f = 1; f < old_nthreads; ++f) {
8935  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8936  // Ignore threads that are already inactive or not present in the team
8937  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8938  // teams construct causes thread_limit to get passed in, and some of
8939  // those could be inactive; just ignore them
8940  continue;
8941  }
8942  // If thread is transitioning still to in_use state, wait for it
8943  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8944  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8945  KMP_CPU_PAUSE();
8946  }
8947  // The thread should be in_use now
8948  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8949  // Transition to unused state
8950  team->t.t_threads[f]->th.th_used_in_team.store(2);
8951  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8952  }
8953  // Release all the workers
8954  kmp_uint64 new_value; // new value for go
8955  new_value = team->t.b->go_release();
8956 
8957  KMP_MFENCE();
8958 
8959  // Workers should see transition status 2 and move to 0; but may need to be
8960  // woken up first
8961  size_t my_go_index;
8962  int count = old_nthreads - 1;
8963  while (count > 0) {
8964  count = old_nthreads - 1;
8965  for (int f = 1; f < old_nthreads; ++f) {
8966  my_go_index = f / team->t.b->threads_per_go;
8967  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8968  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8969  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8970  void *, other_threads[f]->th.th_sleep_loc);
8971  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8972  }
8973  } else {
8974  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8975  count--;
8976  }
8977  }
8978  }
8979  // Now update the barrier size
8980  team->t.b->update_num_threads(new_nthreads);
8981  team->t.b->go_reset();
8982 }
8983 
8984 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8985  // Add the threads back to the team
8986  KMP_DEBUG_ASSERT(team);
8987  // Threads were paused and pointed at th_used_in_team temporarily during a
8988  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8989  // the thread that it should transition itself back into the team. Then, if
8990  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8991  // to wake it up.
8992  for (int f = 1; f < new_nthreads; ++f) {
8993  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8994  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8995  3);
8996  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8997  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
8998  (kmp_flag_32<false, false> *)NULL);
8999  }
9000  }
9001  // The threads should be transitioning to the team; when they are done, they
9002  // should have set th_used_in_team to 1. This loop forces the primary thread
9003  // to wait until all threads have moved into the team and wait in the barrier.
9004  int count = new_nthreads - 1;
9005  while (count > 0) {
9006  count = new_nthreads - 1;
9007  for (int f = 1; f < new_nthreads; ++f) {
9008  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9009  count--;
9010  }
9011  }
9012  }
9013 }
9014 
9015 // Globals and functions for hidden helper task
9016 kmp_info_t **__kmp_hidden_helper_threads;
9017 kmp_info_t *__kmp_hidden_helper_main_thread;
9018 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9019 #if KMP_OS_LINUX
9020 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9021 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9022 #else
9023 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9024 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9025 #endif
9026 
9027 namespace {
9028 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9029 
9030 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9031  // This is an explicit synchronization across all hidden helper threads, in
9032  // case a regular thread pushes a hidden helper task to a hidden helper
9033  // thread that has not been awakened since it was released by the main
9034  // thread after the team was created.
9035  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9036  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9037  __kmp_hidden_helper_threads_num)
9038  ;
9039 
9040  // If main thread, then wait for signal
9041  if (__kmpc_master(nullptr, *gtid)) {
9042  // First, unset the initial state and release the initial thread
9043  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9044  __kmp_hidden_helper_initz_release();
9045  __kmp_hidden_helper_main_thread_wait();
9046  // Now wake up all worker threads
9047  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9048  __kmp_hidden_helper_worker_thread_signal();
9049  }
9050  }
9051 }
9052 } // namespace
9053 
9054 void __kmp_hidden_helper_threads_initz_routine() {
9055  // Create a new root for hidden helper team/threads
9056  const int gtid = __kmp_register_root(TRUE);
9057  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9058  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9059  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9060  __kmp_hidden_helper_threads_num;
9061 
9062  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9063 
9064  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9065 
9066  // Set the initialization flag to FALSE
9067  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9068 
9069  __kmp_hidden_helper_threads_deinitz_release();
9070 }
9071 
9072 /* Nesting Mode:
9073  Set via KMP_NESTING_MODE, which takes an integer.
9074  Note: we skip duplicate topology levels, and skip levels with only
9075  one entity.
9076  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9077  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9078  in the topology, and initializes the number of threads at each of those
9079  levels to the number of entities at each level, respectively, below the
9080  entity at the parent level.
9081  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9082  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9083  the user to turn nesting on explicitly. This is an even more experimental
9084  option to this experimental feature, and may change or go away in the
9085  future.
9086 */
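/* [Editorial example] User-side sketch for the experimental KMP_NESTING_MODE=1
   setting described above: nesting is enabled and per-level thread counts are
   derived from the machine topology (e.g., sockets, then cores per socket, then
   hardware threads per core), so each level of a nested region gets a matching
   number of threads; exact counts depend on the machine. Guarded out of the
   build. */
#if 0
#include <omp.h>
#include <cstdio>
int main() { // run as: KMP_NESTING_MODE=1 ./a.out
  #pragma omp parallel
  #pragma omp parallel
  {
    #pragma omp critical
    std::printf("level %d: thread %d of %d\n", omp_get_level(),
                omp_get_thread_num(), omp_get_num_threads());
  }
  return 0;
}
#endif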
9087 
9088 // Allocate space to store nesting levels
9089 void __kmp_init_nesting_mode() {
9090  int levels = KMP_HW_LAST;
9091  __kmp_nesting_mode_nlevels = levels;
9092  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9093  for (int i = 0; i < levels; ++i)
9094  __kmp_nesting_nth_level[i] = 0;
9095  if (__kmp_nested_nth.size < levels) {
9096  __kmp_nested_nth.nth =
9097  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9098  __kmp_nested_nth.size = levels;
9099  }
9100 }
9101 
9102 // Set # threads for top levels of nesting; must be called after topology set
9103 void __kmp_set_nesting_mode_threads() {
9104  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9105 
9106  if (__kmp_nesting_mode == 1)
9107  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9108  else if (__kmp_nesting_mode > 1)
9109  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9110 
9111  if (__kmp_topology) { // use topology info
9112  int loc, hw_level;
9113  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9114  loc < __kmp_nesting_mode_nlevels;
9115  loc++, hw_level++) {
9116  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9117  if (__kmp_nesting_nth_level[loc] == 1)
9118  loc--;
9119  }
9120  // Make sure all cores are used
9121  if (__kmp_nesting_mode > 1 && loc > 1) {
9122  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9123  int num_cores = __kmp_topology->get_count(core_level);
9124  int upper_levels = 1;
9125  for (int level = 0; level < loc - 1; ++level)
9126  upper_levels *= __kmp_nesting_nth_level[level];
9127  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9128  __kmp_nesting_nth_level[loc - 1] =
9129  num_cores / __kmp_nesting_nth_level[loc - 2];
9130  }
9131  __kmp_nesting_mode_nlevels = loc;
9132  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9133  } else { // no topology info available; provide a reasonable guesstimation
9134  if (__kmp_avail_proc >= 4) {
9135  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9136  __kmp_nesting_nth_level[1] = 2;
9137  __kmp_nesting_mode_nlevels = 2;
9138  } else {
9139  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9140  __kmp_nesting_mode_nlevels = 1;
9141  }
9142  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9143  }
9144  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9145  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9146  }
9147  set__nproc(thread, __kmp_nesting_nth_level[0]);
9148  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9149  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9150  if (get__max_active_levels(thread) > 1) {
9151  // if max levels was set, set nesting mode levels to same
9152  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9153  }
9154  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9155  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9156 }