LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* These are temporary issues to be dealt with. */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory.
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread. */
115 /* Fast (and somewhat portable) way to get a unique identifier for the
116  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
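/* Three lookup modes are tried, from cheapest to most expensive:
   - __kmp_gtid_mode >= 3: read the gtid from a thread-local (TDATA) variable;
   - __kmp_gtid_mode >= 2: read the gtid from keyed thread-local storage;
   - otherwise: scan the registered threads and match the current stack
     address against each thread's recorded stack window. */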
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
132  the caller. KMP_GTID_DNE then has to be handled at all call sites, or else
133  __kmp_init_gtid must be guaranteed, for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
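  /* Worked example with hypothetical addresses: if a thread's recorded
     stack_base is 0x7fff00100000 with stack_size 0x80000, and the local
     stack_addr is 0x7fff000c0000, then stack_diff = 0x40000 <= 0x80000,
     so the caller must be running on that thread's stack and that thread's
     index is returned as the gtid. */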
186 
187  /* use get_specific to try to determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
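/* Unlike __kmp_get_global_thread_id(), this registering variant never returns
   KMP_GTID_DNE: if no gtid has been assigned yet, it performs serial
   initialization (if needed) or registers the calling thread as a new root,
   so callers always receive a valid gtid. */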
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
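/* Overlap is flagged when either end of this thread's stack window falls
   strictly inside another thread's window. Hypothetical example: a window
   [0x1000, 0x5000) overlaps [0x4000, 0x8000) because the first window's end
   address 0x5000 lies between 0x4000 and 0x8000. */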
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  p1 = (char *)p1 + page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
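/* Typical use, as in __kmp_print_thread_storage_map() below:
     __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t),
                                  "th_%d", gtid);
   prints one "OMP storage map:" line describing the [p1, p2) range and,
   when KMP_PRINT_DATA_PLACEMENT is enabled, the host node(s) backing it. */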
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for the DLL, but it is a problem
445  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
446  does not help, at least in some versions of the MS C RTL.
447 
448  It seems the following sequence is the only way to simulate abort() and
449  avoid the pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate the g_abort global variable and this function.
464  // In case of abort, just call abort(); it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new siblings all the time, call
592  * __kmp_get_gtid() here */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
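/* Together, __kmp_parallel_deo and __kmp_parallel_dxo implement the ordered
   hand-off: deo spins until team->t.t_ordered.dt.t_value equals the caller's
   tid, and dxo passes the turn along by storing ((tid + 1) % t_nproc). */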
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
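/* Rough sketch of the intended pairing (simplified; the compiler normally
   reaches these through the __kmpc_single / __kmpc_end_single entry points):

     if (__kmp_enter_single(gtid, loc, TRUE)) {
       // ... body of the single region, executed by exactly one thread ...
       __kmp_exit_single(gtid);
     }
     // all threads then meet at the explicit barrier for the SINGLE section
*/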
721 
722 /* Determine whether we can go parallel or must use a serialized parallel
723  * region, and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
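/* Summary of the clamping order applied above: (1) the dyn-var adjustment
   (load balance, thread limit, or random), (2) the device-wide cap
   __kmp_max_nth (KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT), (3) the
   contention-group cap cg_thread_limit (OMP_THREAD_LIMIT), and finally
   (4) whatever the threads array can actually be expanded to hold. */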
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We
914  are assured that there are enough threads available, because we checked on
915  that earlier within the forkjoin critical section. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid) {
918  int i;
919  int use_hot_team;
920 
921  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
922  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
923  KMP_MB();
924 
925  /* first, let's setup the primary thread */
926  master_th->th.th_info.ds.ds_tid = 0;
927  master_th->th.th_team = team;
928  master_th->th.th_team_nproc = team->t.t_nproc;
929  master_th->th.th_team_master = master_th;
930  master_th->th.th_team_serialized = FALSE;
931  master_th->th.th_dispatch = &team->t.t_dispatch[0];
932 
933 /* make sure we are not the optimized hot team */
934 #if KMP_NESTED_HOT_TEAMS
935  use_hot_team = 0;
936  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
937  if (hot_teams) { // hot teams array is not allocated if
938  // KMP_HOT_TEAMS_MAX_LEVEL=0
939  int level = team->t.t_active_level - 1; // index in array of hot teams
940  if (master_th->th.th_teams_microtask) { // are we inside the teams?
941  if (master_th->th.th_teams_size.nteams > 1) {
942  ++level; // level was not increased in teams construct for
943  // team_of_masters
944  }
945  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
946  master_th->th.th_teams_level == team->t.t_level) {
947  ++level; // level was not increased in teams construct for
948  // team_of_workers before the parallel
949  } // team->t.t_level will be increased inside parallel
950  }
951  if (level < __kmp_hot_teams_max_level) {
952  if (hot_teams[level].hot_team) {
953  // hot team has already been allocated for given level
954  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
955  use_hot_team = 1; // the team is ready to use
956  } else {
957  use_hot_team = 0; // AC: threads are not allocated yet
958  hot_teams[level].hot_team = team; // remember new hot team
959  hot_teams[level].hot_team_nth = team->t.t_nproc;
960  }
961  } else {
962  use_hot_team = 0;
963  }
964  }
965 #else
966  use_hot_team = team == root->r.r_hot_team;
967 #endif
968  if (!use_hot_team) {
969 
970  /* install the primary thread */
971  team->t.t_threads[0] = master_th;
972  __kmp_initialize_info(master_th, team, 0, master_gtid);
973 
974  /* now, install the worker threads */
975  for (i = 1; i < team->t.t_nproc; i++) {
976 
977  /* fork or reallocate a new thread and install it in team */
978  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
979  team->t.t_threads[i] = thr;
980  KMP_DEBUG_ASSERT(thr);
981  KMP_DEBUG_ASSERT(thr->th.th_team == team);
982  /* align team and thread arrived states */
983  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
984  "T#%d(%d:%d) join =%llu, plain=%llu\n",
985  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
986  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
987  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
988  team->t.t_bar[bs_plain_barrier].b_arrived));
989  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
990  thr->th.th_teams_level = master_th->th.th_teams_level;
991  thr->th.th_teams_size = master_th->th.th_teams_size;
992  { // Initialize threads' barrier data.
993  int b;
994  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
995  for (b = 0; b < bs_last_barrier; ++b) {
996  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
997  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
998 #if USE_DEBUGGER
999  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1000 #endif
1001  }
1002  }
1003  }
1004 
1005 #if KMP_AFFINITY_SUPPORTED
1006  __kmp_partition_places(team);
1007 #endif
1008  }
1009 
1010  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1011  for (i = 0; i < team->t.t_nproc; i++) {
1012  kmp_info_t *thr = team->t.t_threads[i];
1013  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1014  thr->th.th_prev_level != team->t.t_level) {
1015  team->t.t_display_affinity = 1;
1016  break;
1017  }
1018  }
1019  }
1020 
1021  KMP_MB();
1022 }
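/* When use_hot_team is set, the workers from the previous parallel region are
   reused as-is and only the primary thread's cached fields are refreshed;
   otherwise each worker slot is filled via __kmp_allocate_thread() and its
   barrier state is aligned with the team's arrived counters. */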
1023 
1024 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1025 // Propagate any changes to the floating-point control registers out to the team.
1026 // We try to avoid unnecessary writes to the relevant cache line in the team
1027 // structure, so we don't make changes unless they are needed.
1028 inline static void propagateFPControl(kmp_team_t *team) {
1029  if (__kmp_inherit_fp_control) {
1030  kmp_int16 x87_fpu_control_word;
1031  kmp_uint32 mxcsr;
1032 
1033  // Get primary thread's values of FPU control flags (both X87 and vector)
1034  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1035  __kmp_store_mxcsr(&mxcsr);
1036  mxcsr &= KMP_X86_MXCSR_MASK;
1037 
1038  // There is no point looking at t_fp_control_saved here.
1039  // If it is TRUE, we still have to update the values if they are different
1040  // from those we now have. If it is FALSE we didn't save anything yet, but
1041  // our objective is the same. We have to ensure that the values in the team
1042  // are the same as those we have.
1043  // So, this code achieves what we need whether or not t_fp_control_saved is
1044  // true. By checking whether the value needs updating we avoid unnecessary
1045  // writes that would put the cache-line into a written state, causing all
1046  // threads in the team to have to read it again.
1047  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1048  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1049  // Although we don't use this value, other code in the runtime wants to know
1050  // whether it should restore them. So we must ensure it is correct.
1051  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1052  } else {
1053  // Similarly here. Don't write to this cache-line in the team structure
1054  // unless we have to.
1055  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1056  }
1057 }
1058 
1059 // Do the opposite, setting the hardware registers to the updated values from
1060 // the team.
1061 inline static void updateHWFPControl(kmp_team_t *team) {
1062  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1063  // Only reset the fp control regs if they have been changed in the team by
1064  // the parallel region that we are exiting.
1065  kmp_int16 x87_fpu_control_word;
1066  kmp_uint32 mxcsr;
1067  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1068  __kmp_store_mxcsr(&mxcsr);
1069  mxcsr &= KMP_X86_MXCSR_MASK;
1070 
1071  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1072  __kmp_clear_x87_fpu_status_word();
1073  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1074  }
1075 
1076  if (team->t.t_mxcsr != mxcsr) {
1077  __kmp_load_mxcsr(&team->t.t_mxcsr);
1078  }
1079  }
1080 }
1081 #else
1082 #define propagateFPControl(x) ((void)0)
1083 #define updateHWFPControl(x) ((void)0)
1084 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
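// propagateFPControl() is called on the way into a parallel region to capture
// the primary thread's x87 control word and MXCSR in the team structure;
// updateHWFPControl() is its counterpart on the way out, restoring the
// hardware registers from the team only if they actually differ.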
1085 
1086 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1087  int realloc); // forward declaration
1088 
1089 /* Run a parallel region that has been serialized, so it runs only in a team of
1090  the single primary thread. */
1091 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1092  kmp_info_t *this_thr;
1093  kmp_team_t *serial_team;
1094 
1095  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1096 
1097  /* Skip all this code for autopar serialized loops since it results in
1098  unacceptable overhead */
1099  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1100  return;
1101 
1102  if (!TCR_4(__kmp_init_parallel))
1103  __kmp_parallel_initialize();
1104  __kmp_resume_if_soft_paused();
1105 
1106  this_thr = __kmp_threads[global_tid];
1107  serial_team = this_thr->th.th_serial_team;
1108 
1109  /* utilize the serialized team held by this thread */
1110  KMP_DEBUG_ASSERT(serial_team);
1111  KMP_MB();
1112 
1113  if (__kmp_tasking_mode != tskm_immediate_exec) {
1114  KMP_DEBUG_ASSERT(
1115  this_thr->th.th_task_team ==
1116  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1117  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1118  NULL);
1119  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1120  "team %p, new task_team = NULL\n",
1121  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1122  this_thr->th.th_task_team = NULL;
1123  }
1124 
1125  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1126  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1127  proc_bind = proc_bind_false;
1128  } else if (proc_bind == proc_bind_default) {
1129  // No proc_bind clause was specified, so use the current value
1130  // of proc-bind-var for this parallel region.
1131  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1132  }
1133  // Reset for next parallel region
1134  this_thr->th.th_set_proc_bind = proc_bind_default;
1135 
1136 #if OMPT_SUPPORT
1137  ompt_data_t ompt_parallel_data = ompt_data_none;
1138  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1139  if (ompt_enabled.enabled &&
1140  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1141 
1142  ompt_task_info_t *parent_task_info;
1143  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1144 
1145  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1146  if (ompt_enabled.ompt_callback_parallel_begin) {
1147  int team_size = 1;
1148 
1149  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1150  &(parent_task_info->task_data), &(parent_task_info->frame),
1151  &ompt_parallel_data, team_size,
1152  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1153  }
1154  }
1155 #endif // OMPT_SUPPORT
1156 
1157  if (this_thr->th.th_team != serial_team) {
1158  // Nested level will be an index in the nested nthreads array
1159  int level = this_thr->th.th_team->t.t_level;
1160 
1161  if (serial_team->t.t_serialized) {
1162  /* this serial team was already used
1163  TODO: increase performance by making these locks more specific */
1164  kmp_team_t *new_team;
1165 
1166  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1167 
1168  new_team =
1169  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1170 #if OMPT_SUPPORT
1171  ompt_parallel_data,
1172 #endif
1173  proc_bind, &this_thr->th.th_current_task->td_icvs,
1174  0 USE_NESTED_HOT_ARG(NULL));
1175  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1176  KMP_ASSERT(new_team);
1177 
1178  /* setup new serialized team and install it */
1179  new_team->t.t_threads[0] = this_thr;
1180  new_team->t.t_parent = this_thr->th.th_team;
1181  serial_team = new_team;
1182  this_thr->th.th_serial_team = serial_team;
1183 
1184  KF_TRACE(
1185  10,
1186  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1187  global_tid, serial_team));
1188 
1189  /* TODO: the above breaks the requirement that, if we run out of resources,
1190  we can still guarantee that serialized teams are OK, since we may need to
1191  allocate a new one */
1192  } else {
1193  KF_TRACE(
1194  10,
1195  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1196  global_tid, serial_team));
1197  }
1198 
1199  /* we have to initialize this serial team */
1200  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1201  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1202  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1203  serial_team->t.t_ident = loc;
1204  serial_team->t.t_serialized = 1;
1205  serial_team->t.t_nproc = 1;
1206  serial_team->t.t_parent = this_thr->th.th_team;
1207  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1208  this_thr->th.th_team = serial_team;
1209  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1210 
1211  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1212  this_thr->th.th_current_task));
1213  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1214  this_thr->th.th_current_task->td_flags.executing = 0;
1215 
1216  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1217 
1218  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1219  implicit task for each serialized task represented by
1220  team->t.t_serialized? */
1221  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1222  &this_thr->th.th_current_task->td_parent->td_icvs);
1223 
1224  // Thread value exists in the nested nthreads array for the next nested
1225  // level
1226  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1227  this_thr->th.th_current_task->td_icvs.nproc =
1228  __kmp_nested_nth.nth[level + 1];
1229  }
1230 
1231  if (__kmp_nested_proc_bind.used &&
1232  (level + 1 < __kmp_nested_proc_bind.used)) {
1233  this_thr->th.th_current_task->td_icvs.proc_bind =
1234  __kmp_nested_proc_bind.bind_types[level + 1];
1235  }
1236 
1237 #if USE_DEBUGGER
1238  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1239 #endif
1240  this_thr->th.th_info.ds.ds_tid = 0;
1241 
1242  /* set thread cache values */
1243  this_thr->th.th_team_nproc = 1;
1244  this_thr->th.th_team_master = this_thr;
1245  this_thr->th.th_team_serialized = 1;
1246 
1247  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1248  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1249  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1250 
1251  propagateFPControl(serial_team);
1252 
1253  /* check if we need to allocate dispatch buffers stack */
1254  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1255  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1256  serial_team->t.t_dispatch->th_disp_buffer =
1257  (dispatch_private_info_t *)__kmp_allocate(
1258  sizeof(dispatch_private_info_t));
1259  }
1260  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1261 
1262  KMP_MB();
1263 
1264  } else {
1265  /* this serialized team is already being used,
1266  * that's fine, just add another nested level */
1267  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1268  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1269  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1270  ++serial_team->t.t_serialized;
1271  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1272 
1273  // Nested level will be an index in the nested nthreads array
1274  int level = this_thr->th.th_team->t.t_level;
1275  // Thread value exists in the nested nthreads array for the next nested
1276  // level
1277  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278  this_thr->th.th_current_task->td_icvs.nproc =
1279  __kmp_nested_nth.nth[level + 1];
1280  }
1281  serial_team->t.t_level++;
1282  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1283  "of serial team %p to %d\n",
1284  global_tid, serial_team, serial_team->t.t_level));
1285 
1286  /* allocate/push dispatch buffers stack */
1287  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1288  {
1289  dispatch_private_info_t *disp_buffer =
1290  (dispatch_private_info_t *)__kmp_allocate(
1291  sizeof(dispatch_private_info_t));
1292  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1293  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1294  }
1295  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1296 
1297  KMP_MB();
1298  }
1299  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1300 
1301  // Perform the display affinity functionality for
1302  // serialized parallel regions
1303  if (__kmp_display_affinity) {
1304  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1305  this_thr->th.th_prev_num_threads != 1) {
1306  // NULL means use the affinity-format-var ICV
1307  __kmp_aux_display_affinity(global_tid, NULL);
1308  this_thr->th.th_prev_level = serial_team->t.t_level;
1309  this_thr->th.th_prev_num_threads = 1;
1310  }
1311  }
1312 
1313  if (__kmp_env_consistency_check)
1314  __kmp_push_parallel(global_tid, NULL);
1315 #if OMPT_SUPPORT
1316  serial_team->t.ompt_team_info.master_return_address = codeptr;
1317  if (ompt_enabled.enabled &&
1318  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1319  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1320  OMPT_GET_FRAME_ADDRESS(0);
1321 
1322  ompt_lw_taskteam_t lw_taskteam;
1323  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1324  &ompt_parallel_data, codeptr);
1325 
1326  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1327  // don't use lw_taskteam after linking; its content was swapped
1328 
1329  /* OMPT implicit task begin */
1330  if (ompt_enabled.ompt_callback_implicit_task) {
1331  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1332  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1333  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1334  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1335  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1336  __kmp_tid_from_gtid(global_tid);
1337  }
1338 
1339  /* OMPT state */
1340  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1341  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1342  OMPT_GET_FRAME_ADDRESS(0);
1343  }
1344 #endif
1345 }
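/* In short: the first serialized level swaps the thread onto its cached
   th_serial_team (allocating a fresh one if the cached team is already in
   use), while deeper nested serialized levels just bump t_serialized and
   push another dispatch buffer, so nested serialized regions stay cheap. */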
1346 
1347 /* most of the work for a fork */
1348 /* return true if we really went parallel, false if serialized */
1349 int __kmp_fork_call(ident_t *loc, int gtid,
1350  enum fork_context_e call_context, // Intel, GNU, ...
1351  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1352  kmp_va_list ap) {
1353  void **argv;
1354  int i;
1355  int master_tid;
1356  int master_this_cons;
1357  kmp_team_t *team;
1358  kmp_team_t *parent_team;
1359  kmp_info_t *master_th;
1360  kmp_root_t *root;
1361  int nthreads;
1362  int master_active;
1363  int master_set_numthreads;
1364  int level;
1365  int active_level;
1366  int teams_level;
1367 #if KMP_NESTED_HOT_TEAMS
1368  kmp_hot_team_ptr_t **p_hot_teams;
1369 #endif
1370  { // KMP_TIME_BLOCK
1371  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1372  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1373 
1374  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1375  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1376  /* Some systems prefer the stack for the root thread(s) to start with */
1377  /* some gap from the parent stack to prevent false sharing. */
1378  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1379  /* These 2 lines below are so this does not get optimized out */
1380  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1381  __kmp_stkpadding += (short)((kmp_int64)dummy);
1382  }
1383 
1384  /* initialize if needed */
1385  KMP_DEBUG_ASSERT(
1386  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1387  if (!TCR_4(__kmp_init_parallel))
1388  __kmp_parallel_initialize();
1389  __kmp_resume_if_soft_paused();
1390 
1391  /* setup current data */
1392  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1393  // shutdown
1394  parent_team = master_th->th.th_team;
1395  master_tid = master_th->th.th_info.ds.ds_tid;
1396  master_this_cons = master_th->th.th_local.this_construct;
1397  root = master_th->th.th_root;
1398  master_active = root->r.r_active;
1399  master_set_numthreads = master_th->th.th_set_nproc;
1400 
1401 #if OMPT_SUPPORT
1402  ompt_data_t ompt_parallel_data = ompt_data_none;
1403  ompt_data_t *parent_task_data;
1404  ompt_frame_t *ompt_frame;
1405  ompt_data_t *implicit_task_data;
1406  void *return_address = NULL;
1407 
1408  if (ompt_enabled.enabled) {
1409  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1410  NULL, NULL);
1411  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1412  }
1413 #endif
1414 
1415  // Assign affinity to root thread if it hasn't happened yet
1416  __kmp_assign_root_init_mask();
1417 
1418  // Nested level will be an index in the nested nthreads array
1419  level = parent_team->t.t_level;
1420  // used to launch non-serial teams even if nested is not allowed
1421  active_level = parent_team->t.t_active_level;
1422  // needed to check nesting inside the teams
1423  teams_level = master_th->th.th_teams_level;
1424 #if KMP_NESTED_HOT_TEAMS
1425  p_hot_teams = &master_th->th.th_hot_teams;
1426  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1427  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1428  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1429  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1430  // it is either actual or not needed (when active_level > 0)
1431  (*p_hot_teams)[0].hot_team_nth = 1;
1432  }
1433 #endif
1434 
1435 #if OMPT_SUPPORT
1436  if (ompt_enabled.enabled) {
1437  if (ompt_enabled.ompt_callback_parallel_begin) {
1438  int team_size = master_set_numthreads
1439  ? master_set_numthreads
1440  : get__nproc_2(parent_team, master_tid);
1441  int flags = OMPT_INVOKER(call_context) |
1442  ((microtask == (microtask_t)__kmp_teams_master)
1443  ? ompt_parallel_league
1444  : ompt_parallel_team);
1445  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1446  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1447  return_address);
1448  }
1449  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1450  }
1451 #endif
1452 
1453  master_th->th.th_ident = loc;
1454 
1455  if (master_th->th.th_teams_microtask && ap &&
1456  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1457  // AC: This is the start of a parallel region nested inside a teams construct.
1458  // The team is actual (hot); all workers are ready at the fork barrier.
1459  // No lock is needed to initialize the team a bit and then release the workers.
1460  parent_team->t.t_ident = loc;
1461  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1462  parent_team->t.t_argc = argc;
1463  argv = (void **)parent_team->t.t_argv;
1464  for (i = argc - 1; i >= 0; --i)
1465  *argv++ = va_arg(kmp_va_deref(ap), void *);
1466  // Increment our nested depth levels, but not increase the serialization
1467  if (parent_team == master_th->th.th_serial_team) {
1468  // AC: we are in serialized parallel
1469  __kmpc_serialized_parallel(loc, gtid);
1470  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1471 
1472  if (call_context == fork_context_gnu) {
1473  // AC: need to decrement t_serialized for enquiry functions to work
1474  // correctly, will restore at join time
1475  parent_team->t.t_serialized--;
1476  return TRUE;
1477  }
1478 
1479 #if OMPD_SUPPORT
1480  parent_team->t.t_pkfn = microtask;
1481 #endif
1482 
1483 #if OMPT_SUPPORT
1484  void *dummy;
1485  void **exit_frame_p;
1486 
1487  ompt_lw_taskteam_t lw_taskteam;
1488 
1489  if (ompt_enabled.enabled) {
1490  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1491  &ompt_parallel_data, return_address);
1492  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1493 
1494  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1495  // don't use lw_taskteam after linking; its content was swapped
1496 
1497  /* OMPT implicit task begin */
1498  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1499  if (ompt_enabled.ompt_callback_implicit_task) {
1500  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1501  __kmp_tid_from_gtid(gtid);
1502  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1503  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1504  implicit_task_data, 1,
1505  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1506  }
1507 
1508  /* OMPT state */
1509  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1510  } else {
1511  exit_frame_p = &dummy;
1512  }
1513 #endif
1514  // AC: need to decrement t_serialized for enquiry functions to work
1515  // correctly, will restore at join time
1516  parent_team->t.t_serialized--;
1517 
1518  {
1519  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1520  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1521  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1522 #if OMPT_SUPPORT
1523  ,
1524  exit_frame_p
1525 #endif
1526  );
1527  }
1528 
1529 #if OMPT_SUPPORT
1530  if (ompt_enabled.enabled) {
1531  *exit_frame_p = NULL;
1532  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1533  if (ompt_enabled.ompt_callback_implicit_task) {
1534  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1535  ompt_scope_end, NULL, implicit_task_data, 1,
1536  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1537  }
1538  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1539  __ompt_lw_taskteam_unlink(master_th);
1540  if (ompt_enabled.ompt_callback_parallel_end) {
1541  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1542  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1543  OMPT_INVOKER(call_context) | ompt_parallel_team,
1544  return_address);
1545  }
1546  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1547  }
1548 #endif
1549  return TRUE;
1550  }
1551 
1552  parent_team->t.t_pkfn = microtask;
1553  parent_team->t.t_invoke = invoker;
1554  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1555  parent_team->t.t_active_level++;
1556  parent_team->t.t_level++;
1557  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1558 
1559 #if OMPT_SUPPORT
1560  if (ompt_enabled.enabled) {
1561  ompt_lw_taskteam_t lw_taskteam;
1562  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1563  &ompt_parallel_data, return_address);
1564  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1565  }
1566 #endif
1567 
1568  /* Change number of threads in the team if requested */
1569  if (master_set_numthreads) { // The parallel has num_threads clause
1570  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1571  // AC: can only reduce the number of threads dynamically, cannot increase
1572  kmp_info_t **other_threads = parent_team->t.t_threads;
1573  // NOTE: if using distributed barrier, we need to run this code block
1574  // even when the team size appears not to have changed from the max.
1575  int old_proc = master_th->th.th_teams_size.nth;
1576  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1577  bp_dist_bar) {
1578  __kmp_resize_dist_barrier(parent_team, old_proc,
1579  master_set_numthreads);
1580  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1581  }
1582  parent_team->t.t_nproc = master_set_numthreads;
1583  for (i = 0; i < master_set_numthreads; ++i) {
1584  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1585  }
1586  }
1587  // Keep the extra threads hot in the team for possible subsequent parallel regions
1588  master_th->th.th_set_nproc = 0;
1589  }
1590 
1591 #if USE_DEBUGGER
1592  if (__kmp_debugging) { // Let debugger override number of threads.
1593  int nth = __kmp_omp_num_threads(loc);
1594  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1595  master_set_numthreads = nth;
1596  }
1597  }
1598 #endif
1599 
1600 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1601  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1602  KMP_ITT_DEBUG) &&
1603  __kmp_forkjoin_frames_mode == 3 &&
1604  parent_team->t.t_active_level == 1 // only report frames at level 1
1605  && master_th->th.th_teams_size.nteams == 1) {
1606  kmp_uint64 tmp_time = __itt_get_timestamp();
1607  master_th->th.th_frame_time = tmp_time;
1608  parent_team->t.t_region_time = tmp_time;
1609  }
1610  if (__itt_stack_caller_create_ptr) {
1611  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1612  // create new stack stitching id before entering fork barrier
1613  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1614  }
1615 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1616 
1617  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1618  "master_th=%p, gtid=%d\n",
1619  root, parent_team, master_th, gtid));
1620  __kmp_internal_fork(loc, gtid, parent_team);
1621  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1622  "master_th=%p, gtid=%d\n",
1623  root, parent_team, master_th, gtid));
1624 
1625  if (call_context == fork_context_gnu)
1626  return TRUE;
1627 
1628  /* Invoke microtask for PRIMARY thread */
1629  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1630  parent_team->t.t_id, parent_team->t.t_pkfn));
1631 
1632  if (!parent_team->t.t_invoke(gtid)) {
1633  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1634  }
1635  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1636  parent_team->t.t_id, parent_team->t.t_pkfn));
1637  KMP_MB(); /* Flush all pending memory write invalidates. */
1638 
1639  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1640 
1641  return TRUE;
1642  } // Parallel closely nested in teams construct
1643 
1644 #if KMP_DEBUG
1645  if (__kmp_tasking_mode != tskm_immediate_exec) {
1646  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1647  parent_team->t.t_task_team[master_th->th.th_task_state]);
1648  }
1649 #endif
1650 
1651  // Need this to happen before we determine the number of threads, not while
1652  // we are allocating the team
1653  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1654  int enter_teams = 0;
1655  if (parent_team->t.t_active_level >=
1656  master_th->th.th_current_task->td_icvs.max_active_levels) {
1657  nthreads = 1;
1658  } else {
1659  enter_teams = ((ap == NULL && active_level == 0) ||
1660  (ap && teams_level > 0 && teams_level == level));
1661  nthreads = master_set_numthreads
1662  ? master_set_numthreads
1663  // TODO: get nproc directly from current task
1664  : get__nproc_2(parent_team, master_tid);
1665  // Check whether we need to take the forkjoin lock (no need for a serialized
1666  // parallel outside of a teams construct). This code was moved here from
1667  // __kmp_reserve_threads() to speed up nested serialized parallels.
1668  if (nthreads > 1) {
1669  if ((get__max_active_levels(master_th) == 1 &&
1670  (root->r.r_in_parallel && !enter_teams)) ||
1671  (__kmp_library == library_serial)) {
1672  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1673  " threads\n",
1674  gtid, nthreads));
1675  nthreads = 1;
1676  }
1677  }
1678  if (nthreads > 1) {
1679  /* determine how many new threads we can use */
1680  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1681  /* AC: If we execute teams from a parallel region (on the host), then the teams
1682  should be created, but each can have only 1 thread if nesting is
1683  disabled. If teams is called from a serial region, then the teams and their
1684  threads should be created regardless of the nesting setting. */
1685  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1686  nthreads, enter_teams);
1687  if (nthreads == 1) {
1688  // Free the lock for single-threaded execution here; for multi-threaded
1689  // execution it will be freed later, after the team of threads has been
1690  // created and initialized
1691  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1692  }
1693  }
1694  }
1695  KMP_DEBUG_ASSERT(nthreads > 0);
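  // Summary of the decision just made: nthreads stays 1 (the region will be
  // serialized below) when the enclosing task has already reached its
  // max-active-levels, when nesting is effectively off (max_active_levels == 1)
  // while we are already inside an active parallel and not entering a teams
  // construct, when the library mode is "serial", or when
  // __kmp_reserve_threads() could grant only one thread. Otherwise the
  // forkjoin lock is still held here and is released only after the new team
  // has been set up.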
1696 
1697  // If we temporarily changed the set number of threads then restore it now
1698  master_th->th.th_set_nproc = 0;
1699 
1700  /* create a serialized parallel region? */
1701  if (nthreads == 1) {
1702 /* josh todo: hypothetical question: what do we do for OS X*? */
1703 #if KMP_OS_LINUX && \
1704  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1705  void *args[argc];
1706 #else
1707  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1708 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1709  KMP_ARCH_AARCH64) */
1710 
1711  KA_TRACE(20,
1712  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1713 
1714  __kmpc_serialized_parallel(loc, gtid);
1715 
1716 #if OMPD_SUPPORT
1717  master_th->th.th_serial_team->t.t_pkfn = microtask;
1718 #endif
1719 
1720  if (call_context == fork_context_intel) {
1721  /* TODO this sucks, use the compiler itself to pass args! :) */
1722  master_th->th.th_serial_team->t.t_ident = loc;
1723  if (!ap) {
1724  // revert change made in __kmpc_serialized_parallel()
1725  master_th->th.th_serial_team->t.t_level--;
1726  // Get args from parent team for teams construct
1727 
1728 #if OMPT_SUPPORT
1729  void *dummy;
1730  void **exit_frame_p;
1731  ompt_task_info_t *task_info;
1732 
1733  ompt_lw_taskteam_t lw_taskteam;
1734 
1735  if (ompt_enabled.enabled) {
1736  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1737  &ompt_parallel_data, return_address);
1738 
1739  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1740  // don't use lw_taskteam after linking. content was swapped
1741 
1742  task_info = OMPT_CUR_TASK_INFO(master_th);
1743  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1744  if (ompt_enabled.ompt_callback_implicit_task) {
1745  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1746  __kmp_tid_from_gtid(gtid);
1747  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1748  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1749  &(task_info->task_data), 1,
1750  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1751  ompt_task_implicit);
1752  }
1753 
1754  /* OMPT state */
1755  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1756  } else {
1757  exit_frame_p = &dummy;
1758  }
1759 #endif
1760 
1761  {
1762  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1763  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1764  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1765  parent_team->t.t_argv
1766 #if OMPT_SUPPORT
1767  ,
1768  exit_frame_p
1769 #endif
1770  );
1771  }
1772 
1773 #if OMPT_SUPPORT
1774  if (ompt_enabled.enabled) {
1775  *exit_frame_p = NULL;
1776  if (ompt_enabled.ompt_callback_implicit_task) {
1777  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1778  ompt_scope_end, NULL, &(task_info->task_data), 1,
1779  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1780  ompt_task_implicit);
1781  }
1782  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1783  __ompt_lw_taskteam_unlink(master_th);
1784  if (ompt_enabled.ompt_callback_parallel_end) {
1785  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1786  &ompt_parallel_data, parent_task_data,
1787  OMPT_INVOKER(call_context) | ompt_parallel_team,
1788  return_address);
1789  }
1790  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1791  }
1792 #endif
1793  } else if (microtask == (microtask_t)__kmp_teams_master) {
1794  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1795  master_th->th.th_serial_team);
1796  team = master_th->th.th_team;
1797  // team->t.t_pkfn = microtask;
1798  team->t.t_invoke = invoker;
1799  __kmp_alloc_argv_entries(argc, team, TRUE);
1800  team->t.t_argc = argc;
1801  argv = (void **)team->t.t_argv;
1802  if (ap) {
1803  for (i = argc - 1; i >= 0; --i)
1804  *argv++ = va_arg(kmp_va_deref(ap), void *);
1805  } else {
1806  for (i = 0; i < argc; ++i)
1807  // Get args from parent team for teams construct
1808  argv[i] = parent_team->t.t_argv[i];
1809  }
1810  // AC: revert change made in __kmpc_serialized_parallel()
1811  // because initial code in teams should have level=0
1812  team->t.t_level--;
1813  // AC: call special invoker for outer "parallel" of teams construct
1814  invoker(gtid);
1815 #if OMPT_SUPPORT
1816  if (ompt_enabled.enabled) {
1817  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1818  if (ompt_enabled.ompt_callback_implicit_task) {
1819  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1820  ompt_scope_end, NULL, &(task_info->task_data), 0,
1821  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1822  }
1823  if (ompt_enabled.ompt_callback_parallel_end) {
1824  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1825  &ompt_parallel_data, parent_task_data,
1826  OMPT_INVOKER(call_context) | ompt_parallel_league,
1827  return_address);
1828  }
1829  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1830  }
1831 #endif
1832  } else {
1833  argv = args;
1834  for (i = argc - 1; i >= 0; --i)
1835  *argv++ = va_arg(kmp_va_deref(ap), void *);
1836  KMP_MB();
1837 
1838 #if OMPT_SUPPORT
1839  void *dummy;
1840  void **exit_frame_p;
1841  ompt_task_info_t *task_info;
1842 
1843  ompt_lw_taskteam_t lw_taskteam;
1844 
1845  if (ompt_enabled.enabled) {
1846  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1847  &ompt_parallel_data, return_address);
1848  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1849  // don't use lw_taskteam after linking. content was swapped
1850  task_info = OMPT_CUR_TASK_INFO(master_th);
1851  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1852 
1853  /* OMPT implicit task begin */
1854  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1855  if (ompt_enabled.ompt_callback_implicit_task) {
1856  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1857  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1858  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1859  ompt_task_implicit);
1860  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1861  __kmp_tid_from_gtid(gtid);
1862  }
1863 
1864  /* OMPT state */
1865  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1866  } else {
1867  exit_frame_p = &dummy;
1868  }
1869 #endif
1870 
1871  {
1872  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1873  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1874  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1875 #if OMPT_SUPPORT
1876  ,
1877  exit_frame_p
1878 #endif
1879  );
1880  }
1881 
1882 #if OMPT_SUPPORT
1883  if (ompt_enabled.enabled) {
1884  *exit_frame_p = NULL;
1885  if (ompt_enabled.ompt_callback_implicit_task) {
1886  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1887  ompt_scope_end, NULL, &(task_info->task_data), 1,
1888  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1889  ompt_task_implicit);
1890  }
1891 
1892  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1893  __ompt_lw_taskteam_unlink(master_th);
1894  if (ompt_enabled.ompt_callback_parallel_end) {
1895  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1896  &ompt_parallel_data, parent_task_data,
1897  OMPT_INVOKER(call_context) | ompt_parallel_team,
1898  return_address);
1899  }
1900  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1901  }
1902 #endif
1903  }
1904  } else if (call_context == fork_context_gnu) {
1905 #if OMPT_SUPPORT
1906  ompt_lw_taskteam_t lwt;
1907  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1908  return_address);
1909 
1910  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1911  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1912 // don't use lw_taskteam after linking. content was swapped
1913 #endif
1914 
1915  // we were called from GNU native code
1916  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1917  return FALSE;
1918  } else {
1919  KMP_ASSERT2(call_context < fork_context_last,
1920  "__kmp_fork_call: unknown fork_context parameter");
1921  }
1922 
1923  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1924  KMP_MB();
1925  return FALSE;
1926  } // if (nthreads == 1)
1927 
1928  // GEH: only modify the executing flag in the case when not serialized
1929  // serialized case is handled in kmpc_serialized_parallel
1930  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1931  "curtask=%p, curtask_max_aclevel=%d\n",
1932  parent_team->t.t_active_level, master_th,
1933  master_th->th.th_current_task,
1934  master_th->th.th_current_task->td_icvs.max_active_levels));
1935  // TODO: GEH - cannot do this assertion because root thread not set up as
1936  // executing
1937  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1938  master_th->th.th_current_task->td_flags.executing = 0;
1939 
1940  if (!master_th->th.th_teams_microtask || level > teams_level) {
1941  /* Increment our nested depth level */
1942  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1943  }
1944 
1945  // See if we need to make a copy of the ICVs.
1946  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1947  if ((level + 1 < __kmp_nested_nth.used) &&
1948  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1949  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1950  } else {
1951  nthreads_icv = 0; // don't update
1952  }
1953 
1954  // Figure out the proc_bind_policy for the new team.
1955  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1956  kmp_proc_bind_t proc_bind_icv =
1957  proc_bind_default; // proc_bind_default means don't update
1958  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1959  proc_bind = proc_bind_false;
1960  } else {
1961  if (proc_bind == proc_bind_default) {
1962  // No proc_bind clause specified; use current proc-bind-var for this
1963  // parallel region
1964  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1965  }
1966  /* else: The proc_bind policy was specified explicitly on parallel clause.
1967  This overrides proc-bind-var for this parallel region, but does not
1968  change proc-bind-var. */
1969  // Figure the value of proc-bind-var for the child threads.
1970  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1971  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1972  master_th->th.th_current_task->td_icvs.proc_bind)) {
1973  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1974  }
1975  }
1976 
1977  // Reset for next parallel region
1978  master_th->th.th_set_proc_bind = proc_bind_default;
1979 
1980  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1981  kmp_internal_control_t new_icvs;
1982  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1983  new_icvs.next = NULL;
1984  if (nthreads_icv > 0) {
1985  new_icvs.nproc = nthreads_icv;
1986  }
1987  if (proc_bind_icv != proc_bind_default) {
1988  new_icvs.proc_bind = proc_bind_icv;
1989  }
1990 
1991  /* allocate a new parallel team */
1992  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1993  team = __kmp_allocate_team(root, nthreads, nthreads,
1994 #if OMPT_SUPPORT
1995  ompt_parallel_data,
1996 #endif
1997  proc_bind, &new_icvs,
1998  argc USE_NESTED_HOT_ARG(master_th));
1999  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2000  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2001  } else {
2002  /* allocate a new parallel team */
2003  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2004  team = __kmp_allocate_team(root, nthreads, nthreads,
2005 #if OMPT_SUPPORT
2006  ompt_parallel_data,
2007 #endif
2008  proc_bind,
2009  &master_th->th.th_current_task->td_icvs,
2010  argc USE_NESTED_HOT_ARG(master_th));
2011  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2012  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2013  &master_th->th.th_current_task->td_icvs);
2014  }
2015  KF_TRACE(
2016  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2017 
2018  /* setup the new team */
2019  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2020  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2021  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2022  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2023  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2024 #if OMPT_SUPPORT
2025  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2026  return_address);
2027 #endif
2028  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2029  // TODO: parent_team->t.t_level == INT_MAX ???
2030  if (!master_th->th.th_teams_microtask || level > teams_level) {
2031  int new_level = parent_team->t.t_level + 1;
2032  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2033  new_level = parent_team->t.t_active_level + 1;
2034  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2035  } else {
2036  // AC: Do not increase parallel level at start of the teams construct
2037  int new_level = parent_team->t.t_level;
2038  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2039  new_level = parent_team->t.t_active_level;
2040  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2041  }
2042  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2043  // set primary thread's schedule as new run-time schedule
2044  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2045 
2046  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2047  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2048 
2049  // Update the floating point rounding in the team if required.
2050  propagateFPControl(team);
2051 #if OMPD_SUPPORT
2052  if (ompd_state & OMPD_ENABLE_BP)
2053  ompd_bp_parallel_begin();
2054 #endif
2055 
2056  if (__kmp_tasking_mode != tskm_immediate_exec) {
2057  // Set the primary thread's task team to the team's task team. Unless this is
2058  // a hot team, it should be NULL.
2059  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2060  parent_team->t.t_task_team[master_th->th.th_task_state]);
2061  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2062  "%p, new task_team %p / team %p\n",
2063  __kmp_gtid_from_thread(master_th),
2064  master_th->th.th_task_team, parent_team,
2065  team->t.t_task_team[master_th->th.th_task_state], team));
2066 
2067  if (active_level || master_th->th.th_task_team) {
2068  // Save the primary thread's task_state on the memo stack
2069  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2070  if (master_th->th.th_task_state_top >=
2071  master_th->th.th_task_state_stack_sz) { // increase size
2072  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2073  kmp_uint8 *old_stack, *new_stack;
2074  kmp_uint32 i;
2075  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2076  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2077  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2078  }
2079  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2080  ++i) { // zero-init rest of stack
2081  new_stack[i] = 0;
2082  }
2083  old_stack = master_th->th.th_task_state_memo_stack;
2084  master_th->th.th_task_state_memo_stack = new_stack;
2085  master_th->th.th_task_state_stack_sz = new_size;
2086  __kmp_free(old_stack);
2087  }
2088  // Store primary thread's task_state on stack
2089  master_th->th
2090  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2091  master_th->th.th_task_state;
2092  master_th->th.th_task_state_top++;
2093 #if KMP_NESTED_HOT_TEAMS
2094  if (master_th->th.th_hot_teams &&
2095  active_level < __kmp_hot_teams_max_level &&
2096  team == master_th->th.th_hot_teams[active_level].hot_team) {
2097  // Restore primary thread's nested state if nested hot team
2098  master_th->th.th_task_state =
2099  master_th->th
2100  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2101  } else {
2102 #endif
2103  master_th->th.th_task_state = 0;
2104 #if KMP_NESTED_HOT_TEAMS
2105  }
2106 #endif
2107  }
2108 #if !KMP_NESTED_HOT_TEAMS
2109  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2110  (team == root->r.r_hot_team));
2111 #endif
2112  }
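  // Note on the memo stack above: it grows by doubling. When th_task_state_top
  // reaches th_task_state_stack_sz, a buffer of twice the size is allocated,
  // the old contents are copied over, the remainder is zero-initialized, and
  // the old buffer is freed, giving amortized O(1) pushes however deeply
  // parallel regions are nested.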
2113 
2114  KA_TRACE(
2115  20,
2116  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2117  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2118  team->t.t_nproc));
2119  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2120  (team->t.t_master_tid == 0 &&
2121  (team->t.t_parent == root->r.r_root_team ||
2122  team->t.t_parent->t.t_serialized)));
2123  KMP_MB();
2124 
2125  /* now, setup the arguments */
2126  argv = (void **)team->t.t_argv;
2127  if (ap) {
2128  for (i = argc - 1; i >= 0; --i) {
2129  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2130  KMP_CHECK_UPDATE(*argv, new_argv);
2131  argv++;
2132  }
2133  } else {
2134  for (i = 0; i < argc; ++i) {
2135  // Get args from parent team for teams construct
2136  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2137  }
2138  }
2139 
2140  /* now actually fork the threads */
2141  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2142  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2143  root->r.r_active = TRUE;
2144 
2145  __kmp_fork_team_threads(root, team, master_th, gtid);
2146  __kmp_setup_icv_copy(team, nthreads,
2147  &master_th->th.th_current_task->td_icvs, loc);
2148 
2149 #if OMPT_SUPPORT
2150  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2151 #endif
2152 
2153  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2154 
2155 #if USE_ITT_BUILD
2156  if (team->t.t_active_level == 1 // only report frames at level 1
2157  && !master_th->th.th_teams_microtask) { // not in teams construct
2158 #if USE_ITT_NOTIFY
2159  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2160  (__kmp_forkjoin_frames_mode == 3 ||
2161  __kmp_forkjoin_frames_mode == 1)) {
2162  kmp_uint64 tmp_time = 0;
2163  if (__itt_get_timestamp_ptr)
2164  tmp_time = __itt_get_timestamp();
2165  // Internal fork - report frame begin
2166  master_th->th.th_frame_time = tmp_time;
2167  if (__kmp_forkjoin_frames_mode == 3)
2168  team->t.t_region_time = tmp_time;
2169  } else
2170 // only one notification scheme (either "submit" or "forking/joined", not both)
2171 #endif /* USE_ITT_NOTIFY */
2172  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2173  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2174  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2175  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2176  }
2177  }
2178 #endif /* USE_ITT_BUILD */
2179 
2180  /* now go on and do the work */
2181  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2182  KMP_MB();
2183  KF_TRACE(10,
2184  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2185  root, team, master_th, gtid));
2186 
2187 #if USE_ITT_BUILD
2188  if (__itt_stack_caller_create_ptr) {
2189  // create new stack stitching id before entering fork barrier
2190  if (!enter_teams) {
2191  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2192  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2193  } else if (parent_team->t.t_serialized) {
2194  // keep stack stitching id in the serialized parent_team;
2195  // current team will be used for parallel inside the teams;
2196  // if parent_team is active, then it already keeps stack stitching id
2197  // for the league of teams
2198  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2199  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2200  }
2201  }
2202 #endif /* USE_ITT_BUILD */
2203 
2204  // AC: skip __kmp_internal_fork at the teams construct; let only the primary
2205  // threads execute
2206  if (ap) {
2207  __kmp_internal_fork(loc, gtid, team);
2208  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2209  "master_th=%p, gtid=%d\n",
2210  root, team, master_th, gtid));
2211  }
2212 
2213  if (call_context == fork_context_gnu) {
2214  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215  return TRUE;
2216  }
2217 
2218  /* Invoke microtask for PRIMARY thread */
2219  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2220  team->t.t_id, team->t.t_pkfn));
2221  } // END of timer KMP_fork_call block
2222 
2223 #if KMP_STATS_ENABLED
2224  // If beginning a teams construct, then change thread state
2225  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2226  if (!ap) {
2227  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2228  }
2229 #endif
2230 
2231  if (!team->t.t_invoke(gtid)) {
2232  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2233  }
2234 
2235 #if KMP_STATS_ENABLED
2236  // If was beginning of a teams construct, then reset thread state
2237  if (!ap) {
2238  KMP_SET_THREAD_STATE(previous_state);
2239  }
2240 #endif
2241 
2242  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2243  team->t.t_id, team->t.t_pkfn));
2244  KMP_MB(); /* Flush all pending memory write invalidates. */
2245 
2246  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2247 #if OMPT_SUPPORT
2248  if (ompt_enabled.enabled) {
2249  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2250  }
2251 #endif
2252 
2253  return TRUE;
2254 }
2255 
2256 #if OMPT_SUPPORT
2257 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2258  kmp_team_t *team) {
2259  // restore state outside the region
2260  thread->th.ompt_thread_info.state =
2261  ((team->t.t_serialized) ? ompt_state_work_serial
2262  : ompt_state_work_parallel);
2263 }
2264 
2265 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2266  kmp_team_t *team, ompt_data_t *parallel_data,
2267  int flags, void *codeptr) {
2268  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2269  if (ompt_enabled.ompt_callback_parallel_end) {
2270  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2271  parallel_data, &(task_info->task_data), flags, codeptr);
2272  }
2273 
2274  task_info->frame.enter_frame = ompt_data_none;
2275  __kmp_join_restore_state(thread, team);
2276 }
2277 #endif
2278 
2279 void __kmp_join_call(ident_t *loc, int gtid
2280 #if OMPT_SUPPORT
2281  ,
2282  enum fork_context_e fork_context
2283 #endif
2284  ,
2285  int exit_teams) {
2286  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2287  kmp_team_t *team;
2288  kmp_team_t *parent_team;
2289  kmp_info_t *master_th;
2290  kmp_root_t *root;
2291  int master_active;
2292 
2293  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2294 
2295  /* setup current data */
2296  master_th = __kmp_threads[gtid];
2297  root = master_th->th.th_root;
2298  team = master_th->th.th_team;
2299  parent_team = team->t.t_parent;
2300 
2301  master_th->th.th_ident = loc;
2302 
2303 #if OMPT_SUPPORT
2304  void *team_microtask = (void *)team->t.t_pkfn;
2305  // For the GOMP interface with a serialized parallel, we need
2306  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2307  // end-implicit-task and end-parallel events.
2308  if (ompt_enabled.enabled &&
2309  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2310  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2311  }
2312 #endif
2313 
2314 #if KMP_DEBUG
2315  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2316  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2317  "th_task_team = %p\n",
2318  __kmp_gtid_from_thread(master_th), team,
2319  team->t.t_task_team[master_th->th.th_task_state],
2320  master_th->th.th_task_team));
2321  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2322  team->t.t_task_team[master_th->th.th_task_state]);
2323  }
2324 #endif
2325 
2326  if (team->t.t_serialized) {
2327  if (master_th->th.th_teams_microtask) {
2328  // We are in teams construct
2329  int level = team->t.t_level;
2330  int tlevel = master_th->th.th_teams_level;
2331  if (level == tlevel) {
2332  // AC: we haven't incremented it earlier at start of teams construct,
2333  // so do it here - at the end of teams construct
2334  team->t.t_level++;
2335  } else if (level == tlevel + 1) {
2336  // AC: we are exiting parallel inside teams, need to increment
2337  // serialization in order to restore it in the next call to
2338  // __kmpc_end_serialized_parallel
2339  team->t.t_serialized++;
2340  }
2341  }
2342  __kmpc_end_serialized_parallel(loc, gtid);
2343 
2344 #if OMPT_SUPPORT
2345  if (ompt_enabled.enabled) {
2346  __kmp_join_restore_state(master_th, parent_team);
2347  }
2348 #endif
2349 
2350  return;
2351  }
2352 
2353  master_active = team->t.t_master_active;
2354 
2355  if (!exit_teams) {
2356  // AC: No barrier for internal teams at exit from the teams construct,
2357  // but there is a barrier for the external team (league).
2358  __kmp_internal_join(loc, gtid, team);
2359 #if USE_ITT_BUILD
2360  if (__itt_stack_caller_create_ptr) {
2361  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2362  // destroy the stack stitching id after join barrier
2363  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2364  team->t.t_stack_id = NULL;
2365  }
2366 #endif
2367  } else {
2368  master_th->th.th_task_state =
2369  0; // AC: no tasking in teams (out of any parallel)
2370 #if USE_ITT_BUILD
2371  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2372  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2373  // destroy the stack stitching id on exit from the teams construct
2374  // if parent_team is active, then the id will be destroyed later on
2375  // by master of the league of teams
2376  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2377  parent_team->t.t_stack_id = NULL;
2378  }
2379 #endif
2380 
2381  if (team->t.t_nproc > 1 &&
2382  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2383  team->t.b->update_num_threads(team->t.t_nproc);
2384  __kmp_add_threads_to_team(team, team->t.t_nproc);
2385  }
2386  }
2387 
2388  KMP_MB();
2389 
2390 #if OMPT_SUPPORT
2391  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2392  void *codeptr = team->t.ompt_team_info.master_return_address;
2393 #endif
2394 
2395 #if USE_ITT_BUILD
2396  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2397  if (team->t.t_active_level == 1 &&
2398  (!master_th->th.th_teams_microtask || /* not in teams construct */
2399  master_th->th.th_teams_size.nteams == 1)) {
2400  master_th->th.th_ident = loc;
2401  // only one notification scheme (either "submit" or "forking/joined", not
2402  // both)
2403  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2404  __kmp_forkjoin_frames_mode == 3)
2405  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2406  master_th->th.th_frame_time, 0, loc,
2407  master_th->th.th_team_nproc, 1);
2408  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2409  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2410  __kmp_itt_region_joined(gtid);
2411  } // active_level == 1
2412 #endif /* USE_ITT_BUILD */
2413 
2414  if (master_th->th.th_teams_microtask && !exit_teams &&
2415  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2416  team->t.t_level == master_th->th.th_teams_level + 1) {
2417 // AC: We need to leave the team structure intact at the end of a parallel
2418 // inside the teams construct, so that the same (hot) team works at the next
2419 // parallel; only the nesting levels are adjusted.
2420 #if OMPT_SUPPORT
2421  ompt_data_t ompt_parallel_data = ompt_data_none;
2422  if (ompt_enabled.enabled) {
2423  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2424  if (ompt_enabled.ompt_callback_implicit_task) {
2425  int ompt_team_size = team->t.t_nproc;
2426  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2427  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2428  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2429  }
2430  task_info->frame.exit_frame = ompt_data_none;
2431  task_info->task_data = ompt_data_none;
2432  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2433  __ompt_lw_taskteam_unlink(master_th);
2434  }
2435 #endif
2436  /* Decrement our nested depth level */
2437  team->t.t_level--;
2438  team->t.t_active_level--;
2439  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2440 
2441  // Restore number of threads in the team if needed. This code relies on
2442  // the proper adjustment of th_teams_size.nth after the fork in
2443  // __kmp_teams_master on each teams primary thread in the case that
2444  // __kmp_reserve_threads reduced it.
2445  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2446  int old_num = master_th->th.th_team_nproc;
2447  int new_num = master_th->th.th_teams_size.nth;
2448  kmp_info_t **other_threads = team->t.t_threads;
2449  team->t.t_nproc = new_num;
2450  for (int i = 0; i < old_num; ++i) {
2451  other_threads[i]->th.th_team_nproc = new_num;
2452  }
2453  // Adjust the state of the team's unused threads
2454  for (int i = old_num; i < new_num; ++i) {
2455  // Re-initialize thread's barrier data.
2456  KMP_DEBUG_ASSERT(other_threads[i]);
2457  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2458  for (int b = 0; b < bs_last_barrier; ++b) {
2459  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2460  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2461 #if USE_DEBUGGER
2462  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2463 #endif
2464  }
2465  if (__kmp_tasking_mode != tskm_immediate_exec) {
2466  // Synchronize thread's task state
2467  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2468  }
2469  }
2470  }
2471 
2472 #if OMPT_SUPPORT
2473  if (ompt_enabled.enabled) {
2474  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2475  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2476  }
2477 #endif
2478 
2479  return;
2480  }
2481 
2482  /* do cleanup and restore the parent team */
2483  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2484  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2485 
2486  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2487 
2488  /* jc: The following lock has instructions with REL and ACQ semantics,
2489  separating the parallel user code called in this parallel region
2490  from the serial user code called after this function returns. */
2491  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2492 
2493  if (!master_th->th.th_teams_microtask ||
2494  team->t.t_level > master_th->th.th_teams_level) {
2495  /* Decrement our nested depth level */
2496  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2497  }
2498  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2499 
2500 #if OMPT_SUPPORT
2501  if (ompt_enabled.enabled) {
2502  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2503  if (ompt_enabled.ompt_callback_implicit_task) {
2504  int flags = (team_microtask == (void *)__kmp_teams_master)
2505  ? ompt_task_initial
2506  : ompt_task_implicit;
2507  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2508  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2509  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2510  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2511  }
2512  task_info->frame.exit_frame = ompt_data_none;
2513  task_info->task_data = ompt_data_none;
2514  }
2515 #endif
2516 
2517  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2518  master_th, team));
2519  __kmp_pop_current_task_from_thread(master_th);
2520 
2521 #if KMP_AFFINITY_SUPPORTED
2522  // Restore master thread's partition.
2523  master_th->th.th_first_place = team->t.t_first_place;
2524  master_th->th.th_last_place = team->t.t_last_place;
2525 #endif // KMP_AFFINITY_SUPPORTED
2526  master_th->th.th_def_allocator = team->t.t_def_allocator;
2527 
2528 #if OMPD_SUPPORT
2529  if (ompd_state & OMPD_ENABLE_BP)
2530  ompd_bp_parallel_end();
2531 #endif
2532  updateHWFPControl(team);
2533 
2534  if (root->r.r_active != master_active)
2535  root->r.r_active = master_active;
2536 
2537  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2538  master_th)); // this will free worker threads
2539 
2540  /* This race was fun to find. Make sure the following is in the critical
2541  region; otherwise assertions may fail occasionally, since the old team may be
2542  reallocated and the hierarchy would appear inconsistent. It is actually safe
2543  to run and won't cause any bugs, but it will trigger those assertion failures.
2544  It's only one dereference and assignment, so we might as well do it inside the critical region. */
2545  master_th->th.th_team = parent_team;
2546  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2547  master_th->th.th_team_master = parent_team->t.t_threads[0];
2548  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2549 
2550  /* restore serialized team, if need be */
2551  if (parent_team->t.t_serialized &&
2552  parent_team != master_th->th.th_serial_team &&
2553  parent_team != root->r.r_root_team) {
2554  __kmp_free_team(root,
2555  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2556  master_th->th.th_serial_team = parent_team;
2557  }
2558 
2559  if (__kmp_tasking_mode != tskm_immediate_exec) {
2560  if (master_th->th.th_task_state_top >
2561  0) { // Restore task state from memo stack
2562  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2563  // Remember primary thread's state if we re-use this nested hot team
2564  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2565  master_th->th.th_task_state;
2566  --master_th->th.th_task_state_top; // pop
2567  // Now restore state at this level
2568  master_th->th.th_task_state =
2569  master_th->th
2570  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2571  }
2572  // Copy the task team from the parent team to the primary thread
2573  master_th->th.th_task_team =
2574  parent_team->t.t_task_team[master_th->th.th_task_state];
2575  KA_TRACE(20,
2576  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2577  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2578  parent_team));
2579  }
2580 
2581  // TODO: GEH - cannot do this assertion because root thread not set up as
2582  // executing
2583  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2584  master_th->th.th_current_task->td_flags.executing = 1;
2585 
2586  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2587 
2588 #if OMPT_SUPPORT
2589  int flags =
2590  OMPT_INVOKER(fork_context) |
2591  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2592  : ompt_parallel_team);
2593  if (ompt_enabled.enabled) {
2594  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2595  codeptr);
2596  }
2597 #endif
2598 
2599  KMP_MB();
2600  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2601 }
2602 
2603 /* Check whether we should push an internal control record onto the
2604  serial team stack. If so, do it. */
2605 void __kmp_save_internal_controls(kmp_info_t *thread) {
2606 
2607  if (thread->th.th_team != thread->th.th_serial_team) {
2608  return;
2609  }
2610  if (thread->th.th_team->t.t_serialized > 1) {
2611  int push = 0;
2612 
2613  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2614  push = 1;
2615  } else {
2616  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2617  thread->th.th_team->t.t_serialized) {
2618  push = 1;
2619  }
2620  }
2621  if (push) { /* push a record on the serial team's stack */
2622  kmp_internal_control_t *control =
2623  (kmp_internal_control_t *)__kmp_allocate(
2624  sizeof(kmp_internal_control_t));
2625 
2626  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2627 
2628  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2629 
2630  control->next = thread->th.th_team->t.t_control_stack_top;
2631  thread->th.th_team->t.t_control_stack_top = control;
2632  }
2633  }
2634 }
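/* The pushed record snapshots the thread's current ICVs, tagged with the
   current serial nesting level, so that a change made by a call such as
   omp_set_num_threads() or omp_set_schedule() inside a nested, serialized
   parallel region can be restored when that serialized level ends (the
   restore side is handled by __kmpc_end_serialized_parallel). Only one record
   is pushed per serial nesting level: a later ICV change at the same level
   finds a record whose serial_nesting_level already matches and pushes
   nothing. */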
2635 
2636 /* Changes set_nproc */
2637 void __kmp_set_num_threads(int new_nth, int gtid) {
2638  kmp_info_t *thread;
2639  kmp_root_t *root;
2640 
2641  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2642  KMP_DEBUG_ASSERT(__kmp_init_serial);
2643 
2644  if (new_nth < 1)
2645  new_nth = 1;
2646  else if (new_nth > __kmp_max_nth)
2647  new_nth = __kmp_max_nth;
2648 
2649  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2650  thread = __kmp_threads[gtid];
2651  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2652  return; // nothing to do
2653 
2654  __kmp_save_internal_controls(thread);
2655 
2656  set__nproc(thread, new_nth);
2657 
2658  // If this omp_set_num_threads() call will cause the hot team size to be
2659  // reduced (in the absence of a num_threads clause), then reduce it now,
2660  // rather than waiting for the next parallel region.
2661  root = thread->th.th_root;
2662  if (__kmp_init_parallel && (!root->r.r_active) &&
2663  (root->r.r_hot_team->t.t_nproc > new_nth)
2664 #if KMP_NESTED_HOT_TEAMS
2665  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2666 #endif
2667  ) {
2668  kmp_team_t *hot_team = root->r.r_hot_team;
2669  int f;
2670 
2671  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2672 
2673  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2674  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2675  }
2676  // Release the extra threads we don't need any more.
2677  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2678  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2679  if (__kmp_tasking_mode != tskm_immediate_exec) {
2680  // When decreasing team size, threads no longer in the team should unref
2681  // task team.
2682  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2683  }
2684  __kmp_free_thread(hot_team->t.t_threads[f]);
2685  hot_team->t.t_threads[f] = NULL;
2686  }
2687  hot_team->t.t_nproc = new_nth;
2688 #if KMP_NESTED_HOT_TEAMS
2689  if (thread->th.th_hot_teams) {
2690  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2691  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2692  }
2693 #endif
2694 
2695  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2696  hot_team->t.b->update_num_threads(new_nth);
2697  __kmp_add_threads_to_team(hot_team, new_nth);
2698  }
2699 
2700  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2701 
2702  // Update the t_nproc field in the threads that are still active.
2703  for (f = 0; f < new_nth; f++) {
2704  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2705  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2706  }
2707  // Special flag in case omp_set_num_threads() call
2708  hot_team->t.t_size_changed = -1;
2709  }
2710 }
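/* User-level view (illustrative sketch, not part of the runtime source):
   omp_set_num_threads() reaches this routine, so the requested value is
   clamped to [1, __kmp_max_nth], the nproc ICV is updated, and -- when the
   root is idle -- the hot team is shrunk right away instead of at the next
   fork.

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_num_threads(4);                // updates the nproc ICV
       printf("%d\n", omp_get_max_threads()); // reads the same ICV back (4,
                                              // unless clamped by thread limits)
       #pragma omp parallel                   // next region requests 4 threads
       { }
       return 0;
     }
*/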
2711 
2712 /* Changes max_active_levels */
2713 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2714  kmp_info_t *thread;
2715 
2716  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2717  "%d = (%d)\n",
2718  gtid, max_active_levels));
2719  KMP_DEBUG_ASSERT(__kmp_init_serial);
2720 
2721  // validate max_active_levels
2722  if (max_active_levels < 0) {
2723  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2724  // We ignore this call if the user has specified a negative value.
2725  // The current setting won't be changed. The last valid setting will be
2726  // used. A warning will be issued (if warnings are allowed as controlled by
2727  // the KMP_WARNINGS env var).
2728  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2729  "max_active_levels for thread %d = (%d)\n",
2730  gtid, max_active_levels));
2731  return;
2732  }
2733  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2734  // it's OK, the max_active_levels is within the valid range: [ 0;
2735  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2736  // We allow a zero value. (implementation defined behavior)
2737  } else {
2738  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2739  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2740  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2741  // Current upper limit is MAX_INT. (implementation defined behavior)
2742  // If the input exceeds the upper limit, we correct the input to be the
2743  // upper limit. (implementation defined behavior)
2744  // In practice, control should never reach here while the limit is MAX_INT.
2745  }
2746  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2747  "max_active_levels for thread %d = (%d)\n",
2748  gtid, max_active_levels));
2749 
2750  thread = __kmp_threads[gtid];
2751 
2752  __kmp_save_internal_controls(thread);
2753 
2754  set__max_active_levels(thread, max_active_levels);
2755 }
2756 
2757 /* Gets max_active_levels */
2758 int __kmp_get_max_active_levels(int gtid) {
2759  kmp_info_t *thread;
2760 
2761  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2762  KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764  thread = __kmp_threads[gtid];
2765  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2766  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2767  "curtask_maxaclevel=%d\n",
2768  gtid, thread->th.th_current_task,
2769  thread->th.th_current_task->td_icvs.max_active_levels));
2770  return thread->th.th_current_task->td_icvs.max_active_levels;
2771 }
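/* These two routines back omp_set_max_active_levels() and
   omp_get_max_active_levels(). Illustrative sketch only:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_max_active_levels(-3);               // ignored; warning only
       omp_set_max_active_levels(2);                // accepted
       printf("%d\n", omp_get_max_active_levels()); // prints 2
       return 0;
     }
*/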
2772 
2773 // nteams-var per-device ICV
2774 void __kmp_set_num_teams(int num_teams) {
2775  if (num_teams > 0)
2776  __kmp_nteams = num_teams;
2777 }
2778 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2779 // teams-thread-limit-var per-device ICV
2780 void __kmp_set_teams_thread_limit(int limit) {
2781  if (limit > 0)
2782  __kmp_teams_thread_limit = limit;
2783 }
2784 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2785 
2786 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2787 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2788 
2789 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2790 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2791  kmp_info_t *thread;
2792  kmp_sched_t orig_kind;
2793  // kmp_team_t *team;
2794 
2795  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2796  gtid, (int)kind, chunk));
2797  KMP_DEBUG_ASSERT(__kmp_init_serial);
2798 
2799  // Check if the kind parameter is valid, correct if needed.
2800  // Valid parameters should fit in one of two intervals - standard or extended:
2801  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2802  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2803  orig_kind = kind;
2804  kind = __kmp_sched_without_mods(kind);
2805 
2806  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2807  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2808  // TODO: Hint needs attention in case we change the default schedule.
2809  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2810  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2811  __kmp_msg_null);
2812  kind = kmp_sched_default;
2813  chunk = 0; // ignore chunk value in case of bad kind
2814  }
2815 
2816  thread = __kmp_threads[gtid];
2817 
2818  __kmp_save_internal_controls(thread);
2819 
2820  if (kind < kmp_sched_upper_std) {
2821  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2822  // distinguish static chunked vs. unchunked: the chunk should be invalid to
2823  // indicate an unchunked schedule (which is the default)
2824  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2825  } else {
2826  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2827  __kmp_sch_map[kind - kmp_sched_lower - 1];
2828  }
2829  } else {
2830  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2831  // kmp_sched_lower - 2 ];
2832  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2833  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2834  kmp_sched_lower - 2];
2835  }
2836  __kmp_sched_apply_mods_intkind(
2837  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2838  if (kind == kmp_sched_auto || chunk < 1) {
2839  // ignore parameter chunk for schedule auto
2840  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2841  } else {
2842  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2843  }
2844 }
2845 
2846 /* Gets def_sched_var ICV values */
2847 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2848  kmp_info_t *thread;
2849  enum sched_type th_type;
2850 
2851  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2852  KMP_DEBUG_ASSERT(__kmp_init_serial);
2853 
2854  thread = __kmp_threads[gtid];
2855 
2856  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2857  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2858  case kmp_sch_static:
2859  case kmp_sch_static_greedy:
2860  case kmp_sch_static_balanced:
2861  *kind = kmp_sched_static;
2862  __kmp_sched_apply_mods_stdkind(kind, th_type);
2863  *chunk = 0; // chunk was not set; indicate this with a zero value
2864  return;
2865  case kmp_sch_static_chunked:
2866  *kind = kmp_sched_static;
2867  break;
2868  case kmp_sch_dynamic_chunked:
2869  *kind = kmp_sched_dynamic;
2870  break;
2871  case kmp_sch_guided_chunked:
2872  case kmp_sch_guided_iterative_chunked:
2873  case kmp_sch_guided_analytical_chunked:
2874  *kind = kmp_sched_guided;
2875  break;
2876  case kmp_sch_auto:
2877  *kind = kmp_sched_auto;
2878  break;
2879  case kmp_sch_trapezoidal:
2880  *kind = kmp_sched_trapezoidal;
2881  break;
2882 #if KMP_STATIC_STEAL_ENABLED
2883  case kmp_sch_static_steal:
2884  *kind = kmp_sched_static_steal;
2885  break;
2886 #endif
2887  default:
2888  KMP_FATAL(UnknownSchedulingType, th_type);
2889  }
2890 
2891  __kmp_sched_apply_mods_stdkind(kind, th_type);
2892  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2893 }
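/* These two routines back omp_set_schedule() and omp_get_schedule().
   Illustrative sketch only; note the chunk handling above (chunk < 1 selects
   the unchunked static schedule, and a zero chunk is reported back for it):

     #include <omp.h>
     int main(void) {
       omp_sched_t kind;
       int chunk;
       omp_set_schedule(omp_sched_static, 0);  // unchunked static
       omp_get_schedule(&kind, &chunk);        // kind == omp_sched_static, chunk == 0
       omp_set_schedule(omp_sched_dynamic, 8);
       omp_get_schedule(&kind, &chunk);        // kind == omp_sched_dynamic, chunk == 8
       return 0;
     }
*/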
2894 
2895 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2896 
2897  int ii, dd;
2898  kmp_team_t *team;
2899  kmp_info_t *thr;
2900 
2901  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2902  KMP_DEBUG_ASSERT(__kmp_init_serial);
2903 
2904  // validate level
2905  if (level == 0)
2906  return 0;
2907  if (level < 0)
2908  return -1;
2909  thr = __kmp_threads[gtid];
2910  team = thr->th.th_team;
2911  ii = team->t.t_level;
2912  if (level > ii)
2913  return -1;
2914 
2915  if (thr->th.th_teams_microtask) {
2916  // AC: we are in teams region where multiple nested teams have same level
2917  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2918  if (level <=
2919  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2920  KMP_DEBUG_ASSERT(ii >= tlevel);
2921  // AC: As we need to pass by the teams league, we need to artificially
2922  // increase ii
2923  if (ii == tlevel) {
2924  ii += 2; // three teams have same level
2925  } else {
2926  ii++; // two teams have same level
2927  }
2928  }
2929  }
2930 
2931  if (ii == level)
2932  return __kmp_tid_from_gtid(gtid);
2933 
2934  dd = team->t.t_serialized;
2935  level++;
2936  while (ii > level) {
2937  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2938  }
2939  if ((team->t.t_serialized) && (!dd)) {
2940  team = team->t.t_parent;
2941  continue;
2942  }
2943  if (ii > level) {
2944  team = team->t.t_parent;
2945  dd = team->t.t_serialized;
2946  ii--;
2947  }
2948  }
2949 
2950  return (dd > 1) ? (0) : (team->t.t_master_tid);
2951 }
2952 
2953 int __kmp_get_team_size(int gtid, int level) {
2954 
2955  int ii, dd;
2956  kmp_team_t *team;
2957  kmp_info_t *thr;
2958 
2959  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2960  KMP_DEBUG_ASSERT(__kmp_init_serial);
2961 
2962  // validate level
2963  if (level == 0)
2964  return 1;
2965  if (level < 0)
2966  return -1;
2967  thr = __kmp_threads[gtid];
2968  team = thr->th.th_team;
2969  ii = team->t.t_level;
2970  if (level > ii)
2971  return -1;
2972 
2973  if (thr->th.th_teams_microtask) {
2974  // AC: we are in teams region where multiple nested teams have same level
2975  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2976  if (level <=
2977  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2978  KMP_DEBUG_ASSERT(ii >= tlevel);
2979  // AC: As we need to pass by the teams league, we need to artificially
2980  // increase ii
2981  if (ii == tlevel) {
2982  ii += 2; // three teams have same level
2983  } else {
2984  ii++; // two teams have same level
2985  }
2986  }
2987  }
2988 
2989  while (ii > level) {
2990  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2991  }
2992  if (team->t.t_serialized && (!dd)) {
2993  team = team->t.t_parent;
2994  continue;
2995  }
2996  if (ii > level) {
2997  team = team->t.t_parent;
2998  ii--;
2999  }
3000  }
3001 
3002  return team->t.t_nproc;
3003 }
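/* These two routines back omp_get_ancestor_thread_num() and
   omp_get_team_size(). Illustrative sketch only (assumes the requested four
   threads are actually granted):

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       #pragma omp parallel num_threads(4)
       {
         if (omp_get_thread_num() == 2) {
           // level 0 is the initial (implicit) team of one thread
           printf("%d %d\n", omp_get_ancestor_thread_num(0),
                  omp_get_team_size(0));         // prints "0 1"
           // level 1 is the current team
           printf("%d %d\n", omp_get_ancestor_thread_num(1),
                  omp_get_team_size(1));         // prints "2 4"
           printf("%d\n", omp_get_team_size(5)); // out of range: -1
         }
       }
       return 0;
     }
*/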
3004 
3005 kmp_r_sched_t __kmp_get_schedule_global() {
3006  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3007  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3008  // independently, so the updated schedule can be obtained here.
3009 
3010  kmp_r_sched_t r_sched;
3011 
3012  // Create the schedule from four globals: __kmp_sched, __kmp_chunk, __kmp_static,
3013  // and __kmp_guided. __kmp_sched should keep its original value so that the user
3014  // can set KMP_SCHEDULE multiple times and thus have different run-time schedules
3015  // in different roots (even in OMP 2.5).
3016  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3017  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3018  if (s == kmp_sch_static) {
3019  // replace STATIC with more detailed schedule (balanced or greedy)
3020  r_sched.r_sched_type = __kmp_static;
3021  } else if (s == kmp_sch_guided_chunked) {
3022  // replace GUIDED with more detailed schedule (iterative or analytical)
3023  r_sched.r_sched_type = __kmp_guided;
3024  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3025  r_sched.r_sched_type = __kmp_sched;
3026  }
3027  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3028 
3029  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3030  // __kmp_chunk may be wrong here (if it was not ever set)
3031  r_sched.chunk = KMP_DEFAULT_CHUNK;
3032  } else {
3033  r_sched.chunk = __kmp_chunk;
3034  }
3035 
3036  return r_sched;
3037 }
3038 
3039 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3040  at least argc t_argv entries for the requested team. */
3041 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3042 
3043  KMP_DEBUG_ASSERT(team);
3044  if (!realloc || argc > team->t.t_max_argc) {
3045 
3046  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3047  "current entries=%d\n",
3048  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3049  /* if previously allocated heap space for args, free them */
3050  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3051  __kmp_free((void *)team->t.t_argv);
3052 
3053  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3054  /* use unused space in the cache line for arguments */
3055  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3056  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3057  "argv entries\n",
3058  team->t.t_id, team->t.t_max_argc));
3059  team->t.t_argv = &team->t.t_inline_argv[0];
3060  if (__kmp_storage_map) {
3061  __kmp_print_storage_map_gtid(
3062  -1, &team->t.t_inline_argv[0],
3063  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3064  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3065  team->t.t_id);
3066  }
3067  } else {
3068  /* allocate space for arguments in the heap */
3069  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3070  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3071  : 2 * argc;
3072  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3073  "argv entries\n",
3074  team->t.t_id, team->t.t_max_argc));
3075  team->t.t_argv =
3076  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3077  if (__kmp_storage_map) {
3078  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3079  &team->t.t_argv[team->t.t_max_argc],
3080  sizeof(void *) * team->t.t_max_argc,
3081  "team_%d.t_argv", team->t.t_id);
3082  }
3083  }
3084  }
3085 }
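// The sizing policy above, written as a pure function: small argument counts
// reuse the inline space in the team structure, larger ones get a heap block
// with geometric headroom. The constants are passed in because their concrete
// values are implementation-tuned; example_argv_capacity is an invented name.
#if 0
static int example_argv_capacity(int argc, int inline_entries,
                                 int min_malloc_entries) {
  if (argc <= inline_entries)
    return inline_entries; // reuse spare space in the team's cache line
  return (argc <= (min_malloc_entries >> 1)) ? min_malloc_entries
                                             : 2 * argc; // heap, 2x headroom
}
// e.g. with inline_entries == 10 and min_malloc_entries == 100 (hypothetical
// values): argc == 4 -> 10 (inline), argc == 30 -> 100, argc == 80 -> 160.
#endif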
3086 
3087 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3088  int i;
3089  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3090  team->t.t_threads =
3091  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3092  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3093  sizeof(dispatch_shared_info_t) * num_disp_buff);
3094  team->t.t_dispatch =
3095  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3096  team->t.t_implicit_task_taskdata =
3097  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3098  team->t.t_max_nproc = max_nth;
3099 
3100  /* setup dispatch buffers */
3101  for (i = 0; i < num_disp_buff; ++i) {
3102  team->t.t_disp_buffer[i].buffer_index = i;
3103  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3104  }
3105 }
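// Buffer-count rule used above, as a one-liner: a team that can never have
// more than one thread gets just 2 dispatch buffers, otherwise the full
// __kmp_dispatch_num_buffers are kept, presumably so that several dynamically
// scheduled loops can be in flight at once. Invented name, illustration only.
#if 0
static int example_num_disp_buffers(int max_nth, int dispatch_num_buffers) {
  return max_nth > 1 ? dispatch_num_buffers : 2;
}
#endif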
3106 
3107 static void __kmp_free_team_arrays(kmp_team_t *team) {
3108  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3109  int i;
3110  for (i = 0; i < team->t.t_max_nproc; ++i) {
3111  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3112  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3113  team->t.t_dispatch[i].th_disp_buffer = NULL;
3114  }
3115  }
3116 #if KMP_USE_HIER_SCHED
3117  __kmp_dispatch_free_hierarchies(team);
3118 #endif
3119  __kmp_free(team->t.t_threads);
3120  __kmp_free(team->t.t_disp_buffer);
3121  __kmp_free(team->t.t_dispatch);
3122  __kmp_free(team->t.t_implicit_task_taskdata);
3123  team->t.t_threads = NULL;
3124  team->t.t_disp_buffer = NULL;
3125  team->t.t_dispatch = NULL;
3126  team->t.t_implicit_task_taskdata = 0;
3127 }
3128 
3129 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3130  kmp_info_t **oldThreads = team->t.t_threads;
3131 
3132  __kmp_free(team->t.t_disp_buffer);
3133  __kmp_free(team->t.t_dispatch);
3134  __kmp_free(team->t.t_implicit_task_taskdata);
3135  __kmp_allocate_team_arrays(team, max_nth);
3136 
3137  KMP_MEMCPY(team->t.t_threads, oldThreads,
3138  team->t.t_nproc * sizeof(kmp_info_t *));
3139 
3140  __kmp_free(oldThreads);
3141 }
3142 
3143 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3144 
3145  kmp_r_sched_t r_sched =
3146  __kmp_get_schedule_global(); // get current state of scheduling globals
3147 
3148  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3149 
3150  kmp_internal_control_t g_icvs = {
3151  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3152  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3153  // adjustment of threads (per thread)
3154  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3155  // whether blocktime is explicitly set
3156  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3157 #if KMP_USE_MONITOR
3158  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3159 // intervals
3160 #endif
3161  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3162  // next parallel region (per thread)
3163  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3164  __kmp_cg_max_nth, // int thread_limit;
3165  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3166  // for max_active_levels
3167  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3168  // {sched,chunk} pair
3169  __kmp_nested_proc_bind.bind_types[0],
3170  __kmp_default_device,
3171  NULL // struct kmp_internal_control *next;
3172  };
3173 
3174  return g_icvs;
3175 }
3176 
3177 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3178 
3179  kmp_internal_control_t gx_icvs;
3180  gx_icvs.serial_nesting_level =
3181  0; // probably =team->t.t_serial like in save_inter_controls
3182  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3183  gx_icvs.next = NULL;
3184 
3185  return gx_icvs;
3186 }
3187 
3188 static void __kmp_initialize_root(kmp_root_t *root) {
3189  int f;
3190  kmp_team_t *root_team;
3191  kmp_team_t *hot_team;
3192  int hot_team_max_nth;
3193  kmp_r_sched_t r_sched =
3194  __kmp_get_schedule_global(); // get current state of scheduling globals
3195  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3196  KMP_DEBUG_ASSERT(root);
3197  KMP_ASSERT(!root->r.r_begin);
3198 
3199  /* setup the root state structure */
3200  __kmp_init_lock(&root->r.r_begin_lock);
3201  root->r.r_begin = FALSE;
3202  root->r.r_active = FALSE;
3203  root->r.r_in_parallel = 0;
3204  root->r.r_blocktime = __kmp_dflt_blocktime;
3205 #if KMP_AFFINITY_SUPPORTED
3206  root->r.r_affinity_assigned = FALSE;
3207 #endif
3208 
3209  /* setup the root team for this task */
3210  /* allocate the root team structure */
3211  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3212 
3213  root_team =
3214  __kmp_allocate_team(root,
3215  1, // new_nproc
3216  1, // max_nproc
3217 #if OMPT_SUPPORT
3218  ompt_data_none, // root parallel id
3219 #endif
3220  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3221  0 // argc
3222  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3223  );
3224 #if USE_DEBUGGER
3225  // Non-NULL value should be assigned to make the debugger display the root
3226  // team.
3227  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3228 #endif
3229 
3230  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3231 
3232  root->r.r_root_team = root_team;
3233  root_team->t.t_control_stack_top = NULL;
3234 
3235  /* initialize root team */
3236  root_team->t.t_threads[0] = NULL;
3237  root_team->t.t_nproc = 1;
3238  root_team->t.t_serialized = 1;
3239  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3240  root_team->t.t_sched.sched = r_sched.sched;
3241  KA_TRACE(
3242  20,
3243  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3244  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3245 
3246  /* setup the hot team for this task */
3247  /* allocate the hot team structure */
3248  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3249 
3250  hot_team =
3251  __kmp_allocate_team(root,
3252  1, // new_nproc
3253  __kmp_dflt_team_nth_ub * 2, // max_nproc
3254 #if OMPT_SUPPORT
3255  ompt_data_none, // root parallel id
3256 #endif
3257  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3258  0 // argc
3259  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3260  );
3261  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3262 
3263  root->r.r_hot_team = hot_team;
3264  root_team->t.t_control_stack_top = NULL;
3265 
3266  /* first-time initialization */
3267  hot_team->t.t_parent = root_team;
3268 
3269  /* initialize hot team */
3270  hot_team_max_nth = hot_team->t.t_max_nproc;
3271  for (f = 0; f < hot_team_max_nth; ++f) {
3272  hot_team->t.t_threads[f] = NULL;
3273  }
3274  hot_team->t.t_nproc = 1;
3275  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3276  hot_team->t.t_sched.sched = r_sched.sched;
3277  hot_team->t.t_size_changed = 0;
3278 }
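// A rough picture of what __kmp_initialize_root leaves behind, written as
// plain data; ExampleRoot and ExampleTeamStub are invented stand-ins for
// kmp_root_t and kmp_team_t in this sketch.
#if 0
struct ExampleTeamStub; // stands in for kmp_team_t

struct ExampleRoot {
  ExampleTeamStub *root_team; // 1 thread, serialized; covers the serial part
  ExampleTeamStub *hot_team;  // reused by outermost parallel regions; its
                              // t_parent is root_team and its thread array is
                              // pre-sized for __kmp_dflt_team_nth_ub * 2
};
#endif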
3279 
3280 #ifdef KMP_DEBUG
3281 
3282 typedef struct kmp_team_list_item {
3283  kmp_team_p const *entry;
3284  struct kmp_team_list_item *next;
3285 } kmp_team_list_item_t;
3286 typedef kmp_team_list_item_t *kmp_team_list_t;
3287 
3288 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3289  kmp_team_list_t list, // List of teams.
3290  kmp_team_p const *team // Team to add.
3291 ) {
3292 
3293  // List must terminate with item where both entry and next are NULL.
3294  // Team is added to the list only once.
3295  // List is sorted in ascending order by team id.
3296  // Team id is *not* a key.
3297 
3298  kmp_team_list_t l;
3299 
3300  KMP_DEBUG_ASSERT(list != NULL);
3301  if (team == NULL) {
3302  return;
3303  }
3304 
3305  __kmp_print_structure_team_accum(list, team->t.t_parent);
3306  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3307 
3308  // Search list for the team.
3309  l = list;
3310  while (l->next != NULL && l->entry != team) {
3311  l = l->next;
3312  }
3313  if (l->next != NULL) {
3314  return; // Team has been added before, exit.
3315  }
3316 
3317  // Team is not found. Search list again for insertion point.
3318  l = list;
3319  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3320  l = l->next;
3321  }
3322 
3323  // Insert team.
3324  {
3325  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3326  sizeof(kmp_team_list_item_t));
3327  *item = *l;
3328  l->entry = team;
3329  l->next = item;
3330  }
3331 }
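// The block above inserts *before* node l in a singly linked list without
// tracking a predecessor: the new node receives l's old contents and l itself
// is overwritten with the new entry. This works because the list always ends
// with a sentinel item whose entry and next are both NULL. Generic sketch with
// invented names (ExampleItem, example_insert_before):
#if 0
struct ExampleItem {
  const void *entry;
  ExampleItem *next;
};

static void example_insert_before(ExampleItem *l, const void *new_entry) {
  ExampleItem *item = new ExampleItem(*l); // copy takes over l's old payload
  l->entry = new_entry;                    // l now holds the new entry
  l->next = item;                          // ...and links to the copy
}
#endif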
3332 
3333 static void __kmp_print_structure_team(char const *title,
3334                                        kmp_team_p const *team) {
3335 
3336  __kmp_printf("%s", title);
3337  if (team != NULL) {
3338  __kmp_printf("%2x %p\n", team->t.t_id, team);
3339  } else {
3340  __kmp_printf(" - (nil)\n");
3341  }
3342 }
3343 
3344 static void __kmp_print_structure_thread(char const *title,
3345  kmp_info_p const *thread) {
3346  __kmp_printf("%s", title);
3347  if (thread != NULL) {
3348  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3349  } else {
3350  __kmp_printf(" - (nil)\n");
3351  }
3352 }
3353 
3354 void __kmp_print_structure(void) {
3355 
3356  kmp_team_list_t list;
3357 
3358  // Initialize list of teams.
3359  list =
3360  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3361  list->entry = NULL;
3362  list->next = NULL;
3363 
3364  __kmp_printf("\n------------------------------\nGlobal Thread "
3365  "Table\n------------------------------\n");
3366  {
3367  int gtid;
3368  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3369  __kmp_printf("%2d", gtid);
3370  if (__kmp_threads != NULL) {
3371  __kmp_printf(" %p", __kmp_threads[gtid]);
3372  }
3373  if (__kmp_root != NULL) {
3374  __kmp_printf(" %p", __kmp_root[gtid]);
3375  }
3376  __kmp_printf("\n");
3377  }
3378  }
3379 
3380  // Print out __kmp_threads array.
3381  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3382  "----------\n");
3383  if (__kmp_threads != NULL) {
3384  int gtid;
3385  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3386  kmp_info_t const *thread = __kmp_threads[gtid];
3387  if (thread != NULL) {
3388  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3389  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3390  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3391  __kmp_print_structure_team(" Serial Team: ",
3392  thread->th.th_serial_team);
3393  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3394  __kmp_print_structure_thread(" Primary: ",
3395  thread->th.th_team_master);
3396  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3397  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3398  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3399  __kmp_print_structure_thread(" Next in pool: ",
3400  thread->th.th_next_pool);
3401  __kmp_printf("\n");
3402  __kmp_print_structure_team_accum(list, thread->th.th_team);
3403  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3404  }
3405  }
3406  } else {
3407  __kmp_printf("Threads array is not allocated.\n");
3408  }
3409 
3410  // Print out __kmp_root array.
3411  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3412  "--------\n");
3413  if (__kmp_root != NULL) {
3414  int gtid;
3415  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3416  kmp_root_t const *root = __kmp_root[gtid];
3417  if (root != NULL) {
3418  __kmp_printf("GTID %2d %p:\n", gtid, root);
3419  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3420  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3421  __kmp_print_structure_thread(" Uber Thread: ",
3422  root->r.r_uber_thread);
3423  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3424  __kmp_printf(" In Parallel: %2d\n",
3425  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3426  __kmp_printf("\n");
3427  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3428  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3429  }
3430  }
3431  } else {
3432  __kmp_printf("Ubers array is not allocated.\n");
3433  }
3434 
3435  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3436  "--------\n");
3437  while (list->next != NULL) {
3438  kmp_team_p const *team = list->entry;
3439  int i;
3440  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3441  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3442  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3443  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3444  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3445  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3446  for (i = 0; i < team->t.t_nproc; ++i) {
3447  __kmp_printf(" Thread %2d: ", i);
3448  __kmp_print_structure_thread("", team->t.t_threads[i]);
3449  }
3450  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3451  __kmp_printf("\n");
3452  list = list->next;
3453  }
3454 
3455  // Print out __kmp_thread_pool and __kmp_team_pool.
3456  __kmp_printf("\n------------------------------\nPools\n----------------------"
3457  "--------\n");
3458  __kmp_print_structure_thread("Thread pool: ",
3459  CCAST(kmp_info_t *, __kmp_thread_pool));
3460  __kmp_print_structure_team("Team pool: ",
3461  CCAST(kmp_team_t *, __kmp_team_pool));
3462  __kmp_printf("\n");
3463 
3464  // Free team list.
3465  while (list != NULL) {
3466  kmp_team_list_item_t *item = list;
3467  list = list->next;
3468  KMP_INTERNAL_FREE(item);
3469  }
3470 }
3471 
3472 #endif
3473 
3474 //---------------------------------------------------------------------------
3475 // Stuff for per-thread fast random number generator
3476 // Table of primes
3477 static const unsigned __kmp_primes[] = {
3478  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3479  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3480  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3481  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3482  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3483  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3484  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3485  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3486  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3487  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3488  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3489 
3490 //---------------------------------------------------------------------------
3491 // __kmp_get_random: Get a random number using a linear congruential method.
3492 unsigned short __kmp_get_random(kmp_info_t *thread) {
3493  unsigned x = thread->th.th_x;
3494  unsigned short r = (unsigned short)(x >> 16);
3495 
3496  thread->th.th_x = x * thread->th.th_a + 1;
3497 
3498  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3499  thread->th.th_info.ds.ds_tid, r));
3500 
3501  return r;
3502 }
3503 //--------------------------------------------------------
3504 // __kmp_init_random: Initialize a random number generator
3505 void __kmp_init_random(kmp_info_t *thread) {
3506  unsigned seed = thread->th.th_info.ds.ds_tid;
3507 
3508  thread->th.th_a =
3509  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3510  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3511  KA_TRACE(30,
3512  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3513 }
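// The two routines above implement a per-thread linear congruential generator:
// the state advances as x' = a * x + 1 (mod 2^32 where unsigned is 32 bits),
// with a per-thread multiplier a drawn from __kmp_primes by thread id, and the
// returned value is the upper 16 bits of the previous state (the better-mixed
// bits). Standalone model with invented names (ExampleRng, example_rng_*):
#if 0
struct ExampleRng {
  unsigned a; // per-thread multiplier (a prime from the table above)
  unsigned x; // current state
};

static void example_rng_init(ExampleRng *rng, unsigned seed, unsigned prime) {
  rng->a = prime;
  rng->x = (seed + 1) * rng->a + 1;
}

static unsigned short example_rng_next(ExampleRng *rng) {
  unsigned short r = (unsigned short)(rng->x >> 16); // high bits of old state
  rng->x = rng->x * rng->a + 1;                      // advance the LCG
  return r;
}
#endif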
3514 
3515 #if KMP_OS_WINDOWS
3516 /* reclaim array entries for root threads that are already dead, returns number
3517  * reclaimed */
3518 static int __kmp_reclaim_dead_roots(void) {
3519  int i, r = 0;
3520 
3521  for (i = 0; i < __kmp_threads_capacity; ++i) {
3522  if (KMP_UBER_GTID(i) &&
3523  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3524  !__kmp_root[i]
3525  ->r.r_active) { // AC: reclaim only roots died in non-active state
3526  r += __kmp_unregister_root_other_thread(i);
3527  }
3528  }
3529  return r;
3530 }
3531 #endif
3532 
3533 /* This function attempts to create free entries in __kmp_threads and
3534  __kmp_root, and returns the number of free entries generated.
3535 
3536  For Windows* OS static library, the first mechanism used is to reclaim array
3537  entries for root threads that are already dead.
3538 
3539  On all platforms, expansion is attempted on the arrays __kmp_threads and
3540  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3541  capacity is increased by doubling, clipped to __kmp_sys_max_nth; if the
3542  threadprivate cache array has been created, it is resized to match.
3543  Synchronization with __kmpc_threadprivate_cached uses __kmp_tp_cached_lock.
3544 
3545  After any dead root reclamation, if the clipping value allows array expansion
3546  to result in the generation of a total of nNeed free slots, the function does
3547  that expansion. If not, nothing is done beyond the possible initial root
3548  thread reclamation.
3549 
3550  If any argument is negative, the behavior is undefined. */
3551 static int __kmp_expand_threads(int nNeed) {
3552  int added = 0;
3553  int minimumRequiredCapacity;
3554  int newCapacity;
3555  kmp_info_t **newThreads;
3556  kmp_root_t **newRoot;
3557 
3558  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3559  // resizing __kmp_threads does not need additional protection if foreign
3560  // threads are present
3561 
3562 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3563  /* only for Windows static library */
3564  /* reclaim array entries for root threads that are already dead */
3565  added = __kmp_reclaim_dead_roots();
3566 
3567  if (nNeed) {
3568  nNeed -= added;
3569  if (nNeed < 0)
3570  nNeed = 0;
3571  }
3572 #endif
3573  if (nNeed <= 0)
3574  return added;
3575 
3576  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3577  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3578  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3579  // > __kmp_max_nth in one of two ways:
3580  //
3581  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3582  // may not be reused by another thread, so we may need to increase
3583  // __kmp_threads_capacity to __kmp_max_nth + 1.
3584  //
3585  // 2) New foreign root(s) are encountered. We always register new foreign
3586  // roots. This may cause a smaller # of threads to be allocated at
3587  // subsequent parallel regions, but the worker threads hang around (and
3588  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3589  //
3590  // Anyway, that is the reason for moving the check to see if
3591  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3592  // instead of having it performed here. -BB
3593 
3594  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3595 
3596  /* compute expansion headroom to check if we can expand */
3597  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3598  /* possible expansion too small -- give up */
3599  return added;
3600  }
3601  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3602 
3603  newCapacity = __kmp_threads_capacity;
3604  do {
3605  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3606  : __kmp_sys_max_nth;
3607  } while (newCapacity < minimumRequiredCapacity);
3608  newThreads = (kmp_info_t **)__kmp_allocate(
3609  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3610  newRoot =
3611  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3612  KMP_MEMCPY(newThreads, __kmp_threads,
3613  __kmp_threads_capacity * sizeof(kmp_info_t *));
3614  KMP_MEMCPY(newRoot, __kmp_root,
3615  __kmp_threads_capacity * sizeof(kmp_root_t *));
3616 
3617  kmp_info_t **temp_threads = __kmp_threads;
3618  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3619  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3620  __kmp_free(temp_threads);
3621  added += newCapacity - __kmp_threads_capacity;
3622  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3623 
3624  if (newCapacity > __kmp_tp_capacity) {
3625  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3626  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3627  __kmp_threadprivate_resize_cache(newCapacity);
3628  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3629  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3630  }
3631  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3632  }
3633 
3634  return added;
3635 }
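// Growth policy of __kmp_expand_threads, reduced to a pure function: double
// the capacity until the request fits, clipping at the system maximum, and
// refuse when even the maximum cannot satisfy the request. Invented name;
// the real function also copies the old arrays and updates the globals.
#if 0
static int example_next_capacity(int capacity, int nNeed, int sys_max) {
  if (sys_max - capacity < nNeed)
    return -1; // cannot expand enough; caller gives up
  int required = capacity + nNeed;
  int new_capacity = capacity;
  do {
    new_capacity =
        new_capacity <= (sys_max >> 1) ? (new_capacity << 1) : sys_max;
  } while (new_capacity < required);
  return new_capacity;
}
// e.g. capacity == 32, nNeed == 5, sys_max == 1024 -> 64;
//      capacity == 600, nNeed == 300, sys_max == 1024 -> 1024.
#endif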
3636 
3637 /* Register the current thread as a root thread and obtain our gtid. We must
3638  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3639  thread that calls from __kmp_do_serial_initialize() */
3640 int __kmp_register_root(int initial_thread) {
3641  kmp_info_t *root_thread;
3642  kmp_root_t *root;
3643  int gtid;
3644  int capacity;
3645  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3646  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3647  KMP_MB();
3648 
3649  /* 2007-03-02:
3650  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3651  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3652  condition does not work as expected -- it may return false (meaning there
3653  is at least one empty slot in the __kmp_threads array), but it is possible
3654  that the only free slot is #0, which is reserved for the initial thread and
3655  so cannot be used for this one. The following code works around this bug.
3656 
3657  However, the right solution seems to be to not reserve slot #0 for the
3658  initial thread, because:
3659  (1) there is no magic in slot #0,
3660  (2) we cannot detect the initial thread reliably (the first thread which
3661  does serial initialization may not be a real initial thread).
3662  */
3663  capacity = __kmp_threads_capacity;
3664  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3665  --capacity;
3666  }
3667 
3668  // If it is not for initializing the hidden helper team, we need to take
3669  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3670  // in __kmp_threads_capacity.
3671  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3672  capacity -= __kmp_hidden_helper_threads_num;
3673  }
3674 
3675  /* see if there are too many threads */
3676  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3677  if (__kmp_tp_cached) {
3678  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3679  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3680  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3681  } else {
3682  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3683  __kmp_msg_null);
3684  }
3685  }
3686 
3687  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3688  // 0: initial thread, also a regular OpenMP thread.
3689  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3690  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3691  // regular OpenMP threads.
3692  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3693  // Find an available thread slot for a hidden helper thread. Slots for
3694  // hidden helper threads run from 1 to __kmp_hidden_helper_threads_num.
3695  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3696  gtid <= __kmp_hidden_helper_threads_num;
3697  gtid++)
3698  ;
3699  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3700  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3701  "hidden helper thread: T#%d\n",
3702  gtid));
3703  } else {
3704  /* find an available thread slot */
3705  // Don't reassign the zero slot since we need that to only be used by
3706  // initial thread. Slots for hidden helper threads should also be skipped.
3707  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3708  gtid = 0;
3709  } else {
3710  for (gtid = __kmp_hidden_helper_threads_num + 1;
3711  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3712  ;
3713  }
3714  KA_TRACE(
3715  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3716  KMP_ASSERT(gtid < __kmp_threads_capacity);
3717  }
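// The gtid chosen above follows the slot layout documented in the comment
// before this block. A compact decision-function view, with invented names;
// N stands for __kmp_hidden_helper_threads_num, and the returned index is
// where the upward scan for a NULL __kmp_threads entry starts.
#if 0
static int example_first_candidate_slot(bool for_hidden_helper,
                                        bool initial_thread_and_slot0_free,
                                        int N) {
  if (for_hidden_helper)
    return 1; // hidden helper threads occupy gtids [1, N]
  if (initial_thread_and_slot0_free)
    return 0; // slot 0 is reserved for the initial thread
  return N + 1; // regular roots skip the hidden helper range
}
#endif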
3718 
3719  /* update global accounting */
3720  __kmp_all_nth++;
3721  TCW_4(__kmp_nth, __kmp_nth + 1);
3722 
3723  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3724  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3725  if (__kmp_adjust_gtid_mode) {
3726  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3727  if (TCR_4(__kmp_gtid_mode) != 2) {
3728  TCW_4(__kmp_gtid_mode, 2);
3729  }
3730  } else {
3731  if (TCR_4(__kmp_gtid_mode) != 1) {
3732  TCW_4(__kmp_gtid_mode, 1);
3733  }
3734  }
3735  }
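// The mode switch above as a pure decision: with few threads the runtime can
// afford gtid lookup by stack-pointer search (mode 1); once the thread count
// reaches __kmp_tls_gtid_min it switches to the keyed/TLS API (mode 2).
// Invented name, illustration only.
#if 0
static int example_gtid_mode(int all_nth, int tls_gtid_min) {
  return (all_nth >= tls_gtid_min) ? 2 /* keyed/TLS API */
                                   : 1 /* stack-pointer search */;
}
#endif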
3736 
3737 #ifdef KMP_ADJUST_BLOCKTIME
3738  /* Adjust blocktime to zero if necessary */
3739  /* Middle initialization might not have occurred yet */
3740  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3741  if (__kmp_nth > __kmp_avail_proc) {
3742  __kmp_zero_bt = TRUE;
3743  }
3744  }
3745 #endif /* KMP_ADJUST_BLOCKTIME */
3746 
3747  /* setup this new hierarchy */
3748  if (!(root = __kmp_root[gtid])) {
3749  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3750  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3751  }
3752 
3753 #if KMP_STATS_ENABLED
3754  // Initialize stats as soon as possible (right after gtid assignment).
3755  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3756  __kmp_stats_thread_ptr->startLife();
3757  KMP_SET_THREAD_STATE(SERIAL_REGION);
3758  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3759 #endif
3760  __kmp_initialize_root(root);
3761 
3762  /* setup new root thread structure */
3763  if (root->r.r_uber_thread) {
3764  root_thread = root->r.r_uber_thread;
3765  } else {
3766  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3767  if (__kmp_storage_map) {
3768  __kmp_print_thread_storage_map(root_thread, gtid);
3769  }
3770  root_thread->th.th_info.ds.ds_gtid = gtid;
3771 #if OMPT_SUPPORT
3772  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3773 #endif
3774  root_thread->th.th_root = root;
3775  if (__kmp_env_consistency_check) {
3776  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3777  }
3778 #if USE_FAST_MEMORY
3779  __kmp_initialize_fast_memory(root_thread);
3780 #endif /* USE_FAST_MEMORY */
3781 
3782 #if KMP_USE_BGET
3783  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3784  __kmp_initialize_bget(root_thread);
3785 #endif
3786  __kmp_init_random(root_thread); // Initialize random number generator
3787  }
3788 
3789  /* setup the serial team held in reserve by the root thread */
3790  if (!root_thread->th.th_serial_team) {
3791  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3792  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3793  root_thread->th.th_serial_team = __kmp_allocate_team(
3794  root, 1, 1,
3795 #if OMPT_SUPPORT
3796  ompt_data_none, // root parallel id
3797 #endif
3798  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3799  }
3800  KMP_ASSERT(root_thread->th.th_serial_team);
3801  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3802  root_thread->th.th_serial_team));
3803 
3804  /* drop root_thread into place */
3805  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3806 
3807  root->r.r_root_team->t.t_threads[0] = root_thread;
3808  root->r.r_hot_team->t.t_threads[0] = root_thread;
3809  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3810  // AC: the team is created in reserve, not for execution (it is unused for now).
3811  root_thread->th.th_serial_team->t.t_serialized = 0;
3812  root->r.r_uber_thread = root_thread;
3813 
3814  /* initialize the thread, get it ready to go */
3815  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3816  TCW_4(__kmp_init_gtid, TRUE);
3817 
3818  /* prepare the primary thread for get_gtid() */
3819  __kmp_gtid_set_specific(gtid);
3820 
3821 #if USE_ITT_BUILD
3822  __kmp_itt_thread_name(gtid);
3823 #endif /* USE_ITT_BUILD */
3824 
3825 #ifdef KMP_TDATA_GTID
3826  __kmp_gtid = gtid;
3827 #endif
3828  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3829  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3830 
3831  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3832  "plain=%u\n",
3833  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3834  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3835  KMP_INIT_BARRIER_STATE));
3836  { // Initialize barrier data.
3837  int b;
3838  for (b = 0; b < bs_last_barrier; ++b) {
3839  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3840 #if USE_DEBUGGER
3841  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3842 #endif
3843  }
3844  }
3845  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3846  KMP_INIT_BARRIER_STATE);
3847 
3848 #if KMP_AFFINITY_SUPPORTED
3849  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3850  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3851  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3852  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3853 #endif /* KMP_AFFINITY_SUPPORTED */
3854  root_thread->th.th_def_allocator = __kmp_def_allocator;
3855  root_thread->th.th_prev_level = 0;
3856  root_thread->th.th_prev_num_threads = 1;
3857 
3858  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3859  tmp->cg_root = root_thread;
3860  tmp->cg_thread_limit = __kmp_cg_max_nth;
3861  tmp->cg_nthreads = 1;
3862  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3863  " cg_nthreads init to 1\n",
3864  root_thread, tmp));
3865  tmp->up = NULL;
3866  root_thread->th.th_cg_roots = tmp;
3867 
3868  __kmp_root_counter++;
3869 
3870 #if OMPT_SUPPORT
3871  if (!initial_thread && ompt_enabled.enabled) {
3872 
3873  kmp_info_t *root_thread = ompt_get_thread();
3874 
3875  ompt_set_thread_state(root_thread, ompt_state_overhead);
3876 
3877  if (ompt_enabled.ompt_callback_thread_begin) {
3878  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3879  ompt_thread_initial, __ompt_get_thread_data_internal());
3880  }
3881  ompt_data_t *task_data;
3882  ompt_data_t *parallel_data;
3883  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3884  NULL);
3885  if (ompt_enabled.ompt_callback_implicit_task) {
3886  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3887  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3888  }
3889 
3890  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3891  }
3892 #endif
3893 #if OMPD_SUPPORT
3894  if (ompd_state & OMPD_ENABLE_BP)
3895  ompd_bp_thread_begin();
3896 #endif
3897 
3898  KMP_MB();
3899  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3900 
3901  return gtid;
3902 }
3903 
3904 #if KMP_NESTED_HOT_TEAMS
3905 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3906  const int max_level) {
3907  int i, n, nth;
3908  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3909  if (!hot_teams || !hot_teams[level].hot_team) {
3910  return 0;
3911  }
3912  KMP_DEBUG_ASSERT(level < max_level);
3913  kmp_team_t *team = hot_teams[level].hot_team;
3914  nth = hot_teams[level].hot_team_nth;
3915  n = nth - 1; // primary thread is not freed
3916  if (level < max_level - 1) {
3917  for (i = 0; i < nth; ++i) {
3918  kmp_info_t *th = team->t.t_threads[i];
3919  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3920  if (i > 0 && th->th.th_hot_teams) {
3921  __kmp_free(th->th.th_hot_teams);
3922  th->th.th_hot_teams = NULL;
3923  }
3924  }
3925  }
3926  __kmp_free_team(root, team, NULL);
3927  return n;
3928 }
3929 #endif
3930 
3931 // Resets a root thread and clears its root and hot teams.
3932 // Returns the number of __kmp_threads entries directly and indirectly freed.
3933 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3934  kmp_team_t *root_team = root->r.r_root_team;
3935  kmp_team_t *hot_team = root->r.r_hot_team;
3936  int n = hot_team->t.t_nproc;
3937  int i;
3938 
3939  KMP_DEBUG_ASSERT(!root->r.r_active);
3940 
3941  root->r.r_root_team = NULL;
3942  root->r.r_hot_team = NULL;
3943  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3944  // before call to __kmp_free_team().
3945  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3946 #if KMP_NESTED_HOT_TEAMS
3947  if (__kmp_hot_teams_max_level >
3948  0) { // need to free nested hot teams and their threads if any
3949  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3950  kmp_info_t *th = hot_team->t.t_threads[i];
3951  if (__kmp_hot_teams_max_level > 1) {
3952  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3953  }
3954  if (th->th.th_hot_teams) {
3955  __kmp_free(th->th.th_hot_teams);
3956  th->th.th_hot_teams = NULL;
3957  }
3958  }
3959  }
3960 #endif
3961  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3962 
3963  // Before we can reap the thread, we need to make certain that all other
3964  // threads in the teams that had this root as ancestor have stopped trying to
3965  // steal tasks.
3966  if (__kmp_tasking_mode != tskm_immediate_exec) {
3967  __kmp_wait_to_unref_task_teams();
3968  }
3969 
3970 #if KMP_OS_WINDOWS
3971  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3972  KA_TRACE(
3973  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3974  "\n",
3975  (LPVOID) & (root->r.r_uber_thread->th),
3976  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3977  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3978 #endif /* KMP_OS_WINDOWS */
3979 
3980 #if OMPD_SUPPORT
3981  if (ompd_state & OMPD_ENABLE_BP)
3982  ompd_bp_thread_end();
3983 #endif
3984 
3985 #if OMPT_SUPPORT
3986  ompt_data_t *task_data;
3987  ompt_data_t *parallel_data;
3988  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3989  NULL);
3990  if (ompt_enabled.ompt_callback_implicit_task) {
3991  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3992  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3993  }
3994  if (ompt_enabled.ompt_callback_thread_end) {
3995  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3996  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3997  }
3998 #endif
3999 
4000  TCW_4(__kmp_nth,
4001  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4002  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4003  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4004  " to %d\n",
4005  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4006  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4007  if (i == 1) {
4008  // need to free contention group structure
4009  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4010  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4011  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4012  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4013  root->r.r_uber_thread->th.th_cg_roots = NULL;
4014  }
4015  __kmp_reap_thread(root->r.r_uber_thread, 1);
4016 
4017  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4018  // it instead of freeing it.
4019  root->r.r_uber_thread = NULL;
4020  /* mark root as no longer in use */
4021  root->r.r_begin = FALSE;
4022 
4023  return n;
4024 }
4025 
4026 void __kmp_unregister_root_current_thread(int gtid) {
4027  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4028  /* this lock should be ok, since unregister_root_current_thread is never
4029  called during an abort, only during a normal close. furthermore, if you
4030  have the forkjoin lock, you should never try to get the initz lock */
4031  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4032  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4033  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4034  "exiting T#%d\n",
4035  gtid));
4036  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4037  return;
4038  }
4039  kmp_root_t *root = __kmp_root[gtid];
4040 
4041  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4042  KMP_ASSERT(KMP_UBER_GTID(gtid));
4043  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4044  KMP_ASSERT(root->r.r_active == FALSE);
4045 
4046  KMP_MB();
4047 
4048  kmp_info_t *thread = __kmp_threads[gtid];
4049  kmp_team_t *team = thread->th.th_team;
4050  kmp_task_team_t *task_team = thread->th.th_task_team;
4051 
4052  // we need to wait for the proxy tasks before finishing the thread
4053  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4054 #if OMPT_SUPPORT
4055  // the runtime is shutting down so we won't report any events
4056  thread->th.ompt_thread_info.state = ompt_state_undefined;
4057 #endif
4058  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4059  }
4060 
4061  __kmp_reset_root(gtid, root);
4062 
4063  KMP_MB();
4064  KC_TRACE(10,
4065  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4066 
4067  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4068 }
4069 
4070 #if KMP_OS_WINDOWS
4071 /* __kmp_forkjoin_lock must be already held
4072  Unregisters a root thread that is not the current thread. Returns the number
4073  of __kmp_threads entries freed as a result. */
4074 static int __kmp_unregister_root_other_thread(int gtid) {
4075  kmp_root_t *root = __kmp_root[gtid];
4076  int r;
4077 
4078  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4079  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4080  KMP_ASSERT(KMP_UBER_GTID(gtid));
4081  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4082  KMP_ASSERT(root->r.r_active == FALSE);
4083 
4084  r = __kmp_reset_root(gtid, root);
4085  KC_TRACE(10,
4086  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4087  return r;
4088 }
4089 #endif
4090 
4091 #if KMP_DEBUG
4092 void __kmp_task_info() {
4093 
4094  kmp_int32 gtid = __kmp_entry_gtid();
4095  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4096  kmp_info_t *this_thr = __kmp_threads[gtid];
4097  kmp_team_t *steam = this_thr->th.th_serial_team;
4098  kmp_team_t *team = this_thr->th.th_team;
4099 
4100  __kmp_printf(
4101  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4102  "ptask=%p\n",
4103  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4104  team->t.t_implicit_task_taskdata[tid].td_parent);
4105 }
4106 #endif // KMP_DEBUG
4107 
4108 /* TODO optimize with one big memclr, take out what isn't needed, split
4109  responsibility to workers as much as possible, and delay initialization of
4110  features as much as possible */
4111 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4112  int tid, int gtid) {
4113  /* this_thr->th.th_info.ds.ds_gtid is setup in
4114  kmp_allocate_thread/create_worker.
4115  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4116  KMP_DEBUG_ASSERT(this_thr != NULL);
4117  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4118  KMP_DEBUG_ASSERT(team);
4119  KMP_DEBUG_ASSERT(team->t.t_threads);
4120  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4121  kmp_info_t *master = team->t.t_threads[0];
4122  KMP_DEBUG_ASSERT(master);
4123  KMP_DEBUG_ASSERT(master->th.th_root);
4124 
4125  KMP_MB();
4126 
4127  TCW_SYNC_PTR(this_thr->th.th_team, team);
4128 
4129  this_thr->th.th_info.ds.ds_tid = tid;
4130  this_thr->th.th_set_nproc = 0;
4131  if (__kmp_tasking_mode != tskm_immediate_exec)
4132  // When tasking is possible, threads are not safe to reap until they are
4133  // done tasking; this will be set when tasking code is exited in wait
4134  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4135  else // no tasking --> always safe to reap
4136  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4137  this_thr->th.th_set_proc_bind = proc_bind_default;
4138 #if KMP_AFFINITY_SUPPORTED
4139  this_thr->th.th_new_place = this_thr->th.th_current_place;
4140 #endif
4141  this_thr->th.th_root = master->th.th_root;
4142 
4143  /* setup the thread's cache of the team structure */
4144  this_thr->th.th_team_nproc = team->t.t_nproc;
4145  this_thr->th.th_team_master = master;
4146  this_thr->th.th_team_serialized = team->t.t_serialized;
4147 
4148  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4149 
4150  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4151  tid, gtid, this_thr, this_thr->th.th_current_task));
4152 
4153  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4154  team, tid, TRUE);
4155 
4156  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4157  tid, gtid, this_thr, this_thr->th.th_current_task));
4158  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4159  // __kmp_initialize_team()?
4160 
4161  /* TODO no worksharing in speculative threads */
4162  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4163 
4164  this_thr->th.th_local.this_construct = 0;
4165 
4166  if (!this_thr->th.th_pri_common) {
4167  this_thr->th.th_pri_common =
4168  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4169  if (__kmp_storage_map) {
4170  __kmp_print_storage_map_gtid(
4171  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4172  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4173  }
4174  this_thr->th.th_pri_head = NULL;
4175  }
4176 
4177  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4178  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4179  // Make new thread's CG root same as primary thread's
4180  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4181  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4182  if (tmp) {
4183  // worker changes CG, need to check if old CG should be freed
4184  int i = tmp->cg_nthreads--;
4185  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4186  " on node %p of thread %p to %d\n",
4187  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4188  if (i == 1) {
4189  __kmp_free(tmp); // last thread left CG --> free it
4190  }
4191  }
4192  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4193  // Increment new thread's CG root's counter to add the new thread
4194  this_thr->th.th_cg_roots->cg_nthreads++;
4195  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4196  " node %p of thread %p to %d\n",
4197  this_thr, this_thr->th.th_cg_roots,
4198  this_thr->th.th_cg_roots->cg_root,
4199  this_thr->th.th_cg_roots->cg_nthreads));
4200  this_thr->th.th_current_task->td_icvs.thread_limit =
4201  this_thr->th.th_cg_roots->cg_thread_limit;
4202  }
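// Reference-count handoff performed above when a worker joins the primary
// thread's contention group: the old group loses one reference (and is freed
// if that was the last one) and the new group gains one; the runtime then
// also refreshes the thread_limit ICV from the new group. Sketch with invented
// names (ExampleCgRoot, example_switch_cg); the real type is kmp_cg_root_t.
#if 0
struct ExampleCgRoot {
  int cg_nthreads; // threads currently referencing this contention group
};

static void example_switch_cg(ExampleCgRoot **thread_cg,
                              ExampleCgRoot *master_cg) {
  ExampleCgRoot *old_cg = *thread_cg;
  if (old_cg == master_cg)
    return; // already in the primary thread's group
  if (old_cg && --old_cg->cg_nthreads == 0)
    delete old_cg; // last reference gone (the runtime uses __kmp_free here)
  *thread_cg = master_cg;   // adopt the primary thread's group
  master_cg->cg_nthreads++; // account for the new member
}
#endif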
4203 
4204  /* Initialize dynamic dispatch */
4205  {
4206  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4207  // Use team max_nproc since this will never change for the team.
4208  size_t disp_size =
4209  sizeof(dispatch_private_info_t) *
4210  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4211  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4212  team->t.t_max_nproc));
4213  KMP_ASSERT(dispatch);
4214  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4215  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4216 
4217  dispatch->th_disp_index = 0;
4218  dispatch->th_doacross_buf_idx = 0;
4219  if (!dispatch->th_disp_buffer) {
4220  dispatch->th_disp_buffer =
4221  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4222 
4223  if (__kmp_storage_map) {
4224  __kmp_print_storage_map_gtid(
4225  gtid, &dispatch->th_disp_buffer[0],
4226  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4227  ? 1
4228  : __kmp_dispatch_num_buffers],
4229  disp_size,
4230  "th_%d.th_dispatch.th_disp_buffer "
4231  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4232  gtid, team->t.t_id, gtid);
4233  }
4234  } else {
4235  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4236  }
4237 
4238  dispatch->th_dispatch_pr_current = 0;
4239  dispatch->th_dispatch_sh_current = 0;
4240 
4241  dispatch->th_deo_fcn = 0; /* ORDERED */
4242  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4243  }
4244 
4245  this_thr->th.th_next_pool = NULL;
4246 
4247  if (!this_thr->th.th_task_state_memo_stack) {
4248  size_t i;
4249  this_thr->th.th_task_state_memo_stack =
4250  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4251  this_thr->th.th_task_state_top = 0;
4252  this_thr->th.th_task_state_stack_sz = 4;
4253  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4254  ++i) // zero init the stack
4255  this_thr->th.th_task_state_memo_stack[i] = 0;
4256  }
4257 
4258  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4259  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4260 
4261  KMP_MB();
4262 }
4263 
4264 /* Allocate a new thread for the requesting team. This is only called from
4265    within a forkjoin critical section. We will first try to get an available
4266    thread from the thread pool; if none is available, we will fork a new one,
4267    assuming we are able to create one. This should be assured, as the caller
4268    should have checked on this first. */
4269 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4270  int new_tid) {
4271  kmp_team_t *serial_team;
4272  kmp_info_t *new_thr;
4273  int new_gtid;
4274 
4275  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4276  KMP_DEBUG_ASSERT(root && team);
4277 #if !KMP_NESTED_HOT_TEAMS
4278  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4279 #endif
4280  KMP_MB();
4281 
4282  /* first, try to get one from the thread pool */
4283  if (__kmp_thread_pool) {
4284  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4285  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4286  if (new_thr == __kmp_thread_pool_insert_pt) {
4287  __kmp_thread_pool_insert_pt = NULL;
4288  }
4289  TCW_4(new_thr->th.th_in_pool, FALSE);
4290  __kmp_suspend_initialize_thread(new_thr);
4291  __kmp_lock_suspend_mx(new_thr);
4292  if (new_thr->th.th_active_in_pool == TRUE) {
4293  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4294  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4295  new_thr->th.th_active_in_pool = FALSE;
4296  }
4297  __kmp_unlock_suspend_mx(new_thr);
4298 
4299  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4300  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4301  KMP_ASSERT(!new_thr->th.th_team);
4302  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4303 
4304  /* setup the thread structure */
4305  __kmp_initialize_info(new_thr, team, new_tid,
4306  new_thr->th.th_info.ds.ds_gtid);
4307  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4308 
4309  TCW_4(__kmp_nth, __kmp_nth + 1);
4310 
4311  new_thr->th.th_task_state = 0;
4312  new_thr->th.th_task_state_top = 0;
4313  new_thr->th.th_task_state_stack_sz = 4;
4314 
4315  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4316  // Make sure pool thread has transitioned to waiting on own thread struct
4317  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4318  // Thread activated in __kmp_allocate_team when increasing team size
4319  }
4320 
4321 #ifdef KMP_ADJUST_BLOCKTIME
4322  /* Adjust blocktime back to zero if necessary */
4323  /* Middle initialization might not have occurred yet */
4324  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4325  if (__kmp_nth > __kmp_avail_proc) {
4326  __kmp_zero_bt = TRUE;
4327  }
4328  }
4329 #endif /* KMP_ADJUST_BLOCKTIME */
4330 
4331 #if KMP_DEBUG
4332  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4333  // be KMP_BARRIER_PARENT_FLAG.
4334  int b;
4335  kmp_balign_t *balign = new_thr->th.th_bar;
4336  for (b = 0; b < bs_last_barrier; ++b)
4337  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4338 #endif
4339 
4340  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4341  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4342 
4343  KMP_MB();
4344  return new_thr;
4345  }
4346 
4347  /* no, we'll fork a new one */
4348  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4349  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4350 
4351 #if KMP_USE_MONITOR
4352  // If this is the first worker thread the RTL is creating, then also
4353  // launch the monitor thread. We try to do this as early as possible.
4354  if (!TCR_4(__kmp_init_monitor)) {
4355  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4356  if (!TCR_4(__kmp_init_monitor)) {
4357  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4358  TCW_4(__kmp_init_monitor, 1);
4359  __kmp_create_monitor(&__kmp_monitor);
4360  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4361 #if KMP_OS_WINDOWS
4362  // AC: wait until monitor has started. This is a fix for CQ232808.
4363  // The reason is that if the library is loaded/unloaded in a loop with
4364  // small (parallel) work in between, then there is a high probability that
4365  // the monitor thread starts after the library shutdown. At shutdown it is
4366  // too late to cope with the problem, because when the primary thread is
4367  // in DllMain (process detach) the monitor has no chance to start (it is
4368  // blocked), and the primary thread has no means to inform the monitor that
4369  // the library has gone, because all the memory which the monitor can
4370  // access is going to be released/reset.
4371  while (TCR_4(__kmp_init_monitor) < 2) {
4372  KMP_YIELD(TRUE);
4373  }
4374  KF_TRACE(10, ("after monitor thread has started\n"));
4375 #endif
4376  }
4377  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4378  }
4379 #endif
4380 
4381  KMP_MB();
4382 
4383  {
4384  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4385  ? 1
4386  : __kmp_hidden_helper_threads_num + 1;
4387 
4388  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4389  ++new_gtid) {
4390  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4391  }
4392 
4393  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4394  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4395  }
4396  }
4397 
4398  /* allocate space for it. */
4399  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4400 
4401  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4402 
4403 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4404  // Suppress race condition detection on synchronization flags in debug mode;
4405  // this helps to analyze library internals by eliminating false positives.
4406  __itt_suppress_mark_range(
4407  __itt_suppress_range, __itt_suppress_threading_errors,
4408  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4409  __itt_suppress_mark_range(
4410  __itt_suppress_range, __itt_suppress_threading_errors,
4411  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4412 #if KMP_OS_WINDOWS
4413  __itt_suppress_mark_range(
4414  __itt_suppress_range, __itt_suppress_threading_errors,
4415  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4416 #else
4417  __itt_suppress_mark_range(__itt_suppress_range,
4418  __itt_suppress_threading_errors,
4419  &new_thr->th.th_suspend_init_count,
4420  sizeof(new_thr->th.th_suspend_init_count));
4421 #endif
4422  // TODO: check if we need to also suppress b_arrived flags
4423  __itt_suppress_mark_range(__itt_suppress_range,
4424  __itt_suppress_threading_errors,
4425  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4426  sizeof(new_thr->th.th_bar[0].bb.b_go));
4427  __itt_suppress_mark_range(__itt_suppress_range,
4428  __itt_suppress_threading_errors,
4429  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4430  sizeof(new_thr->th.th_bar[1].bb.b_go));
4431  __itt_suppress_mark_range(__itt_suppress_range,
4432  __itt_suppress_threading_errors,
4433  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4434  sizeof(new_thr->th.th_bar[2].bb.b_go));
4435 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4436  if (__kmp_storage_map) {
4437  __kmp_print_thread_storage_map(new_thr, new_gtid);
4438  }
4439 
4440  // add the reserve serialized team, initialized from the team's primary thread
4441  {
4442  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4443  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4444  new_thr->th.th_serial_team = serial_team =
4445  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4446 #if OMPT_SUPPORT
4447  ompt_data_none, // root parallel id
4448 #endif
4449  proc_bind_default, &r_icvs,
4450  0 USE_NESTED_HOT_ARG(NULL));
4451  }
4452  KMP_ASSERT(serial_team);
4453  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4454  // for execution (it is unused for now).
4455  serial_team->t.t_threads[0] = new_thr;
4456  KF_TRACE(10,
4457  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4458  new_thr));
4459 
4460  /* setup the thread structures */
4461  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4462 
4463 #if USE_FAST_MEMORY
4464  __kmp_initialize_fast_memory(new_thr);
4465 #endif /* USE_FAST_MEMORY */
4466 
4467 #if KMP_USE_BGET
4468  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4469  __kmp_initialize_bget(new_thr);
4470 #endif
4471 
4472  __kmp_init_random(new_thr); // Initialize random number generator
4473 
4474  /* Initialize these only once when thread is grabbed for a team allocation */
4475  KA_TRACE(20,
4476  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4477  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4478 
4479  int b;
4480  kmp_balign_t *balign = new_thr->th.th_bar;
4481  for (b = 0; b < bs_last_barrier; ++b) {
4482  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4483  balign[b].bb.team = NULL;
4484  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4485  balign[b].bb.use_oncore_barrier = 0;
4486  }
4487 
4488  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4489  new_thr->th.th_sleep_loc_type = flag_unset;
4490 
4491  new_thr->th.th_spin_here = FALSE;
4492  new_thr->th.th_next_waiting = 0;
4493 #if KMP_OS_UNIX
4494  new_thr->th.th_blocking = false;
4495 #endif
4496 
4497 #if KMP_AFFINITY_SUPPORTED
4498  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4499  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4500  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4501  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4502 #endif
4503  new_thr->th.th_def_allocator = __kmp_def_allocator;
4504  new_thr->th.th_prev_level = 0;
4505  new_thr->th.th_prev_num_threads = 1;
4506 
4507  TCW_4(new_thr->th.th_in_pool, FALSE);
4508  new_thr->th.th_active_in_pool = FALSE;
4509  TCW_4(new_thr->th.th_active, TRUE);
4510 
4511  /* adjust the global counters */
4512  __kmp_all_nth++;
4513  __kmp_nth++;
4514 
4515  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4516  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4517  if (__kmp_adjust_gtid_mode) {
4518  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4519  if (TCR_4(__kmp_gtid_mode) != 2) {
4520  TCW_4(__kmp_gtid_mode, 2);
4521  }
4522  } else {
4523  if (TCR_4(__kmp_gtid_mode) != 1) {
4524  TCW_4(__kmp_gtid_mode, 1);
4525  }
4526  }
4527  }
4528 
4529 #ifdef KMP_ADJUST_BLOCKTIME
4530  /* Adjust blocktime back to zero if necessary */
4531  /* Middle initialization might not have occurred yet */
4532  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4533  if (__kmp_nth > __kmp_avail_proc) {
4534  __kmp_zero_bt = TRUE;
4535  }
4536  }
4537 #endif /* KMP_ADJUST_BLOCKTIME */
4538 
4539  /* actually fork it and create the new worker thread */
4540  KF_TRACE(
4541  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4542  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4543  KF_TRACE(10,
4544  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4545 
4546  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4547  new_gtid));
4548  KMP_MB();
4549  return new_thr;
4550 }
4551 
4552 /* Reinitialize team for reuse.
4553  The hot team code calls this routine at every fork barrier, so the EPCC
4554  barrier tests are extremely sensitive to changes in it, esp. writes to the
4555  team struct, which cause cache invalidation in all threads.
4556  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4557 static void __kmp_reinitialize_team(kmp_team_t *team,
4558  kmp_internal_control_t *new_icvs,
4559  ident_t *loc) {
4560  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4561  team->t.t_threads[0], team));
4562  KMP_DEBUG_ASSERT(team && new_icvs);
4563  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4564  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4565 
4566  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4567  // Copy ICVs to the primary thread's implicit taskdata
4568  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4569  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4570 
4571  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4572  team->t.t_threads[0], team));
4573 }
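
/* The KMP_CHECK_UPDATE uses above (and throughout this file) follow from the
   cache note in the preceding comment: an unconditional store dirties the team
   struct's cache line even when the value does not change, forcing every
   thread that reads the team struct to refetch it. A sketch of the
   check-before-write idiom, with check_update() as an illustrative stand-in
   for the macro rather than its actual definition:

   template <typename T>
   static inline void check_update(T &dst, const T &src) {
     if (dst != src) // write only when the value actually changes, so an
       dst = src;    // unchanged field never invalidates its cache line
   }
*/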
4574 
4575 /* Initialize the team data structure.
4576  This assumes the t_threads and t_max_nproc are already set.
4577  Also, we don't touch the arguments */
4578 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4579  kmp_internal_control_t *new_icvs,
4580  ident_t *loc) {
4581  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4582 
4583  /* verify */
4584  KMP_DEBUG_ASSERT(team);
4585  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4586  KMP_DEBUG_ASSERT(team->t.t_threads);
4587  KMP_MB();
4588 
4589  team->t.t_master_tid = 0; /* not needed */
4590  /* team->t.t_master_bar; not needed */
4591  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4592  team->t.t_nproc = new_nproc;
4593 
4594  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4595  team->t.t_next_pool = NULL;
4596  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4597  * up hot team */
4598 
4599  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4600  team->t.t_invoke = NULL; /* not needed */
4601 
4602  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4603  team->t.t_sched.sched = new_icvs->sched.sched;
4604 
4605 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4606  team->t.t_fp_control_saved = FALSE; /* not needed */
4607  team->t.t_x87_fpu_control_word = 0; /* not needed */
4608  team->t.t_mxcsr = 0; /* not needed */
4609 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4610 
4611  team->t.t_construct = 0;
4612 
4613  team->t.t_ordered.dt.t_value = 0;
4614  team->t.t_master_active = FALSE;
4615 
4616 #ifdef KMP_DEBUG
4617  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4618 #endif
4619 #if KMP_OS_WINDOWS
4620  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4621 #endif
4622 
4623  team->t.t_control_stack_top = NULL;
4624 
4625  __kmp_reinitialize_team(team, new_icvs, loc);
4626 
4627  KMP_MB();
4628  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4629 }
4630 
4631 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4632 /* Sets full mask for thread and returns old mask, no changes to structures. */
4633 static void
4634 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4635  if (KMP_AFFINITY_CAPABLE()) {
4636  int status;
4637  if (old_mask != NULL) {
4638  status = __kmp_get_system_affinity(old_mask, TRUE);
4639  int error = errno;
4640  if (status != 0) {
4641  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4642  __kmp_msg_null);
4643  }
4644  }
4645  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4646  }
4647 }
4648 #endif
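
/* A minimal standalone sketch (Linux only, hypothetical helper names) of the
   save/widen/restore pattern implemented above, written against the plain
   sched_*affinity API rather than the kmp_affin_mask_t wrappers. It shows why
   the old mask is captured first: the caller widens its own mask so that any
   thread it spawns inherits the full mask, then puts its original binding
   back.

   #include <sched.h> // sched_getaffinity / sched_setaffinity (glibc, _GNU_SOURCE)
   #include <thread>

   static void worker_fn() {
     // A thread created here starts with the widened mask it inherited from
     // its parent and is free to narrow it to its own place later.
   }

   static void widen_affinity_then_spawn(int ncpus) {
     cpu_set_t old_mask, full_mask;
     CPU_ZERO(&full_mask);
     for (int c = 0; c < ncpus; ++c)
       CPU_SET(c, &full_mask);
     if (sched_getaffinity(0, sizeof(old_mask), &old_mask) != 0)
       return; // could not capture the current binding; leave it untouched
     sched_setaffinity(0, sizeof(full_mask), &full_mask); // temporarily widen
     std::thread t(worker_fn); // the child inherits the full mask
     t.join();
     sched_setaffinity(0, sizeof(old_mask), &old_mask); // restore original mask
   }
*/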
4649 
4650 #if KMP_AFFINITY_SUPPORTED
4651 
4652 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4653 // It calculates the worker + primary thread's partition based upon the parent
4654 // thread's partition, and binds each worker thread to a place in its partition.
4655 // The primary thread's partition should already include its current binding.
4656 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4657  // Do not partition places for the hidden helper team
4658  if (KMP_HIDDEN_HELPER_TEAM(team))
4659  return;
4660  // Copy the primary thread's place partition to the team struct
4661  kmp_info_t *master_th = team->t.t_threads[0];
4662  KMP_DEBUG_ASSERT(master_th != NULL);
4663  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4664  int first_place = master_th->th.th_first_place;
4665  int last_place = master_th->th.th_last_place;
4666  int masters_place = master_th->th.th_current_place;
4667  team->t.t_first_place = first_place;
4668  team->t.t_last_place = last_place;
4669 
4670  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4671  "bound to place %d partition = [%d,%d]\n",
4672  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4673  team->t.t_id, masters_place, first_place, last_place));
4674 
4675  switch (proc_bind) {
4676 
4677  case proc_bind_default:
4678  // Serial teams might have the proc_bind policy set to proc_bind_default.
4679  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4680  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4681  break;
4682 
4683  case proc_bind_primary: {
4684  int f;
4685  int n_th = team->t.t_nproc;
4686  for (f = 1; f < n_th; f++) {
4687  kmp_info_t *th = team->t.t_threads[f];
4688  KMP_DEBUG_ASSERT(th != NULL);
4689  th->th.th_first_place = first_place;
4690  th->th.th_last_place = last_place;
4691  th->th.th_new_place = masters_place;
4692  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4693  team->t.t_display_affinity != 1) {
4694  team->t.t_display_affinity = 1;
4695  }
4696 
4697  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4698  "partition = [%d,%d]\n",
4699  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4700  f, masters_place, first_place, last_place));
4701  }
4702  } break;
4703 
4704  case proc_bind_close: {
4705  int f;
4706  int n_th = team->t.t_nproc;
4707  int n_places;
4708  if (first_place <= last_place) {
4709  n_places = last_place - first_place + 1;
4710  } else {
4711  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4712  }
4713  if (n_th <= n_places) {
4714  int place = masters_place;
4715  for (f = 1; f < n_th; f++) {
4716  kmp_info_t *th = team->t.t_threads[f];
4717  KMP_DEBUG_ASSERT(th != NULL);
4718 
4719  if (place == last_place) {
4720  place = first_place;
4721  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4722  place = 0;
4723  } else {
4724  place++;
4725  }
4726  th->th.th_first_place = first_place;
4727  th->th.th_last_place = last_place;
4728  th->th.th_new_place = place;
4729  if (__kmp_display_affinity && place != th->th.th_current_place &&
4730  team->t.t_display_affinity != 1) {
4731  team->t.t_display_affinity = 1;
4732  }
4733 
4734  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4735  "partition = [%d,%d]\n",
4736  __kmp_gtid_from_thread(team->t.t_threads[f]),
4737  team->t.t_id, f, place, first_place, last_place));
4738  }
4739  } else {
4740  int S, rem, gap, s_count;
4741  S = n_th / n_places;
4742  s_count = 0;
4743  rem = n_th - (S * n_places);
4744  gap = rem > 0 ? n_places / rem : n_places;
4745  int place = masters_place;
4746  int gap_ct = gap;
4747  for (f = 0; f < n_th; f++) {
4748  kmp_info_t *th = team->t.t_threads[f];
4749  KMP_DEBUG_ASSERT(th != NULL);
4750 
4751  th->th.th_first_place = first_place;
4752  th->th.th_last_place = last_place;
4753  th->th.th_new_place = place;
4754  if (__kmp_display_affinity && place != th->th.th_current_place &&
4755  team->t.t_display_affinity != 1) {
4756  team->t.t_display_affinity = 1;
4757  }
4758  s_count++;
4759 
4760  if ((s_count == S) && rem && (gap_ct == gap)) {
4761  // do nothing, add an extra thread to place on next iteration
4762  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4763  // we added an extra thread to this place; move to next place
4764  if (place == last_place) {
4765  place = first_place;
4766  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4767  place = 0;
4768  } else {
4769  place++;
4770  }
4771  s_count = 0;
4772  gap_ct = 1;
4773  rem--;
4774  } else if (s_count == S) { // place full; don't add extra
4775  if (place == last_place) {
4776  place = first_place;
4777  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4778  place = 0;
4779  } else {
4780  place++;
4781  }
4782  gap_ct++;
4783  s_count = 0;
4784  }
4785 
4786  KA_TRACE(100,
4787  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4788  "partition = [%d,%d]\n",
4789  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4790  th->th.th_new_place, first_place, last_place));
4791  }
4792  KMP_DEBUG_ASSERT(place == masters_place);
4793  }
4794  } break;
4795 
4796  case proc_bind_spread: {
4797  int f;
4798  int n_th = team->t.t_nproc;
4799  int n_places;
4800  int thidx;
4801  if (first_place <= last_place) {
4802  n_places = last_place - first_place + 1;
4803  } else {
4804  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4805  }
4806  if (n_th <= n_places) {
4807  int place = -1;
4808 
4809  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4810  int S = n_places / n_th;
4811  int s_count, rem, gap, gap_ct;
4812 
4813  place = masters_place;
4814  rem = n_places - n_th * S;
4815  gap = rem ? n_th / rem : 1;
4816  gap_ct = gap;
4817  thidx = n_th;
4818  if (update_master_only == 1)
4819  thidx = 1;
4820  for (f = 0; f < thidx; f++) {
4821  kmp_info_t *th = team->t.t_threads[f];
4822  KMP_DEBUG_ASSERT(th != NULL);
4823 
4824  th->th.th_first_place = place;
4825  th->th.th_new_place = place;
4826  if (__kmp_display_affinity && place != th->th.th_current_place &&
4827  team->t.t_display_affinity != 1) {
4828  team->t.t_display_affinity = 1;
4829  }
4830  s_count = 1;
4831  while (s_count < S) {
4832  if (place == last_place) {
4833  place = first_place;
4834  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4835  place = 0;
4836  } else {
4837  place++;
4838  }
4839  s_count++;
4840  }
4841  if (rem && (gap_ct == gap)) {
4842  if (place == last_place) {
4843  place = first_place;
4844  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4845  place = 0;
4846  } else {
4847  place++;
4848  }
4849  rem--;
4850  gap_ct = 0;
4851  }
4852  th->th.th_last_place = place;
4853  gap_ct++;
4854 
4855  if (place == last_place) {
4856  place = first_place;
4857  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4858  place = 0;
4859  } else {
4860  place++;
4861  }
4862 
4863  KA_TRACE(100,
4864  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4865  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4866  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4867  f, th->th.th_new_place, th->th.th_first_place,
4868  th->th.th_last_place, __kmp_affinity_num_masks));
4869  }
4870  } else {
4871  /* Given a uniform space of available computation places, we can create
4872  T partitions of roughly round(P/T) places each and put each thread into the first
4873  place of its partition (e.g. with the primary thread on place 0, P=8 and T=3 give spacing (8+1)/3 = 3.0 and partitions of 3, 3 and 2 places). */
4874  double current = static_cast<double>(masters_place);
4875  double spacing =
4876  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4877  int first, last;
4878  kmp_info_t *th;
4879 
4880  thidx = n_th + 1;
4881  if (update_master_only == 1)
4882  thidx = 1;
4883  for (f = 0; f < thidx; f++) {
4884  first = static_cast<int>(current);
4885  last = static_cast<int>(current + spacing) - 1;
4886  KMP_DEBUG_ASSERT(last >= first);
4887  if (first >= n_places) {
4888  if (masters_place) {
4889  first -= n_places;
4890  last -= n_places;
4891  if (first == (masters_place + 1)) {
4892  KMP_DEBUG_ASSERT(f == n_th);
4893  first--;
4894  }
4895  if (last == masters_place) {
4896  KMP_DEBUG_ASSERT(f == (n_th - 1));
4897  last--;
4898  }
4899  } else {
4900  KMP_DEBUG_ASSERT(f == n_th);
4901  first = 0;
4902  last = 0;
4903  }
4904  }
4905  if (last >= n_places) {
4906  last = (n_places - 1);
4907  }
4908  place = first;
4909  current += spacing;
4910  if (f < n_th) {
4911  KMP_DEBUG_ASSERT(0 <= first);
4912  KMP_DEBUG_ASSERT(n_places > first);
4913  KMP_DEBUG_ASSERT(0 <= last);
4914  KMP_DEBUG_ASSERT(n_places > last);
4915  KMP_DEBUG_ASSERT(last_place >= first_place);
4916  th = team->t.t_threads[f];
4917  KMP_DEBUG_ASSERT(th);
4918  th->th.th_first_place = first;
4919  th->th.th_new_place = place;
4920  th->th.th_last_place = last;
4921  if (__kmp_display_affinity && place != th->th.th_current_place &&
4922  team->t.t_display_affinity != 1) {
4923  team->t.t_display_affinity = 1;
4924  }
4925  KA_TRACE(100,
4926  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4927  "partition = [%d,%d], spacing = %.4f\n",
4928  __kmp_gtid_from_thread(team->t.t_threads[f]),
4929  team->t.t_id, f, th->th.th_new_place,
4930  th->th.th_first_place, th->th.th_last_place, spacing));
4931  }
4932  }
4933  }
4934  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4935  } else {
4936  int S, rem, gap, s_count;
4937  S = n_th / n_places;
4938  s_count = 0;
4939  rem = n_th - (S * n_places);
4940  gap = rem > 0 ? n_places / rem : n_places;
4941  int place = masters_place;
4942  int gap_ct = gap;
4943  thidx = n_th;
4944  if (update_master_only == 1)
4945  thidx = 1;
4946  for (f = 0; f < thidx; f++) {
4947  kmp_info_t *th = team->t.t_threads[f];
4948  KMP_DEBUG_ASSERT(th != NULL);
4949 
4950  th->th.th_first_place = place;
4951  th->th.th_last_place = place;
4952  th->th.th_new_place = place;
4953  if (__kmp_display_affinity && place != th->th.th_current_place &&
4954  team->t.t_display_affinity != 1) {
4955  team->t.t_display_affinity = 1;
4956  }
4957  s_count++;
4958 
4959  if ((s_count == S) && rem && (gap_ct == gap)) {
4960  // do nothing, add an extra thread to place on next iteration
4961  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4962  // we added an extra thread to this place; move on to next place
4963  if (place == last_place) {
4964  place = first_place;
4965  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4966  place = 0;
4967  } else {
4968  place++;
4969  }
4970  s_count = 0;
4971  gap_ct = 1;
4972  rem--;
4973  } else if (s_count == S) { // place is full; don't add extra thread
4974  if (place == last_place) {
4975  place = first_place;
4976  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4977  place = 0;
4978  } else {
4979  place++;
4980  }
4981  gap_ct++;
4982  s_count = 0;
4983  }
4984 
4985  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4986  "partition = [%d,%d]\n",
4987  __kmp_gtid_from_thread(team->t.t_threads[f]),
4988  team->t.t_id, f, th->th.th_new_place,
4989  th->th.th_first_place, th->th.th_last_place));
4990  }
4991  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4992  }
4993  } break;
4994 
4995  default:
4996  break;
4997  }
4998 
4999  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5000 }
5001 
5002 #endif // KMP_AFFINITY_SUPPORTED
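
/* A standalone sketch of the proc_bind_close placement rule used above for the
   n_th <= n_places case: worker f is bound f places after the primary thread,
   wrapping inside the window [first, last], which may itself wrap around the
   end of the place (mask) array. next_place() and close_place_for() are
   illustrative names, not runtime entry points.

   #include <cstdio>

   static int next_place(int place, int first, int last, int num_masks) {
     if (place == last)
       return first; // wrap inside the window
     if (place == num_masks - 1)
       return 0; // the window itself wraps around the place array
     return place + 1;
   }

   static int close_place_for(int f, int masters_place, int first, int last,
                              int num_masks) {
     int place = masters_place; // thread 0 keeps the primary's place
     for (int i = 0; i < f; ++i)
       place = next_place(place, first, last, num_masks);
     return place;
   }

   int main() {
     // Window [2,5] out of 8 places, primary bound to place 3:
     // threads 0..3 land on places 3, 4, 5, 2 -- the same sequence the loop
     // above produces for proc_bind_close when the team fits in the window.
     for (int f = 0; f < 4; ++f)
       std::printf("thread %d -> place %d\n", f, close_place_for(f, 3, 2, 5, 8));
     return 0;
   }
*/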
5003 
5004 /* allocate a new team data structure to use. take one off of the free pool if
5005  available */
5006 kmp_team_t *
5007 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5008 #if OMPT_SUPPORT
5009  ompt_data_t ompt_parallel_data,
5010 #endif
5011  kmp_proc_bind_t new_proc_bind,
5012  kmp_internal_control_t *new_icvs,
5013  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5014  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5015  int f;
5016  kmp_team_t *team;
5017  int use_hot_team = !root->r.r_active;
5018  int level = 0;
5019 
5020  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5021  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5022  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5023  KMP_MB();
5024 
5025 #if KMP_NESTED_HOT_TEAMS
5026  kmp_hot_team_ptr_t *hot_teams;
5027  if (master) {
5028  team = master->th.th_team;
5029  level = team->t.t_active_level;
5030  if (master->th.th_teams_microtask) { // in teams construct?
5031  if (master->th.th_teams_size.nteams > 1 &&
5032  ( // #teams > 1
5033  team->t.t_pkfn ==
5034  (microtask_t)__kmp_teams_master || // inner fork of the teams
5035  master->th.th_teams_level <
5036  team->t.t_level)) { // or nested parallel inside the teams
5037  ++level; // do not increment if #teams==1, or for the outer fork of the teams;
5038  // increment otherwise
5039  }
5040  }
5041  hot_teams = master->th.th_hot_teams;
5042  if (level < __kmp_hot_teams_max_level && hot_teams &&
5043  hot_teams[level].hot_team) {
5044  // hot team has already been allocated for given level
5045  use_hot_team = 1;
5046  } else {
5047  use_hot_team = 0;
5048  }
5049  } else {
5050  // check we won't access uninitialized hot_teams, just in case
5051  KMP_DEBUG_ASSERT(new_nproc == 1);
5052  }
5053 #endif
5054  // Optimization to use a "hot" team
5055  if (use_hot_team && new_nproc > 1) {
5056  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5057 #if KMP_NESTED_HOT_TEAMS
5058  team = hot_teams[level].hot_team;
5059 #else
5060  team = root->r.r_hot_team;
5061 #endif
5062 #if KMP_DEBUG
5063  if (__kmp_tasking_mode != tskm_immediate_exec) {
5064  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5065  "task_team[1] = %p before reinit\n",
5066  team->t.t_task_team[0], team->t.t_task_team[1]));
5067  }
5068 #endif
5069 
5070  if (team->t.t_nproc != new_nproc &&
5071  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5072  // Distributed barrier may need a resize
5073  int old_nthr = team->t.t_nproc;
5074  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5075  }
5076 
5077  // Has the number of threads changed?
5078  /* Let's assume the most common case is that the number of threads is
5079  unchanged, and put that case first. */
5080  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5081  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5082  // This case can mean that omp_set_num_threads() was called and the hot
5083  // team size was already reduced, so we check the special flag
5084  if (team->t.t_size_changed == -1) {
5085  team->t.t_size_changed = 1;
5086  } else {
5087  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5088  }
5089 
5090  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5091  kmp_r_sched_t new_sched = new_icvs->sched;
5092  // set primary thread's schedule as new run-time schedule
5093  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5094 
5095  __kmp_reinitialize_team(team, new_icvs,
5096  root->r.r_uber_thread->th.th_ident);
5097 
5098  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5099  team->t.t_threads[0], team));
5100  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5101 
5102 #if KMP_AFFINITY_SUPPORTED
5103  if ((team->t.t_size_changed == 0) &&
5104  (team->t.t_proc_bind == new_proc_bind)) {
5105  if (new_proc_bind == proc_bind_spread) {
5106  __kmp_partition_places(
5107  team, 1); // add flag to update only master for spread
5108  }
5109  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5110  "proc_bind = %d, partition = [%d,%d]\n",
5111  team->t.t_id, new_proc_bind, team->t.t_first_place,
5112  team->t.t_last_place));
5113  } else {
5114  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5115  __kmp_partition_places(team);
5116  }
5117 #else
5118  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5119 #endif /* KMP_AFFINITY_SUPPORTED */
5120  } else if (team->t.t_nproc > new_nproc) {
5121  KA_TRACE(20,
5122  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5123  new_nproc));
5124 
5125  team->t.t_size_changed = 1;
5126  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5127  // Barrier size already reduced earlier in this function
5128  // Activate team threads via th_used_in_team
5129  __kmp_add_threads_to_team(team, new_nproc);
5130  }
5131 #if KMP_NESTED_HOT_TEAMS
5132  if (__kmp_hot_teams_mode == 0) {
5133  // AC: saved number of threads should correspond to team's value in this
5134  // mode, can be bigger in mode 1, when hot team has threads in reserve
5135  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5136  hot_teams[level].hot_team_nth = new_nproc;
5137 #endif // KMP_NESTED_HOT_TEAMS
5138  /* release the extra threads we don't need any more */
5139  for (f = new_nproc; f < team->t.t_nproc; f++) {
5140  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5141  if (__kmp_tasking_mode != tskm_immediate_exec) {
5142  // When decreasing team size, threads no longer in the team should
5143  // unref task team.
5144  team->t.t_threads[f]->th.th_task_team = NULL;
5145  }
5146  __kmp_free_thread(team->t.t_threads[f]);
5147  team->t.t_threads[f] = NULL;
5148  }
5149 #if KMP_NESTED_HOT_TEAMS
5150  } // (__kmp_hot_teams_mode == 0)
5151  else {
5152  // When keeping extra threads in team, switch threads to wait on own
5153  // b_go flag
5154  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5155  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5156  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5157  for (int b = 0; b < bs_last_barrier; ++b) {
5158  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5159  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5160  }
5161  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5162  }
5163  }
5164  }
5165 #endif // KMP_NESTED_HOT_TEAMS
5166  team->t.t_nproc = new_nproc;
5167  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5168  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5169  __kmp_reinitialize_team(team, new_icvs,
5170  root->r.r_uber_thread->th.th_ident);
5171 
5172  // Update remaining threads
5173  for (f = 0; f < new_nproc; ++f) {
5174  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5175  }
5176 
5177  // restore the current task state of the primary thread: should be the
5178  // implicit task
5179  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5180  team->t.t_threads[0], team));
5181 
5182  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5183 
5184 #ifdef KMP_DEBUG
5185  for (f = 0; f < team->t.t_nproc; f++) {
5186  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5187  team->t.t_threads[f]->th.th_team_nproc ==
5188  team->t.t_nproc);
5189  }
5190 #endif
5191 
5192  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5193 #if KMP_AFFINITY_SUPPORTED
5194  __kmp_partition_places(team);
5195 #endif
5196  } else { // team->t.t_nproc < new_nproc
5197 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5198  kmp_affin_mask_t *old_mask;
5199  if (KMP_AFFINITY_CAPABLE()) {
5200  KMP_CPU_ALLOC(old_mask);
5201  }
5202 #endif
5203 
5204  KA_TRACE(20,
5205  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5206  new_nproc));
5207  int old_nproc = team->t.t_nproc; // save old value and use to update only
5208  team->t.t_size_changed = 1;
5209 
5210 #if KMP_NESTED_HOT_TEAMS
5211  int avail_threads = hot_teams[level].hot_team_nth;
5212  if (new_nproc < avail_threads)
5213  avail_threads = new_nproc;
5214  kmp_info_t **other_threads = team->t.t_threads;
5215  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5216  // Adjust barrier data of reserved threads (if any) of the team
5217  // Other data will be set in __kmp_initialize_info() below.
5218  int b;
5219  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5220  for (b = 0; b < bs_last_barrier; ++b) {
5221  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5222  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5223 #if USE_DEBUGGER
5224  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5225 #endif
5226  }
5227  }
5228  if (hot_teams[level].hot_team_nth >= new_nproc) {
5229  // we have all needed threads in reserve, no need to allocate any
5230  // this is only possible in mode 1; cannot have reserved threads in mode 0
5231  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5232  team->t.t_nproc = new_nproc; // just get reserved threads involved
5233  } else {
5234  // We may have some threads in reserve, but not enough;
5235  // get reserved threads involved if any.
5236  team->t.t_nproc = hot_teams[level].hot_team_nth;
5237  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5238 #endif // KMP_NESTED_HOT_TEAMS
5239  if (team->t.t_max_nproc < new_nproc) {
5240  /* reallocate larger arrays */
5241  __kmp_reallocate_team_arrays(team, new_nproc);
5242  __kmp_reinitialize_team(team, new_icvs, NULL);
5243  }
5244 
5245 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5246  /* Temporarily set full mask for primary thread before creation of
5247  workers. The reason is that workers inherit the affinity from the
5248  primary thread, so if a lot of workers are created on a single
5249  core quickly, they don't get a chance to set their own affinity for
5250  a long time. */
5251  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5252 #endif
5253 
5254  /* allocate new threads for the hot team */
5255  for (f = team->t.t_nproc; f < new_nproc; f++) {
5256  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5257  KMP_DEBUG_ASSERT(new_worker);
5258  team->t.t_threads[f] = new_worker;
5259 
5260  KA_TRACE(20,
5261  ("__kmp_allocate_team: team %d init T#%d arrived: "
5262  "join=%llu, plain=%llu\n",
5263  team->t.t_id, __kmp_gtid_from_tid(f, team),
5264  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5265  team->t.t_bar[bs_plain_barrier].b_arrived));
5266 
5267  { // Initialize barrier data for new threads.
5268  int b;
5269  kmp_balign_t *balign = new_worker->th.th_bar;
5270  for (b = 0; b < bs_last_barrier; ++b) {
5271  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5272  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5273  KMP_BARRIER_PARENT_FLAG);
5274 #if USE_DEBUGGER
5275  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5276 #endif
5277  }
5278  }
5279  }
5280 
5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5282  if (KMP_AFFINITY_CAPABLE()) {
5283  /* Restore initial primary thread's affinity mask */
5284  __kmp_set_system_affinity(old_mask, TRUE);
5285  KMP_CPU_FREE(old_mask);
5286  }
5287 #endif
5288 #if KMP_NESTED_HOT_TEAMS
5289  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5290 #endif // KMP_NESTED_HOT_TEAMS
5291  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5292  // Barrier size already increased earlier in this function
5293  // Activate team threads via th_used_in_team
5294  __kmp_add_threads_to_team(team, new_nproc);
5295  }
5296  /* make sure everyone is synchronized */
5297  // new threads below
5298  __kmp_initialize_team(team, new_nproc, new_icvs,
5299  root->r.r_uber_thread->th.th_ident);
5300 
5301  /* reinitialize the threads */
5302  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5303  for (f = 0; f < team->t.t_nproc; ++f)
5304  __kmp_initialize_info(team->t.t_threads[f], team, f,
5305  __kmp_gtid_from_tid(f, team));
5306 
5307  if (level) { // set th_task_state for new threads in nested hot team
5308  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5309  // only need to set the th_task_state for the new threads. th_task_state
5310  // for primary thread will not be accurate until after this in
5311  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5312  // get the correct value.
5313  for (f = old_nproc; f < team->t.t_nproc; ++f)
5314  team->t.t_threads[f]->th.th_task_state =
5315  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5316  } else { // set th_task_state for new threads in non-nested hot team
5317  // copy primary thread's state
5318  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5319  for (f = old_nproc; f < team->t.t_nproc; ++f)
5320  team->t.t_threads[f]->th.th_task_state = old_state;
5321  }
5322 
5323 #ifdef KMP_DEBUG
5324  for (f = 0; f < team->t.t_nproc; ++f) {
5325  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5326  team->t.t_threads[f]->th.th_team_nproc ==
5327  team->t.t_nproc);
5328  }
5329 #endif
5330 
5331  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333  __kmp_partition_places(team);
5334 #endif
5335  } // Check changes in number of threads
5336 
5337  kmp_info_t *master = team->t.t_threads[0];
5338  if (master->th.th_teams_microtask) {
5339  for (f = 1; f < new_nproc; ++f) {
5340  // propagate teams construct specific info to workers
5341  kmp_info_t *thr = team->t.t_threads[f];
5342  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5343  thr->th.th_teams_level = master->th.th_teams_level;
5344  thr->th.th_teams_size = master->th.th_teams_size;
5345  }
5346  }
5347 #if KMP_NESTED_HOT_TEAMS
5348  if (level) {
5349  // Sync barrier state for nested hot teams, not needed for outermost hot
5350  // team.
5351  for (f = 1; f < new_nproc; ++f) {
5352  kmp_info_t *thr = team->t.t_threads[f];
5353  int b;
5354  kmp_balign_t *balign = thr->th.th_bar;
5355  for (b = 0; b < bs_last_barrier; ++b) {
5356  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5357  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5358 #if USE_DEBUGGER
5359  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5360 #endif
5361  }
5362  }
5363  }
5364 #endif // KMP_NESTED_HOT_TEAMS
5365 
5366  /* reallocate space for arguments if necessary */
5367  __kmp_alloc_argv_entries(argc, team, TRUE);
5368  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5369  // The hot team re-uses the previous task team,
5370  // if untouched during the previous release->gather phase.
5371 
5372  KF_TRACE(10, (" hot_team = %p\n", team));
5373 
5374 #if KMP_DEBUG
5375  if (__kmp_tasking_mode != tskm_immediate_exec) {
5376  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5377  "task_team[1] = %p after reinit\n",
5378  team->t.t_task_team[0], team->t.t_task_team[1]));
5379  }
5380 #endif
5381 
5382 #if OMPT_SUPPORT
5383  __ompt_team_assign_id(team, ompt_parallel_data);
5384 #endif
5385 
5386  KMP_MB();
5387 
5388  return team;
5389  }
5390 
5391  /* next, let's try to take one from the team pool */
5392  KMP_MB();
5393  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5394  /* TODO: consider resizing undersized teams instead of reaping them, now
5395  that we have a resizing mechanism */
5396  if (team->t.t_max_nproc >= max_nproc) {
5397  /* take this team from the team pool */
5398  __kmp_team_pool = team->t.t_next_pool;
5399 
5400  if (max_nproc > 1 &&
5401  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5402  if (!team->t.b) { // Allocate barrier structure
5403  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5404  }
5405  }
5406 
5407  /* setup the team for fresh use */
5408  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5409 
5410  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5411  "task_team[1] %p to NULL\n",
5412  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5413  team->t.t_task_team[0] = NULL;
5414  team->t.t_task_team[1] = NULL;
5415 
5416  /* reallocate space for arguments if necessary */
5417  __kmp_alloc_argv_entries(argc, team, TRUE);
5418  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5419 
5420  KA_TRACE(
5421  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5422  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5423  { // Initialize barrier data.
5424  int b;
5425  for (b = 0; b < bs_last_barrier; ++b) {
5426  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5427 #if USE_DEBUGGER
5428  team->t.t_bar[b].b_master_arrived = 0;
5429  team->t.t_bar[b].b_team_arrived = 0;
5430 #endif
5431  }
5432  }
5433 
5434  team->t.t_proc_bind = new_proc_bind;
5435 
5436  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5437  team->t.t_id));
5438 
5439 #if OMPT_SUPPORT
5440  __ompt_team_assign_id(team, ompt_parallel_data);
5441 #endif
5442 
5443  KMP_MB();
5444 
5445  return team;
5446  }
5447 
5448  /* reap team if it is too small, then loop back and check the next one */
5449  // not sure if this is wise, but this will be redone during the hot-teams
5450  // rewrite.
5451  /* TODO: Use technique to find the right size hot-team, don't reap them */
5452  team = __kmp_reap_team(team);
5453  __kmp_team_pool = team;
5454  }
5455 
5456  /* nothing available in the pool, no matter, make a new team! */
5457  KMP_MB();
5458  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5459 
5460  /* and set it up */
5461  team->t.t_max_nproc = max_nproc;
5462  if (max_nproc > 1 &&
5463  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5464  // Allocate barrier structure
5465  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5466  }
5467 
5468  /* NOTE well, for some reason allocating one big buffer and dividing it up
5469  seems to really hurt performance a lot on the P4, so let's not use this */
5470  __kmp_allocate_team_arrays(team, max_nproc);
5471 
5472  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5473  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5474 
5475  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5476  "%p to NULL\n",
5477  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5478  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5479  // memory, no need to duplicate
5480  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5481  // memory, no need to duplicate
5482 
5483  if (__kmp_storage_map) {
5484  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5485  }
5486 
5487  /* allocate space for arguments */
5488  __kmp_alloc_argv_entries(argc, team, FALSE);
5489  team->t.t_argc = argc;
5490 
5491  KA_TRACE(20,
5492  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5493  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5494  { // Initialize barrier data.
5495  int b;
5496  for (b = 0; b < bs_last_barrier; ++b) {
5497  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5498 #if USE_DEBUGGER
5499  team->t.t_bar[b].b_master_arrived = 0;
5500  team->t.t_bar[b].b_team_arrived = 0;
5501 #endif
5502  }
5503  }
5504 
5505  team->t.t_proc_bind = new_proc_bind;
5506 
5507 #if OMPT_SUPPORT
5508  __ompt_team_assign_id(team, ompt_parallel_data);
5509  team->t.ompt_serialized_team_info = NULL;
5510 #endif
5511 
5512  KMP_MB();
5513 
5514  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5515  team->t.t_id));
5516 
5517  return team;
5518 }
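
/* The pool handling at the end of __kmp_allocate_team reduces to a simple
   policy once the hot-team, barrier and ICV details are stripped away: reuse
   the first pool entry whose capacity is big enough, reap undersized entries
   along the way, and only then allocate a fresh object. A compact sketch with
   an illustrative obj_t type (not a runtime structure):

   #include <cstdlib>

   struct obj_t {
     int max_nproc;    // capacity this object was sized for
     obj_t *next_pool; // singly linked free pool
   };

   static obj_t *pool = nullptr;

   static obj_t *acquire(int max_nproc) {
     for (obj_t *o = pool; o != nullptr;) {
       if (o->max_nproc >= max_nproc) {
         pool = o->next_pool; // big enough: unlink from the pool and reuse
         return o;
       }
       obj_t *next = o->next_pool; // too small: reap it and try the next one
       std::free(o);
       pool = o = next;
     }
     // Nothing suitable in the pool: make a new object of the requested size.
     obj_t *o = static_cast<obj_t *>(std::malloc(sizeof(obj_t)));
     o->max_nproc = max_nproc;
     o->next_pool = nullptr;
     return o;
   }
*/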
5519 
5520 /* TODO implement hot-teams at all levels */
5521 /* TODO implement lazy thread release on demand (disband request) */
5522 
5523 /* free the team. return it to the team pool. release all the threads
5524  * associated with it */
5525 void __kmp_free_team(kmp_root_t *root,
5526  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5527  int f;
5528  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5529  team->t.t_id));
5530 
5531  /* verify state */
5532  KMP_DEBUG_ASSERT(root);
5533  KMP_DEBUG_ASSERT(team);
5534  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5535  KMP_DEBUG_ASSERT(team->t.t_threads);
5536 
5537  int use_hot_team = team == root->r.r_hot_team;
5538 #if KMP_NESTED_HOT_TEAMS
5539  int level;
5540  if (master) {
5541  level = team->t.t_active_level - 1;
5542  if (master->th.th_teams_microtask) { // in teams construct?
5543  if (master->th.th_teams_size.nteams > 1) {
5544  ++level; // level was not increased in teams construct for
5545  // team_of_masters
5546  }
5547  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5548  master->th.th_teams_level == team->t.t_level) {
5549  ++level; // level was not increased in teams construct for
5550  // team_of_workers before the parallel
5551  } // team->t.t_level will be increased inside parallel
5552  }
5553 #if KMP_DEBUG
5554  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5555 #endif
5556  if (level < __kmp_hot_teams_max_level) {
5557  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5558  use_hot_team = 1;
5559  }
5560  }
5561 #endif // KMP_NESTED_HOT_TEAMS
5562 
5563  /* team is done working */
5564  TCW_SYNC_PTR(team->t.t_pkfn,
5565  NULL); // Important for Debugging Support Library.
5566 #if KMP_OS_WINDOWS
5567  team->t.t_copyin_counter = 0; // init counter for possible reuse
5568 #endif
5569  // Do not reset pointer to parent team to NULL for hot teams.
5570 
5571  /* if we are non-hot team, release our threads */
5572  if (!use_hot_team) {
5573  if (__kmp_tasking_mode != tskm_immediate_exec) {
5574  // Wait for threads to reach reapable state
5575  for (f = 1; f < team->t.t_nproc; ++f) {
5576  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5577  kmp_info_t *th = team->t.t_threads[f];
5578  volatile kmp_uint32 *state = &th->th.th_reap_state;
5579  while (*state != KMP_SAFE_TO_REAP) {
5580 #if KMP_OS_WINDOWS
5581  // On Windows a thread can be killed at any time, check this
5582  DWORD ecode;
5583  if (!__kmp_is_thread_alive(th, &ecode)) {
5584  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5585  break;
5586  }
5587 #endif
5588  // first check if thread is sleeping
5589  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5590  if (fl.is_sleeping())
5591  fl.resume(__kmp_gtid_from_thread(th));
5592  KMP_CPU_PAUSE();
5593  }
5594  }
5595 
5596  // Delete task teams
5597  int tt_idx;
5598  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5599  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5600  if (task_team != NULL) {
5601  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5602  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5603  team->t.t_threads[f]->th.th_task_team = NULL;
5604  }
5605  KA_TRACE(
5606  20,
5607  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5608  __kmp_get_gtid(), task_team, team->t.t_id));
5609 #if KMP_NESTED_HOT_TEAMS
5610  __kmp_free_task_team(master, task_team);
5611 #endif
5612  team->t.t_task_team[tt_idx] = NULL;
5613  }
5614  }
5615  }
5616 
5617  // Reset pointer to parent team only for non-hot teams.
5618  team->t.t_parent = NULL;
5619  team->t.t_level = 0;
5620  team->t.t_active_level = 0;
5621 
5622  /* free the worker threads */
5623  for (f = 1; f < team->t.t_nproc; ++f) {
5624  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5625  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5626  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5627  1, 2);
5628  }
5629  __kmp_free_thread(team->t.t_threads[f]);
5630  }
5631 
5632  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5633  if (team->t.b) {
5634  // wake up thread at old location
5635  team->t.b->go_release();
5636  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5637  for (f = 1; f < team->t.t_nproc; ++f) {
5638  if (team->t.b->sleep[f].sleep) {
5639  __kmp_atomic_resume_64(
5640  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5641  (kmp_atomic_flag_64<> *)NULL);
5642  }
5643  }
5644  }
5645  // Wait for threads to be removed from team
5646  for (int f = 1; f < team->t.t_nproc; ++f) {
5647  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5648  KMP_CPU_PAUSE();
5649  }
5650  }
5651  }
5652 
5653  for (f = 1; f < team->t.t_nproc; ++f) {
5654  team->t.t_threads[f] = NULL;
5655  }
5656 
5657  if (team->t.t_max_nproc > 1 &&
5658  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5659  distributedBarrier::deallocate(team->t.b);
5660  team->t.b = NULL;
5661  }
5662  /* put the team back in the team pool */
5663  /* TODO limit size of team pool, call reap_team if pool too large */
5664  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5665  __kmp_team_pool = (volatile kmp_team_t *)team;
5666  } else { // Check if team was created for primary threads in teams construct
5667  // See if first worker is a CG root
5668  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5669  team->t.t_threads[1]->th.th_cg_roots);
5670  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5671  // Clean up the CG root nodes on workers so that this team can be re-used
5672  for (f = 1; f < team->t.t_nproc; ++f) {
5673  kmp_info_t *thr = team->t.t_threads[f];
5674  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5675  thr->th.th_cg_roots->cg_root == thr);
5676  // Pop current CG root off list
5677  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5678  thr->th.th_cg_roots = tmp->up;
5679  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5680  " up to node %p. cg_nthreads was %d\n",
5681  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5682  int i = tmp->cg_nthreads--;
5683  if (i == 1) {
5684  __kmp_free(tmp); // free CG if we are the last thread in it
5685  }
5686  // Restore current task's thread_limit from CG root
5687  if (thr->th.th_cg_roots)
5688  thr->th.th_current_task->td_icvs.thread_limit =
5689  thr->th.th_cg_roots->cg_thread_limit;
5690  }
5691  }
5692  }
5693 
5694  KMP_MB();
5695 }
5696 
5697 /* reap the team. destroy it, reclaim all its resources and free its memory */
5698 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5699  kmp_team_t *next_pool = team->t.t_next_pool;
5700 
5701  KMP_DEBUG_ASSERT(team);
5702  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5703  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5704  KMP_DEBUG_ASSERT(team->t.t_threads);
5705  KMP_DEBUG_ASSERT(team->t.t_argv);
5706 
5707  /* TODO clean the threads that are a part of this? */
5708 
5709  /* free stuff */
5710  __kmp_free_team_arrays(team);
5711  if (team->t.t_argv != &team->t.t_inline_argv[0])
5712  __kmp_free((void *)team->t.t_argv);
5713  __kmp_free(team);
5714 
5715  KMP_MB();
5716  return next_pool;
5717 }
5718 
5719 // Free the thread. Don't reap it, just place it on the pool of available
5720 // threads.
5721 //
5722 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5723 // binding for the affinity mechanism to be useful.
5724 //
5725 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5726 // However, we want to avoid a potential performance problem by always
5727 // scanning through the list to find the correct point at which to insert
5728 // the thread (potential N**2 behavior). To do this we keep track of the
5729 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5730 // With single-level parallelism, threads will always be added to the tail
5731 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5732 // parallelism, all bets are off and we may need to scan through the entire
5733 // free list.
5734 //
5735 // This change also has a potentially large performance benefit, for some
5736 // applications. Previously, as threads were freed from the hot team, they
5737 // would be placed back on the free list in inverse order. If the hot team
5738 // grew back to its original size, then the freed threads would be placed
5739 // back on the hot team in reverse order. This could cause bad cache
5740 // locality problems on programs where the size of the hot team regularly
5741 // grew and shrunk.
5742 //
5743 // Now, for single-level parallelism, the OMP tid is always == gtid.
5744 void __kmp_free_thread(kmp_info_t *this_th) {
5745  int gtid;
5746  kmp_info_t **scan;
5747 
5748  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5749  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5750 
5751  KMP_DEBUG_ASSERT(this_th);
5752 
5753  // When moving thread to pool, switch thread to wait on own b_go flag, and
5754  // to an uninitialized (NULL) team.
5755  int b;
5756  kmp_balign_t *balign = this_th->th.th_bar;
5757  for (b = 0; b < bs_last_barrier; ++b) {
5758  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5759  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5760  balign[b].bb.team = NULL;
5761  balign[b].bb.leaf_kids = 0;
5762  }
5763  this_th->th.th_task_state = 0;
5764  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5765 
5766  /* put thread back on the free pool */
5767  TCW_PTR(this_th->th.th_team, NULL);
5768  TCW_PTR(this_th->th.th_root, NULL);
5769  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5770 
5771  while (this_th->th.th_cg_roots) {
5772  this_th->th.th_cg_roots->cg_nthreads--;
5773  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5774  " %p of thread %p to %d\n",
5775  this_th, this_th->th.th_cg_roots,
5776  this_th->th.th_cg_roots->cg_root,
5777  this_th->th.th_cg_roots->cg_nthreads));
5778  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5779  if (tmp->cg_root == this_th) { // Thread is a cg_root
5780  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5781  KA_TRACE(
5782  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5783  this_th->th.th_cg_roots = tmp->up;
5784  __kmp_free(tmp);
5785  } else { // Worker thread
5786  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5787  __kmp_free(tmp);
5788  }
5789  this_th->th.th_cg_roots = NULL;
5790  break;
5791  }
5792  }
5793 
5794  /* If the implicit task assigned to this thread can be used by other threads
5795  * -> multiple threads can share the data and try to free the task at
5796  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5797  * with higher probability when the hot team is disabled but can occur even when
5798  * the hot team is enabled */
5799  __kmp_free_implicit_task(this_th);
5800  this_th->th.th_current_task = NULL;
5801 
5802  // If the __kmp_thread_pool_insert_pt is already past the new insert
5803  // point, then we need to re-scan the entire list.
5804  gtid = this_th->th.th_info.ds.ds_gtid;
5805  if (__kmp_thread_pool_insert_pt != NULL) {
5806  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5807  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5808  __kmp_thread_pool_insert_pt = NULL;
5809  }
5810  }
5811 
5812  // Scan down the list to find the place to insert the thread.
5813  // scan is the address of a link in the list, possibly the address of
5814  // __kmp_thread_pool itself.
5815  //
5816  // In the absence of nested parallelism, the for loop will have 0 iterations.
5817  if (__kmp_thread_pool_insert_pt != NULL) {
5818  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5819  } else {
5820  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5821  }
5822  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5823  scan = &((*scan)->th.th_next_pool))
5824  ;
5825 
5826  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5827  // to its address.
5828  TCW_PTR(this_th->th.th_next_pool, *scan);
5829  __kmp_thread_pool_insert_pt = *scan = this_th;
5830  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5831  (this_th->th.th_info.ds.ds_gtid <
5832  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5833  TCW_4(this_th->th.th_in_pool, TRUE);
5834  __kmp_suspend_initialize_thread(this_th);
5835  __kmp_lock_suspend_mx(this_th);
5836  if (this_th->th.th_active == TRUE) {
5837  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5838  this_th->th.th_active_in_pool = TRUE;
5839  }
5840 #if KMP_DEBUG
5841  else {
5842  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5843  }
5844 #endif
5845  __kmp_unlock_suspend_mx(this_th);
5846 
5847  TCW_4(__kmp_nth, __kmp_nth - 1);
5848 
5849 #ifdef KMP_ADJUST_BLOCKTIME
5850  /* Adjust blocktime back to user setting or default if necessary */
5851  /* Middle initialization might never have occurred */
5852  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5853  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5854  if (__kmp_nth <= __kmp_avail_proc) {
5855  __kmp_zero_bt = FALSE;
5856  }
5857  }
5858 #endif /* KMP_ADJUST_BLOCKTIME */
5859 
5860  KMP_MB();
5861 }
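
/* A standalone sketch of the sorted-insert-with-hint scheme described above,
   using a plain node_t instead of kmp_info_t; node_t, pool_head and insert_pt
   are illustrative names. The hint keeps the common single-level case O(1):
   each insert lands right after the previous one, and the hint is discarded
   only when a smaller gtid has to be inserted.

   struct node_t {
     int gtid;
     node_t *next;
   };

   static node_t *pool_head = nullptr; // list kept sorted by gtid
   static node_t *insert_pt = nullptr; // last insertion point (the hint)

   static void pool_insert(node_t *n) {
     // If the hint is already past the new element, fall back to a full scan.
     if (insert_pt != nullptr && insert_pt->gtid > n->gtid)
       insert_pt = nullptr;

     node_t **scan = insert_pt ? &insert_pt->next : &pool_head;
     while (*scan != nullptr && (*scan)->gtid < n->gtid)
       scan = &(*scan)->next;

     n->next = *scan; // splice in, keeping the list sorted by gtid
     *scan = n;
     insert_pt = n; // remember where we inserted for the next call
   }
*/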
5862 
5863 /* ------------------------------------------------------------------------ */
5864 
5865 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5866 #if OMP_PROFILING_SUPPORT
5867  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5868  // TODO: add a configuration option for time granularity
5869  if (ProfileTraceFile)
5870  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5871 #endif
5872 
5873  int gtid = this_thr->th.th_info.ds.ds_gtid;
5874  /* void *stack_data;*/
5875  kmp_team_t **volatile pteam;
5876 
5877  KMP_MB();
5878  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5879 
5880  if (__kmp_env_consistency_check) {
5881  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5882  }
5883 
5884 #if OMPD_SUPPORT
5885  if (ompd_state & OMPD_ENABLE_BP)
5886  ompd_bp_thread_begin();
5887 #endif
5888 
5889 #if OMPT_SUPPORT
5890  ompt_data_t *thread_data = nullptr;
5891  if (ompt_enabled.enabled) {
5892  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5893  *thread_data = ompt_data_none;
5894 
5895  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5896  this_thr->th.ompt_thread_info.wait_id = 0;
5897  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5898  this_thr->th.ompt_thread_info.parallel_flags = 0;
5899  if (ompt_enabled.ompt_callback_thread_begin) {
5900  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5901  ompt_thread_worker, thread_data);
5902  }
5903  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5904  }
5905 #endif
5906 
5907  /* This is the place where threads wait for work */
5908  while (!TCR_4(__kmp_global.g.g_done)) {
5909  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5910  KMP_MB();
5911 
5912  /* wait for work to do */
5913  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5914 
5915  /* No tid yet since not part of a team */
5916  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5917 
5918 #if OMPT_SUPPORT
5919  if (ompt_enabled.enabled) {
5920  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5921  }
5922 #endif
5923 
5924  pteam = &this_thr->th.th_team;
5925 
5926  /* have we been allocated? */
5927  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5928  /* we were just woken up, so run our new task */
5929  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5930  int rc;
5931  KA_TRACE(20,
5932  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5933  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5934  (*pteam)->t.t_pkfn));
5935 
5936  updateHWFPControl(*pteam);
5937 
5938 #if OMPT_SUPPORT
5939  if (ompt_enabled.enabled) {
5940  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5941  }
5942 #endif
5943 
5944  rc = (*pteam)->t.t_invoke(gtid);
5945  KMP_ASSERT(rc);
5946 
5947  KMP_MB();
5948  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5949  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5950  (*pteam)->t.t_pkfn));
5951  }
5952 #if OMPT_SUPPORT
5953  if (ompt_enabled.enabled) {
5954  /* no frame set while outside task */
5955  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5956 
5957  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5958  }
5959 #endif
5960  /* join barrier after parallel region */
5961  __kmp_join_barrier(gtid);
5962  }
5963  }
5964  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5965 
5966 #if OMPD_SUPPORT
5967  if (ompd_state & OMPD_ENABLE_BP)
5968  ompd_bp_thread_end();
5969 #endif
5970 
5971 #if OMPT_SUPPORT
5972  if (ompt_enabled.ompt_callback_thread_end) {
5973  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5974  }
5975 #endif
5976 
5977  this_thr->th.th_task_team = NULL;
5978  /* run the destructors for the threadprivate data for this thread */
5979  __kmp_common_destroy_gtid(gtid);
5980 
5981  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5982  KMP_MB();
5983 
5984 #if OMP_PROFILING_SUPPORT
5985  llvm::timeTraceProfilerFinishThread();
5986 #endif
5987  return this_thr;
5988 }
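
/* A much-simplified model of the worker loop above: a condition variable
   stands in for the fork barrier, a std::function for (*pteam)->t.t_pkfn, and
   a plain bool for __kmp_global.g.g_done. worker_pool_t and its members are
   illustrative only; the real runtime parks threads on fork/join barriers, not
   on a mutex.

   #include <condition_variable>
   #include <functional>
   #include <mutex>

   struct worker_pool_t {
     std::mutex m;
     std::condition_variable cv;
     std::function<void()> task; // work published by the primary thread
     bool done = false;          // library shutdown flag

     void worker() {
       std::unique_lock<std::mutex> lk(m);
       while (!done) {
         // "fork barrier": sleep until work arrives or shutdown is requested
         cv.wait(lk, [&] { return done || task != nullptr; });
         if (task) {
           std::function<void()> fn = std::move(task);
           task = nullptr;
           lk.unlock();
           fn(); // invoke the microtask outside the lock
           lk.lock();
           cv.notify_all(); // "join barrier": report completion
         }
       }
     }
   };
*/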
5989 
5990 /* ------------------------------------------------------------------------ */
5991 
5992 void __kmp_internal_end_dest(void *specific_gtid) {
5993  // Make sure no significant bits are lost
5994  int gtid;
5995  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5996 
5997  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5998  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5999  * this is because 0 is reserved for the nothing-stored case */
6000 
6001  __kmp_internal_end_thread(gtid);
6002 }
6003 
6004 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6005 
6006 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6007  __kmp_internal_end_atexit();
6008 }
6009 
6010 #endif
6011 
6012 /* [Windows] josh: when the atexit handler is called, there may still be more
6013  than one thread alive */
6014 void __kmp_internal_end_atexit(void) {
6015  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6016  /* [Windows]
6017  josh: ideally, we want to completely shutdown the library in this atexit
6018  handler, but stat code that depends on thread specific data for gtid fails
6019  because that data becomes unavailable at some point during the shutdown, so
6020  we call __kmp_internal_end_thread instead. We should eventually remove the
6021  dependency on __kmp_get_specific_gtid in the stat code and use
6022  __kmp_internal_end_library to cleanly shutdown the library.
6023 
6024  // TODO: Can some of this comment about GVS be removed?
6025  I suspect that the offending stat code is executed when the calling thread
6026  tries to clean up a dead root thread's data structures, resulting in GVS
6027  code trying to close the GVS structures for that thread, but since the stat
6028  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6029  the calling thread is cleaning up itself instead of another thread, it gets
6030  confused. This happens because allowing a thread to unregister and cleanup
6031  another thread is a recent modification for addressing an issue.
6032  Based on the current design (20050722), a thread may end up
6033  trying to unregister another thread only if thread death does not trigger
6034  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6035  thread specific data destructor function to detect thread death. For
6036  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6037  is nothing. Thus, the workaround is applicable only for Windows static
6038  stat library. */
6039  __kmp_internal_end_library(-1);
6040 #if KMP_OS_WINDOWS
6041  __kmp_close_console();
6042 #endif
6043 }
6044 
6045 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6046  // It is assumed __kmp_forkjoin_lock is acquired.
6047 
6048  int gtid;
6049 
6050  KMP_DEBUG_ASSERT(thread != NULL);
6051 
6052  gtid = thread->th.th_info.ds.ds_gtid;
6053 
6054  if (!is_root) {
6055  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6056  /* Assume the threads are at the fork barrier here */
6057  KA_TRACE(
6058  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6059  gtid));
6060  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6061  while (
6062  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6063  KMP_CPU_PAUSE();
6064  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6065  } else {
6066  /* Need release fence here to prevent seg faults for tree forkjoin
6067  barrier (GEH) */
6068  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6069  thread);
6070  __kmp_release_64(&flag);
6071  }
6072  }
6073 
6074  // Terminate OS thread.
6075  __kmp_reap_worker(thread);
6076 
6077  // The thread was killed asynchronously. If it was actively
6078  // spinning in the thread pool, decrement the global count.
6079  //
6080  // There is a small timing hole here - if the worker thread was just waking
6081  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6082  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6083  // the global counter might not get updated.
6084  //
6085  // Currently, this can only happen as the library is unloaded,
6086  // so there are no harmful side effects.
6087  if (thread->th.th_active_in_pool) {
6088  thread->th.th_active_in_pool = FALSE;
6089  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6090  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6091  }
6092  }
6093 
6094  __kmp_free_implicit_task(thread);
6095 
6096 // Free the fast memory for tasking
6097 #if USE_FAST_MEMORY
6098  __kmp_free_fast_memory(thread);
6099 #endif /* USE_FAST_MEMORY */
6100 
6101  __kmp_suspend_uninitialize_thread(thread);
6102 
6103  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6104  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6105 
6106  --__kmp_all_nth;
6107  // __kmp_nth was decremented when thread is added to the pool.
6108 
6109 #ifdef KMP_ADJUST_BLOCKTIME
6110  /* Adjust blocktime back to user setting or default if necessary */
6111  /* Middle initialization might never have occurred */
6112  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6113  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6114  if (__kmp_nth <= __kmp_avail_proc) {
6115  __kmp_zero_bt = FALSE;
6116  }
6117  }
6118 #endif /* KMP_ADJUST_BLOCKTIME */
6119 
6120  /* free the memory being used */
6121  if (__kmp_env_consistency_check) {
6122  if (thread->th.th_cons) {
6123  __kmp_free_cons_stack(thread->th.th_cons);
6124  thread->th.th_cons = NULL;
6125  }
6126  }
6127 
6128  if (thread->th.th_pri_common != NULL) {
6129  __kmp_free(thread->th.th_pri_common);
6130  thread->th.th_pri_common = NULL;
6131  }
6132 
6133  if (thread->th.th_task_state_memo_stack != NULL) {
6134  __kmp_free(thread->th.th_task_state_memo_stack);
6135  thread->th.th_task_state_memo_stack = NULL;
6136  }
6137 
6138 #if KMP_USE_BGET
6139  if (thread->th.th_local.bget_data != NULL) {
6140  __kmp_finalize_bget(thread);
6141  }
6142 #endif
6143 
6144 #if KMP_AFFINITY_SUPPORTED
6145  if (thread->th.th_affin_mask != NULL) {
6146  KMP_CPU_FREE(thread->th.th_affin_mask);
6147  thread->th.th_affin_mask = NULL;
6148  }
6149 #endif /* KMP_AFFINITY_SUPPORTED */
6150 
6151 #if KMP_USE_HIER_SCHED
6152  if (thread->th.th_hier_bar_data != NULL) {
6153  __kmp_free(thread->th.th_hier_bar_data);
6154  thread->th.th_hier_bar_data = NULL;
6155  }
6156 #endif
6157 
6158  __kmp_reap_team(thread->th.th_serial_team);
6159  thread->th.th_serial_team = NULL;
6160  __kmp_free(thread);
6161 
6162  KMP_MB();
6163 
6164 } // __kmp_reap_thread
6165 
6166 static void __kmp_internal_end(void) {
6167  int i;
6168 
6169  /* First, unregister the library */
6170  __kmp_unregister_library();
6171 
6172 #if KMP_OS_WINDOWS
6173  /* In Win static library, we can't tell when a root actually dies, so we
6174  reclaim the data structures for any root threads that have died but not
6175  unregistered themselves, in order to shut down cleanly.
6176  In Win dynamic library we also can't tell when a thread dies. */
6177  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6178 // dead roots
6179 #endif
6180 
6181  for (i = 0; i < __kmp_threads_capacity; i++)
6182  if (__kmp_root[i])
6183  if (__kmp_root[i]->r.r_active)
6184  break;
6185  KMP_MB(); /* Flush all pending memory write invalidates. */
6186  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6187 
6188  if (i < __kmp_threads_capacity) {
6189 #if KMP_USE_MONITOR
6190  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6191  KMP_MB(); /* Flush all pending memory write invalidates. */
6192 
6193  // Need to check that monitor was initialized before reaping it. If we are
6194  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6195  // __kmp_monitor will appear to contain valid data, but it is only valid in
6196  // the parent process, not the child.
6197  // New behavior (201008): instead of keying off of the flag
6198  // __kmp_init_parallel, the monitor thread creation is keyed off
6199  // of the new flag __kmp_init_monitor.
6200  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6201  if (TCR_4(__kmp_init_monitor)) {
6202  __kmp_reap_monitor(&__kmp_monitor);
6203  TCW_4(__kmp_init_monitor, 0);
6204  }
6205  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6206  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6207 #endif // KMP_USE_MONITOR
6208  } else {
6209 /* TODO move this to cleanup code */
6210 #ifdef KMP_DEBUG
6211  /* make sure that everything has properly ended */
6212  for (i = 0; i < __kmp_threads_capacity; i++) {
6213  if (__kmp_root[i]) {
6214  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6215  // there can be uber threads alive here
6216  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6217  }
6218  }
6219 #endif
6220 
6221  KMP_MB();
6222 
6223  // Reap the worker threads.
6224  // This is valid for now, but be careful if threads are reaped sooner.
6225  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6226  // Get the next thread from the pool.
6227  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6228  __kmp_thread_pool = thread->th.th_next_pool;
6229  // Reap it.
6230  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6231  thread->th.th_next_pool = NULL;
6232  thread->th.th_in_pool = FALSE;
6233  __kmp_reap_thread(thread, 0);
6234  }
6235  __kmp_thread_pool_insert_pt = NULL;
6236 
6237  // Reap teams.
6238  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6239  // Get the next team from the pool.
6240  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6241  __kmp_team_pool = team->t.t_next_pool;
6242  // Reap it.
6243  team->t.t_next_pool = NULL;
6244  __kmp_reap_team(team);
6245  }
6246 
6247  __kmp_reap_task_teams();
6248 
6249 #if KMP_OS_UNIX
6250  // Threads that are not reaped should not access any resources since they
6251  // are going to be deallocated soon, so the shutdown sequence should wait
6252  // until all threads either exit the final spin-waiting loop or begin
6253  // sleeping after the given blocktime.
6254  for (i = 0; i < __kmp_threads_capacity; i++) {
6255  kmp_info_t *thr = __kmp_threads[i];
6256  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6257  KMP_CPU_PAUSE();
6258  }
6259 #endif
6260 
6261  for (i = 0; i < __kmp_threads_capacity; ++i) {
6262  // TBD: Add some checking...
6263  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6264  }
6265 
6266  /* Make sure all threadprivate destructors get run by joining with all
6267  worker threads before resetting this flag */
6268  TCW_SYNC_4(__kmp_init_common, FALSE);
6269 
6270  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6271  KMP_MB();
6272 
6273 #if KMP_USE_MONITOR
6274  // See note above: One of the possible fixes for CQ138434 / CQ140126
6275  //
6276  // FIXME: push both code fragments down and CSE them?
6277  // push them into __kmp_cleanup() ?
6278  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6279  if (TCR_4(__kmp_init_monitor)) {
6280  __kmp_reap_monitor(&__kmp_monitor);
6281  TCW_4(__kmp_init_monitor, 0);
6282  }
6283  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6284  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6285 #endif
6286  } /* else !__kmp_global.t_active */
6287  TCW_4(__kmp_init_gtid, FALSE);
6288  KMP_MB(); /* Flush all pending memory write invalidates. */
6289 
6290  __kmp_cleanup();
6291 #if OMPT_SUPPORT
6292  ompt_fini();
6293 #endif
6294 }
6295 
6296 void __kmp_internal_end_library(int gtid_req) {
6297  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6298  /* this shouldn't be a race condition because __kmp_internal_end() is the
6299  only place to clear __kmp_init_serial */
6300  /* we'll check this later too, after we get the lock */
6301  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6302  // redundant, because the next check will work in any case.
6303  if (__kmp_global.g.g_abort) {
6304  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6305  /* TODO abort? */
6306  return;
6307  }
6308  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6309  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6310  return;
6311  }
6312 
6313  // If the hidden helper team has been initialized, we need to deinitialize it
6314  if (TCR_4(__kmp_init_hidden_helper) &&
6315  !TCR_4(__kmp_hidden_helper_team_done)) {
6316  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6317  // First release the main thread to let it continue its work
6318  __kmp_hidden_helper_main_thread_release();
6319  // Wait until the hidden helper team has been destroyed
6320  __kmp_hidden_helper_threads_deinitz_wait();
6321  }
6322 
6323  KMP_MB(); /* Flush all pending memory write invalidates. */
6324  /* find out who we are and what we should do */
6325  {
6326  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6327  KA_TRACE(
6328  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6329  if (gtid == KMP_GTID_SHUTDOWN) {
6330  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6331  "already shutdown\n"));
6332  return;
6333  } else if (gtid == KMP_GTID_MONITOR) {
6334  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6335  "registered, or system shutdown\n"));
6336  return;
6337  } else if (gtid == KMP_GTID_DNE) {
6338  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6339  "shutdown\n"));
6340  /* we don't know who we are, but we may still shutdown the library */
6341  } else if (KMP_UBER_GTID(gtid)) {
6342  /* unregister ourselves as an uber thread. gtid is no longer valid */
6343  if (__kmp_root[gtid]->r.r_active) {
6344  __kmp_global.g.g_abort = -1;
6345  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6346  __kmp_unregister_library();
6347  KA_TRACE(10,
6348  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6349  gtid));
6350  return;
6351  } else {
6352  KA_TRACE(
6353  10,
6354  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6355  __kmp_unregister_root_current_thread(gtid);
6356  }
6357  } else {
6358 /* worker threads may call this function through the atexit handler, if they
6359  * call exit() */
6360 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6361  TODO: do a thorough shutdown instead */
6362 #ifdef DUMP_DEBUG_ON_EXIT
6363  if (__kmp_debug_buf)
6364  __kmp_dump_debug_buffer();
6365 #endif
6366  // An unregister-library call was added here when we switched to shared
6367  // memory (shm) on Linux; without it, stale files are left in /dev/shm.
6368  // Clean up the shared memory file before exiting.
6369  __kmp_unregister_library();
6370  return;
6371  }
6372  }
6373  /* synchronize the termination process */
6374  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6375 
6376  /* have we already finished */
6377  if (__kmp_global.g.g_abort) {
6378  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6379  /* TODO abort? */
6380  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6381  return;
6382  }
6383  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6384  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6385  return;
6386  }
6387 
6388  /* We need this lock to enforce mutex between this reading of
6389  __kmp_threads_capacity and the writing by __kmp_register_root.
6390  Alternatively, we can use a counter of roots that is atomically updated by
6391  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6392  __kmp_internal_end_*. */
6393  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6394 
6395  /* now we can safely conduct the actual termination */
6396  __kmp_internal_end();
6397 
6398  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6399  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6400 
6401  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6402 
6403 #ifdef DUMP_DEBUG_ON_EXIT
6404  if (__kmp_debug_buf)
6405  __kmp_dump_debug_buffer();
6406 #endif
6407 
6408 #if KMP_OS_WINDOWS
6409  __kmp_close_console();
6410 #endif
6411 
6412  __kmp_fini_allocator();
6413 
6414 } // __kmp_internal_end_library
6415 
6416 void __kmp_internal_end_thread(int gtid_req) {
6417  int i;
6418 
6419  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6420  /* this shouldn't be a race condition because __kmp_internal_end() is the
6421  * only place to clear __kmp_init_serial */
6422  /* we'll check this later too, after we get the lock */
6423  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6424  // redundant, because the next check will work in any case.
6425  if (__kmp_global.g.g_abort) {
6426  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6427  /* TODO abort? */
6428  return;
6429  }
6430  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6431  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6432  return;
6433  }
6434 
6435  // If hidden helper team has been initialized, we need to deinit it
6436  // If the hidden helper team has been initialized, we need to deinitialize it
6437  !TCR_4(__kmp_hidden_helper_team_done)) {
6438  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6439  // First release the main thread to let it continue its work
6440  __kmp_hidden_helper_main_thread_release();
6441  // Wait until the hidden helper team has been destroyed
6442  __kmp_hidden_helper_threads_deinitz_wait();
6443  }
6444 
6445  KMP_MB(); /* Flush all pending memory write invalidates. */
6446 
6447  /* find out who we are and what we should do */
6448  {
6449  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6450  KA_TRACE(10,
6451  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6452  if (gtid == KMP_GTID_SHUTDOWN) {
6453  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6454  "already shutdown\n"));
6455  return;
6456  } else if (gtid == KMP_GTID_MONITOR) {
6457  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6458  "registered, or system shutdown\n"));
6459  return;
6460  } else if (gtid == KMP_GTID_DNE) {
6461  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6462  "shutdown\n"));
6463  return;
6464  /* we don't know who we are */
6465  } else if (KMP_UBER_GTID(gtid)) {
6466  /* unregister ourselves as an uber thread. gtid is no longer valid */
6467  if (__kmp_root[gtid]->r.r_active) {
6468  __kmp_global.g.g_abort = -1;
6469  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6470  KA_TRACE(10,
6471  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6472  gtid));
6473  return;
6474  } else {
6475  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6476  gtid));
6477  __kmp_unregister_root_current_thread(gtid);
6478  }
6479  } else {
6480  /* just a worker thread, let's leave */
6481  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6482 
6483  if (gtid >= 0) {
6484  __kmp_threads[gtid]->th.th_task_team = NULL;
6485  }
6486 
6487  KA_TRACE(10,
6488  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6489  gtid));
6490  return;
6491  }
6492  }
6493 #if KMP_DYNAMIC_LIB
6494  if (__kmp_pause_status != kmp_hard_paused)
6495  // AC: let's not shut down the dynamic library at the exit of an uber
6496  // thread; it is better to shut down later, in the library destructor.
6497  {
6498  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6499  return;
6500  }
6501 #endif
6502  /* synchronize the termination process */
6503  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6504 
6505  /* have we already finished */
6506  if (__kmp_global.g.g_abort) {
6507  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6508  /* TODO abort? */
6509  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6510  return;
6511  }
6512  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6513  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6514  return;
6515  }
6516 
6517  /* We need this lock to enforce mutex between this reading of
6518  __kmp_threads_capacity and the writing by __kmp_register_root.
6519  Alternatively, we can use a counter of roots that is atomically updated by
6520  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6521  __kmp_internal_end_*. */
6522 
6523  /* should we finish the run-time? are all siblings done? */
6524  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6525 
6526  for (i = 0; i < __kmp_threads_capacity; ++i) {
6527  if (KMP_UBER_GTID(i)) {
6528  KA_TRACE(
6529  10,
6530  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6531  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6532  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6533  return;
6534  }
6535  }
6536 
6537  /* now we can safely conduct the actual termination */
6538 
6539  __kmp_internal_end();
6540 
6541  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6542  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543 
6544  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6545 
6546 #ifdef DUMP_DEBUG_ON_EXIT
6547  if (__kmp_debug_buf)
6548  __kmp_dump_debug_buffer();
6549 #endif
6550 } // __kmp_internal_end_thread
6551 
6552 // -----------------------------------------------------------------------------
6553 // Library registration stuff.
6554 
6555 static long __kmp_registration_flag = 0;
6556 // Random value used to indicate library initialization.
6557 static char *__kmp_registration_str = NULL;
6558 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6559 
6560 static inline char *__kmp_reg_status_name() {
6561 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6562  each thread. If registration and unregistration happen in different threads
6563  (omp_misc_other_root_exit.cpp test case), the registered_lib_env environment
6564  variable cannot be found, because its name will contain a different pid. */
6565 // macOS* complains about name being too long with additional getuid()
6566 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6567  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6568  (int)getuid());
6569 #else
6570  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6571 #endif
6572 } // __kmp_reg_status_name
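// For illustration only: with a hypothetical pid 1234 (and uid 1000 on the
// Unix dynamic-library path), the generated names would look like
// "__KMP_REGISTERED_LIB_1234_1000" and "__KMP_REGISTERED_LIB_1234",
// respectively.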
6573 
6574 void __kmp_register_library_startup(void) {
6575 
6576  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6577  int done = 0;
6578  union {
6579  double dtime;
6580  long ltime;
6581  } time;
6582 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6583  __kmp_initialize_system_tick();
6584 #endif
6585  __kmp_read_system_time(&time.dtime);
6586  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6587  __kmp_registration_str =
6588  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6589  __kmp_registration_flag, KMP_LIBRARY_FILE);
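// The registration string has the shape "<flag address>-<flag value>-<library
// file>", e.g. "0x7f0a1c2d3e40-cafe1234-libomp.so" (illustrative values only).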
6590 
6591  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6592  __kmp_registration_str));
6593 
6594  while (!done) {
6595 
6596  char *value = NULL; // Actual value of the environment variable.
6597 
6598 #if defined(KMP_USE_SHM)
6599  char *shm_name = __kmp_str_format("/%s", name);
6600  int shm_preexist = 0;
6601  char *data1;
6602  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6603  if ((fd1 == -1) && (errno == EEXIST)) {
6604  // file didn't open because it already exists.
6605  // try opening existing file
6606  fd1 = shm_open(shm_name, O_RDWR, 0666);
6607  if (fd1 == -1) { // file didn't open
6608  // error out here
6609  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6610  __kmp_msg_null);
6611  } else {
6612  // able to open existing file
6613  shm_preexist = 1;
6614  }
6615  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6616  // "already exists".
6617  // error out here.
6618  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6619  __kmp_msg_null);
6620  }
6621  if (shm_preexist == 0) {
6622  // we created the SHM; now set its size
6623  if (ftruncate(fd1, SHM_SIZE) == -1) {
6624  // error occurred setting size
6625  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6626  KMP_ERR(errno), __kmp_msg_null);
6627  }
6628  }
6629  data1 =
6630  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6631  if (data1 == MAP_FAILED) {
6632  // failed to map shared memory
6633  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6634  __kmp_msg_null);
6635  }
6636  if (shm_preexist == 0) { // set data to SHM, set value
6637  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6638  }
6639  // Read value from either what we just wrote or existing file.
6640  value = __kmp_str_format("%s", data1); // read value from SHM
6641  munmap(data1, SHM_SIZE);
6642  close(fd1);
6643 #else // Windows and unix with static library
6644  // Set the environment variable, but do not overwrite it if it already exists.
6645  __kmp_env_set(name, __kmp_registration_str, 0);
6646  // read value to see if it got set
6647  value = __kmp_env_get(name);
6648 #endif
6649 
6650  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6651  done = 1; // Ok, environment variable set successfully, exit the loop.
6652  } else {
6653  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6654  // Check whether it is alive or dead.
6655  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6656  char *tail = value;
6657  char *flag_addr_str = NULL;
6658  char *flag_val_str = NULL;
6659  char const *file_name = NULL;
6660  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6661  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6662  file_name = tail;
6663  if (tail != NULL) {
6664  unsigned long *flag_addr = 0;
6665  unsigned long flag_val = 0;
6666  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6667  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6668  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6669  // First, check whether environment-encoded address is mapped into
6670  // addr space.
6671  // If so, dereference it to see if it still has the right value.
6672  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6673  neighbor = 1;
6674  } else {
6675  // If not, then we know the other copy of the library is no longer
6676  // running.
6677  neighbor = 2;
6678  }
6679  }
6680  }
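// At this point neighbor is 0 (value could not be parsed), 1 (the other copy
// of the runtime appears to be alive), or 2 (the entry is stale, left behind
// by a process that has already exited).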
6681  switch (neighbor) {
6682  case 0: // Cannot parse environment variable -- neighbor status unknown.
6683  // Assume it is the incompatible format of a future version of the
6684  // library, and assume the other library is alive.
6685  // WARN( ... ); // TODO: Issue a warning.
6686  file_name = "unknown library";
6687  KMP_FALLTHROUGH();
6688  // Attention! Falling through to the next case. That's intentional.
6689  case 1: { // Neighbor is alive.
6690  // Check it is allowed.
6691  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6692  if (!__kmp_str_match_true(duplicate_ok)) {
6693  // That's not allowed. Issue fatal error.
6694  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6695  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6696  }
6697  KMP_INTERNAL_FREE(duplicate_ok);
6698  __kmp_duplicate_library_ok = 1;
6699  done = 1; // Exit the loop.
6700  } break;
6701  case 2: { // Neighbor is dead.
6702 
6703 #if defined(KMP_USE_SHM)
6704  // close shared memory.
6705  shm_unlink(shm_name); // this removes file in /dev/shm
6706 #else
6707  // Clear the variable and try to register library again.
6708  __kmp_env_unset(name);
6709 #endif
6710  } break;
6711  default: {
6712  KMP_DEBUG_ASSERT(0);
6713  } break;
6714  }
6715  }
6716  KMP_INTERNAL_FREE((void *)value);
6717 #if defined(KMP_USE_SHM)
6718  KMP_INTERNAL_FREE((void *)shm_name);
6719 #endif
6720  } // while
6721  KMP_INTERNAL_FREE((void *)name);
6722 
6723 } // func __kmp_register_library_startup
6724 
6725 void __kmp_unregister_library(void) {
6726 
6727  char *name = __kmp_reg_status_name();
6728  char *value = NULL;
6729 
6730 #if defined(KMP_USE_SHM)
6731  char *shm_name = __kmp_str_format("/%s", name);
6732  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6733  if (fd1 == -1) {
6734  // file did not open. return.
6735  return;
6736  }
6737  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6738  if (data1 != MAP_FAILED) {
6739  value = __kmp_str_format("%s", data1); // read value from SHM
6740  munmap(data1, SHM_SIZE);
6741  }
6742  close(fd1);
6743 #else
6744  value = __kmp_env_get(name);
6745 #endif
6746 
6747  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6748  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6749  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6750 // Ok, this is our variable. Delete it.
6751 #if defined(KMP_USE_SHM)
6752  shm_unlink(shm_name); // this removes file in /dev/shm
6753 #else
6754  __kmp_env_unset(name);
6755 #endif
6756  }
6757 
6758 #if defined(KMP_USE_SHM)
6759  KMP_INTERNAL_FREE(shm_name);
6760 #endif
6761 
6762  KMP_INTERNAL_FREE(__kmp_registration_str);
6763  KMP_INTERNAL_FREE(value);
6764  KMP_INTERNAL_FREE(name);
6765 
6766  __kmp_registration_flag = 0;
6767  __kmp_registration_str = NULL;
6768 
6769 } // __kmp_unregister_library
6770 
6771 // End of Library registration stuff.
6772 // -----------------------------------------------------------------------------
6773 
6774 #if KMP_MIC_SUPPORTED
6775 
6776 static void __kmp_check_mic_type() {
6777  kmp_cpuid_t cpuid_state = {0};
6778  kmp_cpuid_t *cs_p = &cpuid_state;
6779  __kmp_x86_cpuid(1, 0, cs_p);
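// CPUID leaf 1 reports the family/model signature in EAX; the masked
// comparisons below are intended to match the Knights Corner (KNC, -> mic2)
// and Knights Landing (KNL, -> mic3) signatures, respectively.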
6780  // We don't support mic1 at the moment
6781  if ((cs_p->eax & 0xff0) == 0xB10) {
6782  __kmp_mic_type = mic2;
6783  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6784  __kmp_mic_type = mic3;
6785  } else {
6786  __kmp_mic_type = non_mic;
6787  }
6788 }
6789 
6790 #endif /* KMP_MIC_SUPPORTED */
6791 
6792 #if KMP_HAVE_UMWAIT
6793 static void __kmp_user_level_mwait_init() {
6794  struct kmp_cpuid buf;
6795  __kmp_x86_cpuid(7, 0, &buf);
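// CPUID leaf 7 (sub-leaf 0), ECX bit 5 is the WAITPKG feature flag, which
// advertises the UMONITOR/UMWAIT/TPAUSE instructions used for user-level
// mwait.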
6796  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6797  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6798  __kmp_umwait_enabled));
6799 }
6800 #elif KMP_HAVE_MWAIT
6801 #ifndef AT_INTELPHIUSERMWAIT
6802 // Spurious, non-existent value that should always fail to return anything.
6803 // Will be replaced with the correct value once it is known.
6804 #define AT_INTELPHIUSERMWAIT 10000
6805 #endif
6806 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
6807 // earlier OS is used to build the RTL, we'll use the following internal
6808 // function when the entry is not found.
6809 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6810 unsigned long getauxval(unsigned long) { return 0; }
6811 
6812 static void __kmp_user_level_mwait_init() {
6813  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
6814  // available, use them to determine whether user-level mwait is enabled.
6815  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
6816  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6817  if (__kmp_mic_type == mic3) {
6818  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6819  if ((res & 0x1) || __kmp_user_level_mwait) {
6820  __kmp_mwait_enabled = TRUE;
6821  if (__kmp_user_level_mwait) {
6822  KMP_INFORM(EnvMwaitWarn);
6823  }
6824  } else {
6825  __kmp_mwait_enabled = FALSE;
6826  }
6827  }
6828  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6829  "__kmp_mwait_enabled = %d\n",
6830  __kmp_mic_type, __kmp_mwait_enabled));
6831 }
6832 #endif /* KMP_HAVE_UMWAIT */
6833 
6834 static void __kmp_do_serial_initialize(void) {
6835  int i, gtid;
6836  size_t size;
6837 
6838  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6839 
6840  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6841  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6842  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6843  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6844  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6845 
6846 #if OMPT_SUPPORT
6847  ompt_pre_init();
6848 #endif
6849 #if OMPD_SUPPORT
6850  __kmp_env_dump();
6851  ompd_init();
6852 #endif
6853 
6854  __kmp_validate_locks();
6855 
6856  /* Initialize internal memory allocator */
6857  __kmp_init_allocator();
6858 
6859  /* Register the library startup via an environment variable and check to see
6860  whether another copy of the library is already registered. */
6861 
6862  __kmp_register_library_startup();
6863 
6864  /* TODO reinitialization of library */
6865  if (TCR_4(__kmp_global.g.g_done)) {
6866  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6867  }
6868 
6869  __kmp_global.g.g_abort = 0;
6870  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6871 
6872 /* initialize the locks */
6873 #if KMP_USE_ADAPTIVE_LOCKS
6874 #if KMP_DEBUG_ADAPTIVE_LOCKS
6875  __kmp_init_speculative_stats();
6876 #endif
6877 #endif
6878 #if KMP_STATS_ENABLED
6879  __kmp_stats_init();
6880 #endif
6881  __kmp_init_lock(&__kmp_global_lock);
6882  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6883  __kmp_init_lock(&__kmp_debug_lock);
6884  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6885  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6886  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6887  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6888  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6889  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6890  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6891  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6892  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6893  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6894  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6895  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6896  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6897  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6898  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6899 #if KMP_USE_MONITOR
6900  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6901 #endif
6902  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6903 
6904  /* conduct initialization and initial setup of configuration */
6905 
6906  __kmp_runtime_initialize();
6907 
6908 #if KMP_MIC_SUPPORTED
6909  __kmp_check_mic_type();
6910 #endif
6911 
6912 // Some global variable initialization moved here from kmp_env_initialize()
6913 #ifdef KMP_DEBUG
6914  kmp_diag = 0;
6915 #endif
6916  __kmp_abort_delay = 0;
6917 
6918  // From __kmp_init_dflt_team_nth()
6919  /* assume the entire machine will be used */
6920  __kmp_dflt_team_nth_ub = __kmp_xproc;
6921  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6922  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6923  }
6924  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6925  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6926  }
6927  __kmp_max_nth = __kmp_sys_max_nth;
6928  __kmp_cg_max_nth = __kmp_sys_max_nth;
6929  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6930  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6931  __kmp_teams_max_nth = __kmp_sys_max_nth;
6932  }
6933 
6934  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6935  // part
6936  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6937 #if KMP_USE_MONITOR
6938  __kmp_monitor_wakeups =
6939  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6940  __kmp_bt_intervals =
6941  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6942 #endif
6943  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6944  __kmp_library = library_throughput;
6945  // From KMP_SCHEDULE initialization
6946  __kmp_static = kmp_sch_static_balanced;
6947 // AC: do not use analytical here, because it is non-monotonic
6948 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6949 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6950 // need to repeat assignment
6951 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6952 // bit control and barrier method control parts
6953 #if KMP_FAST_REDUCTION_BARRIER
6954 #define kmp_reduction_barrier_gather_bb ((int)1)
6955 #define kmp_reduction_barrier_release_bb ((int)1)
6956 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
6957 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
6958 #endif // KMP_FAST_REDUCTION_BARRIER
6959  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6960  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6961  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6962  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6963  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6964 #if KMP_FAST_REDUCTION_BARRIER
6965  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6966  // lin_64 ): hyper,1
6967  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6968  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6969  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6970  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6971  }
6972 #endif // KMP_FAST_REDUCTION_BARRIER
6973  }
6974 #if KMP_FAST_REDUCTION_BARRIER
6975 #undef kmp_reduction_barrier_release_pat
6976 #undef kmp_reduction_barrier_gather_pat
6977 #undef kmp_reduction_barrier_release_bb
6978 #undef kmp_reduction_barrier_gather_bb
6979 #endif // KMP_FAST_REDUCTION_BARRIER
6980 #if KMP_MIC_SUPPORTED
6981  if (__kmp_mic_type == mic2) { // KNC
6982  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6983  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6984  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6985  1; // forkjoin release
6986  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6987  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6988  }
6989 #if KMP_FAST_REDUCTION_BARRIER
6990  if (__kmp_mic_type == mic2) { // KNC
6991  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6992  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6993  }
6994 #endif // KMP_FAST_REDUCTION_BARRIER
6995 #endif // KMP_MIC_SUPPORTED
6996 
6997 // From KMP_CHECKS initialization
6998 #ifdef KMP_DEBUG
6999  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7000 #else
7001  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7002 #endif
7003 
7004  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7005  __kmp_foreign_tp = TRUE;
7006 
7007  __kmp_global.g.g_dynamic = FALSE;
7008  __kmp_global.g.g_dynamic_mode = dynamic_default;
7009 
7010  __kmp_init_nesting_mode();
7011 
7012  __kmp_env_initialize(NULL);
7013 
7014 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7015  __kmp_user_level_mwait_init();
7016 #endif
7017 // Print all messages in message catalog for testing purposes.
7018 #ifdef KMP_DEBUG
7019  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7020  if (__kmp_str_match_true(val)) {
7021  kmp_str_buf_t buffer;
7022  __kmp_str_buf_init(&buffer);
7023  __kmp_i18n_dump_catalog(&buffer);
7024  __kmp_printf("%s", buffer.str);
7025  __kmp_str_buf_free(&buffer);
7026  }
7027  __kmp_env_free(&val);
7028 #endif
7029 
7030  __kmp_threads_capacity =
7031  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7032  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7033  __kmp_tp_capacity = __kmp_default_tp_capacity(
7034  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7035 
7036  // If the library is shut down properly, both pools must be NULL. Just in
7037  // case, set them to NULL -- some memory may leak, but subsequent code will
7038  // work even if pools are not freed.
7039  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7040  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7041  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7042  __kmp_thread_pool = NULL;
7043  __kmp_thread_pool_insert_pt = NULL;
7044  __kmp_team_pool = NULL;
7045 
7046  /* Allocate all of the variable sized records */
7047  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7048  * expandable */
7049  /* Since allocation is cache-aligned, just add extra padding at the end */
7050  size =
7051  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7052  CACHE_LINE;
7053  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7054  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7055  sizeof(kmp_info_t *) * __kmp_threads_capacity);
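// Layout of the single cache-aligned allocation: __kmp_threads[0..capacity-1]
// immediately followed by __kmp_root[0..capacity-1], with CACHE_LINE bytes of
// trailing padding.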
7056 
7057  /* init thread counts */
7058  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7059  0); // Asserts fail if the library is reinitializing and
7060  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7061  __kmp_all_nth = 0;
7062  __kmp_nth = 0;
7063 
7064  /* setup the uber master thread and hierarchy */
7065  gtid = __kmp_register_root(TRUE);
7066  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7067  KMP_ASSERT(KMP_UBER_GTID(gtid));
7068  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7069 
7070  KMP_MB(); /* Flush all pending memory write invalidates. */
7071 
7072  __kmp_common_initialize();
7073 
7074 #if KMP_OS_UNIX
7075  /* invoke the child fork handler */
7076  __kmp_register_atfork();
7077 #endif
7078 
7079 #if !KMP_DYNAMIC_LIB
7080  {
7081  /* Invoke the exit handler when the program finishes, only for static
7082  library. For dynamic library, we already have _fini and DllMain. */
7083  int rc = atexit(__kmp_internal_end_atexit);
7084  if (rc != 0) {
7085  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7086  __kmp_msg_null);
7087  }
7088  }
7089 #endif
7090 
7091 #if KMP_HANDLE_SIGNALS
7092 #if KMP_OS_UNIX
7093  /* NOTE: make sure that this is called before the user installs their own
7094  signal handlers so that the user handlers are called first. this way they
7095  can return false, not call our handler, avoid terminating the library, and
7096  continue execution where they left off. */
7097  __kmp_install_signals(FALSE);
7098 #endif /* KMP_OS_UNIX */
7099 #if KMP_OS_WINDOWS
7100  __kmp_install_signals(TRUE);
7101 #endif /* KMP_OS_WINDOWS */
7102 #endif
7103 
7104  /* we have finished the serial initialization */
7105  __kmp_init_counter++;
7106 
7107  __kmp_init_serial = TRUE;
7108 
7109  if (__kmp_settings) {
7110  __kmp_env_print();
7111  }
7112 
7113  if (__kmp_display_env || __kmp_display_env_verbose) {
7114  __kmp_env_print_2();
7115  }
7116 
7117 #if OMPT_SUPPORT
7118  ompt_post_init();
7119 #endif
7120 
7121  KMP_MB();
7122 
7123  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7124 }
7125 
7126 void __kmp_serial_initialize(void) {
7127  if (__kmp_init_serial) {
7128  return;
7129  }
7130  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7131  if (__kmp_init_serial) {
7132  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7133  return;
7134  }
7135  __kmp_do_serial_initialize();
7136  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7137 }
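// Note: __kmp_serial_initialize (and the middle/parallel variants below) use a
// double-checked pattern: an unlocked fast-path test of the init flag, then
// the same test again under __kmp_initz_lock, so that exactly one thread
// performs the actual initialization.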
7138 
7139 static void __kmp_do_middle_initialize(void) {
7140  int i, j;
7141  int prev_dflt_team_nth;
7142 
7143  if (!__kmp_init_serial) {
7144  __kmp_do_serial_initialize();
7145  }
7146 
7147  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7148 
7149  // Save the previous value for the __kmp_dflt_team_nth so that
7150  // we can avoid some reinitialization if it hasn't changed.
7151  prev_dflt_team_nth = __kmp_dflt_team_nth;
7152 
7153 #if KMP_AFFINITY_SUPPORTED
7154  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7155  // number of cores on the machine.
7156  __kmp_affinity_initialize();
7157 
7158 #endif /* KMP_AFFINITY_SUPPORTED */
7159 
7160  KMP_ASSERT(__kmp_xproc > 0);
7161  if (__kmp_avail_proc == 0) {
7162  __kmp_avail_proc = __kmp_xproc;
7163  }
7164 
7165  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7166  // correct them now
7167  j = 0;
7168  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7169  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7170  __kmp_avail_proc;
7171  j++;
7172  }
7173 
7174  if (__kmp_dflt_team_nth == 0) {
7175 #ifdef KMP_DFLT_NTH_CORES
7176  // Default #threads = #cores
7177  __kmp_dflt_team_nth = __kmp_ncores;
7178  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7179  "__kmp_ncores (%d)\n",
7180  __kmp_dflt_team_nth));
7181 #else
7182  // Default #threads = #available OS procs
7183  __kmp_dflt_team_nth = __kmp_avail_proc;
7184  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7185  "__kmp_avail_proc(%d)\n",
7186  __kmp_dflt_team_nth));
7187 #endif /* KMP_DFLT_NTH_CORES */
7188  }
7189 
7190  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7191  __kmp_dflt_team_nth = KMP_MIN_NTH;
7192  }
7193  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7194  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7195  }
7196 
7197  if (__kmp_nesting_mode > 0)
7198  __kmp_set_nesting_mode_threads();
7199 
7200  // There's no harm in continuing if the following check fails,
7201  // but it indicates an error in the previous logic.
7202  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7203 
7204  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7205  // Run through the __kmp_threads array and set the num threads icv for each
7206  // root thread that is currently registered with the RTL (which has not
7207  // already explicitly set its nthreads-var with a call to
7208  // omp_set_num_threads()).
7209  for (i = 0; i < __kmp_threads_capacity; i++) {
7210  kmp_info_t *thread = __kmp_threads[i];
7211  if (thread == NULL)
7212  continue;
7213  if (thread->th.th_current_task->td_icvs.nproc != 0)
7214  continue;
7215 
7216  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7217  }
7218  }
7219  KA_TRACE(
7220  20,
7221  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7222  __kmp_dflt_team_nth));
7223 
7224 #ifdef KMP_ADJUST_BLOCKTIME
7225  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7226  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7227  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7228  if (__kmp_nth > __kmp_avail_proc) {
7229  __kmp_zero_bt = TRUE;
7230  }
7231  }
7232 #endif /* KMP_ADJUST_BLOCKTIME */
7233 
7234  /* we have finished middle initialization */
7235  TCW_SYNC_4(__kmp_init_middle, TRUE);
7236 
7237  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7238 }
7239 
7240 void __kmp_middle_initialize(void) {
7241  if (__kmp_init_middle) {
7242  return;
7243  }
7244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7245  if (__kmp_init_middle) {
7246  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7247  return;
7248  }
7249  __kmp_do_middle_initialize();
7250  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7251 }
7252 
7253 void __kmp_parallel_initialize(void) {
7254  int gtid = __kmp_entry_gtid(); // this might be a new root
7255 
7256  /* synchronize parallel initialization (for sibling) */
7257  if (TCR_4(__kmp_init_parallel))
7258  return;
7259  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7260  if (TCR_4(__kmp_init_parallel)) {
7261  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7262  return;
7263  }
7264 
7265  /* TODO reinitialization after we have already shut down */
7266  if (TCR_4(__kmp_global.g.g_done)) {
7267  KA_TRACE(
7268  10,
7269  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7270  __kmp_infinite_loop();
7271  }
7272 
7273  /* jc: The lock __kmp_initz_lock is already held, so calling
7274  __kmp_serial_initialize would cause a deadlock. So we call
7275  __kmp_do_serial_initialize directly. */
7276  if (!__kmp_init_middle) {
7277  __kmp_do_middle_initialize();
7278  }
7279  __kmp_assign_root_init_mask();
7280  __kmp_resume_if_hard_paused();
7281 
7282  /* begin initialization */
7283  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7284  KMP_ASSERT(KMP_UBER_GTID(gtid));
7285 
7286 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7287  // Save the FP control regs.
7288  // Worker threads will set theirs to these values at thread startup.
7289  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7290  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7291  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7292 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7293 
7294 #if KMP_OS_UNIX
7295 #if KMP_HANDLE_SIGNALS
7296  /* must be after __kmp_serial_initialize */
7297  __kmp_install_signals(TRUE);
7298 #endif
7299 #endif
7300 
7301  __kmp_suspend_initialize();
7302 
7303 #if defined(USE_LOAD_BALANCE)
7304  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7305  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7306  }
7307 #else
7308  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7309  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7310  }
7311 #endif
7312 
7313  if (__kmp_version) {
7314  __kmp_print_version_2();
7315  }
7316 
7317  /* we have finished parallel initialization */
7318  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7319 
7320  KMP_MB();
7321  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7322 
7323  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7324 }
7325 
7326 void __kmp_hidden_helper_initialize() {
7327  if (TCR_4(__kmp_init_hidden_helper))
7328  return;
7329 
7330  // __kmp_parallel_initialize is required before we initialize hidden helper
7331  if (!TCR_4(__kmp_init_parallel))
7332  __kmp_parallel_initialize();
7333 
7334  // Double check. Note that this double check should not be placed before
7335  // __kmp_parallel_initialize, as that would cause a deadlock.
7336  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7337  if (TCR_4(__kmp_init_hidden_helper)) {
7338  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7339  return;
7340  }
7341 
7342  // Set the count of hidden helper tasks to be executed to zero
7343  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7344 
7345  // Set the global variable indicating that we're initializing hidden helper
7346  // team/threads
7347  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7348 
7349  // Platform independent initialization
7350  __kmp_do_initialize_hidden_helper_threads();
7351 
7352  // Wait here for the finish of initialization of hidden helper teams
7353  __kmp_hidden_helper_threads_initz_wait();
7354 
7355  // We have finished hidden helper initialization
7356  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7357 
7358  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7359 }
7360 
7361 /* ------------------------------------------------------------------------ */
7362 
7363 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7364  kmp_team_t *team) {
7365  kmp_disp_t *dispatch;
7366 
7367  KMP_MB();
7368 
7369  /* none of the threads have encountered any constructs, yet. */
7370  this_thr->th.th_local.this_construct = 0;
7371 #if KMP_CACHE_MANAGE
7372  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7373 #endif /* KMP_CACHE_MANAGE */
7374  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7375  KMP_DEBUG_ASSERT(dispatch);
7376  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7377  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7378  // this_thr->th.th_info.ds.ds_tid ] );
7379 
7380  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7381  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7382  if (__kmp_env_consistency_check)
7383  __kmp_push_parallel(gtid, team->t.t_ident);
7384 
7385  KMP_MB(); /* Flush all pending memory write invalidates. */
7386 }
7387 
7388 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7389  kmp_team_t *team) {
7390  if (__kmp_env_consistency_check)
7391  __kmp_pop_parallel(gtid, team->t.t_ident);
7392 
7393  __kmp_finish_implicit_task(this_thr);
7394 }
7395 
7396 int __kmp_invoke_task_func(int gtid) {
7397  int rc;
7398  int tid = __kmp_tid_from_gtid(gtid);
7399  kmp_info_t *this_thr = __kmp_threads[gtid];
7400  kmp_team_t *team = this_thr->th.th_team;
7401 
7402  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7403 #if USE_ITT_BUILD
7404  if (__itt_stack_caller_create_ptr) {
7405  // inform ittnotify about entering user's code
7406  if (team->t.t_stack_id != NULL) {
7407  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7408  } else {
7409  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7410  __kmp_itt_stack_callee_enter(
7411  (__itt_caller)team->t.t_parent->t.t_stack_id);
7412  }
7413  }
7414 #endif /* USE_ITT_BUILD */
7415 #if INCLUDE_SSC_MARKS
7416  SSC_MARK_INVOKING();
7417 #endif
7418 
7419 #if OMPT_SUPPORT
7420  void *dummy;
7421  void **exit_frame_p;
7422  ompt_data_t *my_task_data;
7423  ompt_data_t *my_parallel_data;
7424  int ompt_team_size;
7425 
7426  if (ompt_enabled.enabled) {
7427  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7428  .ompt_task_info.frame.exit_frame.ptr);
7429  } else {
7430  exit_frame_p = &dummy;
7431  }
7432 
7433  my_task_data =
7434  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7435  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7436  if (ompt_enabled.ompt_callback_implicit_task) {
7437  ompt_team_size = team->t.t_nproc;
7438  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7439  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7440  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7441  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7442  }
7443 #endif
7444 
7445 #if KMP_STATS_ENABLED
7446  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7447  if (previous_state == stats_state_e::TEAMS_REGION) {
7448  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7449  } else {
7450  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7451  }
7452  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7453 #endif
7454 
7455  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7456  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7457 #if OMPT_SUPPORT
7458  ,
7459  exit_frame_p
7460 #endif
7461  );
7462 #if OMPT_SUPPORT
7463  *exit_frame_p = NULL;
7464  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7465 #endif
7466 
7467 #if KMP_STATS_ENABLED
7468  if (previous_state == stats_state_e::TEAMS_REGION) {
7469  KMP_SET_THREAD_STATE(previous_state);
7470  }
7471  KMP_POP_PARTITIONED_TIMER();
7472 #endif
7473 
7474 #if USE_ITT_BUILD
7475  if (__itt_stack_caller_create_ptr) {
7476  // inform ittnotify about leaving user's code
7477  if (team->t.t_stack_id != NULL) {
7478  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7479  } else {
7480  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7481  __kmp_itt_stack_callee_leave(
7482  (__itt_caller)team->t.t_parent->t.t_stack_id);
7483  }
7484  }
7485 #endif /* USE_ITT_BUILD */
7486  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7487 
7488  return rc;
7489 }
7490 
7491 void __kmp_teams_master(int gtid) {
7492  // This routine is called by all primary threads in a teams construct
7493  kmp_info_t *thr = __kmp_threads[gtid];
7494  kmp_team_t *team = thr->th.th_team;
7495  ident_t *loc = team->t.t_ident;
7496  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7497  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7498  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7499  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7500  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7501 
7502  // This thread is a new CG root. Set up the proper variables.
7503  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7504  tmp->cg_root = thr; // Make thr the CG root
7505  // Init to thread limit stored when league primary threads were forked
7506  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7507  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7508  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7509  " cg_nthreads to 1\n",
7510  thr, tmp));
7511  tmp->up = thr->th.th_cg_roots;
7512  thr->th.th_cg_roots = tmp;
7513 
7514 // Launch the league of teams now, but do not let workers execute
7515 // (they hang on the fork barrier until the next parallel region)
7516 #if INCLUDE_SSC_MARKS
7517  SSC_MARK_FORKING();
7518 #endif
7519  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7520  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7521  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7522 #if INCLUDE_SSC_MARKS
7523  SSC_MARK_JOINING();
7524 #endif
7525  // If the team size was reduced from the limit, set it to the new size
7526  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7527  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7528  // AC: last parameter "1" eliminates join barrier which won't work because
7529  // worker threads are in a fork barrier waiting for more parallel regions
7530  __kmp_join_call(loc, gtid
7531 #if OMPT_SUPPORT
7532  ,
7533  fork_context_intel
7534 #endif
7535  ,
7536  1);
7537 }
7538 
7539 int __kmp_invoke_teams_master(int gtid) {
7540  kmp_info_t *this_thr = __kmp_threads[gtid];
7541  kmp_team_t *team = this_thr->th.th_team;
7542 #if KMP_DEBUG
7543  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7544  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7545  (void *)__kmp_teams_master);
7546 #endif
7547  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7548 #if OMPT_SUPPORT
7549  int tid = __kmp_tid_from_gtid(gtid);
7550  ompt_data_t *task_data =
7551  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7552  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7553  if (ompt_enabled.ompt_callback_implicit_task) {
7554  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7555  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7556  ompt_task_initial);
7557  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7558  }
7559 #endif
7560  __kmp_teams_master(gtid);
7561 #if OMPT_SUPPORT
7562  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7563 #endif
7564  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7565  return 1;
7566 }
7567 
7568 /* This sets the requested number of threads for the next parallel region
7569  encountered by this team. Since this should be enclosed in the forkjoin
7570  critical section, it should avoid race conditions with asymmetrical nested
7571  parallelism. */
7572 
7573 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7574  kmp_info_t *thr = __kmp_threads[gtid];
7575 
7576  if (num_threads > 0)
7577  thr->th.th_set_nproc = num_threads;
7578 }
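// For illustration (hypothetical call site, not emitted verbatim by any
// particular compiler): a num_threads(4) clause on a parallel construct is
// expected to reach __kmp_push_num_threads before the fork, conceptually as
//   __kmp_push_num_threads(loc, gtid, 4);
// so that the subsequent fork picks up th_set_nproc as the requested size.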
7579 
7580 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7581  int num_threads) {
7582  KMP_DEBUG_ASSERT(thr);
7583  // Remember the number of threads for inner parallel regions
7584  if (!TCR_4(__kmp_init_middle))
7585  __kmp_middle_initialize(); // get internal globals calculated
7586  __kmp_assign_root_init_mask();
7587  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7588  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7589 
7590  if (num_threads == 0) {
7591  if (__kmp_teams_thread_limit > 0) {
7592  num_threads = __kmp_teams_thread_limit;
7593  } else {
7594  num_threads = __kmp_avail_proc / num_teams;
7595  }
7596  // adjust num_threads w/o warning as it is not a user setting
7597  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7598  // no thread_limit clause specified - do not change thread-limit-var ICV
7599  if (num_threads > __kmp_dflt_team_nth) {
7600  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7601  }
7602  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7603  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7604  } // prevent the team size from exceeding thread-limit-var
7605  if (num_teams * num_threads > __kmp_teams_max_nth) {
7606  num_threads = __kmp_teams_max_nth / num_teams;
7607  }
7608  if (num_threads == 0) {
7609  num_threads = 1;
7610  }
7611  } else {
7612  // This thread will be the primary thread of the league's primary threads
7613  // Store new thread limit; old limit is saved in th_cg_roots list
7614  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7615  // num_threads = min(num_threads, nthreads-var)
7616  if (num_threads > __kmp_dflt_team_nth) {
7617  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7618  }
7619  if (num_teams * num_threads > __kmp_teams_max_nth) {
7620  int new_threads = __kmp_teams_max_nth / num_teams;
7621  if (new_threads == 0) {
7622  new_threads = 1;
7623  }
7624  if (new_threads != num_threads) {
7625  if (!__kmp_reserve_warn) { // user asked for too many threads
7626  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7627  __kmp_msg(kmp_ms_warning,
7628  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7629  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7630  }
7631  }
7632  num_threads = new_threads;
7633  }
7634  }
7635  thr->th.th_teams_size.nth = num_threads;
7636 }
7637 
7638 /* this sets the requested number of teams for the teams region and/or
7639  the number of threads for the next parallel region encountered */
7640 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7641  int num_threads) {
7642  kmp_info_t *thr = __kmp_threads[gtid];
7643  KMP_DEBUG_ASSERT(num_teams >= 0);
7644  KMP_DEBUG_ASSERT(num_threads >= 0);
7645 
7646  if (num_teams == 0) {
7647  if (__kmp_nteams > 0) {
7648  num_teams = __kmp_nteams;
7649  } else {
7650  num_teams = 1; // default number of teams is 1.
7651  }
7652  }
7653  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7654  if (!__kmp_reserve_warn) {
7655  __kmp_reserve_warn = 1;
7656  __kmp_msg(kmp_ms_warning,
7657  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7658  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7659  }
7660  num_teams = __kmp_teams_max_nth;
7661  }
7662  // Set number of teams (number of threads in the outer "parallel" of the
7663  // teams)
7664  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7665 
7666  __kmp_push_thread_limit(thr, num_teams, num_threads);
7667 }
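Illustrative user-level example (not part of kmp_runtime.cpp): the num_teams and thread_limit clauses on a teams construct are what normally drive this push. A hedged sketch assuming an OpenMP 5.0+ compiler:

#include <omp.h>
#include <stdio.h>

int main(void) {
  // Ask for 4 teams, each limited to at most 8 threads; the runtime may still
  // clamp both values against its own limits, as the code above shows.
  #pragma omp teams num_teams(4) thread_limit(8)
  {
    if (omp_get_team_num() == 0)
      printf("teams granted: %d\n", omp_get_num_teams());
  }
  return 0;
}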
7668 
7669 /* This sets the requested number of teams for the teams region and/or
7670  the number of threads for the next parallel region encountered */
7671 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7672  int num_teams_ub, int num_threads) {
7673  kmp_info_t *thr = __kmp_threads[gtid];
7674  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7675  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7676  KMP_DEBUG_ASSERT(num_threads >= 0);
7677 
7678  if (num_teams_lb > num_teams_ub) {
7679  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7680  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7681  }
7682 
7683  int num_teams = 1; // default number of teams is 1.
7684 
7685  if (num_teams_lb == 0 && num_teams_ub > 0)
7686  num_teams_lb = num_teams_ub;
7687 
7688  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7689  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7690  if (num_teams > __kmp_teams_max_nth) {
7691  if (!__kmp_reserve_warn) {
7692  __kmp_reserve_warn = 1;
7693  __kmp_msg(kmp_ms_warning,
7694  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7695  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7696  }
7697  num_teams = __kmp_teams_max_nth;
7698  }
7699  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7700  num_teams = num_teams_ub;
7701  } else { // num_teams_lb <= num_teams <= num_teams_ub
7702  if (num_threads == 0) {
7703  if (num_teams_ub > __kmp_teams_max_nth) {
7704  num_teams = num_teams_lb;
7705  } else {
7706  num_teams = num_teams_ub;
7707  }
7708  } else {
7709  num_teams = (num_threads > __kmp_teams_max_nth)
7710  ? num_teams
7711  : __kmp_teams_max_nth / num_threads;
7712  if (num_teams < num_teams_lb) {
7713  num_teams = num_teams_lb;
7714  } else if (num_teams > num_teams_ub) {
7715  num_teams = num_teams_ub;
7716  }
7717  }
7718  }
7719  // Set number of teams (number of threads in the outer "parallel" of the
7720  // teams)
7721  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7722 
7723  __kmp_push_thread_limit(thr, num_teams, num_threads);
7724 }
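Illustrative user-level example (not part of kmp_runtime.cpp): the _51 entry point corresponds to the OpenMP 5.1 form of the clause, where num_teams carries a lower and an upper bound. A sketch assuming a compiler with OpenMP 5.1 support:

#include <omp.h>
#include <stdio.h>

int main(void) {
  // OpenMP 5.1: request between 2 and 8 teams; the runtime picks a value in
  // [2, 8] subject to its own limits, as the selection logic above shows.
  #pragma omp teams num_teams(2:8)
  {
    if (omp_get_team_num() == 0)
      printf("teams chosen: %d\n", omp_get_num_teams());
  }
  return 0;
}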
7725 
7726 // Set the proc_bind var to use in the following parallel region.
7727 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7728  kmp_info_t *thr = __kmp_threads[gtid];
7729  thr->th.th_set_proc_bind = proc_bind;
7730 }
7731 
7732 /* Launch the worker threads into the microtask. */
7733 
7734 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7735  kmp_info_t *this_thr = __kmp_threads[gtid];
7736 
7737 #ifdef KMP_DEBUG
7738  int f;
7739 #endif /* KMP_DEBUG */
7740 
7741  KMP_DEBUG_ASSERT(team);
7742  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7743  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7744  KMP_MB(); /* Flush all pending memory write invalidates. */
7745 
7746  team->t.t_construct = 0; /* no single directives seen yet */
7747  team->t.t_ordered.dt.t_value =
7748  0; /* thread 0 enters the ordered section first */
7749 
7750  /* Reset the identifiers on the dispatch buffer */
7751  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7752  if (team->t.t_max_nproc > 1) {
7753  int i;
7754  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7755  team->t.t_disp_buffer[i].buffer_index = i;
7756  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7757  }
7758  } else {
7759  team->t.t_disp_buffer[0].buffer_index = 0;
7760  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7761  }
7762 
7763  KMP_MB(); /* Flush all pending memory write invalidates. */
7764  KMP_ASSERT(this_thr->th.th_team == team);
7765 
7766 #ifdef KMP_DEBUG
7767  for (f = 0; f < team->t.t_nproc; f++) {
7768  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7769  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7770  }
7771 #endif /* KMP_DEBUG */
7772 
7773  /* release the worker threads so they may begin working */
7774  __kmp_fork_barrier(gtid, 0);
7775 }
7776 
7777 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7778  kmp_info_t *this_thr = __kmp_threads[gtid];
7779 
7780  KMP_DEBUG_ASSERT(team);
7781  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7782  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7783  KMP_MB(); /* Flush all pending memory write invalidates. */
7784 
7785  /* Join barrier after fork */
7786 
7787 #ifdef KMP_DEBUG
7788  if (__kmp_threads[gtid] &&
7789  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7790  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7791  __kmp_threads[gtid]);
7792  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7793  "team->t.t_nproc=%d\n",
7794  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7795  team->t.t_nproc);
7796  __kmp_print_structure();
7797  }
7798  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7799  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7800 #endif /* KMP_DEBUG */
7801 
7802  __kmp_join_barrier(gtid); /* wait for everyone */
7803 #if OMPT_SUPPORT
7804  if (ompt_enabled.enabled &&
7805  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7806  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7807  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7808  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7809 #if OMPT_OPTIONAL
7810  void *codeptr = NULL;
7811  if (KMP_MASTER_TID(ds_tid) &&
7812  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7813  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7814  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7815 
7816  if (ompt_enabled.ompt_callback_sync_region_wait) {
7817  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7818  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7819  codeptr);
7820  }
7821  if (ompt_enabled.ompt_callback_sync_region) {
7822  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7823  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7824  codeptr);
7825  }
7826 #endif
7827  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7828  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7829  ompt_scope_end, NULL, task_data, 0, ds_tid,
7830  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7831  }
7832  }
7833 #endif
7834 
7835  KMP_MB(); /* Flush all pending memory write invalidates. */
7836  KMP_ASSERT(this_thr->th.th_team == team);
7837 }
7838 
7839 /* ------------------------------------------------------------------------ */
7840 
7841 #ifdef USE_LOAD_BALANCE
7842 
7843 // Return the worker threads actively spinning in the hot team, if we
7844 // are at the outermost level of parallelism. Otherwise, return 0.
7845 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7846  int i;
7847  int retval;
7848  kmp_team_t *hot_team;
7849 
7850  if (root->r.r_active) {
7851  return 0;
7852  }
7853  hot_team = root->r.r_hot_team;
7854  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7855  return hot_team->t.t_nproc - 1; // Don't count primary thread
7856  }
7857 
7858  // Skip the primary thread - it is accounted for elsewhere.
7859  retval = 0;
7860  for (i = 1; i < hot_team->t.t_nproc; i++) {
7861  if (hot_team->t.t_threads[i]->th.th_active) {
7862  retval++;
7863  }
7864  }
7865  return retval;
7866 }
7867 
7868 // Perform an automatic adjustment to the number of
7869 // threads used by the next parallel region.
7870 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7871  int retval;
7872  int pool_active;
7873  int hot_team_active;
7874  int team_curr_active;
7875  int system_active;
7876 
7877  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7878  set_nproc));
7879  KMP_DEBUG_ASSERT(root);
7880  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7881  ->th.th_current_task->td_icvs.dynamic == TRUE);
7882  KMP_DEBUG_ASSERT(set_nproc > 1);
7883 
7884  if (set_nproc == 1) {
7885  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7886  return 1;
7887  }
7888 
7889  // Threads that are active in the thread pool, active in the hot team for this
7890  // particular root (if we are at the outer par level), and the currently
7891  // executing thread (to become the primary thread) are available to add to the
7892  // new team, but are currently contributing to the system load, and must be
7893  // accounted for.
7894  pool_active = __kmp_thread_pool_active_nth;
7895  hot_team_active = __kmp_active_hot_team_nproc(root);
7896  team_curr_active = pool_active + hot_team_active + 1;
7897 
7898  // Check the system load.
7899  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7900  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7901  "hot team active = %d\n",
7902  system_active, pool_active, hot_team_active));
7903 
7904  if (system_active < 0) {
7905  // There was an error reading the necessary info from /proc, so use the
7906  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7907  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7908  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7909  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7910 
7911  // Make this call behave like the thread limit algorithm.
7912  retval = __kmp_avail_proc - __kmp_nth +
7913  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7914  if (retval > set_nproc) {
7915  retval = set_nproc;
7916  }
7917  if (retval < KMP_MIN_NTH) {
7918  retval = KMP_MIN_NTH;
7919  }
7920 
7921  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7922  retval));
7923  return retval;
7924  }
7925 
7926  // There is a slight delay in the load balance algorithm in detecting new
7927  // running procs. The real system load at this instant should be at least as
7928  // large as the number of active OMP threads available to add to the team.
7929  if (system_active < team_curr_active) {
7930  system_active = team_curr_active;
7931  }
7932  retval = __kmp_avail_proc - system_active + team_curr_active;
7933  if (retval > set_nproc) {
7934  retval = set_nproc;
7935  }
7936  if (retval < KMP_MIN_NTH) {
7937  retval = KMP_MIN_NTH;
7938  }
7939 
7940  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7941  return retval;
7942 } // __kmp_load_balance_nproc()
7943 
7944 #endif /* USE_LOAD_BALANCE */
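Illustrative user-level example (not part of kmp_runtime.cpp): the load-balance path above only applies when dynamic adjustment of team sizes is enabled (dyn-var is true), for example via OMP_DYNAMIC, omp_set_dynamic(), or this runtime's KMP_DYNAMIC_MODE setting. A minimal sketch:

#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_dynamic(1); // allow the runtime to shrink teams under system load
  #pragma omp parallel
  {
    #pragma omp single
    printf("threads actually granted: %d\n", omp_get_num_threads());
  }
  return 0;
}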
7945 
7946 /* ------------------------------------------------------------------------ */
7947 
7948 /* NOTE: this is called with the __kmp_init_lock held */
7949 void __kmp_cleanup(void) {
7950  int f;
7951 
7952  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7953 
7954  if (TCR_4(__kmp_init_parallel)) {
7955 #if KMP_HANDLE_SIGNALS
7956  __kmp_remove_signals();
7957 #endif
7958  TCW_4(__kmp_init_parallel, FALSE);
7959  }
7960 
7961  if (TCR_4(__kmp_init_middle)) {
7962 #if KMP_AFFINITY_SUPPORTED
7963  __kmp_affinity_uninitialize();
7964 #endif /* KMP_AFFINITY_SUPPORTED */
7965  __kmp_cleanup_hierarchy();
7966  TCW_4(__kmp_init_middle, FALSE);
7967  }
7968 
7969  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7970 
7971  if (__kmp_init_serial) {
7972  __kmp_runtime_destroy();
7973  __kmp_init_serial = FALSE;
7974  }
7975 
7976  __kmp_cleanup_threadprivate_caches();
7977 
7978  for (f = 0; f < __kmp_threads_capacity; f++) {
7979  if (__kmp_root[f] != NULL) {
7980  __kmp_free(__kmp_root[f]);
7981  __kmp_root[f] = NULL;
7982  }
7983  }
7984  __kmp_free(__kmp_threads);
7985  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7986  // there is no need to free __kmp_root separately.
7987  __kmp_threads = NULL;
7988  __kmp_root = NULL;
7989  __kmp_threads_capacity = 0;
7990 
7991 #if KMP_USE_DYNAMIC_LOCK
7992  __kmp_cleanup_indirect_user_locks();
7993 #else
7994  __kmp_cleanup_user_locks();
7995 #endif
7996 #if OMPD_SUPPORT
7997  if (ompd_state) {
7998  __kmp_free(ompd_env_block);
7999  ompd_env_block = NULL;
8000  ompd_env_block_size = 0;
8001  }
8002 #endif
8003 
8004 #if KMP_AFFINITY_SUPPORTED
8005  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8006  __kmp_cpuinfo_file = NULL;
8007 #endif /* KMP_AFFINITY_SUPPORTED */
8008 
8009 #if KMP_USE_ADAPTIVE_LOCKS
8010 #if KMP_DEBUG_ADAPTIVE_LOCKS
8011  __kmp_print_speculative_stats();
8012 #endif
8013 #endif
8014  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8015  __kmp_nested_nth.nth = NULL;
8016  __kmp_nested_nth.size = 0;
8017  __kmp_nested_nth.used = 0;
8018  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8019  __kmp_nested_proc_bind.bind_types = NULL;
8020  __kmp_nested_proc_bind.size = 0;
8021  __kmp_nested_proc_bind.used = 0;
8022  if (__kmp_affinity_format) {
8023  KMP_INTERNAL_FREE(__kmp_affinity_format);
8024  __kmp_affinity_format = NULL;
8025  }
8026 
8027  __kmp_i18n_catclose();
8028 
8029 #if KMP_USE_HIER_SCHED
8030  __kmp_hier_scheds.deallocate();
8031 #endif
8032 
8033 #if KMP_STATS_ENABLED
8034  __kmp_stats_fini();
8035 #endif
8036 
8037  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8038 }
8039 
8040 /* ------------------------------------------------------------------------ */
8041 
8042 int __kmp_ignore_mppbeg(void) {
8043  char *env;
8044 
8045  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8046  if (__kmp_str_match_false(env))
8047  return FALSE;
8048  }
8049  // By default __kmpc_begin() is a no-op.
8050  return TRUE;
8051 }
8052 
8053 int __kmp_ignore_mppend(void) {
8054  char *env;
8055 
8056  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8057  if (__kmp_str_match_false(env))
8058  return FALSE;
8059  }
8060  // By default __kmpc_end() is a no-op.
8061  return TRUE;
8062 }
8063 
8064 void __kmp_internal_begin(void) {
8065  int gtid;
8066  kmp_root_t *root;
8067 
8068  /* this is a very important step as it will register new sibling threads
8069  and assign these new uber threads a new gtid */
8070  gtid = __kmp_entry_gtid();
8071  root = __kmp_threads[gtid]->th.th_root;
8072  KMP_ASSERT(KMP_UBER_GTID(gtid));
8073 
8074  if (root->r.r_begin)
8075  return;
8076  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8077  if (root->r.r_begin) {
8078  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8079  return;
8080  }
8081 
8082  root->r.r_begin = TRUE;
8083 
8084  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8085 }
8086 
8087 /* ------------------------------------------------------------------------ */
8088 
8089 void __kmp_user_set_library(enum library_type arg) {
8090  int gtid;
8091  kmp_root_t *root;
8092  kmp_info_t *thread;
8093 
8094  /* first, make sure we are initialized so we can get our gtid */
8095 
8096  gtid = __kmp_entry_gtid();
8097  thread = __kmp_threads[gtid];
8098 
8099  root = thread->th.th_root;
8100 
8101  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8102  library_serial));
8103  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8104  thread */
8105  KMP_WARNING(SetLibraryIncorrectCall);
8106  return;
8107  }
8108 
8109  switch (arg) {
8110  case library_serial:
8111  thread->th.th_set_nproc = 0;
8112  set__nproc(thread, 1);
8113  break;
8114  case library_turnaround:
8115  thread->th.th_set_nproc = 0;
8116  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8117  : __kmp_dflt_team_nth_ub);
8118  break;
8119  case library_throughput:
8120  thread->th.th_set_nproc = 0;
8121  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8122  : __kmp_dflt_team_nth_ub);
8123  break;
8124  default:
8125  KMP_FATAL(UnknownLibraryType, arg);
8126  }
8127 
8128  __kmp_aux_set_library(arg);
8129 }
8130 
8131 void __kmp_aux_set_stacksize(size_t arg) {
8132  if (!__kmp_init_serial)
8133  __kmp_serial_initialize();
8134 
8135 #if KMP_OS_DARWIN
8136  if (arg & (0x1000 - 1)) {
8137  arg &= ~(0x1000 - 1);
8138  if (arg + 0x1000) /* check for overflow if we round up */
8139  arg += 0x1000;
8140  }
8141 #endif
8142  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8143 
8144  /* only change the default stacksize before the first parallel region */
8145  if (!TCR_4(__kmp_init_parallel)) {
8146  size_t value = arg; /* argument is in bytes */
8147 
8148  if (value < __kmp_sys_min_stksize)
8149  value = __kmp_sys_min_stksize;
8150  else if (value > KMP_MAX_STKSIZE)
8151  value = KMP_MAX_STKSIZE;
8152 
8153  __kmp_stksize = value;
8154 
8155  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8156  }
8157 
8158  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8159 }
8160 
8161 /* set the behaviour of the runtime library */
8162 /* TODO this can cause some odd behaviour with sibling parallelism... */
8163 void __kmp_aux_set_library(enum library_type arg) {
8164  __kmp_library = arg;
8165 
8166  switch (__kmp_library) {
8167  case library_serial: {
8168  KMP_INFORM(LibraryIsSerial);
8169  } break;
8170  case library_turnaround:
8171  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8172  __kmp_use_yield = 2; // only yield when oversubscribed
8173  break;
8174  case library_throughput:
8175  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8176  __kmp_dflt_blocktime = 200;
8177  break;
8178  default:
8179  KMP_FATAL(UnknownLibraryType, arg);
8180  }
8181 }
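Illustrative user-level example (not part of kmp_runtime.cpp): these library modes are normally selected through the KMP_LIBRARY environment variable (serial, turnaround, throughput). A hedged sketch, assuming the kmp_* extension entry points that this runtime's omp.h is expected to declare:

#include <omp.h>
#include <stdio.h>

int main(void) {
  // Intended to be equivalent in spirit to KMP_LIBRARY=throughput: workers
  // yield/sleep after the blocktime expires instead of spinning indefinitely.
  kmp_set_library_throughput();
  #pragma omp parallel
  {
    #pragma omp single
    printf("running with %d threads\n", omp_get_num_threads());
  }
  return 0;
}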
8182 
8183 /* Getting team information common for all team API */
8184 // Returns NULL if not in teams construct
8185 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8186  kmp_info_t *thr = __kmp_entry_thread();
8187  teams_serialized = 0;
8188  if (thr->th.th_teams_microtask) {
8189  kmp_team_t *team = thr->th.th_team;
8190  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8191  int ii = team->t.t_level;
8192  teams_serialized = team->t.t_serialized;
8193  int level = tlevel + 1;
8194  KMP_DEBUG_ASSERT(ii >= tlevel);
8195  while (ii > level) {
8196  for (teams_serialized = team->t.t_serialized;
8197  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8198  }
8199  if (team->t.t_serialized && (!teams_serialized)) {
8200  team = team->t.t_parent;
8201  continue;
8202  }
8203  if (ii > level) {
8204  team = team->t.t_parent;
8205  ii--;
8206  }
8207  }
8208  return team;
8209  }
8210  return NULL;
8211 }
8212 
8213 int __kmp_aux_get_team_num() {
8214  int serialized;
8215  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8216  if (team) {
8217  if (serialized > 1) {
8218  return 0; // teams region is serialized ( 1 team of 1 thread ).
8219  } else {
8220  return team->t.t_master_tid;
8221  }
8222  }
8223  return 0;
8224 }
8225 
8226 int __kmp_aux_get_num_teams() {
8227  int serialized;
8228  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8229  if (team) {
8230  if (serialized > 1) {
8231  return 1;
8232  } else {
8233  return team->t.t_parent->t.t_nproc;
8234  }
8235  }
8236  return 1;
8237 }
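Illustrative user-level example (not part of kmp_runtime.cpp): these two helpers serve the standard omp_get_team_num() and omp_get_num_teams() queries. A short sketch:

#include <omp.h>
#include <stdio.h>

int main(void) {
  // Each team's initial thread prints its team number within the league.
  #pragma omp teams num_teams(2)
  printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
  return 0;
}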
8238 
8239 /* ------------------------------------------------------------------------ */
8240 
8241 /*
8242  * Affinity Format Parser
8243  *
8244  * Field is of the form: %[[[0].]size]type
8245  * % and type are required (%% means print a literal '%')
8246  * type is either single char or long name surrounded by {},
8247  * e.g., N or {num_threads}
8248  * 0 => leading zeros
8249  * . => right justified when size is specified
8250  * by default output is left justified
8251  * size is the *minimum* field length
8252  * All other characters are printed as is
8253  *
8254  * Available field types:
8255  * L {nesting_level} - omp_get_level()
8256  * n {thread_num} - omp_get_thread_num()
8257  * H {host} - name of host machine
8258  * P {process_id} - process id (integer)
8259  * i {native_thread_id} - native thread identifier (integer)
8260  * N {num_threads} - omp_get_num_threads()
8261  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8262  * A {thread_affinity} - comma separated list of integers or integer ranges
8263  * (values of affinity mask)
8264  *
8265  * Implementation-specific field types can be added
8266  * If a type is unknown, print "undefined"
8267  */
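Illustrative user-level example of this grammar (not part of kmp_runtime.cpp; the host name and pid in the comment are invented for illustration):

#include <omp.h>

int main(void) {
  // Standard OpenMP 5.0 affinity-format API; with this format a thread might
  // print something like:
  //   "OMP: host=node01 pid=12345 thread 002 bound to 4,5"
  omp_set_affinity_format("OMP: host=%H pid=%P thread %0.3n bound to %A");
  #pragma omp parallel
  omp_display_affinity(NULL); // NULL means: use affinity-format-var
  return 0;
}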
8268 
8269 // Structure holding the short name, long name, and corresponding data type
8270 // for snprintf. A table of these will represent the entire valid keyword
8271 // field types.
8272 typedef struct kmp_affinity_format_field_t {
8273  char short_name; // from spec e.g., L -> thread level
8274  const char *long_name; // from spec thread_level -> thread level
8275  char field_format; // data type for snprintf (typically 'd' or 's'
8276  // for integer or string)
8277 } kmp_affinity_format_field_t;
8278 
8279 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8280 #if KMP_AFFINITY_SUPPORTED
8281  {'A', "thread_affinity", 's'},
8282 #endif
8283  {'t', "team_num", 'd'},
8284  {'T', "num_teams", 'd'},
8285  {'L', "nesting_level", 'd'},
8286  {'n', "thread_num", 'd'},
8287  {'N', "num_threads", 'd'},
8288  {'a', "ancestor_tnum", 'd'},
8289  {'H', "host", 's'},
8290  {'P', "process_id", 'd'},
8291  {'i', "native_thread_id", 'd'}};
8292 
8293 // Return the number of characters it takes to hold field
8294 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8295  const char **ptr,
8296  kmp_str_buf_t *field_buffer) {
8297  int rc, format_index, field_value;
8298  const char *width_left, *width_right;
8299  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8300  static const int FORMAT_SIZE = 20;
8301  char format[FORMAT_SIZE] = {0};
8302  char absolute_short_name = 0;
8303 
8304  KMP_DEBUG_ASSERT(gtid >= 0);
8305  KMP_DEBUG_ASSERT(th);
8306  KMP_DEBUG_ASSERT(**ptr == '%');
8307  KMP_DEBUG_ASSERT(field_buffer);
8308 
8309  __kmp_str_buf_clear(field_buffer);
8310 
8311  // Skip the initial %
8312  (*ptr)++;
8313 
8314  // Check for %% first
8315  if (**ptr == '%') {
8316  __kmp_str_buf_cat(field_buffer, "%", 1);
8317  (*ptr)++; // skip over the second %
8318  return 1;
8319  }
8320 
8321  // Parse field modifiers if they are present
8322  pad_zeros = false;
8323  if (**ptr == '0') {
8324  pad_zeros = true;
8325  (*ptr)++; // skip over 0
8326  }
8327  right_justify = false;
8328  if (**ptr == '.') {
8329  right_justify = true;
8330  (*ptr)++; // skip over .
8331  }
8332  // Parse width of field: [width_left, width_right)
8333  width_left = width_right = NULL;
8334  if (**ptr >= '0' && **ptr <= '9') {
8335  width_left = *ptr;
8336  SKIP_DIGITS(*ptr);
8337  width_right = *ptr;
8338  }
8339 
8340  // Create the format for KMP_SNPRINTF based on flags parsed above
8341  format_index = 0;
8342  format[format_index++] = '%';
8343  if (!right_justify)
8344  format[format_index++] = '-';
8345  if (pad_zeros)
8346  format[format_index++] = '0';
8347  if (width_left && width_right) {
8348  int i = 0;
8349  // Only allow 8 digit number widths.
8350  // This also prevents overflowing format variable
8351  while (i < 8 && width_left < width_right) {
8352  format[format_index++] = *width_left;
8353  width_left++;
8354  i++;
8355  }
8356  }
8357 
8358  // Parse a name (long or short)
8359  // Canonicalize the name into absolute_short_name
8360  found_valid_name = false;
8361  parse_long_name = (**ptr == '{');
8362  if (parse_long_name)
8363  (*ptr)++; // skip initial left brace
8364  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8365  sizeof(__kmp_affinity_format_table[0]);
8366  ++i) {
8367  char short_name = __kmp_affinity_format_table[i].short_name;
8368  const char *long_name = __kmp_affinity_format_table[i].long_name;
8369  char field_format = __kmp_affinity_format_table[i].field_format;
8370  if (parse_long_name) {
8371  size_t length = KMP_STRLEN(long_name);
8372  if (strncmp(*ptr, long_name, length) == 0) {
8373  found_valid_name = true;
8374  (*ptr) += length; // skip the long name
8375  }
8376  } else if (**ptr == short_name) {
8377  found_valid_name = true;
8378  (*ptr)++; // skip the short name
8379  }
8380  if (found_valid_name) {
8381  format[format_index++] = field_format;
8382  format[format_index++] = '\0';
8383  absolute_short_name = short_name;
8384  break;
8385  }
8386  }
8387  if (parse_long_name) {
8388  if (**ptr != '}') {
8389  absolute_short_name = 0;
8390  } else {
8391  (*ptr)++; // skip over the right brace
8392  }
8393  }
8394 
8395  // Attempt to fill the buffer with the requested
8396  // value using snprintf within __kmp_str_buf_print()
8397  switch (absolute_short_name) {
8398  case 't':
8399  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8400  break;
8401  case 'T':
8402  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8403  break;
8404  case 'L':
8405  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8406  break;
8407  case 'n':
8408  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8409  break;
8410  case 'H': {
8411  static const int BUFFER_SIZE = 256;
8412  char buf[BUFFER_SIZE];
8413  __kmp_expand_host_name(buf, BUFFER_SIZE);
8414  rc = __kmp_str_buf_print(field_buffer, format, buf);
8415  } break;
8416  case 'P':
8417  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8418  break;
8419  case 'i':
8420  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8421  break;
8422  case 'N':
8423  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8424  break;
8425  case 'a':
8426  field_value =
8427  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8428  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8429  break;
8430 #if KMP_AFFINITY_SUPPORTED
8431  case 'A': {
8432  kmp_str_buf_t buf;
8433  __kmp_str_buf_init(&buf);
8434  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8435  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8436  __kmp_str_buf_free(&buf);
8437  } break;
8438 #endif
8439  default:
8440  // According to the spec, if an implementation does not have info for a field
8441  // type, then "undefined" is printed
8442  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8443  // Skip the field
8444  if (parse_long_name) {
8445  SKIP_TOKEN(*ptr);
8446  if (**ptr == '}')
8447  (*ptr)++;
8448  } else {
8449  (*ptr)++;
8450  }
8451  }
8452 
8453  KMP_ASSERT(format_index <= FORMAT_SIZE);
8454  return rc;
8455 }
8456 
8457 /*
8458  * Return number of characters needed to hold the affinity string
8459  * (not including null byte character)
8460  * The resultant string is printed to buffer, which the caller can then
8461  * handle afterwards
8462  */
8463 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8464  kmp_str_buf_t *buffer) {
8465  const char *parse_ptr;
8466  size_t retval;
8467  const kmp_info_t *th;
8468  kmp_str_buf_t field;
8469 
8470  KMP_DEBUG_ASSERT(buffer);
8471  KMP_DEBUG_ASSERT(gtid >= 0);
8472 
8473  __kmp_str_buf_init(&field);
8474  __kmp_str_buf_clear(buffer);
8475 
8476  th = __kmp_threads[gtid];
8477  retval = 0;
8478 
8479  // If format is NULL or zero-length string, then we use
8480  // affinity-format-var ICV
8481  parse_ptr = format;
8482  if (parse_ptr == NULL || *parse_ptr == '\0') {
8483  parse_ptr = __kmp_affinity_format;
8484  }
8485  KMP_DEBUG_ASSERT(parse_ptr);
8486 
8487  while (*parse_ptr != '\0') {
8488  // Parse a field
8489  if (*parse_ptr == '%') {
8490  // Put field in the buffer
8491  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8492  __kmp_str_buf_catbuf(buffer, &field);
8493  retval += rc;
8494  } else {
8495  // Put literal character in buffer
8496  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8497  retval++;
8498  parse_ptr++;
8499  }
8500  }
8501  __kmp_str_buf_free(&field);
8502  return retval;
8503 }
8504 
8505 // Displays the affinity string to stdout
8506 void __kmp_aux_display_affinity(int gtid, const char *format) {
8507  kmp_str_buf_t buf;
8508  __kmp_str_buf_init(&buf);
8509  __kmp_aux_capture_affinity(gtid, format, &buf);
8510  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8511  __kmp_str_buf_free(&buf);
8512 }
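Illustrative user-level example (not part of kmp_runtime.cpp): these helpers sit behind the standard omp_capture_affinity() and omp_display_affinity() entry points. A sketch of the capture variant:

#include <omp.h>
#include <stdio.h>

int main(void) {
  #pragma omp parallel
  {
    char buf[256];
    // Returns the number of characters needed (excluding the terminating
    // null), mirroring __kmp_aux_capture_affinity above.
    size_t n = omp_capture_affinity(buf, sizeof(buf), "tid %n of %N on %H");
    #pragma omp critical
    printf("(%zu chars) %s\n", n, buf);
  }
  return 0;
}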
8513 
8514 /* ------------------------------------------------------------------------ */
8515 
8516 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8517  int blocktime = arg; /* argument is in milliseconds */
8518 #if KMP_USE_MONITOR
8519  int bt_intervals;
8520 #endif
8521  kmp_int8 bt_set;
8522 
8523  __kmp_save_internal_controls(thread);
8524 
8525  /* Normalize and set blocktime for the teams */
8526  if (blocktime < KMP_MIN_BLOCKTIME)
8527  blocktime = KMP_MIN_BLOCKTIME;
8528  else if (blocktime > KMP_MAX_BLOCKTIME)
8529  blocktime = KMP_MAX_BLOCKTIME;
8530 
8531  set__blocktime_team(thread->th.th_team, tid, blocktime);
8532  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8533 
8534 #if KMP_USE_MONITOR
8535  /* Calculate and set blocktime intervals for the teams */
8536  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8537 
8538  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8539  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8540 #endif
8541 
8542  /* Set whether blocktime has been set to "TRUE" */
8543  bt_set = TRUE;
8544 
8545  set__bt_set_team(thread->th.th_team, tid, bt_set);
8546  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8547 #if KMP_USE_MONITOR
8548  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8549  "bt_intervals=%d, monitor_updates=%d\n",
8550  __kmp_gtid_from_tid(tid, thread->th.th_team),
8551  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8552  __kmp_monitor_wakeups));
8553 #else
8554  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8555  __kmp_gtid_from_tid(tid, thread->th.th_team),
8556  thread->th.th_team->t.t_id, tid, blocktime));
8557 #endif
8558 }
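Illustrative user-level example (not part of kmp_runtime.cpp): blocktime is normally controlled with the KMP_BLOCKTIME environment variable or the kmp_set_blocktime() extension. A minimal sketch, assuming the kmp_* extension entry points that this runtime's omp.h is expected to declare:

#include <omp.h>
#include <stdio.h>

int main(void) {
  // 0 ms: workers go to sleep right after a parallel region instead of
  // spin-waiting; intended to match KMP_BLOCKTIME=0 in spirit.
  kmp_set_blocktime(0);
  #pragma omp parallel
  {
    #pragma omp single
    printf("blocktime now %d ms\n", kmp_get_blocktime());
  }
  return 0;
}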
8559 
8560 void __kmp_aux_set_defaults(char const *str, size_t len) {
8561  if (!__kmp_init_serial) {
8562  __kmp_serial_initialize();
8563  }
8564  __kmp_env_initialize(str);
8565 
8566  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8567  __kmp_env_print();
8568  }
8569 } // __kmp_aux_set_defaults
8570 
8571 /* ------------------------------------------------------------------------ */
8572 /* internal fast reduction routines */
8573 
8574 PACKED_REDUCTION_METHOD_T
8575 __kmp_determine_reduction_method(
8576  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8577  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8578  kmp_critical_name *lck) {
8579 
8580  // Default reduction method: critical construct ( lck != NULL, like in current
8581  // PAROPT )
8582  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8583  // can be selected by RTL
8584  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8585  // can be selected by RTL
8586  // Finally, it is up to the OpenMP RTL to decide which method to select among
8587  // those generated by PAROPT.
8588 
8589  PACKED_REDUCTION_METHOD_T retval;
8590 
8591  int team_size;
8592 
8593  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8594  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8595 
8596 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8597  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8598 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8599 
8600  retval = critical_reduce_block;
8601 
8602  // another way of getting the team size (with one extra dynamic dereference) is slower
8603  team_size = __kmp_get_team_num_threads(global_tid);
8604  if (team_size == 1) {
8605 
8606  retval = empty_reduce_block;
8607 
8608  } else {
8609 
8610  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8611 
8612 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8613  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8614 
8615 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8616  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8617 
8618  int teamsize_cutoff = 4;
8619 
8620 #if KMP_MIC_SUPPORTED
8621  if (__kmp_mic_type != non_mic) {
8622  teamsize_cutoff = 8;
8623  }
8624 #endif
8625  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8626  if (tree_available) {
8627  if (team_size <= teamsize_cutoff) {
8628  if (atomic_available) {
8629  retval = atomic_reduce_block;
8630  }
8631  } else {
8632  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8633  }
8634  } else if (atomic_available) {
8635  retval = atomic_reduce_block;
8636  }
8637 #else
8638 #error "Unknown or unsupported OS"
8639 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8640  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8641 
8642 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8643 
8644 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8645 
8646  // basic tuning
8647 
8648  if (atomic_available) {
8649  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8650  retval = atomic_reduce_block;
8651  }
8652  } // otherwise: use critical section
8653 
8654 #elif KMP_OS_DARWIN
8655 
8656  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8657  if (atomic_available && (num_vars <= 3)) {
8658  retval = atomic_reduce_block;
8659  } else if (tree_available) {
8660  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8661  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8662  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8663  }
8664  } // otherwise: use critical section
8665 
8666 #else
8667 #error "Unknown or unsupported OS"
8668 #endif
8669 
8670 #else
8671 #error "Unknown or unsupported architecture"
8672 #endif
8673  }
8674 
8675  // KMP_FORCE_REDUCTION
8676 
8677  // If the team is serialized (team_size == 1), ignore the forced reduction
8678  // method and stay with the unsynchronized method (empty_reduce_block)
8679  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8680  team_size != 1) {
8681 
8682  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8683 
8684  int atomic_available, tree_available;
8685 
8686  switch ((forced_retval = __kmp_force_reduction_method)) {
8687  case critical_reduce_block:
8688  KMP_ASSERT(lck); // lck should be != 0
8689  break;
8690 
8691  case atomic_reduce_block:
8692  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8693  if (!atomic_available) {
8694  KMP_WARNING(RedMethodNotSupported, "atomic");
8695  forced_retval = critical_reduce_block;
8696  }
8697  break;
8698 
8699  case tree_reduce_block:
8700  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8701  if (!tree_available) {
8702  KMP_WARNING(RedMethodNotSupported, "tree");
8703  forced_retval = critical_reduce_block;
8704  } else {
8705 #if KMP_FAST_REDUCTION_BARRIER
8706  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8707 #endif
8708  }
8709  break;
8710 
8711  default:
8712  KMP_ASSERT(0); // "unsupported method specified"
8713  }
8714 
8715  retval = forced_retval;
8716  }
8717 
8718  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8719 
8720 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8721 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8722 
8723  return (retval);
8724 }
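Illustrative user-level example (not part of kmp_runtime.cpp): what ultimately reaches this selector is a reduction clause on a user construct; whether the atomic, tree, or critical path is taken depends on the flags the compiler passes in, as described above.

#include <omp.h>
#include <stdio.h>

int main(void) {
  double sum = 0.0;
  // The compiler typically emits __kmpc_reduce*/__kmpc_end_reduce* calls for
  // this clause; the runtime then applies the method chosen above.
  #pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000; ++i)
    sum += i * 0.5;
  printf("sum = %f\n", sum);
  return 0;
}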
8725 // this function is for testing set/get/determine reduce method
8726 kmp_int32 __kmp_get_reduce_method(void) {
8727  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8728 }
8729 
8730 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8731 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8732 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8733 
8734 // Hard pause shuts down the runtime completely. Resume happens naturally when
8735 // OpenMP is used subsequently.
8736 void __kmp_hard_pause() {
8737  __kmp_pause_status = kmp_hard_paused;
8738  __kmp_internal_end_thread(-1);
8739 }
8740 
8741 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8742 void __kmp_resume_if_soft_paused() {
8743  if (__kmp_pause_status == kmp_soft_paused) {
8744  __kmp_pause_status = kmp_not_paused;
8745 
8746  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8747  kmp_info_t *thread = __kmp_threads[gtid];
8748  if (thread) { // Wake it if sleeping
8749  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8750  thread);
8751  if (fl.is_sleeping())
8752  fl.resume(gtid);
8753  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8754  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8755  } else { // thread holds the lock and may sleep soon
8756  do { // until either the thread sleeps, or we can get the lock
8757  if (fl.is_sleeping()) {
8758  fl.resume(gtid);
8759  break;
8760  } else if (__kmp_try_suspend_mx(thread)) {
8761  __kmp_unlock_suspend_mx(thread);
8762  break;
8763  }
8764  } while (1);
8765  }
8766  }
8767  }
8768  }
8769 }
8770 
8771 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8772 // TODO: add warning messages
8773 int __kmp_pause_resource(kmp_pause_status_t level) {
8774  if (level == kmp_not_paused) { // requesting resume
8775  if (__kmp_pause_status == kmp_not_paused) {
8776  // error message about runtime not being paused, so can't resume
8777  return 1;
8778  } else {
8779  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8780  __kmp_pause_status == kmp_hard_paused);
8781  __kmp_pause_status = kmp_not_paused;
8782  return 0;
8783  }
8784  } else if (level == kmp_soft_paused) { // requesting soft pause
8785  if (__kmp_pause_status != kmp_not_paused) {
8786  // error message about already being paused
8787  return 1;
8788  } else {
8789  __kmp_soft_pause();
8790  return 0;
8791  }
8792  } else if (level == kmp_hard_paused) { // requesting hard pause
8793  if (__kmp_pause_status != kmp_not_paused) {
8794  // error message about already being paused
8795  return 1;
8796  } else {
8797  __kmp_hard_pause();
8798  return 0;
8799  }
8800  } else {
8801  // error message about invalid level
8802  return 1;
8803  }
8804 }
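Illustrative user-level example (not part of kmp_runtime.cpp): this is the backing for the OpenMP 5.0 pause API. A hedged sketch of requesting a soft pause:

#include <omp.h>
#include <stdio.h>

int main(void) {
  #pragma omp parallel
  { /* warm up the thread pool */ }

  // Ask the runtime to release resources on all devices; returns 0 on
  // success, non-zero otherwise (mirroring __kmp_pause_resource above).
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    printf("soft pause was not honored\n");

  // Subsequent OpenMP use resumes the runtime transparently.
  #pragma omp parallel
  { /* threads are woken or re-created as needed */ }
  return 0;
}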
8805 
8806 void __kmp_omp_display_env(int verbose) {
8807  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8808  if (__kmp_init_serial == 0)
8809  __kmp_do_serial_initialize();
8810  __kmp_display_env_impl(!verbose, verbose);
8811  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8812 }
8813 
8814 // The team size is changing, so distributed barrier must be modified
8815 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8816  int new_nthreads) {
8817  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8818  bp_dist_bar);
8819  kmp_info_t **other_threads = team->t.t_threads;
8820 
8821  // We want all the workers to stop waiting on the barrier while we adjust the
8822  // size of the team.
8823  for (int f = 1; f < old_nthreads; ++f) {
8824  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8825  // Ignore threads that are already inactive or not present in the team
8826  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8827  // teams construct causes thread_limit to get passed in, and some of
8828  // those could be inactive; just ignore them
8829  continue;
8830  }
8831  // If thread is transitioning still to in_use state, wait for it
8832  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8833  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8834  KMP_CPU_PAUSE();
8835  }
8836  // The thread should be in_use now
8837  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8838  // Transition to unused state
8839  team->t.t_threads[f]->th.th_used_in_team.store(2);
8840  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8841  }
8842  // Release all the workers
8843  kmp_uint64 new_value; // new value for go
8844  new_value = team->t.b->go_release();
8845 
8846  KMP_MFENCE();
8847 
8848  // Workers should see transition status 2 and move to 0; but may need to be
8849  // woken up first
8850  size_t my_go_index;
8851  int count = old_nthreads - 1;
8852  while (count > 0) {
8853  count = old_nthreads - 1;
8854  for (int f = 1; f < old_nthreads; ++f) {
8855  my_go_index = f / team->t.b->threads_per_go;
8856  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8857  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8858  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8859  void *, other_threads[f]->th.th_sleep_loc);
8860  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8861  }
8862  } else {
8863  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8864  count--;
8865  }
8866  }
8867  }
8868  // Now update the barrier size
8869  team->t.b->update_num_threads(new_nthreads);
8870  team->t.b->go_reset();
8871 }
8872 
8873 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8874  // Add the threads back to the team
8875  KMP_DEBUG_ASSERT(team);
8876  // Threads were paused and pointed at th_used_in_team temporarily during a
8877  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8878  // the thread that it should transition itself back into the team. Then, if
8879  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8880  // to wake it up.
8881  for (int f = 1; f < new_nthreads; ++f) {
8882  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8883  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8884  3);
8885  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8886  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
8887  (kmp_flag_32<false, false> *)NULL);
8888  }
8889  }
8890  // The threads should be transitioning to the team; when they are done, they
8891  // should have set th_used_in_team to 1. This loop forces the primary thread to wait until
8892  // all threads have moved into the team and are waiting in the barrier.
8893  int count = new_nthreads - 1;
8894  while (count > 0) {
8895  count = new_nthreads - 1;
8896  for (int f = 1; f < new_nthreads; ++f) {
8897  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
8898  count--;
8899  }
8900  }
8901  }
8902 }
8903 
8904 // Globals and functions for hidden helper task
8905 kmp_info_t **__kmp_hidden_helper_threads;
8906 kmp_info_t *__kmp_hidden_helper_main_thread;
8907 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8908 #if KMP_OS_LINUX
8909 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8910 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8911 #else
8912 kmp_int32 __kmp_hidden_helper_threads_num = 0;
8913 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8914 #endif
8915 
8916 namespace {
8917 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8918 
8919 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8920  // This is an explicit synchronization of all hidden helper threads, in case
8921  // a regular thread pushes a hidden helper task to a hidden helper thread
8922  // that has not been awakened even once since being released by the main
8923  // thread after the team was created.
8924  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8925  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8926  __kmp_hidden_helper_threads_num)
8927  ;
8928 
8929  // If main thread, then wait for signal
8930  if (__kmpc_master(nullptr, *gtid)) {
8931  // First, unset the initial state and release the initial thread
8932  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8933  __kmp_hidden_helper_initz_release();
8934  __kmp_hidden_helper_main_thread_wait();
8935  // Now wake up all worker threads
8936  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8937  __kmp_hidden_helper_worker_thread_signal();
8938  }
8939  }
8940 }
8941 } // namespace
8942 
8943 void __kmp_hidden_helper_threads_initz_routine() {
8944  // Create a new root for hidden helper team/threads
8945  const int gtid = __kmp_register_root(TRUE);
8946  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8947  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8948  __kmp_hidden_helper_main_thread->th.th_set_nproc =
8949  __kmp_hidden_helper_threads_num;
8950 
8951  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8952 
8953  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8954 
8955  // Set the initialization flag to FALSE
8956  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8957 
8958  __kmp_hidden_helper_threads_deinitz_release();
8959 }
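Illustrative user-level example (not part of kmp_runtime.cpp): hidden helper threads exist to execute hidden helper tasks, which this runtime uses for asynchronous offload such as target nowait. A hedged sketch, assuming an offload-capable OpenMP compiler:

#include <omp.h>

int main(void) {
  int x = 0;
  // The deferred target task behind "nowait" is typically handed to the
  // hidden helper team initialized above rather than to a regular worker.
  #pragma omp target map(tofrom : x) nowait
  { x = 42; }
  #pragma omp taskwait // wait for the deferred target task to complete
  return x == 42 ? 0 : 1;
}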
8960 
8961 /* Nesting Mode:
8962  Set via KMP_NESTING_MODE, which takes an integer.
8963  Note: we skip duplicate topology levels, and skip levels with only
8964  one entity.
8965  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8966  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8967  in the topology, and initializes the number of threads at each of those
8968  levels to the number of entities at each level, respectively, below the
8969  entity at the parent level.
8970  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8971  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8972  the user to turn nesting on explicitly. This is an even more experimental
8973  option to this experimental feature, and may change or go away in the
8974  future.
8975 */
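Illustrative user-level example (not part of kmp_runtime.cpp): independent of KMP_NESTING_MODE, nested parallelism itself is governed by max-active-levels-var. A small sketch of the two-level nesting that the mode-1 setup below would configure automatically:

#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(2); // allow two nested active levels
  #pragma omp parallel num_threads(2)
  {
    #pragma omp parallel num_threads(2)
    {
      #pragma omp critical
      printf("level %d, thread %d\n", omp_get_level(), omp_get_thread_num());
    }
  }
  return 0;
}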
8976 
8977 // Allocate space to store nesting levels
8978 void __kmp_init_nesting_mode() {
8979  int levels = KMP_HW_LAST;
8980  __kmp_nesting_mode_nlevels = levels;
8981  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8982  for (int i = 0; i < levels; ++i)
8983  __kmp_nesting_nth_level[i] = 0;
8984  if (__kmp_nested_nth.size < levels) {
8985  __kmp_nested_nth.nth =
8986  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8987  __kmp_nested_nth.size = levels;
8988  }
8989 }
8990 
8991 // Set # threads for top levels of nesting; must be called after topology set
8992 void __kmp_set_nesting_mode_threads() {
8993  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8994 
8995  if (__kmp_nesting_mode == 1)
8996  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8997  else if (__kmp_nesting_mode > 1)
8998  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8999 
9000  if (__kmp_topology) { // use topology info
9001  int loc, hw_level;
9002  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9003  loc < __kmp_nesting_mode_nlevels;
9004  loc++, hw_level++) {
9005  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9006  if (__kmp_nesting_nth_level[loc] == 1)
9007  loc--;
9008  }
9009  // Make sure all cores are used
9010  if (__kmp_nesting_mode > 1 && loc > 1) {
9011  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9012  int num_cores = __kmp_topology->get_count(core_level);
9013  int upper_levels = 1;
9014  for (int level = 0; level < loc - 1; ++level)
9015  upper_levels *= __kmp_nesting_nth_level[level];
9016  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9017  __kmp_nesting_nth_level[loc - 1] =
9018  num_cores / __kmp_nesting_nth_level[loc - 2];
9019  }
9020  __kmp_nesting_mode_nlevels = loc;
9021  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9022  } else { // no topology info available; provide a reasonable guesstimation
9023  if (__kmp_avail_proc >= 4) {
9024  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9025  __kmp_nesting_nth_level[1] = 2;
9026  __kmp_nesting_mode_nlevels = 2;
9027  } else {
9028  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9029  __kmp_nesting_mode_nlevels = 1;
9030  }
9031  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9032  }
9033  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9034  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9035  }
9036  set__nproc(thread, __kmp_nesting_nth_level[0]);
9037  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9038  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9039  if (get__max_active_levels(thread) > 1) {
9040  // if max levels was set, set nesting mode levels to same
9041  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9042  }
9043  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9044  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9045 }