1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
16  * it may change value between parallel regions. __kmp_max_nth
17  * is the largest value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44 
45  KMP_DEBUG_ASSERT(gtid_ref);
46 
47  if (__kmp_env_consistency_check) {
48  th = __kmp_threads[*gtid_ref];
49  if (th->th.th_root->r.r_active &&
50  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56  }
57  }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62 
63  if (__kmp_env_consistency_check) {
64  th = __kmp_threads[*gtid_ref];
65  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67  }
68  }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73  bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  // Nonmonotonic as default for dynamic schedule when no modifier is specified
76  int monotonicity = SCHEDULE_NONMONOTONIC;
77 
78  // Let default be monotonic for executables
79  // compiled with OpenMP* 4.5 or earlier compilers
80  if (loc != NULL && loc->get_openmp_version() < 50)
81  monotonicity = SCHEDULE_MONOTONIC;
82 
83  if (use_hier || __kmp_force_monotonic)
84  monotonicity = SCHEDULE_MONOTONIC;
85  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86  monotonicity = SCHEDULE_NONMONOTONIC;
87  else if (SCHEDULE_HAS_MONOTONIC(schedule))
88  monotonicity = SCHEDULE_MONOTONIC;
89 
90  return monotonicity;
91 }
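// [Editor's note] Illustrative summary, not part of the original source:
// a plain schedule(dynamic) loop compiled against OpenMP 5.0 or later
// (loc->get_openmp_version() >= 50) gets SCHEDULE_NONMONOTONIC by default,
// while the same loop from an OpenMP 4.5-or-earlier compiler, or one using
// hierarchical scheduling or __kmp_force_monotonic, gets SCHEDULE_MONOTONIC.
// Explicit monotonic/nonmonotonic modifiers in the schedule bits override the
// version-based default (but not the use_hier/__kmp_force_monotonic case).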
92 
93 #if KMP_STATIC_STEAL_ENABLED
94 enum { // values for steal_flag (possible states of private per-loop buffer)
95  UNUSED = 0,
96  CLAIMED = 1, // owner thread started initialization
97  READY = 2, // available for stealing
98  THIEF = 3 // finished by owner, or claimed by thief
99  // possible state changes:
100  // 0 -> 1 owner only, sync
101  // 0 -> 3 thief only, sync
102  // 1 -> 2 owner only, async
103  // 2 -> 3 owner only, async
104  // 3 -> 2 owner only, async
105  // 3 -> 0 last thread finishing the loop, async
106 };
107 #endif
108 
109 // Initialize a dispatch_private_info_template<T> buffer for a particular
110 // type of schedule and chunk. The loop description is found in lb (lower bound),
111 // ub (upper bound), and st (stride). nproc is the number of threads relevant
112 // to the scheduling (often the number of threads in a team, but not always if
113 // hierarchical scheduling is used). tid is the id of the thread calling
114 // the function within the group of nproc threads. It will have a value
115 // between 0 and nproc - 1. This is often just the thread id within a team, but
116 // is not necessarily the case when using hierarchical scheduling.
117 // loc is the source file location of the corresponding loop
118 // gtid is the global thread id
119 template <typename T>
120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121  dispatch_private_info_template<T> *pr,
122  enum sched_type schedule, T lb, T ub,
123  typename traits_t<T>::signed_t st,
124 #if USE_ITT_BUILD
125  kmp_uint64 *cur_chunk,
126 #endif
127  typename traits_t<T>::signed_t chunk,
128  T nproc, T tid) {
129  typedef typename traits_t<T>::unsigned_t UT;
130  typedef typename traits_t<T>::floating_t DBL;
131 
132  int active;
133  T tc;
134  kmp_info_t *th;
135  kmp_team_t *team;
136  int monotonicity;
137  bool use_hier;
138 
139 #ifdef KMP_DEBUG
140  typedef typename traits_t<T>::signed_t ST;
141  {
142  char *buff;
143  // create format specifiers before the debug output
144  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147  traits_t<T>::spec, traits_t<T>::spec,
148  traits_t<ST>::spec, traits_t<ST>::spec,
149  traits_t<T>::spec, traits_t<T>::spec);
150  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151  __kmp_str_free(&buff);
152  }
153 #endif
154  /* setup data */
155  th = __kmp_threads[gtid];
156  team = th->th.th_team;
157  active = !team->t.t_serialized;
158 
159 #if USE_ITT_BUILD
160  int itt_need_metadata_reporting =
161  __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163  team->t.t_active_level == 1;
164 #endif
165 
166 #if KMP_USE_HIER_SCHED
167  use_hier = pr->flags.use_hier;
168 #else
169  use_hier = false;
170 #endif
171 
172  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175 
176  /* Pick up the nomerge/ordered bits from the scheduling type */
177  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178  pr->flags.nomerge = TRUE;
179  schedule =
180  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181  } else {
182  pr->flags.nomerge = FALSE;
183  }
184  pr->type_size = traits_t<T>::type_size; // remember the size of variables
185  if (kmp_ord_lower & schedule) {
186  pr->flags.ordered = TRUE;
187  schedule =
188  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189  } else {
190  pr->flags.ordered = FALSE;
191  }
192  // Ordered overrides nonmonotonic
193  if (pr->flags.ordered) {
194  monotonicity = SCHEDULE_MONOTONIC;
195  }
196 
197  if (schedule == kmp_sch_static) {
198  schedule = __kmp_static;
199  } else {
200  if (schedule == kmp_sch_runtime) {
201  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202  // not specified)
203  schedule = team->t.t_sched.r_sched_type;
204  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206  if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207  monotonicity = SCHEDULE_MONOTONIC;
208  // Detail the schedule if needed (global controls are differentiated
209  // appropriately)
210  if (schedule == kmp_sch_guided_chunked) {
211  schedule = __kmp_guided;
212  } else if (schedule == kmp_sch_static) {
213  schedule = __kmp_static;
214  }
215  // Use the chunk size specified by OMP_SCHEDULE (or default if not
216  // specified)
217  chunk = team->t.t_sched.chunk;
218 #if USE_ITT_BUILD
219  if (cur_chunk)
220  *cur_chunk = chunk;
221 #endif
222 #ifdef KMP_DEBUG
223  {
224  char *buff;
225  // create format specifiers before the debug output
226  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227  "schedule:%%d chunk:%%%s\n",
228  traits_t<ST>::spec);
229  KD_TRACE(10, (buff, gtid, schedule, chunk));
230  __kmp_str_free(&buff);
231  }
232 #endif
233  } else {
234  if (schedule == kmp_sch_guided_chunked) {
235  schedule = __kmp_guided;
236  }
237  if (chunk <= 0) {
238  chunk = KMP_DEFAULT_CHUNK;
239  }
240  }
241 
242  if (schedule == kmp_sch_auto) {
243  // mapping and differentiation: in the __kmp_do_serial_initialize()
244  schedule = __kmp_auto;
245 #ifdef KMP_DEBUG
246  {
247  char *buff;
248  // create format specifiers before the debug output
249  buff = __kmp_str_format(
250  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251  "schedule:%%d chunk:%%%s\n",
252  traits_t<ST>::spec);
253  KD_TRACE(10, (buff, gtid, schedule, chunk));
254  __kmp_str_free(&buff);
255  }
256 #endif
257  }
258 #if KMP_STATIC_STEAL_ENABLED
259  // map nonmonotonic:dynamic to static steal
260  if (schedule == kmp_sch_dynamic_chunked) {
261  if (monotonicity == SCHEDULE_NONMONOTONIC)
262  schedule = kmp_sch_static_steal;
263  }
264 #endif
265  /* guided analytical not safe for too many threads */
266  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267  schedule = kmp_sch_guided_iterative_chunked;
268  KMP_WARNING(DispatchManyThreads);
269  }
270  if (schedule == kmp_sch_runtime_simd) {
271  // compiler provides simd_width in the chunk parameter
272  schedule = team->t.t_sched.r_sched_type;
273  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275  // Detail the schedule if needed (global controls are differentiated
276  // appropriately)
277  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278  schedule == __kmp_static) {
279  schedule = kmp_sch_static_balanced_chunked;
280  } else {
281  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282  schedule = kmp_sch_guided_simd;
283  }
284  chunk = team->t.t_sched.chunk * chunk;
285  }
286 #if USE_ITT_BUILD
287  if (cur_chunk)
288  *cur_chunk = chunk;
289 #endif
290 #ifdef KMP_DEBUG
291  {
292  char *buff;
293  // create format specifiers before the debug output
294  buff = __kmp_str_format(
295  "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296  " chunk:%%%s\n",
297  traits_t<ST>::spec);
298  KD_TRACE(10, (buff, gtid, schedule, chunk));
299  __kmp_str_free(&buff);
300  }
301 #endif
302  }
303  pr->u.p.parm1 = chunk;
304  }
305  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306  "unknown scheduling type");
307 
308  pr->u.p.count = 0;
309 
310  if (__kmp_env_consistency_check) {
311  if (st == 0) {
312  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314  }
315  }
316  // compute trip count
317  if (st == 1) { // most common case
318  if (ub >= lb) {
319  tc = ub - lb + 1;
320  } else { // ub < lb
321  tc = 0; // zero-trip
322  }
323  } else if (st < 0) {
324  if (lb >= ub) {
325  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326  // where the division needs to be unsigned regardless of the result type
327  tc = (UT)(lb - ub) / (-st) + 1;
328  } else { // lb < ub
329  tc = 0; // zero-trip
330  }
331  } else { // st > 0
332  if (ub >= lb) {
333  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334  // where the division needs to be unsigned regardless of the result type
335  tc = (UT)(ub - lb) / st + 1;
336  } else { // ub < lb
337  tc = 0; // zero-trip
338  }
339  }
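// [Editor's note] Worked example, not part of the original source:
// for lb=0, ub=9, st=2 the loop runs i = 0,2,4,6,8, so
// tc = (UT)(ub - lb) / st + 1 = 9/2 + 1 = 5; for lb=10, ub=1, st=-3 it runs
// i = 10,7,4,1, so tc = (UT)(lb - ub) / (-st) + 1 = 9/3 + 1 = 4.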
340 
341 #if KMP_STATS_ENABLED
342  if (KMP_MASTER_GTID(gtid)) {
343  KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344  }
345 #endif
346 
347  pr->u.p.lb = lb;
348  pr->u.p.ub = ub;
349  pr->u.p.st = st;
350  pr->u.p.tc = tc;
351 
352 #if KMP_OS_WINDOWS
353  pr->u.p.last_upper = ub + st;
354 #endif /* KMP_OS_WINDOWS */
355 
356  /* NOTE: only the active parallel region(s) have active ordered sections */
357 
358  if (active) {
359  if (pr->flags.ordered) {
360  pr->ordered_bumped = 0;
361  pr->u.p.ordered_lower = 1;
362  pr->u.p.ordered_upper = 0;
363  }
364  }
365 
366  switch (schedule) {
367 #if KMP_STATIC_STEAL_ENABLED
368  case kmp_sch_static_steal: {
369  T ntc, init;
370 
371  KD_TRACE(100,
372  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373  gtid));
374 
375  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376  if (nproc > 1 && ntc >= nproc) {
377  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378  T id = tid;
379  T small_chunk, extras;
380  kmp_uint32 old = UNUSED;
381  int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382  if (traits_t<T>::type_size > 4) {
383  // AC: TODO: check if 16-byte CAS available and use it to
384  // improve performance (probably wait for explicit request
385  // before spending time on this).
386  // For now use dynamically allocated per-private-buffer lock,
387  // free memory in __kmp_dispatch_next when status==0.
388  pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389  __kmp_init_lock(pr->u.p.steal_lock);
390  }
391  small_chunk = ntc / nproc;
392  extras = ntc % nproc;
393 
394  init = id * small_chunk + (id < extras ? id : extras);
395  pr->u.p.count = init;
396  if (claimed) { // did we succeed in claiming our own buffer?
397  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398  // Other threads will inspect steal_flag when searching for a victim.
399  // READY means other threads may steal from this thread from now on.
400  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401  } else {
402  // another thread has stolen our whole range
403  KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404  pr->u.p.ub = init; // mark that there are no iterations to work on
405  }
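// [Editor's note] Worked example, not part of the original source:
// tc=100, chunk=10 gives ntc=10 chunks; with nproc=4, small_chunk=2 and
// extras=2, so the initial ranges of chunk indices [count, ub) are
// tid 0 -> [0,3), tid 1 -> [3,6), tid 2 -> [6,8), tid 3 -> [8,10);
// the first two threads absorb the two extra chunks.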
406  pr->u.p.parm2 = ntc; // save number of chunks
407  // parm3 is the number of times to attempt stealing, which is
408  // nproc (just a heuristic; could be optimized later on).
409  pr->u.p.parm3 = nproc;
410  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411  break;
412  } else {
413  /* too few chunks: switching to kmp_sch_dynamic_chunked */
414  schedule = kmp_sch_dynamic_chunked;
415  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416  "kmp_sch_dynamic_chunked\n",
417  gtid));
418  goto dynamic_init;
419  break;
420  } // if
421  } // case
422 #endif
423  case kmp_sch_static_balanced: {
424  T init, limit;
425 
426  KD_TRACE(
427  100,
428  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429  gtid));
430 
431  if (nproc > 1) {
432  T id = tid;
433 
434  if (tc < nproc) {
435  if (id < tc) {
436  init = id;
437  limit = id;
438  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439  } else {
440  pr->u.p.count = 1; /* means no more chunks to execute */
441  pr->u.p.parm1 = FALSE;
442  break;
443  }
444  } else {
445  T small_chunk = tc / nproc;
446  T extras = tc % nproc;
447  init = id * small_chunk + (id < extras ? id : extras);
448  limit = init + small_chunk - (id < extras ? 0 : 1);
449  pr->u.p.parm1 = (id == nproc - 1);
450  }
451  } else {
452  if (tc > 0) {
453  init = 0;
454  limit = tc - 1;
455  pr->u.p.parm1 = TRUE;
456  } else {
457  // zero trip count
458  pr->u.p.count = 1; /* means no more chunks to execute */
459  pr->u.p.parm1 = FALSE;
460  break;
461  }
462  }
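// [Editor's note] Worked example, not part of the original source:
// tc=10, nproc=4 gives small_chunk=2, extras=2, so iterations are split as
// id 0 -> 0..2, id 1 -> 3..5, id 2 -> 6..7, id 3 -> 8..9, and parm1
// (the lastprivate flag) is TRUE only for id 3.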
463 #if USE_ITT_BUILD
464  // Calculate chunk for metadata report
465  if (itt_need_metadata_reporting)
466  if (cur_chunk)
467  *cur_chunk = limit - init + 1;
468 #endif
469  if (st == 1) {
470  pr->u.p.lb = lb + init;
471  pr->u.p.ub = lb + limit;
472  } else {
473  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
474  T ub_tmp = lb + limit * st;
475  pr->u.p.lb = lb + init * st;
476  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477  // it exactly
478  if (st > 0) {
479  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480  } else {
481  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482  }
483  }
484  if (pr->flags.ordered) {
485  pr->u.p.ordered_lower = init;
486  pr->u.p.ordered_upper = limit;
487  }
488  break;
489  } // case
490  case kmp_sch_static_balanced_chunked: {
491  // similar to balanced, but chunk adjusted to multiple of simd width
492  T nth = nproc;
493  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494  " -> falling-through to static_greedy\n",
495  gtid));
496  schedule = kmp_sch_static_greedy;
497  if (nth > 1)
498  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499  else
500  pr->u.p.parm1 = tc;
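// [Editor's note] Worked example (not part of the original source), assuming
// chunk is the simd width and a power of two: tc=100, nth=4, chunk=8 gives
// (100+3)/4 = 25 iterations per thread, rounded up to the next multiple of 8
// by the mask: (25+7) & ~7 = 32, so parm1 = 32.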
501  break;
502  } // case
503  case kmp_sch_guided_simd:
504  case kmp_sch_guided_iterative_chunked: {
505  KD_TRACE(
506  100,
507  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508  " case\n",
509  gtid));
510 
511  if (nproc > 1) {
512  if ((2L * chunk + 1) * nproc >= tc) {
513  /* chunk size too large, switch to dynamic */
514  schedule = kmp_sch_dynamic_chunked;
515  goto dynamic_init;
516  } else {
517  // when remaining iters become fewer than parm2, switch to dynamic
518  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519  *(double *)&pr->u.p.parm3 =
520  guided_flt_param / (double)nproc; // may occupy parm3 and parm4
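// [Editor's note] Illustration (not part of the original source), assuming
// the defaults guided_int_param == 2 and guided_flt_param == 0.5 (the "K=2"
// mentioned in __kmp_dispatch_next_algorithm below): with nproc=4 and
// chunk=1, parm2 = 2*4*2 = 16 and parm3 = 0.5/4 = 0.125, i.e. each grab
// takes about 1/(2*nproc) of the remaining iterations until fewer than 16
// remain, at which point the schedule degenerates to dynamic.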
521  }
522  } else {
523  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524  "kmp_sch_static_greedy\n",
525  gtid));
526  schedule = kmp_sch_static_greedy;
527  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528  KD_TRACE(
529  100,
530  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531  gtid));
532  pr->u.p.parm1 = tc;
533  } // if
534  } // case
535  break;
536  case kmp_sch_guided_analytical_chunked: {
537  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538  "kmp_sch_guided_analytical_chunked case\n",
539  gtid));
540 
541  if (nproc > 1) {
542  if ((2L * chunk + 1) * nproc >= tc) {
543  /* chunk size too large, switch to dynamic */
544  schedule = kmp_sch_dynamic_chunked;
545  goto dynamic_init;
546  } else {
547  /* commonly used term: (2 nproc - 1)/(2 nproc) */
548  DBL x;
549 
550 #if KMP_USE_X87CONTROL
551  /* Linux* OS already has 64-bit computation by default for long double,
552  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554  instead of the default 53-bit. Even though long double doesn't work
555  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556  expected to impact the correctness of the algorithm, but this has not
557  been mathematically proven. */
558  // save original FPCW and set precision to 64-bit, as
559  // Windows* OS on IA-32 architecture defaults to 53-bit
560  unsigned int oldFpcw = _control87(0, 0);
561  _control87(_PC_64, _MCW_PC); // 0,0x30000
562 #endif
563  /* value used for comparison in solver for cross-over point */
564  KMP_ASSERT(tc > 0);
565  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
566 
567  /* crossover point--chunk indexes equal to or greater than
568  this point switch to dynamic-style scheduling */
569  UT cross;
570 
571  /* commonly used term: (2 nproc - 1)/(2 nproc) */
572  x = 1.0 - 0.5 / (double)nproc;
573 
574 #ifdef KMP_DEBUG
575  { // test natural alignment
576  struct _test_a {
577  char a;
578  union {
579  char b;
580  DBL d;
581  };
582  } t;
583  ptrdiff_t natural_alignment =
584  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
585  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
586  // long)natural_alignment );
587  KMP_DEBUG_ASSERT(
588  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
589  }
590 #endif // KMP_DEBUG
591 
592  /* save the term in thread private dispatch structure */
593  *(DBL *)&pr->u.p.parm3 = x;
594 
595  /* solve for the crossover point to the nearest integer i for which C_i
596  <= chunk */
597  {
598  UT left, right, mid;
599  long double p;
600 
601  /* estimate initial upper and lower bound */
602 
603  /* doesn't matter what value right is as long as it is positive, but
604  it affects performance of the solver */
605  right = 229;
606  p = __kmp_pow<UT>(x, right);
607  if (p > target) {
608  do {
609  p *= p;
610  right <<= 1;
611  } while (p > target && right < (1 << 27));
612  /* lower bound is previous (failed) estimate of upper bound */
613  left = right >> 1;
614  } else {
615  left = 0;
616  }
617 
618  /* bisection root-finding method */
619  while (left + 1 < right) {
620  mid = (left + right) / 2;
621  if (__kmp_pow<UT>(x, mid) > target) {
622  left = mid;
623  } else {
624  right = mid;
625  }
626  } // while
627  cross = right;
628  }
629  /* assert sanity of computed crossover point */
630  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
631  __kmp_pow<UT>(x, cross) <= target);
632 
633  /* save the crossover point in thread private dispatch structure */
634  pr->u.p.parm2 = cross;
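// [Editor's note] Worked example, not part of the original source:
// nproc=4, chunk=1, tc=1000 gives x = 1 - 0.5/4 = 0.875 and
// target = (2*1+1)*4/1000 = 0.012; the bisection finds the smallest cross
// with x^cross <= target, here cross = 34 (0.875^33 ~ 0.0122 > 0.012,
// 0.875^34 ~ 0.0107 <= 0.012).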
635 
636 // C75803
637 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
638 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
639 #else
640 #define GUIDED_ANALYTICAL_WORKAROUND (x)
641 #endif
642  /* dynamic-style scheduling offset */
643  pr->u.p.count = tc -
644  __kmp_dispatch_guided_remaining(
645  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
646  cross * chunk;
647 #if KMP_USE_X87CONTROL
648  // restore FPCW
649  _control87(oldFpcw, _MCW_PC);
650 #endif
651  } // if
652  } else {
653  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
654  "kmp_sch_static_greedy\n",
655  gtid));
656  schedule = kmp_sch_static_greedy;
657  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
658  pr->u.p.parm1 = tc;
659  } // if
660  } // case
661  break;
662  case kmp_sch_static_greedy:
663  KD_TRACE(
664  100,
665  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
666  gtid));
667  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
668  break;
669  case kmp_sch_static_chunked:
670  case kmp_sch_dynamic_chunked:
671  dynamic_init:
672  if (tc == 0)
673  break;
674  if (pr->u.p.parm1 <= 0)
675  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
676  else if (pr->u.p.parm1 > tc)
677  pr->u.p.parm1 = tc;
678  // Store the total number of chunks to prevent integer overflow during
679  // bounds calculations in the get next chunk routine.
680  pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
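// [Editor's note] Worked example, not part of the original source:
// tc=100 with chunk parm1=7 gives parm2 = 100/7 + 1 = 15 chunks, the last
// one covering only the remaining 2 iterations.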
681  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
682  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
683  gtid));
684  break;
685  case kmp_sch_trapezoidal: {
686  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
687 
688  T parm1, parm2, parm3, parm4;
689  KD_TRACE(100,
690  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
691  gtid));
692 
693  parm1 = chunk;
694 
695  /* F : size of the first cycle */
696  parm2 = (tc / (2 * nproc));
697 
698  if (parm2 < 1) {
699  parm2 = 1;
700  }
701 
702  /* L : size of the last cycle. Make sure the last cycle is not larger
703  than the first cycle. */
704  if (parm1 < 1) {
705  parm1 = 1;
706  } else if (parm1 > parm2) {
707  parm1 = parm2;
708  }
709 
710  /* N : number of cycles */
711  parm3 = (parm2 + parm1);
712  parm3 = (2 * tc + parm3 - 1) / parm3;
713 
714  if (parm3 < 2) {
715  parm3 = 2;
716  }
717 
718  /* sigma : decreasing incr of the trapezoid */
719  parm4 = (parm3 - 1);
720  parm4 = (parm2 - parm1) / parm4;
721 
722  // pointless check, because parm4 >= 0 always
723  // if ( parm4 < 0 ) {
724  // parm4 = 0;
725  //}
726 
727  pr->u.p.parm1 = parm1;
728  pr->u.p.parm2 = parm2;
729  pr->u.p.parm3 = parm3;
730  pr->u.p.parm4 = parm4;
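// [Editor's note] Worked example, not part of the original source:
// tc=1000, nproc=4, chunk=10 gives parm2 (first chunk) = 1000/8 = 125,
// parm1 (last chunk) = 10, parm3 (number of chunks) = (2000+134)/135 = 15,
// parm4 (decrement) = (125-10)/14 = 8, so chunk sizes go
// 125, 117, 109, ..., 13, summing to 1035 >= tc.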
731  } // case
732  break;
733 
734  default: {
735  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
736  KMP_HNT(GetNewerLibrary), // Hint
737  __kmp_msg_null // Variadic argument list terminator
738  );
739  } break;
740  } // switch
741  pr->schedule = schedule;
742 }
743 
744 #if KMP_USE_HIER_SCHED
745 template <typename T>
746 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
747  typename traits_t<T>::signed_t st);
748 template <>
749 inline void
750 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
751  kmp_int32 ub, kmp_int32 st) {
752  __kmp_dispatch_init_hierarchy<kmp_int32>(
753  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
754  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
755 }
756 template <>
757 inline void
758 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
759  kmp_uint32 ub, kmp_int32 st) {
760  __kmp_dispatch_init_hierarchy<kmp_uint32>(
761  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
762  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
763 }
764 template <>
765 inline void
766 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
767  kmp_int64 ub, kmp_int64 st) {
768  __kmp_dispatch_init_hierarchy<kmp_int64>(
769  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
770  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
771 }
772 template <>
773 inline void
774 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
775  kmp_uint64 ub, kmp_int64 st) {
776  __kmp_dispatch_init_hierarchy<kmp_uint64>(
777  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
778  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
779 }
780 
781 // free all the hierarchy scheduling memory associated with the team
782 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
783  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
784  for (int i = 0; i < num_disp_buff; ++i) {
785  // type does not matter here so use kmp_int32
786  auto sh =
787  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
788  &team->t.t_disp_buffer[i]);
789  if (sh->hier) {
790  sh->hier->deallocate();
791  __kmp_free(sh->hier);
792  }
793  }
794 }
795 #endif
796 
797 // UT - unsigned flavor of T, ST - signed flavor of T,
798 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
799 template <typename T>
800 static void
801 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
802  T ub, typename traits_t<T>::signed_t st,
803  typename traits_t<T>::signed_t chunk, int push_ws) {
804  typedef typename traits_t<T>::unsigned_t UT;
805 
806  int active;
807  kmp_info_t *th;
808  kmp_team_t *team;
809  kmp_uint32 my_buffer_index;
810  dispatch_private_info_template<T> *pr;
811  dispatch_shared_info_template<T> volatile *sh;
812 
813  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
814  sizeof(dispatch_private_info));
815  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
816  sizeof(dispatch_shared_info));
817  __kmp_assert_valid_gtid(gtid);
818 
819  if (!TCR_4(__kmp_init_parallel))
820  __kmp_parallel_initialize();
821 
822  __kmp_resume_if_soft_paused();
823 
824 #if INCLUDE_SSC_MARKS
825  SSC_MARK_DISPATCH_INIT();
826 #endif
827 #ifdef KMP_DEBUG
828  typedef typename traits_t<T>::signed_t ST;
829  {
830  char *buff;
831  // create format specifiers before the debug output
832  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
833  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
834  traits_t<ST>::spec, traits_t<T>::spec,
835  traits_t<T>::spec, traits_t<ST>::spec);
836  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
837  __kmp_str_free(&buff);
838  }
839 #endif
840  /* setup data */
841  th = __kmp_threads[gtid];
842  team = th->th.th_team;
843  active = !team->t.t_serialized;
844  th->th.th_ident = loc;
845 
846  // Any half-decent optimizer will remove this test when the blocks are empty
847  // since the macros expand to nothing
848  // when statistics are disabled.
849  if (schedule == __kmp_static) {
850  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
851  } else {
852  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
853  }
854 
855 #if KMP_USE_HIER_SCHED
856  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
857  // environment variable. Hierarchical scheduling does not work with ordered,
858  // so if ordered is detected, then revert to threaded scheduling.
859  bool ordered;
860  enum sched_type my_sched = schedule;
861  my_buffer_index = th->th.th_dispatch->th_disp_index;
862  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
863  &th->th.th_dispatch
864  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
865  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
866  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
867  my_sched =
868  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
869  ordered = (kmp_ord_lower & my_sched);
870  if (pr->flags.use_hier) {
871  if (ordered) {
872  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
873  "Disabling hierarchical scheduling.\n",
874  gtid));
875  pr->flags.use_hier = FALSE;
876  }
877  }
878  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
879  // Don't use hierarchical for ordered parallel loops and don't
880  // use the runtime hierarchy if one was specified in the program
881  if (!ordered && !pr->flags.use_hier)
882  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
883  }
884 #endif // KMP_USE_HIER_SCHED
885 
886 #if USE_ITT_BUILD
887  kmp_uint64 cur_chunk = chunk;
888  int itt_need_metadata_reporting =
889  __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
890  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
891  team->t.t_active_level == 1;
892 #endif
893  if (!active) {
894  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
895  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
896  } else {
897  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
898  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
899 
900  my_buffer_index = th->th.th_dispatch->th_disp_index++;
901 
902  /* What happens when number of threads changes, need to resize buffer? */
903  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
904  &th->th.th_dispatch
905  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
906  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
907  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
908  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
909  my_buffer_index));
910  if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
911  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
912  " sh->buffer_index:%d\n",
913  gtid, my_buffer_index, sh->buffer_index));
914  __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
915  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
916  // Note: KMP_WAIT() cannot be used here: buffer index and
917  // my_buffer_index are *always* 32-bit integers.
918  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
919  "sh->buffer_index:%d\n",
920  gtid, my_buffer_index, sh->buffer_index));
921  }
922  }
923 
924  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
925 #if USE_ITT_BUILD
926  &cur_chunk,
927 #endif
928  chunk, (T)th->th.th_team_nproc,
929  (T)th->th.th_info.ds.ds_tid);
930  if (active) {
931  if (pr->flags.ordered == 0) {
932  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
933  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
934  } else {
935  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
936  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
937  }
938  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
939  th->th.th_dispatch->th_dispatch_sh_current =
940  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
941 #if USE_ITT_BUILD
942  if (pr->flags.ordered) {
943  __kmp_itt_ordered_init(gtid);
944  }
945  // Report loop metadata
946  if (itt_need_metadata_reporting) {
947  // Only report metadata by primary thread of active team at level 1
948  kmp_uint64 schedtype = 0;
949  switch (schedule) {
950  case kmp_sch_static_chunked:
951  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
952  break;
953  case kmp_sch_static_greedy:
954  cur_chunk = pr->u.p.parm1;
955  break;
956  case kmp_sch_dynamic_chunked:
957  schedtype = 1;
958  break;
959  case kmp_sch_guided_iterative_chunked:
960  case kmp_sch_guided_analytical_chunked:
961  case kmp_sch_guided_simd:
962  schedtype = 2;
963  break;
964  default:
965  // Should we put this case under "static"?
966  // case kmp_sch_static_steal:
967  schedtype = 3;
968  break;
969  }
970  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
971  }
972 #if KMP_USE_HIER_SCHED
973  if (pr->flags.use_hier) {
974  pr->u.p.count = 0;
975  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
976  }
977 #endif // KMP_USE_HIER_SCHED
978 #endif /* USE_ITT_BUILD */
979  }
980 
981 #ifdef KMP_DEBUG
982  {
983  char *buff;
984  // create format specifiers before the debug output
985  buff = __kmp_str_format(
986  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
987  "lb:%%%s ub:%%%s"
988  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
989  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
990  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
991  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
992  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
993  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
994  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
995  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
996  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
997  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
998  __kmp_str_free(&buff);
999  }
1000 #endif
1001 #if OMPT_SUPPORT && OMPT_OPTIONAL
1002  if (ompt_enabled.ompt_callback_work) {
1003  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1004  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1005  ompt_callbacks.ompt_callback(ompt_callback_work)(
1006  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1007  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1008  }
1009 #endif
1010  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1011 }
1012 
1013 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1014  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1015  * every chunk of iterations. If the ordered section(s) were not executed
1016  * for this iteration (or every iteration in this chunk), we need to set the
1017  * ordered iteration counters so that the next thread can proceed. */
1018 template <typename UT>
1019 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1020  typedef typename traits_t<UT>::signed_t ST;
1021  __kmp_assert_valid_gtid(gtid);
1022  kmp_info_t *th = __kmp_threads[gtid];
1023 
1024  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1025  if (!th->th.th_team->t.t_serialized) {
1026 
1027  dispatch_private_info_template<UT> *pr =
1028  reinterpret_cast<dispatch_private_info_template<UT> *>(
1029  th->th.th_dispatch->th_dispatch_pr_current);
1030  dispatch_shared_info_template<UT> volatile *sh =
1031  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1032  th->th.th_dispatch->th_dispatch_sh_current);
1033  KMP_DEBUG_ASSERT(pr);
1034  KMP_DEBUG_ASSERT(sh);
1035  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1036  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1037 
1038  if (pr->ordered_bumped) {
1039  KD_TRACE(
1040  1000,
1041  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1042  gtid));
1043  pr->ordered_bumped = 0;
1044  } else {
1045  UT lower = pr->u.p.ordered_lower;
1046 
1047 #ifdef KMP_DEBUG
1048  {
1049  char *buff;
1050  // create format specifiers before the debug output
1051  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1052  "ordered_iteration:%%%s lower:%%%s\n",
1053  traits_t<UT>::spec, traits_t<UT>::spec);
1054  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1055  __kmp_str_free(&buff);
1056  }
1057 #endif
1058 
1059  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1060  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1061  KMP_MB(); /* is this necessary? */
1062 #ifdef KMP_DEBUG
1063  {
1064  char *buff;
1065  // create format specifiers before the debug output
1066  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1067  "ordered_iteration:%%%s lower:%%%s\n",
1068  traits_t<UT>::spec, traits_t<UT>::spec);
1069  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1070  __kmp_str_free(&buff);
1071  }
1072 #endif
1073 
1074  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1075  } // if
1076  } // if
1077  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1078 }
1079 
1080 #ifdef KMP_GOMP_COMPAT
1081 
1082 template <typename UT>
1083 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1084  typedef typename traits_t<UT>::signed_t ST;
1085  __kmp_assert_valid_gtid(gtid);
1086  kmp_info_t *th = __kmp_threads[gtid];
1087 
1088  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1089  if (!th->th.th_team->t.t_serialized) {
1090  dispatch_private_info_template<UT> *pr =
1091  reinterpret_cast<dispatch_private_info_template<UT> *>(
1092  th->th.th_dispatch->th_dispatch_pr_current);
1093  dispatch_shared_info_template<UT> volatile *sh =
1094  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1095  th->th.th_dispatch->th_dispatch_sh_current);
1096  KMP_DEBUG_ASSERT(pr);
1097  KMP_DEBUG_ASSERT(sh);
1098  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1099  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1100 
1101  UT lower = pr->u.p.ordered_lower;
1102  UT upper = pr->u.p.ordered_upper;
1103  UT inc = upper - lower + 1;
1104 
1105  if (pr->ordered_bumped == inc) {
1106  KD_TRACE(
1107  1000,
1108  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1109  gtid));
1110  pr->ordered_bumped = 0;
1111  } else {
1112  inc -= pr->ordered_bumped;
1113 
1114 #ifdef KMP_DEBUG
1115  {
1116  char *buff;
1117  // create format specifiers before the debug output
1118  buff = __kmp_str_format(
1119  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1120  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1121  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1122  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1123  __kmp_str_free(&buff);
1124  }
1125 #endif
1126 
1127  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1128  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1129 
1130  KMP_MB(); /* is this necessary? */
1131  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1132  "ordered_bumped to zero\n",
1133  gtid));
1134  pr->ordered_bumped = 0;
1136 #ifdef KMP_DEBUG
1137  {
1138  char *buff;
1139  // create format specifiers before the debug output
1140  buff = __kmp_str_format(
1141  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1142  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1143  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1144  traits_t<UT>::spec);
1145  KD_TRACE(1000,
1146  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1147  __kmp_str_free(&buff);
1148  }
1149 #endif
1150 
1151  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1152  }
1153  // }
1154  }
1155  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1156 }
1157 
1158 #endif /* KMP_GOMP_COMPAT */
1159 
1160 template <typename T>
1161 int __kmp_dispatch_next_algorithm(int gtid,
1162  dispatch_private_info_template<T> *pr,
1163  dispatch_shared_info_template<T> volatile *sh,
1164  kmp_int32 *p_last, T *p_lb, T *p_ub,
1165  typename traits_t<T>::signed_t *p_st, T nproc,
1166  T tid) {
1167  typedef typename traits_t<T>::unsigned_t UT;
1168  typedef typename traits_t<T>::signed_t ST;
1169  typedef typename traits_t<T>::floating_t DBL;
1170  int status = 0;
1171  bool last = false;
1172  T start;
1173  ST incr;
1174  UT limit, trip, init;
1175  kmp_info_t *th = __kmp_threads[gtid];
1176  kmp_team_t *team = th->th.th_team;
1177 
1178  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1179  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1180  KMP_DEBUG_ASSERT(pr);
1181  KMP_DEBUG_ASSERT(sh);
1182  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1183 #ifdef KMP_DEBUG
1184  {
1185  char *buff;
1186  // create format specifiers before the debug output
1187  buff =
1188  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1189  "sh:%%p nproc:%%%s tid:%%%s\n",
1190  traits_t<T>::spec, traits_t<T>::spec);
1191  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1192  __kmp_str_free(&buff);
1193  }
1194 #endif
1195 
1196  // zero trip count
1197  if (pr->u.p.tc == 0) {
1198  KD_TRACE(10,
1199  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1200  "zero status:%d\n",
1201  gtid, status));
1202  return 0;
1203  }
1204 
1205  switch (pr->schedule) {
1206 #if KMP_STATIC_STEAL_ENABLED
1207  case kmp_sch_static_steal: {
1208  T chunk = pr->u.p.parm1;
1209  UT nchunks = pr->u.p.parm2;
1210  KD_TRACE(100,
1211  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1212  gtid));
1213 
1214  trip = pr->u.p.tc - 1;
1215 
1216  if (traits_t<T>::type_size > 4) {
1217  // use lock for 8-byte induction variable.
1218  // TODO (optional): check presence and use 16-byte CAS
1219  kmp_lock_t *lck = pr->u.p.steal_lock;
1220  KMP_DEBUG_ASSERT(lck != NULL);
1221  if (pr->u.p.count < (UT)pr->u.p.ub) {
1222  KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1223  __kmp_acquire_lock(lck, gtid);
1224  // try to get own chunk of iterations
1225  init = (pr->u.p.count)++;
1226  status = (init < (UT)pr->u.p.ub);
1227  __kmp_release_lock(lck, gtid);
1228  } else {
1229  status = 0; // no own chunks
1230  }
1231  if (!status) { // try to steal
1232  kmp_lock_t *lckv; // victim buffer's lock
1233  T while_limit = pr->u.p.parm3;
1234  T while_index = 0;
1235  int idx = (th->th.th_dispatch->th_disp_index - 1) %
1236  __kmp_dispatch_num_buffers; // current loop index
1237  // note: victim thread can potentially execute another loop
1238  KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1239  while ((!status) && (while_limit != ++while_index)) {
1240  dispatch_private_info_template<T> *v;
1241  T remaining;
1242  T victimId = pr->u.p.parm4;
1243  T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1244  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1245  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1246  KMP_DEBUG_ASSERT(v);
1247  while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1248  oldVictimId != victimId) {
1249  victimId = (victimId + 1) % nproc;
1250  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1251  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1252  KMP_DEBUG_ASSERT(v);
1253  }
1254  if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1255  continue; // try once more (nproc attempts in total)
1256  }
1257  if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1258  kmp_uint32 old = UNUSED;
1259  // try to steal whole range from inactive victim
1260  status = v->steal_flag.compare_exchange_strong(old, THIEF);
1261  if (status) {
1262  // initialize self buffer with victim's whole range of chunks
1263  T id = victimId;
1264  T small_chunk, extras;
1265  small_chunk = nchunks / nproc; // chunks per thread
1266  extras = nchunks % nproc;
1267  init = id * small_chunk + (id < extras ? id : extras);
1268  __kmp_acquire_lock(lck, gtid);
1269  pr->u.p.count = init + 1; // exclude one we execute immediately
1270  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1271  __kmp_release_lock(lck, gtid);
1272  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1273  // no need to reinitialize other thread invariants: lb, st, etc.
1274 #ifdef KMP_DEBUG
1275  {
1276  char *buff;
1277  // create format specifiers before the debug output
1278  buff = __kmp_str_format(
1279  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1280  "count:%%%s ub:%%%s\n",
1281  traits_t<UT>::spec, traits_t<T>::spec);
1282  KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1283  __kmp_str_free(&buff);
1284  }
1285 #endif
1286  // activate non-empty buffer and let others steal from us
1287  if (pr->u.p.count < (UT)pr->u.p.ub)
1288  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1289  break;
1290  }
1291  }
1292  if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1293  v->u.p.count >= (UT)v->u.p.ub) {
1294  pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1295  continue; // no chunks to steal, try next victim
1296  }
1297  lckv = v->u.p.steal_lock;
1298  KMP_ASSERT(lckv != NULL);
1299  __kmp_acquire_lock(lckv, gtid);
1300  limit = v->u.p.ub; // keep initial ub
1301  if (v->u.p.count >= limit) {
1302  __kmp_release_lock(lckv, gtid);
1303  pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1304  continue; // no chunks to steal, try next victim
1305  }
1306 
1307  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1308  // TODO: is this heuristic good enough??
1309  remaining = limit - v->u.p.count;
1310  if (remaining > 7) {
1311  // steal 1/4 of remaining
1312  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1313  init = (v->u.p.ub -= (remaining >> 2));
1314  } else {
1315  // steal 1 chunk of 1..7 remaining
1316  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1317  init = (v->u.p.ub -= 1);
1318  }
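// [Editor's note] Worked example, not part of the original source:
// if the victim has count=10 and ub=30 (20 chunks left), the thief moves
// v->u.p.ub down by 20>>2 = 5 to 25, so init becomes 25; further down, the
// thief's own count is set to 26 and its ub to the old limit 30, giving it
// chunks 25..29 while chunks 10..24 stay with the victim.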
1319  __kmp_release_lock(lckv, gtid);
1320 #ifdef KMP_DEBUG
1321  {
1322  char *buff;
1323  // create format specifiers before the debug output
1324  buff = __kmp_str_format(
1325  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1326  "count:%%%s ub:%%%s\n",
1327  traits_t<UT>::spec, traits_t<UT>::spec);
1328  KD_TRACE(10, (buff, gtid, victimId, init, limit));
1329  __kmp_str_free(&buff);
1330  }
1331 #endif
1332  KMP_DEBUG_ASSERT(init + 1 <= limit);
1333  pr->u.p.parm4 = victimId; // remember victim to steal from
1334  status = 1;
1335  // now update own count and ub with stolen range excluding init chunk
1336  __kmp_acquire_lock(lck, gtid);
1337  pr->u.p.count = init + 1;
1338  pr->u.p.ub = limit;
1339  __kmp_release_lock(lck, gtid);
1340  // activate non-empty buffer and let others steal from us
1341  if (init + 1 < limit)
1342  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1343  } // while (search for victim)
1344  } // if (try to find victim and steal)
1345  } else {
1346  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1347  // as all operations on pair (count, ub) must be done atomically
1348  typedef union {
1349  struct {
1350  UT count;
1351  T ub;
1352  } p;
1353  kmp_int64 b;
1354  } union_i4;
1355  union_i4 vold, vnew;
1356  if (pr->u.p.count < (UT)pr->u.p.ub) {
1357  KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1358  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1359  vnew.b = vold.b;
1360  vnew.p.count++; // get chunk from head of self range
1361  while (!KMP_COMPARE_AND_STORE_REL64(
1362  (volatile kmp_int64 *)&pr->u.p.count,
1363  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1364  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1365  KMP_CPU_PAUSE();
1366  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1367  vnew.b = vold.b;
1368  vnew.p.count++;
1369  }
1370  init = vold.p.count;
1371  status = (init < (UT)vold.p.ub);
1372  } else {
1373  status = 0; // no own chunks
1374  }
1375  if (!status) { // try to steal
1376  T while_limit = pr->u.p.parm3;
1377  T while_index = 0;
1378  int idx = (th->th.th_dispatch->th_disp_index - 1) %
1379  __kmp_dispatch_num_buffers; // current loop index
1380  // note: victim thread can potentially execute another loop
1381  KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1382  while ((!status) && (while_limit != ++while_index)) {
1383  dispatch_private_info_template<T> *v;
1384  T remaining;
1385  T victimId = pr->u.p.parm4;
1386  T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1387  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1388  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1389  KMP_DEBUG_ASSERT(v);
1390  while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1391  oldVictimId != victimId) {
1392  victimId = (victimId + 1) % nproc;
1393  v = reinterpret_cast<dispatch_private_info_template<T> *>(
1394  &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1395  KMP_DEBUG_ASSERT(v);
1396  }
1397  if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1398  continue; // try once more (nproc attempts in total)
1399  }
1400  if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1401  kmp_uint32 old = UNUSED;
1402  // try to steal whole range from inactive victim
1403  status = v->steal_flag.compare_exchange_strong(old, THIEF);
1404  if (status) {
1405  // initialize self buffer with victim's whole range of chunks
1406  T id = victimId;
1407  T small_chunk, extras;
1408  small_chunk = nchunks / nproc; // chunks per thread
1409  extras = nchunks % nproc;
1410  init = id * small_chunk + (id < extras ? id : extras);
1411  vnew.p.count = init + 1;
1412  vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1413  // write pair (count, ub) at once atomically
1414 #if KMP_ARCH_X86
1415  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1416 #else
1417  *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1418 #endif
1419  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1420  // no need to initialize other thread invariants: lb, st, etc.
1421 #ifdef KMP_DEBUG
1422  {
1423  char *buff;
1424  // create format specifiers before the debug output
1425  buff = __kmp_str_format(
1426  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1427  "count:%%%s ub:%%%s\n",
1428  traits_t<UT>::spec, traits_t<T>::spec);
1429  KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1430  __kmp_str_free(&buff);
1431  }
1432 #endif
1433  // activate non-empty buffer and let others steal from us
1434  if (pr->u.p.count < (UT)pr->u.p.ub)
1435  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1436  break;
1437  }
1438  }
1439  while (1) { // CAS loop with check if victim still has enough chunks
1440  // many threads may be stealing concurrently from same victim
1441  vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1442  if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1443  vold.p.count >= (UT)vold.p.ub) {
1444  pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1445  break; // no chunks to steal, try next victim
1446  }
1447  vnew.b = vold.b;
1448  remaining = vold.p.ub - vold.p.count;
1449  // try to steal 1/4 of remaining
1450  // TODO: is this heuristic good enough??
1451  if (remaining > 7) {
1452  vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1453  } else {
1454  vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1455  }
1456  KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1457  if (KMP_COMPARE_AND_STORE_REL64(
1458  (volatile kmp_int64 *)&v->u.p.count,
1459  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1460  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1461  // stealing succeeded
1462 #ifdef KMP_DEBUG
1463  {
1464  char *buff;
1465  // create format specifiers before the debug output
1466  buff = __kmp_str_format(
1467  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1468  "count:%%%s ub:%%%s\n",
1469  traits_t<T>::spec, traits_t<T>::spec);
1470  KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1471  __kmp_str_free(&buff);
1472  }
1473 #endif
1474  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1475  vold.p.ub - vnew.p.ub);
1476  status = 1;
1477  pr->u.p.parm4 = victimId; // keep victim id
1478  // now update own count and ub
1479  init = vnew.p.ub;
1480  vold.p.count = init + 1;
1481 #if KMP_ARCH_X86
1482  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1483 #else
1484  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1485 #endif
1486  // activate non-empty buffer and let others steal from us
1487  if (vold.p.count < (UT)vold.p.ub)
1488  KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1489  break;
1490  } // if (check CAS result)
1491  KMP_CPU_PAUSE(); // CAS failed, retry
1492  } // while (try to steal from particular victim)
1493  } // while (search for victim)
1494  } // if (try to find victim and steal)
1495  } // if (4-byte induction variable)
1496  if (!status) {
1497  *p_lb = 0;
1498  *p_ub = 0;
1499  if (p_st != NULL)
1500  *p_st = 0;
1501  } else {
1502  start = pr->u.p.lb;
1503  init *= chunk;
1504  limit = chunk + init - 1;
1505  incr = pr->u.p.st;
1506  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1507 
1508  KMP_DEBUG_ASSERT(init <= trip);
1509  // keep track of done chunks for possible early exit from stealing
1510  // TODO: count executed chunks locally with rare update of shared location
1511  // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1512  if ((last = (limit >= trip)) != 0)
1513  limit = trip;
1514  if (p_st != NULL)
1515  *p_st = incr;
1516 
1517  if (incr == 1) {
1518  *p_lb = start + init;
1519  *p_ub = start + limit;
1520  } else {
1521  *p_lb = start + init * incr;
1522  *p_ub = start + limit * incr;
1523  }
1524  } // if
1525  break;
1526  } // case
1527 #endif // KMP_STATIC_STEAL_ENABLED
1528  case kmp_sch_static_balanced: {
1529  KD_TRACE(
1530  10,
1531  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1532  gtid));
1533  /* check if thread has any iteration to do */
1534  if ((status = !pr->u.p.count) != 0) {
1535  pr->u.p.count = 1;
1536  *p_lb = pr->u.p.lb;
1537  *p_ub = pr->u.p.ub;
1538  last = (pr->u.p.parm1 != 0);
1539  if (p_st != NULL)
1540  *p_st = pr->u.p.st;
1541  } else { /* no iterations to do */
1542  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1543  }
1544  } // case
1545  break;
1546  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1547  merged here */
1548  case kmp_sch_static_chunked: {
1549  T parm1;
1550 
1551  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1552  "kmp_sch_static_[affinity|chunked] case\n",
1553  gtid));
1554  parm1 = pr->u.p.parm1;
1555 
1556  trip = pr->u.p.tc - 1;
1557  init = parm1 * (pr->u.p.count + tid);
1558 
1559  if ((status = (init <= trip)) != 0) {
1560  start = pr->u.p.lb;
1561  incr = pr->u.p.st;
1562  limit = parm1 + init - 1;
1563 
1564  if ((last = (limit >= trip)) != 0)
1565  limit = trip;
1566 
1567  if (p_st != NULL)
1568  *p_st = incr;
1569 
1570  pr->u.p.count += nproc;
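// [Editor's note] Worked example, not part of the original source:
// with chunk parm1=5, nproc=4, tid=2 and tc=100, successive calls yield
// init = 5*(0+2)=10, 5*(4+2)=30, 5*(8+2)=50, ... so this thread gets
// iteration offsets 10..14, 30..34, 50..54, 70..74, 90..94.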
1571 
1572  if (incr == 1) {
1573  *p_lb = start + init;
1574  *p_ub = start + limit;
1575  } else {
1576  *p_lb = start + init * incr;
1577  *p_ub = start + limit * incr;
1578  }
1579 
1580  if (pr->flags.ordered) {
1581  pr->u.p.ordered_lower = init;
1582  pr->u.p.ordered_upper = limit;
1583  } // if
1584  } // if
1585  } // case
1586  break;
1587 
1588  case kmp_sch_dynamic_chunked: {
1589  UT chunk_number;
1590  UT chunk_size = pr->u.p.parm1;
1591  UT nchunks = pr->u.p.parm2;
1592 
1593  KD_TRACE(
1594  100,
1595  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1596  gtid));
1597 
1598  chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1599  status = (chunk_number < nchunks);
1600  if (!status) {
1601  *p_lb = 0;
1602  *p_ub = 0;
1603  if (p_st != NULL)
1604  *p_st = 0;
1605  } else {
1606  init = chunk_size * chunk_number;
1607  trip = pr->u.p.tc - 1;
1608  start = pr->u.p.lb;
1609  incr = pr->u.p.st;
1610 
1611  if ((last = (trip - init < (UT)chunk_size)))
1612  limit = trip;
1613  else
1614  limit = chunk_size + init - 1;
1615 
1616  if (p_st != NULL)
1617  *p_st = incr;
1618 
1619  if (incr == 1) {
1620  *p_lb = start + init;
1621  *p_ub = start + limit;
1622  } else {
1623  *p_lb = start + init * incr;
1624  *p_ub = start + limit * incr;
1625  }
1626 
1627  if (pr->flags.ordered) {
1628  pr->u.p.ordered_lower = init;
1629  pr->u.p.ordered_upper = limit;
1630  } // if
1631  } // if
1632  } // case
1633  break;
1634 
1635  case kmp_sch_guided_iterative_chunked: {
1636  T chunkspec = pr->u.p.parm1;
1637  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1638  "iterative case\n",
1639  gtid));
1640  trip = pr->u.p.tc;
1641  // Start atomic part of calculations
1642  while (1) {
1643  ST remaining; // signed, because can be < 0
1644  init = sh->u.s.iteration; // shared value
1645  remaining = trip - init;
1646  if (remaining <= 0) { // AC: need to compare with 0 first
1647  // nothing to do, don't try atomic op
1648  status = 0;
1649  break;
1650  }
1651  if ((T)remaining <
1652  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1653  // use dynamic-style schedule
1654  // atomically increment iterations, get old value
1655  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1656  (ST)chunkspec);
1657  remaining = trip - init;
1658  if (remaining <= 0) {
1659  status = 0; // all iterations got by other threads
1660  } else {
1661  // got some iterations to work on
1662  status = 1;
1663  if ((T)remaining > chunkspec) {
1664  limit = init + chunkspec - 1;
1665  } else {
1666  last = true; // the last chunk
1667  limit = init + remaining - 1;
1668  } // if
1669  } // if
1670  break;
1671  } // if
1672  limit = init + (UT)((double)remaining *
1673  *(double *)&pr->u.p.parm3); // divide by K*nproc
1674  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1675  (ST)init, (ST)limit)) {
1676  // CAS was successful, chunk obtained
1677  status = 1;
1678  --limit;
1679  break;
1680  } // if
1681  } // while
1682  if (status != 0) {
1683  start = pr->u.p.lb;
1684  incr = pr->u.p.st;
1685  if (p_st != NULL)
1686  *p_st = incr;
1687  *p_lb = start + init * incr;
1688  *p_ub = start + limit * incr;
1689  if (pr->flags.ordered) {
1690  pr->u.p.ordered_lower = init;
1691  pr->u.p.ordered_upper = limit;
1692  } // if
1693  } else {
1694  *p_lb = 0;
1695  *p_ub = 0;
1696  if (p_st != NULL)
1697  *p_st = 0;
1698  } // if
1699  } // case
1700  break;
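  // Guided (iterative) sizing: while plenty of work remains, each claim takes
  // a fraction of the remaining iterations, limit = init + remaining * parm3
  // - 1, where parm3 holds the ratio precomputed at init (roughly
  // 1/(K*nproc), K = 2 by default, per the comments above). Once remaining
  // drops below parm2 the code falls back to plain dynamic chunks of size
  // chunkspec.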
1701 
1702  case kmp_sch_guided_simd: {
1703  // same as iterative but curr-chunk adjusted to be multiple of given
1704  // chunk
1705  T chunk = pr->u.p.parm1;
1706  KD_TRACE(100,
1707  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1708  gtid));
1709  trip = pr->u.p.tc;
1710  // Start atomic part of calculations
1711  while (1) {
1712  ST remaining; // signed, because can be < 0
1713  init = sh->u.s.iteration; // shared value
1714  remaining = trip - init;
1715  if (remaining <= 0) { // AC: need to compare with 0 first
1716  status = 0; // nothing to do, don't try atomic op
1717  break;
1718  }
1719  KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1720  // compare with K*nproc*(chunk+1), K=2 by default
1721  if ((T)remaining < pr->u.p.parm2) {
1722  // use dynamic-style schedule
1723  // atomically increment iterations, get old value
1724  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1725  (ST)chunk);
1726  remaining = trip - init;
1727  if (remaining <= 0) {
1728  status = 0; // all iterations got by other threads
1729  } else {
1730  // got some iterations to work on
1731  status = 1;
1732  if ((T)remaining > chunk) {
1733  limit = init + chunk - 1;
1734  } else {
1735  last = true; // the last chunk
1736  limit = init + remaining - 1;
1737  } // if
1738  } // if
1739  break;
1740  } // if
1741  // divide by K*nproc
1742  UT span;
1743  __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1744  &span);
1745  UT rem = span % chunk;
1746  if (rem) // adjust so that span%chunk == 0
1747  span += chunk - rem;
1748  limit = init + span;
1749  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1750  (ST)init, (ST)limit)) {
1751  // CAS was successful, chunk obtained
1752  status = 1;
1753  --limit;
1754  break;
1755  } // if
1756  } // while
1757  if (status != 0) {
1758  start = pr->u.p.lb;
1759  incr = pr->u.p.st;
1760  if (p_st != NULL)
1761  *p_st = incr;
1762  *p_lb = start + init * incr;
1763  *p_ub = start + limit * incr;
1764  if (pr->flags.ordered) {
1765  pr->u.p.ordered_lower = init;
1766  pr->u.p.ordered_upper = limit;
1767  } // if
1768  } else {
1769  *p_lb = 0;
1770  *p_ub = 0;
1771  if (p_st != NULL)
1772  *p_st = 0;
1773  } // if
1774  } // case
1775  break;
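  // kmp_sch_guided_simd differs from the iterative case only in rounding the
  // computed span up to a multiple of the requested chunk so every claim
  // stays SIMD-aligned: e.g. a computed span of 19 with chunk = 8 is padded
  // to 24.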
1776 
1777  case kmp_sch_guided_analytical_chunked: {
1778  T chunkspec = pr->u.p.parm1;
1779  UT chunkIdx;
1780 #if KMP_USE_X87CONTROL
1781  /* for storing original FPCW value for Windows* OS on
1782  IA-32 architecture 8-byte version */
1783  unsigned int oldFpcw;
1784  unsigned int fpcwSet = 0;
1785 #endif
1786  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1787  "kmp_sch_guided_analytical_chunked case\n",
1788  gtid));
1789 
1790  trip = pr->u.p.tc;
1791 
1792  KMP_DEBUG_ASSERT(nproc > 1);
1793  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1794 
1795  while (1) { /* this while loop is a safeguard against unexpected zero
1796  chunk sizes */
1797  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1798  if (chunkIdx >= (UT)pr->u.p.parm2) {
1799  --trip;
1800  /* use dynamic-style scheduling */
1801  init = chunkIdx * chunkspec + pr->u.p.count;
1802  /* need to verify init > 0 in case of overflow in the above
1803  * calculation */
1804  if ((status = (init > 0 && init <= trip)) != 0) {
1805  limit = init + chunkspec - 1;
1806 
1807  if ((last = (limit >= trip)) != 0)
1808  limit = trip;
1809  }
1810  break;
1811  } else {
1812 /* use exponential-style scheduling */
1813 /* The following check works around the lack of long double precision on
1814  Windows* OS, which can have the effect that init != 0 for chunkIdx == 0.
1816  */
1817 #if KMP_USE_X87CONTROL
1818  /* If we haven't already done so, save original
1819  FPCW and set precision to 64-bit, as Windows* OS
1820  on IA-32 architecture defaults to 53-bit */
1821  if (!fpcwSet) {
1822  oldFpcw = _control87(0, 0);
1823  _control87(_PC_64, _MCW_PC);
1824  fpcwSet = 0x30000;
1825  }
1826 #endif
1827  if (chunkIdx) {
1828  init = __kmp_dispatch_guided_remaining<T>(
1829  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1830  KMP_DEBUG_ASSERT(init);
1831  init = trip - init;
1832  } else
1833  init = 0;
1834  limit = trip - __kmp_dispatch_guided_remaining<T>(
1835  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1836  KMP_ASSERT(init <= limit);
1837  if (init < limit) {
1838  KMP_DEBUG_ASSERT(limit <= trip);
1839  --limit;
1840  status = 1;
1841  break;
1842  } // if
1843  } // if
1844  } // while (1)
1845 #if KMP_USE_X87CONTROL
1846  /* restore FPCW if necessary
1847  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1848  */
1849  if (fpcwSet && (oldFpcw & fpcwSet))
1850  _control87(oldFpcw, _MCW_PC);
1851 #endif
1852  if (status != 0) {
1853  start = pr->u.p.lb;
1854  incr = pr->u.p.st;
1855  if (p_st != NULL)
1856  *p_st = incr;
1857  *p_lb = start + init * incr;
1858  *p_ub = start + limit * incr;
1859  if (pr->flags.ordered) {
1860  pr->u.p.ordered_lower = init;
1861  pr->u.p.ordered_upper = limit;
1862  }
1863  } else {
1864  *p_lb = 0;
1865  *p_ub = 0;
1866  if (p_st != NULL)
1867  *p_st = 0;
1868  }
1869  } // case
1870  break;
1871 
1872  case kmp_sch_trapezoidal: {
1873  UT index;
1874  T parm2 = pr->u.p.parm2;
1875  T parm3 = pr->u.p.parm3;
1876  T parm4 = pr->u.p.parm4;
1877  KD_TRACE(100,
1878  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1879  gtid));
1880 
1881  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1882 
1883  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1884  trip = pr->u.p.tc - 1;
1885 
1886  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1887  *p_lb = 0;
1888  *p_ub = 0;
1889  if (p_st != NULL)
1890  *p_st = 0;
1891  } else {
1892  start = pr->u.p.lb;
1893  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1894  incr = pr->u.p.st;
1895 
1896  if ((last = (limit >= trip)) != 0)
1897  limit = trip;
1898 
1899  if (p_st != NULL)
1900  *p_st = incr;
1901 
1902  if (incr == 1) {
1903  *p_lb = start + init;
1904  *p_ub = start + limit;
1905  } else {
1906  *p_lb = start + init * incr;
1907  *p_ub = start + limit * incr;
1908  }
1909 
1910  if (pr->flags.ordered) {
1911  pr->u.p.ordered_lower = init;
1912  pr->u.p.ordered_upper = limit;
1913  } // if
1914  } // if
1915  } // case
1916  break;
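  // Trapezoidal chunk sizes form a decreasing arithmetic sequence: chunk k
  // (0-based) contains parm2 - k * parm4 iterations, so the prefix sum
  //   init = index * (2 * parm2 - (index - 1) * parm4) / 2
  // is the first iteration of chunk "index". For example, with parm2 = 10 and
  // parm4 = 2 the chunks are 10, 8, 6, ... and chunk 2 starts at init = 18
  // and ends at limit = 23.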
1917  default: {
1918  status = 0; // to avoid complaints on uninitialized variable use
1919  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1920  KMP_HNT(GetNewerLibrary), // Hint
1921  __kmp_msg_null // Variadic argument list terminator
1922  );
1923  } break;
1924  } // switch
1925  if (p_last)
1926  *p_last = last;
1927 #ifdef KMP_DEBUG
1928  if (pr->flags.ordered) {
1929  char *buff;
1930  // create format specifiers before the debug output
1931  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1932  "ordered_lower:%%%s ordered_upper:%%%s\n",
1933  traits_t<UT>::spec, traits_t<UT>::spec);
1934  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1935  __kmp_str_free(&buff);
1936  }
1937  {
1938  char *buff;
1939  // create format specifiers before the debug output
1940  buff = __kmp_str_format(
1941  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1942  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1943  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1944  KMP_DEBUG_ASSERT(p_last);
1945  KMP_DEBUG_ASSERT(p_st);
1946  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1947  __kmp_str_free(&buff);
1948  }
1949 #endif
1950  return status;
1951 }
1952 
1953 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1954  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1955  is not called. */
1956 #if OMPT_SUPPORT && OMPT_OPTIONAL
1957 #define OMPT_LOOP_END \
1958  if (status == 0) { \
1959  if (ompt_enabled.ompt_callback_work) { \
1960  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1961  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1962  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1963  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1964  &(task_info->task_data), 0, codeptr); \
1965  } \
1966  }
1967 // TODO: implement count
1968 #else
1969 #define OMPT_LOOP_END // no-op
1970 #endif
1971 
1972 #if KMP_STATS_ENABLED
1973 #define KMP_STATS_LOOP_END \
1974  { \
1975  kmp_int64 u, l, t, i; \
1976  l = (kmp_int64)(*p_lb); \
1977  u = (kmp_int64)(*p_ub); \
1978  i = (kmp_int64)(pr->u.p.st); \
1979  if (status == 0) { \
1980  t = 0; \
1981  KMP_POP_PARTITIONED_TIMER(); \
1982  } else if (i == 1) { \
1983  if (u >= l) \
1984  t = u - l + 1; \
1985  else \
1986  t = 0; \
1987  } else if (i < 0) { \
1988  if (l >= u) \
1989  t = (l - u) / (-i) + 1; \
1990  else \
1991  t = 0; \
1992  } else { \
1993  if (u >= l) \
1994  t = (u - l) / i + 1; \
1995  else \
1996  t = 0; \
1997  } \
1998  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1999  }
2000 #else
2001 #define KMP_STATS_LOOP_END /* Nothing */
2002 #endif
2003 
2004 template <typename T>
2005 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2006  T *p_lb, T *p_ub,
2007  typename traits_t<T>::signed_t *p_st
2008 #if OMPT_SUPPORT && OMPT_OPTIONAL
2009  ,
2010  void *codeptr
2011 #endif
2012 ) {
2013 
2014  typedef typename traits_t<T>::unsigned_t UT;
2015  typedef typename traits_t<T>::signed_t ST;
2016  // This is potentially slightly misleading: schedule(runtime) will appear
2017  // here even if the actual runtime schedule is static. (Which points out a
2018  // disadvantage of schedule(runtime): even when static scheduling is used,
2019  // it costs more than a compile-time choice of static scheduling would.)
2020  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2021 
2022  int status;
2023  dispatch_private_info_template<T> *pr;
2024  __kmp_assert_valid_gtid(gtid);
2025  kmp_info_t *th = __kmp_threads[gtid];
2026  kmp_team_t *team = th->th.th_team;
2027 
2028  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2029  KD_TRACE(
2030  1000,
2031  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2032  gtid, p_lb, p_ub, p_st, p_last));
2033 
2034  if (team->t.t_serialized) {
2035  /* NOTE: serialize this dispatch because we are not at the active level */
2036  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2037  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2038  KMP_DEBUG_ASSERT(pr);
2039 
2040  if ((status = (pr->u.p.tc != 0)) == 0) {
2041  *p_lb = 0;
2042  *p_ub = 0;
2043  // if ( p_last != NULL )
2044  // *p_last = 0;
2045  if (p_st != NULL)
2046  *p_st = 0;
2047  if (__kmp_env_consistency_check) {
2048  if (pr->pushed_ws != ct_none) {
2049  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2050  }
2051  }
2052  } else if (pr->flags.nomerge) {
2053  kmp_int32 last;
2054  T start;
2055  UT limit, trip, init;
2056  ST incr;
2057  T chunk = pr->u.p.parm1;
2058 
2059  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2060  gtid));
2061 
2062  init = chunk * pr->u.p.count++;
2063  trip = pr->u.p.tc - 1;
2064 
2065  if ((status = (init <= trip)) == 0) {
2066  *p_lb = 0;
2067  *p_ub = 0;
2068  // if ( p_last != NULL )
2069  // *p_last = 0;
2070  if (p_st != NULL)
2071  *p_st = 0;
2072  if (__kmp_env_consistency_check) {
2073  if (pr->pushed_ws != ct_none) {
2074  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2075  }
2076  }
2077  } else {
2078  start = pr->u.p.lb;
2079  limit = chunk + init - 1;
2080  incr = pr->u.p.st;
2081 
2082  if ((last = (limit >= trip)) != 0) {
2083  limit = trip;
2084 #if KMP_OS_WINDOWS
2085  pr->u.p.last_upper = pr->u.p.ub;
2086 #endif /* KMP_OS_WINDOWS */
2087  }
2088  if (p_last != NULL)
2089  *p_last = last;
2090  if (p_st != NULL)
2091  *p_st = incr;
2092  if (incr == 1) {
2093  *p_lb = start + init;
2094  *p_ub = start + limit;
2095  } else {
2096  *p_lb = start + init * incr;
2097  *p_ub = start + limit * incr;
2098  }
2099 
2100  if (pr->flags.ordered) {
2101  pr->u.p.ordered_lower = init;
2102  pr->u.p.ordered_upper = limit;
2103 #ifdef KMP_DEBUG
2104  {
2105  char *buff;
2106  // create format specifiers before the debug output
2107  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2108  "ordered_lower:%%%s ordered_upper:%%%s\n",
2109  traits_t<UT>::spec, traits_t<UT>::spec);
2110  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2111  pr->u.p.ordered_upper));
2112  __kmp_str_free(&buff);
2113  }
2114 #endif
2115  } // if
2116  } // if
2117  } else {
2118  pr->u.p.tc = 0;
2119  *p_lb = pr->u.p.lb;
2120  *p_ub = pr->u.p.ub;
2121 #if KMP_OS_WINDOWS
2122  pr->u.p.last_upper = *p_ub;
2123 #endif /* KMP_OS_WINDOWS */
2124  if (p_last != NULL)
2125  *p_last = TRUE;
2126  if (p_st != NULL)
2127  *p_st = pr->u.p.st;
2128  } // if
2129 #ifdef KMP_DEBUG
2130  {
2131  char *buff;
2132  // create format specifiers before the debug output
2133  buff = __kmp_str_format(
2134  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2135  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2136  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2137  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2138  (p_last ? *p_last : 0), status));
2139  __kmp_str_free(&buff);
2140  }
2141 #endif
2142 #if INCLUDE_SSC_MARKS
2143  SSC_MARK_DISPATCH_NEXT();
2144 #endif
2145  OMPT_LOOP_END;
2146  KMP_STATS_LOOP_END;
2147  return status;
2148  } else {
2149  kmp_int32 last = 0;
2150  dispatch_shared_info_template<T> volatile *sh;
2151 
2152  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2153  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2154 
2155  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2156  th->th.th_dispatch->th_dispatch_pr_current);
2157  KMP_DEBUG_ASSERT(pr);
2158  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2159  th->th.th_dispatch->th_dispatch_sh_current);
2160  KMP_DEBUG_ASSERT(sh);
2161 
2162 #if KMP_USE_HIER_SCHED
2163  if (pr->flags.use_hier)
2164  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2165  else
2166 #endif // KMP_USE_HIER_SCHED
2167  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2168  p_st, th->th.th_team_nproc,
2169  th->th.th_info.ds.ds_tid);
2170  // status == 0: no more iterations to execute
2171  if (status == 0) {
2172  ST num_done;
2173  num_done = test_then_inc<ST>(&sh->u.s.num_done);
2174 #ifdef KMP_DEBUG
2175  {
2176  char *buff;
2177  // create format specifiers before the debug output
2178  buff = __kmp_str_format(
2179  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2180  traits_t<ST>::spec);
2181  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2182  __kmp_str_free(&buff);
2183  }
2184 #endif
2185 
2186 #if KMP_USE_HIER_SCHED
2187  pr->flags.use_hier = FALSE;
2188 #endif
2189  if (num_done == th->th.th_team_nproc - 1) {
2190 #if KMP_STATIC_STEAL_ENABLED
2191  if (pr->schedule == kmp_sch_static_steal) {
2192  int i;
2193  int idx = (th->th.th_dispatch->th_disp_index - 1) %
2194  __kmp_dispatch_num_buffers; // current loop index
2195  // loop complete, safe to destroy locks used for stealing
2196  for (i = 0; i < th->th.th_team_nproc; ++i) {
2197  dispatch_private_info_template<T> *buf =
2198  reinterpret_cast<dispatch_private_info_template<T> *>(
2199  &team->t.t_dispatch[i].th_disp_buffer[idx]);
2200  KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2201  KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2202  if (traits_t<T>::type_size > 4) {
2203  // destroy locks used for stealing
2204  kmp_lock_t *lck = buf->u.p.steal_lock;
2205  KMP_ASSERT(lck != NULL);
2206  __kmp_destroy_lock(lck);
2207  __kmp_free(lck);
2208  buf->u.p.steal_lock = NULL;
2209  }
2210  }
2211  }
2212 #endif
2213  /* NOTE: release shared buffer to be reused */
2214 
2215  KMP_MB(); /* Flush all pending memory write invalidates. */
2216 
2217  sh->u.s.num_done = 0;
2218  sh->u.s.iteration = 0;
2219 
2220  /* TODO replace with general release procedure? */
2221  if (pr->flags.ordered) {
2222  sh->u.s.ordered_iteration = 0;
2223  }
2224 
2225  sh->buffer_index += __kmp_dispatch_num_buffers;
2226  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2227  gtid, sh->buffer_index));
2228 
2229  KMP_MB(); /* Flush all pending memory write invalidates. */
2230 
2231  } // if
2232  if (__kmp_env_consistency_check) {
2233  if (pr->pushed_ws != ct_none) {
2234  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2235  }
2236  }
2237 
2238  th->th.th_dispatch->th_deo_fcn = NULL;
2239  th->th.th_dispatch->th_dxo_fcn = NULL;
2240  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2241  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2242  } // if (status == 0)
2243 #if KMP_OS_WINDOWS
2244  else if (last) {
2245  pr->u.p.last_upper = pr->u.p.ub;
2246  }
2247 #endif /* KMP_OS_WINDOWS */
2248  if (p_last != NULL && status != 0)
2249  *p_last = last;
2250  } // if
2251 
2252 #ifdef KMP_DEBUG
2253  {
2254  char *buff;
2255  // create format specifiers before the debug output
2256  buff = __kmp_str_format(
2257  "__kmp_dispatch_next: T#%%d normal case: "
2258  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2259  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2260  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2261  (p_last ? *p_last : 0), status));
2262  __kmp_str_free(&buff);
2263  }
2264 #endif
2265 #if INCLUDE_SSC_MARKS
2266  SSC_MARK_DISPATCH_NEXT();
2267 #endif
2268  OMPT_LOOP_END;
2269  KMP_STATS_LOOP_END;
2270  return status;
2271 }
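// Summary of the __kmp_dispatch_next contract as implemented above: a nonzero
// return means one more chunk is available in [*p_lb, *p_ub] with stride
// *p_st (and *p_last is set when that chunk contains the sequentially last
// iteration); a zero return means this thread has no further work, and once
// every thread has finished, the shared dispatch buffer is reset and its
// buffer_index advanced for reuse by a later loop.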
2272 
2273 template <typename T>
2274 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2275  kmp_int32 *plastiter, T *plower, T *pupper,
2276  typename traits_t<T>::signed_t incr) {
2277  typedef typename traits_t<T>::unsigned_t UT;
2278  kmp_uint32 team_id;
2279  kmp_uint32 nteams;
2280  UT trip_count;
2281  kmp_team_t *team;
2282  kmp_info_t *th;
2283 
2284  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2285  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2286 #ifdef KMP_DEBUG
2287  typedef typename traits_t<T>::signed_t ST;
2288  {
2289  char *buff;
2290  // create format specifiers before the debug output
2291  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2292  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2293  traits_t<T>::spec, traits_t<T>::spec,
2294  traits_t<ST>::spec, traits_t<T>::spec);
2295  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2296  __kmp_str_free(&buff);
2297  }
2298 #endif
2299 
2300  if (__kmp_env_consistency_check) {
2301  if (incr == 0) {
2302  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2303  loc);
2304  }
2305  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2306  // The loop is illegal.
2307  // Some zero-trip loops are maintained by the compiler, e.g.:
2308  // for(i=10;i<0;++i) // lower >= upper - run-time check
2309  // for(i=0;i>10;--i) // lower <= upper - run-time check
2310  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2311  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2312  // Compiler does not check the following illegal loops:
2313  // for(i=0;i<10;i+=incr) // where incr<0
2314  // for(i=10;i>0;i-=incr) // where incr<0
2315  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2316  }
2317  }
2318  __kmp_assert_valid_gtid(gtid);
2319  th = __kmp_threads[gtid];
2320  team = th->th.th_team;
2321  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2322  nteams = th->th.th_teams_size.nteams;
2323  team_id = team->t.t_master_tid;
2324  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2325 
2326  // compute global trip count
2327  if (incr == 1) {
2328  trip_count = *pupper - *plower + 1;
2329  } else if (incr == -1) {
2330  trip_count = *plower - *pupper + 1;
2331  } else if (incr > 0) {
2332  // upper-lower can exceed the limit of signed type
2333  trip_count = (UT)(*pupper - *plower) / incr + 1;
2334  } else {
2335  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2336  }
2337 
2338  if (trip_count <= nteams) {
2339  KMP_DEBUG_ASSERT(
2340  __kmp_static == kmp_sch_static_greedy ||
2341  __kmp_static ==
2342  kmp_sch_static_balanced); // Unknown static scheduling type.
2343  // only some teams get a single iteration, others get nothing
2344  if (team_id < trip_count) {
2345  *pupper = *plower = *plower + team_id * incr;
2346  } else {
2347  *plower = *pupper + incr; // zero-trip loop
2348  }
2349  if (plastiter != NULL)
2350  *plastiter = (team_id == trip_count - 1);
2351  } else {
2352  if (__kmp_static == kmp_sch_static_balanced) {
2353  UT chunk = trip_count / nteams;
2354  UT extras = trip_count % nteams;
2355  *plower +=
2356  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2357  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2358  if (plastiter != NULL)
2359  *plastiter = (team_id == nteams - 1);
2360  } else {
2361  T chunk_inc_count =
2362  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2363  T upper = *pupper;
2364  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2365  // Unknown static scheduling type.
2366  *plower += team_id * chunk_inc_count;
2367  *pupper = *plower + chunk_inc_count - incr;
2368  // Check/correct bounds if needed
2369  if (incr > 0) {
2370  if (*pupper < *plower)
2371  *pupper = traits_t<T>::max_value;
2372  if (plastiter != NULL)
2373  *plastiter = *plower <= upper && *pupper > upper - incr;
2374  if (*pupper > upper)
2375  *pupper = upper; // tracker C73258
2376  } else {
2377  if (*pupper > *plower)
2378  *pupper = traits_t<T>::min_value;
2379  if (plastiter != NULL)
2380  *plastiter = *plower >= upper && *pupper < upper - incr;
2381  if (*pupper < upper)
2382  *pupper = upper; // tracker C73258
2383  }
2384  }
2385  }
2386 }
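// Example of the balanced split above, assuming incr = 1: with
// trip_count = 10 and nteams = 4, chunk = 2 and extras = 2, so teams 0..3
// receive 3, 3, 2 and 2 iterations respectively and only team 3 reports
// plastiter.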
2387 
2388 //-----------------------------------------------------------------------------
2389 // Dispatch routines
2390 // Transfer call to template< type T >
2391 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2392 // T lb, T ub, ST st, ST chunk )
2393 extern "C" {
2394 
2411 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2412  enum sched_type schedule, kmp_int32 lb,
2413  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2414  KMP_DEBUG_ASSERT(__kmp_init_serial);
2415 #if OMPT_SUPPORT && OMPT_OPTIONAL
2416  OMPT_STORE_RETURN_ADDRESS(gtid);
2417 #endif
2418  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2419 }
2423 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2424  enum sched_type schedule, kmp_uint32 lb,
2425  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2426  KMP_DEBUG_ASSERT(__kmp_init_serial);
2427 #if OMPT_SUPPORT && OMPT_OPTIONAL
2428  OMPT_STORE_RETURN_ADDRESS(gtid);
2429 #endif
2430  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2431 }
2432 
2436 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2437  enum sched_type schedule, kmp_int64 lb,
2438  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2439  KMP_DEBUG_ASSERT(__kmp_init_serial);
2440 #if OMPT_SUPPORT && OMPT_OPTIONAL
2441  OMPT_STORE_RETURN_ADDRESS(gtid);
2442 #endif
2443  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2444 }
2445 
2449 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2450  enum sched_type schedule, kmp_uint64 lb,
2451  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2452  KMP_DEBUG_ASSERT(__kmp_init_serial);
2453 #if OMPT_SUPPORT && OMPT_OPTIONAL
2454  OMPT_STORE_RETURN_ADDRESS(gtid);
2455 #endif
2456  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2457 }
2458 
2468 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2469  enum sched_type schedule, kmp_int32 *p_last,
2470  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2471  kmp_int32 chunk) {
2472  KMP_DEBUG_ASSERT(__kmp_init_serial);
2473 #if OMPT_SUPPORT && OMPT_OPTIONAL
2474  OMPT_STORE_RETURN_ADDRESS(gtid);
2475 #endif
2476  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2477  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2478 }
2479 
2480 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2481  enum sched_type schedule, kmp_int32 *p_last,
2482  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2483  kmp_int32 chunk) {
2484  KMP_DEBUG_ASSERT(__kmp_init_serial);
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486  OMPT_STORE_RETURN_ADDRESS(gtid);
2487 #endif
2488  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2489  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2490 }
2491 
2492 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2493  enum sched_type schedule, kmp_int32 *p_last,
2494  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2495  kmp_int64 chunk) {
2496  KMP_DEBUG_ASSERT(__kmp_init_serial);
2497 #if OMPT_SUPPORT && OMPT_OPTIONAL
2498  OMPT_STORE_RETURN_ADDRESS(gtid);
2499 #endif
2500  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2501  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2502 }
2503 
2504 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2505  enum sched_type schedule, kmp_int32 *p_last,
2506  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2507  kmp_int64 chunk) {
2508  KMP_DEBUG_ASSERT(__kmp_init_serial);
2509 #if OMPT_SUPPORT && OMPT_OPTIONAL
2510  OMPT_STORE_RETURN_ADDRESS(gtid);
2511 #endif
2512  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2513  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2514 }
2515 
2529 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2530  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2531 #if OMPT_SUPPORT && OMPT_OPTIONAL
2532  OMPT_STORE_RETURN_ADDRESS(gtid);
2533 #endif
2534  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2535 #if OMPT_SUPPORT && OMPT_OPTIONAL
2536  ,
2537  OMPT_LOAD_RETURN_ADDRESS(gtid)
2538 #endif
2539  );
2540 }
2541 
2545 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2546  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2547  kmp_int32 *p_st) {
2548 #if OMPT_SUPPORT && OMPT_OPTIONAL
2549  OMPT_STORE_RETURN_ADDRESS(gtid);
2550 #endif
2551  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2552 #if OMPT_SUPPORT && OMPT_OPTIONAL
2553  ,
2554  OMPT_LOAD_RETURN_ADDRESS(gtid)
2555 #endif
2556  );
2557 }
2558 
2562 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2563  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2564 #if OMPT_SUPPORT && OMPT_OPTIONAL
2565  OMPT_STORE_RETURN_ADDRESS(gtid);
2566 #endif
2567  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2568 #if OMPT_SUPPORT && OMPT_OPTIONAL
2569  ,
2570  OMPT_LOAD_RETURN_ADDRESS(gtid)
2571 #endif
2572  );
2573 }
2574 
2578 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2579  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2580  kmp_int64 *p_st) {
2581 #if OMPT_SUPPORT && OMPT_OPTIONAL
2582  OMPT_STORE_RETURN_ADDRESS(gtid);
2583 #endif
2584  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2585 #if OMPT_SUPPORT && OMPT_OPTIONAL
2586  ,
2587  OMPT_LOAD_RETURN_ADDRESS(gtid)
2588 #endif
2589  );
2590 }
2591 
2598 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2599  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2600 }
2601 
2605 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2606  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2607 }
2608 
2612 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2613  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2614 }
2615 
2619 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2620  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2621 }
2624 //-----------------------------------------------------------------------------
2625 // Non-template routines from kmp_dispatch.cpp used in other sources
2626 
2627 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2628  return value == checker;
2629 }
2630 
2631 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2632  return value != checker;
2633 }
2634 
2635 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2636  return value < checker;
2637 }
2638 
2639 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2640  return value >= checker;
2641 }
2642 
2643 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2644  return value <= checker;
2645 }
2646 
2647 kmp_uint32
2648 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2649  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2650  void *obj // Higher-level synchronization object, or NULL.
2651 ) {
2652  // note: we may not belong to a team at this point
2653  volatile kmp_uint32 *spin = spinner;
2654  kmp_uint32 check = checker;
2655  kmp_uint32 spins;
2656  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2657  kmp_uint32 r;
2658 
2659  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2660  KMP_INIT_YIELD(spins);
2661  // main wait spin loop
2662  while (!f(r = TCR_4(*spin), check)) {
2663  KMP_FSYNC_SPIN_PREPARE(obj);
2664  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2665  split. It causes problems with infinite recursion because of exit lock */
2666  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2667  __kmp_abort_thread(); */
2668  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2669  }
2670  KMP_FSYNC_SPIN_ACQUIRED(obj);
2671  return r;
2672 }
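/* Illustrative sketch (not part of the runtime): spin until another thread
   stores the value 1 into *flag, using the 4-byte equality predicate defined
   above. The return value of __kmp_wait_4 is the flag contents observed when
   the predicate became true. */
static inline void __kmp_example_wait_for_release(volatile kmp_uint32 *flag) {
  (void)__kmp_wait_4(flag, 1, __kmp_eq_4, NULL);
}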
2673 
2674 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2675  kmp_uint32 (*pred)(void *, kmp_uint32),
2676  void *obj // Higher-level synchronization object, or NULL.
2677 ) {
2678  // note: we may not belong to a team at this point
2679  void *spin = spinner;
2680  kmp_uint32 check = checker;
2681  kmp_uint32 spins;
2682  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2683 
2684  KMP_FSYNC_SPIN_INIT(obj, spin);
2685  KMP_INIT_YIELD(spins);
2686  // main wait spin loop
2687  while (!f(spin, check)) {
2688  KMP_FSYNC_SPIN_PREPARE(obj);
2689  /* if we have waited a bit, or are oversubscribed, yield */
2690  /* pause is in the following code */
2691  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2692  }
2693  KMP_FSYNC_SPIN_ACQUIRED(obj);
2694 }
2695 
2696 } // extern "C"
2697 
2698 #ifdef KMP_GOMP_COMPAT
2699 
2700 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2701  enum sched_type schedule, kmp_int32 lb,
2702  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2703  int push_ws) {
2704  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2705  push_ws);
2706 }
2707 
2708 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2709  enum sched_type schedule, kmp_uint32 lb,
2710  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2711  int push_ws) {
2712  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2713  push_ws);
2714 }
2715 
2716 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2717  enum sched_type schedule, kmp_int64 lb,
2718  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2719  int push_ws) {
2720  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2721  push_ws);
2722 }
2723 
2724 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2725  enum sched_type schedule, kmp_uint64 lb,
2726  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2727  int push_ws) {
2728  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2729  push_ws);
2730 }
2731 
2732 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2733  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2734 }
2735 
2736 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2737  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2738 }
2739 
2740 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2741  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2742 }
2743 
2744 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2745  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2746 }
2747 
2748 #endif /* KMP_GOMP_COMPAT */
2749 
2750 /* ------------------------------------------------------------------------ */