LLVM OpenMP* Runtime Library
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21  Accumulates number of samples and computes min, max, mean, standard deviation
22  on the fly.
23 
24  Online variance calculation algorithm from
25  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37  than is useful for application characterisation and are intended for the
38  runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
/*!
 * @brief flags to describe the statistic (timer or counter)
 *
 * Each enumerator is a distinct bit so that flags can be OR-ed together in the
 * flags column of KMP_FOREACH_COUNTER() / KMP_FOREACH_TIMER().
 */
enum stats_flags_e {
  noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
  onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread
  noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
  notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads
  logEvent = 1 << 4 //!< timer flag tested by timeStat::logEvent() and removed
                    //!< from every timer by timeStat::clearEventFlags()
};
57 
/*!
 * @brief the states which a thread can be in
 *
 * Stored per thread in kmp_stats_list::state and changed through
 * KMP_SET_THREAD_STATE() / KMP_SET_THREAD_STATE_BLOCK().
 */
enum stats_state_e {
  IDLE,
  SERIAL_REGION,
  FORK_JOIN_BARRIER,
  PLAIN_BARRIER,
  TASKWAIT,
  TASKYIELD,
  TASKGROUP,
  IMPLICIT_TASK,
  EXPLICIT_TASK,
  TEAMS_REGION
};
75 
94 // clang-format off
/*!
 * X-macro listing every counter. Add new counters under this macro.
 *
 * For each counter, expands to `apply(counter_name, flags, extra)`, where
 * `flags` is 0 or an OR of stats_flags_e values and `extra` is passed through
 * unchanged to every expansion.
 */
#define KMP_FOREACH_COUNTER(apply, extra)                                      \
  apply(OMP_PARALLEL, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, extra) \
  apply(OMP_NESTED_PARALLEL,   0, extra)                                       \
  apply(OMP_LOOP_STATIC,       0, extra)                                       \
  apply(OMP_LOOP_STATIC_STEAL, 0, extra)                                       \
  apply(OMP_LOOP_DYNAMIC,      0, extra)                                       \
  apply(OMP_DISTRIBUTE,        0, extra)                                       \
  apply(OMP_BARRIER,           0, extra)                                       \
  apply(OMP_CRITICAL,          0, extra)                                       \
  apply(OMP_SINGLE,            0, extra)                                       \
  apply(OMP_MASTER,            0, extra)                                       \
  apply(OMP_MASKED,            0, extra)                                       \
  apply(OMP_TEAMS,             0, extra)                                       \
  apply(OMP_set_lock,          0, extra)                                       \
  apply(OMP_test_lock,         0, extra)                                       \
  apply(REDUCE_wait,           0, extra)                                       \
  apply(REDUCE_nowait,         0, extra)                                       \
  apply(OMP_TASKYIELD,         0, extra)                                       \
  apply(OMP_TASKLOOP,          0, extra)                                       \
  apply(TASK_executed,         0, extra)                                       \
  apply(TASK_cancelled,        0, extra)                                       \
  apply(TASK_stolen,           0, extra)
117 // clang-format on
118 
137 // clang-format off
/*!
 * X-macro listing every timer.
 *
 * For each timer, expands to `apply(timer_name, flags, extra)`, where `flags`
 * is 0 or an OR of stats_flags_e values. The list ends by expanding
 * KMP_FOREACH_DEVELOPER_TIMER(), which is empty unless KMP_DEVELOPER_STATS is
 * enabled. Entries flagged noUnits | noTotal are used to record plain values
 * (see KMP_COUNT_VALUE) rather than times.
 */
#define KMP_FOREACH_TIMER(apply, extra)                                        \
  apply(OMP_worker_thread_life,      stats_flags_e::logEvent, extra)           \
  apply(OMP_parallel,                stats_flags_e::logEvent, extra)           \
  apply(OMP_parallel_overhead,       stats_flags_e::logEvent, extra)           \
  apply(OMP_teams,                   stats_flags_e::logEvent, extra)           \
  apply(OMP_teams_overhead,          stats_flags_e::logEvent, extra)           \
  apply(OMP_loop_static,             0, extra)                                 \
  apply(OMP_loop_static_scheduling,  0, extra)                                 \
  apply(OMP_loop_dynamic,            0, extra)                                 \
  apply(OMP_loop_dynamic_scheduling, 0, extra)                                 \
  apply(OMP_distribute,              0, extra)                                 \
  apply(OMP_distribute_scheduling,   0, extra)                                 \
  apply(OMP_critical,                0, extra)                                 \
  apply(OMP_critical_wait,           0, extra)                                 \
  apply(OMP_single,                  0, extra)                                 \
  apply(OMP_master,                  0, extra)                                 \
  apply(OMP_masked,                  0, extra)                                 \
  apply(OMP_task_immediate,          0, extra)                                 \
  apply(OMP_task_taskwait,           0, extra)                                 \
  apply(OMP_task_taskyield,          0, extra)                                 \
  apply(OMP_task_taskgroup,          0, extra)                                 \
  apply(OMP_task_join_bar,           0, extra)                                 \
  apply(OMP_task_plain_bar,          0, extra)                                 \
  apply(OMP_taskloop_scheduling,     0, extra)                                 \
  apply(OMP_plain_barrier,           stats_flags_e::logEvent, extra)           \
  apply(OMP_idle,                    stats_flags_e::logEvent, extra)           \
  apply(OMP_fork_barrier,            stats_flags_e::logEvent, extra)           \
  apply(OMP_join_barrier,            stats_flags_e::logEvent, extra)           \
  apply(OMP_serial,                  stats_flags_e::logEvent, extra)           \
  apply(OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(OMP_loop_static_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(OMP_loop_static_total_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(OMP_loop_dynamic_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(OMP_loop_dynamic_total_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(OMP_distribute_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  KMP_FOREACH_DEVELOPER_TIMER(apply, extra)
182 // clang-format on
183 
184 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
185 // initializing OpenMP or being created by a primary
186 // thread) until the thread is destroyed
187 // OMP_parallel -- Time thread spends executing work directly
188 // within a #pragma omp parallel
189 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
190 // OMP_loop_static -- Time thread spends executing loop iterations from
191 // a statically scheduled loop
192 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
193 // from a statically scheduled loop
194 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
195 // a dynamically scheduled loop
196 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
197 // from a dynamically scheduled loop
198 // OMP_critical -- Time thread spends executing critical section
199 // OMP_critical_wait -- Time thread spends waiting to enter
200 // a critical section
201 // OMP_single -- Time spent executing a "single" region
202 // OMP_master -- Time spent executing a "master" region
203 // OMP_masked -- Time spent executing a "masked" region
204 // OMP_task_immediate -- Time spent executing non-deferred tasks
205 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
206 // construct
207 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
208 // construct
209 // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
210 // construct
211 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
212 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
213 // construct
214 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
215 // construct
216 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
217 // inside implicit barrier at end of worksharing
218 // construct
219 // OMP_idle -- Time worker threads spend waiting for next
220 // parallel region
221 // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
222 // parallel region
223 // OMP_join_barrier -- Time spent in the join barrier surrounding a
224 // parallel region
225 // OMP_serial -- Time thread zero spends executing serial code
226 // OMP_set_numthreads -- Values passed to omp_set_num_threads
227 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
228 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
229 // statically scheduled loops
230 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
231 // dynamically scheduled loops
232 
233 #if (KMP_DEVELOPER_STATS)
234 // Timers which are of interest to runtime library developers, not end users.
235 // These have to be explicitly enabled in addition to the other stats.
236 
237 // KMP_fork_barrier -- time in __kmp_fork_barrier
238 // KMP_join_barrier -- time in __kmp_join_barrier
239 // KMP_barrier -- time in __kmp_barrier
240 // KMP_end_split_barrier -- time in __kmp_end_split_barrier
241 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
242 // KMP_icv_copy -- start/stop timer for any ICV copying
243 // KMP_linear_gather -- time in __kmp_linear_barrier_gather
244 // KMP_linear_release -- time in __kmp_linear_barrier_release
245 // KMP_tree_gather -- time in __kmp_tree_barrier_gather
246 // KMP_tree_release -- time in __kmp_tree_barrier_release
247 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
248 // KMP_hyper_release -- time in __kmp_hyper_barrier_release
249 // KMP_dist_gather -- time in __kmp_dist_barrier_gather
250 // KMP_dist_release -- time in __kmp_dist_barrier_release
251 // clang-format off
/*!
 * X-macro listing the developer-only timers.
 *
 * For each timer, expands to `apply(timer_name, flags, extra)`. This
 * definition is the one used when KMP_DEVELOPER_STATS is non-zero.
 */
#define KMP_FOREACH_DEVELOPER_TIMER(apply, extra)                              \
  apply(KMP_fork_call,         0, extra)                                       \
  apply(KMP_join_call,         0, extra)                                       \
  apply(KMP_end_split_barrier, 0, extra)                                       \
  apply(KMP_hier_gather,       0, extra)                                       \
  apply(KMP_hier_release,      0, extra)                                       \
  apply(KMP_hyper_gather,      0, extra)                                       \
  apply(KMP_hyper_release,     0, extra)                                       \
  apply(KMP_dist_gather,       0, extra)                                       \
  apply(KMP_dist_release,      0, extra)                                       \
  apply(KMP_linear_gather,     0, extra)                                       \
  apply(KMP_linear_release,    0, extra)                                       \
  apply(KMP_tree_gather,       0, extra)                                       \
  apply(KMP_tree_release,      0, extra)                                       \
  apply(USER_resume,           0, extra)                                       \
  apply(USER_suspend,          0, extra)                                       \
  apply(USER_mwait,            0, extra)                                       \
  apply(KMP_allocate_team,     0, extra)                                       \
  apply(KMP_setup_icv_copy,    0, extra)                                       \
  apply(USER_icv_copy,         0, extra)                                       \
  apply(FOR_static_steal_stolen, stats_flags_e::noUnits | stats_flags_e::noTotal, extra) \
  apply(FOR_static_steal_chunks, stats_flags_e::noUnits | stats_flags_e::noTotal, extra)
276 #else
277 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
278 #endif
279 // clang-format on
280 
300 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
301 
302 #define ENUMERATE(name, ignore, prefix) prefix##name,
303 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
304 
305 enum explicit_timer_e {
306  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
307 };
308 
309 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
310 #undef ENUMERATE
311 
312 /*
313  * A logarithmic histogram. It accumulates the number of values in each power of
314  * ten bin. So 1<=x<10, 10<=x<100, ...
315  * Mostly useful where we have some big outliers and want to see information
316  * about them.
317  */
318 class logHistogram {
319  enum {
320  numBins = 31, /* Number of powers of 10. If this changes you need to change
321  * the initializer for binMax */
322 
323  /*
324  * If you want to use this to analyse values that may be less than 1, (for
325  * instance times in s), then the logOffset gives you negative powers.
326  * In our case here, we're just looking at times in ticks, or counts, so we
327  * can never see values with magnitude < 1 (other than zero), so we can set
328  * it to 0. As above change the initializer if you change this.
329  */
330  logOffset = 0
331  };
332  uint32_t KMP_ALIGN_CACHE zeroCount;
333  struct {
334  uint32_t count;
335  double total;
336  } bins[numBins];
337 
338  static double binMax[numBins];
339 
340 #ifdef KMP_DEBUG
341  uint64_t _total;
342 
343  void check() const {
344  uint64_t t = zeroCount;
345  for (int i = 0; i < numBins; i++)
346  t += bins[i].count;
347  KMP_DEBUG_ASSERT(t == _total);
348  }
349 #else
350  void check() const {}
351 #endif
352 
353 public:
354  logHistogram() { reset(); }
355 
356  logHistogram(logHistogram const &o) {
357  for (int i = 0; i < numBins; i++)
358  bins[i] = o.bins[i];
359 #ifdef KMP_DEBUG
360  _total = o._total;
361 #endif
362  }
363 
364  void reset() {
365  zeroCount = 0;
366  for (int i = 0; i < numBins; i++) {
367  bins[i].count = 0;
368  bins[i].total = 0;
369  }
370 
371 #ifdef KMP_DEBUG
372  _total = 0;
373 #endif
374  }
375  uint32_t count(int b) const { return bins[b + logOffset].count; }
376  double total(int b) const { return bins[b + logOffset].total; }
377  static uint32_t findBin(double sample);
378 
379  logHistogram &operator+=(logHistogram const &o) {
380  zeroCount += o.zeroCount;
381  for (int i = 0; i < numBins; i++) {
382  bins[i].count += o.bins[i].count;
383  bins[i].total += o.bins[i].total;
384  }
385 #ifdef KMP_DEBUG
386  _total += o._total;
387  check();
388 #endif
389 
390  return *this;
391  }
392 
393  void addSample(double sample);
394  int minBin() const;
395  int maxBin() const;
396 
397  std::string format(char) const;
398 };
399 
400 class statistic {
401  double KMP_ALIGN_CACHE minVal;
402  double maxVal;
403  double meanVal;
404  double m2;
405  uint64_t sampleCount;
406  double offset;
407  bool collectingHist;
408  logHistogram hist;
409 
410 public:
411  statistic(bool doHist = bool(KMP_STATS_HIST)) {
412  reset();
413  collectingHist = doHist;
414  }
415  statistic(statistic const &o)
416  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
417  sampleCount(o.sampleCount), offset(o.offset),
418  collectingHist(o.collectingHist), hist(o.hist) {}
419  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
420  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
421  sampleCount(sc), offset(0.0), collectingHist(false) {}
422  bool haveHist() const { return collectingHist; }
423  double getMin() const { return minVal; }
424  double getMean() const { return meanVal; }
425  double getMax() const { return maxVal; }
426  uint64_t getCount() const { return sampleCount; }
427  double getSD() const { return sqrt(m2 / sampleCount); }
428  double getTotal() const { return sampleCount * meanVal; }
429  logHistogram const *getHist() const { return &hist; }
430  void setOffset(double d) { offset = d; }
431 
432  void reset() {
433  minVal = (std::numeric_limits<double>::max)();
434  maxVal = -minVal;
435  meanVal = 0.0;
436  m2 = 0.0;
437  sampleCount = 0;
438  offset = 0.0;
439  hist.reset();
440  }
441  void addSample(double sample);
442  void scale(double factor);
443  void scaleDown(double f) { scale(1. / f); }
444  void forceCount(uint64_t count) { sampleCount = count; }
445  statistic &operator+=(statistic const &other);
446 
447  std::string format(char unit, bool total = false) const;
448  std::string formatHist(char unit) const { return hist.format(unit); }
449 };
450 
// Static description of one timer or counter.
struct statInfo {
  const char *name; // name string, returned by timeStat::name / counter::name
  uint32_t flags;   // bit set tested against stats_flags_e values
};
455 
456 class timeStat : public statistic {
457  static statInfo timerInfo[];
458 
459 public:
460  timeStat() : statistic() {}
461  static const char *name(timer_e e) { return timerInfo[e].name; }
462  static bool noTotal(timer_e e) {
463  return timerInfo[e].flags & stats_flags_e::noTotal;
464  }
465  static bool masterOnly(timer_e e) {
466  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
467  }
468  static bool workerOnly(timer_e e) {
469  return timerInfo[e].flags & stats_flags_e::notInMaster;
470  }
471  static bool noUnits(timer_e e) {
472  return timerInfo[e].flags & stats_flags_e::noUnits;
473  }
474  static bool logEvent(timer_e e) {
475  return timerInfo[e].flags & stats_flags_e::logEvent;
476  }
477  static void clearEventFlags() {
478  for (int i = 0; i < TIMER_LAST; i++) {
479  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
480  }
481  }
482 };
483 
484 // Where we need to start and stop the timer explicitly, this version can be
485 // used. Since these timers normally aren't nicely scoped, and so don't have a
486 // good place to live on the stack of the thread, they're more work to use.
487 class explicitTimer {
488  timeStat *stat;
489  timer_e timerEnumValue;
490  tsc_tick_count startTime;
491  tsc_tick_count pauseStartTime;
492  tsc_tick_count::tsc_interval_t totalPauseTime;
493 
494 public:
495  explicitTimer(timeStat *s, timer_e te)
496  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
497  totalPauseTime() {}
498 
499  // void setStat(timeStat *s) { stat = s; }
500  void start(tsc_tick_count tick);
501  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
502  void resume(tsc_tick_count tick) {
503  totalPauseTime += (tick - pauseStartTime);
504  }
505  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
506  void reset() {
507  startTime = 0;
508  pauseStartTime = 0;
509  totalPauseTime = 0;
510  }
511  timer_e get_type() const { return timerEnumValue; }
512 };
513 
514 // Where you need to partition a thread's clock ticks into separate states
515 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
516 // DOING_NOTHING would render these conditions:
517 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
518 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
519 // versa
520 class partitionedTimers {
521 private:
522  std::vector<explicitTimer> timer_stack;
523 
524 public:
525  partitionedTimers();
526  void init(explicitTimer timer);
527  void exchange(explicitTimer timer);
528  void push(explicitTimer timer);
529  void pop();
530  void windup();
531 };
532 
533 // Special wrapper around the partitioned timers to aid timing code blocks.
534 // It avoids the need to have an explicit end; leaving the scope suffices.
535 class blockPartitionedTimer {
536  partitionedTimers *part_timers;
537 
538 public:
539  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
540  : part_timers(pt) {
541  part_timers->push(timer);
542  }
543  ~blockPartitionedTimer() { part_timers->pop(); }
544 };
545 
546 // Special wrapper around the thread state to aid in keeping state in code
547 // blocks. It avoids the need to have an explicit end; leaving the scope
548 // suffices.
549 class blockThreadState {
550  stats_state_e *state_pointer;
551  stats_state_e old_state;
552 
553 public:
554  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
555  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
556  *state_pointer = new_state;
557  }
558  ~blockThreadState() { *state_pointer = old_state; }
559 };
560 
561 // If all you want is a count, then you can use this...
562 // The individual per-thread counts will be aggregated into a statistic at
563 // program exit.
564 class counter {
565  uint64_t value;
566  static const statInfo counterInfo[];
567 
568 public:
569  counter() : value(0) {}
570  void increment() { value++; }
571  uint64_t getValue() const { return value; }
572  void reset() { value = 0; }
573  static const char *name(counter_e e) { return counterInfo[e].name; }
574  static bool masterOnly(counter_e e) {
575  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
576  }
577 };
578 
579 /* ****************************************************************
580  Class to implement an event
581 
582  There are four components to an event: start time, stop time,
583  nest_level, and timer_name.
584  The start and stop time should be obvious (recorded in clock ticks).
585  The nest_level relates to the bar width in the timeline graph.
586  The timer_name is used to determine which timer event triggered this event.
587 
588  the interface to this class is through four read-only operations:
589  1) getStart() -- returns the start time as 64 bit integer
590  2) getStop() -- returns the stop time as 64 bit integer
591  3) getNestLevel() -- returns the nest level of the event
592  4) getTimerName() -- returns the timer name that triggered event
593 
594  *MORE ON NEST_LEVEL*
595  The nest level is used in the bar graph that represents the timeline.
596  Its main purpose is for showing how events are nested inside each other.
597  For example, say events, A, B, and C are recorded. If the timeline
598  looks like this:
599 
600 Begin -------------------------------------------------------------> Time
601  | | | | | |
602  A B C C B A
603  start start start end end end
604 
605  Then A, B, C will have a nest level of 1, 2, 3 respectively.
606  These values are then used to calculate the barwidth so you can
607  see that inside A, B has occurred, and inside B, C has occurred.
608  Currently, this is shown with A's bar width being larger than B's
609  bar width, and B's bar width being larger than C's bar width.
610 
611 **************************************************************** */
612 class kmp_stats_event {
613  uint64_t start;
614  uint64_t stop;
615  int nest_level;
616  timer_e timer_name;
617 
618 public:
619  kmp_stats_event()
620  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
621  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
622  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
623  inline uint64_t getStart() const { return start; }
624  inline uint64_t getStop() const { return stop; }
625  inline int getNestLevel() const { return nest_level; }
626  inline timer_e getTimerName() const { return timer_name; }
627 };
628 
629 /* ****************************************************************
630  Class to implement a dynamically expandable array of events
631 
632  ---------------------------------------------------------
633  | event 1 | event 2 | event 3 | event 4 | ... | event N |
634  ---------------------------------------------------------
635 
636  An event is pushed onto the back of this array at every
637  explicitTimer->stop() call. The event records the thread #,
638  start time, stop time, and nest level related to the bar width.
639 
640  The event vector starts at size INIT_SIZE and grows (doubles in size)
641  if needed. An implication of this behavior is that log(N)
642  reallocations are needed (where N is number of events). If you want
643  to avoid reallocations, then set INIT_SIZE to a large value.
644 
645  the interface to this class is through six operations:
646  1) reset() -- sets the internal_size back to 0 but does not deallocate any
647  memory
648  2) size() -- returns the number of valid elements in the vector
649  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
650  the back of the array
651  4) deallocate() -- frees all memory associated with the vector
652  5) sort() -- sorts the vector by start time
653  6) operator[index] or at(index) -- returns event reference at that index
654 **************************************************************** */
655 class kmp_stats_event_vector {
656  kmp_stats_event *events;
657  int internal_size;
658  int allocated_size;
659  static const int INIT_SIZE = 1024;
660 
661 public:
662  kmp_stats_event_vector() {
663  events =
664  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
665  internal_size = 0;
666  allocated_size = INIT_SIZE;
667  }
668  ~kmp_stats_event_vector() {}
669  inline void reset() { internal_size = 0; }
670  inline int size() const { return internal_size; }
671  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
672  timer_e name) {
673  int i;
674  if (internal_size == allocated_size) {
675  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
676  sizeof(kmp_stats_event) * allocated_size * 2);
677  for (i = 0; i < internal_size; i++)
678  tmp[i] = events[i];
679  __kmp_free(events);
680  events = tmp;
681  allocated_size *= 2;
682  }
683  events[internal_size] =
684  kmp_stats_event(start_time, stop_time, nest_level, name);
685  internal_size++;
686  return;
687  }
688  void deallocate();
689  void sort();
690  const kmp_stats_event &operator[](int index) const { return events[index]; }
691  kmp_stats_event &operator[](int index) { return events[index]; }
692  const kmp_stats_event &at(int index) const { return events[index]; }
693  kmp_stats_event &at(int index) { return events[index]; }
694 };
695 
696 /* ****************************************************************
697  Class to implement a doubly-linked, circular, statistics list
698 
699  |---| ---> |---| ---> |---| ---> |---| ---> ... next
700  | | | | | | | |
701  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
702  Sentinel first second third
703  Node node node node
704 
705  The Sentinel Node is the user handle on the list.
706  The first node corresponds to thread 0's statistics.
707  The second node corresponds to thread 1's statistics and so on...
708 
709  Each node has a _timers, _counters, and _explicitTimers array to hold that
710  thread's statistics. The _explicitTimers point to the correct _timer and
711  update its statistics at every stop() call. The explicitTimers' pointers are
712  set up in the constructor. Each node also has an event vector to hold that
713  thread's timing events. The event vector expands as necessary and records
714  the start-stop times for each timer.
715 
716  The nestLevel variable is for plotting events and is related
717  to the bar width in the timeline graph.
718 
719  Every thread will have a thread local pointer to its node in
720  the list. The sentinel node is used by the primary thread to
721  store "dummy" statistics before __kmp_create_worker() is called.
722 **************************************************************** */
723 class kmp_stats_list {
724  int gtid;
725  timeStat _timers[TIMER_LAST + 1];
726  counter _counters[COUNTER_LAST + 1];
727  explicitTimer thread_life_timer;
728  partitionedTimers _partitionedTimers;
729  int _nestLevel; // one per thread
730  kmp_stats_event_vector _event_vector;
731  kmp_stats_list *next;
732  kmp_stats_list *prev;
733  stats_state_e state;
734  int thread_is_idle_flag;
735 
736 public:
737  kmp_stats_list()
738  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
739  TIMER_OMP_worker_thread_life),
740  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
741  thread_is_idle_flag(0) {}
742  ~kmp_stats_list() {}
743  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
744  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
745  inline partitionedTimers *getPartitionedTimers() {
746  return &_partitionedTimers;
747  }
748  inline timeStat *getTimers() { return _timers; }
749  inline counter *getCounters() { return _counters; }
750  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
751  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
752  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
753  inline void resetEventVector() { _event_vector.reset(); }
754  inline void incrementNestValue() { _nestLevel++; }
755  inline int getNestValue() { return _nestLevel; }
756  inline void decrementNestValue() { _nestLevel--; }
757  inline int getGtid() const { return gtid; }
758  inline void setGtid(int newgtid) { gtid = newgtid; }
759  inline void setState(stats_state_e newstate) { state = newstate; }
760  inline stats_state_e getState() const { return state; }
761  inline stats_state_e *getStatePointer() { return &state; }
762  inline bool isIdle() { return thread_is_idle_flag == 1; }
763  inline void setIdleFlag() { thread_is_idle_flag = 1; }
764  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
765  kmp_stats_list *push_back(int gtid); // returns newly created list node
766  inline void push_event(uint64_t start_time, uint64_t stop_time,
767  int nest_level, timer_e name) {
768  _event_vector.push_back(start_time, stop_time, nest_level, name);
769  }
770  void deallocate();
771  class iterator;
772  kmp_stats_list::iterator begin();
773  kmp_stats_list::iterator end();
774  int size();
775  class iterator {
776  kmp_stats_list *ptr;
777  friend kmp_stats_list::iterator kmp_stats_list::begin();
778  friend kmp_stats_list::iterator kmp_stats_list::end();
779 
780  public:
781  iterator();
782  ~iterator();
783  iterator operator++();
784  iterator operator++(int dummy);
785  iterator operator--();
786  iterator operator--(int dummy);
787  bool operator!=(const iterator &rhs);
788  bool operator==(const iterator &rhs);
789  kmp_stats_list *operator*() const; // dereference operator
790  };
791 };
792 
793 /* ****************************************************************
794  Class to encapsulate all output functions and the environment variables
795 
796  This module holds filenames for various outputs (normal stats, events, plot
797  file), as well as coloring information for the plot file.
798 
799  The filenames and flags variables are read from environment variables.
800  These are read once by the constructor of the global variable
801  __kmp_stats_output which calls init().
802 
803  During this init() call, event flags for the timeStat::timerInfo[] global
804  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
805 
806  The only interface function that is public is outputStats(heading). This
807  function should print out everything it needs to, either to files or stderr,
808  depending on the environment variables described below
809 
810  ENVIRONMENT VARIABLES:
811  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
812  file, otherwise, print to stderr
813  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
814  either KMP_STATS_FILE or stderr
815  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
816  otherwise, the plot file is sent to "events.plt"
817  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
818  events
819  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
820  otherwise, output is sent to "events.dat"
821 **************************************************************** */
822 class kmp_stats_output_module {
823 
824 public:
825  struct rgb_color {
826  float r;
827  float g;
828  float b;
829  };
830 
831 private:
832  std::string outputFileName;
833  static const char *eventsFileName;
834  static const char *plotFileName;
835  static int printPerThreadFlag;
836  static int printPerThreadEventsFlag;
837  static const rgb_color globalColorArray[];
838  static rgb_color timerColorInfo[];
839 
840  void init();
841  static void setupEventColors();
842  static void printPloticusFile();
843  static void printHeaderInfo(FILE *statsOut);
844  static void printTimerStats(FILE *statsOut, statistic const *theStats,
845  statistic const *totalStats);
846  static void printCounterStats(FILE *statsOut, statistic const *theStats);
847  static void printCounters(FILE *statsOut, counter const *theCounters);
848  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
849  int gtid);
850  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
851  static void windupExplicitTimers();
852  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
853 
854 public:
855  kmp_stats_output_module() { init(); }
856  void outputStats(const char *heading);
857 };
858 
859 #ifdef __cplusplus
860 extern "C" {
861 #endif
862 void __kmp_stats_init();
863 void __kmp_stats_fini();
864 void __kmp_reset_stats();
865 void __kmp_output_stats(const char *);
866 void __kmp_accumulate_stats_at_exit(void);
867 // thread local pointer to stats node within list
868 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
869 // head to stats list.
870 extern kmp_stats_list *__kmp_stats_list;
871 // lock for __kmp_stats_list
872 extern kmp_tas_lock_t __kmp_stats_lock;
873 // reference start time
874 extern tsc_tick_count __kmp_stats_start_time;
875 // interface to output
876 extern kmp_stats_output_module __kmp_stats_output;
877 
878 #ifdef __cplusplus
879 }
880 #endif
881 
882 // Simple, standard interfaces that drop out completely if stats aren't enabled
883 
/*!
 * Add `value`, converted to double, as one sample to the calling thread's
 * timer statistic TIMER_<name>.
 *
 * `value` is parenthesized before the cast so that the whole argument
 * expression is evaluated first: with a bare `(double)value`, an argument
 * such as `a / b` would expand to `(double)a / b` and record a different
 * number than the caller wrote.
 */
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))
897 
// Increment the calling thread's counter COUNTER_<counter_name>.
#define KMP_COUNT_BLOCK(counter_name)                                          \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##counter_name)->increment()

// Forward to __kmp_output_stats() with the given heading.
#define KMP_OUTPUT_STATS(heading) __kmp_output_stats(heading)

// Call init() on the calling thread's partitioned timers with an
// explicitTimer bound to TIMER_<timer>.
#define KMP_INIT_PARTITIONED_TIMERS(timer)                                     \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(                        \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##timer),           \
                    TIMER_##timer))

// Declare a blockPartitionedTimer named __PBLOCKTIME__: TIMER_<timer> is
// pushed now and popped when the enclosing scope ends.
#define KMP_TIME_PARTITIONED_BLOCK(timer)                                      \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##timer),           \
                    TIMER_##timer))

// Push an explicitTimer bound to TIMER_<timer> onto the calling thread's
// partitioned timers.
#define KMP_PUSH_PARTITIONED_TIMER(timer)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(                        \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##timer),           \
                    TIMER_##timer))

// Pop the calling thread's partitioned timers.
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

// Call exchange() on the calling thread's partitioned timers with an
// explicitTimer bound to TIMER_<timer>.
#define KMP_EXCHANGE_PARTITIONED_TIMER(timer)                                  \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(                    \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##timer),           \
                    TIMER_##timer))

// Set / read the calling thread's stats_state_e state.
#define KMP_SET_THREAD_STATE(new_state)                                        \
  __kmp_stats_thread_ptr->setState(new_state)

#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

// Declare a blockThreadState named __BTHREADSTATE__: the state is switched
// now and restored when the enclosing scope ends.
#define KMP_SET_THREAD_STATE_BLOCK(new_state)                                  \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    new_state)

// Forward to __kmp_reset_stats().
#define KMP_RESET_STATS() __kmp_reset_stats()
975 
// Developer-only variants of the statistics macros. With KMP_DEVELOPER_STATS
// enabled they forward to the regular macros; otherwise they expand to a
// no-op expression.
#if (KMP_DEVELOPER_STATS)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
// KMP_POP_PARTITIONED_TIMER() takes no parameters, so `n` must not be
// forwarded: passing a non-empty argument to a zero-parameter macro is a
// preprocessing error. `n` is accepted and ignored, exactly as in the null
// definition below.
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
  KMP_EXCHANGE_PARTITIONED_TIMER(n)
#else
// Null definitions
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#endif
993 
994 #else // KMP_STATS_ENABLED
995 
// Null definitions: with statistics disabled every statistics macro expands
// to a no-op expression, so call sites need no #if guards of their own.
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// The enabled branch defines KMP_EXCHANGE_PARTITIONED_TIMER as well; it needs
// a null counterpart like its push/pop siblings, otherwise an unguarded use
// fails to compile when statistics are disabled.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1016 #endif // KMP_STATS_ENABLED
1017 
1018 #endif // KMP_STATS_H
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:300
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
@ notInMaster
statistic is valid only for non-primary threads
Definition: kmp_stats.h:53
@ noUnits
statistic doesn't need units printed next to it
Definition: kmp_stats.h:52
@ logEvent
Definition: kmp_stats.h:54
@ noTotal
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
@ onlyInMaster
statistic is valid only for primary thread
Definition: kmp_stats.h:51