1#ifndef KMP_STATS_H
2#define KMP_STATS_H
3
4/** @file kmp_stats.h
5 * Functions for collecting statistics.
6 */
7
8//===----------------------------------------------------------------------===//
9//
10// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11// See https://llvm.org/LICENSE.txt for license information.
12// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13//
14//===----------------------------------------------------------------------===//
15
16#include "kmp_config.h"
17#include "kmp_debug.h"
18
19#if KMP_STATS_ENABLED
20/* Statistics accumulator.
21 Accumulates number of samples and computes min, max, mean, standard deviation
22 on the fly.
23
24 Online variance calculation algorithm from
25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26 */
27
28#include "kmp_stats_timing.h"
29#include <limits>
30#include <math.h>
31#include <new> // placement new
32#include <stdint.h>
33#include <string>
34#include <vector>
35
36/* Enable developer statistics here if you want them. They are more detailed
37 than is useful for application characterisation and are intended for the
38 runtime library developer. */
39#define KMP_DEVELOPER_STATS 0
40
41/* Enable/Disable histogram output */
42#define KMP_STATS_HIST 0
43
44/*!
45 * @ingroup STATS_GATHERING
46 * \brief flags to describe the statistic (timer or counter)
47 *
48 */
49enum stats_flags_e {
50 noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
51 onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread
52 noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
53 notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads
54 logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
55 //! KMP_STATS_EVENTS is on (valid only for timers)
56};
57
58/*!
59 * @ingroup STATS_GATHERING
60 * \brief the states which a thread can be in
61 *
62 */
63enum stats_state_e {
64 IDLE,
65 SERIAL_REGION,
66 FORK_JOIN_BARRIER,
67 PLAIN_BARRIER,
68 TASKWAIT,
69 TASKYIELD,
70 TASKGROUP,
71 IMPLICIT_TASK,
72 EXPLICIT_TASK,
73 TEAMS_REGION
74};
75
76/*!
77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78 *
79 * @param macro a user defined macro that takes three arguments -
80 * macro(COUNTER_NAME, flags, arg)
81 * @param arg a user defined argument to send to the user defined macro
82 *
83 * \details A counter counts the occurrence of some event. Each thread
84 * accumulates its own count, at the end of execution the counts are aggregated
85 * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86 * in which case there's only a single measurement). The min,mean,max are
87 * therefore the values for the threads. Adding the counter here and then
88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89 * need to do. All of the tables and printing is generated from this macro.
90 * Format is "macro(name, flags, arg)"
91 *
92 * @ingroup STATS_GATHERING
93 */
94// clang-format off
95#define KMP_FOREACH_COUNTER(macro, arg) \
96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
97 macro(OMP_NESTED_PARALLEL, 0, arg) \
98 macro(OMP_LOOP_STATIC, 0, arg) \
99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
100 macro(OMP_LOOP_DYNAMIC, 0, arg) \
101 macro(OMP_DISTRIBUTE, 0, arg) \
102 macro(OMP_BARRIER, 0, arg) \
103 macro(OMP_CRITICAL, 0, arg) \
104 macro(OMP_SINGLE, 0, arg) \
105 macro(OMP_SECTIONS, 0, arg) \
106 macro(OMP_MASTER, 0, arg) \
107 macro(OMP_MASKED, 0, arg) \
108 macro(OMP_TEAMS, 0, arg) \
109 macro(OMP_set_lock, 0, arg) \
110 macro(OMP_test_lock, 0, arg) \
111 macro(REDUCE_wait, 0, arg) \
112 macro(REDUCE_nowait, 0, arg) \
113 macro(OMP_TASKYIELD, 0, arg) \
114 macro(OMP_TASKLOOP, 0, arg) \
115 macro(TASK_executed, 0, arg) \
116 macro(TASK_cancelled, 0, arg) \
117 macro(TASK_stolen, 0, arg)
118// clang-format on
119
120/*!
121 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
122 *
123 * @param macro a user defined macro that takes three arguments -
124 * macro(TIMER_NAME, flags, arg)
125 * @param arg a user defined argument to send to the user defined macro
126 *
127 * \details A timer collects multiple samples of some count in each thread and
128 * then finally aggregates all of the samples from all of the threads. For most
129 * timers the printing code also provides an aggregation over the thread totals.
130 * These are printed as TOTAL_foo. The count is normally a time (in ticks),
131 * hence the name "timer". (But can be any value, so we use this for "number of
132 * arguments passed to fork" as well). For timers the threads are not
133 * significant, it's the individual observations that count, so the statistics
134 * are at that level. Format is "macro(name, flags, arg)"
135 *
136 * @ingroup STATS_GATHERING2
137 */
138// clang-format off
139#define KMP_FOREACH_TIMER(macro, arg) \
140 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
141 macro (OMP_parallel, stats_flags_e::logEvent, arg) \
142 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
143 macro (OMP_teams, stats_flags_e::logEvent, arg) \
144 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \
145 macro (OMP_loop_static, 0, arg) \
146 macro (OMP_loop_static_scheduling, 0, arg) \
147 macro (OMP_loop_dynamic, 0, arg) \
148 macro (OMP_loop_dynamic_scheduling, 0, arg) \
149 macro (OMP_distribute, 0, arg) \
150 macro (OMP_distribute_scheduling, 0, arg) \
151 macro (OMP_critical, 0, arg) \
152 macro (OMP_critical_wait, 0, arg) \
153 macro (OMP_single, 0, arg) \
154 macro (OMP_sections, 0, arg) \
155 macro (OMP_sections_overhead, 0, arg) \
156 macro (OMP_master, 0, arg) \
157 macro (OMP_masked, 0, arg) \
158 macro (OMP_task_immediate, 0, arg) \
159 macro (OMP_task_taskwait, 0, arg) \
160 macro (OMP_task_taskyield, 0, arg) \
161 macro (OMP_task_taskgroup, 0, arg) \
162 macro (OMP_task_join_bar, 0, arg) \
163 macro (OMP_task_plain_bar, 0, arg) \
164 macro (OMP_taskloop_scheduling, 0, arg) \
165 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
166 macro (OMP_idle, stats_flags_e::logEvent, arg) \
167 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
168 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
169 macro (OMP_serial, stats_flags_e::logEvent, arg) \
170 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
171 arg) \
172 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
173 arg) \
174 macro (OMP_loop_static_iterations, \
175 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
176 macro (OMP_loop_static_total_iterations, \
177 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
178 macro (OMP_loop_dynamic_iterations, \
179 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
180 macro (OMP_loop_dynamic_total_iterations, \
181 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
182 macro (OMP_distribute_iterations, \
183 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
184 KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
185// clang-format on
186
187// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
188// initializing OpenMP or being created by a primary
189// thread) until the thread is destroyed
190// OMP_parallel -- Time thread spends executing work directly
191// within a #pragma omp parallel
192// OMP_parallel_overhead -- Time thread spends setting up a parallel region
193// OMP_loop_static -- Time thread spends executing loop iterations from
194// a statically scheduled loop
195// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
196// from a statically scheduled loop
197// OMP_loop_dynamic -- Time thread spends executing loop iterations from
198// a dynamically scheduled loop
199// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
200// from a dynamically scheduled loop
201// OMP_critical -- Time thread spends executing critical section
202// OMP_critical_wait -- Time thread spends waiting to enter
203// a critical section
204// OMP_single -- Time spent executing a "single" region
205// OMP_master -- Time spent executing a "master" region
206// OMP_masked -- Time spent executing a "masked" region
207// OMP_task_immediate -- Time spent executing non-deferred tasks
208// OMP_task_taskwait -- Time spent executing tasks inside a taskwait
209// construct
210// OMP_task_taskyield -- Time spent executing tasks inside a taskyield
211// construct
212// OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup
213// construct
214// OMP_task_join_bar -- Time spent executing tasks inside a join barrier
215// OMP_task_plain_bar -- Time spent executing tasks inside a barrier
216// construct
217// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
218// construct
219// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
220// inside implicit barrier at end of worksharing
221// construct
222// OMP_idle -- Time worker threads spend waiting for next
223// parallel region
224// OMP_fork_barrier -- Time spent in a the fork barrier surrounding a
225// parallel region
226// OMP_join_barrier -- Time spent in a the join barrier surrounding a
227// parallel region
228// OMP_serial -- Time thread zero spends executing serial code
229// OMP_set_numthreads -- Values passed to omp_set_num_threads
230// OMP_PARALLEL_args -- Number of arguments passed to a parallel region
231// OMP_loop_static_iterations -- Number of iterations thread is assigned for
232// statically scheduled loops
233// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
234// dynamically scheduled loops
235
236#if (KMP_DEVELOPER_STATS)
237// Timers which are of interest to runtime library developers, not end users.
238// These have to be explicitly enabled in addition to the other stats.
239
240// KMP_fork_barrier -- time in __kmp_fork_barrier
241// KMP_join_barrier -- time in __kmp_join_barrier
242// KMP_barrier -- time in __kmp_barrier
243// KMP_end_split_barrier -- time in __kmp_end_split_barrier
244// KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
245// KMP_icv_copy -- start/stop timer for any ICV copying
246// KMP_linear_gather -- time in __kmp_linear_barrier_gather
247// KMP_linear_release -- time in __kmp_linear_barrier_release
248// KMP_tree_gather -- time in __kmp_tree_barrier_gather
249// KMP_tree_release -- time in __kmp_tree_barrier_release
250// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
251// KMP_hyper_release -- time in __kmp_hyper_barrier_release
252// KMP_dist_gather -- time in __kmp_dist_barrier_gather
253// KMP_dist_release -- time in __kmp_dist_barrier_release
254// clang-format off
255#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
256 macro(KMP_fork_call, 0, arg) \
257 macro(KMP_join_call, 0, arg) \
258 macro(KMP_end_split_barrier, 0, arg) \
259 macro(KMP_hier_gather, 0, arg) \
260 macro(KMP_hier_release, 0, arg) \
261 macro(KMP_hyper_gather, 0, arg) \
262 macro(KMP_hyper_release, 0, arg) \
263 macro(KMP_dist_gather, 0, arg) \
264 macro(KMP_dist_release, 0, arg) \
265 macro(KMP_linear_gather, 0, arg) \
266 macro(KMP_linear_release, 0, arg) \
267 macro(KMP_tree_gather, 0, arg) \
268 macro(KMP_tree_release, 0, arg) \
269 macro(USER_resume, 0, arg) \
270 macro(USER_suspend, 0, arg) \
271 macro(USER_mwait, 0, arg) \
272 macro(KMP_allocate_team, 0, arg) \
273 macro(KMP_setup_icv_copy, 0, arg) \
274 macro(USER_icv_copy, 0, arg) \
275 macro (FOR_static_steal_stolen, \
276 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
277 macro (FOR_static_steal_chunks, \
278 stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
279#else
280#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
281#endif
282// clang-format on
283
284/*!
285 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
286 *
287 * @param macro a user defined macro that takes three arguments -
288 * macro(TIMER_NAME, flags, arg)
289 * @param arg a user defined argument to send to the user defined macro
290 *
291 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
292 * BAD THINGS WILL HAPPEN!
293 *
294 * \details Explicit timers are ones where we need to allocate a timer itself
295 * (as well as the accumulated timing statistics). We allocate these on a
296 * per-thread basis, and explicitly start and stop them. Block timers just
297 * allocate the timer itself on the stack, and use the destructor to notice
298 * block exit; they don't need to be defined here. The name here should be the
299 * same as that of a timer above.
300 *
301 * @ingroup STATS_GATHERING
302 */
303#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
304
305#define ENUMERATE(name, ignore, prefix) prefix##name,
306enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
307
308enum explicit_timer_e {
309 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
310};
311
312enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
313#undef ENUMERATE
314
315/*
316 * A logarithmic histogram. It accumulates the number of values in each power of
317 * ten bin. So 1<=x<10, 10<=x<100, ...
318 * Mostly useful where we have some big outliers and want to see information
319 * about them.
320 */
321class logHistogram {
322 enum {
323 numBins = 31, /* Number of powers of 10. If this changes you need to change
324 * the initializer for binMax */
325
326 /*
327 * If you want to use this to analyse values that may be less than 1, (for
328 * instance times in s), then the logOffset gives you negative powers.
329 * In our case here, we're just looking at times in ticks, or counts, so we
330 * can never see values with magnitude < 1 (other than zero), so we can set
331 * it to 0. As above change the initializer if you change this.
332 */
333 logOffset = 0
334 };
335 uint32_t KMP_ALIGN_CACHE zeroCount;
336 struct {
337 uint32_t count;
338 double total;
339 } bins[numBins];
340
341 static double binMax[numBins];
342
343#ifdef KMP_DEBUG
344 uint64_t _total;
345
346 void check() const {
347 uint64_t t = zeroCount;
348 for (int i = 0; i < numBins; i++)
349 t += bins[i].count;
350 KMP_DEBUG_ASSERT(t == _total);
351 }
352#else
353 void check() const {}
354#endif
355
356public:
357 logHistogram() { reset(); }
358
359 logHistogram(logHistogram const &o) {
360 for (int i = 0; i < numBins; i++)
361 bins[i] = o.bins[i];
362#ifdef KMP_DEBUG
363 _total = o._total;
364#endif
365 }
366
367 void reset() {
368 zeroCount = 0;
369 for (int i = 0; i < numBins; i++) {
370 bins[i].count = 0;
371 bins[i].total = 0;
372 }
373
374#ifdef KMP_DEBUG
375 _total = 0;
376#endif
377 }
378 uint32_t count(int b) const { return bins[b + logOffset].count; }
379 double total(int b) const { return bins[b + logOffset].total; }
380 static uint32_t findBin(double sample);
381
382 logHistogram &operator+=(logHistogram const &o) {
383 zeroCount += o.zeroCount;
384 for (int i = 0; i < numBins; i++) {
385 bins[i].count += o.bins[i].count;
386 bins[i].total += o.bins[i].total;
387 }
388#ifdef KMP_DEBUG
389 _total += o._total;
390 check();
391#endif
392
393 return *this;
394 }
395
396 void addSample(double sample);
397 int minBin() const;
398 int maxBin() const;
399
400 std::string format(char) const;
401};
402
403class statistic {
404 double KMP_ALIGN_CACHE minVal;
405 double maxVal;
406 double meanVal;
407 double m2;
408 uint64_t sampleCount;
409 double offset;
410 bool collectingHist;
411 logHistogram hist;
412
413public:
414 statistic(bool doHist = bool(KMP_STATS_HIST)) {
415 reset();
416 collectingHist = doHist;
417 }
418 statistic(statistic const &o)
419 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
420 sampleCount(o.sampleCount), offset(o.offset),
421 collectingHist(o.collectingHist), hist(o.hist) {}
422 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
423 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
424 sampleCount(sc), offset(0.0), collectingHist(false) {}
425 bool haveHist() const { return collectingHist; }
426 double getMin() const { return minVal; }
427 double getMean() const { return meanVal; }
428 double getMax() const { return maxVal; }
429 uint64_t getCount() const { return sampleCount; }
430 double getSD() const { return sqrt(m2 / sampleCount); }
431 double getTotal() const { return sampleCount * meanVal; }
432 logHistogram const *getHist() const { return &hist; }
433 void setOffset(double d) { offset = d; }
434
435 void reset() {
436 minVal = (std::numeric_limits<double>::max)();
437 maxVal = -minVal;
438 meanVal = 0.0;
439 m2 = 0.0;
440 sampleCount = 0;
441 offset = 0.0;
442 hist.reset();
443 }
444 void addSample(double sample);
445 void scale(double factor);
446 void scaleDown(double f) { scale(1. / f); }
447 void forceCount(uint64_t count) { sampleCount = count; }
448 statistic &operator+=(statistic const &other);
449
450 std::string format(char unit, bool total = false) const;
451 std::string formatHist(char unit) const { return hist.format(unit); }
452};
453
454struct statInfo {
455 const char *name;
456 uint32_t flags;
457};
458
459class timeStat : public statistic {
460 static statInfo timerInfo[];
461
462public:
463 timeStat() : statistic() {}
464 static const char *name(timer_e e) { return timerInfo[e].name; }
465 static bool noTotal(timer_e e) {
466 return timerInfo[e].flags & stats_flags_e::noTotal;
467 }
468 static bool masterOnly(timer_e e) {
469 return timerInfo[e].flags & stats_flags_e::onlyInMaster;
470 }
471 static bool workerOnly(timer_e e) {
472 return timerInfo[e].flags & stats_flags_e::notInMaster;
473 }
474 static bool noUnits(timer_e e) {
475 return timerInfo[e].flags & stats_flags_e::noUnits;
476 }
477 static bool logEvent(timer_e e) {
478 return timerInfo[e].flags & stats_flags_e::logEvent;
479 }
480 static void clearEventFlags() {
481 for (int i = 0; i < TIMER_LAST; i++) {
482 timerInfo[i].flags &= (~(stats_flags_e::logEvent));
483 }
484 }
485};
486
487// Where we need explicitly to start and end the timer, this version can be used
488// Since these timers normally aren't nicely scoped, so don't have a good place
489// to live on the stack of the thread, they're more work to use.
490class explicitTimer {
491 timeStat *stat;
492 timer_e timerEnumValue;
493 tsc_tick_count startTime;
494 tsc_tick_count pauseStartTime;
495 tsc_tick_count::tsc_interval_t totalPauseTime;
496
497public:
498 explicitTimer(timeStat *s, timer_e te)
499 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
500 totalPauseTime() {}
501
502 // void setStat(timeStat *s) { stat = s; }
503 void start(tsc_tick_count tick);
504 void pause(tsc_tick_count tick) { pauseStartTime = tick; }
505 void resume(tsc_tick_count tick) {
506 totalPauseTime += (tick - pauseStartTime);
507 }
508 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
509 void reset() {
510 startTime = 0;
511 pauseStartTime = 0;
512 totalPauseTime = 0;
513 }
514 timer_e get_type() const { return timerEnumValue; }
515};
516
517// Where you need to partition a threads clock ticks into separate states
518// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
519// DOING_NOTHING would render these conditions:
520// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
521// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
522// versa
523class partitionedTimers {
524private:
525 std::vector<explicitTimer> timer_stack;
526
527public:
528 partitionedTimers();
529 void init(explicitTimer timer);
530 void exchange(explicitTimer timer);
531 void push(explicitTimer timer);
532 void pop();
533 void windup();
534};
535
536// Special wrapper around the partitioned timers to aid timing code blocks
537// It avoids the need to have an explicit end, leaving the scope suffices.
538class blockPartitionedTimer {
539 partitionedTimers *part_timers;
540
541public:
542 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
543 : part_timers(pt) {
544 part_timers->push(timer);
545 }
546 ~blockPartitionedTimer() { part_timers->pop(); }
547};
548
549// Special wrapper around the thread state to aid in keeping state in code
550// blocks It avoids the need to have an explicit end, leaving the scope
551// suffices.
552class blockThreadState {
553 stats_state_e *state_pointer;
554 stats_state_e old_state;
555
556public:
557 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
558 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
559 *state_pointer = new_state;
560 }
561 ~blockThreadState() { *state_pointer = old_state; }
562};
563
564// If all you want is a count, then you can use this...
565// The individual per-thread counts will be aggregated into a statistic at
566// program exit.
567class counter {
568 uint64_t value;
569 static const statInfo counterInfo[];
570
571public:
572 counter() : value(0) {}
573 void increment() { value++; }
574 uint64_t getValue() const { return value; }
575 void reset() { value = 0; }
576 static const char *name(counter_e e) { return counterInfo[e].name; }
577 static bool masterOnly(counter_e e) {
578 return counterInfo[e].flags & stats_flags_e::onlyInMaster;
579 }
580};
581
582/* ****************************************************************
583 Class to implement an event
584
585 There are four components to an event: start time, stop time
586 nest_level, and timer_name.
587 The start and stop time should be obvious (recorded in clock ticks).
588 The nest_level relates to the bar width in the timeline graph.
589 The timer_name is used to determine which timer event triggered this event.
590
591 the interface to this class is through four read-only operations:
592 1) getStart() -- returns the start time as 64 bit integer
593 2) getStop() -- returns the stop time as 64 bit integer
594 3) getNestLevel() -- returns the nest level of the event
595 4) getTimerName() -- returns the timer name that triggered event
596
597 *MORE ON NEST_LEVEL*
598 The nest level is used in the bar graph that represents the timeline.
599 Its main purpose is for showing how events are nested inside eachother.
600 For example, say events, A, B, and C are recorded. If the timeline
601 looks like this:
602
603Begin -------------------------------------------------------------> Time
604 | | | | | |
605 A B C C B A
606 start start start end end end
607
608 Then A, B, C will have a nest level of 1, 2, 3 respectively.
609 These values are then used to calculate the barwidth so you can
610 see that inside A, B has occurred, and inside B, C has occurred.
611 Currently, this is shown with A's bar width being larger than B's
612 bar width, and B's bar width being larger than C's bar width.
613
614**************************************************************** */
615class kmp_stats_event {
616 uint64_t start;
617 uint64_t stop;
618 int nest_level;
619 timer_e timer_name;
620
621public:
622 kmp_stats_event()
623 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
624 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
625 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
626 inline uint64_t getStart() const { return start; }
627 inline uint64_t getStop() const { return stop; }
628 inline int getNestLevel() const { return nest_level; }
629 inline timer_e getTimerName() const { return timer_name; }
630};
631
632/* ****************************************************************
633 Class to implement a dynamically expandable array of events
634
635 ---------------------------------------------------------
636 | event 1 | event 2 | event 3 | event 4 | ... | event N |
637 ---------------------------------------------------------
638
639 An event is pushed onto the back of this array at every
640 explicitTimer->stop() call. The event records the thread #,
641 start time, stop time, and nest level related to the bar width.
642
643 The event vector starts at size INIT_SIZE and grows (doubles in size)
644 if needed. An implication of this behavior is that log(N)
645 reallocations are needed (where N is number of events). If you want
646 to avoid reallocations, then set INIT_SIZE to a large value.
647
648 the interface to this class is through six operations:
649 1) reset() -- sets the internal_size back to 0 but does not deallocate any
650 memory
651 2) size() -- returns the number of valid elements in the vector
652 3) push_back(start, stop, nest, timer_name) -- pushes an event onto
653 the back of the array
654 4) deallocate() -- frees all memory associated with the vector
655 5) sort() -- sorts the vector by start time
656 6) operator[index] or at(index) -- returns event reference at that index
657**************************************************************** */
658class kmp_stats_event_vector {
659 kmp_stats_event *events;
660 int internal_size;
661 int allocated_size;
662 static const int INIT_SIZE = 1024;
663
664public:
665 kmp_stats_event_vector() {
666 events =
667 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
668 internal_size = 0;
669 allocated_size = INIT_SIZE;
670 }
671 ~kmp_stats_event_vector() {}
672 inline void reset() { internal_size = 0; }
673 inline int size() const { return internal_size; }
674 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
675 timer_e name) {
676 int i;
677 if (internal_size == allocated_size) {
678 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
679 sizeof(kmp_stats_event) * allocated_size * 2);
680 for (i = 0; i < internal_size; i++)
681 tmp[i] = events[i];
682 __kmp_free(events);
683 events = tmp;
684 allocated_size *= 2;
685 }
686 events[internal_size] =
687 kmp_stats_event(start_time, stop_time, nest_level, name);
688 internal_size++;
689 return;
690 }
691 void deallocate();
692 void sort();
693 const kmp_stats_event &operator[](int index) const { return events[index]; }
694 kmp_stats_event &operator[](int index) { return events[index]; }
695 const kmp_stats_event &at(int index) const { return events[index]; }
696 kmp_stats_event &at(int index) { return events[index]; }
697};
698
699/* ****************************************************************
700 Class to implement a doubly-linked, circular, statistics list
701
702 |---| ---> |---| ---> |---| ---> |---| ---> ... next
703 | | | | | | | |
704 |---| <--- |---| <--- |---| <--- |---| <--- ... prev
705 Sentinel first second third
706 Node node node node
707
708 The Sentinel Node is the user handle on the list.
709 The first node corresponds to thread 0's statistics.
710 The second node corresponds to thread 1's statistics and so on...
711
712 Each node has a _timers, _counters, and _explicitTimers array to hold that
713 thread's statistics. The _explicitTimers point to the correct _timer and
714 update its statistics at every stop() call. The explicitTimers' pointers are
715 set up in the constructor. Each node also has an event vector to hold that
716 thread's timing events. The event vector expands as necessary and records
717 the start-stop times for each timer.
718
719 The nestLevel variable is for plotting events and is related
720 to the bar width in the timeline graph.
721
722 Every thread will have a thread local pointer to its node in
723 the list. The sentinel node is used by the primary thread to
724 store "dummy" statistics before __kmp_create_worker() is called.
725**************************************************************** */
726class kmp_stats_list {
727 int gtid;
728 timeStat _timers[TIMER_LAST + 1];
729 counter _counters[COUNTER_LAST + 1];
730 explicitTimer thread_life_timer;
731 partitionedTimers _partitionedTimers;
732 int _nestLevel; // one per thread
733 kmp_stats_event_vector _event_vector;
734 kmp_stats_list *next;
735 kmp_stats_list *prev;
736 stats_state_e state;
737 int thread_is_idle_flag;
738
739public:
740 kmp_stats_list()
741 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
742 TIMER_OMP_worker_thread_life),
743 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
744 thread_is_idle_flag(0) {}
745 ~kmp_stats_list() {}
746 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
747 inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
748 inline partitionedTimers *getPartitionedTimers() {
749 return &_partitionedTimers;
750 }
751 inline timeStat *getTimers() { return _timers; }
752 inline counter *getCounters() { return _counters; }
753 inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
754 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
755 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
756 inline void resetEventVector() { _event_vector.reset(); }
757 inline void incrementNestValue() { _nestLevel++; }
758 inline int getNestValue() { return _nestLevel; }
759 inline void decrementNestValue() { _nestLevel--; }
760 inline int getGtid() const { return gtid; }
761 inline void setGtid(int newgtid) { gtid = newgtid; }
762 inline void setState(stats_state_e newstate) { state = newstate; }
763 inline stats_state_e getState() const { return state; }
764 inline stats_state_e *getStatePointer() { return &state; }
765 inline bool isIdle() { return thread_is_idle_flag == 1; }
766 inline void setIdleFlag() { thread_is_idle_flag = 1; }
767 inline void resetIdleFlag() { thread_is_idle_flag = 0; }
768 kmp_stats_list *push_back(int gtid); // returns newly created list node
769 inline void push_event(uint64_t start_time, uint64_t stop_time,
770 int nest_level, timer_e name) {
771 _event_vector.push_back(start_time, stop_time, nest_level, name);
772 }
773 void deallocate();
774 class iterator;
775 kmp_stats_list::iterator begin();
776 kmp_stats_list::iterator end();
777 int size();
778 class iterator {
779 kmp_stats_list *ptr;
780 friend kmp_stats_list::iterator kmp_stats_list::begin();
781 friend kmp_stats_list::iterator kmp_stats_list::end();
782
783 public:
784 iterator();
785 ~iterator();
786 iterator operator++();
787 iterator operator++(int dummy);
788 iterator operator--();
789 iterator operator--(int dummy);
790 bool operator!=(const iterator &rhs);
791 bool operator==(const iterator &rhs);
792 kmp_stats_list *operator*() const; // dereference operator
793 };
794};
795
796/* ****************************************************************
797 Class to encapsulate all output functions and the environment variables
798
799 This module holds filenames for various outputs (normal stats, events, plot
800 file), as well as coloring information for the plot file.
801
802 The filenames and flags variables are read from environment variables.
803 These are read once by the constructor of the global variable
804 __kmp_stats_output which calls init().
805
806 During this init() call, event flags for the timeStat::timerInfo[] global
807 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
808
809 The only interface function that is public is outputStats(heading). This
810 function should print out everything it needs to, either to files or stderr,
811 depending on the environment variables described below
812
813 ENVIRONMENT VARIABLES:
814 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
815 file, otherwise, print to stderr
816 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
817 either KMP_STATS_FILE or stderr
818 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
819 otherwise, the plot file is sent to "events.plt"
820 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
821 events
822 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
823 otherwise, output is sent to "events.dat"
824**************************************************************** */
825class kmp_stats_output_module {
826
827public:
828 struct rgb_color {
829 float r;
830 float g;
831 float b;
832 };
833
834private:
835 std::string outputFileName;
836 static const char *eventsFileName;
837 static const char *plotFileName;
838 static int printPerThreadFlag;
839 static int printPerThreadEventsFlag;
840 static const rgb_color globalColorArray[];
841 static rgb_color timerColorInfo[];
842
843 void init();
844 static void setupEventColors();
845 static void printPloticusFile();
846 static void printHeaderInfo(FILE *statsOut);
847 static void printTimerStats(FILE *statsOut, statistic const *theStats,
848 statistic const *totalStats);
849 static void printCounterStats(FILE *statsOut, statistic const *theStats);
850 static void printCounters(FILE *statsOut, counter const *theCounters);
851 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
852 int gtid);
853 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
854 static void windupExplicitTimers();
855 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
856
857public:
858 kmp_stats_output_module() { init(); }
859 void outputStats(const char *heading);
860};
861
862#ifdef __cplusplus
863extern "C" {
864#endif
865void __kmp_stats_init();
866void __kmp_stats_fini();
867void __kmp_reset_stats();
868void __kmp_output_stats(const char *);
869void __kmp_accumulate_stats_at_exit(void);
870// thread local pointer to stats node within list
871extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
872// head to stats list.
873extern kmp_stats_list *__kmp_stats_list;
874// lock for __kmp_stats_list
875extern kmp_tas_lock_t __kmp_stats_lock;
876// reference start time
877extern tsc_tick_count __kmp_stats_start_time;
878// interface to output
879extern kmp_stats_output_module __kmp_stats_output;
880
881#ifdef __cplusplus
882}
883#endif
884
885// Simple, standard interfaces that drop out completely if stats aren't enabled
886
887/*!
888 * \brief Adds value to specified timer (name).
889 *
890 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
891 * @param value double precision sample value to add to statistics for the timer
892 *
893 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
894 * a timer statistics.
895 *
896 * @ingroup STATS_GATHERING
897 */
898#define KMP_COUNT_VALUE(name, value) \
899 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
900
901/*!
902 * \brief Increments specified counter (name).
903 *
904 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
905 *
906 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics
907 * counter for the executing thread.
908 *
909 * @ingroup STATS_GATHERING
910 */
911#define KMP_COUNT_BLOCK(name) \
912 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
913
914/*!
915 * \brief Outputs the current thread statistics and reset them.
916 *
917 * @param heading_string heading put above the final stats output
918 *
919 * \details Explicitly stops all timers and outputs all stats. Environment
920 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a
921 * filename instead of stderr. Environment variable,
922 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific
923 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be
924 * defined with any value, which will print out thread specific stats, or it can
925 * be undefined (not specified in the environment) and thread specific stats
926 * won't be printed. It should be noted that all statistics are reset when this
927 * macro is called.
928 *
929 * @ingroup STATS_GATHERING
930 */
931#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
932
933/*!
934 * \brief Initializes the partitioned timers to begin with name.
935 *
936 * @param name timer which you want this thread to begin with
937 *
938 * @ingroup STATS_GATHERING
939 */
940#define KMP_INIT_PARTITIONED_TIMERS(name) \
941 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
942 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
943
944#define KMP_TIME_PARTITIONED_BLOCK(name) \
945 blockPartitionedTimer __PBLOCKTIME__( \
946 __kmp_stats_thread_ptr->getPartitionedTimers(), \
947 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
948 TIMER_##name))
949
950#define KMP_PUSH_PARTITIONED_TIMER(name) \
951 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
952 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
953
954#define KMP_POP_PARTITIONED_TIMER() \
955 __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
956
957#define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
958 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
959 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
960
961#define KMP_SET_THREAD_STATE(state_name) \
962 __kmp_stats_thread_ptr->setState(state_name)
963
964#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
965
966#define KMP_SET_THREAD_STATE_BLOCK(state_name) \
967 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
968 state_name)
969
970/*!
971 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
972 *
973 * \details Reset all stats for all threads.
974 *
975 * @ingroup STATS_GATHERING
976 */
977#define KMP_RESET_STATS() __kmp_reset_stats()
978
979#if (KMP_DEVELOPER_STATS)
980#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
981#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
982#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
983#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
984#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
985#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \
986 KMP_EXCHANGE_PARTITIONED_TIMER(n)
987#else
988// Null definitions
989#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
990#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
991#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
992#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
993#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
994#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
995#endif
996
997#else // KMP_STATS_ENABLED
998
999// Null definitions
1000#define KMP_COUNT_VALUE(n, v) ((void)0)
1001#define KMP_COUNT_BLOCK(n) ((void)0)
1002
1003#define KMP_OUTPUT_STATS(heading_string) ((void)0)
1004#define KMP_RESET_STATS() ((void)0)
1005
1006#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
1007#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
1008#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
1009#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1010#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1011#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1012#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1013#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1014#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1015#define KMP_POP_PARTITIONED_TIMER() ((void)0)
1016#define KMP_SET_THREAD_STATE(state_name) ((void)0)
1017#define KMP_GET_THREAD_STATE() ((void)0)
1018#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1019#endif // KMP_STATS_ENABLED
1020
1021#endif // KMP_STATS_H
1022

source code of openmp/runtime/src/kmp_stats.h