kmp_dispatch.h source code [openmp/runtime/src/kmp_dispatch.h]

1	/*
2	* kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
3	*/
4
5	//===----------------------------------------------------------------------===//
6	//
7	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8	// See https://llvm.org/LICENSE.txt for license information.
9	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10	//
11	//===----------------------------------------------------------------------===//
12
13	#ifndef KMP_DISPATCH_H
14	#define KMP_DISPATCH_H
15
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
18
19	#include "kmp.h"
20	#include "kmp_error.h"
21	#include "kmp_i18n.h"
22	#include "kmp_itt.h"
23	#include "kmp_stats.h"
24	#include "kmp_str.h"
25	#if KMP_OS_WINDOWS && KMP_ARCH_X86
26	#include <float.h>
27	#endif
28
29	#if OMPT_SUPPORT
30	#include "ompt-internal.h"
31	#include "ompt-specific.h"
32	#endif
33
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
// (only pointers to these types are stored in this header, so the full
// definitions are not needed here).
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

// Forward declarations of the shared / private dispatch buffer templates, so
// the function prototypes below can name them before they are defined.
template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;
44
45	template <typename T>
46	extern void __kmp_dispatch_init_algorithm(ident_t loc, int* gtid,
47	dispatch_private_info_template<T> *pr,
48	enum sched_type schedule, T lb, T ub,
49	typename traits_t<T>::signed_t st,
50	#if USE_ITT_BUILD
51	kmp_uint64 *cur_chunk,
52	#endif
53	typename traits_t<T>::signed_t chunk,
54	T nproc, T unit_id);
55	template <typename T>
56	extern int __kmp_dispatch_next_algorithm(
57	int gtid, dispatch_private_info_template<T> *pr,
58	dispatch_shared_info_template<T> volatile sh, kmp_int32 p_last, T *p_lb,
59	T p_ub, typename* traits_t<T>::signed_t *p_st, T nproc, T unit_id);
60
61	void __kmp_dispatch_dxo_error(int gtid_ref, int* cid_ref, ident_t loc_ref);
62	void __kmp_dispatch_deo_error(int gtid_ref, int* cid_ref, ident_t loc_ref);
63
64	#if KMP_STATIC_STEAL_ENABLED
65
66	// replaces dispatch_private_info{32,64} structures and
67	// dispatch_private_info{32,64}_t types
68	template <typename T> struct dispatch_private_infoXX_template {
69	typedef typename traits_t<T>::unsigned_t UT;
70	typedef typename traits_t<T>::signed_t ST;
71	UT count; // unsigned
72	T ub;
73	/ Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance /
74	T lb;
75	ST st; // signed
76	UT tc; // unsigned
77	kmp_lock_t steal_lock; // lock used for chunk stealing*
78
79	UT ordered_lower; // unsigned
80	UT ordered_upper; // unsigned
81
82	/ parm[1-4] are used in different ways by different scheduling algorithms /
83
84	// KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
85	// a) parm3 is properly aligned and
86	// b) all parm1-4 are in the same cache line.
87	// Because of parm1-4 are used together, performance seems to be better
88	// if they are in the same line (not measured though).
89	struct KMP_ALIGN(`32`) { // compiler does not accept sizeof(T)4*
90	T parm1;
91	T parm2;
92	T parm3;
93	T parm4;
94	};
95
96	#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
97	UT pchunks; // total number of chunks for processes with p-core
98	UT num_procs_with_pcore; // number of threads with p-core
99	T first_thread_with_ecore;
100	#endif
101	#if KMP_OS_WINDOWS
102	T last_upper;
103	#endif /* KMP_OS_WINDOWS */
104	};
105
106	#else /* KMP_STATIC_STEAL_ENABLED */
107
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// (variant compiled when KMP_STATIC_STEAL_ENABLED is false: no steal lock and
// no aligned grouping of parm1-4).
// NOTE(review): this type is placed in a union with dispatch_private_info64_t
// elsewhere in this header, so the member order is presumably layout-critical;
// do not reorder fields without checking that counterpart.
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT; // unsigned type matching T
  typedef typename traits_t<T>::signed_t ST; // signed type matching T
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  // parm1-4 are used in different ways by different scheduling algorithms
  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
131	#endif /* KMP_STATIC_STEAL_ENABLED */
132
133	template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
134	// duplicate alignment here, otherwise size of structure is not correct in our
135	// compiler
136	union KMP_ALIGN_CACHE private_info_tmpl {
137	dispatch_private_infoXX_template<T> p;
138	dispatch_private_info64_t p64;
139	} u;
140	enum sched_type schedule; / scheduling algorithm /
141	kmp_sched_flags_t flags; / flags (e.g., ordered, nomerge, etc.) /
142	std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
143	kmp_uint32 ordered_bumped;
144	dispatch_private_info next; /* stack of buffers for nest of serial regions /
145	kmp_uint32 type_size;
146	#if KMP_USE_HIER_SCHED
147	kmp_int32 hier_id;
148	kmp_hier_top_unit_t<T> *hier_parent;
149	// member functions
150	kmp_int32 get_hier_id() const { return hier_id; }
151	kmp_hier_top_unit_t<T> get_parent() { return* hier_parent; }
152	#endif
153	enum cons_type pushed_ws;
154	};
155
156	// replaces dispatch_shared_info{32,64} structures and
157	// dispatch_shared_info{32,64}_t types
158	template <typename T> struct dispatch_shared_infoXX_template {
159	typedef typename traits_t<T>::unsigned_t UT;
160	typedef typename traits_t<T>::signed_t ST;
161	/ chunk index under dynamic, number of idle threads under static-steal;*
162	iteration index otherwise /*
163	volatile UT iteration;
164	volatile ST num_done;
165	volatile UT ordered_iteration;
166	// to retain the structure size making ordered_iteration scalar
167	UT ordered_dummy[KMP_MAX_ORDERED - `3`];
168	};
169
170	// replaces dispatch_shared_info structure and dispatch_shared_info_t type
171	template <typename T> struct dispatch_shared_info_template {
172	typedef typename traits_t<T>::unsigned_t UT;
173	// we need union here to keep the structure size
174	union shared_info_tmpl {
175	dispatch_shared_infoXX_template<UT> s;
176	dispatch_shared_info64_t s64;
177	} u;
178	volatile kmp_uint32 buffer_index;
179	volatile kmp_int32 doacross_buf_idx; // teamwise index
180	kmp_uint32 doacross_flags; // array of iteration flags (0/1)*
181	kmp_int32 doacross_num_done; // count finished threads
182	#if KMP_USE_HIER_SCHED
183	kmp_hier_t<T> *hier;
184	#endif
185	#if KMP_USE_HWLOC
186	// When linking with libhwloc, the ORDERED EPCC test slowsdown on big
187	// machines (> 48 cores). Performance analysis showed that a cache thrash
188	// was occurring and this padding helps alleviate the problem.
189	char padding[`64`];
190	#endif
191	};
192
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
195
196	#undef USE_TEST_LOCKS
197
198	// test_then_add template (general template should NOT be used)
199	template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
200
201	template <>
202	__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
203	kmp_int32 d) {
204	kmp_int32 r;
205	r = KMP_TEST_THEN_ADD32(p, d);
206	return r;
207	}
208
209	template <>
210	__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
211	kmp_int64 d) {
212	kmp_int64 r;
213	r = KMP_TEST_THEN_ADD64(p, d);
214	return r;
215	}
216
217	// test_then_inc_acq template (general template should NOT be used)
218	template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
219
220	template <>
221	__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
222	kmp_int32 r;
223	r = KMP_TEST_THEN_INC_ACQ32(p);
224	return r;
225	}
226
227	template <>
228	__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
229	kmp_int64 r;
230	r = KMP_TEST_THEN_INC_ACQ64(p);
231	return r;
232	}
233
234	// test_then_inc template (general template should NOT be used)
235	template <typename T> static __forceinline T test_then_inc(volatile T *p);
236
237	template <>
238	__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
239	kmp_int32 r;
240	r = KMP_TEST_THEN_INC32(p);
241	return r;
242	}
243
244	template <>
245	__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
246	kmp_int64 r;
247	r = KMP_TEST_THEN_INC64(p);
248	return r;
249	}
250
251	// compare_and_swap template (general template should NOT be used)
252	template <typename T>
253	static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
254
255	template <>
256	__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
257	kmp_int32 c, kmp_int32 s) {
258	return KMP_COMPARE_AND_STORE_REL32(p, c, s);
259	}
260
261	template <>
262	__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
263	kmp_int64 c, kmp_int64 s) {
264	return KMP_COMPARE_AND_STORE_REL64(p, c, s);
265	}
266
267	template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
268	return value >= checker;
269	}
270	template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
271	return value == checker;
272	}
273
274	/*
275	Spin wait loop that pauses between checks.
276	Waits until function returns non-zero when called with spinner and check.*
277	Does NOT put threads to sleep.
278	Arguments:
279	UT is unsigned 4- or 8-byte type
280	spinner - memory location to check value
281	checker - value which spinner is >, <, ==, etc.
282	pred - predicate function to perform binary comparison of some sort
283	#if USE_ITT_BUILD
284	obj -- is higher-level synchronization object to report to ittnotify. It
285	is used to report locks consistently. For example, if lock is acquired
286	immediately, its address is reported to ittnotify via
287	KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
288	and lock routine calls to KMP_WAIT(), the later should report the
289	same address, not an address of low-level spinner.
290	#endif // USE_ITT_BUILD
291	TODO: make inline function (move to header file for icl)
292	*/
293	template <typename UT>
294	static UT __kmp_wait(volatile UT *spinner, UT checker,
295	kmp_uint32 (pred)(UT, UT) USE_ITT_BUILD_ARG(void* *obj)) {
296	// note: we may not belong to a team at this point
297	volatile UT *spin = spinner;
298	UT check = checker;
299	kmp_uint32 spins;
300	kmp_uint32 (*f)(UT, UT) = pred;
301	kmp_uint64 time;
302	UT r;
303
304	KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
305	KMP_INIT_YIELD(spins);
306	KMP_INIT_BACKOFF(time);
307	// main wait spin loop
308	while (!f(r = *spin, check)) {
309	KMP_FSYNC_SPIN_PREPARE(obj);
310	/ GEH - remove this since it was accidentally introduced when kmp_wait was*
311	split.
312	It causes problems with infinite recursion because of exit lock /*
313	/ if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)*
314	__kmp_abort_thread(); /*
315	// If oversubscribed, or have waited a bit then yield.
316	KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
317	}
318	KMP_FSYNC_SPIN_ACQUIRED(obj);
319	return r;
320	}
321
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
324
325	template <typename UT>
326	void __kmp_dispatch_deo(int gtid_ref, int* cid_ref, ident_t loc_ref) {
327	dispatch_private_info_template<UT> *pr;
328
329	int gtid = *gtid_ref;
330	// int cid = cid_ref;*
331	kmp_info_t *th = __kmp_threads[gtid];
332	KMP_DEBUG_ASSERT(th->th.th_dispatch);
333
334	KD_TRACE(`100`, ("__kmp_dispatch_deo: T#%d called\n", gtid));
335	if (__kmp_env_consistency_check) {
336	pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
337	th->th.th_dispatch->th_dispatch_pr_current);
338	if (pr->pushed_ws != ct_none) {
339	#if KMP_USE_DYNAMIC_LOCK
340	__kmp_push_sync(gtid, ct: ct_ordered_in_pdo, ident: loc_ref, NULL, `0`);
341	#else
342	__kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
343	#endif
344	}
345	}
346
347	if (!th->th.th_team->t.t_serialized) {
348	dispatch_shared_info_template<UT> *sh =
349	reinterpret_cast<dispatch_shared_info_template<UT> *>(
350	th->th.th_dispatch->th_dispatch_sh_current);
351	UT lower;
352
353	if (!__kmp_env_consistency_check) {
354	pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
355	th->th.th_dispatch->th_dispatch_pr_current);
356	}
357	lower = pr->u.p.ordered_lower;
358
359	#if !defined(KMP_GOMP_COMPAT)
360	if (__kmp_env_consistency_check) {
361	if (pr->ordered_bumped) {
362	struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
363	__kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
364	ct_ordered_in_pdo, loc_ref,
365	&p->stack_data[p->w_top]);
366	}
367	}
368	#endif /* !defined(KMP_GOMP_COMPAT) */
369
370	KMP_MB();
371	#ifdef KMP_DEBUG
372	{
373	char *buff;
374	// create format specifiers before the debug output
375	buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
376	"ordered_iter:%%%s lower:%%%s\n",
377	traits_t<UT>::spec, traits_t<UT>::spec);
378	KD_TRACE(`1000`, (buff, gtid, sh->u.s.ordered_iteration, lower));
379	__kmp_str_free(str: &buff);
380	}
381	#endif
382	__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
383	__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
384	KMP_MB(); / is this necessary? /
385	#ifdef KMP_DEBUG
386	{
387	char *buff;
388	// create format specifiers before the debug output
389	buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
390	"ordered_iter:%%%s lower:%%%s\n",
391	traits_t<UT>::spec, traits_t<UT>::spec);
392	KD_TRACE(`1000`, (buff, gtid, sh->u.s.ordered_iteration, lower));
393	__kmp_str_free(str: &buff);
394	}
395	#endif
396	}
397	KD_TRACE(`100`, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
398	}
399
400	template <typename UT>
401	void __kmp_dispatch_dxo(int gtid_ref, int* cid_ref, ident_t loc_ref) {
402	typedef typename traits_t<UT>::signed_t ST;
403	dispatch_private_info_template<UT> *pr;
404
405	int gtid = *gtid_ref;
406	// int cid = cid_ref;*
407	kmp_info_t *th = __kmp_threads[gtid];
408	KMP_DEBUG_ASSERT(th->th.th_dispatch);
409
410	KD_TRACE(`100`, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
411	if (__kmp_env_consistency_check) {
412	pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
413	th->th.th_dispatch->th_dispatch_pr_current);
414	if (pr->pushed_ws != ct_none) {
415	__kmp_pop_sync(gtid, ct: ct_ordered_in_pdo, ident: loc_ref);
416	}
417	}
418
419	if (!th->th.th_team->t.t_serialized) {
420	dispatch_shared_info_template<UT> *sh =
421	reinterpret_cast<dispatch_shared_info_template<UT> *>(
422	th->th.th_dispatch->th_dispatch_sh_current);
423
424	if (!__kmp_env_consistency_check) {
425	pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
426	th->th.th_dispatch->th_dispatch_pr_current);
427	}
428
429	KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
430	#if !defined(KMP_GOMP_COMPAT)
431	if (__kmp_env_consistency_check) {
432	if (pr->ordered_bumped != `0`) {
433	struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
434	/ How to test it? - OM /
435	__kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
436	ct_ordered_in_pdo, loc_ref,
437	&p->stack_data[p->w_top]);
438	}
439	}
440	#endif /* !defined(KMP_GOMP_COMPAT) */
441
442	KMP_MB(); / Flush all pending memory write invalidates. /
443
444	pr->ordered_bumped += `1`;
445
446	KD_TRACE(`1000`,
447	("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
448	gtid, pr->ordered_bumped));
449
450	KMP_MB(); / Flush all pending memory write invalidates. /
451
452	/ TODO use general release procedure? /
453	test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
454
455	KMP_MB(); / Flush all pending memory write invalidates. /
456	}
457	KD_TRACE(`100`, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
458	}
459
460	/ Computes and returns x to the power of y, where y must a non-negative integer*
461	*/
462	template <typename UT>
463	static __forceinline long double __kmp_pow(long double x, UT y) {
464	long double s = `1.0L`;
465
466	KMP_DEBUG_ASSERT(x > `0.0` && x < `1.0`);
467	// KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
468	while (y) {
469	if (y & `1`)
470	s *= x;
471	x *= x;
472	y >>= `1`;
473	}
474	return s;
475	}
476
477	/ Computes and returns the number of unassigned iterations after idx chunks*
478	have been assigned
479	(the total number of unassigned iterations in chunks with index greater than
480	or equal to idx).
481	__forceinline seems to be broken so that if we __forceinline this function,
482	the behavior is wrong
483	(one of the unit tests, sch_guided_analytical_basic.cpp, fails)
484	*/
485	template <typename T>
486	static __inline typename traits_t<T>::unsigned_t
487	__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
488	typename traits_t<T>::unsigned_t idx) {
489	/ Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at*
490	least for ICL 8.1, long double arithmetic may not really have
491	long double precision, even with /Qlong_double. Currently, we
492	workaround that in the caller code, by manipulating the FPCW for
493	Windows OS on IA-32 architecture. The lack of precision is not*
494	expected to be a correctness issue, though.
495	*/
496	typedef typename traits_t<T>::unsigned_t UT;
497
498	long double x = tc * __kmp_pow<UT>(base, idx);
499	UT r = (UT)x;
500	if (x == r)
501	return r;
502	return r + `1`;
503	}
504
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be more
// flat.
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
// The two constants must be kept in sync: guided_flt_param is the reciprocal
// of guided_int_param.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
513	#endif // KMP_DISPATCH_H
514

source code of openmp/runtime/src/kmp_dispatch.h