kmp_barrier.cpp source code [openmp/runtime/src/kmp_barrier.cpp]

1	/*
2	* kmp_barrier.cpp
3	*/
4
5	//===----------------------------------------------------------------------===//
6	//
7	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8	// See https://llvm.org/LICENSE.txt for license information.
9	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "kmp_wait_release.h"
14	#include "kmp_barrier.h"
15	#include "kmp_itt.h"
16	#include "kmp_os.h"
17	#include "kmp_stats.h"
18	#include "ompt-specific.h"
19	// for distributed barrier
20	#include "kmp_affinity.h"
21
// On KMP_MIC builds, pull in the vector intrinsics header and switch on the
// NGO-store variants of the ICV-copy helpers defined below.
#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

// ngo_* helpers used by the barrier release code when pushing ICVs / go flags.
// Two implementations are provided with the same call shape so call sites do
// not need their own #if.
#if KMP_MIC && USE_NGO_STORES
// ICV copying
// ngo_load declares a local 512-bit vector `Vt` loaded from src; the two store
// macros write that same `Vt` to dst and ignore their own src argument, so
// ngo_load must appear earlier in the same scope.
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
// Locked no-op add on the stack, with a compiler memory clobber.
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
// Portable fallback: load and sync are no-ops, ICVs are copied with copy_icvs
// and go data with a CACHE_LINE-sized KMP_MEMCPY.
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

// Declared here, not defined in this part of the file.
void __kmp_print_structure(void); // Forward declaration
41
42	// ---------------------------- Barrier Algorithms ----------------------------
43	// Distributed barrier
44
45	// Compute how many threads to have polling each cache-line.
46	// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47	void distributedBarrier::computeVarsForN(size_t n) {
48	int nsockets = `1`;
49	if (__kmp_topology) {
50	int socket_level = __kmp_topology->get_level(type: KMP_HW_SOCKET);
51	int core_level = __kmp_topology->get_level(type: KMP_HW_CORE);
52	int ncores_per_socket =
53	__kmp_topology->calculate_ratio(level1: core_level, level2: socket_level);
54	nsockets = __kmp_topology->get_count(level: socket_level);
55
56	if (nsockets <= `0`)
57	nsockets = `1`;
58	if (ncores_per_socket <= `0`)
59	ncores_per_socket = `1`;
60
61	threads_per_go = ncores_per_socket >> `1`;
62	if (!fix_threads_per_go) {
63	// Minimize num_gos
64	if (threads_per_go > `4`) {
65	if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66	threads_per_go = threads_per_go >> `1`;
67	}
68	if (threads_per_go > `4` && nsockets == `1`)
69	threads_per_go = threads_per_go >> `1`;
70	}
71	}
72	if (threads_per_go == `0`)
73	threads_per_go = `1`;
74	fix_threads_per_go = true;
75	num_gos = n / threads_per_go;
76	if (n % threads_per_go)
77	num_gos++;
78	if (nsockets == `1` \|\| num_gos == `1`)
79	num_groups = `1`;
80	else {
81	num_groups = num_gos / nsockets;
82	if (num_gos % nsockets)
83	num_groups++;
84	}
85	if (num_groups <= `0`)
86	num_groups = `1`;
87	gos_per_group = num_gos / num_groups;
88	if (num_gos % num_groups)
89	gos_per_group++;
90	threads_per_group = threads_per_go * gos_per_group;
91	} else {
92	num_gos = n / threads_per_go;
93	if (n % threads_per_go)
94	num_gos++;
95	if (num_gos == `1`)
96	num_groups = `1`;
97	else {
98	num_groups = num_gos / `2`;
99	if (num_gos % `2`)
100	num_groups++;
101	}
102	gos_per_group = num_gos / num_groups;
103	if (num_gos % num_groups)
104	gos_per_group++;
105	threads_per_group = threads_per_go * gos_per_group;
106	}
107	}
108
109	void distributedBarrier::computeGo(size_t n) {
110	// Minimize num_gos
111	for (num_gos = `1`;; num_gos++)
112	if (IDEAL_CONTENTION * num_gos >= n)
113	break;
114	threads_per_go = n / num_gos;
115	if (n % num_gos)
116	threads_per_go++;
117	while (num_gos > MAX_GOS) {
118	threads_per_go++;
119	num_gos = n / threads_per_go;
120	if (n % threads_per_go)
121	num_gos++;
122	}
123	computeVarsForN(n);
124	}
125
126	// This function is to resize the barrier arrays when the new number of threads
127	// exceeds max_threads, which is the current size of all the arrays
128	void distributedBarrier::resize(size_t nthr) {
129	KMP_DEBUG_ASSERT(nthr > max_threads);
130
131	// expand to requested size 2*
132	max_threads = nthr * `2`;
133
134	// allocate arrays to new max threads
135	for (int i = `0`; i < MAX_ITERS; ++i) {
136	if (flags[i])
137	flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138	max_threads * sizeof(flags_s));
139	else
140	flags[i] = (flags_s )KMP_INTERNAL_MALLOC(max_threads sizeof(flags_s));
141	}
142
143	if (go)
144	go = (go_s )KMP_INTERNAL_REALLOC(go, max_threads sizeof(go_s));
145	else
146	go = (go_s )KMP_INTERNAL_MALLOC(max_threads sizeof(go_s));
147
148	if (iter)
149	iter = (iter_s )KMP_INTERNAL_REALLOC(iter, max_threads sizeof(iter_s));
150	else
151	iter = (iter_s )KMP_INTERNAL_MALLOC(max_threads sizeof(iter_s));
152
153	if (sleep)
154	sleep =
155	(sleep_s )KMP_INTERNAL_REALLOC(sleep, max_threads sizeof(sleep_s));
156	else
157	sleep = (sleep_s )KMP_INTERNAL_MALLOC(max_threads sizeof(sleep_s));
158	}
159
160	// This function is to set all the go flags that threads might be waiting
161	// on, and when blocktime is not infinite, it should be followed by a wake-up
162	// call to each thread
163	kmp_uint64 distributedBarrier::go_release() {
164	kmp_uint64 next_go = iter[`0`].iter + distributedBarrier::MAX_ITERS;
165	for (size_t j = `0`; j < num_gos; j++) {
166	go[j].go.store(i: next_go);
167	}
168	return next_go;
169	}
170
171	void distributedBarrier::go_reset() {
172	for (size_t j = `0`; j < max_threads; ++j) {
173	for (size_t i = `0`; i < distributedBarrier::MAX_ITERS; ++i) {
174	flags[i][j].stillNeed = `1`;
175	}
176	go[j].go.store(i: `0`);
177	iter[j].iter = `0`;
178	}
179	}
180
181	// This function inits/re-inits the distributed barrier for a particular number
182	// of threads. If a resize of arrays is needed, it calls the resize function.
183	void distributedBarrier::init(size_t nthr) {
184	size_t old_max = max_threads;
185	if (nthr > max_threads) { // need more space in arrays
186	resize(nthr);
187	}
188
189	for (size_t i = `0`; i < max_threads; i++) {
190	for (size_t j = `0`; j < distributedBarrier::MAX_ITERS; j++) {
191	flags[j][i].stillNeed = `1`;
192	}
193	go[i].go.store(i: `0`);
194	iter[i].iter = `0`;
195	if (i >= old_max)
196	sleep[i].sleep = false;
197	}
198
199	// Recalculate num_gos, etc. based on new nthr
200	computeVarsForN(n: nthr);
201
202	num_threads = nthr;
203
204	if (team_icvs == NULL)
205	team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206	}
207
208	// This function is used only when KMP_BLOCKTIME is not infinite.
209	// static
210	void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211	size_t start, size_t stop, size_t inc,
212	size_t tid) {
213	KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215	return;
216
217	kmp_info_t **other_threads = team->t.t_threads;
218	for (size_t thr = start; thr < stop; thr += inc) {
219	KMP_DEBUG_ASSERT(other_threads[thr]);
220	int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221	// Wake up worker regardless of if it appears to be sleeping or not
222	__kmp_atomic_resume_64(target_gtid: gtid, flag: (kmp_atomic_flag_64<> *)NULL);
223	}
224	}
225
226	static void __kmp_dist_barrier_gather(
227	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
228	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
229	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230	kmp_team_t *team;
231	distributedBarrier *b;
232	kmp_info_t **other_threads;
233	kmp_uint64 my_current_iter, my_next_iter;
234	kmp_uint32 nproc;
235	bool group_leader;
236
237	team = this_thr->th.th_team;
238	nproc = this_thr->th.th_team_nproc;
239	other_threads = team->t.t_threads;
240	b = team->t.b;
241	my_current_iter = b->iter[tid].iter;
242	my_next_iter = (my_current_iter + `1`) % distributedBarrier::MAX_ITERS;
243	group_leader = ((tid % b->threads_per_group) == `0`);
244
245	KA_TRACE(`20`,
246	("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247	gtid, team->t.t_id, tid, bt));
248
249	#if USE_ITT_BUILD && USE_ITT_NOTIFY
250	// Barrier imbalance - save arrive time to the thread
251	if (__kmp_forkjoin_frames_mode == `3` \|\| __kmp_forkjoin_frames_mode == `2`) {
252	this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253	__itt_get_timestamp();
254	}
255	#endif
256
257	if (group_leader) {
258	// Start from the thread after the group leader
259	size_t group_start = tid + `1`;
260	size_t group_end = tid + b->threads_per_group;
261	size_t threads_pending = `0`;
262
263	if (group_end > nproc)
264	group_end = nproc;
265	do { // wait for threads in my group
266	threads_pending = `0`;
267	// Check all the flags every time to avoid branch misspredict
268	for (size_t thr = group_start; thr < group_end; thr++) {
269	// Each thread uses a different cache line
270	threads_pending += b->flags[my_current_iter][thr].stillNeed;
271	}
272	// Execute tasks here
273	if (__kmp_tasking_mode != tskm_immediate_exec) {
274	kmp_task_team_t *task_team = this_thr->th.th_task_team;
275	if (task_team != NULL) {
276	if (TCR_SYNC_4(task_team->tt.tt_active)) {
277	if (KMP_TASKING_ENABLED(task_team)) {
278	int tasks_completed = FALSE;
279	__kmp_atomic_execute_tasks_64(
280	thread: this_thr, gtid, flag: (kmp_atomic_flag_64<> *)NULL, FALSE,
281	thread_finished: &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained: `0`);
282	} else
283	this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284	}
285	} else {
286	this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287	} // if
288	}
289	if (TCR_4(__kmp_global.g.g_done)) {
290	if (__kmp_global.g.g_abort)
291	__kmp_abort_thread();
292	break;
293	} else if (__kmp_tasking_mode != tskm_immediate_exec &&
294	this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295	this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296	}
297	} while (threads_pending > `0`);
298
299	if (reduce) { // Perform reduction if needed
300	OMPT_REDUCTION_DECL(this_thr, gtid);
301	OMPT_REDUCTION_BEGIN;
302	// Group leader reduces all threads in group
303	for (size_t thr = group_start; thr < group_end; thr++) {
304	(*reduce)(this_thr->th.th_local.reduce_data,
305	other_threads[thr]->th.th_local.reduce_data);
306	}
307	OMPT_REDUCTION_END;
308	}
309
310	// Set flag for next iteration
311	b->flags[my_next_iter][tid].stillNeed = `1`;
312	// Each thread uses a different cache line; resets stillNeed to 0 to
313	// indicate it has reached the barrier
314	b->flags[my_current_iter][tid].stillNeed = `0`;
315
316	do { // wait for all group leaders
317	threads_pending = `0`;
318	for (size_t thr = `0`; thr < nproc; thr += b->threads_per_group) {
319	threads_pending += b->flags[my_current_iter][thr].stillNeed;
320	}
321	// Execute tasks here
322	if (__kmp_tasking_mode != tskm_immediate_exec) {
323	kmp_task_team_t *task_team = this_thr->th.th_task_team;
324	if (task_team != NULL) {
325	if (TCR_SYNC_4(task_team->tt.tt_active)) {
326	if (KMP_TASKING_ENABLED(task_team)) {
327	int tasks_completed = FALSE;
328	__kmp_atomic_execute_tasks_64(
329	thread: this_thr, gtid, flag: (kmp_atomic_flag_64<> *)NULL, FALSE,
330	thread_finished: &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained: `0`);
331	} else
332	this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333	}
334	} else {
335	this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336	} // if
337	}
338	if (TCR_4(__kmp_global.g.g_done)) {
339	if (__kmp_global.g.g_abort)
340	__kmp_abort_thread();
341	break;
342	} else if (__kmp_tasking_mode != tskm_immediate_exec &&
343	this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344	this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345	}
346	} while (threads_pending > `0`);
347
348	if (reduce) { // Perform reduction if needed
349	if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350	OMPT_REDUCTION_DECL(this_thr, gtid);
351	OMPT_REDUCTION_BEGIN;
352	for (size_t thr = b->threads_per_group; thr < nproc;
353	thr += b->threads_per_group) {
354	(*reduce)(this_thr->th.th_local.reduce_data,
355	other_threads[thr]->th.th_local.reduce_data);
356	}
357	OMPT_REDUCTION_END;
358	}
359	}
360	} else {
361	// Set flag for next iteration
362	b->flags[my_next_iter][tid].stillNeed = `1`;
363	// Each thread uses a different cache line; resets stillNeed to 0 to
364	// indicate it has reached the barrier
365	b->flags[my_current_iter][tid].stillNeed = `0`;
366	}
367
368	KMP_MFENCE();
369
370	KA_TRACE(`20`,
371	("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372	gtid, team->t.t_id, tid, bt));
373	}
374
375	static void __kmp_dist_barrier_release(
376	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
377	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379	kmp_team_t *team;
380	distributedBarrier *b;
381	kmp_bstate_t *thr_bar;
382	kmp_uint64 my_current_iter, next_go;
383	size_t my_go_index;
384	bool group_leader;
385
386	KA_TRACE(`20`, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387	gtid, tid, bt));
388
389	thr_bar = &this_thr->th.th_bar[bt].bb;
390
391	if (!KMP_MASTER_TID(tid)) {
392	// workers and non-master group leaders need to check their presence in team
393	do {
394	if (this_thr->th.th_used_in_team.load() != `1` &&
395	this_thr->th.th_used_in_team.load() != `3`) {
396	// Thread is not in use in a team. Wait on location in tid's thread
397	// struct. The 0 value tells anyone looking that this thread is spinning
398	// or sleeping until this location becomes 3 again; 3 is the transition
399	// state to get to 1 which is waiting on go and being in the team
400	kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), `3`);
401	if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), `2`,
402	`0`) \|\|
403	this_thr->th.th_used_in_team.load() == `0`) {
404	my_flag.wait(this_thr, final_spin: true USE_ITT_BUILD_ARG(itt_sync_obj));
405	}
406	#if USE_ITT_BUILD && USE_ITT_NOTIFY
407	if ((__itt_sync_create_ptr && itt_sync_obj == NULL) \|\| KMP_ITT_DEBUG) {
408	// In fork barrier where we could not get the object reliably
409	itt_sync_obj =
410	__kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier, set_name: `0`, delta: -`1`);
411	// Cancel wait on previous parallel region...
412	__kmp_itt_task_starting(object: itt_sync_obj);
413
414	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415	return;
416
417	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
418	if (itt_sync_obj != NULL)
419	// Call prepare as early as possible for "new" barrier
420	__kmp_itt_task_finished(object: itt_sync_obj);
421	} else
422	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424	return;
425	}
426	if (this_thr->th.th_used_in_team.load() != `1` &&
427	this_thr->th.th_used_in_team.load() != `3`) // spurious wake-up?
428	continue;
429	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430	return;
431
432	// At this point, the thread thinks it is in use in a team, or in
433	// transition to be used in a team, but it might have reached this barrier
434	// before it was marked unused by the team. Unused threads are awoken and
435	// shifted to wait on local thread struct elsewhere. It also might reach
436	// this point by being picked up for use by a different team. Either way,
437	// we need to update the tid.
438	tid = __kmp_tid_from_gtid(gtid);
439	team = this_thr->th.th_team;
440	KMP_DEBUG_ASSERT(tid >= `0`);
441	KMP_DEBUG_ASSERT(team);
442	b = team->t.b;
443	my_current_iter = b->iter[tid].iter;
444	next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445	my_go_index = tid / b->threads_per_go;
446	if (this_thr->th.th_used_in_team.load() == `3`) {
447	KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), `3`, `1`);
448	}
449	// Check if go flag is set
450	if (b->go[my_go_index].go.load() != next_go) {
451	// Wait on go flag on team
452	kmp_atomic_flag_64<false, true> my_flag(
453	&(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
454	my_flag.wait(this_thr, final_spin: true USE_ITT_BUILD_ARG(itt_sync_obj));
455	KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter \|\|
456	b->iter[tid].iter == `0`);
457	KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
458	}
459
460	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
461	return;
462	// At this point, the thread's go location was set. This means the primary
463	// thread is safely in the barrier, and so this thread's data is
464	// up-to-date, but we should check again that this thread is really in
465	// use in the team, as it could have been woken up for the purpose of
466	// changing team size, or reaping threads at shutdown.
467	if (this_thr->th.th_used_in_team.load() == `1`)
468	break;
469	} while (`1`);
470
471	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
472	return;
473
474	group_leader = ((tid % b->threads_per_group) == `0`);
475	if (group_leader) {
476	// Tell all the threads in my group they can go!
477	for (size_t go_idx = my_go_index + `1`;
478	go_idx < my_go_index + b->gos_per_group; go_idx++) {
479	b->go[go_idx].go.store(i: next_go);
480	}
481	// Fence added so that workers can see changes to go. sfence inadequate.
482	KMP_MFENCE();
483	}
484
485	#if KMP_BARRIER_ICV_PUSH
486	if (propagate_icvs) { // copy ICVs to final dest
487	__kmp_init_implicit_task(loc_ref: team->t.t_ident, this_thr: team->t.t_threads[tid], team,
488	tid, FALSE);
489	copy_icvs(dst: &team->t.t_implicit_task_taskdata[tid].td_icvs,
490	src: (kmp_internal_control_t *)team->t.b->team_icvs);
491	copy_icvs(dst: &thr_bar->th_fixed_icvs,
492	src: &team->t.t_implicit_task_taskdata[tid].td_icvs);
493	}
494	#endif
495	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
496	// This thread is now awake and participating in the barrier;
497	// wake up the other threads in the group
498	size_t nproc = this_thr->th.th_team_nproc;
499	size_t group_end = tid + b->threads_per_group;
500	if (nproc < group_end)
501	group_end = nproc;
502	__kmp_dist_barrier_wakeup(bt, team, start: tid + `1`, stop: group_end, inc: `1`, tid);
503	}
504	} else { // Primary thread
505	team = this_thr->th.th_team;
506	b = team->t.b;
507	my_current_iter = b->iter[tid].iter;
508	next_go = my_current_iter + distributedBarrier::MAX_ITERS;
509	#if KMP_BARRIER_ICV_PUSH
510	if (propagate_icvs) {
511	// primary thread has ICVs in final destination; copy
512	copy_icvs(dst: &thr_bar->th_fixed_icvs,
513	src: &team->t.t_implicit_task_taskdata[tid].td_icvs);
514	}
515	#endif
516	// Tell all the group leaders they can go!
517	for (size_t go_idx = `0`; go_idx < b->num_gos; go_idx += b->gos_per_group) {
518	b->go[go_idx].go.store(i: next_go);
519	}
520
521	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
522	// Wake-up the group leaders
523	size_t nproc = this_thr->th.th_team_nproc;
524	__kmp_dist_barrier_wakeup(bt, team, start: tid + b->threads_per_group, stop: nproc,
525	inc: b->threads_per_group, tid);
526	}
527
528	// Tell all the threads in my group they can go!
529	for (size_t go_idx = `1`; go_idx < b->gos_per_group; go_idx++) {
530	b->go[go_idx].go.store(i: next_go);
531	}
532
533	// Fence added so that workers can see changes to go. sfence inadequate.
534	KMP_MFENCE();
535
536	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
537	// Wake-up the other threads in my group
538	size_t nproc = this_thr->th.th_team_nproc;
539	size_t group_end = tid + b->threads_per_group;
540	if (nproc < group_end)
541	group_end = nproc;
542	__kmp_dist_barrier_wakeup(bt, team, start: tid + `1`, stop: group_end, inc: `1`, tid);
543	}
544	}
545	// Update to next iteration
546	KMP_ASSERT(my_current_iter == b->iter[tid].iter);
547	b->iter[tid].iter = (b->iter[tid].iter + `1`) % distributedBarrier::MAX_ITERS;
548
549	KA_TRACE(
550	`20`, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551	gtid, team->t.t_id, tid, bt));
552	}
553
554	// Linear Barrier
555	template <bool cancellable = false>
556	static bool __kmp_linear_barrier_gather_template(
557	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
558	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
559	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
560	kmp_team_t *team = this_thr->th.th_team;
561	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
562	kmp_info_t **other_threads = team->t.t_threads;
563
564	KA_TRACE(
565	`20`,
566	("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567	gtid, team->t.t_id, tid, bt));
568	KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
569
570	#if USE_ITT_BUILD && USE_ITT_NOTIFY
571	// Barrier imbalance - save arrive time to the thread
572	if (__kmp_forkjoin_frames_mode == `3` \|\| __kmp_forkjoin_frames_mode == `2`) {
573	this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
574	__itt_get_timestamp();
575	}
576	#endif
577	// We now perform a linear reduction to signal that all of the threads have
578	// arrived.
579	if (!KMP_MASTER_TID(tid)) {
580	KA_TRACE(`20`,
581	("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
582	"arrived(%p): %llu => %llu\n",
583	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(`0`, team),
584	team->t.t_id, `0`, &thr_bar->b_arrived, thr_bar->b_arrived,
585	thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
586	// Mark arrival to primary thread
587	/ After performing this write, a worker thread may not assume that the team*
588	is valid any more - it could be deallocated by the primary thread at any
589	time. /*
590	kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[`0`]);
591	flag.release();
592	} else {
593	kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
594	int nproc = this_thr->th.th_team_nproc;
595	int i;
596	// Don't have to worry about sleep bit here or atomic since team setting
597	kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
598
599	// Collect all the worker team member threads.
600	for (i = `1`; i < nproc; ++i) {
601	#if KMP_CACHE_MANAGE
602	// Prefetch next thread's arrived count
603	if (i + `1` < nproc)
604	KMP_CACHE_PREFETCH(&other_threads[i + `1`]->th.th_bar[bt].bb.b_arrived);
605	#endif /* KMP_CACHE_MANAGE */
606	KA_TRACE(`20`, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
607	"arrived(%p) == %llu\n",
608	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
609	team->t.t_id, i,
610	&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
611
612	// Wait for worker thread to arrive
613	if (cancellable) {
614	kmp_flag_64<true, false> flag(
615	&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
616	if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
617	return true;
618	} else {
619	kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
620	new_state);
621	flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
622	}
623	#if USE_ITT_BUILD && USE_ITT_NOTIFY
624	// Barrier imbalance - write min of the thread time and the other thread
625	// time to the thread.
626	if (__kmp_forkjoin_frames_mode == `2`) {
627	this_thr->th.th_bar_min_time = KMP_MIN(
628	this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
629	}
630	#endif
631	if (reduce) {
632	KA_TRACE(`100`,
633	("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635	team->t.t_id, i));
636	OMPT_REDUCTION_DECL(this_thr, gtid);
637	OMPT_REDUCTION_BEGIN;
638	(*reduce)(this_thr->th.th_local.reduce_data,
639	other_threads[i]->th.th_local.reduce_data);
640	OMPT_REDUCTION_END;
641	}
642	}
643	// Don't have to worry about sleep bit here or atomic since team setting
644	team_bar->b_arrived = new_state;
645	KA_TRACE(`20`, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
646	"arrived(%p) = %llu\n",
647	gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
648	new_state));
649	}
650	KA_TRACE(
651	`20`,
652	("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653	gtid, team->t.t_id, tid, bt));
654	return false;
655	}
656
657	template <bool cancellable = false>
658	static bool __kmp_linear_barrier_release_template(
659	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
660	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
661	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
662	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
663	kmp_team_t *team;
664
665	if (KMP_MASTER_TID(tid)) {
666	unsigned int i;
667	kmp_uint32 nproc = this_thr->th.th_team_nproc;
668	kmp_info_t **other_threads;
669
670	team = __kmp_threads[gtid]->th.th_team;
671	KMP_DEBUG_ASSERT(team != NULL);
672	other_threads = team->t.t_threads;
673
674	KA_TRACE(`20`, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
675	"barrier type %d\n",
676	gtid, team->t.t_id, tid, bt));
677
678	if (nproc > `1`) {
679	#if KMP_BARRIER_ICV_PUSH
680	{
681	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
682	if (propagate_icvs) {
683	ngo_load(&team->t.t_implicit_task_taskdata[`0`].td_icvs);
684	for (i = `1`; i < nproc; ++i) {
685	__kmp_init_implicit_task(loc_ref: team->t.t_ident, this_thr: team->t.t_threads[i],
686	team, tid: i, FALSE);
687	ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
688	&team->t.t_implicit_task_taskdata[`0`].td_icvs);
689	}
690	ngo_sync();
691	}
692	}
693	#endif // KMP_BARRIER_ICV_PUSH
694
695	// Now, release all of the worker threads
696	for (i = `1`; i < nproc; ++i) {
697	#if KMP_CACHE_MANAGE
698	// Prefetch next thread's go flag
699	if (i + `1` < nproc)
700	KMP_CACHE_PREFETCH(&other_threads[i + `1`]->th.th_bar[bt].bb.b_go);
701	#endif /* KMP_CACHE_MANAGE */
702	KA_TRACE(
703	`20`,
704	("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
705	"go(%p): %u => %u\n",
706	gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
707	team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
708	other_threads[i]->th.th_bar[bt].bb.b_go,
709	other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
710	kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
711	other_threads[i]);
712	flag.release();
713	}
714	}
715	} else { // Wait for the PRIMARY thread to release us
716	KA_TRACE(`20`, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717	gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
718	if (cancellable) {
719	kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
720	if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
721	return true;
722	} else {
723	kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
724	flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
725	}
726	#if USE_ITT_BUILD && USE_ITT_NOTIFY
727	if ((__itt_sync_create_ptr && itt_sync_obj == NULL) \|\| KMP_ITT_DEBUG) {
728	// In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
729	// disabled)
730	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier, set_name: `0`, delta: -`1`);
731	// Cancel wait on previous parallel region...
732	__kmp_itt_task_starting(object: itt_sync_obj);
733
734	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
735	return false;
736
737	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
738	if (itt_sync_obj != NULL)
739	// Call prepare as early as possible for "new" barrier
740	__kmp_itt_task_finished(object: itt_sync_obj);
741	} else
742	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
743	// Early exit for reaping threads releasing forkjoin barrier
744	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745	return false;
746	// The worker thread may now assume that the team is valid.
747	#ifdef KMP_DEBUG
748	tid = __kmp_tid_from_gtid(gtid);
749	team = __kmp_threads[gtid]->th.th_team;
750	#endif
751	KMP_DEBUG_ASSERT(team != NULL);
752	TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
753	KA_TRACE(`20`,
754	("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755	gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
756	KMP_MB(); // Flush all pending memory write invalidates.
757	}
758	KA_TRACE(
759	`20`,
760	("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761	gtid, team->t.t_id, tid, bt));
762	return false;
763	}
764
765	static void __kmp_linear_barrier_gather(
766	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
767	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
768	__kmp_linear_barrier_gather_template<false>(
769	bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
770	}
771
772	static bool __kmp_linear_barrier_gather_cancellable(
773	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
774	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
775	return __kmp_linear_barrier_gather_template<true>(
776	bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
777	}
778
779	static void __kmp_linear_barrier_release(
780	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
781	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
782	__kmp_linear_barrier_release_template<false>(
783	bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
784	}
785
786	static bool __kmp_linear_barrier_release_cancellable(
787	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
788	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
789	return __kmp_linear_barrier_release_template<true>(
790	bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
791	}
792
793	// Tree barrier
794	static void __kmp_tree_barrier_gather(
795	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
796	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
797	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
798	kmp_team_t *team = this_thr->th.th_team;
799	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
800	kmp_info_t **other_threads = team->t.t_threads;
801	kmp_uint32 nproc = this_thr->th.th_team_nproc;
802	kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
803	kmp_uint32 branch_factor = `1` << branch_bits;
804	kmp_uint32 child;
805	kmp_uint32 child_tid;
806	kmp_uint64 new_state = `0`;
807
808	KA_TRACE(
809	`20`, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810	gtid, team->t.t_id, tid, bt));
811	KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
812
813	#if USE_ITT_BUILD && USE_ITT_NOTIFY
814	// Barrier imbalance - save arrive time to the thread
815	if (__kmp_forkjoin_frames_mode == `3` \|\| __kmp_forkjoin_frames_mode == `2`) {
816	this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
817	__itt_get_timestamp();
818	}
819	#endif
820	// Perform tree gather to wait until all threads have arrived; reduce any
821	// required data as we go
822	child_tid = (tid << branch_bits) + `1`;
823	if (child_tid < nproc) {
824	// Parent threads wait for all their children to arrive
825	new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
826	child = `1`;
827	do {
828	kmp_info_t *child_thr = other_threads[child_tid];
829	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
830	#if KMP_CACHE_MANAGE
831	// Prefetch next thread's arrived count
832	if (child + `1` <= branch_factor && child_tid + `1` < nproc)
833	KMP_CACHE_PREFETCH(
834	&other_threads[child_tid + `1`]->th.th_bar[bt].bb.b_arrived);
835	#endif /* KMP_CACHE_MANAGE */
836	KA_TRACE(`20`,
837	("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
838	"arrived(%p) == %llu\n",
839	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
840	team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
841	// Wait for child to arrive
842	kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
843	flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
844	#if USE_ITT_BUILD && USE_ITT_NOTIFY
845	// Barrier imbalance - write min of the thread time and a child time to
846	// the thread.
847	if (__kmp_forkjoin_frames_mode == `2`) {
848	this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
849	child_thr->th.th_bar_min_time);
850	}
851	#endif
852	if (reduce) {
853	KA_TRACE(`100`,
854	("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
856	team->t.t_id, child_tid));
857	OMPT_REDUCTION_DECL(this_thr, gtid);
858	OMPT_REDUCTION_BEGIN;
859	(*reduce)(this_thr->th.th_local.reduce_data,
860	child_thr->th.th_local.reduce_data);
861	OMPT_REDUCTION_END;
862	}
863	child++;
864	child_tid++;
865	} while (child <= branch_factor && child_tid < nproc);
866	}
867
868	if (!KMP_MASTER_TID(tid)) { // Worker threads
869	kmp_int32 parent_tid = (tid - `1`) >> branch_bits;
870
871	KA_TRACE(`20`,
872	("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
873	"arrived(%p): %llu => %llu\n",
874	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
875	team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
876	thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
877
878	// Mark arrival to parent thread
879	/ After performing this write, a worker thread may not assume that the team*
880	is valid any more - it could be deallocated by the primary thread at any
881	time. /*
882	kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
883	flag.release();
884	} else {
885	// Need to update the team arrived pointer if we are the primary thread
886	if (nproc > `1`) // New value was already computed above
887	team->t.t_bar[bt].b_arrived = new_state;
888	else
889	team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
890	KA_TRACE(`20`, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
891	"arrived(%p) = %llu\n",
892	gtid, team->t.t_id, tid, team->t.t_id,
893	&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
894	}
895	KA_TRACE(`20`,
896	("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897	gtid, team->t.t_id, tid, bt));
898	}
899
900	static void __kmp_tree_barrier_release(
901	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
902	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
903	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
904	kmp_team_t *team;
905	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
906	kmp_uint32 nproc;
907	kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
908	kmp_uint32 branch_factor = `1` << branch_bits;
909	kmp_uint32 child;
910	kmp_uint32 child_tid;
911
912	// Perform a tree release for all of the threads that have been gathered
913	if (!KMP_MASTER_TID(
914	tid)) { // Handle fork barrier workers who aren't part of a team yet
915	KA_TRACE(`20`, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916	&thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
917	// Wait for parent thread to release us
918	kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
919	flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
920	#if USE_ITT_BUILD && USE_ITT_NOTIFY
921	if ((__itt_sync_create_ptr && itt_sync_obj == NULL) \|\| KMP_ITT_DEBUG) {
922	// In fork barrier where we could not get the object reliably (or
923	// ITTNOTIFY is disabled)
924	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier, set_name: `0`, delta: -`1`);
925	// Cancel wait on previous parallel region...
926	__kmp_itt_task_starting(object: itt_sync_obj);
927
928	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
929	return;
930
931	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
932	if (itt_sync_obj != NULL)
933	// Call prepare as early as possible for "new" barrier
934	__kmp_itt_task_finished(object: itt_sync_obj);
935	} else
936	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
937	// Early exit for reaping threads releasing forkjoin barrier
938	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
939	return;
940
941	// The worker thread may now assume that the team is valid.
942	team = __kmp_threads[gtid]->th.th_team;
943	KMP_DEBUG_ASSERT(team != NULL);
944	tid = __kmp_tid_from_gtid(gtid);
945
946	TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
947	KA_TRACE(`20`,
948	("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949	team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
950	KMP_MB(); // Flush all pending memory write invalidates.
951	} else {
952	team = __kmp_threads[gtid]->th.th_team;
953	KMP_DEBUG_ASSERT(team != NULL);
954	KA_TRACE(`20`, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
955	"barrier type %d\n",
956	gtid, team->t.t_id, tid, bt));
957	}
958	nproc = this_thr->th.th_team_nproc;
959	child_tid = (tid << branch_bits) + `1`;
960
961	if (child_tid < nproc) {
962	kmp_info_t **other_threads = team->t.t_threads;
963	child = `1`;
964	// Parent threads release all their children
965	do {
966	kmp_info_t *child_thr = other_threads[child_tid];
967	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
968	#if KMP_CACHE_MANAGE
969	// Prefetch next thread's go count
970	if (child + `1` <= branch_factor && child_tid + `1` < nproc)
971	KMP_CACHE_PREFETCH(
972	&other_threads[child_tid + `1`]->th.th_bar[bt].bb.b_go);
973	#endif /* KMP_CACHE_MANAGE */
974
975	#if KMP_BARRIER_ICV_PUSH
976	{
977	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
978	if (propagate_icvs) {
979	__kmp_init_implicit_task(loc_ref: team->t.t_ident,
980	this_thr: team->t.t_threads[child_tid], team,
981	tid: child_tid, FALSE);
982	copy_icvs(dst: &team->t.t_implicit_task_taskdata[child_tid].td_icvs,
983	src: &team->t.t_implicit_task_taskdata[`0`].td_icvs);
984	}
985	}
986	#endif // KMP_BARRIER_ICV_PUSH
987	KA_TRACE(`20`,
988	("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
989	"go(%p): %u => %u\n",
990	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991	team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992	child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993	// Release child from barrier
994	kmp_flag_64<> flag(&child_bar->b_go, child_thr);
995	flag.release();
996	child++;
997	child_tid++;
998	} while (child <= branch_factor && child_tid < nproc);
999	}
1000	KA_TRACE(
1001	`20`, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002	gtid, team->t.t_id, tid, bt));
1003	}
1004
1005	// Hyper Barrier
1006	static void __kmp_hyper_barrier_gather(
1007	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
1008	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
1009	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1010	kmp_team_t *team = this_thr->th.th_team;
1011	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1012	kmp_info_t **other_threads = team->t.t_threads;
1013	kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1014	kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1015	kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1016	kmp_uint32 branch_factor = `1` << branch_bits;
1017	kmp_uint32 offset;
1018	kmp_uint32 level;
1019
1020	KA_TRACE(
1021	`20`,
1022	("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023	gtid, team->t.t_id, tid, bt));
1024	KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1025
1026	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1027	// Barrier imbalance - save arrive time to the thread
1028	if (__kmp_forkjoin_frames_mode == `3` \|\| __kmp_forkjoin_frames_mode == `2`) {
1029	this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1030	__itt_get_timestamp();
1031	}
1032	#endif
1033	/ Perform a hypercube-embedded tree gather to wait until all of the threads*
1034	have arrived, and reduce any required data as we go. /*
1035	kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1036	for (level = `0`, offset = `1`; offset < num_threads;
1037	level += branch_bits, offset <<= branch_bits) {
1038	kmp_uint32 child;
1039	kmp_uint32 child_tid;
1040
1041	if (((tid >> level) & (branch_factor - `1`)) != `0`) {
1042	kmp_int32 parent_tid = tid & ~((`1` << (level + branch_bits)) - `1`);
1043
1044	KMP_MB(); // Synchronize parent and child threads.
1045	KA_TRACE(`20`,
1046	("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1047	"arrived(%p): %llu => %llu\n",
1048	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1049	team->t.t_id, parent_tid, &thr_bar->b_arrived,
1050	thr_bar->b_arrived,
1051	thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1052	// Mark arrival to parent thread
1053	/ After performing this write (in the last iteration of the enclosing for*
1054	loop), a worker thread may not assume that the team is valid any more
1055	- it could be deallocated by the primary thread at any time. /*
1056	p_flag.set_waiter(other_threads[parent_tid]);
1057	p_flag.release();
1058	break;
1059	}
1060
1061	// Parent threads wait for children to arrive
1062	if (new_state == KMP_BARRIER_UNUSED_STATE)
1063	new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1064	for (child = `1`, child_tid = tid + (`1` << level);
1065	child < branch_factor && child_tid < num_threads;
1066	child++, child_tid += (`1` << level)) {
1067	kmp_info_t *child_thr = other_threads[child_tid];
1068	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1069	#if KMP_CACHE_MANAGE
1070	kmp_uint32 next_child_tid = child_tid + (`1` << level);
1071	// Prefetch next thread's arrived count
1072	if (child + `1` < branch_factor && next_child_tid < num_threads)
1073	KMP_CACHE_PREFETCH(
1074	&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1075	#endif /* KMP_CACHE_MANAGE */
1076	KA_TRACE(`20`,
1077	("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1078	"arrived(%p) == %llu\n",
1079	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1080	team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1081	// Wait for child to arrive
1082	kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1083	c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1084	KMP_MB(); // Synchronize parent and child threads.
1085	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1086	// Barrier imbalance - write min of the thread time and a child time to
1087	// the thread.
1088	if (__kmp_forkjoin_frames_mode == `2`) {
1089	this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1090	child_thr->th.th_bar_min_time);
1091	}
1092	#endif
1093	if (reduce) {
1094	KA_TRACE(`100`,
1095	("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097	team->t.t_id, child_tid));
1098	OMPT_REDUCTION_DECL(this_thr, gtid);
1099	OMPT_REDUCTION_BEGIN;
1100	(*reduce)(this_thr->th.th_local.reduce_data,
1101	child_thr->th.th_local.reduce_data);
1102	OMPT_REDUCTION_END;
1103	}
1104	}
1105	}
1106
1107	if (KMP_MASTER_TID(tid)) {
1108	// Need to update the team arrived pointer if we are the primary thread
1109	if (new_state == KMP_BARRIER_UNUSED_STATE)
1110	team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1111	else
1112	team->t.t_bar[bt].b_arrived = new_state;
1113	KA_TRACE(`20`, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1114	"arrived(%p) = %llu\n",
1115	gtid, team->t.t_id, tid, team->t.t_id,
1116	&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1117	}
1118	KA_TRACE(
1119	`20`, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120	gtid, team->t.t_id, tid, bt));
1121	}
1122
1123	// The reverse versions seem to beat the forward versions overall
1124	#define KMP_REVERSE_HYPER_BAR
1125	static void __kmp_hyper_barrier_release(
1126	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
1127	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1128	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1129	kmp_team_t *team;
1130	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1131	kmp_info_t **other_threads;
1132	kmp_uint32 num_threads;
1133	kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1134	kmp_uint32 branch_factor = `1` << branch_bits;
1135	kmp_uint32 child;
1136	kmp_uint32 child_tid;
1137	kmp_uint32 offset;
1138	kmp_uint32 level;
1139
1140	/ Perform a hypercube-embedded tree release for all of the threads that have*
1141	been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1142	are released in the reverse order of the corresponding gather, otherwise
1143	threads are released in the same order. /*
1144	if (KMP_MASTER_TID(tid)) { // primary thread
1145	team = __kmp_threads[gtid]->th.th_team;
1146	KMP_DEBUG_ASSERT(team != NULL);
1147	KA_TRACE(`20`, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1148	"barrier type %d\n",
1149	gtid, team->t.t_id, tid, bt));
1150	#if KMP_BARRIER_ICV_PUSH
1151	if (propagate_icvs) { // primary already has ICVs in final destination; copy
1152	copy_icvs(dst: &thr_bar->th_fixed_icvs,
1153	src: &team->t.t_implicit_task_taskdata[tid].td_icvs);
1154	}
1155	#endif
1156	} else { // Handle fork barrier workers who aren't part of a team yet
1157	KA_TRACE(`20`, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158	&thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1159	// Wait for parent thread to release us
1160	kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1161	flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1162	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1163	if ((__itt_sync_create_ptr && itt_sync_obj == NULL) \|\| KMP_ITT_DEBUG) {
1164	// In fork barrier where we could not get the object reliably
1165	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier, set_name: `0`, delta: -`1`);
1166	// Cancel wait on previous parallel region...
1167	__kmp_itt_task_starting(object: itt_sync_obj);
1168
1169	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1170	return;
1171
1172	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
1173	if (itt_sync_obj != NULL)
1174	// Call prepare as early as possible for "new" barrier
1175	__kmp_itt_task_finished(object: itt_sync_obj);
1176	} else
1177	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1178	// Early exit for reaping threads releasing forkjoin barrier
1179	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1180	return;
1181
1182	// The worker thread may now assume that the team is valid.
1183	team = __kmp_threads[gtid]->th.th_team;
1184	KMP_DEBUG_ASSERT(team != NULL);
1185	tid = __kmp_tid_from_gtid(gtid);
1186
1187	TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1188	KA_TRACE(`20`,
1189	("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190	gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1191	KMP_MB(); // Flush all pending memory write invalidates.
1192	}
1193	num_threads = this_thr->th.th_team_nproc;
1194	other_threads = team->t.t_threads;
1195
1196	#ifdef KMP_REVERSE_HYPER_BAR
1197	// Count up to correct level for parent
1198	for (level = `0`, offset = `1`;
1199	offset < num_threads && (((tid >> level) & (branch_factor - `1`)) == `0`);
1200	level += branch_bits, offset <<= branch_bits)
1201	;
1202
1203	// Now go down from there
1204	for (level -= branch_bits, offset >>= branch_bits; offset != `0`;
1205	level -= branch_bits, offset >>= branch_bits)
1206	#else
1207	// Go down the tree, level by level
1208	for (level = `0`, offset = `1`; offset < num_threads;
1209	level += branch_bits, offset <<= branch_bits)
1210	#endif // KMP_REVERSE_HYPER_BAR
1211	{
1212	#ifdef KMP_REVERSE_HYPER_BAR
1213	/ Now go in reverse order through the children, highest to lowest.*
1214	Initial setting of child is conservative here. /*
1215	child = num_threads >> ((level == `0`) ? level : level - `1`);
1216	for (child = (child < branch_factor - `1`) ? child : branch_factor - `1`,
1217	child_tid = tid + (child << level);
1218	child >= `1`; child--, child_tid -= (`1` << level))
1219	#else
1220	if (((tid >> level) & (branch_factor - `1`)) != `0`)
1221	// No need to go lower than this, since this is the level parent would be
1222	// notified
1223	break;
1224	// Iterate through children on this level of the tree
1225	for (child = `1`, child_tid = tid + (`1` << level);
1226	child < branch_factor && child_tid < num_threads;
1227	child++, child_tid += (`1` << level))
1228	#endif // KMP_REVERSE_HYPER_BAR
1229	{
1230	if (child_tid >= num_threads)
1231	continue; // Child doesn't exist so keep going
1232	else {
1233	kmp_info_t *child_thr = other_threads[child_tid];
1234	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1235	#if KMP_CACHE_MANAGE
1236	kmp_uint32 next_child_tid = child_tid - (`1` << level);
1237	// Prefetch next thread's go count
1238	#ifdef KMP_REVERSE_HYPER_BAR
1239	if (child - `1` >= `1` && next_child_tid < num_threads)
1240	#else
1241	if (child + `1` < branch_factor && next_child_tid < num_threads)
1242	#endif // KMP_REVERSE_HYPER_BAR
1243	KMP_CACHE_PREFETCH(
1244	&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1245	#endif /* KMP_CACHE_MANAGE */
1246
1247	#if KMP_BARRIER_ICV_PUSH
1248	if (propagate_icvs) // push my fixed ICVs to my child
1249	copy_icvs(dst: &child_bar->th_fixed_icvs, src: &thr_bar->th_fixed_icvs);
1250	#endif // KMP_BARRIER_ICV_PUSH
1251
1252	KA_TRACE(
1253	`20`,
1254	("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1255	"go(%p): %u => %u\n",
1256	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1257	team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1258	child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1259	// Release child from barrier
1260	kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1261	flag.release();
1262	}
1263	}
1264	}
1265	#if KMP_BARRIER_ICV_PUSH
1266	if (propagate_icvs &&
1267	!KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1268	__kmp_init_implicit_task(loc_ref: team->t.t_ident, this_thr: team->t.t_threads[tid], team, tid,
1269	FALSE);
1270	copy_icvs(dst: &team->t.t_implicit_task_taskdata[tid].td_icvs,
1271	src: &thr_bar->th_fixed_icvs);
1272	}
1273	#endif
1274	KA_TRACE(
1275	`20`,
1276	("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277	gtid, team->t.t_id, tid, bt));
1278	}
1279
1280	// Hierarchical Barrier
1281
1282	// Initialize thread barrier data
1283	/ Initializes/re-initializes the hierarchical barrier data stored on a thread.*
1284	Performs the minimum amount of initialization required based on how the team
1285	has changed. Returns true if leaf children will require both on-core and
1286	traditional wake-up mechanisms. For example, if the team size increases,
1287	threads already in the team will respond to on-core wakeup on their parent
1288	thread, but threads newly added to the team will only be listening on the
1289	their local b_go. /*
1290	static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1291	kmp_bstate_t *thr_bar,
1292	kmp_uint32 nproc, int gtid,
1293	int tid, kmp_team_t *team) {
1294	// Checks to determine if (re-)initialization is needed
1295	bool uninitialized = thr_bar->team == NULL;
1296	bool team_changed = team != thr_bar->team;
1297	bool team_sz_changed = nproc != thr_bar->nproc;
1298	bool tid_changed = tid != thr_bar->old_tid;
1299	bool retval = false;
1300
1301	if (uninitialized \|\| team_sz_changed) {
1302	__kmp_get_hierarchy(nproc, thr_bar);
1303	}
1304
1305	if (uninitialized \|\| team_sz_changed \|\| tid_changed) {
1306	thr_bar->my_level = thr_bar->depth - `1`; // default for primary thread
1307	thr_bar->parent_tid = -`1`; // default for primary thread
1308	if (!KMP_MASTER_TID(tid)) {
1309	// if not primary thread, find parent thread in hierarchy
1310	kmp_uint32 d = `0`;
1311	while (d < thr_bar->depth) { // find parent based on level of thread in
1312	// hierarchy, and note level
1313	kmp_uint32 rem;
1314	if (d == thr_bar->depth - `2`) { // reached level right below the primary
1315	thr_bar->parent_tid = `0`;
1316	thr_bar->my_level = d;
1317	break;
1318	} else if ((rem = tid % thr_bar->skip_per_level[d + `1`]) != `0`) {
1319	// TODO: can we make the above op faster?
1320	// thread is not a subtree root at next level, so this is max
1321	thr_bar->parent_tid = tid - rem;
1322	thr_bar->my_level = d;
1323	break;
1324	}
1325	++d;
1326	}
1327	}
1328	__kmp_type_convert(src: `7` - ((tid - thr_bar->parent_tid) /
1329	(thr_bar->skip_per_level[thr_bar->my_level])),
1330	dest: &(thr_bar->offset));
1331	thr_bar->old_tid = tid;
1332	thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1333	thr_bar->team = team;
1334	thr_bar->parent_bar =
1335	&team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1336	}
1337	if (uninitialized \|\| team_changed \|\| tid_changed) {
1338	thr_bar->team = team;
1339	thr_bar->parent_bar =
1340	&team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1341	retval = true;
1342	}
1343	if (uninitialized \|\| team_sz_changed \|\| tid_changed) {
1344	thr_bar->nproc = nproc;
1345	thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1346	if (thr_bar->my_level == `0`)
1347	thr_bar->leaf_kids = `0`;
1348	if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + `1` > nproc)
1349	__kmp_type_convert(src: nproc - tid - `1`, dest: &(thr_bar->leaf_kids));
1350	thr_bar->leaf_state = `0`;
1351	for (int i = `0`; i < thr_bar->leaf_kids; ++i)
1352	((char *)&(thr_bar->leaf_state))[`7` - i] = `1`;
1353	}
1354	return retval;
1355	}
1356
1357	static void __kmp_hierarchical_barrier_gather(
1358	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
1359	void (reduce)(void* , void* ) USE_ITT_BUILD_ARG(void* *itt_sync_obj)) {
1360	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361	kmp_team_t *team = this_thr->th.th_team;
1362	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363	kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364	kmp_info_t **other_threads = team->t.t_threads;
1365	kmp_uint64 new_state = `0`;
1366
1367	int level = team->t.t_level;
1368	if (other_threads[`0`]
1369	->th.th_teams_microtask) // are we inside the teams construct?
1370	if (this_thr->th.th_teams_size.nteams > `1`)
1371	++level; // level was not increased in teams construct for team_of_masters
1372	if (level == `1`)
1373	thr_bar->use_oncore_barrier = `1`;
1374	else
1375	thr_bar->use_oncore_barrier = `0`; // Do not use oncore barrier when nested
1376
1377	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378	"barrier type %d\n",
1379	gtid, team->t.t_id, tid, bt));
1380	KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381
1382	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1383	// Barrier imbalance - save arrive time to the thread
1384	if (__kmp_forkjoin_frames_mode == `3` \|\| __kmp_forkjoin_frames_mode == `2`) {
1385	this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386	}
1387	#endif
1388
1389	(void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390	team);
1391
1392	if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393	kmp_int32 child_tid;
1394	new_state =
1395	(kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396	if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397	thr_bar->use_oncore_barrier) {
1398	if (thr_bar->leaf_kids) {
1399	// First, wait for leaf children to check-in on my b_arrived flag
1400	kmp_uint64 leaf_state =
1401	KMP_MASTER_TID(tid)
1402	? thr_bar->b_arrived \| thr_bar->leaf_state
1403	: team->t.t_bar[bt].b_arrived \| thr_bar->leaf_state;
1404	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405	"for leaf kids\n",
1406	gtid, team->t.t_id, tid));
1407	kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408	flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409	if (reduce) {
1410	OMPT_REDUCTION_DECL(this_thr, gtid);
1411	OMPT_REDUCTION_BEGIN;
1412	for (child_tid = tid + `1`; child_tid <= tid + thr_bar->leaf_kids;
1413	++child_tid) {
1414	KA_TRACE(`100`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415	"T#%d(%d:%d)\n",
1416	gtid, team->t.t_id, tid,
1417	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418	child_tid));
1419	(*reduce)(this_thr->th.th_local.reduce_data,
1420	other_threads[child_tid]->th.th_local.reduce_data);
1421	}
1422	OMPT_REDUCTION_END;
1423	}
1424	// clear leaf_state bits
1425	KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426	}
1427	// Next, wait for higher level children on each child's b_arrived flag
1428	for (kmp_uint32 d = `1`; d < thr_bar->my_level;
1429	++d) { // gather lowest level threads first, but skip 0
1430	kmp_uint32 last = tid + thr_bar->skip_per_level[d + `1`],
1431	skip = thr_bar->skip_per_level[d];
1432	if (last > nproc)
1433	last = nproc;
1434	for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435	kmp_info_t *child_thr = other_threads[child_tid];
1436	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438	"T#%d(%d:%d) "
1439	"arrived(%p) == %llu\n",
1440	gtid, team->t.t_id, tid,
1441	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442	child_tid, &child_bar->b_arrived, new_state));
1443	kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444	flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445	if (reduce) {
1446	KA_TRACE(`100`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447	"T#%d(%d:%d)\n",
1448	gtid, team->t.t_id, tid,
1449	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450	child_tid));
1451	(*reduce)(this_thr->th.th_local.reduce_data,
1452	child_thr->th.th_local.reduce_data);
1453	}
1454	}
1455	}
1456	} else { // Blocktime is not infinite
1457	for (kmp_uint32 d = `0`; d < thr_bar->my_level;
1458	++d) { // Gather lowest level threads first
1459	kmp_uint32 last = tid + thr_bar->skip_per_level[d + `1`],
1460	skip = thr_bar->skip_per_level[d];
1461	if (last > nproc)
1462	last = nproc;
1463	for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464	kmp_info_t *child_thr = other_threads[child_tid];
1465	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467	"T#%d(%d:%d) "
1468	"arrived(%p) == %llu\n",
1469	gtid, team->t.t_id, tid,
1470	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471	child_tid, &child_bar->b_arrived, new_state));
1472	kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473	flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474	if (reduce) {
1475	KA_TRACE(`100`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476	"T#%d(%d:%d)\n",
1477	gtid, team->t.t_id, tid,
1478	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479	child_tid));
1480	(*reduce)(this_thr->th.th_local.reduce_data,
1481	child_thr->th.th_local.reduce_data);
1482	}
1483	}
1484	}
1485	}
1486	}
1487	// All subordinates are gathered; now release parent if not primary thread
1488
1489	if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491	" T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492	gtid, team->t.t_id, tid,
1493	__kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494	thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495	thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496	/ Mark arrival to parent: After performing this write, a worker thread may*
1497	not assume that the team is valid any more - it could be deallocated by
1498	the primary thread at any time. /*
1499	if (thr_bar->my_level \|\| __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME \|\|
1500	!thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501	// flag; release it
1502	kmp_flag_64<> flag(&thr_bar->b_arrived,
1503	other_threads[thr_bar->parent_tid]);
1504	flag.release();
1505	} else {
1506	// Leaf does special release on "offset" bits of parent's b_arrived flag
1507	thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508	kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509	thr_bar->offset + `1`);
1510	flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511	flag.release();
1512	}
1513	} else { // Primary thread needs to update the team's b_arrived value
1514	team->t.t_bar[bt].b_arrived = new_state;
1515	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516	"arrived(%p) = %llu\n",
1517	gtid, team->t.t_id, tid, team->t.t_id,
1518	&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519	}
1520	// Is the team access below unsafe or just technically invalid?
1521	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522	"barrier type %d\n",
1523	gtid, team->t.t_id, tid, bt));
1524	}
1525
1526	static void __kmp_hierarchical_barrier_release(
1527	enum barrier_type bt, kmp_info_t this_thr, int* gtid, int tid,
1528	int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1529	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1530	kmp_team_t *team;
1531	kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1532	kmp_uint32 nproc;
1533	bool team_change = false; // indicates on-core barrier shouldn't be used
1534
1535	if (KMP_MASTER_TID(tid)) {
1536	team = __kmp_threads[gtid]->th.th_team;
1537	KMP_DEBUG_ASSERT(team != NULL);
1538	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1539	"entered barrier type %d\n",
1540	gtid, team->t.t_id, tid, bt));
1541	} else { // Worker threads
1542	// Wait for parent thread to release me
1543	if (!thr_bar->use_oncore_barrier \|\|
1544	__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME \|\| thr_bar->my_level != `0` \|\|
1545	thr_bar->team == NULL) {
1546	// Use traditional method of waiting on my own b_go flag
1547	thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1548	kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1549	flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1550	TCW_8(thr_bar->b_go,
1551	KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1552	} else { // Thread barrier data is initialized, this is a leaf, blocktime is
1553	// infinite, not nested
1554	// Wait on my "offset" bits on parent's b_go flag
1555	thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1556	kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1557	thr_bar->offset + `1`, bt,
1558	this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1559	flag.wait(this_thr, TRUE);
1560	if (thr_bar->wait_flag ==
1561	KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1562	TCW_8(thr_bar->b_go,
1563	KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1564	} else { // Reset my bits on parent's b_go flag
1565	(RCAST(volatile char *,
1566	&(thr_bar->parent_bar->b_go)))[thr_bar->offset + `1`] = `0`;
1567	}
1568	}
1569	thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1570	// Early exit for reaping threads releasing forkjoin barrier
1571	if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1572	return;
1573	// The worker thread may now assume that the team is valid.
1574	team = __kmp_threads[gtid]->th.th_team;
1575	KMP_DEBUG_ASSERT(team != NULL);
1576	tid = __kmp_tid_from_gtid(gtid);
1577
1578	KA_TRACE(
1579	`20`,
1580	("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1581	gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1582	KMP_MB(); // Flush all pending memory write invalidates.
1583	}
1584
1585	nproc = this_thr->th.th_team_nproc;
1586	int level = team->t.t_level;
1587	if (team->t.t_threads[`0`]
1588	->th.th_teams_microtask) { // are we inside the teams construct?
1589	if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1590	this_thr->th.th_teams_level == level)
1591	++level; // level was not increased in teams construct for team_of_workers
1592	if (this_thr->th.th_teams_size.nteams > `1`)
1593	++level; // level was not increased in teams construct for team_of_masters
1594	}
1595	if (level == `1`)
1596	thr_bar->use_oncore_barrier = `1`;
1597	else
1598	thr_bar->use_oncore_barrier = `0`; // Do not use oncore barrier when nested
1599
1600	// If the team size has increased, we still communicate with old leaves via
1601	// oncore barrier.
1602	unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1603	kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1604	team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1605	tid, team);
1606	// But if the entire team changes, we won't use oncore barrier at all
1607	if (team_change)
1608	old_leaf_kids = `0`;
1609
1610	#if KMP_BARRIER_ICV_PUSH
1611	if (propagate_icvs) {
1612	__kmp_init_implicit_task(loc_ref: team->t.t_ident, this_thr: team->t.t_threads[tid], team, tid,
1613	FALSE);
1614	if (KMP_MASTER_TID(
1615	tid)) { // primary already has copy in final destination; copy
1616	copy_icvs(dst: &thr_bar->th_fixed_icvs,
1617	src: &team->t.t_implicit_task_taskdata[tid].td_icvs);
1618	} else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1619	thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1620	if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1621	// leaves (on-core children) pull parent's fixed ICVs directly to local
1622	// ICV store
1623	copy_icvs(dst: &team->t.t_implicit_task_taskdata[tid].td_icvs,
1624	src: &thr_bar->parent_bar->th_fixed_icvs);
1625	// non-leaves will get ICVs piggybacked with b_go via NGO store
1626	} else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1627	if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1628	// access
1629	copy_icvs(dst: &thr_bar->th_fixed_icvs, src: &thr_bar->parent_bar->th_fixed_icvs);
1630	else // leaves copy parent's fixed ICVs directly to local ICV store
1631	copy_icvs(dst: &team->t.t_implicit_task_taskdata[tid].td_icvs,
1632	src: &thr_bar->parent_bar->th_fixed_icvs);
1633	}
1634	}
1635	#endif // KMP_BARRIER_ICV_PUSH
1636
1637	// Now, release my children
1638	if (thr_bar->my_level) { // not a leaf
1639	kmp_int32 child_tid;
1640	kmp_uint32 last;
1641	if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1642	thr_bar->use_oncore_barrier) {
1643	if (KMP_MASTER_TID(tid)) { // do a flat release
1644	// Set local b_go to bump children via NGO store of the cache line
1645	// containing IVCs and b_go.
1646	thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1647	// Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1648	// the cache line
1649	ngo_load(&thr_bar->th_fixed_icvs);
1650	// This loops over all the threads skipping only the leaf nodes in the
1651	// hierarchy
1652	for (child_tid = thr_bar->skip_per_level[`1`]; child_tid < (int)nproc;
1653	child_tid += thr_bar->skip_per_level[`1`]) {
1654	kmp_bstate_t *child_bar =
1655	&team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1656	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1657	"releasing T#%d(%d:%d)"
1658	" go(%p): %u => %u\n",
1659	gtid, team->t.t_id, tid,
1660	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1661	child_tid, &child_bar->b_go, child_bar->b_go,
1662	child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1663	// Use ngo store (if available) to both store ICVs and release child
1664	// via child's b_go
1665	ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1666	}
1667	ngo_sync();
1668	}
1669	TCW_8(thr_bar->b_go,
1670	KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1671	// Now, release leaf children
1672	if (thr_bar->leaf_kids) { // if there are any
1673	// We test team_change on the off-chance that the level 1 team changed.
1674	if (team_change \|\|
1675	old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1676	if (old_leaf_kids) { // release old leaf kids
1677	thr_bar->b_go \|= old_leaf_state;
1678	}
1679	// Release new leaf kids
1680	last = tid + thr_bar->skip_per_level[`1`];
1681	if (last > nproc)
1682	last = nproc;
1683	for (child_tid = tid + `1` + old_leaf_kids; child_tid < (int)last;
1684	++child_tid) { // skip_per_level[0]=1
1685	kmp_info_t *child_thr = team->t.t_threads[child_tid];
1686	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1687	KA_TRACE(
1688	`20`,
1689	("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1690	" T#%d(%d:%d) go(%p): %u => %u\n",
1691	gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1692	team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1693	child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1694	// Release child using child's b_go flag
1695	kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1696	flag.release();
1697	}
1698	} else { // Release all children at once with leaf_state bits on my own
1699	// b_go flag
1700	thr_bar->b_go \|= thr_bar->leaf_state;
1701	}
1702	}
1703	} else { // Blocktime is not infinite; do a simple hierarchical release
1704	for (int d = thr_bar->my_level - `1`; d >= `0`;
1705	--d) { // Release highest level threads first
1706	last = tid + thr_bar->skip_per_level[d + `1`];
1707	kmp_uint32 skip = thr_bar->skip_per_level[d];
1708	if (last > nproc)
1709	last = nproc;
1710	for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1711	kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712	kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1714	"releasing T#%d(%d:%d) go(%p): %u => %u\n",
1715	gtid, team->t.t_id, tid,
1716	__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1717	child_tid, &child_bar->b_go, child_bar->b_go,
1718	child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1719	// Release child using child's b_go flag
1720	kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1721	flag.release();
1722	}
1723	}
1724	}
1725	#if KMP_BARRIER_ICV_PUSH
1726	if (propagate_icvs && !KMP_MASTER_TID(tid))
1727	// non-leaves copy ICVs from fixed ICVs to local dest
1728	copy_icvs(dst: &team->t.t_implicit_task_taskdata[tid].td_icvs,
1729	src: &thr_bar->th_fixed_icvs);
1730	#endif // KMP_BARRIER_ICV_PUSH
1731	}
1732	KA_TRACE(`20`, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1733	"barrier type %d\n",
1734	gtid, team->t.t_id, tid, bt));
1735	}
1736
1737	// End of Barrier Algorithms
1738
// Type traits for the `cancellable` template parameter of the barrier code.
//  - is_cancellable<true> wraps an ordinary run-time boolean: it starts out
//    false, can be assigned from bool and converts back to bool.
//  - is_cancellable<false> holds no state: assignments are ignored and the
//    conversion to bool is a compile-time constant `false`, so tests on it in
//    the non-cancellable instantiation can be folded away by the compiler.
template <bool cancellable> struct is_cancellable {};

// Cancellable flavour: a real flag.
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  // Intentionally implicit so a bool result can initialize the flag directly.
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};

// Non-cancellable flavour: always false, nothing is stored.
template <> struct is_cancellable<false> {
  // The assigned value is deliberately dropped (parameter left unnamed to
  // avoid an unused-parameter warning).
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1757
1758	// Internal function to do a barrier.
1759	/ If is_split is true, do a split barrier, otherwise, do a plain barrier*
1760	If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1761	barrier
1762	When cancellable = false,
1763	Returns 0 if primary thread, 1 if worker thread.
1764	When cancellable = true
1765	Returns 0 if not cancelled, 1 if cancelled. /*
1766	template <bool cancellable = false>
1767	static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1768	size_t reduce_size, void *reduce_data,
1769	void (reduce)(void* , void* *)) {
1770	KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1771	KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1772	int tid = __kmp_tid_from_gtid(gtid);
1773	kmp_info_t *this_thr = __kmp_threads[gtid];
1774	kmp_team_t *team = this_thr->th.th_team;
1775	int status = `0`;
1776	is_cancellable<cancellable> cancelled;
1777	#if OMPT_SUPPORT && OMPT_OPTIONAL
1778	ompt_data_t *my_task_data;
1779	ompt_data_t *my_parallel_data;
1780	void *return_address;
1781	ompt_sync_region_t barrier_kind;
1782	#endif
1783
1784	KA_TRACE(`15`, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1785	__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1786
1787	#if OMPT_SUPPORT
1788	if (ompt_enabled.enabled) {
1789	#if OMPT_OPTIONAL
1790	my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1791	my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1792	return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1793	barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1794	if (ompt_enabled.ompt_callback_sync_region) {
1795	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1796	barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1797	return_address);
1798	}
1799	if (ompt_enabled.ompt_callback_sync_region_wait) {
1800	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1801	barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1802	return_address);
1803	}
1804	#endif
1805	// It is OK to report the barrier state after the barrier begin callback.
1806	// According to the OMPT specification, a compliant implementation may
1807	// even delay reporting this state until the barrier begins to wait.
1808	this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1809	}
1810	#endif
1811
1812	if (!team->t.t_serialized) {
1813	#if USE_ITT_BUILD
1814	// This value will be used in itt notify events below.
1815	void *itt_sync_obj = NULL;
1816	#if USE_ITT_NOTIFY
1817	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
1818	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, set_name: `1`);
1819	#endif
1820	#endif /* USE_ITT_BUILD */
1821	if (__kmp_tasking_mode == tskm_extra_barrier) {
1822	__kmp_tasking_barrier(team, thread: this_thr, gtid);
1823	KA_TRACE(`15`,
1824	("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1825	__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1826	}
1827
1828	/ Copy the blocktime info to the thread, where __kmp_wait_template() can*
1829	access it when the team struct is not guaranteed to exist. /*
1830	// See note about the corresponding code in __kmp_join_barrier() being
1831	// performance-critical.
1832	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1833	#if KMP_USE_MONITOR
1834	this_thr->th.th_team_bt_intervals =
1835	team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1836	this_thr->th.th_team_bt_set =
1837	team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1838	#else
1839	this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1840	#endif
1841	}
1842
1843	#if USE_ITT_BUILD
1844	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
1845	__kmp_itt_barrier_starting(gtid, object: itt_sync_obj);
1846	#endif /* USE_ITT_BUILD */
1847	#if USE_DEBUGGER
1848	// Let the debugger know: the thread arrived to the barrier and waiting.
1849	if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1850	team->t.t_bar[bt].b_master_arrived += `1`;
1851	} else {
1852	this_thr->th.th_bar[bt].bb.b_worker_arrived += `1`;
1853	} // if
1854	#endif /* USE_DEBUGGER */
1855	if (reduce != NULL) {
1856	// KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1857	this_thr->th.th_local.reduce_data = reduce_data;
1858	}
1859
1860	if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1861	// use 0 to only setup the current team if nthreads > 1
1862	__kmp_task_team_setup(this_thr, team, always: `0`);
1863
1864	if (cancellable) {
1865	cancelled = __kmp_linear_barrier_gather_cancellable(
1866	bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1867	} else {
1868	switch (__kmp_barrier_gather_pattern[bt]) {
1869	case bp_dist_bar: {
1870	__kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1871	reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1872	break;
1873	}
1874	case bp_hyper_bar: {
1875	// don't set branch bits to 0; use linear
1876	KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1877	__kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1878	reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1879	break;
1880	}
1881	case bp_hierarchical_bar: {
1882	__kmp_hierarchical_barrier_gather(
1883	bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1884	break;
1885	}
1886	case bp_tree_bar: {
1887	// don't set branch bits to 0; use linear
1888	KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1889	__kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1890	reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1891	break;
1892	}
1893	default: {
1894	__kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1895	reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1896	}
1897	}
1898	}
1899
1900	KMP_MB();
1901
1902	if (KMP_MASTER_TID(tid)) {
1903	status = `0`;
1904	if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1905	__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1906	}
1907	#if USE_DEBUGGER
1908	// Let the debugger know: All threads are arrived and starting leaving the
1909	// barrier.
1910	team->t.t_bar[bt].b_team_arrived += `1`;
1911	#endif
1912
1913	if (__kmp_omp_cancellation) {
1914	kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1915	// Reset cancellation flag for worksharing constructs
1916	if (cancel_request == cancel_loop \|\|
1917	cancel_request == cancel_sections) {
1918	KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1919	}
1920	}
1921	#if USE_ITT_BUILD
1922	/ TODO: In case of split reduction barrier, primary thread may send*
1923	acquired event early, before the final summation into the shared
1924	variable is done (final summation can be a long operation for array
1925	reductions). /*
1926	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
1927	__kmp_itt_barrier_middle(gtid, object: itt_sync_obj);
1928	#endif /* USE_ITT_BUILD */
1929	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1930	// Barrier - report frame end (only if active_level == 1)
1931	if ((__itt_frame_submit_v3_ptr \|\| KMP_ITT_DEBUG) &&
1932	__kmp_forkjoin_frames_mode &&
1933	(this_thr->th.th_teams_microtask == NULL \|\| // either not in teams
1934	this_thr->th.th_teams_size.nteams == `1`) && // or inside single team
1935	team->t.t_active_level == `1`) {
1936	ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1937	kmp_uint64 cur_time = __itt_get_timestamp();
1938	kmp_info_t **other_threads = team->t.t_threads;
1939	int nproc = this_thr->th.th_team_nproc;
1940	int i;
1941	switch (__kmp_forkjoin_frames_mode) {
1942	case `1`:
1943	__kmp_itt_frame_submit(gtid, begin: this_thr->th.th_frame_time, end: cur_time, imbalance: `0`,
1944	loc, team_size: nproc);
1945	this_thr->th.th_frame_time = cur_time;
1946	break;
1947	case `2`: // AC 2015-01-19: currently does not work for hierarchical (to
1948	// be fixed)
1949	__kmp_itt_frame_submit(gtid, begin: this_thr->th.th_bar_min_time, end: cur_time,
1950	imbalance: `1`, loc, team_size: nproc);
1951	break;
1952	case `3`:
1953	if (__itt_metadata_add_ptr) {
1954	// Initialize with primary thread's wait time
1955	kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1956	// Set arrive time to zero to be able to check it in
1957	// __kmp_invoke_task(); the same is done inside the loop below
1958	this_thr->th.th_bar_arrive_time = `0`;
1959	for (i = `1`; i < nproc; ++i) {
1960	delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1961	other_threads[i]->th.th_bar_arrive_time = `0`;
1962	}
1963	__kmp_itt_metadata_imbalance(gtid, begin: this_thr->th.th_frame_time,
1964	end: cur_time, imbalance: delta,
1965	reduction: (kmp_uint64)(reduce != NULL));
1966	}
1967	__kmp_itt_frame_submit(gtid, begin: this_thr->th.th_frame_time, end: cur_time, imbalance: `0`,
1968	loc, team_size: nproc);
1969	this_thr->th.th_frame_time = cur_time;
1970	break;
1971	}
1972	}
1973	#endif /* USE_ITT_BUILD */
1974	} else {
1975	status = `1`;
1976	#if USE_ITT_BUILD
1977	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
1978	__kmp_itt_barrier_middle(gtid, object: itt_sync_obj);
1979	#endif /* USE_ITT_BUILD */
1980	}
1981	if ((status == `1` \|\| !is_split) && !cancelled) {
1982	if (cancellable) {
1983	cancelled = __kmp_linear_barrier_release_cancellable(
1984	bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1985	} else {
1986	switch (__kmp_barrier_release_pattern[bt]) {
1987	case bp_dist_bar: {
1988	KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1989	__kmp_dist_barrier_release(bt, this_thr, gtid, tid,
1990	FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1991	break;
1992	}
1993	case bp_hyper_bar: {
1994	KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1995	__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1996	FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1997	break;
1998	}
1999	case bp_hierarchical_bar: {
2000	__kmp_hierarchical_barrier_release(
2001	bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2002	break;
2003	}
2004	case bp_tree_bar: {
2005	KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2006	__kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2007	FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2008	break;
2009	}
2010	default: {
2011	__kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2012	FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2013	}
2014	}
2015	}
2016	if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2017	__kmp_task_team_sync(this_thr, team);
2018	}
2019	}
2020
2021	#if USE_ITT_BUILD
2022	/ GEH: TODO: Move this under if-condition above and also include in*
2023	__kmp_end_split_barrier(). This will more accurately represent the actual
2024	release time of the threads for split barriers. /*
2025	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
2026	__kmp_itt_barrier_finished(gtid, object: itt_sync_obj);
2027	#endif /* USE_ITT_BUILD */
2028	} else { // Team is serialized.
2029	status = `0`;
2030	if (__kmp_tasking_mode != tskm_immediate_exec) {
2031	if (this_thr->th.th_task_team != NULL) {
2032	#if USE_ITT_NOTIFY
2033	void *itt_sync_obj = NULL;
2034	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG) {
2035	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, set_name: `1`);
2036	__kmp_itt_barrier_starting(gtid, object: itt_sync_obj);
2037	}
2038	#endif
2039
2040	KMP_DEBUG_ASSERT(
2041	this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE \|\|
2042	this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2043	TRUE);
2044	__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2045	__kmp_task_team_setup(this_thr, team, always: `0`);
2046
2047	#if USE_ITT_BUILD
2048	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
2049	__kmp_itt_barrier_finished(gtid, object: itt_sync_obj);
2050	#endif /* USE_ITT_BUILD */
2051	}
2052	}
2053	}
2054	KA_TRACE(`15`, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2055	gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2056	__kmp_tid_from_gtid(gtid), status));
2057
2058	#if OMPT_SUPPORT
2059	if (ompt_enabled.enabled) {
2060	#if OMPT_OPTIONAL
2061	if (ompt_enabled.ompt_callback_sync_region_wait) {
2062	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2063	barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2064	return_address);
2065	}
2066	if (ompt_enabled.ompt_callback_sync_region) {
2067	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2068	barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2069	return_address);
2070	}
2071	#endif
2072	this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2073	}
2074	#endif
2075
2076	if (cancellable)
2077	return (int)cancelled;
2078	return status;
2079	}
2080
2081	// Returns 0 if primary thread, 1 if worker thread.
2082	int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2083	size_t reduce_size, void *reduce_data,
2084	void (reduce)(void* , void* *)) {
2085	return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2086	reduce);
2087	}
2088
#if defined(KMP_GOMP_COMPAT)
// Plain barrier used by the GOMP compatibility layer.
// When __kmp_omp_cancellation is set, runs the cancellable barrier and, if it
// was cancelled, rolls back the bump of a worker's private b_arrived flag.
// Otherwise runs an ordinary, non-split plain barrier.
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      // Primary thread does not need to revert anything
      if (!KMP_MASTER_TID(tid)) {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif
2112
2113	void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2114	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2115	KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2116	KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2117	int tid = __kmp_tid_from_gtid(gtid);
2118	kmp_info_t *this_thr = __kmp_threads[gtid];
2119	kmp_team_t *team = this_thr->th.th_team;
2120
2121	if (!team->t.t_serialized) {
2122	if (KMP_MASTER_GTID(gtid)) {
2123	switch (__kmp_barrier_release_pattern[bt]) {
2124	case bp_dist_bar: {
2125	__kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2126	FALSE USE_ITT_BUILD_ARG(NULL));
2127	break;
2128	}
2129	case bp_hyper_bar: {
2130	KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2131	__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2132	FALSE USE_ITT_BUILD_ARG(NULL));
2133	break;
2134	}
2135	case bp_hierarchical_bar: {
2136	__kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2137	FALSE USE_ITT_BUILD_ARG(NULL));
2138	break;
2139	}
2140	case bp_tree_bar: {
2141	KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2142	__kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2143	FALSE USE_ITT_BUILD_ARG(NULL));
2144	break;
2145	}
2146	default: {
2147	__kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2148	FALSE USE_ITT_BUILD_ARG(NULL));
2149	}
2150	}
2151	if (__kmp_tasking_mode != tskm_immediate_exec) {
2152	__kmp_task_team_sync(this_thr, team);
2153	} // if
2154	}
2155	}
2156	}
2157
2158	void __kmp_join_barrier(int gtid) {
2159	KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2160	KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2161
2162	KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2163
2164	kmp_info_t *this_thr = __kmp_threads[gtid];
2165	kmp_team_t *team;
2166	int tid;
2167	#ifdef KMP_DEBUG
2168	int team_id;
2169	#endif /* KMP_DEBUG */
2170	#if USE_ITT_BUILD
2171	void *itt_sync_obj = NULL;
2172	#if USE_ITT_NOTIFY
2173	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG) // Don't call routine without need
2174	// Get object created at fork_barrier
2175	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
2176	#endif
2177	#endif /* USE_ITT_BUILD */
2178	#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) \|\| defined KMP_DEBUG)
2179	int nproc = this_thr->th.th_team_nproc;
2180	#endif
2181	KMP_MB();
2182
2183	// Get current info
2184	team = this_thr->th.th_team;
2185	KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2186	tid = __kmp_tid_from_gtid(gtid);
2187	#ifdef KMP_DEBUG
2188	team_id = team->t.t_id;
2189	kmp_info_t *master_thread = this_thr->th.th_team_master;
2190	if (master_thread != team->t.t_threads[`0`]) {
2191	__kmp_print_structure();
2192	}
2193	#endif /* KMP_DEBUG */
2194	KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[`0`]);
2195	KMP_MB();
2196
2197	// Verify state
2198	KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2199	KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2200	KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2201	KA_TRACE(`10`, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2202	gtid, team_id, tid));
2203
2204	#if OMPT_SUPPORT
2205	if (ompt_enabled.enabled) {
2206	#if OMPT_OPTIONAL
2207	ompt_data_t *my_task_data;
2208	ompt_data_t *my_parallel_data;
2209	void *codeptr = NULL;
2210	int ds_tid = this_thr->th.th_info.ds.ds_tid;
2211	if (KMP_MASTER_TID(ds_tid) &&
2212	(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) \|\|
2213	ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2214	codeptr = team->t.ompt_team_info.master_return_address;
2215	my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2216	my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2217	if (ompt_enabled.ompt_callback_sync_region) {
2218	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2219	ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2220	my_task_data, codeptr);
2221	}
2222	if (ompt_enabled.ompt_callback_sync_region_wait) {
2223	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2224	ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2225	my_task_data, codeptr);
2226	}
2227	if (!KMP_MASTER_TID(ds_tid))
2228	this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2229	#endif
2230	this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
2231	}
2232	#endif
2233
2234	if (__kmp_tasking_mode == tskm_extra_barrier) {
2235	__kmp_tasking_barrier(team, thread: this_thr, gtid);
2236	KA_TRACE(`10`, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2237	gtid, team_id, tid));
2238	}
2239	#ifdef KMP_DEBUG
2240	if (__kmp_tasking_mode != tskm_immediate_exec) {
2241	KA_TRACE(`20`, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2242	"%p, th_task_team = %p\n",
2243	__kmp_gtid_from_thread(this_thr), team_id,
2244	team->t.t_task_team[this_thr->th.th_task_state],
2245	this_thr->th.th_task_team));
2246	if (this_thr->th.th_task_team)
2247	KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
2248	team->t.t_task_team[this_thr->th.th_task_state]);
2249	}
2250	#endif /* KMP_DEBUG */
2251
2252	/ Copy the blocktime info to the thread, where __kmp_wait_template() can*
2253	access it when the team struct is not guaranteed to exist. Doing these
2254	loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
2255	we do not perform the copy if blocktime=infinite, since the values are not
2256	used by __kmp_wait_template() in that case. /*
2257	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2258	#if KMP_USE_MONITOR
2259	this_thr->th.th_team_bt_intervals =
2260	team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2261	this_thr->th.th_team_bt_set =
2262	team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2263	#else
2264	this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2265	#endif
2266	}
2267
2268	#if USE_ITT_BUILD
2269	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
2270	__kmp_itt_barrier_starting(gtid, object: itt_sync_obj);
2271	#endif /* USE_ITT_BUILD */
2272
2273	switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2274	case bp_dist_bar: {
2275	__kmp_dist_barrier_gather(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2276	NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2277	break;
2278	}
2279	case bp_hyper_bar: {
2280	KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2281	__kmp_hyper_barrier_gather(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2282	NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2283	break;
2284	}
2285	case bp_hierarchical_bar: {
2286	__kmp_hierarchical_barrier_gather(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2287	NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2288	break;
2289	}
2290	case bp_tree_bar: {
2291	KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2292	__kmp_tree_barrier_gather(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2293	NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2294	break;
2295	}
2296	default: {
2297	__kmp_linear_barrier_gather(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2298	NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2299	}
2300	}
2301
2302	/ From this point on, the team data structure may be deallocated at any time*
2303	by the primary thread - it is unsafe to reference it in any of the worker
2304	threads. Any per-team data items that need to be referenced before the
2305	end of the barrier should be moved to the kmp_task_team_t structs. /*
2306	if (KMP_MASTER_TID(tid)) {
2307	if (__kmp_tasking_mode != tskm_immediate_exec) {
2308	__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2309	}
2310	if (__kmp_display_affinity) {
2311	KMP_CHECK_UPDATE(team->t.t_display_affinity, `0`);
2312	}
2313	#if KMP_STATS_ENABLED
2314	// Have primary thread flag the workers to indicate they are now waiting for
2315	// next parallel region, Also wake them up so they switch their timers to
2316	// idle.
2317	for (int i = `0`; i < team->t.t_nproc; ++i) {
2318	kmp_info_t *team_thread = team->t.t_threads[i];
2319	if (team_thread == this_thr)
2320	continue;
2321	team_thread->th.th_stats->setIdleFlag();
2322	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2323	team_thread->th.th_sleep_loc != NULL)
2324	__kmp_null_resume_wrapper(team_thread);
2325	}
2326	#endif
2327	#if USE_ITT_BUILD
2328	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
2329	__kmp_itt_barrier_middle(gtid, object: itt_sync_obj);
2330	#endif /* USE_ITT_BUILD */
2331
2332	#if USE_ITT_BUILD && USE_ITT_NOTIFY
2333	// Join barrier - report frame end
2334	if ((__itt_frame_submit_v3_ptr \|\| KMP_ITT_DEBUG) &&
2335	__kmp_forkjoin_frames_mode &&
2336	(this_thr->th.th_teams_microtask == NULL \|\| // either not in teams
2337	this_thr->th.th_teams_size.nteams == `1`) && // or inside single team
2338	team->t.t_active_level == `1`) {
2339	kmp_uint64 cur_time = __itt_get_timestamp();
2340	ident_t *loc = team->t.t_ident;
2341	kmp_info_t **other_threads = team->t.t_threads;
2342	switch (__kmp_forkjoin_frames_mode) {
2343	case `1`:
2344	__kmp_itt_frame_submit(gtid, begin: this_thr->th.th_frame_time, end: cur_time, imbalance: `0`,
2345	loc, team_size: nproc);
2346	break;
2347	case `2`:
2348	__kmp_itt_frame_submit(gtid, begin: this_thr->th.th_bar_min_time, end: cur_time, imbalance: `1`,
2349	loc, team_size: nproc);
2350	break;
2351	case `3`:
2352	if (__itt_metadata_add_ptr) {
2353	// Initialize with primary thread's wait time
2354	kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2355	// Set arrive time to zero to be able to check it in
2356	// __kmp_invoke_task(); the same is done inside the loop below
2357	this_thr->th.th_bar_arrive_time = `0`;
2358	for (int i = `1`; i < nproc; ++i) {
2359	delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2360	other_threads[i]->th.th_bar_arrive_time = `0`;
2361	}
2362	__kmp_itt_metadata_imbalance(gtid, begin: this_thr->th.th_frame_time,
2363	end: cur_time, imbalance: delta, reduction: `0`);
2364	}
2365	__kmp_itt_frame_submit(gtid, begin: this_thr->th.th_frame_time, end: cur_time, imbalance: `0`,
2366	loc, team_size: nproc);
2367	this_thr->th.th_frame_time = cur_time;
2368	break;
2369	}
2370	}
2371	#endif /* USE_ITT_BUILD */
2372	}
2373	#if USE_ITT_BUILD
2374	else {
2375	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG)
2376	__kmp_itt_barrier_middle(gtid, object: itt_sync_obj);
2377	}
2378	#endif /* USE_ITT_BUILD */
2379
2380	#if KMP_DEBUG
2381	if (KMP_MASTER_TID(tid)) {
2382	KA_TRACE(
2383	`15`,
2384	("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2385	gtid, team_id, tid, nproc));
2386	}
2387	#endif /* KMP_DEBUG */
2388
2389	// TODO now, mark worker threads as done so they may be disbanded
2390	KMP_MB(); // Flush all pending memory write invalidates.
2391	KA_TRACE(`10`,
2392	("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2393
2394	}
2395
2396	// TODO release worker threads' fork barriers as we are ready instead of all at
2397	// once
2398	void __kmp_fork_barrier(int gtid, int tid) {
2399	KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2400	KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2401	kmp_info_t *this_thr = __kmp_threads[gtid];
2402	kmp_team_t *team = (tid == `0`) ? this_thr->th.th_team : NULL;
2403	#if USE_ITT_BUILD
2404	void *itt_sync_obj = NULL;
2405	#endif /* USE_ITT_BUILD */
2406	#ifdef KMP_DEBUG
2407	if (team)
2408	KA_TRACE(`10`, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2409	(team != NULL) ? team->t.t_id : -`1`, tid));
2410	#endif
2411	// th_team pointer only valid for primary thread here
2412	if (KMP_MASTER_TID(tid)) {
2413	#if USE_ITT_BUILD && USE_ITT_NOTIFY
2414	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG) {
2415	// Create itt barrier object
2416	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier, set_name: `1`);
2417	__kmp_itt_barrier_middle(gtid, object: itt_sync_obj); // Call acquired/releasing
2418	}
2419	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2420
2421	#ifdef KMP_DEBUG
2422	KMP_DEBUG_ASSERT(team);
2423	kmp_info_t **other_threads = team->t.t_threads;
2424	int i;
2425
2426	// Verify state
2427	KMP_MB();
2428
2429	for (i = `1`; i < team->t.t_nproc; ++i) {
2430	KA_TRACE(`500`,
2431	("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2432	"== %u.\n",
2433	gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2434	team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2435	other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2436	KMP_DEBUG_ASSERT(
2437	(TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2438	~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2439	KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2440	}
2441	#endif
2442
2443	if (__kmp_tasking_mode != tskm_immediate_exec) {
2444	// 0 indicates setup current task team if nthreads > 1
2445	__kmp_task_team_setup(this_thr, team, always: `0`);
2446	}
2447
2448	/ The primary thread may have changed its blocktime between join barrier*
2449	and fork barrier. Copy the blocktime info to the thread, where
2450	__kmp_wait_template() can access it when the team struct is not
2451	guaranteed to exist. /*
2452	// See note about the corresponding code in __kmp_join_barrier() being
2453	// performance-critical
2454	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2455	#if KMP_USE_MONITOR
2456	this_thr->th.th_team_bt_intervals =
2457	team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2458	this_thr->th.th_team_bt_set =
2459	team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2460	#else
2461	this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2462	#endif
2463	}
2464	} // primary thread
2465
2466	switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2467	case bp_dist_bar: {
2468	__kmp_dist_barrier_release(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2469	TRUE USE_ITT_BUILD_ARG(NULL));
2470	break;
2471	}
2472	case bp_hyper_bar: {
2473	KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2474	__kmp_hyper_barrier_release(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2475	TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2476	break;
2477	}
2478	case bp_hierarchical_bar: {
2479	__kmp_hierarchical_barrier_release(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2480	TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2481	break;
2482	}
2483	case bp_tree_bar: {
2484	KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2485	__kmp_tree_barrier_release(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2486	TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2487	break;
2488	}
2489	default: {
2490	__kmp_linear_barrier_release(bt: bs_forkjoin_barrier, this_thr, gtid, tid,
2491	TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2492	}
2493	}
2494
2495	#if OMPT_SUPPORT
2496	if (ompt_enabled.enabled &&
2497	this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2498	int ds_tid = this_thr->th.th_info.ds.ds_tid;
2499	ompt_data_t *task_data = (team)
2500	? OMPT_CUR_TASK_DATA(this_thr)
2501	: &(this_thr->th.ompt_thread_info.task_data);
2502	this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2503	#if OMPT_OPTIONAL
2504	void *codeptr = NULL;
2505	if (KMP_MASTER_TID(ds_tid) &&
2506	(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) \|\|
2507	ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2508	codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2509	if (ompt_enabled.ompt_callback_sync_region_wait) {
2510	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2511	ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2512	codeptr);
2513	}
2514	if (ompt_enabled.ompt_callback_sync_region) {
2515	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2516	ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2517	codeptr);
2518	}
2519	#endif
2520	if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2521	ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2522	ompt_scope_end, NULL, task_data, `0`, ds_tid,
2523	ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2524	}
2525	}
2526	#endif
2527
2528	// Early exit for reaping threads releasing forkjoin barrier
2529	if (TCR_4(__kmp_global.g.g_done)) {
2530	this_thr->th.th_task_team = NULL;
2531
2532	#if USE_ITT_BUILD && USE_ITT_NOTIFY
2533	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG) {
2534	if (!KMP_MASTER_TID(tid)) {
2535	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
2536	if (itt_sync_obj)
2537	__kmp_itt_barrier_finished(gtid, object: itt_sync_obj);
2538	}
2539	}
2540	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2541	KA_TRACE(`10`, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2542	return;
2543	}
2544
2545	/ We can now assume that a valid team structure has been allocated by the*
2546	primary thread and propagated to all worker threads. The current thread,
2547	however, may not be part of the team, so we can't blindly assume that the
2548	team pointer is non-null. /*
2549	team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2550	KMP_DEBUG_ASSERT(team != NULL);
2551	tid = __kmp_tid_from_gtid(gtid);
2552
2553	#if KMP_BARRIER_ICV_PULL
2554	/ Primary thread's copy of the ICVs was set up on the implicit taskdata in*
2555	__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2556	implicit task has this data before this function is called. We cannot
2557	modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2558	thread struct, because it is not always the case that the threads arrays
2559	have been allocated when __kmp_fork_call() is executed. /*
2560	{
2561	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2562	if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2563	// Copy the initial ICVs from the primary thread's thread struct to the
2564	// implicit task for this tid.
2565	KA_TRACE(`10`,
2566	("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2567	__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2568	tid, FALSE);
2569	copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2570	&team->t.t_threads[`0`]
2571	->th.th_bar[bs_forkjoin_barrier]
2572	.bb.th_fixed_icvs);
2573	}
2574	}
2575	#endif // KMP_BARRIER_ICV_PULL
2576
2577	if (__kmp_tasking_mode != tskm_immediate_exec) {
2578	__kmp_task_team_sync(this_thr, team);
2579	}
2580
2581	#if KMP_AFFINITY_SUPPORTED
2582	kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2583	if (proc_bind == proc_bind_intel) {
2584	// Call dynamic affinity settings
2585	if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2586	__kmp_balanced_affinity(th: this_thr, team_size: team->t.t_nproc);
2587	}
2588	} else if (proc_bind != proc_bind_false) {
2589	if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2590	KA_TRACE(`100`, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2591	__kmp_gtid_from_thread(this_thr),
2592	this_thr->th.th_current_place));
2593	} else {
2594	__kmp_affinity_bind_place(gtid);
2595	}
2596	}
2597	#endif // KMP_AFFINITY_SUPPORTED
2598	// Perform the display affinity functionality
2599	if (__kmp_display_affinity) {
2600	if (team->t.t_display_affinity
2601	#if KMP_AFFINITY_SUPPORTED
2602	\|\| (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2603	#endif
2604	) {
2605	// NULL means use the affinity-format-var ICV
2606	__kmp_aux_display_affinity(gtid, NULL);
2607	this_thr->th.th_prev_num_threads = team->t.t_nproc;
2608	this_thr->th.th_prev_level = team->t.t_level;
2609	}
2610	}
2611	if (!KMP_MASTER_TID(tid))
2612	KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2613
2614	#if USE_ITT_BUILD && USE_ITT_NOTIFY
2615	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG) {
2616	if (!KMP_MASTER_TID(tid)) {
2617	// Get correct barrier object
2618	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
2619	__kmp_itt_barrier_finished(gtid, object: itt_sync_obj); // Workers call acquired
2620	} // (prepare called inside barrier_release)
2621	}
2622	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2623	KA_TRACE(`10`, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2624	team->t.t_id, tid));
2625	}
2626
2627	void __kmp_setup_icv_copy(kmp_team_t team, int* new_nproc,
2628	kmp_internal_control_t new_icvs, ident_t loc) {
2629	KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2630
2631	KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2632	KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) \|\| new_icvs->nproc);
2633
2634	/ Primary thread's copy of the ICVs was set up on the implicit taskdata in*
2635	__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2636	implicit task has this data before this function is called. /*
2637	#if KMP_BARRIER_ICV_PULL
2638	/ Copy ICVs to primary thread's thread structure into th_fixed_icvs (which*
2639	remains untouched), where all of the worker threads can access them and
2640	make their own copies after the barrier. /*
2641	KMP_DEBUG_ASSERT(team->t.t_threads[`0`]); // The threads arrays should be
2642	// allocated at this point
2643	copy_icvs(
2644	&team->t.t_threads[`0`]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2645	new_icvs);
2646	KF_TRACE(`10`, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", `0`,
2647	team->t.t_threads[`0`], team));
2648	#elif KMP_BARRIER_ICV_PUSH
2649	// The ICVs will be propagated in the fork barrier, so nothing needs to be
2650	// done here.
2651	KF_TRACE(`10`, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", `0`,
2652	team->t.t_threads[`0`], team));
2653	#else
2654	// Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2655	// time.
2656	ngo_load(new_icvs);
2657	KMP_DEBUG_ASSERT(team->t.t_threads[`0`]); // The threads arrays should be
2658	// allocated at this point
2659	for (int f = `1`; f < new_nproc; ++f) { // Skip the primary thread
2660	// TODO: GEH - pass in better source location info since usually NULL here
2661	KF_TRACE(`10`, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2662	f, team->t.t_threads[f], team));
2663	__kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2664	ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2665	KF_TRACE(`10`, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2666	f, team->t.t_threads[f], team));
2667	}
2668	ngo_sync();
2669	#endif // KMP_BARRIER_ICV_PULL
2670	}
2671

source code of openmp/runtime/src/kmp_barrier.cpp